Mail Archives: djgpp/1997/02/05/00:07:10
Ok, let's try again... :)
> > Can we *please* kill this myth.
> > On Pentium there is NO speed difference between using a float or double.
> > On 387,486/487 float is slightly *faster* to load, store or read from
> > ram as an operand, than a double.
> Myth ? which myth? This is true.
> Try this :
It's not a myth... see the following...
[big snip...]
> On my 486dx75, this program prints "20 12", which means the calculation
> in floats is about twice slower than the calculation in doubles.
I don't know about 486's, I have never owned one...
> As you can see, there are no fancy functions with prototypes in doubles
> involved, just plain multiplies and adds...
It doesn't matter ON THE PENTIUM...
> I have no Pentium handy to test this, but I suspect the same will happen:
> if you look at the code produced by DJGPP, you'll notice that the float
> version has a few more instructions than the double version (cast
> instructions actually)...
Don't take this badly or the wrong way or anything, I say this nicely:
If you don't have a Pentium, don't say anything. :) It's REALLY REALLY
simple.
ON THE PENTIUM: ALL FLOATS AND DOUBLES ARE CONVERTED TO 80 BIT VALUES
BY THE FPU BEFORE THE CALCULATION, CALCULATED IN 80 BIT, AND THEN STORED
IN ITS ORIGINAL RESOLUTION (FLOAT/DOUBLE). THIS TAKES *NO* TIME ON A
PENTIUM.
Whew. I feel better now. The Pentium is so superior to the 486 FPU it's
just not funny... :) In fact, I am now going to _upload_ and _post_
_source_ code for the _pentium_ which counts everything in _cycles_.
Maybe then we can dispense with this thread, eh? :) If somebody else has
a Pentium and is willing to test this stuff, _please_ do so and post
your results to me... sound cool?
Leathal.
--- fp.c ---
#include "fp.h"
/*
 * Operands and result of the timed multiply-add. They are file-scope
 * objects so the FPU instructions in main() load and store them from memory
 * on every iteration.
 *
 * NOTE(review): y0 and y1 are also the names of <math.h> Bessel functions;
 * the names are kept because they have external linkage, but including
 * <math.h> in this file would presumably conflict -- confirm before doing so.
 */
double result;
double x0, y0, z0, x1, y1, z1;

/*
 * Runs one million iterations of  result = x0*x1 + y0*y1 + z0*z1  on the
 * x87 FPU using double-precision memory operands, bracketed by the Prof*
 * macros from fp.h, then prints:
 *
 *   - the two dwords stored by ProfReadTimer (MSR 0x10),
 *   - the two dwords of each profile counter (MSRs 0x12 and 0x13),
 *   - the second timer dword divided by the iteration count,
 *   - the computed result.
 *
 * The Prof* macros use RDMSR/WRMSR, which are privileged instructions, so
 * the program only works where the code is allowed to execute them.
 *
 * Returns 0.
 */
int main(void)
{
    unsigned int prof0[2], prof1[2], timer[2];
    unsigned int remaining = 1000000;   /* loop counter, decremented to zero */
    float cycles_per_iter;

    x0 = 10; y0 = 20; z0 = 15;
    x1 = 20; y1 = 15; z1 = 25;

    /* Counter 0 counts DataRead events, counter 1 counts cycles. */
    ProfSetProfiles(DataRead | PROF_EVENTS | RING_0123,
                    DataRead | PROF_CYCLES | RING_0123);
    ProfBeginProfiles();
    ProfZeroTimer();

    /*
     * The template is built from adjacent string literals (a newline inside
     * a single literal is not valid C).  The loop label is a numeric local
     * label so it cannot collide with another symbol, the counter is a
     * register operand instead of a hand-saved EAX, and the globals are
     * passed as memory operands rather than by assembler symbol name.
     */
    __asm__ __volatile__(
        ".align 4\n"
        "1:\n\t"
        "fldl  %[x0]\n\t"
        "fmull %[x1]\n\t"              /* st0 = x0*x1                    */
        "fldl  %[y0]\n\t"
        "fmull %[y1]\n\t"              /* st0 = y0*y1, st1 = x0*x1       */
        "fldl  %[z0]\n\t"
        "fmull %[z1]\n\t"              /* st0 = z0*z1, st1, st2 as above */
        "fxch  %%st(2)\n\t"
        "faddp %%st, %%st(1)\n\t"
        "faddp %%st, %%st(1)\n\t"      /* st0 = sum of the three products */
        "fstpl %[res]\n\t"
        "decl  %[n]\n\t"
        "jnz   1b"
        : [res] "=m" (result), [n] "+r" (remaining)
        : [x0] "m" (x0), [x1] "m" (x1),
          [y0] "m" (y0), [y1] "m" (y1),
          [z0] "m" (z0), [z1] "m" (z1)
        : "cc");

    ProfGetProfiles(prof0, prof1);
    ProfReadTimer(timer);

    /* The counters are unsigned, so print them with %u rather than %d. */
    printf("%u %u\n", timer[0], timer[1]);
    printf("%u %u\n", prof0[0], prof0[1]);
    printf("%u %u\n", prof1[0], prof1[1]);

    /* Only the second dword (stored from EAX by RDMSR) is used here. */
    cycles_per_iter = (float) timer[1];
    cycles_per_iter /= 1000000;
    printf("\n%f\n", cycles_per_iter);

    printf("\nResult: %f\n", result);

    return 0;
}
--- fp.h ---
#include <stdio.h>
/*
 * RDTSC(_dst): emit the opcode bytes 0F 31 (RDTSC) and store the result
 * through the pointer _dst: EDX at offset 0, EAX at offset 4.
 *
 * _dst is loaded into EDI and dereferenced with 32-bit addressing, so this
 * is for 32-bit targets only.
 *
 * The template is built from adjacent string literals with explicit
 * line continuations; the operand is parenthesised as the asm syntax
 * requires; EDI is an input and therefore not listed as a clobber; the
 * "memory" clobber tells the compiler that *_dst is written.
 */
#define RDTSC(_dst) \
    __asm__ __volatile__( \
        ".byte 0x0F, 0x31\n\t" \
        "movl %%edx, (%%edi)\n\t" \
        "movl %%eax, 4(%%edi)" \
        : /* no outputs */ \
        : "D" (_dst) \
        : "eax", "edx", "memory")
/*
 * RDMSR(_msri, _msrd): emit the opcode bytes 0F 32 (RDMSR) with the MSR
 * index _msri in ECX, then store the result through the pointer _msrd:
 * EDX at offset 0, EAX at offset 4.
 *
 * _msrd is loaded into EDI and dereferenced with 32-bit addressing, so this
 * is for 32-bit targets only.  RDMSR is a privileged instruction.
 *
 * ECX and EDI are inputs that the code does not modify, so they are not
 * listed as clobbers (a clobber may not overlap an operand); "memory"
 * tells the compiler that *_msrd is written.
 */
#define RDMSR(_msri, _msrd) \
    __asm__ __volatile__( \
        ".byte 0x0F, 0x32\n\t" \
        "movl %%edx, (%%edi)\n\t" \
        "movl %%eax, 4(%%edi)" \
        : /* no outputs */ \
        : "c" (_msri), "D" (_msrd) \
        : "eax", "edx", "memory")
/*
 * WRMSR(_msri, _msrd): clear EDX, then emit the opcode bytes 0F 30 (WRMSR)
 * with the MSR index _msri in ECX and the 32-bit value _msrd in EAX.
 * Because EDX is zeroed first, the upper half written is always 0.
 * WRMSR is a privileged instruction.
 *
 * EAX and ECX are inputs that the code does not modify, so only EDX is
 * listed as a clobber (a clobber may not overlap an operand).
 */
#define WRMSR(_msri, _msrd) \
    __asm__ __volatile__( \
        "xorl %%edx, %%edx\n\t" \
        ".byte 0x0F, 0x30" \
        : /* no outputs */ \
        : "c" (_msri), "a" (_msrd) \
        : "edx")
/*
 * RDMSR_0x12_0x13(_msr12, _msr13): read MSR 0x12 and then MSR 0x13 with
 * back-to-back RDMSR instructions (opcode bytes 0F 32).  For each MSR,
 * EDX is stored at offset 0 and EAX at offset 4 of the corresponding
 * pointer (_msr12 via EDI, _msr13 via ESI).
 *
 * Pointers are dereferenced with 32-bit addressing, so this is for 32-bit
 * targets only.  RDMSR is a privileged instruction.
 *
 * ECX is loaded inside the template and is therefore a clobber; EDI and
 * ESI are unmodified inputs and are not listed; "memory" tells the
 * compiler that both destinations are written.
 */
#define RDMSR_0x12_0x13(_msr12, _msr13) \
    __asm__ __volatile__( \
        "movl $0x12, %%ecx\n\t" \
        ".byte 0x0F, 0x32\n\t" \
        "movl %%edx, (%%edi)\n\t" \
        "movl %%eax, 4(%%edi)\n\t" \
        "movl $0x13, %%ecx\n\t" \
        ".byte 0x0F, 0x32\n\t" \
        "movl %%edx, (%%esi)\n\t" \
        "movl %%eax, 4(%%esi)" \
        : /* no outputs */ \
        : "D" (_msr12), "S" (_msr13) \
        : "eax", "ecx", "edx", "memory")
/*
 * ZERO_MSR_0x12_0x13(): write zero (EDX:EAX = 0) to MSR 0x12 and then to
 * MSR 0x13 using WRMSR (opcode bytes 0F 30).  WRMSR is a privileged
 * instruction.
 *
 * The template is built from adjacent string literals with explicit line
 * continuations so the whole statement is part of the macro.
 */
#define ZERO_MSR_0x12_0x13() \
    __asm__ __volatile__( \
        "xorl %%edx, %%edx\n\t" \
        "xorl %%eax, %%eax\n\t" \
        "movl $0x12, %%ecx\n\t" \
        ".byte 0x0F, 0x30\n\t" \
        "movl $0x13, %%ecx\n\t" \
        ".byte 0x0F, 0x30" \
        : /* no outputs */ \
        : /* no inputs */ \
        : "eax", "ecx", "edx")
/*
 * Event-select codes, OR-ed with the PROF_ and RING_ flags and passed to
 * ProfSetProfiles().  Values are written out explicitly; they are the same
 * consecutive values the enumerators had implicitly.  Identifier spellings
 * are kept exactly as they were because callers refer to them by name.
 */
enum
{
    /* 0x00 - 0x0B: data side */
    DataRead                    = 0x00,
    DataWrite                   = 0x01,
    DataTLBMiss                 = 0x02,
    DataReadMiss                = 0x03,
    DataWriteMiss               = 0x04,
    WriteHitEM                  = 0x05,
    DataCacheLinesWritten       = 0x06,
    DataCacheSnoops             = 0x07,
    DataCacheSnoophit           = 0x08,
    MemAccessBothPipes          = 0x09,
    BankConflict                = 0x0A,
    MisalignedDataRef           = 0x0B,

    /* 0x0C - 0x0F: code side and segment loads */
    CodeRead                    = 0x0C,
    CodeTLBMiss                 = 0x0D,
    CodeCacheMiss               = 0x0E,
    SegRegLoad                  = 0x0F,

    RESERVED0                   = 0x10,
    RESERVED1                   = 0x11,

    /* 0x12 - 0x17: branches and instruction flow */
    Branch                      = 0x12,
    BTBHit                      = 0x13,
    TakenBranchOrBTBHit         = 0x14,
    PipelineFlush               = 0x15,
    InstructionsExeced          = 0x16,
    InstructionsExecedVPipe     = 0x17,

    /* 0x18 - 0x1F: bus activity and stalls */
    BusUtilizationClocks        = 0x18,
    PipelineStalledWrtieBackup  = 0x19,
    PipelineStalledDateMemRead  = 0x1A,
    PipeLineStalledWriteEM      = 0x1B,
    LockedBusCycle              = 0x1C,
    IOReadOrWriteCycle          = 0x1D,
    NonCacheableMemRef          = 0x1E,
    AGI                         = 0x1F,

    RESERVED2                   = 0x20,
    RESERVED3                   = 0x21,

    /* 0x22 - 0x29: FP, breakpoints, interrupts, combined data events */
    FPOperation                 = 0x22,
    Breakpoint0Match            = 0x23,
    Breakpoint1Match            = 0x24,
    Breakpoint2Match            = 0x25,
    Breakpoint3Match            = 0x26,
    HWInterrupt                 = 0x27,
    DataReadOrWrite             = 0x28,
    DataReadOrWriteMiss         = 0x29,
};
/*
 * Flag bits OR-ed with an event-select code to form one counter's
 * configuration value for ProfSetProfiles().
 *
 * PROF_CYCLES / PROF_EVENTS select bit 8 set or clear; going by the names,
 * that chooses between counting cycles and counting event occurrences.
 * RING_012 (bit 6) and RING_3 (bit 7) presumably select the privilege
 * levels in which counting is enabled; RING_0123 enables both.
 */
#define PROF_EVENTS (0 << 8)
#define PROF_CYCLES (1 << 8)
#define RING_012    (1 << 6)
#define RING_3      (1 << 7)
#define RING_0123   (RING_012 | RING_3)
/*
 * ProfSetProfiles(_msr12, _msr13): pack two counter configurations into a
 * single 32-bit value -- _msr12 in the low 16 bits, _msr13 shifted into the
 * high 16 bits -- and write it to MSR 0x11 with WRMSR.  The parameter names
 * indicate that the two halves configure the counters readable as MSR 0x12
 * and MSR 0x13 respectively.
 *
 * Wrapped in do { } while (0) so that "ProfSetProfiles(a, b);" is a single
 * statement and is safe in an unbraced if/else.  The temporary has a
 * deliberately unusual name so that an argument expression mentioning a
 * caller variable cannot be captured by it.
 */
#define ProfSetProfiles(_msr12, _msr13) \
    do { \
        unsigned int prof_set_value_ = \
            (unsigned int)(_msr12) | ((unsigned int)(_msr13) << 16); \
        WRMSR(0x11, prof_set_value_); \
    } while (0)
/*
 * ProfBeginProfiles(): zero MSR 0x12 and MSR 0x13 via ZERO_MSR_0x12_0x13().
 *
 * The macro body no longer ends in ';' -- the caller's own semicolon
 * terminates the statement, so the call is a single statement and is safe
 * in an unbraced if/else.
 */
#define ProfBeginProfiles() \
    ZERO_MSR_0x12_0x13()
/*
 * ProfGetProfiles(_msr12, _msr13): read MSR 0x12 into the two dwords at
 * _msr12 and MSR 0x13 into the two dwords at _msr13 via RDMSR_0x12_0x13().
 *
 * The macro body no longer ends in ';' -- the caller's own semicolon
 * terminates the statement, so the call is a single statement and is safe
 * in an unbraced if/else.
 */
#define ProfGetProfiles(_msr12, _msr13) \
    RDMSR_0x12_0x13(_msr12, _msr13)
/*
 * ProfZeroTimer(): write 0 to MSR 0x10 via WRMSR().
 *
 * The macro body no longer ends in ';' -- the caller's own semicolon
 * terminates the statement, so the call is a single statement and is safe
 * in an unbraced if/else.
 */
#define ProfZeroTimer() \
    WRMSR(0x10, 0)
/*
 * ProfReadTimer(_timer): read MSR 0x10 into the two dwords at _timer via
 * RDMSR() (EDX at offset 0, EAX at offset 4).
 *
 * The expansion now uses the macro parameter; previously it named a
 * variable called "timer" directly, so it only compiled (or silently used
 * the wrong object) unless the caller's argument happened to have exactly
 * that name.  The trailing ';' has also been dropped from the macro body so
 * the call is a single statement.
 */
#define ProfReadTimer(_timer) \
    RDMSR(0x10, _timer)
- Raw text -