From: leathm AT solwarra DOT gbrmpa DOT gov DOT au (Leath Muller)
Message-Id: <199702050444.OAA21257@solwarra.gbrmpa.gov.au>
Subject: Re: floats v doubles myth
To: djgpp AT delorie DOT com
Date: Wed, 5 Feb 1997 14:44:15 +1000 (EST)
Content-Type: text

OK, let's try again... :)

> > Can we *please* kill this myth.
> > On Pentium there is NO speed difference between using a float or double.
> > On 387,486/487 float is slightly *faster* to load, store or read from
> > ram as an operand, than a double.

> Myth ? which myth? This is true.
> Try this :

It's not a myth... see the following...

[big snip...]

> On my 486dx75, this program prints "20 12", which means the calculation
> in floats is about twice slower than the calculation in doubles.

I don't know about 486s; I have never owned one...

> As you can see, there are no fancy functions with prototypes in doubles
> involved, just plain multiplies and adds...

It doesn't matter ON THE PENTIUM...

> I have no Pentium handy to test this, but I suspect the same will happen:
> if you look at the code produced by DJGPP, you'll notice that the float
> version has a few more instructions than the double version (cast
> instructions actually)...

Don't take this badly or the wrong way or anything, I say this nicely: if
you don't have a Pentium, don't say anything. :)

It's REALLY REALLY simple. ON THE PENTIUM:

ALL FLOATS AND DOUBLES ARE CONVERTED TO 80 BIT VALUES BY THE FPU BEFORE THE
CALCULATION, CALCULATED IN 80 BIT, AND THEN STORED IN ITS ORIGINAL RESOLUTION
(FLOAT/DOUBLE). THIS TAKES *NO* TIME ON A PENTIUM.

Whew. I feel better now. The Pentium is so superior to the 486 FPU it's just
not funny... :)

In fact, I am going to now _upload_ and _post_ _source_ code for the
_pentium_ which counts everything in _cycles_. Maybe then we can dispense
with this thread, eh? :)

If somebody else has a Pentium and is willing to test this stuff, _please_ do
so and post your results to me... sound cool?

Leathal.
--- fp.c --- #include "fp.h" double result; double x0, y0, z0, x1, y1, z1; int main() { unsigned int prof0[2], prof1[2], timer[2]; int i; float time; x0 = 10; y0 = 20; z0 = 15; x1 = 20; y1 = 15; z1 = 25; ProfSetProfiles(DataRead | PROF_EVENTS | RING_0123, DataRead | PROF_CYCLES | RING_0123); ProfBeginProfiles(); ProfZeroTimer(); asm volatile (" pushl %eax; movl $1000000, %eax; .align 4; loop: fldl _x0; fmull _x1; fldl _y0; fmull _y1; fldl _z0; fmull _z1; fxch %st(2); faddp %st, %st(1); faddp %st, %st(1); fstpl _result; decl %eax; jnz loop; popl %eax; "); ProfGetProfiles(prof0, prof1); ProfReadTimer(timer); printf("%d %d\n", timer[0], timer[1]); printf("%d %d\n", prof0[0], prof0[1]); printf("%d %d\n", prof1[0], prof1[1]); time = (float) timer[1]; time /= 1000000; printf("\n%f\n", time); printf("\nResult: %f\n", result); return 0; } --- fp.h --- #include #define RDTSC(_dst) \ __asm__(" .byte 0x0F, 0x31 movl %%edx, (%%edi) movl %%eax, 4(%%edi)"\ : : "D" _dst : "eax", "edx", "edi") #define RDMSR(_msri, _msrd) \ __asm__(" .byte 0x0F, 0x32 movl %%edx, (%%edi) movl %%eax, 4(%%edi)"\ : : "c" (_msri), "D" (_msrd) : "eax", "ecx", "edx", "edi") #define WRMSR(_msri, _msrd) \ __asm__(" xorl %%edx, %%edx .byte 0x0F, 0x30"\ : : "c" (_msri), "a" (_msrd) : "eax", "ecx", "edx") #define RDMSR_0x12_0x13(_msr12, _msr13) \ __asm__(" movl $0x12, %%ecx .byte 0x0F, 0x032 movl %%edx, (%%edi) movl %%eax, 4(%%edi) movl $0x13, %%ecx .byte 0x0F, 0x32 movl %%edx, (%%esi) movl %%eax, 4(%%esi)" \ : : "D" (_msr12), "S" (_msr13) : "eax", "ecx", "edx", "edi") #define ZERO_MSR_0x12_0x13()\ __asm__(" xorl %%edx, %%edx xorl %%eax, %%eax movl $0x12, %%ecx .byte 0x0F, 0x30 movl $0x13, %%ecx .byte 0x0F, 0x30"\ : : : "eax", "ecx", "edx") enum { DataRead, DataWrite, DataTLBMiss, DataReadMiss, DataWriteMiss, WriteHitEM, DataCacheLinesWritten, DataCacheSnoops, DataCacheSnoophit, MemAccessBothPipes, BankConflict, MisalignedDataRef, CodeRead, CodeTLBMiss, CodeCacheMiss, SegRegLoad, RESERVED0, RESERVED1, Branch, 
BTBHit, TakenBranchOrBTBHit, PipelineFlush, InstructionsExeced, InstructionsExecedVPipe, BusUtilizationClocks, PipelineStalledWrtieBackup, PipelineStalledDateMemRead, PipeLineStalledWriteEM, LockedBusCycle, IOReadOrWriteCycle, NonCacheableMemRef, AGI, RESERVED2, RESERVED3, FPOperation, Breakpoint0Match, Breakpoint1Match, Breakpoint2Match, Breakpoint3Match, HWInterrupt, DataReadOrWrite, DataReadOrWriteMiss, }; #define PROF_CYCLES (0x100) #define PROF_EVENTS (0x000) #define RING_012 (0x40) #define RING_3 (0x80) #define RING_0123 (RING_012 | RING_3) #define ProfSetProfiles(_msr12, _msr13) \ {\ unsigned int prof;\ \ prof = (_msr12) | ((_msr13) << 16);\ WRMSR(0x11, prof);\ } #define ProfBeginProfiles() \ ZERO_MSR_0x12_0x13(); #define ProfGetProfiles(_msr12, _msr13)\ RDMSR_0x12_0x13(_msr12, _msr13); #define ProfZeroTimer()\ WRMSR(0x10, 0); #define ProfReadTimer(_timer)\ RDMSR(0x10, timer);