delorie.com/archives/browse.cgi   search  
Mail Archives: djgpp/1997/02/05/00:07:10

From: leathm AT solwarra DOT gbrmpa DOT gov DOT au (Leath Muller)
Message-Id: <199702050444.OAA21257@solwarra.gbrmpa.gov.au>
Subject: Re: floats v doubles myth
To: djgpp AT delorie DOT com
Date: Wed, 5 Feb 1997 14:44:15 +1000 (EST)

OK, let's try again... :)

> > Can we *please* kill this myth.
> > On Pentium there is NO speed difference between using a float or double.
> > On 387,486/487 float is slightly *faster* to load, store or read from
> > ram as an operand, than a double.
 
> Myth ? which myth? This is true.
> Try this : 

It's not a myth... see the following...

[big snip...]

> On my 486dx75, this program prints "20 12", which means the calculation 
> in floats is about twice slower than the calculation in doubles.

I don't know about 486's, I have never owned one...

> As you can see, there are no fancy functions with prototypes in doubles 
> involved, just plain multiplies and adds...

It doesn't matter ON THE PENTIUM...

> I have no Pentium handy to test this, but I suspect the same will happen: 
> if you look at the code produced by DJGPP, you'll notice that the float 
> version has a few more instructions than the double version (cast 
> instructions actually)... 

Don't take this badly or the wrong way or anything, I say this nicely:
If you don't have a Pentium, don't say anything. :)  It's REALLY REALLY
simple.

ON THE PENTIUM: ALL FLOATS AND DOUBLES ARE CONVERTED TO 80 BIT VALUES
BY THE FPU BEFORE THE CALCULATION, CALCULATED IN 80 BIT, AND THEN STORED
IN ITS ORIGINAL RESOLUTION (FLOAT/DOUBLE). THIS TAKES *NO* TIME ON A
PENTIUM.

Whew. I feel better now. The Pentium FPU is so superior to the 486 FPU it's
just not funny... :)  In fact, I am going to now _upload_ and _post_
_source_ code for the _pentium_ which counts everything in _cycles_.
Maybe then we can dispense with this thread, eh? :)  If somebody else has
a pentium and is willing to test this stuff, _please_ do so and post
your results to me... sound cool?

Leathal.

--- fp.c ---

#include "fp.h"

/* Receives x0*x1 + y0*y1 + z0*z1; stored by the timed loop in main(). */
double result;
/*
 * Operands of the timed multiply/add sequence; main() assigns them before
 * the measurement starts.
 * NOTE(review): y0 and y1 are also the names of POSIX <math.h> functions,
 * so including <math.h> in this translation unit would likely clash.
 */
double x0, y0, z0, x1, y1, z1;

int main()
{
	unsigned int prof0[2], prof1[2], timer[2];
	int i;
	float time;

	x0 = 10; y0 = 20; z0 = 15;
	x1 = 20; y1 = 15; z1 = 25;

	ProfSetProfiles(DataRead | PROF_EVENTS | RING_0123,
			DataRead | PROF_CYCLES | RING_0123);
	ProfBeginProfiles();
	ProfZeroTimer();

	asm volatile ("
		pushl	%eax;
		movl	$1000000, %eax;
		.align	4;
	loop:
		fldl	_x0;
		fmull	_x1;
		fldl	_y0;
		fmull	_y1;
		fldl	_z0;
		fmull	_z1;
		fxch	%st(2);
		faddp	%st, %st(1);
		faddp	%st, %st(1);
		fstpl	_result;
		decl	%eax;
		jnz	loop;
		popl	%eax;
	");

	ProfGetProfiles(prof0, prof1);
	ProfReadTimer(timer);

	printf("%d  %d\n", timer[0], timer[1]);
	printf("%d  %d\n", prof0[0], prof0[1]);
	printf("%d  %d\n", prof1[0], prof1[1]);

	time = (float) timer[1];
	time /= 1000000;
	printf("\n%f\n", time);
	printf("\nResult: %f\n", result);
	return 0;
}

--- fp.h ---

#include <stdio.h>

/*
 * RDTSC(_dst): read the time-stamp counter and store it through _dst, which
 * must point to two 32-bit words: _dst[0] receives EDX (high half) and
 * _dst[1] receives EAX (low half).
 *
 * The instruction is emitted as raw opcode bytes (0F 31) so that assemblers
 * without the RDTSC mnemonic still accept it.  The result is returned in
 * register outputs and stored from C, so the compiler knows which memory is
 * written and no input register appears in the clobber list.
 * Use as a statement: RDTSC(buf);
 */
#define RDTSC(_dst) \
do {\
	unsigned int *rdtsc_dst_ = (unsigned int *)(_dst);\
	unsigned int rdtsc_lo_, rdtsc_hi_;\
	__asm__ __volatile__(".byte 0x0F, 0x31"\
		: "=a" (rdtsc_lo_), "=d" (rdtsc_hi_));\
	rdtsc_dst_[0] = rdtsc_hi_;\
	rdtsc_dst_[1] = rdtsc_lo_;\
} while (0)

/*
 * RDMSR(_msri, _msrd): read model-specific register number _msri and store
 * it through _msrd, which must point to two 32-bit words: _msrd[0] receives
 * EDX (high half) and _msrd[1] receives EAX (low half).
 *
 * Emitted as raw opcode bytes (0F 32).  RDMSR is a privileged instruction.
 * The values come back as register outputs and are stored from C, so the
 * compiler sees the memory write and no input register is listed as a
 * clobber (which current GCC rejects).
 * Use as a statement: RDMSR(0x10, buf);
 */
#define RDMSR(_msri, _msrd) \
do {\
	unsigned int *rdmsr_dst_ = (unsigned int *)(_msrd);\
	unsigned int rdmsr_lo_, rdmsr_hi_;\
	__asm__ __volatile__(".byte 0x0F, 0x32"\
		: "=a" (rdmsr_lo_), "=d" (rdmsr_hi_)\
		: "c" (_msri));\
	rdmsr_dst_[0] = rdmsr_hi_;\
	rdmsr_dst_[1] = rdmsr_lo_;\
} while (0)

/*
 * WRMSR(_msri, _msrd): write the 32-bit value _msrd to the low half of
 * model-specific register number _msri; the high half (EDX) is written as 0.
 *
 * Emitted as raw opcode bytes (0F 30).  WRMSR is a privileged instruction.
 * EDX is supplied as a zero input operand instead of being cleared inside
 * the template, so no input register has to appear in the clobber list.
 * The "memory" clobber keeps the compiler from moving memory accesses of the
 * code being measured across the write.
 * Use as a statement: WRMSR(0x11, value);
 */
#define WRMSR(_msri, _msrd) \
__asm__ __volatile__(".byte 0x0F, 0x30"\
	:\
	: "c" (_msri), "a" (_msrd), "d" (0)\
	: "memory")

/*
 * RDMSR_0x12_0x13(_msr12, _msr13): read MSR 0x12 and then MSR 0x13 and store
 * them through the two pointers.  Each destination must point to two 32-bit
 * words: index 0 receives EDX (high half), index 1 receives EAX (low half).
 *
 * Each read is emitted as raw opcode bytes (0F 32); RDMSR is privileged.
 * Both asm statements are volatile, so MSR 0x12 is always read before
 * MSR 0x13.  Results are returned in registers and stored from C so the
 * compiler sees the memory writes.
 * Use as a statement: RDMSR_0x12_0x13(a, b);
 */
#define RDMSR_0x12_0x13(_msr12, _msr13) \
do {\
	unsigned int *ctr0_dst_ = (unsigned int *)(_msr12);\
	unsigned int *ctr1_dst_ = (unsigned int *)(_msr13);\
	unsigned int ctr0_lo_, ctr0_hi_, ctr1_lo_, ctr1_hi_;\
	__asm__ __volatile__(".byte 0x0F, 0x32"\
		: "=a" (ctr0_lo_), "=d" (ctr0_hi_)\
		: "c" (0x12));\
	__asm__ __volatile__(".byte 0x0F, 0x32"\
		: "=a" (ctr1_lo_), "=d" (ctr1_hi_)\
		: "c" (0x13));\
	ctr0_dst_[0] = ctr0_hi_;\
	ctr0_dst_[1] = ctr0_lo_;\
	ctr1_dst_[0] = ctr1_hi_;\
	ctr1_dst_[1] = ctr1_lo_;\
} while (0)

/*
 * ZERO_MSR_0x12_0x13(): write zero (EDX:EAX = 0) to MSR 0x12 and then to
 * MSR 0x13.
 *
 * WRMSR is emitted as raw opcode bytes (0F 30) and is a privileged
 * instruction.  The template is built from adjacent string literals because
 * a string literal may not span source lines in standard C.  "cc" covers the
 * flags changed by XOR; "memory" keeps the compiler from moving memory
 * accesses of the measured code across the reset.
 * Use as a statement: ZERO_MSR_0x12_0x13();
 */
#define ZERO_MSR_0x12_0x13()\
__asm__ __volatile__(\
	"xorl	%%edx, %%edx\n\t"\
	"xorl	%%eax, %%eax\n\t"\
	"movl	$0x12, %%ecx\n\t"\
	".byte	0x0F, 0x30\n\t"\
	"movl	$0x13, %%ecx\n\t"\
	".byte	0x0F, 0x30"\
	: : : "eax", "ecx", "edx", "cc", "memory")

/*
 * Event codes passed to ProfSetProfiles(), OR-ed with the PROF_* / RING_*
 * control bits.  Values are consecutive from 0x00 to 0x29 and are spelled
 * out so a code can be matched against a processor manual at a glance.
 * Identifier spellings (including the misspelled ones) are kept as-is
 * because callers refer to them by name.
 */
enum
{
	/* data-side events */
	DataRead                   = 0x00,
	DataWrite                  = 0x01,
	DataTLBMiss                = 0x02,
	DataReadMiss               = 0x03,
	DataWriteMiss              = 0x04,
	WriteHitEM                 = 0x05,
	DataCacheLinesWritten      = 0x06,
	DataCacheSnoops            = 0x07,
	DataCacheSnoophit          = 0x08,
	MemAccessBothPipes         = 0x09,
	BankConflict               = 0x0A,
	MisalignedDataRef          = 0x0B,

	/* code-side events */
	CodeRead                   = 0x0C,
	CodeTLBMiss                = 0x0D,
	CodeCacheMiss              = 0x0E,
	SegRegLoad                 = 0x0F,
	RESERVED0                  = 0x10,
	RESERVED1                  = 0x11,

	/* branches and instruction flow */
	Branch                     = 0x12,
	BTBHit                     = 0x13,
	TakenBranchOrBTBHit        = 0x14,
	PipelineFlush              = 0x15,
	InstructionsExeced         = 0x16,
	InstructionsExecedVPipe    = 0x17,

	/* bus activity and stalls */
	BusUtilizationClocks       = 0x18,
	PipelineStalledWrtieBackup = 0x19,
	PipelineStalledDateMemRead = 0x1A,
	PipeLineStalledWriteEM     = 0x1B,
	LockedBusCycle             = 0x1C,
	IOReadOrWriteCycle         = 0x1D,
	NonCacheableMemRef         = 0x1E,
	AGI                        = 0x1F,
	RESERVED2                  = 0x20,
	RESERVED3                  = 0x21,

	/* miscellaneous */
	FPOperation                = 0x22,
	Breakpoint0Match           = 0x23,
	Breakpoint1Match           = 0x24,
	Breakpoint2Match           = 0x25,
	Breakpoint3Match           = 0x26,
	HWInterrupt                = 0x27,
	DataReadOrWrite            = 0x28,
	DataReadOrWriteMiss        = 0x29
};

/*
 * Control bits OR-ed with an event code to form one counter's selection
 * value for ProfSetProfiles().  Written as shifts to make the bit positions
 * explicit.
 * NOTE(review): going by the names, bit 8 presumably selects counting clock
 * cycles instead of event occurrences, and bits 6/7 presumably enable
 * counting in rings 0-2 and ring 3 respectively; confirm against the
 * processor documentation.
 */
#define PROF_EVENTS	(0)
#define PROF_CYCLES	(1 << 8)
#define RING_012	(1 << 6)
#define RING_3		(1 << 7)
#define RING_0123	(RING_012 | RING_3)

/*
 * ProfSetProfiles(_msr12, _msr13): combine the two counter selections into
 * one 32-bit value (_msr12 in the low 16 bits, _msr13 shifted into the high
 * 16 bits) and write it to MSR 0x11 with WRMSR.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as the unbraced body of an if/else.  The temporary has a
 * macro-specific name so it cannot capture a caller argument called "prof",
 * and the operands are converted to unsigned before the shift to avoid
 * shifting into the sign bit.
 * Use as a statement: ProfSetProfiles(a, b);
 */
#define ProfSetProfiles(_msr12, _msr13) \
do {\
	unsigned int prof_select_ =\
		(unsigned int)(_msr12) | ((unsigned int)(_msr13) << 16);\
	WRMSR(0x11, prof_select_);\
} while (0)

/*
 * ProfBeginProfiles(): reset both profile counters by expanding to
 * ZERO_MSR_0x12_0x13().
 * The expansion carries no trailing semicolon, so the call site supplies it
 * and the macro can be the unbraced body of an if/else.
 */
#define ProfBeginProfiles() \
	ZERO_MSR_0x12_0x13()

/*
 * ProfGetProfiles(_msr12, _msr13): read both profile counters by expanding
 * to RDMSR_0x12_0x13(); each argument is a pointer to two 32-bit words that
 * receive one counter.
 * Arguments are parenthesised, and the expansion carries no trailing
 * semicolon, so the call site supplies it and the macro can be the unbraced
 * body of an if/else.
 */
#define ProfGetProfiles(_msr12, _msr13)\
	RDMSR_0x12_0x13((_msr12), (_msr13))

/*
 * ProfZeroTimer(): write 0 to MSR 0x10 via WRMSR.
 * The expansion carries no trailing semicolon, so the call site supplies it
 * and the macro can be the unbraced body of an if/else.
 */
#define ProfZeroTimer()\
	WRMSR(0x10, 0)

/*
 * ProfReadTimer(_timer): read MSR 0x10 via RDMSR into the two 32-bit words
 * that _timer points to.
 * The macro now forwards its own parameter; previously the body named a
 * variable called "timer" directly, so it only worked when the caller's
 * buffer happened to have exactly that name.
 * The expansion carries no trailing semicolon, so the call site supplies it.
 */
#define ProfReadTimer(_timer)\
	RDMSR(0x10, (_timer))

- Raw text -


  webmaster     delorie software   privacy  
  Copyright © 2019   by DJ Delorie     Updated Jul 2019