Mail Archives: djgpp/1997/11/29/07:47:32
On Sun, 23 Nov 1997, Speed wrote:
> So it is equivalent to a 'rep movsl'? (assuming, of course, the source,
> dest, and count are already set)
>
Yes, it is. Why don't you just look at the output? There are at least 3 good
debuggers for DJGPP.
Another thing: I tried to write memcpy using the 64-bit FPU registers, as
someone here suggested. It's about _20% faster_!!
__here is the source code of memcpyfpu.c __
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dos.h>
#include <time.h>
/*
 * memcpyfpu - copy `length` bytes from `source` to `destination`, moving
 * eight bytes at a time through the x87 FPU (fild/fistp of a 64-bit integer).
 *
 * NOTE: the argument order is (source, destination), i.e. the reverse of
 * the standard memcpy().
 *
 * source      - first byte to read
 * destination - first byte to write
 * length      - number of bytes to copy; 0 copies nothing
 *
 * The copy runs forward from the lowest address; overlapping regions are
 * not handled specially. The asm pushes one value on the x87 register
 * stack and pops it again on every iteration, so it needs one free slot.
 */
void memcpyfpu(void *source,void *destination, unsigned long length)
{
    const unsigned char *src = source;
    unsigned char *dst = destination;
    size_t total = length;
    size_t offset = 0;              /* bytes copied so far */

#if defined(__i386__) || defined(__x86_64__)
    /* Largest multiple of 8 that fits in the request. */
    size_t block_bytes = total & ~(size_t)7;

    /*
     * Only enter the asm loop when at least one whole qword exists: the
     * loop tests its condition at the bottom, so it always runs once.
     */
    if (block_bytes != 0) {
        __asm__ __volatile__ (
            "1:\n\t"                    /* local label: safe if inlined */
            "fildll (%1,%0)\n\t"        /* st(0) <- 64-bit int at src+offset */
            "fistpll (%2,%0)\n\t"       /* dst+offset <- st(0), pop */
            "add $8,%0\n\t"
            "cmp %3,%0\n\t"
            "jb 1b"
            : "+&r" (offset)            /* modified, so declared as output */
            : "r" (src), "r" (dst), "r" (block_bytes)
            : "cc", "memory");
    }
#endif

    /* Remaining 0..7 bytes (or everything, on non-x86 targets). */
    for (; offset < total; offset++)
        dst[offset] = src[offset];
}
/*
 * Benchmark driver: times ten 1,000,000-byte copies done with memcpyfpu()
 * and ten done with the library memcpy(), then prints both timings and
 * their ratio.
 *
 * Returns 0 on success, -1 if the buffers cannot be allocated or if the
 * FPU copy does not reproduce the source data.
 */
int main(void)
{
    enum { COPY_SIZE = 1000000, REPEATS = 10 };
    char *source, *dest;
    unsigned long clocks1, clocks2;
    int i;
    int status = -1;

    /* Reinitialise the x87 FPU before the fild/fistp copy uses it. */
    __asm__ __volatile__ ("fninit");

    source = malloc(COPY_SIZE);
    dest = malloc(COPY_SIZE);
    if (source == NULL || dest == NULL) {
        fprintf(stderr, "Insufficient memory\n");
        goto done;
    }

    /*
     * Give the source defined contents and touch every byte of both
     * buffers before timing, so first-access cost is not charged to
     * whichever copy happens to run first.
     */
    for (i = 0; i < COPY_SIZE; i++)
        source[i] = (char)i;
    memset(dest, 0, COPY_SIZE);

    clocks1 = (unsigned long)uclock();
    for (i = 0; i < REPEATS; i++)
        memcpyfpu(source, dest, COPY_SIZE);
    clocks1 = (unsigned long)uclock() - clocks1;

    /* Outside the timed region: make sure the fast copy is a correct one. */
    if (memcmp(source, dest, COPY_SIZE) != 0) {
        fprintf(stderr, "FPU copy produced wrong data\n");
        goto done;
    }

    /*
     * uclock() counts timer ticks, not microseconds (DJGPP documents the
     * rate as UCLOCKS_PER_SEC = 1193180), so label the numbers as ticks.
     */
    printf("FPU copy took %lu uclock ticks\n", clocks1);

    clocks2 = (unsigned long)uclock();
    for (i = 0; i < REPEATS; i++)
        memcpy(dest, source, COPY_SIZE);
    clocks2 = (unsigned long)uclock() - clocks2;
    printf("REP_MOVSL copy took %lu uclock ticks\n", clocks2);

    printf("FPU/REP_MOVSL %.4f\n", ((double)clocks1) / clocks2);
    status = 0;

done:
    /* free(NULL) is a no-op, so this is safe on the allocation-failure path. */
    free(source);
    free(dest);
    return status;
}
__end of memcpyfpu.c __
The interesting thing is that it runs only 10-12% faster with cwsdpmi r3 and r4, but
pmode (1.2), cwsdpr0 (both r3 and r4) and qdpmi (1.1 from QEMM 8.0) run the
FPU code faster. The normal memcpy is about the same.
Michal "MiMe" Mertl
xmerm05 AT vse DOT cz
- Raw text -