delorie.com/archives/browse.cgi   search  
Mail Archives: djgpp/1997/11/29/07:47:32

From: Michal Mertl <xmerm05 AT manes DOT vse DOT cz>
Newsgroups: comp.os.msdos.djgpp
Subject: Re: 32bit memcpy function? _NEW_ Tried FPU memcpy (problem with CWSDPMI)
Date: Thu, 27 Nov 1997 18:17:05 +0100
Organization: Prague University of Economics
Lines: 83
Message-ID: <Pine.ULT.3.95.971127181022.831A-100000@dec5.vse.cz>
References: <Pine DOT SUN DOT 3 DOT 91 DOT 971123153118 DOT 19570a-100000 AT is> <34791FD2 DOT DC541BF1 AT linux DOT dpilink DOT com>
NNTP-Posting-Host: dec5.vse.cz
Mime-Version: 1.0
In-Reply-To: <34791FD2.DC541BF1@linux.dpilink.com>
To: djgpp AT delorie DOT com
DJ-Gateway: from newsgroup comp.os.msdos.djgpp

On Sun, 23 Nov 1997, Speed wrote:

> So it is equivalent to a 'rep movsl'? (assuming, of course, the source,
> dest, and count are already set)
> 
Yes, it is. Why don't you just look at the output? There are at least 3 good
debuggers for DJGPP.

Another thing: I tried to write memcpy using the 64-bit FPU registers, as
someone here suggested. It's about _20% faster_!!

__here is the source code of memcpyfpu.c __
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dos.h>
#include <time.h>

/*
 * Copy `length` bytes from `source` to `destination`, moving the bulk of the
 * data eight bytes at a time through the x87 FPU (fild/fistp with a 64-bit
 * integer memory operand).
 *
 * NOTE: the argument order is (source, destination) -- the opposite of the
 * standard memcpy(dest, src, n).
 *
 * The copy runs forward from the lowest address; overlapping regions are not
 * given any special treatment.
 *
 * Only the load/store pair is written in assembly.  Loop control and the
 * 0..7 trailing bytes are handled in C so that:
 *   - a length below 8 never touches a full 8-byte word,
 *   - the tail copies exactly bytes [length & ~7, length),
 *   - no register is modified behind the compiler's back, and
 *   - no assembler labels are emitted (safe if the function is inlined).
 */
void memcpyfpu(void *source,void *destination, unsigned long length)
{
  const unsigned char *src = (const unsigned char *)source;
  unsigned char *dst = (unsigned char *)destination;
  unsigned long i = 0;

#if defined(__i386__) || defined(__x86_64__)
  {
    /* Number of bytes that can be moved as whole 8-byte words. */
    unsigned long bulk = length & ~7UL;

    for (; i < bulk; i += 8) {
      /*
       * fildq pushes the 64-bit integer onto the FPU stack and fistpq pops
       * it back to memory.  Every 64-bit integer fits in the 64-bit
       * significand of an x87 register, so the round trip is bit-exact.
       * "st" tells the compiler the top of the FPU stack is used as scratch.
       */
      __asm__ __volatile__ (
        "fildq   (%0)\n\t"
        "fistpq  (%1)"
        :
        : "r" (src + i), "r" (dst + i)
        : "st", "memory");
    }
  }
#endif

  /* Remaining 0..7 bytes (or everything, on a non-x86 target). */
  for (; i < length; i++)
    dst[i] = src[i];
}

int main(void)
{
  char *source,*dest;
  unsigned long clocks1,clocks2;
  int i;

  __asm__ __volatile__ ("fninit");
  source=(char*)malloc(1000000);
  dest=(char*)malloc(1000000);
  if (source==NULL || dest==NULL) {
    fprintf(stderr,"Insuficient memory\n");
    return -1;
  }
  clocks1=uclock();
  for (i=0;i<10;i++)
    memcpyfpu(source,dest,1000000);
  clocks1=uclock()-clocks1;
  printf("FPU copy took %lu usec\n",clocks1);
  clocks2=uclock();
  for (i=0;i<10;i++)
    memcpy(dest,source,1000000);
  clocks2=uclock()-clocks2;
  printf("REP_MOVSL copy took %lu usec\n",clocks2);
  printf("FPU/REP_MOVSL %.4f\n",((double)clocks1)/clocks2);
  return 0;
}
__end of memcpyfpu.c __

The interesting thing is that it runs only 10-12% faster with cwsdpmi r3 and
r4, but pmode (1.2), cwsdpr0 (both r3 and r4) and qdpmi (1.1 from QEMM 8.0)
run the FPU code faster. The normal memcpy is about the same.

Michal "MiMe" Mertl
  xmerm05 AT vse DOT cz


- Raw text -


  webmaster     delorie software   privacy  
  Copyright © 2019   by DJ Delorie     Updated Jul 2019