Date: Fri, 15 Apr 94 23:21:23 -0400
From: dj AT ctron DOT com (DJ Delorie)
To: djgpp AT sun DOT soe DOT clarkson DOT edu
Subject: fastest memcpy?


I took the time to look up opcode timings to try to get a "fastest"
memcpy.  Sorry, Duff's device loses on both aligning and dribbles in
favor of a binary alignment search (486 timings).

The extra numbers are timings (averages in parens) for the various
alignment cases, with multiple options for each chunk of code.  This
is in pseudo-assembler and hasn't been tested, but if someone has a
benchmark and is willing to make it testable . . .

DJ

--

memcpy:		; [esi] -> [edi], len [ecx]
		; memcpy(dest, src, len)
		; Arguments are read from the stack: after the two pushes
		; below, dest is at 12[esp], src at 16[esp], len at 20[esp].
		; esi and edi are saved here and restored at doret.

	push	esi
	push	edi
	mov	edi, 12[esp]
	mov	esi, 16[esp]
	mov	ecx, 20[esp]

	; Short copies (len <= MAGIC, unsigned) are done as a plain
	; byte copy; longer ones go through the align/dword/tail path.
	; MAGIC is a cutoff that still has to be tuned.
	cmp	ecx, MAGIC (>16?)
	ja	longalign
	; Must be movsb: stosb would store al into [edi] and never
	; read [esi], i.e. it fills instead of copying.
	rep	movsb
	jmp	doret

--------------------------------------------
longalign:
; Head alignment, binary version.  eax = (-esi) & 3 is the number of
; bytes (0..3) needed to bring the source pointer up to a dword
; boundary.  Bit 0 of that count is handled with one movsb and bit 1
; with one movsw.
;
; movsb/movsw advance esi and edi themselves, so the pointers must NOT
; be pre-adjusted by eax; only the remaining length in ecx is reduced.
; This needs len >= 3 on entry, which holds as long as MAGIC >= 2.

	xor	eax, eax	1
	sub	eax, esi	1
	and	eax, 3		1
	jz	la2		3/1
	sub	ecx, eax	1
	test	eax, 1		1
	jz	la1		3/1
	movsb			7
la1:
	test	eax, 2		1
	jz	la2		3/1
	movsw			7
la2:

; Cycle totals by source alignment (esi & 3):
; 0  1  2  3
; 6 23 18 18 (16.25)

--------------------------------------------
longalign:
; Head alignment, jump-into-sled version.  eax = esi & 3 is used as a
; byte offset into the four one-byte movsb instructions at longa, so
; 4 - eax of them execute (4, 3, 2 or 1).  In the already-aligned case
; that copies one whole dword bytewise, which leaves esi aligned.
;
; The bytes copied by the sled are subtracted from ecx up front so the
; dword loop and the tail see the true remaining length.  This needs
; len >= 4 on entry, which holds as long as MAGIC >= 3.
;
; "jmp longa[eax]" is pseudo-notation for a jump to address longa+eax;
; it relies on movsb being a single-byte opcode.

	mov	eax, esi	1
	and	eax, 3		1
	sub	ecx, 4		1
	add	ecx, eax	1
	jmp	longa[eax]	5

longa:
	movsb			7
	movsb			7
	movsb			7
	movsb			7

; Cycle totals by source alignment (esi & 3):
; 0  1  2  3
; 37 30 23 16 (26.5)

--------------------------------------------
loop:
; Main body: copy len / 4 dwords.  The full remaining length is kept
; in eax so the tail code can look at its low two bits afterwards.
; NOTE(review): "loop" is also an x86 instruction mnemonic, so this
; label will have to be renamed before the code can be assembled.
	mov	eax, ecx
	shr	ecx, 2

	; movsd is the Intel-syntax name of the dword string move
	; (movsl is the AT&T spelling of the same instruction).
	rep	movsd

--------------------------------------------
end:
; Tail ("dribble") copy, bit-test version.  al is the low byte of the
; length saved in eax before the dword loop; bit 1 selects one word
; move and bit 0 one byte move, so len & 3 bytes are copied in total.
	test	al,2		1
	jz	end1		3/1
	movsw			7
end1:
	test	al,1		1
	jz	end2		3/1
	movsb			7
end2:

; Cycle totals by number of tail bytes (len & 3):
; 0  1  2  3
; 8 13 13 18 (13)

--------------------------------------------
end:
; Tail copy, jump-into-sled version.  After the xor/and pair
; eax = 3 - (len & 3); used as a byte offset into the three one-byte
; movsb instructions at endo, that skips eax of them and executes the
; remaining len & 3.  "jmp endo[eax]" is pseudo-notation for a jump to
; address endo+eax.
	xor	al,3		1
	and	eax,3		1
	jmp	endo[eax]	5
endo:
	movsb			7
	movsb			7
	movsb			7

; Cycle totals by number of tail bytes (len & 3):
; 0  1  2  3
; 7 14 21 28 (17.5)

--------------------------------------------
end:
; Tail copy, rep version.  ecx was consumed by the dword loop, so the
; byte count len & 3 is rebuilt from the length saved in eax.
	and	eax,3		1
	mov	ecx, eax	1
	rep	movsb		13/12 + 3 * ecx

; Cycle totals by number of tail bytes (len & 3):
; NOTE(review): the 0 and 1 columns look high.  The 486 figures for
; rep movs are usually quoted as 5 (ecx = 0), 13 (ecx = 1) and
; 12 + 3 * ecx otherwise, which would give 7 15 20 23 -- confirm
; against the manual before relying on this table.
; 0  1  2  3
; 15 17 20 23 (18.75)

--------------------------------------------
doret:
; Common exit.  memcpy returns its dest argument, but eax has been
; used as scratch on the way here, so reload dest from the stack.
; The two saved registers are still pushed at this point, so dest is
; at the same 12[esp] offset used on entry.
	mov	eax, 12[esp]
	pop	edi
	pop	esi
	ret