Mail Archives: djgpp/1994/04/15/23:24:41
I took the time to look up opcode timings to try to get a "fastest"
memcpy. Sorry, Duff's device loses on both aligning and dribbles in
favor of a binary alignment search (486 timings).
The extra numbers are timings (averages in parens) for the various
alignment cases, with multiple options for each chunk of code. This
is in pseudo-assembler and hasn't been tested, but if someone has a
benchmark and is willing to make it testable . . .
DJ
--
memcpy: ; [esi] -> [edi], len [ecx]
; memcpy(dest, src, len)
; Pseudo-assembler, Intel operand order (destination first), arguments on
; the stack. Layout after the two pushes below:
;   0[esp] saved edi    4[esp] saved esi    8[esp] return address
;   12[esp] dest        16[esp] src         20[esp] len
; Assumes the direction flag is clear on entry so the string instructions
; count upward -- TODO confirm against the target ABI.
push esi
push edi
mov edi, 12[esp]
mov esi, 16[esp]
mov ecx, 20[esp]
; MAGIC is the (still untuned) length above which aligning the source and
; copying dwords is expected to beat a plain byte copy.
cmp ecx, MAGIC (>16?)
ja longalign
; Short copy: move ecx bytes from [esi] to [edi]. This has to be movsb;
; stosb would fill the destination with al instead of copying the source.
; A zero length falls through here and copies nothing.
rep movsb
jmp doret
--------------------------------------------
longalign:
; Alignment option 1: binary search on the low two bits of the source.
; In:  esi = src, edi = dest, ecx = len (ecx > MAGIC; needs MAGIC >= 3 so
;      the subtraction below cannot underflow -- TODO confirm once MAGIC
;      is chosen).
; Out: esi is dword aligned, edi advanced by the same amount, ecx reduced
;      by the bytes copied here. Clobbers eax and flags.
; Trailing numbers are 486 cycle counts (taken/not-taken for jumps).
; eax = (-esi) & 3 = bytes needed to reach the next dword boundary.
xor eax, eax 1
sub eax, esi 1
and eax, 3 1
jz la2 3/1
; Only the count is adjusted by hand: movsb/movsw advance esi and edi
; themselves, so adding eax to the pointers as well would skip the head
; bytes and leave both pointers eax bytes too far along.
sub ecx, eax 1
test eax, 1 1
jz la1 3/1
movsb 7
la1:
test eax, 2 1
jz la2 3/1
movsw 7
la2:
; Total cycles by (esi & 3) on entry, average in parens:
; 0 1 2 3
; 6 23 18 18 (16.25)
--------------------------------------------
longalign:
; Alignment option 2: computed jump into a run of one-byte movsb's.
; In:  esi = src, edi = dest, ecx = len (ecx > MAGIC; needs MAGIC >= 3 so
;      at least 4 bytes are available -- TODO confirm once MAGIC is chosen).
; Out: esi is dword aligned, edi advanced by the same amount, ecx reduced
;      by the bytes copied here. Clobbers eax and flags.
; Trailing numbers are 486 cycle counts.
mov eax, esi 1
and eax, 3 1
; Entering the run at longa+eax executes 4 - eax movsb's (an already
; aligned source still copies 4 bytes and stays aligned). Take those bytes
; off the count, otherwise the dword loop and tail copy them a second
; time past the end of both buffers.
sub ecx, 4 1
add ecx, eax 1
; Pseudo-op: jump to the address longa+eax (each movsb is one byte long),
; not an indirect jump through a table in memory.
jmp longa[eax] 5
longa:
movsb 7
movsb 7
movsb 7
movsb 7
; Total cycles by (esi & 3) on entry, average in parens:
; 0 1 2 3
; 37 30 23 16 (26.5)
--------------------------------------------
loop:
; Bulk copy, entered by falling through from the alignment code.
; The byte count is saved in eax first because the shift destroys its low
; two bits in ecx; the tail code ("end") tests those bits in al/eax.
mov eax, ecx
; ecx = number of whole dwords to move.
shr ecx, 2
; movsl is the AT&T-style name for the dword string move (movsd in Intel
; syntax): copies ecx dwords from [esi] to [edi], advancing both pointers.
rep movsl
--------------------------------------------
end:
; Tail option 1: binary test of the two low bits of the byte count in al.
; Bit 1 set -> copy one word; bit 0 set -> copy one byte (0..3 bytes total).
; Trailing numbers are 486 cycle counts (taken/not-taken for jumps).
test al,2 1
jz end1 3/1
movsw 7
end1:
test al,1 1
jz end2 3/1
movsb 7
end2:
; Total cycles by number of leftover bytes (al & 3), average in parens:
; 0 1 2 3
; 8 13 13 18 (13)
--------------------------------------------
end:
; Tail option 2: computed jump into a run of one-byte movsb's.
; After the xor/and, eax = 3 - (count & 3), i.e. how many of the three
; movsb's to skip, so exactly (count & 3) of them execute.
; Trailing numbers are 486 cycle counts.
xor al,3 1
and eax,3 1
; Pseudo-op: jump to the address endo+eax (each movsb is one byte long).
jmp endo[eax] 5
endo:
movsb 7
movsb 7
movsb 7
; Total cycles by number of leftover bytes (count & 3), average in parens:
; 0 1 2 3
; 7 14 21 28 (17.5)
--------------------------------------------
end:
; Tail option 3: let rep movsb copy the 0..3 leftover bytes.
; eax holds the byte count saved before the dword loop; its low two bits
; are the number of bytes still to copy.
and eax,3 1
mov ecx, eax 1
; Cost annotation: 13 cycles when ecx is 0, otherwise 12 + 3 per byte.
rep movsb 13/12 + 3 * ecx
; Total cycles by number of leftover bytes (count & 3), average in parens:
; 0 1 2 3
; 15 17 20 23 (18.75)
--------------------------------------------
doret:
; Common exit for every copy path.
; memcpy returns its dest argument, but eax was used as scratch by the
; copy code, so reload it from the stack. The two saved registers are
; still pushed here, which puts dest at 12[esp] (8[esp] is the return
; address).
mov eax, 12[esp]
; Restore the registers saved on entry, in reverse push order.
pop edi
pop esi
ret
- Raw text -