Mail Archives: djgpp/1995/07/05/14:17:58
/*
* This code is NOT tested. I just wrote it to propose
* some ideas for further optimisation of memset for 486/Pentium.
* I'll optimize it further some day...
* (because it is a generally useful routine, for gfx programming for example :)
* but I would like to hear comments on this!
* Remember: consider this as pseudocode;
* I don't know whether gas can assemble it :)
* by George (tmL) Moshovitis / ETD
*
*/
/*
 * _memset -- fill a block of memory with one byte value.
 *
 * Syntax: GNU as, AT&T operand order, 32-bit x86.
 * ABI:    arguments on the stack, caller cleans up (cdecl style).
 *         C view (on targets that prefix C symbols with '_'):
 *             void *memset(void *dest, int c, size_t n);
 *
 * In:     4(%esp)  = dest
 *         8(%esp)  = c   (only the low byte is used)
 *         12(%esp) = n   (byte count, treated as unsigned)
 * Out:    %eax     = dest
 * Clobb:  %ecx, %edx, flags.  %edi is saved and restored; %ebx, %esi
 *         and %ebp are not touched.
 *
 * Strategy: fills shorter than 16 bytes go straight to the byte loop.
 * Longer fills first store up to 3 bytes to reach a 4-byte boundary,
 * then store 16 bytes per iteration with four dword moves, and finish
 * the remaining 0..15 bytes with the byte loop.
 */
        .file   "memset.s"
        .text
        .globl  _memset
        .p2align 4                      # 2^4 = 16 bytes on every gas target
                                        # (486 cache line is 16 bytes)
_memset:
        pushl   %edi                    # callee-saved, used as write pointer
        movl    8(%esp), %edi           # edi = dest   (args shifted by the push)
        movl    12(%esp), %eax          # al  = fill byte
        movl    16(%esp), %ecx          # ecx = n

        cmpl    $15, %ecx
        jbe     Lms_tail                # unsigned compare: n is a size, not an int

        # Replicate the fill byte into all four bytes of eax.
        movb    %al, %ah                # ax  = cc cc
        movl    %eax, %edx
        sall    $16, %eax               # drops whatever was in the upper half
        movw    %dx, %ax                # eax = cc cc cc cc

        # Store single bytes until edi is 4-byte aligned (at most 3 needed).
        # Unrolled to avoid a backward branch; ecx >= 16 here so it cannot
        # underflow.
        .rept   3
        testl   $3, %edi
        jz      1f
        movb    %al, (%edi)
        incl    %edi
        decl    %ecx                    # keep the full 32-bit count in step
        .endr
1:
        movl    %ecx, %edx              # edx = bytes still to write (>= 13)
        shrl    $4, %ecx                # ecx = number of whole 16-byte blocks
        jz      Lms_rest                # alignment may have left fewer than 16

        .p2align 4
Lms_block:                              # invariant: ecx > 0, edi 4-byte aligned
        movl    %eax, (%edi)
        movl    %eax, 4(%edi)
        movl    %eax, 8(%edi)
        movl    %eax, 12(%edi)
        addl    $16, %edi
        decl    %ecx
        jnz     Lms_block               # flags come from decl, directly above

Lms_rest:
        movl    %edx, %ecx
        andl    $15, %ecx               # ecx = leftover bytes after the blocks

Lms_tail:                               # ecx = 0..15 bytes to store bytewise
        testl   %ecx, %ecx
        jz      Lms_done                # nothing to do (covers n == 0)
Lms_byte:
        movb    %al, (%edi)
        incl    %edi
        decl    %ecx                    # must be last: jnz tests ITS zero flag
        jnz     Lms_byte

Lms_done:
        movl    8(%esp), %eax           # return the original dest
        popl    %edi
        ret
- Raw text -