From: chris AT microtech DOT se (Chris Rossall)
Newsgroups: comp.os.msdos.djgpp,rec.games.programmer
Subject: Re: Optimization and bug smashing.. a lot of other questions too :)
Date: Thu, 14 Aug 1997 10:54:20 GMT
Reply-To: chris AT microtech DOT se
References: <33ee3f7f DOT 4973504 AT news DOT inlink DOT com>
NNTP-Posting-Host: 194.132.148.235
Message-ID: <33f2e1be.0@news.sto.telegate.se>
Lines: 82
To: djgpp AT delorie DOT com
DJ-Gateway: from newsgroup comp.os.msdos.djgpp
Precedence: bulk

vecna AT inlink DOT com ([vecna]) wrote:


>Okay, this is the single most important routine to optimize. It's the
>transparent blitter. It's very important to optimize already, and it
>will get VERY important to optimize in the next version... EVERY CYCLE
>COUNTS in this one. 

>tcopysprite(int x, int y, int width, int height, char *spr)
>{ asm("movl %3, %%ecx                   \n\t"
>      "movl %4, %%esi                   \n\t"
>"tcsl0:                                 \n\t"
>      "movl %1, %%eax                   \n\t"
>      "imul $352, %%eax                 \n\t"
>      "addl %0, %%eax                   \n\t"
>      "addl _virscr, %%eax              \n\t"
>      "movl %%eax, %%edi                \n\t"
>      "movl %2, %%edx                   \n\t"
>"drawloop:                              \n\t"
>      "lodsb                            \n\t"
>      "orb %%al, %%al                   \n\t"
>      "jz nodraw                        \n\t"
>      "stosb                            \n\t"
>      "decl %%edx                       \n\t"
>      "orl %%edx, %%edx                 \n\t"
>      "jz endline                       \n\t"
>      "jmp drawloop                     \n\t"
>"nodraw:                                \n\t"
>      "incl %%edi                       \n\t"
>      "decl %%edx                       \n\t"
>      "orl %%edx, %%edx                 \n\t"
>      "jnz drawloop                     \n\t"
>"endline:                               \n\t"
>      "incl %1                          \n\t"
>      "decl %%ecx                       \n\t"
>      "jnz tcsl0                        \n\t"
>      :
>      : "m" (x), "m" (y), "m" (width), "m" (height), "m" (spr)
>      : "eax","edx","esi","edi","ecx","cc" );
>}

Ok, a couple of ideas...
I think you should try to avoid that imul in the address calculation,
use shifts and adds instead, 352=256+64+32.
I haven't tested the following, and it may have to be rearranged to
avoid stalls (if you are targeting the Pentium processor):

mov eax,ypos
mov ebx,xpos
shl eax,5			; ypos*32
add ebx,eax
add eax,eax		;ypos*64
lea eax,[eax*4+eax]	;ypos*320 and we have already added ypos*32 to ebx
add ebx,eax		; now you only have to add the start of the buffer	

Now for the transparency...

Have you tried using a mask? If you had a mask you could avoid so many
conditional jumps. If you construct the mask while loading the
sprites, you could set the mask to 255 for transparent pixels and 0
for opaque ones (the sprite data itself must be 0 wherever it is
transparent, which it already is), then you could do the following:

mov esi,maskstart
mov edi,backg
mov eax,spritedata
mov ebx,[esi]			; 4 pixels at a time
mov ecx,[edi]
and ebx,ecx
mov edx,[eax]
or ebx,edx
mov[edi],ebx			;put back to screen


Again, I haven't tried this but you should be able to unroll this so
that you perhaps do a whole line at a time if your sprites are a fixed
size.


-Chris