Newsgroups: comp.os.msdos.djgpp
From: tob AT world DOT std DOT com
Subject: Stretcher code (LONG! 900+ lines) (Was: Re: Allegro & sprite stretching optimization)
Message-ID:
Sender: tob AT world DOT std DOT com (Tom Breton)
Reply-To: tob AT world DOT std DOT com
Organization: BREnterprises
References: <0ufufgARsrmzEwwy AT talula DOT demon DOT co DOT uk>
Date: Sun, 8 Jun 1997 16:55:03 GMT
Lines: 917
To: djgpp AT delorie DOT com
DJ-Gateway: from newsgroup comp.os.msdos.djgpp
Precedence: bulk

The following message contains what I wrote to speed up Allegro's
sprite-stretching.

It requires C++ with templates, which should be no problem for gcc
2.7.2, which I assume all Allegro users have.  It could be fairly
easily changed to templateless C++, and could be changed to C with
quite a bit of editing and a high tolerance for moving declarations to
the top of functions.

I have named the overall function do_stretch_blit_TB to avoid a name
collision with do_stretch_blit for the moment.  For your header:

extern "C" {
  extern void do_stretch_blit_TB(BITMAP *source, BITMAP *dest,
    int source_x, int source_y, int source_width, int source_height,
    int dest_x, int dest_y, int dest_width, int dest_height,
    int masked);
}

make_stretcher and _do_stretch are unchanged.

I also wrote an open version of this, but haven't tested it.  It will
require some patching, I'm sure.
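To show what a call looks like, here is a minimal test sketch (mine,
dashed off for illustration and not part of the patch; it assumes mode
13h via GFX_VGA and the usual Allegro setup calls):

#include "allegro.h"

extern "C" void do_stretch_blit_TB(BITMAP *source, BITMAP *dest,
  int source_x, int source_y, int source_width, int source_height,
  int dest_x, int dest_y, int dest_width, int dest_height, int masked);

int main()
{
  allegro_init();
  install_keyboard();
  set_gfx_mode(GFX_VGA, 320, 200, 0, 0);

  BITMAP *sprite = create_bitmap(32, 32);
  clear(sprite);                 /* color 0 == transparent when masked */
  circlefill(sprite, 16, 16, 12, 5);

  /* Scale the whole 32x32 sprite up to 64x64 at (100, 50); the final
     argument selects masked (sprite-style) vs. solid (blit-style). */
  do_stretch_blit_TB(sprite, screen, 0, 0, 32, 32, 100, 50, 64, 64, 1);

  readkey();
  destroy_bitmap(sprite);
  return 0;
}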
Files appended.

Tom

Mercy-mark:

/* ------------------- cachefnc.tpp --------------------- */

/*
  Function-cache template by Tom Breton.  Copyright 1997 by Tom Breton.
  Permission is hereby granted for use in the Allegro library.
*/

/*
  Purpose:

    To cache the results of expensive functions that are called
    multiple times with the same arguments.

  Design:

    There are two separate templates, cache_result and
    cache_result_lightweight.  Because the templates require parameters
    that are not passed to the functions, a situation Stroustrup &
    committee believed would never occur, both template functions are
    wrapped in template structs.  That means you instantiate the
    structure, not the function.

  Parameters:

    key_T is the key type, comprising the args to get_value.
    value_T is the value type, the type get_value gives us.
    hash_t should be an unsigned scalar type, generally unsigned char.
    hash_table_sz is the size of the hash table; get_hash must return
      values in the range 0 .. hash_table_sz - 1.
    max_saved_values should be a scalar value that fits in hash_t.

  Required member functions for key_T:

    get_value      The function that actually does the work.
                   Takes void and returns a value of type value_T.
    release_value  A function to release the value (generally, free a
                   pointer).  Takes value_T and returns void.
    get_hash       A function to hash the key-type to fit within the
                   table.  Takes void and returns int.
    operator==     A function to compare keys.
                   Takes key_T & and returns int.

  Usage:

    template struct  //Only for -fno-implicit-templates
    cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >;

    typedef
    cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >
    cache_stretcher;

    stretcher_args_t args
      ( sx, sxd, dest_x, dest_width, dest_is_linear, is_masked );

    return cache_stretcher::f( args );

  Rationale:

    We have to pass 4 functions, BUT gcc 2.7.2's template
    implementation isn't strong enough to do it directly, so we pass
    them as member-functions of "key", the type most likely to be
    constructed just for this template.
*/

/* Wrapper struct to satisfy C++ that we've given the template all
   parameters. */
template < class key_T, class value_T, class hash_t,
  int hash_table_sz, int max_saved_values >
struct cache_result
{
  static value_T f( key_T & key );
};

template < class key_T, class value_T, class hash_t,
  int hash_table_sz, int max_saved_values >
value_T
cache_result < key_T, value_T, hash_t, hash_table_sz, max_saved_values >
::f( key_T & key )
{
  /* Hash table to quickly find previous values. */
  const hash_t empty_index = 0;
  const hash_t first_index = 1;
  static hash_t hash_table[ hash_table_sz ] = { empty_index };
  static int hash_table_initted = 0;

  if( !hash_table_initted )
    {
      hash_table_initted = 1;
      for( int i = 0; i < hash_table_sz; i++ )
        {
          hash_table[ i ] = empty_index;
        }
    }

  struct saved_T
  {
    key_T   key;
    value_T value;
    hash_t  next_table_ix;
  };

  /* Previous invocations. */
  static saved_T cached_values[ max_saved_values ];

  /* How many previous values we have made, up to max. */
  static int num_cached_values = 0;

  /* Which previous value to kick out if necessary to make room for
     another. */
  static int ancient_cached_value = first_index;

  /* Because empty_index occupies 0 (otherwise we'd have a problem
     initializing the array) we look one step down. */
#define GET_ENTRY( X ) ( cached_values[ (X) - first_index ] )

  int table_ix = key.get_hash();

  /* Use the old result if possible. */
  for ( hash_t cached_value_ix = hash_table[ table_ix ];
        ( cached_value_ix != empty_index );
        cached_value_ix = GET_ENTRY( cached_value_ix ).next_table_ix )
    {
      if ( key == GET_ENTRY( cached_value_ix ).key )
        {
          return GET_ENTRY( cached_value_ix ).value;
        }
    }

  /* If we reach here, we didn't find an old result. */

  /* Figure out where our new result will go. */
  int target_index = empty_index;
  if( num_cached_values < max_saved_values )
    {
      target_index = num_cached_values + first_index;
      num_cached_values++;
    }
  else
    {
      target_index = ancient_cached_value;

      /* Release any resources held by the value. */
      key.release_value( GET_ENTRY( ancient_cached_value ).value );

      int ancient_table_ix =
        GET_ENTRY( ancient_cached_value ).key.get_hash();

      hash_t * p_cached_value_ix = &hash_table[ ancient_table_ix ];

      /* Find it... should almost always be immediate. */
      while( ( *p_cached_value_ix ) != ( ancient_cached_value ) )
        {
          assert( *p_cached_value_ix != empty_index );
          p_cached_value_ix =
            &GET_ENTRY( *p_cached_value_ix ).next_table_ix;
        }

      /* Remove it in a list-safe way. */
      *p_cached_value_ix =
        GET_ENTRY( ancient_cached_value ).next_table_ix;

      if( ancient_cached_value == ( max_saved_values - 1 + first_index ) )
        {
          ancient_cached_value = first_index;
        }
      else
        {
          ancient_cached_value++;
        }
    }

  assert( target_index != empty_index );

  saved_T * entry_to_be_filled = &GET_ENTRY( target_index );

  /* Fill up the new entry. */
  assert( entry_to_be_filled != 0 );
  entry_to_be_filled->key   = key;
  entry_to_be_filled->value = key.get_value();

  /* Add it to the hash table: push. */
  entry_to_be_filled->next_table_ix = hash_table[ table_ix ];
  hash_table[ table_ix ] = target_index;

#undef GET_ENTRY

  return entry_to_be_filled->value;
}

/*
  cache_result_lightweight

  Less ambitious version of cache_result.  Does not require hash_t nor
  a get_hash member function; otherwise invoked identically.

  Usage:

    template struct  //Only for -fno-implicit-templates
    cache_result_lightweight< stretcher_args_t, void *, 10 >;

    typedef
    cache_result_lightweight< stretcher_args_t, void *, 10 >
    cache_stretcher;

    stretcher_args_t args
      ( sx, sxd, dest_x, dest_width, dest_is_linear, is_masked );

    return cache_stretcher::f( args );
*/

/* Wrapper struct to satisfy C++ that we've given the template all
   parameters. */
template < class key_T, class value_T, int max_saved_values >
struct cache_result_lightweight
{
  static value_T f( key_T & key );
};

template < class key_T, class value_T, int max_saved_values >
value_T
cache_result_lightweight < key_T, value_T, max_saved_values >
::f ( key_T & key )
{
  struct saved_T
  {
    key_T   key;
    value_T value;
  };

  /* Previous values. */
  static saved_T cached_values[ max_saved_values ];

  /* How many previous values we have made, up to max. */
  static int num_cached_values = 0;

  /* Which previous value to kick out if necessary to make room for
     another. */
  static int ancient_cached_value = 0;

#define GET_ENTRY(X) ( cached_values[ (X) ] )

  /* Use the old result if possible. */
  for( int i = 0; i < num_cached_values; i++ )
    {
      if ( key == GET_ENTRY( i ).key )
        {
          return GET_ENTRY( i ).value;
        }
    }

  /* If we reach here, we didn't find an old result. */

  /* Figure out where the new result will go. */
  int target_index = -1;
  if( num_cached_values < max_saved_values )
    {
      target_index = num_cached_values;
      num_cached_values++;
    }
  else
    {
      target_index = ancient_cached_value;

      /* Release any resources held by the value. */
      key.release_value( GET_ENTRY( ancient_cached_value ).value );

      if( ancient_cached_value == ( max_saved_values - 1 ) )
        {
          ancient_cached_value = 0;
        }
      else
        {
          ancient_cached_value++;
        }
    }

  assert( target_index != -1 );

  saved_T * entry_to_be_filled = &GET_ENTRY( target_index );

  /* Fill up the new entry. */
  assert( entry_to_be_filled != 0 );
  entry_to_be_filled->key   = key;
  entry_to_be_filled->value = key.get_value();

#undef GET_ENTRY

  return entry_to_be_filled->value;
}
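To make the calling convention concrete, here is a throwaway example of
mine (not part of the library; squares_key_t, get_squares and the sizes
are made up) that caches malloc'ed tables of squares, keyed by table
size:

#include <assert.h>
#include <stdlib.h>
#include "cachefnc.tpp"

struct squares_key_t
{
  int n;

  /* The expensive function. */
  int * get_value( void )
  {
    int * t = (int *) malloc( n * sizeof( int ) );
    for( int i = 0; i < n; i++ ) { t[ i ] = i * i; }
    return t;
  }
  void release_value( int * t ) { free( t ); }
  int operator== ( squares_key_t & other ) { return n == other.n; }

  squares_key_t( int _n ) : n( _n ) {}
  squares_key_t( void ) {}  /* the template's saved array needs this */
};

template struct  //Only for -fno-implicit-templates
cache_result_lightweight< squares_key_t, int *, 4 >;

int * get_squares( int n )
{
  squares_key_t key( n );
  /* The first call for a given n computes the table; repeats are
     cache hits until more than 4 distinct sizes push it out. */
  return cache_result_lightweight< squares_key_t, int *, 4 >::f( key );
}

Note that key_T also needs a default constructor, since the templates
keep a static array of saved keys.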
/* ------------------- stretch.cc --------------------- */

/*
  More efficient bitmap/sprite stretcher.  All modifications by Tom
  Breton, borrowing from Allegro by Shawn Hargreaves.
*/

#if !defined __cplusplus
#error This has to be compiled as C++
#endif

/* (The original system-header names were lost in transit; this set is
   inferred from what the code uses -- assert() and free() -- plus the
   usual suspects.) */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>

#define alleg_mouse_unused
#define alleg_keyboard_unused
#define alleg_gui_unused
#define alleg_datafile_unused
#define alleg_fileIO_unused
#define alleg_fli_flc_unused
#define alleg_gfx_driver_unused
#define alleg_joystick_unused
#define fix_class_unused

#ifndef ALLEGRO_H
#include "allegro.h"
#endif

extern "C" {
#include "internal.h"
#include "opcodes.h"
}

const int fixed_scale = 0x10000;
const int num_planes  = 4;

#define ambitious_cache
#include "cachefnc.tpp"

extern "C" {
  extern void do_stretch_blit_TB(BITMAP *source, BITMAP *dest,
    int source_x, int source_y, int source_width, int source_height,
    int dest_x, int dest_y, int dest_width, int dest_height,
    int masked);
}

extern void* compile_stretcher
  ( fixed src_start, fixed src_ratio, int dest_x, int dest_width,
    int dest_is_linear, int is_masked );

typedef int stretch_hash_t;
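/* Note, with a worked example: Allegro's "fixed" is 16.16 fixed point,
   so itofix(1) == 0x10000 == fixed_scale.  Stretching a 100-pixel
   source line onto 200 destination pixels gives

       sxd = itofix(100) / 200 = 0x8000,

   i.e. the source pointer advances half a source pixel per destination
   pixel.  Only the fractional part of the source start,
   sx & (fixed_scale - 1), affects the code the stretcher-compiler
   emits, which is why the struct below chops its fields. */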
/* This doesn't care about the real source start; it only cares about
   the pixel-fraction induced by clipping.  Likewise it doesn't care
   about the real destination start, only what plane it starts on. */
struct stretcher_args_t
{
  stretch_hash_t hash;  //To quickly reject most cases.

  fixed src_start;
  fixed src_ratio;
  int   dest_x;
  int   dest_width;
  int   dest_is_linear;
  int   is_masked;

  /* Template support functions */
  void release_value( void * data ) { free( data ); };
  int get_hash( void )
  {
    /* Cast to unsigned so a negative hash can't index off the front
       of the table.  The 255 must match the hash_table_sz we
       instantiate with below. */
    return (int) ( (unsigned) hash % 255 );
  };
  void * get_value( void )
  {
    return
      compile_stretcher
      ( src_start, src_ratio, dest_x, dest_width, dest_is_linear,
        is_masked );
  };
  int operator== ( stretcher_args_t & other );

  /* Construction functions */
  int internal_hash
    ( fixed src_start, fixed src_ratio, int dest_x, int dest_width,
      int dest_is_linear, int is_masked );
  stretcher_args_t
    ( fixed src_start, fixed src_ratio, int dest_x, int dest_width,
      int dest_is_linear, int is_masked );
  stretcher_args_t( void ) {};
};

inline int
stretcher_args_t::internal_hash
  ( fixed src_start, fixed src_ratio, int dest_x, int dest_width,
    int dest_is_linear, int is_masked )
{
  /* I made this hash up off the cuff, so it could probably be
     improved.  Hash might not want to include dest_x, which is rarely
     discriminatory, nor sx, for the same reason. */
  return 0
    + ( src_start      << 0  )
    + ( src_ratio      << 10 )
    + ( dest_x         << 20 )
    + ( dest_width     << 22 )
    + ( dest_is_linear << 30 )
    + ( is_masked      << 31 );
}

inline int
stretcher_args_t::operator== ( stretcher_args_t & other )
{
  return
    ( ( hash           == other.hash           ) &&
      ( src_start      == other.src_start      ) &&
      ( src_ratio      == other.src_ratio      ) &&
      ( dest_x         == other.dest_x         ) &&
      ( dest_width     == other.dest_width     ) &&
      ( dest_is_linear == other.dest_is_linear ) &&
      ( is_masked      == other.is_masked      ) );
}

/* Chop some data so that we're not comparing on parts that won't be
   used. */
inline
stretcher_args_t::stretcher_args_t
  ( fixed _src_start, fixed _src_ratio, int _dest_x, int _dest_width,
    int _dest_is_linear, int _is_masked )
  :
  /* sx only uses its fractional part. */
  src_start     (_src_start & ( fixed_scale - 1 ) ),
  src_ratio     (_src_ratio ),
  /* dest_x only uses what plane it's on, and that only if unchained.
     (Chop against the argument, not the member, which isn't
     initialized yet.) */
  dest_x        (_dest_x &
                 ( _dest_is_linear ? 0 : ( num_planes - 1 ) ) ),
  dest_width    (_dest_width ),
  dest_is_linear(_dest_is_linear),
  is_masked     (_is_masked )
{
  /* Hash the chopped members.  This has to happen after they're set,
     so it can't go in the initializer list: hash is declared first
     and would otherwise be computed from garbage. */
  hash = internal_hash( src_start, src_ratio, dest_x, dest_width,
                        dest_is_linear, is_masked );
};

#if defined ambitious_cache

template struct  /* For -fno-implicit-templates */
cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >;

typedef
cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >
cache_stretcher;

#else

template struct  /* For -fno-implicit-templates */
cache_result_lightweight< stretcher_args_t, void *, 10 >;

typedef
cache_result_lightweight< stretcher_args_t, void *, 10 >
cache_stretcher;

#endif

inline void *
get_compiled_stretcher
  ( fixed sx, fixed sxd, int dest_x, int dest_width,
    int dest_is_linear, int is_masked )
{
  stretcher_args_t args
    ( sx, sxd, dest_x, dest_width, dest_is_linear, is_masked );

  return cache_stretcher::f( args );
}
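/* Note, with an example of what the chopping buys: on a linear bitmap
   a sprite stretched at dest_x = 3 and again at dest_x = 103 shares
   one compiled stretcher, because dest_x is masked to 0 when
   dest_is_linear, so the two keys hash and compare equal (given the
   same widths, ratio and source fraction).  On a mode-X bitmap only
   dest_x & 3 -- the starting plane -- is kept. */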
/* do_stretch_blit:
 *  Like blit(), except it can scale images so the source and
 *  destination rectangles don't need to be the same size.  This
 *  routine doesn't do as much safety checking as the regular blit:
 *  in particular you must take care not to copy from areas outside
 *  the source bitmap, and you cannot blit between overlapping
 *  regions, ie. you must use different bitmaps for the source and the
 *  destination.  This function can draw onto both linear and mode-X
 *  bitmaps.
 *
 *  This routine does some very dodgy stuff.  It dynamically generates
 *  a chunk of machine code to scale a line of the bitmap, and then
 *  calls this.  I just _had_ to use self modifying code _somewhere_
 *  in Allegro :-)
 *
 *  [Shawn's comments - TB]
 */
void do_stretch_blit_TB(BITMAP *source, BITMAP *dest,
  int source_x, int source_y, int source_width, int source_height,
  int dest_x, int dest_y, int dest_width, int dest_height, int masked)
{
  fixed sx, sy, sxd, syd;

  /* trivial reject for zero sizes */
  if ((source_width <= 0) || (source_height <= 0) ||
      (dest_width <= 0) || (dest_height <= 0)) {
    return;
  }

  /* convert to fixed point */
  sx = itofix(source_x);
  sy = itofix(source_y);

  /* calculate delta values */
  sxd = itofix(source_width) / dest_width;
  syd = itofix(source_height) / dest_height;

  /* clip? */
  if (dest->clip) {
#if 1
    /* Couldn't resist streamlining this.  Logic is the same as
       before. */
    const int x_overshoot = dest->cl - dest_x;
    if( x_overshoot > 0 ) {
      dest_x     += x_overshoot;
      dest_width -= x_overshoot;
      sx         += ( sxd * x_overshoot );
    }
#else
    while (dest_x < dest->cl) {
      dest_x++;
      dest_width--;
      sx += sxd;
    }
#endif

#if 1
    /* Couldn't resist streamlining this.  Logic is the same as
       before. */
    const int y_overshoot = dest->ct - dest_y;
    if( y_overshoot > 0 ) {
      dest_y      += y_overshoot;
      dest_height -= y_overshoot;
      sy          += ( syd * y_overshoot );
    }
#else
    while (dest_y < dest->ct) {
      dest_y++;
      dest_height--;
      sy += syd;
    }
#endif

    if (dest_x+dest_width > dest->cr)
      dest_width = dest->cr - dest_x;

    if (dest_y+dest_height > dest->cb)
      dest_height = dest->cb - dest_y;

    if ((dest_width <= 0) || (dest_height <= 0))
      return;
  }

  {
    void * routine =
      get_compiled_stretcher
      ( sx, sxd, dest_x, dest_width, is_linear_bitmap(dest), masked );

    /* call the stretcher function for each line */
    _do_stretch(source, dest, routine, sx>>16, sy, syd,
                dest_x, dest_y, dest_height);
  }
}

/* Unchanged - TB */
/* make_stretcher:
 *  Helper function for stretch_blit().  Builds a machine code
 *  stretching routine in the scratch memory area.
 */
static int make_stretcher(int compiler_pos, fixed sx, fixed sxd,
                          int dest_width, int masked)
{
  int x, x2;
  int c;

  if (dest_width > 0) {
    if (sxd == itofix(1)) {           /* easy case for 1 -> 1 scaling */
      if (masked) {
        for (c=0; c<dest_width; c++) {
          COMPILER_LODSB();
          COMPILER_MASKED_STOSB();
        }
      }
      else {
        COMPILER_MOV_ECX(dest_width);
        COMPILER_REP_MOVSB();
      }
    }
    else if (sxd > itofix(1)) {       /* big -> little scaling */
      for (x=0; x<dest_width; x++) {
        COMPILER_LODSB();
        if (masked) {
          COMPILER_MASKED_STOSB();
        }
        else {
          COMPILER_STOSB();
        }
        x2 = (sx >> 16) + 1;
        sx += sxd;
        x2 = (sx >> 16) - x2;
        if (x2 > 1) {
          COMPILER_ADD_ESI(x2);
        }
        else if (x2 == 1) {
          COMPILER_INC_ESI();
        }
      }
    }
    else {                            /* little -> big scaling */
      x2 = sx >> 16;
      COMPILER_LODSB();
      for (x=0; x<dest_width; x++) {
        if (masked) {
          COMPILER_MASKED_STOSB();
        }
        else {
          COMPILER_STOSB();
        }
        sx += sxd;
        if ((sx >> 16) > x2) {
          COMPILER_LODSB();
          x2++;
        }
      }
    }
  }

  return compiler_pos;
}

void* compile_stretcher
  ( fixed sx, fixed sxd, int dest_x, int dest_width,
    int dest_is_linear, int is_masked )
{
  int compiler_pos = 0;
  int plane;
  int d;

  /* Save old scratch memory, start from 0. */
  void * old_scratch_mem      = _scratch_mem;
  int    old_scratch_mem_size = _scratch_mem_size;
  _scratch_mem      = 0;
  _scratch_mem_size = 0;

  if ( dest_is_linear ) {
    /* build a simple linear stretcher */
    compiler_pos = make_stretcher(0, sx, sxd, dest_width, is_masked );
  }
  else {
    /* build four stretchers, one for each mode-X plane */
    for (plane=0; plane<4; plane++) {
      COMPILER_PUSH_ESI();
      COMPILER_PUSH_EDI();

      COMPILER_MOV_EAX((0x100<<((dest_x+plane)&3))|2);
      COMPILER_MOV_EDX(0x3C4);
      COMPILER_OUTW();

      compiler_pos = make_stretcher(compiler_pos, sx+sxd*plane, sxd<<2,
                                    (dest_width-plane+3)>>2,
                                    is_masked );

      COMPILER_POP_EDI();
      COMPILER_POP_ESI();

      if (((dest_x+plane) & 3) == 3) {
        COMPILER_INC_EDI();
      }

      d = ((sx+sxd*(plane+1))>>16) - ((sx+sxd*plane)>>16);
      if (d > 0) {
        COMPILER_ADD_ESI(d);
      }
    }

    dest_x >>= 2;
  }

  COMPILER_RET();

  void * routine = _scratch_mem;

  /* Pop original scratch memory. */
  _scratch_mem      = old_scratch_mem;
  _scratch_mem_size = old_scratch_mem_size;

  return routine;
}
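In case the "streamlined" clipping above makes anyone nervous, here is
a throwaway equivalence check of mine (check_clip_equivalence is a
made-up name, not part of the patch) showing the closed form does
exactly what the old one-pixel-at-a-time loop did:

#include <assert.h>
#include "allegro.h"   /* for the fixed typedef */

static void check_clip_equivalence(int cl, int dest_x, int dest_width,
                                   fixed sx, fixed sxd)
{
  int   loop_x  = dest_x;
  int   loop_w  = dest_width;
  fixed loop_sx = sx;

  /* The old way: creep up to the clip line one pixel at a time. */
  while (loop_x < cl) {
    loop_x++;
    loop_w--;
    loop_sx += sxd;
  }

  /* The new way: one closed-form step. */
  const int x_overshoot = cl - dest_x;
  if (x_overshoot > 0) {
    dest_x     += x_overshoot;
    dest_width -= x_overshoot;
    sx         += (sxd * x_overshoot);
  }

  assert(loop_x == dest_x && loop_w == dest_width && loop_sx == sx);
}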
/* ------------------- stretch1.cc --------------------- */

/*
  A replacement bitmap/sprite stretcher for greater control and
  efficiency.  All modifications by Tom Breton, borrowing from Allegro
  by Shawn Hargreaves.

  This code is compiled but untested.  Use it with care, and don't
  tell me "IT DOESN'T WORK!", because I never said it would.

  Use this as a replacement for most of stretch.cc, the replaced
  section starting where the headers end and continuing down to the
  end of do_stretch_blit_TB.  In other words, I left out everything
  that just mirrored stretch.cc.
*/

/*
  A struct to hold the intermediate state.  Yeah, it does expose part
  of the API, but it can be treated as a cookie.  In other words, if
  the contents of stretch_blit_intermed_t ever change, the user can
  remain ignorant of the fact.

  The user must understand that the source bitmap can be retargeted
  WRT the target bitmap, but:

    it shouldn't move WRT planes in a planar mode,
    it shouldn't move across clipping lines,
    it might not want to move WRT horizontal subpixels.

  The post-routine ignores some values; the pre-routine ignores
  others.

  Design:

    All args are passed to both routines, which may be a reason to
    pass 'em indirectly, but we'll see.  masked is only relevant to
    the first one, though.  If it were relevant to the other, we'd
    store it.

    It would be possible to rearrange this so that it works both ways,
    with cache and with manual control.  stretcher_args_t::get_value
    would have to be rewritten to call get_stretch_blit_intermed_t.
    do_stretch_blit would be rewritten to call that cached version of
    get_stretch_blit_intermed_t and then to call
    use_stretch_blit_intermed_t.
*/

struct stretch_blit_intermed_t
{
  void * routine;
};

const stretch_blit_intermed_t sbi_NULL = { 0, };

stretch_blit_intermed_t
get_stretch_blit_intermed_t (BITMAP *source, BITMAP *dest,
  int source_x, int source_y, int source_width, int source_height,
  int dest_x, int dest_y, int dest_width, int dest_height,
  int masked )
{
  fixed sx, sxd;

  /* trivial reject for zero sizes */
  if ((source_width <= 0) || (dest_width <= 0) ) {
    /* Should compile an instant return instead, though. */
    return sbi_NULL;
  }

  /* convert to fixed point */
  sx = itofix(source_x);

  /* calculate delta values */
  sxd = itofix(source_width) / dest_width;

  /* clip? */
  if (dest->clip) {
#if 1
    /* Couldn't resist streamlining this.  Logic is the same as
       before. */
    const int x_overshoot = dest->cl - dest_x;
    if( x_overshoot > 0 ) {
      dest_x     += x_overshoot;
      dest_width -= x_overshoot;
      sx         += ( sxd * x_overshoot );
    }
#else
    while (dest_x < dest->cl) {
      dest_x++;
      dest_width--;
      sx += sxd;
    }
#endif

    if (dest_x+dest_width > dest->cr)
      dest_width = dest->cr - dest_x;

    if (dest_width <= 0) {
      return sbi_NULL;
    }
  }

  void * routine =
    compile_stretcher
    ( sx, sxd, dest_x, dest_width, is_linear_bitmap(dest), masked );

  stretch_blit_intermed_t ret = { routine, };
  return ret;
}
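/* Note on the intended calling order: one get_stretch_blit_intermed_t,
   then any number of use_stretch_blit_intermed_t calls with the same
   cookie.  Nothing here ever frees the compiled routine, so when
   you're done, free(intermed.routine) yourself -- the same way
   stretcher_args_t::release_value does in stretch.cc. */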
void use_stretch_blit_intermed_t(BITMAP *source, BITMAP *dest,
  int source_x, int source_y, int source_width, int source_height,
  int dest_x, int dest_y, int dest_width, int dest_height,
  stretch_blit_intermed_t intermed )
{
  fixed sy, syd;

  /* trivial reject for zero sizes */
  if ((source_height <= 0) || (dest_height <= 0)) {
    return;
  }

  /* convert to fixed point */
  sy = itofix(source_y);

  /* calculate delta values */
  syd = itofix(source_height) / dest_height;

  /* clip? */
  if (dest->clip) {
#if 1
    /* Couldn't resist streamlining this.  Logic is the same as
       before. */
    const int y_overshoot = dest->ct - dest_y;
    if( y_overshoot > 0 ) {
      dest_y      += y_overshoot;
      dest_height -= y_overshoot;
      sy          += ( syd * y_overshoot );
    }
#else
    while (dest_y < dest->ct) {
      dest_y++;
      dest_height--;
      sy += syd;
    }
#endif

    if (dest_y+dest_height > dest->cb) {
      dest_height = dest->cb - dest_y;
    }

    if (dest_height <= 0) {
      return;
    }
  }

  /* call the stretcher function for each line */
  /* (source_x is already in pixels here, unlike the fixed-point sx in
     do_stretch_blit_TB, so it must not be shifted down by 16.  Note
     that any integer x-shift induced by clipping in the pre-routine
     is still unaccounted for -- part of the patching mentioned
     above.) */
  _do_stretch(source, dest, intermed.routine, source_x, sy, syd,
              dest_x, dest_y, dest_height);
}
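For completeness, this is roughly how I'd expect the manual-control API
to be driven -- a sketch only, as untested as stretch1.cc itself;
slide_stretched_sprite and the numbers are made up:

void slide_stretched_sprite( BITMAP *sprite )
{
  /* Compile the horizontal stretcher once... */
  stretch_blit_intermed_t cookie =
    get_stretch_blit_intermed_t( sprite, screen, 0, 0, 32, 32,
                                 100, 0, 64, 64, 1 );

  if( cookie.routine == 0 )
    return;

  /* ...then reuse it while the sprite only moves vertically, which
     changes neither its plane nor its horizontal subpixel. */
  for( int y = 0; y < 100; y++ ) {
    use_stretch_blit_intermed_t( sprite, screen, 0, 0, 32, 32,
                                 100, y, 64, 64, cookie );
  }

  /* Nothing frees the compiled routine for us. */
  free( cookie.routine );
}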