Mail Archives: djgpp/1997/06/08/16:19:00
The following message contains what I wrote to speed up Allegro's
sprite-stretching.
It requires C++ with templates, which should be no problem for gcc
2.7.2, which I assume all Allegro-users have. It could be fairly easily
changed to templateless C++, and could be changed to C with quite a bit
of editing and a high tolerance for moving declarations to the top of
functions.
I have named the overall function do_stretch_blit_TB to avoid
name-collision with do_stretch_blit for the moment. For your header:
extern "C"
{
extern void do_stretch_blit_TB(BITMAP *source, BITMAP *dest, int source_x, int source_y, int source_width, int source_height, int dest_x, int dest_y, int dest_width, int dest_height, int masked);
}
make_stretcher and _do_stretch are unchanged.
I also wrote an open version of this, but haven't tested it. It will
require some patching, I'm sure. Files appended.
Tom
Mercy-mark:
/* ------------------- cachefnc.tpp --------------------- */
/* Function-cache template by Tom Breton. Copyright 1997 by Tom Breton.
Permission is hereby granted for use in the Allegro library. */
/* Purpose: To cache the result of expensive functions that are called
multiple times with the same arguments.
Design:
There are two separate templates, cache_result and
cache_result_lightweight.
Because the templates require parameters that are not passed to
the functions, a situation Stroustrup & committee believed would
never occur, both template functions are wrapped in template
structs. That means you instantiate the structure, not the
function.
Parameters:
key_T is the key type, comprising the args to get_value
value_T is the value type, the type get_value gives us
hash_t should be an unsigned scalar type, generally unsigned char.
hash_table_sz is the number of buckets in the hash table.
max_saved_values should be a scalar value that fits in hash_t.
Required member functions for key_T:
get_value The function that actually does the work.
Takes void and returns a value of type value_T
release_value A function to release the value (generally, free a pointer)
Takes value_T and returns void
get_hash A function to hash the key-type to fit within the table
Takes void and returns int
operator== A function to compare keys:
Takes key_T & and returns int
Usage:
template struct //Only for -fno-implicit-templates
cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >;
typedef
cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >
cache_stretcher;
stretcher_args_t
args ( sx, sxd, dest_x, dest_width, dest_is_linear, is_masked );
return cache_stretcher::f( args );
Rationale:
We have to pass 4 functions, BUT gcc 2.7.2's template
implementation isn't strong enough to do it directly, so we pass
them as member-functions of "key", the type most likely to be
constructed just for this template.
*/
/* Wrapper struct.  gcc 2.7.2 cannot handle template parameters that
   never appear in a function's argument list, so the cached-function
   entry point lives as a static member of this class template: the
   caller instantiates the struct and invokes cache_result<...>::f. */
template <
    class key_T,           /* bundled arguments; see required members above */
    class value_T,         /* what key_T::get_value() produces */
    class hash_t,          /* unsigned scalar used for table indices */
    int hash_table_sz,     /* bucket count of the hash table */
    int max_saved_values   /* capacity of the value cache */
>
struct cache_result
{
    /* Return the (possibly cached) value for `key'. */
    static value_T f( key_T & key );
};
template <
class key_T,
class value_T,
class hash_t,
int hash_table_sz,
int max_saved_values
>
value_T
cache_result < key_T, value_T, hash_t, hash_table_sz, max_saved_values >
::f( key_T & key )
{
/* Return the cached value for `key', calling key.get_value() on a
   miss.  Entries live in a fixed-size static pool and are located
   through a chained hash table whose chain links are stored inside
   the entries themselves; when the pool is full, the oldest entry is
   evicted round-robin. */
/* Hash table to quickly find previous values.  Each bucket holds the
   1-based pool index of the first entry in its chain, or empty_index
   (0) for an empty chain. */
const hash_t empty_index = 0;
const hash_t first_index = 1;
static hash_t hash_table[ hash_table_sz ] = { empty_index };
static int hash_table_initted = 0;
/* NOTE(review): the aggregate initializer above should already zero
   the array; this run-time loop is presumably a workaround for
   template static-data initialization quirks in gcc 2.7.2 -- confirm. */
if( !hash_table_initted )
{
hash_table_initted = 1;
for( int i = 0; i < hash_table_sz; i++ )
{ hash_table[ i ] = empty_index; }
}
/* A cached entry: key, its value, and the link to the next entry in
   the same hash chain. */
struct saved_T { key_T key; value_T value; hash_t next_table_ix; };
/* Previous invocations. */
static saved_T cached_values[ max_saved_values ];
/* How many previous values we have made, up to max. */
static int num_cached_values = 0;
/* Which previous value to kick out if necessary to make room
for another. */
static int ancient_cached_value = first_index;
/* Because empty_index occupies 0 (Otherwise we'd have a problem
initializing the array) we look one step down. */
#define GET_ENTRY( X ) ( cached_values[ (X) - first_index ] )
/* NOTE(review): get_hash() must return a value in [0, hash_table_sz);
   a negative or oversized result would index outside hash_table. */
int table_ix = key.get_hash();
/* use the old result if possible: walk this bucket's chain. */
for (
hash_t cached_value_ix = hash_table[ table_ix ];
( cached_value_ix != empty_index );
cached_value_ix = GET_ENTRY( cached_value_ix ).next_table_ix
)
{
if ( key == GET_ENTRY( cached_value_ix ).key )
{ return GET_ENTRY( cached_value_ix ).value; }
}
/* If we reach here, we didn't find an old result. */
/* Figure out where our new result will go. */
int target_index = empty_index;
if( num_cached_values < max_saved_values )
{
/* Pool not yet full: take the next fresh slot. */
target_index = num_cached_values + first_index;
num_cached_values++;
}
else
{
/* Pool full: recycle the oldest entry. */
target_index = ancient_cached_value;
/* Release any resources held by the value. */
key.release_value( GET_ENTRY( ancient_cached_value ).value );
/* Unlink the recycled entry from the chain of its OLD key's bucket. */
int ancient_table_ix = GET_ENTRY( ancient_cached_value ).key.get_hash();
hash_t * p_cached_value_ix = &hash_table[ ancient_table_ix ];
/* Find it... should almost always be immediate. */
while( ( *p_cached_value_ix ) != ( ancient_cached_value ) )
{
assert( *p_cached_value_ix != empty_index );
p_cached_value_ix = &GET_ENTRY( *p_cached_value_ix ).next_table_ix;
}
/* Remove it in a list-safe way. */
*p_cached_value_ix = GET_ENTRY( ancient_cached_value ).next_table_ix;
/* Advance the round-robin eviction cursor, wrapping past the last
   slot back to the first. */
if( ancient_cached_value == ( max_saved_values - 1 + first_index ) )
{
ancient_cached_value = first_index;
}
else
{
ancient_cached_value++;
}
}
assert( target_index != empty_index );
saved_T * entry_to_be_filled = &GET_ENTRY( target_index );
/* Fill up the new entry. */
assert( entry_to_be_filled != 0 );
entry_to_be_filled->key = key;
/* The expensive call this cache exists to avoid. */
entry_to_be_filled->value = key.get_value();
/* Add it to the hash table: Push onto the front of its bucket's chain. */
entry_to_be_filled->next_table_ix = hash_table[ table_ix ];
hash_table[ table_ix ] = target_index;
#undef GET_ENTRY
return entry_to_be_filled->value;
}
/* cache_result_lightweight
Less ambitious version of cache_result. Does not require hash_t nor
a hash_key member function, otherwise invoked identically.
Usage:
template struct //Only for -fno-implicit-templates
cache_result_lightweight< stretcher_args_t, void *, 10 >;
typedef
cache_result_lightweight< stretcher_args_t, void *, 10 >
cache_stretcher;
stretcher_args_t
args ( sx, sxd, dest_x, dest_width, dest_is_linear, is_masked );
return cache_stretcher::f( args );
*/
/* Wrapper struct for the lightweight cache -- same workaround as for
   cache_result: the extra template parameter never appears in the
   function arguments, so the entry point is a static member. */
template <
    class key_T,           /* bundled arguments */
    class value_T,         /* what key_T::get_value() produces */
    int max_saved_values   /* capacity of the value cache */
>
struct cache_result_lightweight
{
    /* Return the (possibly cached) value for `key', by linear scan. */
    static value_T f( key_T & key );
};
template <
class key_T,
class value_T,
int max_saved_values
>
value_T
cache_result_lightweight
< key_T, value_T, max_saved_values >
::f ( key_T & key )
{
    /* Return the cached value for `key', calling key.get_value() on a
       miss.  Lookup is a plain linear scan; when the pool is full the
       oldest slot is recycled round-robin. */
    /* A cached (key, value) pair. */
    struct saved_T { key_T key; value_T value; };
    /* Previously computed results, filled in order of first use. */
    static saved_T slots[ max_saved_values ];
    /* Number of slots currently in use. */
    static int used = 0;
    /* Round-robin victim: the slot recycled next time we are full. */
    static int victim = 0;

    /* Use an old result if possible. */
    for( int scan = 0; scan < used; scan++ )
    {
        if ( key == slots[ scan ].key )
        {
            return slots[ scan ].value;
        }
    }

    /* Cache miss: choose where the new result will go. */
    int slot_ix;
    if( used < max_saved_values )
    {
        slot_ix = used;
        used++;
    }
    else
    {
        slot_ix = victim;
        /* Let go of whatever the evicted value owned. */
        key.release_value( slots[ victim ].value );
        /* Advance the eviction cursor, wrapping at the end. */
        victim = ( victim == max_saved_values - 1 ) ? 0 : victim + 1;
    }

    /* Fill up the new entry. */
    saved_T & entry = slots[ slot_ix ];
    entry.key = key;
    entry.value = key.get_value();
    return entry.value;
}
/* ------------------- stretch.cc --------------------- */
/* More efficient bitmap/sprite stretcher. All modifications by Tom
Breton, borrowing from Allegro by Shawn Hargreaves. */
#if !defined __cplusplus
#error This has to be compiled as C++
#endif
#include <stdlib.h>
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#define alleg_mouse_unused
#define alleg_keyboard_unused
#define alleg_gui_unused
#define alleg_datafile_unused
#define alleg_fileIO_unused
#define alleg_fli_flc_unused
#define alleg_gfx_driver_unused
#define alleg_joystick_unused
#define fix_class_unused
#ifndef ALLEGRO_H
#include "allegro.h"
#endif
extern "C"
{
#include "internal.h"
#include "opcodes.h"
}
const int fixed_scale = 0x10000;
const int num_planes = 4;
#define ambitious_cache
#include "cachefnc.tpp"
extern "C"
{
extern void do_stretch_blit_TB(BITMAP *source, BITMAP *dest, int source_x, int source_y, int source_width, int source_height, int dest_x, int dest_y, int dest_width, int dest_height, int masked);
}
extern void* compile_stretcher
(
fixed src_start,
fixed src_ratio,
int dest_x,
int dest_width,
int dest_is_linear,
int is_masked
);
typedef int stretch_hash_t;
/* Bundled arguments for compile_stretcher, used as the cache key.
   This doesn't care about the real source start, it only cares about
   the pixel-fraction induced by clipping, and it doesn't care about
   the real destination start, it only cares about what plane it starts
   on.
*/
struct
stretcher_args_t
{
  stretch_hash_t hash;  /* Precomputed: to quickly reject most cases. */
  fixed src_start;      /* Fractional part of the source x start. */
  fixed src_ratio;      /* Source step per destination pixel. */
  int dest_x;           /* Start plane only (mode-X); 0 when linear. */
  int dest_width;
  int dest_is_linear;
  int is_masked;
  /* Template support functions (the members cache_result requires). */
  /* Free a value produced by get_value. */
  void release_value( void * data ) { free( data ); }
  /* Reduce the precomputed hash to a table index in [0,255).
     The cast to unsigned matters: `hash' is a signed int and
     internal_hash can produce negative values, and a negative result
     of `%' here would index outside the 255-entry hash table. */
  int get_hash( void ) { return (int)( (unsigned) hash % 255 ); }
  /* The expensive call being cached: compile a stretcher routine.
     The returned buffer is owned by the cache and freed with free(). */
  void * get_value( void )
  {
    return compile_stretcher
      ( src_start, src_ratio, dest_x, dest_width, dest_is_linear, is_masked );
  }
  int operator== ( stretcher_args_t & other );
  /* Construction functions */
  int internal_hash
  (
    fixed src_start,
    fixed src_ratio,
    int dest_x,
    int dest_width,
    int dest_is_linear,
    int is_masked
  );
  stretcher_args_t
  (
    fixed src_start,
    fixed src_ratio,
    int dest_x,
    int dest_width,
    int dest_is_linear,
    int is_masked
  );
  stretcher_args_t( void ) {}
};
/* Combine the (already chopped) argument fields into one int.
   Arithmetic is done in unsigned so that the left shifts into / past
   the sign bit and the wrap-around additions are well defined
   (shifting a signed int by 30 or 31 is undefined behaviour); the
   resulting bit pattern is unchanged on two's-complement machines. */
inline
int
stretcher_args_t::
internal_hash
(
fixed src_start,
fixed src_ratio,
int dest_x,
int dest_width,
int dest_is_linear,
int is_masked
)
{
  /* I made this hash up off the cuff, so it could probably be
     improved. Hash might not want to include dest_x which is rarely
     discriminatory, nor sx, same reason. */
  return (int)
    ( 0u
    + ( (unsigned) src_start << 0 )
    + ( (unsigned) src_ratio << 10 )
    + ( (unsigned) dest_x << 20 )
    + ( (unsigned) dest_width << 22 )
    + ( (unsigned) dest_is_linear << 30 )
    + ( (unsigned) is_masked << 31 )
    );
}
/* Two argument bundles match only when every field is equal.  The
   precomputed hash is compared first as a cheap early-out filter. */
inline
int
stretcher_args_t::
operator== ( stretcher_args_t & other )
{
  if( hash != other.hash )
    return 0;
  return ( src_start == other.src_start )
      && ( src_ratio == other.src_ratio )
      && ( dest_x == other.dest_x )
      && ( dest_width == other.dest_width )
      && ( dest_is_linear == other.dest_is_linear )
      && ( is_masked == other.is_masked );
}
/* Chop some data so that we're not comparing on parts that won't be
   used, then compute the hash over the chopped values.

   BUG FIX: the original initializer list computed
   hash( internal_hash( src_start, ... ) ) from the *members*, which
   are uninitialized at that point (`hash' is declared first, so it is
   initialized before any of them), and the dest_x initializer read
   the member `dest_is_linear' before it was initialized.  We now
   initialize all data members from the `_'-prefixed parameters and
   compute the hash in the body, after every member is set. */
inline
stretcher_args_t::
stretcher_args_t
(
fixed _src_start,
fixed _src_ratio,
int _dest_x,
int _dest_width,
int _dest_is_linear,
int _is_masked
)
: /* sx only uses its fractional part. */
  src_start (_src_start & ( fixed_scale - 1 ) ),
  src_ratio (_src_ratio ),
  /* dest_x only uses what plane it's on, and that only if unchained.
     Note: test the parameter, not the (not-yet-initialized) member. */
  dest_x (_dest_x & ( _dest_is_linear ? 0 : ( num_planes - 1 ) ) ),
  dest_width (_dest_width ),
  dest_is_linear(_dest_is_linear),
  is_masked (_is_masked )
{
  /* Hash the same chopped values operator== compares. */
  hash = internal_hash( src_start, src_ratio, dest_x, dest_width,
                        dest_is_linear, is_masked );
}
#if defined ambitious_cache
template struct /* For -fno-implicit-templates */
cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >;
typedef
cache_result< stretcher_args_t, void *, unsigned char, 255, 10 >
cache_stretcher;
#else
template struct /* For -fno-implicit-templates */
cache_result_lightweight< stretcher_args_t, void *, 10 >;
typedef
cache_result_lightweight< stretcher_args_t, void *, 10 >
cache_stretcher;
#endif
/* Front end for the stretcher cache: bundle the arguments into a key
   and let the cache hand back a previously compiled routine, or
   compile (and remember) a new one. */
inline void *
get_compiled_stretcher
(
fixed sx,
fixed sxd,
int dest_x,
int dest_width,
int dest_is_linear,
int is_masked
)
{
    stretcher_args_t key( sx, sxd, dest_x, dest_width,
                          dest_is_linear, is_masked );
    return cache_stretcher::f( key );
}
/* do_stretch_blit:
 *  Like blit(), except it can scale images so the source and destination
 *  rectangles don't need to be the same size. This routine doesn't do as
 *  much safety checking as the regular blit: in particular you must take
 *  care not to copy from areas outside the source bitmap, and you cannot
 *  blit between overlapping regions, ie. you must use different bitmaps for
 *  the source and the destination. This function can draw onto both linear
 *  and mode-X bitmaps.
 *
 *  This routine does some very dodgy stuff. It dynamically generates a
 *  chunk of machine code to scale a line of the bitmap, and then calls this.
 *  I just _had_ to use self modifying code _somewhere_ in Allegro :-)
 *  [Shawn's comments - TB]
 */
void do_stretch_blit_TB(BITMAP *source, BITMAP *dest, int source_x, int source_y, int source_width, int source_height, int dest_x, int dest_y, int dest_width, int dest_height, int masked)
{
    /* trivial reject for zero sizes */
    if ((source_width <= 0) || (source_height <= 0) ||
        (dest_width <= 0) || (dest_height <= 0))
        return;

    /* convert to fixed point */
    fixed sx = itofix(source_x);
    fixed sy = itofix(source_y);

    /* calculate delta values (source step per destination pixel) */
    fixed sxd = itofix(source_width) / dest_width;
    fixed syd = itofix(source_height) / dest_height;

    /* clip against the destination bitmap */
    if (dest->clip) {
        /* Left edge: trim the destination and advance the source
           position by the matching fixed-point amount. */
        const int left_trim = dest->cl - dest_x;
        if (left_trim > 0) {
            dest_x += left_trim;
            dest_width -= left_trim;
            sx += sxd * left_trim;
        }
        /* Top edge, same idea vertically. */
        const int top_trim = dest->ct - dest_y;
        if (top_trim > 0) {
            dest_y += top_trim;
            dest_height -= top_trim;
            sy += syd * top_trim;
        }
        /* Right and bottom edges just shrink the destination. */
        if (dest_x + dest_width > dest->cr)
            dest_width = dest->cr - dest_x;
        if (dest_y + dest_height > dest->cb)
            dest_height = dest->cb - dest_y;
        if ((dest_width <= 0) || (dest_height <= 0))
            return;
    }

    /* Fetch (or build and cache) the machine-code line stretcher. */
    void * routine =
        get_compiled_stretcher(sx, sxd, dest_x, dest_width,
                               is_linear_bitmap(dest), masked);

    /* call the stretcher function for each line */
    _do_stretch(source, dest, routine, sx>>16, sy, syd, dest_x, dest_y,
                dest_height);
}
/* Unchanged - TB */
/* make_stretcher:
*  Helper function for stretch_blit(). Builds a machine code stretching
*  routine in the scratch memory area.
*
*  NOTE(review): the COMPILER_* macros come from opcodes.h; they
*  presumably emit opcode bytes into the scratch area and advance
*  compiler_pos -- confirm against opcodes.h.
*/
static int make_stretcher(int compiler_pos, fixed sx, fixed sxd, int dest_width, int masked)
{
int x, x2;
int c;
if (dest_width > 0) {
if (sxd == itofix(1)) { /* easy case for 1 -> 1 scaling */
if (masked) {
/* Masked copy must go byte by byte so zero pixels can be skipped. */
for (c=0; c<dest_width; c++) {
COMPILER_LODSB();
COMPILER_MASKED_STOSB();
}
}
else {
/* Unmasked 1:1 collapses to a single rep movsb. */
COMPILER_MOV_ECX(dest_width);
COMPILER_REP_MOVSB();
}
}
else if (sxd > itofix(1)) { /* big -> little scaling */
for (x=0; x<dest_width; x++) {
COMPILER_LODSB();
if (masked) {
COMPILER_MASKED_STOSB();
}
else {
COMPILER_STOSB();
}
/* Count how many whole source pixels this destination pixel
   consumed, and emit a skip over the ones we won't draw. */
x2 = (sx >> 16) + 1;
sx += sxd;
x2 = (sx >> 16) - x2;
if (x2 > 1) {
COMPILER_ADD_ESI(x2);
}
else if (x2 == 1) {
COMPILER_INC_ESI();
}
}
}
else { /* little -> big scaling */
x2 = sx >> 16;
COMPILER_LODSB();
for (x=0; x<dest_width; x++) {
if (masked) {
COMPILER_MASKED_STOSB();
}
else {
COMPILER_STOSB();
}
/* Only fetch the next source pixel once the fixed-point
   position crosses into it; until then the last byte repeats. */
sx += sxd;
if ((sx >> 16) > x2) {
COMPILER_LODSB();
x2++;
}
}
}
}
return compiler_pos;
}
/* compile_stretcher:
   Build a freshly allocated machine-code routine that stretches one
   line, and return a pointer to it.  The caller owns the returned
   buffer (the stretcher cache releases it with free()).
   Implemented by temporarily swapping out Allegro's global scratch
   buffer, so the emitted code lands in a private allocation instead
   of the shared scratch area. */
void* compile_stretcher
(
fixed sx,
fixed sxd,
int dest_x,
int dest_width,
int dest_is_linear,
int is_masked
)
{
int compiler_pos = 0;
int plane;
int d;
/* Save old scratch memory, start from 0.  The COMPILER_* macros
   grow _scratch_mem as they emit code, so from here on the routine
   is built in a brand-new buffer we can hand to the caller. */
void * old_scratch_mem = _scratch_mem;
int old_scratch_mem_size = _scratch_mem_size;
_scratch_mem = 0;
_scratch_mem_size = 0;
if ( dest_is_linear ) {
/* build a simple linear stretcher */
compiler_pos = make_stretcher(0, sx, sxd, dest_width, is_masked );
}
else {
/* build four stretchers, one for each mode-X plane */
for (plane=0; plane<4; plane++) {
COMPILER_PUSH_ESI();
COMPILER_PUSH_EDI();
/* Select the write plane for (dest_x+plane): map-mask register
   (index 2) of the VGA sequencer at port 0x3C4. */
COMPILER_MOV_EAX((0x100<<((dest_x+plane)&3))|2);
COMPILER_MOV_EDX(0x3C4);
COMPILER_OUTW();
compiler_pos = make_stretcher(compiler_pos, sx+sxd*plane, sxd<<2,
(dest_width-plane+3)>>2, is_masked );
COMPILER_POP_EDI();
COMPILER_POP_ESI();
if (((dest_x+plane) & 3) == 3) {
COMPILER_INC_EDI();
}
/* Advance the source pointer by however many whole source
   pixels this plane's pass consumed. */
d = ((sx+sxd*(plane+1))>>16) - ((sx+sxd*plane)>>16);
if (d > 0) {
COMPILER_ADD_ESI(d);
}
}
/* NOTE(review): this adjusts only the local copy of dest_x, which
   is never read again -- apparently vestigial from the original
   stretch.c where the caller used the adjusted value; confirm. */
dest_x >>= 2;
}
COMPILER_RET();
void * routine = _scratch_mem;
/* Pop original scratch memory. */
_scratch_mem = old_scratch_mem;
_scratch_mem_size = old_scratch_mem_size;
return routine;
}
/* ------------------- stretch1.cc --------------------- */
/* A replacement bitmap/sprite stretcher for greater control and
efficiency. All modifications by Tom Breton, borrowing from Allegro
by Shawn Hargreaves.
This code is compiled but untested. Use it with care, and don't tell
me "IT DOESN'T WORK!", because I never said it would.
Use this as a replacement for most of stretch.cc, the replaced
section starting where the headers end and continuing down to the
end of do_stretch_blit_TB. In other words, I left out everything
that just mirrored stretch.cc
*/
/* A struct to hold the intermediate state. Yeah, it does expose part
of the API, but can be treated as a cookie. In other words, if the
contents of stretch_blit_intermed_t ever change, the user can remain
ignorant of the fact.
User must understand that
the source bitmap can be retargeted WRT the target bitmap, but
shouldn't move wrt planes in a planar mode,
shouldn't move across clipping lines,
might not want to move wrt horizontal subpixels
the post-routine ignores some values, the pre-routine ignores
others.
Design:
All args are passed to both routines, which may be a reason to
pass 'em indirectly, but we'll see.
masked is only relevant to the first one, though. If it were
relevant to the other, we'd store it.
It would be possible to rearrange this so that it works both
ways, with cache and with manual control.
stretcher_args_t::get_value would have to be rewritten to call
get_stretch_blit_intermed_t. do_stretch_blit would be rewritten
to call that cached version of get_stretch_blit_intermed_t and
then to call use_stretch_blit_intermed_t.
*/
struct
stretch_blit_intermed_t
{
void * routine;
};
const stretch_blit_intermed_t sbi_NULL = { 0, };
/* First half of the split stretch-blit: do the horizontal set-up and
   clipping, compile the per-line stretcher, and hand it back as an
   opaque cookie for use_stretch_blit_intermed_t. */
stretch_blit_intermed_t
get_stretch_blit_intermed_t
(BITMAP *source, BITMAP *dest,
int source_x, int source_y, int source_width, int source_height,
int dest_x, int dest_y, int dest_width, int dest_height,
int masked
)
{
    /* trivial reject for zero sizes */
    if ((source_width <= 0) || (dest_width <= 0) )
    {
        /* Should compile an instant return instead, though. */
        return sbi_NULL;
    }

    /* convert to fixed point, and work out the source step per
       destination pixel */
    fixed sx = itofix(source_x);
    fixed sxd = itofix(source_width) / dest_width;

    /* clip horizontally against the destination bitmap */
    if (dest->clip)
    {
        /* Left edge: trim the destination and advance the source
           position by the matching fixed-point amount. */
        const int left_trim = dest->cl - dest_x;
        if (left_trim > 0)
        {
            dest_x += left_trim;
            dest_width -= left_trim;
            sx += sxd * left_trim;
        }
        /* Right edge just shrinks the destination. */
        if (dest_x + dest_width > dest->cr)
            dest_width = dest->cr - dest_x;
        if (dest_width <= 0)
            return sbi_NULL;
    }

    stretch_blit_intermed_t ret;
    ret.routine = compile_stretcher(sx, sxd, dest_x, dest_width,
                                    is_linear_bitmap(dest), masked);
    return ret;
}
/* Second half of the split stretch-blit: apply a routine previously
   compiled by get_stretch_blit_intermed_t, doing only the vertical
   set-up and clipping here. */
void
use_stretch_blit_intermed_t(BITMAP *source, BITMAP *dest,
int source_x, int source_y, int source_width, int source_height,
int dest_x, int dest_y, int dest_width, int dest_height,
stretch_blit_intermed_t intermed )
{
fixed sy, syd;
/* trivial reject for zero sizes */
if ((source_height <= 0) || (dest_height <= 0))
{ return; }
/* convert to fixed point */
sy = itofix(source_y);
/* calculate delta values */
syd = itofix(source_height) / dest_height;
/* clip? */
if (dest->clip) {
#if 1
/* Couldn't resist streamlining this. Logic is the same as before. */
const int y_overshoot = dest->ct - dest_y;
if( y_overshoot > 0 )
{
dest_y += y_overshoot;
dest_height-= y_overshoot;
sy += ( syd * y_overshoot );
}
#else
while (dest_y < dest->ct)
{
dest_y++;
dest_height--;
sy += syd;
}
#endif
if (dest_y+dest_height > dest->cb)
{ dest_height = dest->cb - dest_y;}
if (dest_height <= 0)
{ return;}
}
/* call the stretcher function for each line */
/* NOTE(review): `source_x>>16' looks wrong -- here source_x is an
   integer pixel coordinate, not a fixed-point value (contrast
   `sx>>16' in do_stretch_blit_TB, where sx IS fixed-point), so this
   will nearly always pass 0.  The author marks this file untested;
   confirm before relying on it. */
_do_stretch(source, dest, intermed.routine, source_x>>16, sy, syd, dest_x,
dest_y, dest_height);
}
- Raw text -