delorie.com/archives/browse.cgi   search  
Mail Archives: djgpp/1998/08/21/12:00:30

From: DougEleveld <deleveld AT dds DOT nl>
Newsgroups: comp.os.msdos.djgpp
Subject: Re: DJGPPv2.02 & Profiling in a Windoze 98 Dos Box
Date: Fri, 21 Aug 1998 17:52:16 +0200
Organization: Rijksuniversiteit Groningen
Lines: 236
Message-ID: <35DD97B0.91B557B6@dds.nl>
References: <e#uUvNMz9GA DOT 274 AT upnetnews03>
NNTP-Posting-Host: client36-53.oprit.rug.nl
Mime-Version: 1.0
To: djgpp AT delorie DOT com
DJ-Gateway: from newsgroup comp.os.msdos.djgpp

Shelby Cain wrote:

> I am having a problem getting profiling to work under DJGPPv2.02...
>
> Basically, after I have compiled/linked my program with the '-pg'
> switch, it
> appears as if only one sample is taken and the profiling code stops
> collected data (as reported by gprof).  The end result is the
> implication
> that the total execution time was around 0.06 seconds on my system
> with 100%
> of it taken up by __dpmi_int.
>
> The above example is simply ex35.c from the Allegro gaming library...
> but I
> want to be able to use this for a more serious optimization project...
>
> Any ideas???

Well, I never use -pg for profiling anymore.  I use some macros that
count cycles on a pentium.  Much better resolution than using -pg.

Here is the header file that I use.  I hope it's useful.  If you
don't know how to use it, then send me a mail.

//----------------------------------------------------------------------------

//
// DETL - A template library 2.0 beta
//
// Douglas Eleveld (D DOT J DOT Eleveld AT anest DOT azg DOT nl or deleveld AT dds DOT nl)
//
//----------------------------------------------------------------------------

//----------------------------------------------------------------------------

// Pentium timer class and macros
//
// C++ class wrapper around some macros that I got from comp.os.msdos.djgpp
//
//----------------------------------------------------------------------------

#ifndef PENTIUM_TIMER_HEADER
#define PENTIUM_TIMER_HEADER

//----------------------------------------------------------------------------

//#include "detl.h"

//----------------------------------------------------------------------------

// What I found on comp.os.msdos.djgpp through dejanews:

// Subject:      DJGPP RDTSC demo  (Pentium-only, ~100 lines)
// From:         Tom Burgess <Tom_Burgess AT bc DOT sympatico DOT ca>
// Date:         1997/04/20
// Message-Id:   <3359E27B DOT 6C9A AT bc DOT sympatico DOT ca>
// Newsgroups:   comp.os.msdos.djgpp

/* rdtsc.c: DJGPP inline asm demo of Pentium cycle counter usage */
/* Reference: Agner Fog's "How to optimize for the Pentium" */
/* also thanks to Leath Muller for earlier posted RDTSC code */

// Hi, here's some code that might be useful to some for low-level
// Pentium optimization. If you get weird results, look carefully at
// what is known to be in cache when the code executes, code & data
// alignment, cache line conflicts, AGIs etc. Agner Fog warns that
// RDTSC doesn't work with virtual 86 mode but I've noted no problems
// with win95 dos shell, RHIDE or whatever. He also points out
// special Pentium Pro considerations which I have not addressed.
// Check out: http://announce.com/agner/assem/assem.html

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* RDTSC1 and RDTSC2 are macros to get Pentium RDTSC cycle count */
/* This returns 64 bits in EAX and EDX. */
/* If dest is address of a GNU long long, the 64 bit subtraction */
/* needed for interval measurement can be done directly */

/* RDTSC1 generates code for the initial timestamp read - the cld
and nops are included for repeatable pairing and to eliminate
shadowing effects from previous instructions */

/*
 * RDTSC1(dest) - take the initial timestamp.
 *
 * Executes RDTSC (emitted as raw opcode bytes 0x0F 0x31) and stores the
 * 64-bit result at the address `dest`: low half (EAX) at offset 0, high
 * half (EDX) at offset 4.  `dest` should therefore point at an
 * unsigned long long.
 *
 * The trailing cld and nops are kept exactly as in the original code; they
 * are there for repeatable pairing and to hide shadowing effects from
 * neighbouring instructions.
 *
 * Compiler contract:
 *  - the store goes through operand %0 (still pinned to EDI by the "D"
 *    constraint), so the full pointer width is used on 64-bit targets
 *    instead of a truncated %edi;
 *  - "memory" tells the compiler that the asm writes to memory it was not
 *    told about, so values cached from *dest are reloaded afterwards;
 *  - "cc" documents that cld modifies the flags register;
 *  - __volatile__ keeps the statement from being removed or merged.
 */
#define RDTSC1(dest) \
__asm__ __volatile__(".byte 0x0F, 0x31\n\t"\
        "movl    %%eax, (%0)\n\t"\
        "movl    %%edx, 4(%0)\n\t"\
        "cld \n\t"\
        "nop \n\t nop \n\t nop \n\t"\
        "nop \n\t nop \n\t nop \n\t"\
        "nop \n\t nop \n\t"\
        : : "D" (dest) : "eax", "edx", "cc", "memory")

// I added here the extra nops that were mentioned in a later posting

/* use RDTSC2 immediately after the code under test. The clc is a
non-pairable filler that also eliminates potential shadow effects */

/*
 * RDTSC2(dest) - take the closing timestamp.
 *
 * Issues a clc filler instruction, then RDTSC (raw opcode bytes 0x0F 0x31),
 * and stores the 64-bit result at the address `dest`: low half (EAX) at
 * offset 0, high half (EDX) at offset 4.  `dest` should point at an
 * unsigned long long.
 *
 * Compiler contract:
 *  - the store goes through operand %0 (still pinned to EDI by the "D"
 *    constraint), so the full pointer width is used on 64-bit targets
 *    instead of a truncated %edi;
 *  - "memory" tells the compiler that the asm writes to memory it was not
 *    told about, so values cached from *dest are reloaded afterwards;
 *  - "cc" documents that clc modifies the flags register;
 *  - __volatile__ keeps the statement from being removed or merged.
 */
#define RDTSC2(dest) \
__asm__ __volatile__("clc \n\t"\
        ".byte 0x0F, 0x31\n\t"\
        "movl    %%eax, (%0)\n\t"\
        "movl    %%edx, 4(%0)\n\t"\
        : : "D" (dest) : "eax", "edx", "cc", "memory")


//----------------------------------------------------------------------------

// C access to the timer and profiler
#ifndef __cplusplus

/*
 * Bookkeeping record used by the PROFILER_* macros.  Every field is a
 * 64-bit unsigned count; the timestamp fields hold raw counter readings.
 */
typedef struct {
    unsigned long long _overhead;   /* _end - _start of an empty measurement (PROFILER_RESET) */
    unsigned long long _start;      /* reading taken by PROFILER_START */
    unsigned long long _end;        /* reading taken by PROFILER_STOP */
    unsigned long long _runs;       /* number of PROFILER_STOP calls since the last reset */
    unsigned long long _total;      /* sum of (interval - _overhead) over all runs */
} c_pentium_profiler;

/*
 * PROFILER_RESET(x) - calibrate and clear the profiler record `x`.
 *
 * `x` is an lvalue with the c_pentium_profiler fields (not a pointer).
 * Two back-to-back RDTSC1/RDTSC2 pairs are taken; only the second pair is
 * used for _overhead (the first is presumably a warm-up pass, as in the
 * C++ pentium_timer constructor).  The run counter and the accumulated
 * total are then zeroed with integer constants, matching their unsigned
 * integer type.
 *
 * Note: this expands to a brace block, so it must not be used as the
 * unbraced body of an `if` that is followed by `else`.
 */
#define PROFILER_RESET(x)  { RDTSC1(&(x)._start); \
                             RDTSC2(&(x)._end); \
                             \
                             RDTSC1(&(x)._start); \
                             RDTSC2(&(x)._end); \
                             (x)._overhead = (x)._end - (x)._start; \
                             \
                             (x)._runs = 0; \
                             (x)._total = 0; \
                           }

/*
 * PROFILER_START(x) - record the opening timestamp of a run in x._start.
 * `x` is an lvalue with the c_pentium_profiler fields (not a pointer).
 */
#define PROFILER_START(x) \
    { \
        RDTSC1(&((x)._start)); \
    }

/*
 * PROFILER_STOP(x) - close a run started with PROFILER_START(x).
 *
 * Records the closing timestamp in x._end, bumps the run counter, and adds
 * the elapsed count minus the calibrated overhead to x._total.  All
 * arithmetic is unsigned 64-bit, so an interval shorter than _overhead
 * contributes a wrapped (modulo 2^64) value to the sum.
 *
 * Every line of the body carries its continuation backslash; the
 * accumulation expression must stay on one logical line.
 */
#define PROFILER_STOP(x)   { RDTSC2(&(x)._end); \
                             (x)._runs++; \
                             (x)._total += ((x)._end - (x)._start) - (x)._overhead; \
                           }

/*
 * PROFILER_CYCLES(x) - average net count per run, as a double.
 *
 * Evaluates to x._total / x._runs in floating point.  When no run has been
 * recorded (_runs == 0) it evaluates to 0.0 instead of dividing by zero,
 * mirroring the C++ pentium_profiler::cycles() behaviour.
 * `x` is evaluated more than once, so it must be free of side effects.
 */
#define PROFILER_CYCLES(x) \
    ((x)._runs == 0 ? 0.0 : ((double)((x)._total)) / ((double)((x)._runs)))

/*
 * PROFILER_OVERHEAD(x) - the calibrated overhead stored in x._overhead,
 * converted to double.
 */
#define PROFILER_OVERHEAD(x) \
    ((double)(x)._overhead)

//----------------------------------------------------------------------------

// C++ access to the timer and profiler
#else

// Pentium timer class for cycle counts
// Single-interval cycle timer built on the RDTSC1/RDTSC2 macros.
// Construction calibrates the cost of an empty start/stop pair; cycles()
// reports the last measured interval with that cost removed.
class pentium_timer
{
public:
    // Calibrate: run one throw-away start/stop pair (the original author's
    // stated intent is to get things into L1 cache), then time a second
    // empty pair and keep its length as the overhead.
    pentium_timer()
    {
        RDTSC1(&start_ticks_);
        RDTSC2(&stop_ticks_);

        RDTSC1(&start_ticks_);
        RDTSC2(&stop_ticks_);

        overhead_ticks_ = stop_ticks_ - start_ticks_;
    }

    // Record the opening timestamp.
    void start() { RDTSC1(&start_ticks_); }

    // Record the closing timestamp.
    void stop() { RDTSC2(&stop_ticks_); }

    // Length of an empty start/stop pair, as measured at construction.
    unsigned long long overhead() const { return overhead_ticks_; }

    // Last interval (stop - start) minus the calibrated overhead.
    // Unsigned arithmetic: no clamping is applied.
    unsigned long long cycles() const
    {
        const unsigned long long elapsed = stop_ticks_ - start_ticks_;
        return elapsed - overhead_ticks_;
    }

private:
    unsigned long long overhead_ticks_;
    unsigned long long start_ticks_;
    unsigned long long stop_ticks_;
};

//----------------------------------------------------------------------------

// Pentium profiler class: averages cycle counts over multiple runs
// Accumulates the net cycle counts of repeated start()/stop() pairs and
// reports the average per run.
class pentium_profiler
      {
      private:
            // The internal timer
            pentium_timer timer;

            // Stats info: number of completed runs, and the sum of the
            // per-run counts as returned by timer.cycles()
            unsigned long long _runs;
            unsigned long long _total;

      public:
            // Basic constructor: no runs recorded yet
            pentium_profiler (void)
               :_runs(0),
               _total(0)
               { }

            // Begin timing one run
            inline void start (void)
               {
               timer.start();
               }

            // End the current run and fold its count into the statistics
            inline void stop  (void)
               {
               timer.stop();
               _runs++;
               _total+=timer.cycles();
               }

            // Average count per run; 0 when nothing has been recorded.
            // timer.cycles() already has the timer overhead removed, so
            // _total is a net figure and the overhead must not be
            // subtracted a second time here.
            inline unsigned long long cycles (void) const
               {
               if(_runs==0) return 0;
               return _total/_runs;
               }

            // Number of completed start()/stop() pairs
            inline unsigned long long runs (void) const
               {
               return _runs;
               }
      };

#endif
//----------------------------------------------------------------------------

#endif




- Raw text -


  webmaster     delorie software   privacy  
  Copyright © 2019   by DJ Delorie     Updated Jul 2019