Skip to content

Commit

Permalink
[prof] in gux_taptamggux.mad counters.h, improve the handling of counter overhead
Browse files Browse the repository at this point in the history

These are the results

(1) keep overhead

./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp
 [COUNTERS] *** USING RDTSC-BASED TIMERS (do not remove timer overhead) ***
 [COUNTERS] PROGRAM TOTAL                         :    4.4766s
 [COUNTERS] Fortran Other                  (  0 ) :    0.1202s
 [COUNTERS] Fortran Initialise(I/O)        (  1 ) :    0.0685s
 [COUNTERS] Fortran PhaseSpaceSampling     (  3 ) :    3.2400s for  1087437 events => throughput is 3.36E+05 events/s
 [COUNTERS] Fortran PDFs                   (  4 ) :    0.1007s for    32768 events => throughput is 3.25E+05 events/s
 [COUNTERS] Fortran UpdateScaleCouplings   (  5 ) :    0.1673s for    16384 events => throughput is 9.79E+04 events/s
 [COUNTERS] Fortran Reweight               (  6 ) :    0.0521s for    16384 events => throughput is 3.14E+05 events/s
 [COUNTERS] Fortran Unweight(LHE-I/O)      (  7 ) :    0.0687s for    16384 events => throughput is 2.38E+05 events/s
 [COUNTERS] Fortran SamplePutPoint         (  8 ) :    0.1237s for  1087437 events => throughput is 8.79E+06 events/s
 [COUNTERS] CudaCpp Initialise             ( 11 ) :    0.4728s
 [COUNTERS] CudaCpp Finalise               ( 12 ) :    0.0269s
 [COUNTERS] CudaCpp MEs                    ( 19 ) :    0.0357s for    16384 events => throughput is 4.59E+05 events/s
 [COUNTERS] TEST    SampleGetX             ( 21 ) :    2.3496s for 14136681 events => throughput is 6.02E+06 events/s
 [COUNTERS] OVERALL NON-MEs                ( 31 ) :    4.4409s
 [COUNTERS] OVERALL MEs                    ( 32 ) :    0.0357s for    16384 events => throughput is 4.59E+05 events/s

CUDACPP_RUNTIME_USECHRONOTIMERS=1 \
./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp
 [COUNTERS] *** USING STD::CHRONO TIMERS (do not remove timer overhead) ***
 [COUNTERS] PROGRAM TOTAL                         :    5.3144s
 [COUNTERS] Fortran Other                  (  0 ) :    0.1588s
 [COUNTERS] Fortran Initialise(I/O)        (  1 ) :    0.0674s
 [COUNTERS] Fortran PhaseSpaceSampling     (  3 ) :    4.0191s for  1087437 events => throughput is 2.71E+05 events/s
 [COUNTERS] Fortran PDFs                   (  4 ) :    0.0996s for    32768 events => throughput is 3.29E+05 events/s
 [COUNTERS] Fortran UpdateScaleCouplings   (  5 ) :    0.1660s for    16384 events => throughput is 9.87E+04 events/s
 [COUNTERS] Fortran Reweight               (  6 ) :    0.0508s for    16384 events => throughput is 3.22E+05 events/s
 [COUNTERS] Fortran Unweight(LHE-I/O)      (  7 ) :    0.0704s for    16384 events => throughput is 2.33E+05 events/s
 [COUNTERS] Fortran SamplePutPoint         (  8 ) :    0.1482s for  1087437 events => throughput is 7.34E+06 events/s
 [COUNTERS] CudaCpp Initialise             ( 11 ) :    0.4718s
 [COUNTERS] CudaCpp Finalise               ( 12 ) :    0.0267s
 [COUNTERS] CudaCpp MEs                    ( 19 ) :    0.0357s for    16384 events => throughput is 4.59E+05 events/s
 [COUNTERS] TEST    SampleGetX             ( 21 ) :    2.8646s for 14136681 events => throughput is 4.94E+06 events/s
 [COUNTERS] OVERALL NON-MEs                ( 31 ) :    5.2787s
 [COUNTERS] OVERALL MEs                    ( 32 ) :    0.0357s for    16384 events => throughput is 4.59E+05 events/s

(2) remove overhead

CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \
./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp
 INFO: COUNTERS overhead :    0.0338s for 1M start/stop cycles
 [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD         :    4.8244s
 [COUNTERS] PROGRAM COUNTEROVERHEAD               :    0.8905s
 -------------------------------------------------------------
 [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) ***
 [COUNTERS] PROGRAM TOTAL                         :    3.9339s
 [COUNTERS] Fortran Other                  (  0 ) :    0.2954s
 [COUNTERS] Fortran Initialise(I/O)        (  1 ) :    0.0674s
 [COUNTERS] Fortran PhaseSpaceSampling     (  3 ) :    2.7332s for  1087437 events => throughput is 3.98E+05 events/s
 [COUNTERS] Fortran PDFs                   (  4 ) :    0.1003s for    32768 events => throughput is 3.27E+05 events/s
 [COUNTERS] Fortran UpdateScaleCouplings   (  5 ) :    0.1688s for    16384 events => throughput is 9.71E+04 events/s
 [COUNTERS] Fortran Reweight               (  6 ) :    0.0507s for    16384 events => throughput is 3.23E+05 events/s
 [COUNTERS] Fortran Unweight(LHE-I/O)      (  7 ) :    0.0695s for    16384 events => throughput is 2.36E+05 events/s
 [COUNTERS] Fortran SamplePutPoint         (  8 ) :    0.0924s for  1087437 events => throughput is 1.18E+07 events/s
 [COUNTERS] CudaCpp Initialise             ( 11 ) :    0.4692s
 [COUNTERS] CudaCpp Finalise               ( 12 ) :    0.0263s
 [COUNTERS] CudaCpp MEs                    ( 19 ) :    0.0357s for    16384 events => throughput is 4.59E+05 events/s
 [COUNTERS] TEST    SampleGetX             ( 21 ) :    1.8723s for 14136681 events => throughput is 7.55E+06 events/s
 [COUNTERS] OVERALL NON-MEs                ( 31 ) :    3.8982s
 [COUNTERS] OVERALL MEs                    ( 32 ) :    0.0357s for    16384 events => throughput is 4.59E+05 events/s

CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \
./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp
 INFO: COUNTERS overhead :    0.0637s for 1M start/stop cycles
 [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD         :    5.8826s
 [COUNTERS] PROGRAM COUNTEROVERHEAD               :    1.6786s
 -------------------------------------------------------------
 [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) ***
 [COUNTERS] PROGRAM TOTAL                         :    4.2040s
 [COUNTERS] Fortran Other                  (  0 ) :    0.4831s
 [COUNTERS] Fortran Initialise(I/O)        (  1 ) :    0.0691s
 [COUNTERS] Fortran PhaseSpaceSampling     (  3 ) :    2.9924s for  1087437 events => throughput is 3.63E+05 events/s
 [COUNTERS] Fortran PDFs                   (  4 ) :    0.0983s for    32768 events => throughput is 3.33E+05 events/s
 [COUNTERS] Fortran UpdateScaleCouplings   (  5 ) :    0.1669s for    16384 events => throughput is 9.81E+04 events/s
 [COUNTERS] Fortran Reweight               (  6 ) :    0.0506s for    16384 events => throughput is 3.24E+05 events/s
 [COUNTERS] Fortran Unweight(LHE-I/O)      (  7 ) :    0.0676s for    16384 events => throughput is 2.42E+05 events/s
 [COUNTERS] Fortran SamplePutPoint         (  8 ) :    0.0698s for  1087437 events => throughput is 1.56E+07 events/s
 [COUNTERS] CudaCpp Initialise             ( 11 ) :    0.4712s
 [COUNTERS] CudaCpp Finalise               ( 12 ) :    0.0267s
 [COUNTERS] CudaCpp MEs                    ( 19 ) :    0.0350s for    16384 events => throughput is 4.68E+05 events/s
 [COUNTERS] TEST    SampleGetX             ( 21 ) :    1.9227s for 14136681 events => throughput is 7.35E+06 events/s
 [COUNTERS] OVERALL NON-MEs                ( 31 ) :    4.1690s
 [COUNTERS] OVERALL MEs                    ( 32 ) :    0.0350s for    16384 events => throughput is 4.68E+05 events/s

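(3) remove overhead, disable individual timers (so here the per-call timer overhead is 0, and the reported counter overhead is only the cost of the 10M-call calibration loop)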

CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \
./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp
 INFO: COUNTERS overhead :    0.0333s for 1M start/stop cycles
 [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD         :    4.1897s
 [COUNTERS] PROGRAM COUNTEROVERHEAD               :    0.3330s
 -------------------------------------------------------------
 [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) ***
 [COUNTERS] PROGRAM TOTAL                         :    3.8567s

CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \
./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp
 INFO: COUNTERS overhead :    0.0659s for 1M start/stop cycles
 [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD         :    4.5119s
 [COUNTERS] PROGRAM COUNTEROVERHEAD               :    0.6594s
 -------------------------------------------------------------
 [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) ***
 [COUNTERS] PROGRAM TOTAL                         :    3.8525s
  • Loading branch information
valassi committed Aug 23, 2024
1 parent 3577a55 commit 6083af1
Showing 1 changed file with 12 additions and 10 deletions.
22 changes: 12 additions & 10 deletions epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,11 @@ extern "C"
void counters_initialise_()
{
using namespace counters;
if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true;
if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true;
#ifdef MGONGPU_HASRDTSC
if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) usechronotimers = true;
#endif
if( getenv( "CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD" ) ) removetimeroverhead = true;
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ )
for( int icounter = 0; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
{
array_tags[icounter] = ""; // ensure that this is initialized to ""
array_istesttimer[icounter] = false; // ensure that this is initialized to false
Expand All @@ -193,7 +191,7 @@ extern "C"
counters_register_counter_( &icalibcounter, "OVERHEAD CALIBRATION" );
mgOnGpu::ChronoTimer<std::chrono::high_resolution_clock> calibtimer;
calibtimer.start();
constexpr size_t ncall = 1000000;
constexpr size_t ncall = 10000000; // 10M calls are expected to take slightly less than ~1s (this will be in counter overhead)
for( size_t icall = 0; icall < ncall; icall++ )
{
counters_start_counter_( &icalibcounter, &nevtdummy );
Expand All @@ -202,6 +200,8 @@ extern "C"
calibtimer.stop();
overheadpercallseconds = calibtimer.getTotalDurationSeconds() / ncall;
}
if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true;
if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true;
return;
}

Expand All @@ -216,10 +216,12 @@ extern "C"
float program_totaltime = ( usechronotimers ? program_chronotimer.getTotalDurationSeconds() : program_rdtsctimer.getTotalDurationSeconds() );
float program_overhead = 0;
// Extract time duration from all timers
float array_totaltimes[NCOUNTERSMAX + 3] = { 0 };
float array_overheads[NCOUNTERSMAX + 3] = { 0 };
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
float array_totaltimes[NCOUNTERSMAX + 4] = { 0 };
float array_overheads[NCOUNTERSMAX + 4] = { 0 };
for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
{
if( icounter == NCOUNTERSMAX + 1 ) continue;
if( icounter == NCOUNTERSMAX + 2 ) continue;
if( usechronotimers )
array_totaltimes[icounter] = array_chronotimers[icounter].getTotalDurationSeconds();
else
Expand All @@ -235,7 +237,7 @@ extern "C"
// Remove overheads of included timers if any
if( removetimeroverhead )
{
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) // no need to include icalibcounter = NCOUNTERSMAX+3
{
for( int icounterIn : array_included[icounter] )
array_totaltimes[icounter] -= array_overheads[icounterIn];
Expand All @@ -259,7 +261,7 @@ extern "C"
array_tags[0] = "Fortran Other";
array_counters[0] = 1;
array_totaltimes[0] = program_totaltime;
for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ )
for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3
{
if( !array_istesttimer[icounter] ) // skip TEST counters
array_totaltimes[0] -= array_totaltimes[icounter];
Expand All @@ -280,7 +282,7 @@ extern "C"
array_counters[NCOUNTERSMAX + 1] = 1;
array_totaltimes[NCOUNTERSMAX + 1] = program_totaltime - array_totaltimes[NCOUNTERSMAX + 2];
// Dump individual counters
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ )
for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ ) // exclude icalibcounter = NCOUNTERSMAX+3 (would print a negative value here!)
{
if( array_tags[icounter] != "" )
{
Expand Down

0 comments on commit 6083af1

Please sign in to comment.