From 6083af15d641d3ae1d1207cb7dc275e313abbd45 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 23 Aug 2024 17:08:39 +0200 Subject: [PATCH] [prof] in gux_taptamggux.mad counters.cc, improve the handling of counter overhead These are the results (1) keep overhead ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING RDTSC-BASED TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 4.4766s [COUNTERS] Fortran Other ( 0 ) : 0.1202s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0685s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 3.2400s for 1087437 events => throughput is 3.36E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.1007s for 32768 events => throughput is 3.25E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1673s for 16384 events => throughput is 9.79E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0521s for 16384 events => throughput is 3.14E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0687s for 16384 events => throughput is 2.38E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1237s for 1087437 events => throughput is 8.79E+06 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4728s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0269s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 2.3496s for 14136681 events => throughput is 6.02E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 4.4409s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s CUDACPP_RUNTIME_USECHRONOTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp [COUNTERS] *** USING STD::CHRONO TIMERS (do not remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 5.3144s [COUNTERS] Fortran Other ( 0 ) : 0.1588s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0674s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 4.0191s for 1087437 events => throughput is 2.71E+05 
events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.0996s for 32768 events => throughput is 3.29E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1660s for 16384 events => throughput is 9.87E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0508s for 16384 events => throughput is 3.22E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0704s for 16384 events => throughput is 2.33E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1482s for 1087437 events => throughput is 7.34E+06 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4718s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 2.8646s for 14136681 events => throughput is 4.94E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 5.2787s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s (2) remove overhead CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0338s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.8244s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.8905s ------------------------------------------------------------- [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.9339s [COUNTERS] Fortran Other ( 0 ) : 0.2954s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0674s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 2.7332s for 1087437 events => throughput is 3.98E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.1003s for 32768 events => throughput is 3.27E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1688s for 16384 events => throughput is 9.71E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0507s for 16384 events => throughput is 3.23E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0695s for 16384 events => throughput is 2.36E+05 events/s 
[COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.0924s for 1087437 events => throughput is 1.18E+07 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4692s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0263s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 1.8723s for 14136681 events => throughput is 7.55E+06 events/s [COUNTERS] OVERALL NON-MEs ( 31 ) : 3.8982s [COUNTERS] OVERALL MEs ( 32 ) : 0.0357s for 16384 events => throughput is 4.59E+05 events/s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0637s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 5.8826s [COUNTERS] PROGRAM COUNTEROVERHEAD : 1.6786s ------------------------------------------------------------- [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 4.2040s [COUNTERS] Fortran Other ( 0 ) : 0.4831s [COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0691s [COUNTERS] Fortran PhaseSpaceSampling ( 3 ) : 2.9924s for 1087437 events => throughput is 3.63E+05 events/s [COUNTERS] Fortran PDFs ( 4 ) : 0.0983s for 32768 events => throughput is 3.33E+05 events/s [COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1669s for 16384 events => throughput is 9.81E+04 events/s [COUNTERS] Fortran Reweight ( 6 ) : 0.0506s for 16384 events => throughput is 3.24E+05 events/s [COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0676s for 16384 events => throughput is 2.42E+05 events/s [COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.0698s for 1087437 events => throughput is 1.56E+07 events/s [COUNTERS] CudaCpp Initialise ( 11 ) : 0.4712s [COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s [COUNTERS] CudaCpp MEs ( 19 ) : 0.0350s for 16384 events => throughput is 4.68E+05 events/s [COUNTERS] TEST SampleGetX ( 21 ) : 1.9227s for 14136681 events => throughput is 7.35E+06 events/s [COUNTERS] 
OVERALL NON-MEs ( 31 ) : 4.1690s [COUNTERS] OVERALL MEs ( 32 ) : 0.0350s for 16384 events => throughput is 4.68E+05 events/s (3) remove overhead, disable individual timers (so here the overhead is 0) CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0333s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.1897s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.3330s ------------------------------------------------------------- [COUNTERS] *** USING RDTSC-BASED TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8567s CUDACPP_RUNTIME_USECHRONOTIMERS=1 CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD=1 CUDACPP_RUNTIME_DISABLECALLTIMERS=1 \ ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp INFO: COUNTERS overhead : 0.0659s for 1M start/stop cycles [COUNTERS] PROGRAM TOTAL+COUNTEROVERHEAD : 4.5119s [COUNTERS] PROGRAM COUNTEROVERHEAD : 0.6594s ------------------------------------------------------------- [COUNTERS] *** USING STD::CHRONO TIMERS (remove timer overhead) *** [COUNTERS] PROGRAM TOTAL : 3.8525s --- .../SubProcesses/counters.cc | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc b/epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc index 75eab049aa..20db22d5bb 100644 --- a/epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc +++ b/epochX/cudacpp/gux_taptamggux.mad/SubProcesses/counters.cc @@ -171,13 +171,11 @@ extern "C" void counters_initialise_() { using namespace counters; - if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true; - if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true; #ifdef MGONGPU_HASRDTSC if( getenv( "CUDACPP_RUNTIME_USECHRONOTIMERS" ) ) usechronotimers = true; #endif if( getenv( "CUDACPP_RUNTIME_REMOVECOUNTEROVERHEAD" ) 
) removetimeroverhead = true; - for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ ) + for( int icounter = 0; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3 { array_tags[icounter] = ""; // ensure that this is initialized to "" array_istesttimer[icounter] = false; // ensure that this is initialized to false @@ -193,7 +191,7 @@ extern "C" counters_register_counter_( &icalibcounter, "OVERHEAD CALIBRATION" ); mgOnGpu::ChronoTimer calibtimer; calibtimer.start(); - constexpr size_t ncall = 1000000; + constexpr size_t ncall = 10000000; // 10M calls are expected to take slightly less than ~1s (this will be in counter overhead) for( size_t icall = 0; icall < ncall; icall++ ) { counters_start_counter_( &icalibcounter, &nevtdummy ); @@ -202,6 +200,8 @@ extern "C" calibtimer.stop(); overheadpercallseconds = calibtimer.getTotalDurationSeconds() / ncall; } + if( getenv( "CUDACPP_RUNTIME_DISABLECALLTIMERS" ) ) disablecalltimers = true; + if( getenv( "CUDACPP_RUNTIME_DISABLETESTTIMERS" ) ) disabletesttimers = true; return; } @@ -216,10 +216,12 @@ extern "C" float program_totaltime = ( usechronotimers ? 
program_chronotimer.getTotalDurationSeconds() : program_rdtsctimer.getTotalDurationSeconds() ); float program_overhead = 0; // Extract time duration from all timers - float array_totaltimes[NCOUNTERSMAX + 3] = { 0 }; - float array_overheads[NCOUNTERSMAX + 3] = { 0 }; - for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) + float array_totaltimes[NCOUNTERSMAX + 4] = { 0 }; + float array_overheads[NCOUNTERSMAX + 4] = { 0 }; + for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3 { + if( icounter == NCOUNTERSMAX + 1 ) continue; + if( icounter == NCOUNTERSMAX + 2 ) continue; if( usechronotimers ) array_totaltimes[icounter] = array_chronotimers[icounter].getTotalDurationSeconds(); else @@ -235,7 +237,7 @@ extern "C" // Remove overheads of included timers if any if( removetimeroverhead ) { - for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) + for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) // no need to include icalibcounter = NCOUNTERSMAX+3 { for( int icounterIn : array_included[icounter] ) array_totaltimes[icounter] -= array_overheads[icounterIn]; @@ -259,7 +261,7 @@ extern "C" array_tags[0] = "Fortran Other"; array_counters[0] = 1; array_totaltimes[0] = program_totaltime; - for( int icounter = 1; icounter < NCOUNTERSMAX + 1; icounter++ ) + for( int icounter = 1; icounter < NCOUNTERSMAX + 4; icounter++ ) // include icalibcounter = NCOUNTERSMAX+3 { if( !array_istesttimer[icounter] ) // skip TEST counters array_totaltimes[0] -= array_totaltimes[icounter]; @@ -280,7 +282,7 @@ extern "C" array_counters[NCOUNTERSMAX + 1] = 1; array_totaltimes[NCOUNTERSMAX + 1] = program_totaltime - array_totaltimes[NCOUNTERSMAX + 2]; // Dump individual counters - for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ ) + for( int icounter = 0; icounter < NCOUNTERSMAX + 3; icounter++ ) // exclude icalibcounter = NCOUNTERSMAX+3 (would print a negative value here!) 
{ if( array_tags[icounter] != "" ) {