[cmsdy] in dsample.f of pp_dy3j.mad P0_gux_taptamggux, cache xbin_min for xmin=0 and xbin_max for xmax=1 (part2 of madgraph5#969)

There is indeed another clear and not too small improvement:

CUDACPP_RUNTIME_DISABLEFPE=1 ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_dy3j_x1_cudacpp
[COUNTERS] PROGRAM TOTAL : 4.2184s
[COUNTERS] Fortran Other ( 0 ) : 0.1695s
[COUNTERS] Fortran Initialise(I/O) ( 1 ) : 0.0672s
[COUNTERS] Fortran Random2Momenta ( 3 ) : 2.9293s for 1170103 events => throughput is 2.50E-06 events/s
[COUNTERS] Fortran PDFs ( 4 ) : 0.1094s for 49152 events => throughput is 2.23E-06 events/s
[COUNTERS] Fortran UpdateScaleCouplings ( 5 ) : 0.1379s for 16384 events => throughput is 8.42E-06 events/s
[COUNTERS] Fortran Reweight ( 6 ) : 0.0560s for 16384 events => throughput is 3.42E-06 events/s
[COUNTERS] Fortran Unweight(LHE-I/O) ( 7 ) : 0.0707s for 16384 events => throughput is 4.31E-06 events/s
[COUNTERS] Fortran SamplePutPoint ( 8 ) : 0.1447s for 1170103 events => throughput is 1.24E-07 events/s
[COUNTERS] CudaCpp Initialise ( 11 ) : 0.4719s
[COUNTERS] CudaCpp Finalise ( 12 ) : 0.0267s
[COUNTERS] CudaCpp MEs ( 19 ) : 0.0350s for 16384 events => throughput is 2.13E-06 events/s
[COUNTERS] OVERALL NON-MEs ( 21 ) : 4.1834s
[COUNTERS] OVERALL MEs ( 22 ) : 0.0350s for 16384 events => throughput is 2.13E-06 events/s
valassi · Aug 15, 2024 · a6d57a8 · a6d57a8
1 parent 9a65860
commit a6d57a8
Showing 1 changed file with 36 additions and 5 deletions.
diff --git a/epochX/cudacpp/pp_dy3j.mad/Source/dsample.f b/epochX/cudacpp/pp_dy3j.mad/Source/dsample.f
@@ -1250,7 +1250,20 @@ subroutine sample_get_x(wgt, x, j, ipole, xmin, xmax)
 c     Local
 c
       integer  im, ip,ij,icount,it_warned
-      double precision xbin_min,xbin_max,ddum(maxdim),xo,y
+      double precision xbin_min,xbin_max,ddum(maxdim),xo,y 
+c
+c     Local (performance optimization #969)
+c
+      integer xbinarraydim
+      parameter (xbinarraydim=maxdim*lmaxconfigs)
+      double precision xbin_min0_array(maxdim, lmaxconfigs)
+      double precision xbin_max1_array(maxdim, lmaxconfigs)
+      logical xbin_min0_saved(maxdim, lmaxconfigs)
+      logical xbin_max1_saved(maxdim, lmaxconfigs)
+      save xbin_min0_array, xbin_max1_array
+      save xbin_min0_saved, xbin_max1_saved
+      data xbin_min0_saved/xbinarraydim*.false./
+      data xbin_max1_saved/xbinarraydim*.false./
 c
 c     External
 c
@@ -1301,10 +1314,28 @@ subroutine sample_get_x(wgt, x, j, ipole, xmin, xmax)
       endif
       if (ituple .eq. 1) then
 c         write(*,*) 'Getting variable',ipole,j,minvar(j,ipole)
-        xbin_min = xbin(xmin,minvar(j,ipole))
-        xbin_max = xbin(xmax,minvar(j,ipole))
-        if (xbin_min .gt. xbin_max-1) then
-          xbin_min = min(xbin_min, xbin_max)
+
+        if(xmax.ne.1 .or. .not.xbin_max1_saved(j,ipole)) then
+          xbin_max = xbin(xmax, minvar(j,ipole))
+          if(xmax.eq.1) then
+            xbin_max1_array(j,ipole) = xbin_max
+            xbin_max1_saved(j,ipole) = .true.
+          endif
+        else
+          xbin_max = xbin_max1_array(j,ipole)
+        endif
+
+        if(xmin.ne.0 .or. .not.xbin_min0_saved(j,ipole)) then
+          xbin_min = xbin(xmin, minvar(j,ipole))
+          if (xbin_min .gt. xbin_max-1) then
+            xbin_min = min(xbin_min, xbin_max)
+          endif
+          if(xmin.eq.0) then
+            xbin_min0_array(j,ipole) = xbin_min
+            xbin_min0_saved(j,ipole) = .true.
+          endif
+        else
+          xbin_min = xbin_min0_array(j,ipole)
         endif
 c
 c     Line which allows us to keep choosing same x