add doc
AlexanderSinn committed Aug 25, 2024
1 parent e976f42 commit 376a30e
Showing 1 changed file with 105 additions and 4 deletions.
109 changes: 105 additions & 4 deletions Src/Base/AMReX_CTOParallelForImpl.H
@@ -3,7 +3,7 @@

#include <AMReX_BLassert.H>
#include <AMReX_Box.H>
-#include <AMReX_Tuple.H>
+#include <AMReX_TypeList.H>

#include <array>
#include <type_traits>
@@ -18,7 +18,7 @@ namespace amrex {

template <int... ctr>
struct CompileTimeOptions {
-    // TypeList is defined in AMReX_Tuple.H
+    // TypeList is defined in AMReX_TypeList.H
using list_type = TypeList<std::integral_constant<int, ctr>...>;
};

@@ -34,7 +34,7 @@ namespace detail
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
auto operator() (Args... args) const noexcept
-> decltype(f(args..., std::integral_constant<int, ctr>{}...)) {
-        f(args..., std::integral_constant<int, ctr>{}...);
+        return f(args..., std::integral_constant<int, ctr>{}...);
}

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -69,8 +69,109 @@

#endif

/**
* \brief Compile time optimization of kernels with run time options.
*
 * This is a generalized version of ParallelFor with CTOs that supports any function
 * taking a single lambda to launch a GPU kernel, such as ParallelFor, ParallelForRNG, launch, etc.
 * It uses a fold expression to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code is expanded into 4*2=8 normal ParallelForRNG calls
 * for all combinations of the run time parameters.
\verbatim
int A_runtime_option = ...;
int B_runtime_option = ...;
enum A_options : int { A0, A1, A2, A3 };
enum B_options : int { B0, B1 };
AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
CompileTimeOptions<B0,B1>>{},
{A_runtime_option, B_runtime_option},
        [&](auto cto_func){
ParallelForRNG(N, cto_func);
},
[=] AMREX_GPU_DEVICE (int i, const RandomEngine& engine,
auto A_control, auto B_control)
{
...
if constexpr (A_control.value == A0) {
...
} else if constexpr (A_control.value == A1) {
...
} else if constexpr (A_control.value == A2) {
...
            } else {
...
}
if constexpr (A_control.value != A3 && B_control.value == B1) {
...
}
...
}
);
constexpr int nthreads_per_block = ...;
int nblocks = ...;
AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
CompileTimeOptions<B0,B1>>{},
{A_runtime_option, B_runtime_option},
        [&](auto cto_func){
launch<nthreads_per_block>(nblocks, Gpu::gpuStream(), cto_func);
},
[=] AMREX_GPU_DEVICE (auto A_control, auto B_control){
...
}
);
\endverbatim
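 * Conceptually, the first AnyCTO call above behaves as if it dispatched on the run time
 * values and launched the kernel instantiated with the matching compile time constants.
 * A simplified sketch of the generated dispatch (not the actual implementation, which
 * uses a fold expression):
 \verbatim
     if (A_runtime_option == A0 && B_runtime_option == B0) {
         // launches the kernel instantiated with A_control = A0, B_control = B0
         ParallelForRNG(N, ...);
     } else if (A_runtime_option == A1 && B_runtime_option == B0) {
         // launches the kernel instantiated with A_control = A1, B_control = B0
         ParallelForRNG(N, ...);
     }
     // ... and so on for all 4*2=8 combinations
 \endverbatim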
 * The static member function cto_func.GetOptions() can be used to obtain the runtime_options
 * passed into AnyCTO, but at compile time. This enables some advanced use cases,
 * such as changing the number of threads per block or the dimensionality of ParallelFor at run time.
 * In the second example below, the trailing return type -> decltype(void(intvect.size())) is
 * necessary to disambiguate between IntVectND<1> and int for the first argument of the kernel function.
\verbatim
int nthreads_per_block = ...;
AnyCTO(TypeList<CompileTimeOptions<128,256,512,1024>>{},
{nthreads_per_block},
[&](auto cto_func){
constexpr std::array<int, 1> ctos = cto_func.GetOptions();
constexpr int c_nthreads_per_block = ctos[0];
ParallelFor<c_nthreads_per_block>(N, cto_func);
},
[=] AMREX_GPU_DEVICE (int i, auto){
...
}
);
BoxND<6> box6D = ...;
int dims_needed = ...;
AnyCTO(TypeList<CompileTimeOptions<1,2,3,4,5,6>>{},
{dims_needed},
[&](auto cto_func){
constexpr std::array<int, 1> ctos = cto_func.GetOptions();
constexpr int c_dims_needed = ctos[0];
const auto box = BoxShrink<c_dims_needed>(box6D);
ParallelFor(box, cto_func);
},
[=] AMREX_GPU_DEVICE (auto intvect, auto) -> decltype(void(intvect.size())) {
...
}
);
\endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture the variable
 * outside the constexpr if. Alternatively, the constexpr if can be replaced with a regular if;
 * compilers can still perform the same optimizations since the condition is known at compile time.
 * The data type of the parameters is int.
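 * A minimal sketch of the workaround, assuming a captured variable x and an output
 * array out (both hypothetical, for illustration only):
 \verbatim
     // may fail to compile with nvcc: x is first captured inside the constexpr if
     [=] AMREX_GPU_DEVICE (int i, auto A_control)
     {
         if constexpr (A_control.value == A1) {
             out[i] = x;
         }
     }

     // workaround: reference x outside the constexpr if to force the capture
     [=] AMREX_GPU_DEVICE (int i, auto A_control)
     {
         auto x_local = x;
         if constexpr (A_control.value == A1) {
             out[i] = x_local;
         }
     }
 \endverbatim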
*
* \param list_of_compile_time_options list of all possible values of the parameters.
* \param runtime_options the run time parameters.
* \param l a callable object containing a CPU function that launches the provided GPU kernel.
* \param f a callable object containing the GPU kernel with optimizations.
*/
template <class L, class F, typename... CTOs>
-void AnyCTO (TypeList<CTOs...> /*list_of_compile_time_options*/,
+void AnyCTO ([[maybe_unused]] TypeList<CTOs...> list_of_compile_time_options,
std::array<int,sizeof...(CTOs)> const& runtime_options,
L&& l, F&& f)
{
