add doc
AlexanderSinn committed Aug 25, 2024
1 parent e976f42 commit 376a30e
Showing 1 changed file with 105 additions and 4 deletions.
109 changes: 105 additions & 4 deletions Src/Base/AMReX_CTOParallelForImpl.H
@@ -3,7 +3,7 @@

#include <AMReX_BLassert.H>
#include <AMReX_Box.H>
-#include <AMReX_Tuple.H>
+#include <AMReX_TypeList.H>

#include <array>
#include <type_traits>
@@ -18,7 +18,7 @@ namespace amrex {

template <int... ctr>
struct CompileTimeOptions {
-    // TypeList is defined in AMReX_Tuple.H
+    // TypeList is defined in AMReX_TypeList.H
using list_type = TypeList<std::integral_constant<int, ctr>...>;
};

@@ -34,7 +34,7 @@ namespace detail
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
auto operator() (Args... args) const noexcept
-> decltype(f(args..., std::integral_constant<int, ctr>{}...)) {
-        f(args..., std::integral_constant<int, ctr>{}...);
+        return f(args..., std::integral_constant<int, ctr>{}...);
}

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -69,8 +69,109 @@

#endif

/**
* \brief Compile time optimization of kernels with run time options.
*
 * This is a generalized version of ParallelFor with CTOs that supports any function
 * taking a single lambda to launch a GPU kernel, such as ParallelFor, ParallelForRNG, launch, etc.
 * It uses a fold expression to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code is expanded into 4*2=8 normal ParallelForRNG calls
 * for all combinations of the run time parameters.
\verbatim
int A_runtime_option = ...;
int B_runtime_option = ...;
enum A_options : int { A0, A1, A2, A3 };
enum B_options : int { B0, B1 };
AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
CompileTimeOptions<B0,B1>>{},
{A_runtime_option, B_runtime_option},
        [&](auto cto_func){
ParallelForRNG(N, cto_func);
},
[=] AMREX_GPU_DEVICE (int i, const RandomEngine& engine,
auto A_control, auto B_control)
{
...
if constexpr (A_control.value == A0) {
...
} else if constexpr (A_control.value == A1) {
...
} else if constexpr (A_control.value == A2) {
...
            } else {
...
}
if constexpr (A_control.value != A3 && B_control.value == B1) {
...
}
...
}
);
constexpr int nthreads_per_block = ...;
int nblocks = ...;
AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
CompileTimeOptions<B0,B1>>{},
{A_runtime_option, B_runtime_option},
        [&](auto cto_func){
launch<nthreads_per_block>(nblocks, Gpu::gpuStream(), cto_func);
},
[=] AMREX_GPU_DEVICE (auto A_control, auto B_control){
...
}
);
\endverbatim
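 * Conceptually, the first AnyCTO call above behaves as if it dispatched on the run time
 * values and launched the kernel instantiated with the matching compile time constants.
 * A simplified sketch of the generated dispatch (not the actual implementation, which
 * uses a fold expression):
 \verbatim
     if (A_runtime_option == A0 && B_runtime_option == B0) {
         // launches the kernel instantiated with A_control = A0, B_control = B0
         ParallelForRNG(N, ...);
     } else if (A_runtime_option == A1 && B_runtime_option == B0) {
         // launches the kernel instantiated with A_control = A1, B_control = B0
         ParallelForRNG(N, ...);
     }
     // ... and so on for all 4*2=8 combinations
 \endverbatim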
 * The static member function cto_func.GetOptions() can be used to obtain the runtime_options
 * passed into AnyCTO, but at compile time. This enables some advanced use cases,
 * such as changing the number of threads per block or the dimensionality of ParallelFor at run time.
 * In the second example below, the trailing return type -> decltype(void(intvect.size())) is
 * necessary to disambiguate between IntVectND<1> and int for the first argument of the kernel function.
\verbatim
int nthreads_per_block = ...;
AnyCTO(TypeList<CompileTimeOptions<128,256,512,1024>>{},
{nthreads_per_block},
[&](auto cto_func){
constexpr std::array<int, 1> ctos = cto_func.GetOptions();
constexpr int c_nthreads_per_block = ctos[0];
ParallelFor<c_nthreads_per_block>(N, cto_func);
},
[=] AMREX_GPU_DEVICE (int i, auto){
...
}
);
BoxND<6> box6D = ...;
int dims_needed = ...;
AnyCTO(TypeList<CompileTimeOptions<1,2,3,4,5,6>>{},
{dims_needed},
[&](auto cto_func){
constexpr std::array<int, 1> ctos = cto_func.GetOptions();
constexpr int c_dims_needed = ctos[0];
const auto box = BoxShrink<c_dims_needed>(box6D);
ParallelFor(box, cto_func);
},
[=] AMREX_GPU_DEVICE (auto intvect, auto) -> decltype(void(intvect.size())) {
...
}
);
\endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture the variable
 * outside the constexpr if. Alternatively, the constexpr if can be replaced with a regular if;
 * compilers can still perform the same optimizations since the condition is known at compile time.
 * The data type of the parameters is int.
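 * A minimal sketch of the workaround, assuming a captured variable x and an output
 * array out (both hypothetical, for illustration only):
 \verbatim
     // may fail to compile with nvcc: x is first captured inside the constexpr if
     [=] AMREX_GPU_DEVICE (int i, auto A_control)
     {
         if constexpr (A_control.value == A1) {
             out[i] = x;
         }
     }

     // workaround: reference x outside the constexpr if to force the capture
     [=] AMREX_GPU_DEVICE (int i, auto A_control)
     {
         auto x_local = x;
         if constexpr (A_control.value == A1) {
             out[i] = x_local;
         }
     }
 \endverbatim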
*
* \param list_of_compile_time_options list of all possible values of the parameters.
* \param runtime_options the run time parameters.
* \param l a callable object containing a CPU function that launches the provided GPU kernel.
* \param f a callable object containing the GPU kernel with optimizations.
*/
template <class L, class F, typename... CTOs>
-void AnyCTO (TypeList<CTOs...> /*list_of_compile_time_options*/,
+void AnyCTO ([[maybe_unused]] TypeList<CTOs...> list_of_compile_time_options,
std::array<int,sizeof...(CTOs)> const& runtime_options,
L&& l, F&& f)
{
