From b78849057c0f533466da6b517a59d2eddc99c1a8 Mon Sep 17 00:00:00 2001 From: Bruce Perry Date: Mon, 2 Sep 2024 11:18:21 -0600 Subject: [PATCH 01/18] EB: don't abort for no-op case in unsupported addFineLevels functions (#4123) ## Summary The `addFineLevels` function is not supported for EB2 for chk_file and stl geometries. However, it may be called in some trivial cases where it is adding 0 levels, in which case it is a no-op. There is no reason to abort in those cases. ## Additional background For PeleC, a workaround was put in to not call the function in the trivial cases (https://github.com/AMReX-Combustion/PeleC/pull/771). I was thinking about adding the same workaround to address the same thing in PeleLMeX (https://github.com/AMReX-Combustion/PeleLMeX/issues/407), but maybe it would be better to simply allow the function to be called in trivial cases. If there's a reason not to do this, I'll just put the workaround in for PeleLMeX. --- Src/EB/AMReX_EB2_IndexSpace_STL.cpp | 7 +++++-- Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp index 662aaf14dd6..70e3b492d82 100644 --- a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp +++ b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp @@ -83,9 +83,12 @@ IndexSpaceSTL::getGeometry (const Box& dom) const } void -IndexSpaceSTL::addFineLevels (int /*num_new_fine_levels*/) +IndexSpaceSTL::addFineLevels (int num_new_fine_levels) { - amrex::Abort("IndexSpaceSTL::addFineLevels: todo"); + // This function is a no op if not adding levels, otherwise TODO + if (num_new_fine_levels > 0) { + amrex::Abort("IndexSpaceSTL::addFineLevels: todo"); + } } void diff --git a/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp b/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp index cd811d73688..966b5c424e1 100644 --- a/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp +++ b/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp @@ -78,9 +78,12 @@ 
IndexSpaceChkptFile::getGeometry (const Box& dom) const } void -IndexSpaceChkptFile::addFineLevels (int /*num_new_fine_levels*/) +IndexSpaceChkptFile::addFineLevels (int num_new_fine_levels) { - amrex::Abort("IndexSpaceChkptFile::addFineLevels: not supported"); + // This function is a no op if not adding levels, otherwise TODO + if (num_new_fine_levels > 0) { + amrex::Abort("IndexSpaceChkptFile::addFineLevels: not supported"); + } } void From a5896383e793b60bd6d1b0174b00722cb3d2a2eb Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 2 Sep 2024 10:48:03 -0700 Subject: [PATCH 02/18] `AMREX_DEVICE_PRINTF`: Host (#4116) In AMReX, device means host if compiled for CPUs. Add support for printf debugging on CPUs for `AMREX_DEVICE_PRINTF`. Previously, the macro was undefined, except in the special case of SYCL compilation for host code paths, where it worked. --- Src/Base/AMReX_GpuPrint.H | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Src/Base/AMReX_GpuPrint.H b/Src/Base/AMReX_GpuPrint.H index 9e029ea8835..37f74f30bfc 100644 --- a/Src/Base/AMReX_GpuPrint.H +++ b/Src/Base/AMReX_GpuPrint.H @@ -2,8 +2,6 @@ #define AMREX_GPU_PRINT_H_ #include -#if !defined(__APPLE__) - #include #ifdef AMREX_USE_SYCL @@ -23,7 +21,9 @@ # define AMREX_DEVICE_PRINTF(...) std::printf(__VA_ARGS__); #elif defined(AMREX_USE_HIP) # define AMREX_DEVICE_PRINTF(...) ::printf(__VA_ARGS__); +#else +# define AMREX_DEVICE_PRINTF(format,...) { \ + std::printf(format, __VA_ARGS__); } #endif -#endif // !defined(__APPLE__) #endif // AMREX_GPU_PRINT_H_ From 778e782acfd5412a21e7b4d7c100e43a735536f6 Mon Sep 17 00:00:00 2001 From: Youngjun Lee Date: Mon, 2 Sep 2024 12:50:14 -0500 Subject: [PATCH 03/18] Fix Fortran interface compilation issue using `nvfortran` (#4115) ## Summary This PR will fix the compilation issue for Fortran interfaces when using `nvfortran`, reported in #4111. 
## Additional background The new `module procedure` interfaces for `final` subroutines are introduced to bypass the `nvfortran` compilation error. This change may not be needed according to the Fortran standard, but it effectively resolves compilation errors using `nvfortran` with `-DAMReX_FORTRAN_INTERFACES=ON`. --- .github/workflows/cuda.yml | 1 + Src/Base/AMReX_parmparse_mod.F90 | 4 ++++ Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90 | 4 ++++ Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 | 4 ++++ Src/F_Interfaces/Base/AMReX_distromap_mod.F90 | 4 ++++ Src/F_Interfaces/Base/AMReX_fab_mod.F90 | 4 ++++ Src/F_Interfaces/Base/AMReX_geometry_mod.F90 | 4 ++++ Src/F_Interfaces/Base/AMReX_multifab_mod.F90 | 12 ++++++++++++ Src/F_Interfaces/Base/AMReX_physbc_mod.F90 | 4 ++++ .../LinearSolvers/AMReX_abeclaplacian_mod.F90 | 4 ++++ .../LinearSolvers/AMReX_multigrid_mod.F90 | 4 ++++ Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 | 4 ++++ .../Particle/AMReX_particlecontainer_mod.F90 | 4 ++++ 13 files changed, 57 insertions(+) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 50d45846f02..9e96aefac5e 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -154,6 +154,7 @@ jobs: -DAMReX_ENABLE_TESTS=ON \ -DAMReX_TEST_TYPE=Small \ -DAMReX_FORTRAN=ON \ + -DAMReX_FORTRAN_INTERFACES=ON \ -DAMReX_GPU_BACKEND=CUDA \ -DCMAKE_C_COMPILER=$(which nvc) \ -DCMAKE_CXX_COMPILER=$(which nvc++) \ diff --git a/Src/Base/AMReX_parmparse_mod.F90 b/Src/Base/AMReX_parmparse_mod.F90 index 2d33b4da40d..e7a8748503b 100644 --- a/Src/Base/AMReX_parmparse_mod.F90 +++ b/Src/Base/AMReX_parmparse_mod.F90 @@ -241,6 +241,10 @@ subroutine amrex_parmparse_add_stringarr (pp, name, v, n) bind(c) end subroutine amrex_parmparse_add_stringarr end interface + interface amrex_parmparse_destroy + module procedure amrex_parmparse_destroy + end interface amrex_parmparse_destroy + contains subroutine amrex_parmparse_build (pp, name) diff --git 
a/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90 b/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90 index 3e40c9cd517..21937b57b10 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90 +++ b/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90 @@ -103,6 +103,10 @@ subroutine amrex_fi_fluxregister_overwrite (fr, flxs, scale, geom) bind(c) end subroutine amrex_fi_fluxregister_overwrite end interface + interface amrex_fluxregister_destroy + module procedure amrex_fluxregister_destroy + end interface amrex_fluxregister_destroy + contains subroutine amrex_fluxregister_build (fr, ba, dm, ref_ratio, fine_lev, ncomp) diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 index b156a8ae996..0181c6cfb9c 100644 --- a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 @@ -45,6 +45,10 @@ module amrex_boxarray_module module procedure amrex_boxarray_print end interface amrex_print + interface amrex_boxarray_destroy + module procedure amrex_boxarray_destroy + end interface amrex_boxarray_destroy + ! interfaces to cpp functions interface diff --git a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 index 26316ffab21..adbb91b4421 100644 --- a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 @@ -34,6 +34,10 @@ module amrex_distromap_module module procedure amrex_distromap_print end interface amrex_print + interface amrex_distromap_destroy + module procedure amrex_distromap_destroy + end interface amrex_distromap_destroy + ! 
interfaces to cpp functions interface diff --git a/Src/F_Interfaces/Base/AMReX_fab_mod.F90 b/Src/F_Interfaces/Base/AMReX_fab_mod.F90 index d36e8f4ea87..b76e8645ea3 100644 --- a/Src/F_Interfaces/Base/AMReX_fab_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_fab_mod.F90 @@ -42,6 +42,10 @@ module amrex_fab_module module procedure amrex_fab_build_install end interface amrex_fab_build + interface amrex_fab_destroy + module procedure amrex_fab_destroy + end interface amrex_fab_destroy + contains ! Build a fab, allocate own memory diff --git a/Src/F_Interfaces/Base/AMReX_geometry_mod.F90 b/Src/F_Interfaces/Base/AMReX_geometry_mod.F90 index d65a8f6d980..7492b80c7b2 100644 --- a/Src/F_Interfaces/Base/AMReX_geometry_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_geometry_mod.F90 @@ -76,6 +76,10 @@ subroutine amrex_fi_geometry_get_intdomain (geom,lo,hi) bind(c) end subroutine amrex_fi_geometry_get_intdomain end interface + interface amrex_geometry_destroy + module procedure amrex_geometry_destroy + end interface amrex_geometry_destroy + contains subroutine amrex_geometry_finalize () diff --git a/Src/F_Interfaces/Base/AMReX_multifab_mod.F90 b/Src/F_Interfaces/Base/AMReX_multifab_mod.F90 index 81ce03b9f3d..93f2c4be628 100644 --- a/Src/F_Interfaces/Base/AMReX_multifab_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_multifab_mod.F90 @@ -101,6 +101,10 @@ module amrex_multifab_module module procedure amrex_multifab_build_a end interface amrex_multifab_build + interface amrex_multifab_destroy + module procedure amrex_multifab_destroy + end interface amrex_multifab_destroy + type, public :: amrex_imultifab logical :: owner = .false. 
type (c_ptr) :: p = c_null_ptr @@ -129,6 +133,10 @@ module amrex_multifab_module module procedure amrex_imultifab_build_a end interface amrex_imultifab_build + interface amrex_imultifab_destroy + module procedure amrex_imultifab_destroy + end interface amrex_imultifab_destroy + type, public :: amrex_mfiter type(c_ptr) :: p = c_null_ptr integer ,private :: counter = -1 @@ -159,6 +167,10 @@ module amrex_multifab_module module procedure amrex_mfiter_build_badm_s end interface amrex_mfiter_build + interface amrex_mfiter_destroy + module procedure amrex_mfiter_destroy + end interface amrex_mfiter_destroy + ! interfaces to c++ functions interface diff --git a/Src/F_Interfaces/Base/AMReX_physbc_mod.F90 b/Src/F_Interfaces/Base/AMReX_physbc_mod.F90 index 6831fe81514..1f707575a83 100644 --- a/Src/F_Interfaces/Base/AMReX_physbc_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_physbc_mod.F90 @@ -47,6 +47,10 @@ subroutine amrex_fi_delete_physbc (pbc) bind(c) end subroutine amrex_fi_delete_physbc end interface + interface amrex_physbc_destroy + module procedure amrex_physbc_destroy + end interface amrex_physbc_destroy + contains subroutine amrex_physbc_build (pbc, fill, geom) diff --git a/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90 b/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90 index 5c9425168ec..d90a2faeae4 100644 --- a/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90 +++ b/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90 @@ -58,6 +58,10 @@ subroutine amrex_fi_abeclap_set_bcoeffs (abeclap, amrlev, beta) bind(c) end subroutine amrex_fi_abeclap_set_bcoeffs end interface + interface amrex_abeclaplacian_destroy + module procedure amrex_abeclaplacian_destroy + end interface amrex_abeclaplacian_destroy + contains subroutine amrex_abeclaplacian_assign (dst, src) diff --git a/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90 b/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90 index bc7ae734d01..5ea57e5196a 100644 --- 
a/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90 +++ b/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90 @@ -154,6 +154,10 @@ subroutine amrex_fi_multigrid_set_final_fill_bc (mg, f) bind(c) end subroutine amrex_fi_multigrid_set_final_fill_bc end interface + interface amrex_multigrid_destroy + module procedure amrex_multigrid_destroy + end interface amrex_multigrid_destroy + contains subroutine amrex_multigrid_assign (dst, src) diff --git a/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 b/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 index 3690419f22d..33a02336af1 100644 --- a/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 +++ b/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 @@ -35,6 +35,10 @@ subroutine amrex_fi_delete_linop (linop) bind(c) end subroutine amrex_fi_delete_linop end interface + interface amrex_poisson_destroy + module procedure amrex_poisson_destroy + end interface amrex_poisson_destroy + contains subroutine amrex_poisson_assign (dst, src) diff --git a/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90 b/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90 index bdb4c9a2310..4def2e000f9 100644 --- a/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90 +++ b/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90 @@ -163,6 +163,10 @@ end subroutine amrex_fi_num_particles_i end interface + interface amrex_particlecontainer_destroy + module procedure amrex_particlecontainer_destroy + end interface amrex_particlecontainer_destroy + contains subroutine amrex_particlecontainer_build (pc, amrcore) From a31abb532dd8f1c62806fd093a864ac009e0e041 Mon Sep 17 00:00:00 2001 From: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:04:41 +0200 Subject: [PATCH 04/18] TinyProfiler with BArena and PArena (#4113) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds the capability to profile BArena and PArena with TinyProfiler. 
Previously, only CArena was profiled. Note that some allocations are still not profiled when running on CPU because `amrex::DefaultAllocator` and `amrex::PODVector` use `std::allocator` instead of `amrex::ArenaAllocator`. --- Src/Base/AMReX_Arena.H | 37 ++++++++++++++++++++++++++- Src/Base/AMReX_Arena.cpp | 45 +++++++++++++++++++++++++++++++-- Src/Base/AMReX_BArena.cpp | 5 +++- Src/Base/AMReX_CArena.H | 12 --------- Src/Base/AMReX_CArena.cpp | 39 ++++++---------------------- Src/Base/AMReX_PArena.cpp | 2 ++ Src/Base/AMReX_TinyProfiler.H | 2 +- Src/Base/AMReX_TinyProfiler.cpp | 5 ++-- 8 files changed, 97 insertions(+), 50 deletions(-) diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index 2a6cbb25a08..51b5d983fdf 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -4,9 +4,21 @@ #include #include + +#ifdef AMREX_TINY_PROFILING +#include +#else +namespace amrex { + struct MemStat {}; +} +#endif + #include #include #include +#include +#include +#include #include namespace amrex { @@ -156,7 +168,7 @@ public: * \brief Add this Arena to the list of Arenas that are profiled by TinyProfiler. * \param memory_name The name of this arena in the TinyProfiler output. */ - virtual void registerForProfiling (const std::string& memory_name); + void registerForProfiling (const std::string& memory_name); #ifdef AMREX_USE_GPU //! Is this GPU stream ordered memory allocator? @@ -199,6 +211,29 @@ protected: virtual std::size_t freeUnused_protected () { return 0; } void* allocate_system (std::size_t nbytes); void deallocate_system (void* p, std::size_t nbytes); + + struct ArenaProfiler { + //! If this arena is profiled by TinyProfiler + bool m_do_profiling = false; + //! Mutex for the profiling + std::mutex m_arena_profiler_mutex; + //! Data structure used for profiling with TinyProfiler + std::map m_profiling_stats; + //! 
Track the currently allocated memory, not used by CArena + std::unordered_map> m_currently_allocated; + + ~ArenaProfiler (); + ArenaProfiler () noexcept = default; + ArenaProfiler (const ArenaProfiler& rhs) = delete; + ArenaProfiler (ArenaProfiler&& rhs) = delete; + ArenaProfiler& operator= (const ArenaProfiler& rhs) = delete; + ArenaProfiler& operator= (ArenaProfiler&& rhs) = delete; + + void profile_alloc (void* ptr, std::size_t nbytes); + + void profile_free (void* ptr); + + } m_profiler; }; } diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index ce4ece3b643..c2de5464574 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -117,9 +117,13 @@ Arena::hasFreeDeviceMemory (std::size_t) } void -Arena::registerForProfiling (const std::string&) +Arena::registerForProfiling ([[maybe_unused]] const std::string& memory_name) { - amrex::Abort("Profiling is not implemented for this type of Arena"); +#ifdef AMREX_TINY_PROFILING + AMREX_ALWAYS_ASSERT(m_profiler.m_do_profiling == false); + m_profiler.m_do_profiling = + TinyProfiler::RegisterArena(memory_name, m_profiler.m_profiling_stats); +#endif } std::size_t @@ -330,6 +334,7 @@ Arena::Initialize () } the_async_arena = new PArena(the_async_arena_release_threshold); + the_async_arena->registerForProfiling("Async Memory"); #ifdef AMREX_USE_GPU if (the_arena->isDevice()) { @@ -403,6 +408,7 @@ Arena::Initialize () } the_cpu_arena = The_BArena(); + the_cpu_arena->registerForProfiling("Cpu Memory"); // Initialize the null arena auto* null_arena = The_Null_Arena(); @@ -654,4 +660,39 @@ The_Comms_Arena () } } +Arena::ArenaProfiler::~ArenaProfiler () { +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + TinyProfiler::DeregisterArena(m_profiling_stats); + } +#endif +} + +void Arena::ArenaProfiler::profile_alloc ([[maybe_unused]] void* ptr, + [[maybe_unused]] std::size_t nbytes) { +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + std::lock_guard lock(m_arena_profiler_mutex); + MemStat* stat = 
TinyProfiler::memory_alloc(nbytes, m_profiling_stats); + if (stat) { + m_currently_allocated.insert({ptr, {stat, nbytes}}); + } + } +#endif +} + +void Arena::ArenaProfiler::profile_free ([[maybe_unused]] void* ptr) { +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + std::lock_guard lock(m_arena_profiler_mutex); + auto it = m_currently_allocated.find(ptr); + if (it != m_currently_allocated.end()) { + auto [stat, nbytes] = it->second; + TinyProfiler::memory_free(nbytes, stat); + m_currently_allocated.erase(it); + } + } +#endif +} + } diff --git a/Src/Base/AMReX_BArena.cpp b/Src/Base/AMReX_BArena.cpp index c22affa687a..054e64b854c 100644 --- a/Src/Base/AMReX_BArena.cpp +++ b/Src/Base/AMReX_BArena.cpp @@ -3,12 +3,15 @@ void* amrex::BArena::alloc (std::size_t sz_) { - return std::malloc(sz_); + void* pt = std::malloc(sz_); + m_profiler.profile_alloc(pt, sz_); + return pt; } void amrex::BArena::free (void* pt) { + m_profiler.profile_free(pt); std::free(pt); } diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index 9547bc92f21..bc46d008241 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -16,8 +16,6 @@ namespace amrex { -struct MemStat; - /** * \brief A Concrete Class for Dynamic Memory Management using first fit. * This is a coalescing memory manager. It allocates (possibly) large @@ -75,12 +73,6 @@ public: */ [[nodiscard]] bool hasFreeDeviceMemory (std::size_t sz) final; - /** - * \brief Add this Arena to the list of Arenas that are profiled by TinyProfiler. - * \param memory_name The name of this arena in the TinyProfiler output. - */ - void registerForProfiling (const std::string& memory_name) final; - //! The current amount of heap space used by the CArena object. std::size_t heap_space_used () const noexcept; @@ -191,10 +183,6 @@ protected: std::size_t m_used{0}; //! The amount of memory given out via alloc(). std::size_t m_actually_used{0}; - //! If this arena is profiled by TinyProfiler - bool m_do_profiling = false; - //! 
Data structure used for profiling with TinyProfiler - std::map m_profiling_stats; std::mutex carena_mutex; diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index 42987f47a86..bc5297f4d52 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -5,14 +5,6 @@ #include #include -#ifdef AMREX_TINY_PROFILING -#include -#else -namespace amrex { - struct MemStat {}; -} -#endif - #include #include #include @@ -32,12 +24,6 @@ CArena::~CArena () for (auto const& a : m_alloc) { deallocate_system(a.first, a.second); } - -#ifdef AMREX_TINY_PROFILING - if (m_do_profiling) { - TinyProfiler::DeregisterArena(m_profiling_stats); - } -#endif } void* @@ -53,8 +39,8 @@ CArena::alloc_protected (std::size_t nbytes) { MemStat* stat = nullptr; #ifdef AMREX_TINY_PROFILING - if (m_do_profiling) { - stat = TinyProfiler::memory_alloc(nbytes, m_profiling_stats); + if (m_profiler.m_do_profiling) { + stat = TinyProfiler::memory_alloc(nbytes, m_profiler.m_profiling_stats); } #endif @@ -173,10 +159,10 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax) free_node.size(left_size); } #ifdef AMREX_TINY_PROFILING - if (m_do_profiling) { + if (m_profiler.m_do_profiling) { TinyProfiler::memory_free(busy_it->size(), busy_it->mem_stat()); auto* stat = TinyProfiler::memory_alloc(new_size, - m_profiling_stats); + m_profiler.m_profiling_stats); const_cast(*busy_it).mem_stat(stat); } #endif @@ -186,10 +172,10 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax) } else if (total_size >= szmin) { m_freelist.erase(next_it); #ifdef AMREX_TINY_PROFILING - if (m_do_profiling) { + if (m_profiler.m_do_profiling) { TinyProfiler::memory_free(busy_it->size(), busy_it->mem_stat()); auto* stat = TinyProfiler::memory_alloc(total_size, - m_profiling_stats); + m_profiler.m_profiling_stats); const_cast(*busy_it).mem_stat(stat); } #endif @@ -255,9 +241,9 @@ CArena::shrink_in_place (void* pt, std::size_t new_size) m_actually_used -= leftover_size; 
#ifdef AMREX_TINY_PROFILING - if (m_do_profiling) { + if (m_profiler.m_do_profiling) { TinyProfiler::memory_free(old_size, busy_it->mem_stat()); - auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats); + auto* stat = TinyProfiler::memory_alloc(new_size, m_profiler.m_profiling_stats); const_cast(*busy_it).mem_stat(stat); } #endif @@ -431,15 +417,6 @@ CArena::hasFreeDeviceMemory (std::size_t sz) } } -void -CArena::registerForProfiling ([[maybe_unused]] const std::string& memory_name) -{ -#ifdef AMREX_TINY_PROFILING - m_do_profiling = true; - TinyProfiler::RegisterArena(memory_name, m_profiling_stats); -#endif -} - std::size_t CArena::heap_space_used () const noexcept { diff --git a/Src/Base/AMReX_PArena.cpp b/Src/Base/AMReX_PArena.cpp index 36155f3d32c..bbe2717ab07 100644 --- a/Src/Base/AMReX_PArena.cpp +++ b/Src/Base/AMReX_PArena.cpp @@ -62,6 +62,7 @@ PArena::alloc (std::size_t nbytes) AMREX_HIP_SAFE_CALL(hipMallocAsync(&p, nbytes, m_pool, Gpu::gpuStream()));, AMREX_CUDA_SAFE_CALL(cudaMallocAsync(&p, nbytes, m_pool, Gpu::gpuStream())); ) + m_profiler.profile_alloc(p, nbytes); return p; } else #endif @@ -93,6 +94,7 @@ PArena::free (void* p) #if defined (AMREX_GPU_STREAM_ALLOC_SUPPORT) if (Gpu::Device::memoryPoolsSupported()) { + m_profiler.profile_free(p); AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipFreeAsync(p, Gpu::gpuStream()));, AMREX_CUDA_SAFE_CALL(cudaFreeAsync(p, Gpu::gpuStream())); diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H index 0228949beb1..9a0731175bd 100644 --- a/Src/Base/AMReX_TinyProfiler.H +++ b/Src/Base/AMReX_TinyProfiler.H @@ -57,7 +57,7 @@ public: static void MemoryInitialize () noexcept; static void MemoryFinalize (bool bFlushing = false) noexcept; - static void RegisterArena (const std::string& memory_name, + static bool RegisterArena (const std::string& memory_name, std::map& memstats) noexcept; static void DeregisterArena (std::map& memstats) noexcept; diff --git a/Src/Base/AMReX_TinyProfiler.cpp 
b/Src/Base/AMReX_TinyProfiler.cpp index db922745784..fab9e38aa82 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -490,14 +490,15 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept if(os) { os->precision(oldprec); } } -void +bool TinyProfiler::RegisterArena (const std::string& memory_name, std::map& memstats) noexcept { - if (!memprof_enabled) { return; } + if (!memprof_enabled) { return false; } all_memstats.push_back(&memstats); all_memnames.push_back(memory_name); + return true; } void From de4dc974dda7feead09d4cf03cc7250bf80d5bc2 Mon Sep 17 00:00:00 2001 From: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com> Date: Tue, 3 Sep 2024 00:33:11 +0200 Subject: [PATCH 05/18] CTOParallelFor with BoxND / add AnyCTO (#4109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This PR adds support for BoxND to CTOParallelFor by adding the AnyCTO function, which can be used to implement compile time options with any kernel launching function such as ParallelFor, ParallelForRNG, launch, etc. I'm not sure if AnyCTO is a good name, are there other suggestions? ## Additional background AnyCTO Examples: ``` C++ int A_runtime_option = ...; int B_runtime_option = ...; enum A_options : int { A0, A1, A2, A3 }; enum B_options : int { B0, B1 }; AnyCTO(TypeList, CompileTimeOptions>{}, {A_runtime_option, B_runtime_option}, [&](auto cto_func){ ParallelForRNG(N, cto_func); }, [=] AMREX_GPU_DEVICE (int i, const RandomEngine& engine, auto A_control, auto B_control) { ... if constexpr (A_control.value == A0) { ... } else if constexpr (A_control.value == A1) { ... } else if constexpr (A_control.value == A2) { ... } else { ... } if constexpr (A_control.value != A3 && B_control.value == B1) { ... } ... 
} ); constexpr int nthreads_per_block = ...; int nblocks = ...; AnyCTO(TypeList, CompileTimeOptions>{}, {A_runtime_option, B_runtime_option}, [&](auto cto_func){ launch(nblocks, Gpu::gpuStream(), cto_func); }, [=] AMREX_GPU_DEVICE (auto A_control, auto B_control){ ... } ); ``` Additionally, .GetOptions() can be used to use the compile time options in the function that launches the kernel: ```C++ int nthreads_per_block = ...; AnyCTO(TypeList>{}, {nthreads_per_block}, [&](auto cto_func){ constexpr std::array ctos = cto_func.GetOptions(); constexpr int c_nthreads_per_block = ctos[0]; ParallelFor(N, cto_func); }, [=] AMREX_GPU_DEVICE (int i, auto){ ... } ); BoxND<6> box6D = ...; int dims_needed = ...; AnyCTO(TypeList>{}, {dims_needed}, [&](auto cto_func){ constexpr std::array ctos = cto_func.GetOptions(); constexpr int c_dims_needed = ctos[0]; const auto box = BoxShrink(box6D); ParallelFor(box, cto_func); }, [=] AMREX_GPU_DEVICE (auto intvect, auto) -> decltype(void(intvect.size())) { ... } ); ``` --- Src/Base/AMReX_CTOParallelForImpl.H | 261 ++++++++++++++++++---------- 1 file changed, 174 insertions(+), 87 deletions(-) diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H index 35e0ec3be7b..8f7e8ce567f 100644 --- a/Src/Base/AMReX_CTOParallelForImpl.H +++ b/Src/Base/AMReX_CTOParallelForImpl.H @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -18,7 +18,7 @@ namespace amrex { template struct CompileTimeOptions { - // TypeList is defined in AMReX_Tuple.H + // TypeList is defined in AMReX_TypeList.H using list_type = TypeList...>; }; @@ -26,65 +26,42 @@ struct CompileTimeOptions { namespace detail { - template - std::enable_if_t || std::is_same_v, bool> - ParallelFor_helper2 (T const& N, F const& f, TypeList, - std::array const& runtime_options) - { - if (runtime_options == std::array{As::value...}) { - if constexpr (std::is_integral_v) { - ParallelFor(N, [f] AMREX_GPU_DEVICE (T i) noexcept - { - f(i, As{}...); - 
}); - } else { - ParallelFor(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept - { - f(i, j, k, As{}...); - }); - } - return true; - } else { - return false; + template + struct CTOWrapper { + F f; + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + auto operator() (Args... args) const noexcept + -> decltype(f(args..., std::integral_constant{}...)) { + return f(args..., std::integral_constant{}...); } - } - template - std::enable_if_t, bool> - ParallelFor_helper2 (Box const& box, T ncomp, F const& f, TypeList, - std::array const& runtime_options) + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr + std::array GetOptions () noexcept { + return {ctr...}; + } + }; + + template + bool + AnyCTO_helper2 (const L& l, const F& f, TypeList, + std::array const& runtime_options) { if (runtime_options == std::array{As::value...}) { - ParallelFor(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept - { - f(i, j, k, n, As{}...); - }); + l(CTOWrapper{f}); return true; } else { return false; } } - template - std::enable_if_t || std::is_same_v> - ParallelFor_helper1 (T const& N, F const& f, TypeList, - RO const& runtime_options) - { - bool found_option = (false || ... || - ParallelFor_helper2(N, f, - PPs{}, runtime_options)); - amrex::ignore_unused(found_option); - AMREX_ASSERT(found_option); - } - - template - std::enable_if_t> - ParallelFor_helper1 (Box const& box, T ncomp, F const& f, TypeList, - RO const& runtime_options) + template + void + AnyCTO_helper1 (const L& l, const F& f, TypeList, RO const& runtime_options) { - bool found_option = (false || ... || - ParallelFor_helper2(box, ncomp, f, - PPs{}, runtime_options)); + bool found_option = (false || ... 
|| AnyCTO_helper2(l, f, PPs{}, runtime_options)); amrex::ignore_unused(found_option); AMREX_ASSERT(found_option); } @@ -92,51 +69,161 @@ namespace detail #endif -template -std::enable_if_t> -ParallelFor (TypeList /*list_of_compile_time_options*/, +/** + * \brief Compile time optimization of kernels with run time options. + * + * This is a generalized version of ParallelFor with CTOs that can support any function that + * takes in one lambda to launch a GPU kernel such as ParallelFor, ParallelForRNG, launch, etc. + * It uses fold expression to generate kernel launches for all combinations + * of the run time options. The kernel function can use constexpr if to + * discard unused code blocks for better run time performance. In the + * example below, the code will be expanded into 4*2=8 normal ParallelForRNGs + * for all combinations of the run time parameters. + \verbatim + int A_runtime_option = ...; + int B_runtime_option = ...; + enum A_options : int { A0, A1, A2, A3 }; + enum B_options : int { B0, B1 }; + AnyCTO(TypeList, + CompileTimeOptions>{}, + {A_runtime_option, B_runtime_option}, + [&](auto cto_func){ + ParallelForRNG(N, cto_func); + }, + [=] AMREX_GPU_DEVICE (int i, const RandomEngine& engine, + auto A_control, auto B_control) + { + ... + if constexpr (A_control.value == A0) { + ... + } else if constexpr (A_control.value == A1) { + ... + } else if constexpr (A_control.value == A2) { + ... + } else { + ... + } + if constexpr (A_control.value != A3 && B_control.value == B1) { + ... + } + ... + } + ); + + constexpr int nthreads_per_block = ...; + int nblocks = ...; + AnyCTO(TypeList, + CompileTimeOptions>{}, + {A_runtime_option, B_runtime_option}, + [&](auto cto_func){ + launch(nblocks, Gpu::gpuStream(), cto_func); + }, + [=] AMREX_GPU_DEVICE (auto A_control, auto B_control){ + ... + } + ); + \endverbatim + * The static member function cto_func.GetOptions() can be used to obtain the runtime_options + * passed into AnyCTO, but at compile time. 
This enables some advanced use cases, + * such as changing the number of threads per block or the dimensionality of ParallelFor at runtime. + * For the second example -> decltype(void(intvect.size())) is necessary to + * disambiguate IntVectND<1> and int for the first argument of the kernel function. + \verbatim + int nthreads_per_block = ...; + AnyCTO(TypeList>{}, + {nthreads_per_block}, + [&](auto cto_func){ + constexpr std::array ctos = cto_func.GetOptions(); + constexpr int c_nthreads_per_block = ctos[0]; + ParallelFor(N, cto_func); + }, + [=] AMREX_GPU_DEVICE (int i, auto){ + ... + } + ); + + BoxND<6> box6D = ...; + int dims_needed = ...; + AnyCTO(TypeList>{}, + {dims_needed}, + [&](auto cto_func){ + constexpr std::array ctos = cto_func.GetOptions(); + constexpr int c_dims_needed = ctos[0]; + const auto box = BoxShrink(box6D); + ParallelFor(box, cto_func); + }, + [=] AMREX_GPU_DEVICE (auto intvect, auto) -> decltype(void(intvect.size())) { + ... + } + ); + \endverbatim + + * Note that due to a limitation of CUDA's extended device lambda, the + * constexpr if block cannot be the one that captures a variable first. + * If nvcc complains about it, you will have to manually capture it outside + * constexpr if. Alternatively, the constexpr if can be replaced with a regular if. + * Compilers can still perform the same optimizations since the condition is known at compile time. + * The data type for the parameters is int. + * + * \param list_of_compile_time_options list of all possible values of the parameters. + * \param runtime_options the run time parameters. + * \param l a callable object containing a CPU function that launches the provided GPU kernel. + * \param f a callable object containing the GPU kernel with optimizations. 
+ */ +template +void AnyCTO ([[maybe_unused]] TypeList list_of_compile_time_options, std::array const& runtime_options, - T N, F&& f) + L&& l, F&& f) { #if (__cplusplus >= 201703L) - detail::ParallelFor_helper1(N, std::forward(f), - CartesianProduct(typename CTOs::list_type{}...), - runtime_options); + detail::AnyCTO_helper1(std::forward(l), std::forward(f), + CartesianProduct(typename CTOs::list_type{}...), + runtime_options); #else - amrex::ignore_unused(N, f, runtime_options); + amrex::ignore_unused(runtime_options, l, f); static_assert(std::is_integral::value, "This requires C++17"); #endif } -template -void ParallelFor (TypeList /*list_of_compile_time_options*/, +template +std::enable_if_t> +ParallelFor (TypeList ctos, + std::array const& runtime_options, + T N, F&& f) +{ + AnyCTO(ctos, runtime_options, + [&](auto cto_func){ + ParallelFor(N, cto_func); + }, + std::forward(f) + ); +} + +template +void ParallelFor (TypeList ctos, std::array const& runtime_options, - Box const& box, F&& f) + BoxND const& box, F&& f) { -#if (__cplusplus >= 201703L) - detail::ParallelFor_helper1(box, std::forward(f), - CartesianProduct(typename CTOs::list_type{}...), - runtime_options); -#else - amrex::ignore_unused(box, f, runtime_options); - static_assert(std::is_integral::value, "This requires C++17"); -#endif + AnyCTO(ctos, runtime_options, + [&](auto cto_func){ + ParallelFor(box, cto_func); + }, + std::forward(f) + ); } -template +template std::enable_if_t> -ParallelFor (TypeList /*list_of_compile_time_options*/, +ParallelFor (TypeList ctos, std::array const& runtime_options, - Box const& box, T ncomp, F&& f) + BoxND const& box, T ncomp, F&& f) { -#if (__cplusplus >= 201703L) - detail::ParallelFor_helper1(box, ncomp, std::forward(f), - CartesianProduct(typename CTOs::list_type{}...), - runtime_options); -#else - amrex::ignore_unused(box, ncomp, f, runtime_options); - static_assert(std::is_integral::value, "This requires C++17"); -#endif + AnyCTO(ctos, runtime_options, + 
[&](auto cto_func){ + ParallelFor(box, ncomp, cto_func); + }, + std::forward(f) + ); } /** @@ -164,7 +251,7 @@ ParallelFor (TypeList /*list_of_compile_time_options*/, ... } else if constexpr (A_control.value == A2) { ... - else { + } else { ... } if constexpr (A_control.value != A3 && B_control.value == B1) { @@ -218,7 +305,7 @@ ParallelFor (TypeList ctos, ... } else if constexpr (A_control.value == A2) { ... - else { + } else { ... } if constexpr (A_control.value != A3 && B_control.value == B1) { @@ -237,10 +324,10 @@ ParallelFor (TypeList ctos, * \param box a Box specifying the 3D for loop's range. * \param f a callable object taking three integers and working on the given cell. */ -template +template void ParallelFor (TypeList ctos, std::array const& option, - Box const& box, F&& f) + BoxND const& box, F&& f) { ParallelFor(ctos, option, box, std::forward(f)); } @@ -271,7 +358,7 @@ void ParallelFor (TypeList ctos, ... } else if constexpr (A_control.value == A2) { ... - else { + } else { ... } if constexpr (A_control.value != A3 && B_control.value == B1) { @@ -291,11 +378,11 @@ void ParallelFor (TypeList ctos, * \param ncomp an integer specifying the range for iteration over components. * \param f a callable object taking three integers and working on the given cell. */ -template +template std::enable_if_t> ParallelFor (TypeList ctos, std::array const& option, - Box const& box, T ncomp, F&& f) + BoxND const& box, T ncomp, F&& f) { ParallelFor(ctos, option, box, ncomp, std::forward(f)); } From 8627fbe22d527a00bc477244dc7c99e9c29090a0 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 4 Sep 2024 12:00:06 -0500 Subject: [PATCH 06/18] ArenaProfiler: Fix clang-tidy warning (#4128) When TINY_PROFILING is off, ~ArenaProfiler is trivial. So clang-tidy wants us to use `= default`. 
--- Src/Base/AMReX_Arena.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index c2de5464574..2320ab8c664 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -660,14 +660,21 @@ The_Comms_Arena () } } -Arena::ArenaProfiler::~ArenaProfiler () { #ifdef AMREX_TINY_PROFILING + +Arena::ArenaProfiler::~ArenaProfiler () +{ if (m_do_profiling) { TinyProfiler::DeregisterArena(m_profiling_stats); } -#endif } +#else + +Arena::ArenaProfiler::~ArenaProfiler () = default; + +#endif + void Arena::ArenaProfiler::profile_alloc ([[maybe_unused]] void* ptr, [[maybe_unused]] std::size_t nbytes) { #ifdef AMREX_TINY_PROFILING From dea9bb1b6dfaac0b295e1a62480b9afcb8b9ca15 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 4 Sep 2024 12:07:26 -0500 Subject: [PATCH 07/18] TinyProfiler: A few updates (#4102) * Make it safe for Initialize and Finalize being called multiple times. * Delay the reading of tiny_profiler.output_file so that if the user wants to override the default, they do not have to do it by passing a function to amrex::Initialize. * Update the documentation, thanks to @ax3l's suggestion. --- .../source/RuntimeParameters.rst | 4 +- Src/Base/AMReX_TinyProfiler.H | 1 + Src/Base/AMReX_TinyProfiler.cpp | 71 +++++++++++++------ 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/Docs/sphinx_documentation/source/RuntimeParameters.rst b/Docs/sphinx_documentation/source/RuntimeParameters.rst index c57c74c8fb1..4e9f4196804 100644 --- a/Docs/sphinx_documentation/source/RuntimeParameters.rst +++ b/Docs/sphinx_documentation/source/RuntimeParameters.rst @@ -1234,5 +1234,5 @@ enabled. If this parameter is empty, the output of tiny profiling is dumped on the default out stream of AMReX. If it's not empty, it specifies the file - name for the output. Note that ``/dev/null`` is a special name that mean - a null file. + name for the output. 
Note that ``/dev/null`` is a special name that means + no output. diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H index 9a0731175bd..3c7f2da7758 100644 --- a/Src/Base/AMReX_TinyProfiler.H +++ b/Src/Base/AMReX_TinyProfiler.H @@ -143,6 +143,7 @@ private: static bool memprof_enabled; static std::string output_file; + static std::string const& get_output_file (); static void PrintStats (std::map& regstats, double dt_max, std::ostream* os); static void PrintMemStats (std::map& memstats, diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp index fab9e38aa82..5b4a5ef3b44 100644 --- a/Src/Base/AMReX_TinyProfiler.cpp +++ b/Src/Base/AMReX_TinyProfiler.cpp @@ -56,6 +56,8 @@ std::string TinyProfiler::output_file; namespace { constexpr char mainregion[] = "main"; + bool finalized = false; + bool memprof_finalized = false; } TinyProfiler::TinyProfiler (std::string funcname) noexcept @@ -318,23 +320,14 @@ TinyProfiler::Initialize () noexcept pp.queryAdd("print_threshold", print_threshold); pp.queryAdd("enabled", enabled); - pp.queryAdd("output_file", output_file); } if (!enabled) { return; } - if (ParallelDescriptor::IOProcessor()) { - static bool first = true; - if (first && !output_file.empty() && output_file != "/dev/null") { - if (FileSystem::Exists(output_file)) { - FileSystem::Remove(output_file); - } - first = false; - } - } - regionstack.emplace_back(mainregion); t_init = amrex::second(); + + finalized = false; } void @@ -352,6 +345,8 @@ TinyProfiler::MemoryInitialize () noexcept #ifdef AMREX_USE_OMP mem_stack_thread_private.resize(omp_get_max_threads()); #endif + + memprof_finalized = false; } void @@ -359,7 +354,6 @@ TinyProfiler::Finalize (bool bFlushing) noexcept { if (!enabled) { return; } - static bool finalized = false; if (!bFlushing) { // If flushing, don't make this the last time! 
if (finalized) { return; @@ -388,12 +382,13 @@ TinyProfiler::Finalize (bool bFlushing) noexcept std::ostream* os = nullptr; std::streamsize oldprec = 0; if (ParallelDescriptor::IOProcessor()) { - if (output_file.empty()) { + auto const& ofile = get_output_file(); + if (ofile.empty()) { os = &(amrex::OutStream()); - } else if (output_file != "/dev/null") { - ofs.open(output_file, std::ios_base::app); + } else if (ofile != "/dev/null") { + ofs.open(ofile, std::ios_base::app); if (!ofs.is_open()) { - amrex::Error("TinyProfiler failed to open "+output_file); + amrex::Error("TinyProfiler failed to open "+ofile); } os = static_cast(&ofs); } @@ -439,6 +434,12 @@ TinyProfiler::Finalize (bool bFlushing) noexcept } } + if (!bFlushing) { + regionstack.clear(); + ttstack.clear(); + statsmap.clear(); + } + if(os) { os->precision(oldprec); } } @@ -449,12 +450,11 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept // This function must be called BEFORE the profiled arenas are deleted - static bool finalized = false; if (!bFlushing) { // If flushing, don't make this the last time! 
- if (finalized) { + if (memprof_finalized) { return; } else { - finalized = true; + memprof_finalized = true; } } @@ -467,12 +467,13 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept std::ostream* os = nullptr; std::streamsize oldprec = 0; if (ParallelDescriptor::IOProcessor()) { - if (output_file.empty()) { + auto const& ofile = get_output_file(); + if (ofile.empty()) { os = &(amrex::OutStream()); - } else if (output_file != "/dev/null") { - ofs.open(output_file, std::ios_base::app); + } else if (ofile != "/dev/null") { + ofs.open(ofile, std::ios_base::app); if (!ofs.is_open()) { - amrex::Error("TinyProfiler failed to open "+output_file); + amrex::Error("TinyProfiler failed to open "+ofile); } os = static_cast(&ofs); } @@ -951,4 +952,28 @@ TinyProfiler::PrintCallStack (std::ostream& os) } } +std::string const& +TinyProfiler::get_output_file () +{ + // Instead of reading it only once, we could try to read the parameter + // every time. But I am not sure how useful that might be. + static bool first = true; + if (first) { + first = false; + + amrex::ParmParse pp("tiny_profiler"); + pp.query("output_file", output_file); + + if (ParallelDescriptor::IOProcessor()) { + if (!output_file.empty() && output_file != "/dev/null") { + if (FileSystem::Exists(output_file)) { + FileSystem::Remove(output_file); + } + } + } + } + + return output_file; +} + } From 65d10a1a7592ecf7e81a92d5c5863a0d1f1e3662 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 4 Sep 2024 12:08:40 -0500 Subject: [PATCH 08/18] Fortran Interfaces: Add new average down functions (#4124) Add average down function for cell-centered data without volume weighting. Add average down function for nodal data. 
--- .../Base/AMReX_multifabutil_fi.cpp | 10 +++++-- .../Base/AMReX_multifabutil_mod.F90 | 26 ++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp b/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp index d59c2d02053..61d0385276b 100644 --- a/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp +++ b/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp @@ -5,12 +5,18 @@ using namespace amrex; extern "C" { void amrex_fi_average_down (const MultiFab* S_fine, MultiFab* S_crse, - const Geometry* fgeom, const Geometry* cgeom, - int scomp, int ncomp, int rr) + const Geometry* fgeom, const Geometry* cgeom, + int scomp, int ncomp, int rr) { amrex::average_down(*S_fine, *S_crse, *fgeom, *cgeom, scomp, ncomp, rr); } + void amrex_fi_average_down_cell_node (const MultiFab* S_fine, MultiFab* S_crse, + int scomp, int ncomp, int rr) + { + amrex::average_down(*S_fine, *S_crse, scomp, ncomp, rr); + } + void amrex_fi_average_down_faces (MultiFab const* fmf[], MultiFab* cmf[], Geometry const* cgeom, int scomp, int ncomp, int rr) diff --git a/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90 b/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90 index 9575b217a73..643f661649f 100644 --- a/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90 +++ b/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90 @@ -8,7 +8,11 @@ module amrex_multifabutil_module implicit none private - public :: amrex_average_down, amrex_average_down_faces, amrex_average_cellcenter_to_face + public :: amrex_average_down, & ! volume weighted average down of cell data + & amrex_average_down_cell, & ! average down of cell data + & amrex_average_down_node, & ! average down of nodal data + & amrex_average_down_faces, & ! average down of face data + & amrex_average_cellcenter_to_face ! 
average from cell centers to faces interface subroutine amrex_fi_average_down (fmf, cmf, fgeom, cgeom, scomp, ncomp, rr) bind(c) @@ -18,6 +22,13 @@ subroutine amrex_fi_average_down (fmf, cmf, fgeom, cgeom, scomp, ncomp, rr) bind integer(c_int), value :: scomp, ncomp, rr end subroutine amrex_fi_average_down + subroutine amrex_fi_average_down_cell_node (fmf, cmf, scomp, ncomp, rr) bind(c) + import + implicit none + type(c_ptr), value :: fmf, cmf + integer(c_int), value :: scomp, ncomp, rr + end subroutine amrex_fi_average_down_cell_node + subroutine amrex_fi_average_down_faces (fmf, cmf, cgeom, scomp, ncomp, rr) bind(c) import implicit none @@ -45,6 +56,19 @@ subroutine amrex_average_down (fmf, cmf, fgeom, cgeom, scomp, ncomp, rr) call amrex_fi_average_down(fmf%p, cmf%p, fgeom%p, cgeom%p, scomp-1, ncomp, rr) end subroutine amrex_average_down + subroutine amrex_average_down_cell (fmf, cmf, scomp, ncomp, rr) + type(amrex_multifab), intent(in ) :: fmf + type(amrex_multifab), intent(inout) :: cmf + integer, intent(in) :: scomp, ncomp, rr + call amrex_fi_average_down_cell_node(fmf%p, cmf%p, scomp-1, ncomp, rr) + end subroutine amrex_average_down_cell + + subroutine amrex_average_down_node (fmf, cmf, scomp, ncomp, rr) + type(amrex_multifab), intent(in ) :: fmf + type(amrex_multifab), intent(inout) :: cmf + integer, intent(in) :: scomp, ncomp, rr + call amrex_fi_average_down_cell_node(fmf%p, cmf%p, scomp-1, ncomp, rr) + end subroutine amrex_average_down_node subroutine amrex_average_down_faces (fmf, cmf, cgeom, scomp, ncomp, rr) type(amrex_multifab), intent(in ) :: fmf(amrex_spacedim) From 61b91f9e52a13718384be076b02627068721f619 Mon Sep 17 00:00:00 2001 From: Bruce Perry Date: Wed, 4 Sep 2024 14:00:29 -0600 Subject: [PATCH 09/18] use perl instead of sed in style checks for portability to MacOS (#4127) ## Summary `sed -i` does not function the same for GNU and BSD sed and a simple portable command between the two is apparently not possible. 
Therefore, the style check scripts do not work by default when running on Macs, unless the user installs `gsed` and aliases `sed` to run that instead. For portability,`perl` can be used instead of `sed`. ## Additional background Proposed solution taken from here: https://stackoverflow.com/questions/4247068/sed-command-with-i-option-failing-on-mac-but-works-on-linux --- .github/workflows/style/check_tabs.sh | 2 +- .github/workflows/style/check_trailing_whitespaces.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/style/check_tabs.sh b/.github/workflows/style/check_tabs.sh index f4418644e3a..e8bb9f14bc3 100755 --- a/.github/workflows/style/check_tabs.sh +++ b/.github/workflows/style/check_tabs.sh @@ -20,7 +20,7 @@ find . -type d \( -name .git \ -a ! -name "*.lex.h" -a ! -name "*.lex.nolint.H" \) \ \) \ -exec grep -Iq . {} \; \ - -exec sed -i 's/\t/\ \ \ \ /g' {} + + -exec perl -i -pe's/\t/\ \ \ \ /g' {} + gitdiff=`git diff` diff --git a/.github/workflows/style/check_trailing_whitespaces.sh b/.github/workflows/style/check_trailing_whitespaces.sh index a0f941f8c2b..a4ece8a0ee4 100755 --- a/.github/workflows/style/check_trailing_whitespaces.sh +++ b/.github/workflows/style/check_trailing_whitespaces.sh @@ -20,7 +20,7 @@ find . -type d \( -name .git \ -a ! -name "*.lex.h" -a ! -name "*.lex.nolint.H" \) \ \) \ -exec grep -Iq . {} \; \ - -exec sed -i 's/[[:blank:]]\+$//g' {} + + -exec perl -i -pe's/[[:blank:]]+$//g' {} + gitdiff=`git diff` From c88b2b51c1ce54dd7a6ca87b77ffe4402a92e58f Mon Sep 17 00:00:00 2001 From: Bruce Perry Date: Wed, 4 Sep 2024 14:15:37 -0600 Subject: [PATCH 10/18] Capability adds for ParmParse enum (#4119) ## Summary - The ParmParse functions for AMReX Enums added in #4069 did not carry the optional arguments found in other ParmParse functions. Those are added so that, for example, querying an enum has the same interface as querying other types. 
- These functions shouldn't modify the ParmParse object, so they are made `const` - The `prefixedName` function is made a public member instead of a protected member. ## Additional background The use case for these changes is https://github.com/erf-model/ERF/pull/1772 --- Src/Base/AMReX_ParmParse.H | 62 ++++++++++++++++++++++++++---------- Src/Base/AMReX_ParmParse.cpp | 34 ++++++++++---------- 2 files changed, 63 insertions(+), 33 deletions(-) diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index cc9588793da..d49b2cac5d7 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ -1192,14 +1192,18 @@ public: */ template , std::enable_if_t = 0> - int query (const char* name, T& ref) + int query (const char* name, T& ref, int ival = FIRST) const { std::string s; - int exist = this->query(name, s); + int exist = this->query(name, s, ival); if (exist) { try { ref = amrex::getEnum(s); } catch (...) { + if (amrex::Verbose() > 0 ) { + amrex::Print() << "amrex::ParmParse::query (input name: " + << this->prefixedName(name) << "):\n"; + } throw; } } @@ -1216,13 +1220,17 @@ public: */ template , std::enable_if_t = 0> - void get (const char* name, T& ref) + void get (const char* name, T& ref, int ival = FIRST) const { std::string s; - this->get(name, s); + this->get(name, s, ival); try { ref = amrex::getEnum(s); } catch (...) { + if (amrex::Verbose() > 0 ) { + amrex::Print() << "amrex::ParmParse::get (input name: " + << this->prefixedName(name) << "):\n"; + } throw; } } @@ -1230,14 +1238,25 @@ public: //! Query an array of enum values using given name. 
template , std::enable_if_t = 0> - int queryarr (const char* name, std::vector& ref) + int queryarr (const char* name, + std::vector& ref, + int start_ix = FIRST, + int num_val = ALL) const { std::vector s; - int exist = this->queryarr(name, s); + int exist = this->queryarr(name, s, start_ix, num_val); if (exist) { ref.resize(s.size()); for (std::size_t i = 0; i < s.size(); ++i) { - ref[i] = amrex::getEnum(s[i]); + try { + ref[i] = amrex::getEnum(s[i]); + } catch (...) { + if (amrex::Verbose() > 0 ) { + amrex::Print() << "amrex::ParmParse::queryarr (input name: " + << this->prefixedName(name) << "):\n"; + } + throw; + } } } return exist; @@ -1246,13 +1265,24 @@ public: //! Get an array of enum values using given name. template , std::enable_if_t = 0> - void getarr (const char* name, std::vector& ref) + void getarr (const char* name, + std::vector& ref, + int start_ix = FIRST, + int num_val = ALL) const { std::vector s; - this->getarr(name, s); + this->getarr(name, s, start_ix, num_val); ref.resize(s.size()); for (std::size_t i = 0; i < s.size(); ++i) { - ref[i] = amrex::getEnum(s[i]); + try { + ref[i] = amrex::getEnum(s[i]); + } catch (...) 
{ + if (amrex::Verbose() > 0 ) { + amrex::Print() << "amrex::ParmParse::getarr (input name: " + << this->prefixedName(name) << "):\n"; + } + throw; + } } } @@ -1268,10 +1298,10 @@ public: */ template , std::enable_if_t = 0> - int query_enum_case_insensitive (const char* name, T& ref) + int query_enum_case_insensitive (const char* name, T& ref, int ival = FIRST) const { std::string s; - int exist = this->query(name, s); + int exist = this->query(name, s, ival); if (exist) { s = amrex::toLower(s); auto const& enum_names = amrex::getEnumNameStrings(); @@ -1303,9 +1333,9 @@ public: */ template , std::enable_if_t = 0> - void get_enum_case_insensitive (const char* name, T& ref) + void get_enum_case_insensitive (const char* name, T& ref, int ival = FIRST) const { - int exist = this->query_enum_case_insensitive(name, ref); + int exist = this->query_enum_case_insensitive(name, ref, ival); if (!exist) { std::string msg("get_enum_case_insensitive(\""); msg.append(name).append("\",").append(amrex::getEnumClassName()) @@ -1387,10 +1417,10 @@ public: static std::string ParserPrefix; -protected: - [[nodiscard]] std::string prefixedName (const std::string_view& str) const; +protected: + std::string m_prefix; // Prefix used in keyword search std::string m_parser_prefix; // Prefix used by Parser Table* m_table; diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index df1e18e9b08..572d9013da8 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -1283,7 +1283,7 @@ ParmParse::query (const char* name, } void -ParmParse::add (const char* name, +ParmParse::add (const char* name, // NOLINT(readability-make-member-function-const) const bool val) { saddval(prefixedName(name),val); @@ -1315,7 +1315,7 @@ ParmParse::query (const char* name, int& ref, int ival) const } void -ParmParse::add (const char* name, const int val) +ParmParse::add (const char* name, const int val) // NOLINT(readability-make-member-function-const) { 
saddval(prefixedName(name),val); } @@ -1349,7 +1349,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1381,7 +1381,7 @@ ParmParse::query (const char* name, long& ref, int ival) const } void -ParmParse::add (const char* name, +ParmParse::add (const char* name, // NOLINT(readability-make-member-function-const) const long val) { saddval(prefixedName(name),val); @@ -1416,7 +1416,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1447,7 +1447,7 @@ ParmParse::query (const char* name, long long& ref, int ival) const } void -ParmParse::add (const char* name, const long long val) +ParmParse::add (const char* name, const long long val) // NOLINT(readability-make-member-function-const) { saddval(prefixedName(name),val); } @@ -1481,7 +1481,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, int start_ix } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1512,7 +1512,7 @@ ParmParse::query (const char* name, float& ref, int ival) const } void -ParmParse::add (const char* name, const float val) +ParmParse::add (const char* name, const float val) // NOLINT(readability-make-member-function-const) { saddval(prefixedName(name),val); } @@ -1546,7 +1546,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const 
char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1579,7 +1579,7 @@ ParmParse::query (const char* name, double& ref, int ival) const } void -ParmParse::add (const char* name, const double val) +ParmParse::add (const char* name, const double val) // NOLINT(readability-make-member-function-const) { saddval(prefixedName(name),val); } @@ -1613,7 +1613,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, int start_ix, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1646,7 +1646,7 @@ ParmParse::query (const char* name, std::string& ref, int ival) const } void -ParmParse::add (const char* name, const std::string& val) +ParmParse::add (const char* name, const std::string& val) // NOLINT(readability-make-member-function-const) { saddval(prefixedName(name),val); } @@ -1680,7 +1680,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1713,7 +1713,7 @@ ParmParse::query (const char* name, IntVect& ref, int ival) const } void -ParmParse::add (const char* name, const IntVect& val) +ParmParse::add (const char* name, const IntVect& val) // NOLINT(readability-make-member-function-const) { saddval(prefixedName(name),val); } @@ -1747,7 +1747,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } @@ -1778,7 +1778,7 @@ ParmParse::query (const char* name, Box& ref, int ival) const } 
void -ParmParse::add (const char* name, const Box& val) +ParmParse::add (const char* name, const Box& val) // NOLINT(readability-make-member-function-const) { saddval(prefixedName(name),val); } @@ -1812,7 +1812,7 @@ ParmParse::queryarr (const char* name, std::vector& ref, } void -ParmParse::addarr (const char* name, const std::vector& ref) +ParmParse::addarr (const char* name, const std::vector& ref) // NOLINT(readability-make-member-function-const) { saddarr(prefixedName(name),ref); } From 41353f6ab5cb7048ccaf491cdee978ad5e993ac1 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 4 Sep 2024 20:54:12 -0500 Subject: [PATCH 11/18] MLMG: Minimum domain width (#4129) By default, the minimum domain width at the coarsest multigrid level is 2 and 4, for non-EB and EB runs, respectively. Previously, it was set to 4 for runs compiled with EB support but without EB at run time. So the results of those runs would be different from runs not compiled with EB support. This is not a correctness issue. Nevertheless, for the sake of consistency, we set the minimum domain width to 4 only if there is non-trivial EB at run time.
--- Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H | 2 +- Src/LinearSolvers/MLMG/AMReX_MLLinOp.H | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H index 9b56b8049df..7b5fb069423 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H @@ -1281,7 +1281,7 @@ MLABecLaplacianT::supportNSolve () const bool support = false; if (this->m_overset_mask[0][0]) { if (this->m_geom[0].back().Domain().coarsenable(MLLinOp::mg_coarsen_ratio, - MLLinOp::mg_domain_min_width) + this->mg_domain_min_width) && this->m_grids[0].back().coarsenable(MLLinOp::mg_coarsen_ratio, MLLinOp::mg_box_min_width)) { support = true; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H index 3cc623b761e..ce6a8b53335 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H @@ -570,11 +570,7 @@ protected: static constexpr int mg_coarsen_ratio = 2; static constexpr int mg_box_min_width = 2; -#ifdef AMREX_USE_EB - static constexpr int mg_domain_min_width = 4; -#else - static constexpr int mg_domain_min_width = 2; -#endif + int mg_domain_min_width = 2; LPInfo info; @@ -803,6 +799,15 @@ MLLinOpT::defineGrids (const Vector& a_geom, { BL_PROFILE("MLLinOp::defineGrids()"); +#ifdef AMREX_USE_EB + if ( ! 
a_factory.empty() ) { + auto const* ebf = dynamic_cast(a_factory[0]); + if (ebf && !(ebf->isAllRegular())) { // Has non-trivial EB + mg_domain_min_width = 4; + } + } +#endif + m_num_amr_levels = 0; for (int amrlev = 0; amrlev < a_geom.size(); amrlev++) { if (!a_grids[amrlev].empty()) { From 216ce6f37de4b65be57fc1006b3457b4fc318e03 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 5 Sep 2024 09:05:19 -0700 Subject: [PATCH 12/18] `ParmParse`: Prefix to `FILE` (#4126) ## Summary For CI/CD workflows and out-of-source tests we often want to include dependent inputs files via `FILE = `. For development, we prefer to run in temporary run directories but want to avoid having to copy over the latest inputs file from a source directory (mostly to avoid confusion between source and copy and to enable rapid development cycles). Now, the environment variable `AMREX_INPUTS_FILE_PREFIX` can be set to prefix every `FILE = ` with a custom path. We will use this in the CTests integration of WarpX. ## Additional background CC @EZoni https://github.com/ECP-WarpX/WarpX/pull/5068 --- Src/Base/AMReX_ParmParse.H | 3 +++ Src/Base/AMReX_ParmParse.cpp | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index d49b2cac5d7..274858ed253 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ -60,6 +60,9 @@ class RealVect; // '\n's. The "FILE = " definition is special. Rather than just // adding this entry to the database, it reads the contents of // into the database. +// For CI/CD workflows and out-of-source tests, the environment variable +// AMREX_INPUTS_FILE_PREFIX can be set to prefix every FILE = +// with a custom path. // // ParmParse stores all entries in a static table which is built the // first time a ParmParse object is constructed (usually in main()). 
diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index 572d9013da8..767bd2a4d0d 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -407,6 +408,19 @@ read_file (const char* fname, ParmParse::Table& tab) // if ( fname != nullptr && fname[0] != 0 ) { + std::string filename = fname; + + // optional prefix to search files in + char const *amrex_inputs_file_prefix_c = std::getenv("AMREX_INPUTS_FILE_PREFIX"); + if (amrex_inputs_file_prefix_c != nullptr) { + // we expect a directory path as the prefix: append a trailing "/" if missing + auto amrex_inputs_file_prefix = std::string(amrex_inputs_file_prefix_c); + if (amrex_inputs_file_prefix.back() != '/') { + amrex_inputs_file_prefix += "/"; + } + filename = amrex_inputs_file_prefix + filename; + } + #ifdef AMREX_USE_MPI if (ParallelDescriptor::Communicator() == MPI_COMM_NULL) { @@ -415,7 +429,6 @@ read_file (const char* fname, ParmParse::Table& tab) #endif Vector fileCharPtr; - std::string filename = fname; ParallelDescriptor::ReadAndBcastFile(filename, fileCharPtr); std::istringstream is(fileCharPtr.data()); From 08eed9562a53c9929e5f522a669649028bac548f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 5 Sep 2024 13:41:38 -0500 Subject: [PATCH 13/18] EB Boundary Area: Fix issues for anisotropic cell size (#4131) * In 2D, the scaling is incorrect. For example, if dx >> dy and the boundary is parallel to the x-direction, it would produce inf for the scaled boundary area. * In 3D, the scaling has been modified so that it's easy to convert from the dimensionless boundary area to physical units.
--- Docs/sphinx_documentation/source/EB.rst | 17 +++++++++++++++++ Src/EB/AMReX_EB2_2D_C.cpp | 3 +-- Src/EB/AMReX_EB2_3D_C.cpp | 4 +++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Docs/sphinx_documentation/source/EB.rst b/Docs/sphinx_documentation/source/EB.rst index 9b847a3259c..2c78856f937 100644 --- a/Docs/sphinx_documentation/source/EB.rst +++ b/Docs/sphinx_documentation/source/EB.rst @@ -271,6 +271,12 @@ following data: // embedded boundary centroid const MultiCutFab& getBndryCent () const; + // embedded boundary normal direction + const MultiCutFab& getBndryNormal () const; + + // embedded boundary surface area + const MultiCutFab& getBndryArea () const; + // area fractions Array getAreaFrac () const; @@ -291,6 +297,17 @@ following data: of the data is in the range of :math:`[-0.5,0.5]`, based on each cell's local coordinates with respect to the regular cell's center. +- **Boundary normal** is in a :cpp:`MultiCutFab` with ``AMREX_SPACEDIM`` + components representing the unit vector pointing toward the covered part. + +- **Boundary area** is in a :cpp:`MultiCutFab` with a single component + representing the dimensionless boundary area. When the cell is isotropic + (i.e., :math:`\Delta x = \Delta y = \Delta z`), it's trivial to convert it + to physical units. If the cell size is anisotropic, the conversion + requires multiplying by a factor of :math:`\sqrt{(n_x \Delta y \Delta + z)^2 + (n_y \Delta x \Delta z)^2 + (n_z \Delta x \Delta y)^2}`, where + :math:`n` is the boundary normal vector. + - **Face centroid** is in a :cpp:`MultiCutFab` with ``AMREX_SPACEDIM`` components. Each component of the data is in the range of :math:`[-0.5,0.5]`, based on each cell's local coordinates with respect to the embedded boundary. 
diff --git a/Src/EB/AMReX_EB2_2D_C.cpp b/Src/EB/AMReX_EB2_2D_C.cpp index b2bbde200c5..b99b5559c77 100644 --- a/Src/EB/AMReX_EB2_2D_C.cpp +++ b/Src/EB/AMReX_EB2_2D_C.cpp @@ -30,8 +30,7 @@ void set_eb_data (const int i, const int j, const Real apnorm = std::hypot(daxp,dayp) + 1.e-30_rt*std::sqrt(dx[0]*dx[1]); const Real nx = daxp * (1.0_rt/apnorm); const Real ny = dayp * (1.0_rt/apnorm); - const Real bareascaling = std::sqrt( (nx*dx[0])*(nx*dx[0]) + - (ny*dx[1])*(ny*dx[1]) ); + const Real bareascaling = std::sqrt(Math::powi<2>(nx*dx[1]) + Math::powi<2>(ny*dx[0])); const Real nxabs = std::abs(nx); const Real nyabs = std::abs(ny); diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp index 73170adaeeb..2a366e587ab 100644 --- a/Src/EB/AMReX_EB2_3D_C.cpp +++ b/Src/EB/AMReX_EB2_3D_C.cpp @@ -101,7 +101,9 @@ void set_eb_data (const int i, const int j, const int k, bnorm(i,j,k,0) = nx; bnorm(i,j,k,1) = ny; bnorm(i,j,k,2) = nz; - barea(i,j,k) = (nx*dapx/(dx[1]*dx[2]) + ny*dapy/(dx[0]*dx[2]) + nz*dapz/(dx[0]*dx[1])); + barea(i,j,k) = (nx*dapx + ny*dapy + nz*dapz) / (Math::powi<2>(nx*dx[1]*dx[2]) + + Math::powi<2>(ny*dx[0]*dx[2]) + + Math::powi<2>(nz*dx[0]*dx[1])); Real aax = 0.5_rt*(axm+axp); Real aay = 0.5_rt*(aym+ayp); From 13d20a21103b53d7942c3d41b247e51ca695b1f3 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 5 Sep 2024 14:25:02 -0500 Subject: [PATCH 14/18] Add iMultiFab::sum that returns the sum over a region (#4132) --- Src/Base/AMReX_iMultiFab.H | 5 ++++ Src/Base/AMReX_iMultiFab.cpp | 45 ++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/Src/Base/AMReX_iMultiFab.H b/Src/Base/AMReX_iMultiFab.H index eb1e350433a..39896975b7b 100644 --- a/Src/Base/AMReX_iMultiFab.H +++ b/Src/Base/AMReX_iMultiFab.H @@ -164,6 +164,11 @@ public: */ [[nodiscard]] Long sum (int comp, int nghost = 0, bool local = false) const; + /** + * \brief Returns the sum of component "comp" in the given "region". -- no ghost cells are included. 
+ */ + [[nodiscard]] Long sum (Box const& region, int comp = 0, bool local = false) const; + /** * \brief Adds the scalar value val to the value of each cell in the * specified subregion of the iMultiFab. The subregion consists diff --git a/Src/Base/AMReX_iMultiFab.cpp b/Src/Base/AMReX_iMultiFab.cpp index 090f1ca96d6..c78e3b4a367 100644 --- a/Src/Base/AMReX_iMultiFab.cpp +++ b/Src/Base/AMReX_iMultiFab.cpp @@ -431,6 +431,51 @@ iMultiFab::sum (int comp, int nghost, bool local) const return sm; } +Long +iMultiFab::sum (Box const& region, int comp, bool local) const +{ + BL_PROFILE("iMultiFab::sum(region)"); + + Long sm = 0; + +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& ma = this->const_arrays(); + sm = ParReduce(TypeList{}, TypeList{}, *this, IntVect(0), + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple + { + return (region.contains(i,j,k)) ? static_cast(ma[box_no](i,j,k,comp)) : Long(0); + }); + } + else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm) +#endif + for (MFIter mfi(*this,true); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox() & region; + if (bx.ok()) { + Array4 const& fab = this->const_array(mfi); + auto tmp = Long(0); + AMREX_LOOP_3D(bx, i, j, k, + { + tmp += fab(i,j,k,comp); + }); + sm += tmp; + } + } + } + + if (!local) { + ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); + } + + return sm; +} + namespace { IntVect From ad118ded0068fee42c829e2ae8656872e1764dd6 Mon Sep 17 00:00:00 2001 From: Andrew Myers Date: Thu, 5 Sep 2024 13:14:53 -0700 Subject: [PATCH 15/18] Fix unused variables in IntVect (#4133) The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate 
--- Src/Base/AMReX_IntVect.H | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Src/Base/AMReX_IntVect.H b/Src/Base/AMReX_IntVect.H index 6452f29b036..64457f123ed 100644 --- a/Src/Base/AMReX_IntVect.H +++ b/Src/Base/AMReX_IntVect.H @@ -55,8 +55,11 @@ public: static constexpr unsigned shift1 = sizeof(size_t)>=8 ? 20 : 10; static constexpr unsigned shift2 = sizeof(size_t)>=8 ? 40 : 20; if constexpr (dim == 1) { + amrex::ignore_unused(shift1); + amrex::ignore_unused(shift2); return static_cast(vec[0]); } else if constexpr (dim == 2) { + amrex::ignore_unused(shift2); return static_cast(vec[0]) ^ (static_cast(vec[1]) << shift1); } else if constexpr (dim == 3) { From b454719b10b8d73307852788e54cc62db99a6814 Mon Sep 17 00:00:00 2001 From: Andrew Myers Date: Thu, 5 Sep 2024 13:40:29 -0700 Subject: [PATCH 16/18] Use BL_PROFILE instead of BL_PROFILE_VAR to time in knapsack()swap (#4134) This timer was using `BL_PROFILE_VAR` without an associated `BL_PROFILE_STOP`, resulting in unused variable warnings. It seems to me that we can just use `BL_PROFILE` here and rely on scope to stop the timer. 
The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Base/AMReX_DistributionMapping.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp index cfe16b346a1..45304cf5e04 100644 --- a/Src/Base/AMReX_DistributionMapping.cpp +++ b/Src/Base/AMReX_DistributionMapping.cpp @@ -651,7 +651,7 @@ knapsack (const std::vector& wgts, if (efficiency < max_efficiency && do_full_knapsack && wblv.size() > 1 && wblv.begin()->size() > 1) { - BL_PROFILE_VAR("knapsack()swap", swap); + BL_PROFILE("knapsack()swap"); top: ; if (efficiency < max_efficiency && wblv.begin()->size() > 1) @@ -980,7 +980,7 @@ DistributionMapping::KnapSackProcessorMap (const DistributionMapping& olddm, new_efficiency = avg_weight / max_weight; if (new_efficiency < max_efficiency && wblv.size() > 1) { - BL_PROFILE_VAR("knapsack()swap", swap); + BL_PROFILE("knapsack()swap"); std::sort(wblv.begin(), wblv.end()); From 87de52ef071811629120df6247c53e0e6840c4ec Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Fri, 6 Sep 2024 08:52:12 -0500 Subject: [PATCH 17/18] Fix a bug in #4131 (#4138) Missing sqrt. 
--- Src/EB/AMReX_EB2_3D_C.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp index 2a366e587ab..2d02e53bdc7 100644 --- a/Src/EB/AMReX_EB2_3D_C.cpp +++ b/Src/EB/AMReX_EB2_3D_C.cpp @@ -101,9 +101,9 @@ void set_eb_data (const int i, const int j, const int k, bnorm(i,j,k,0) = nx; bnorm(i,j,k,1) = ny; bnorm(i,j,k,2) = nz; - barea(i,j,k) = (nx*dapx + ny*dapy + nz*dapz) / (Math::powi<2>(nx*dx[1]*dx[2]) + - Math::powi<2>(ny*dx[0]*dx[2]) + - Math::powi<2>(nz*dx[0]*dx[1])); + barea(i,j,k) = (nx*dapx + ny*dapy + nz*dapz) / std::sqrt(Math::powi<2>(nx*dx[1]*dx[2]) + + Math::powi<2>(ny*dx[0]*dx[2]) + + Math::powi<2>(nz*dx[0]*dx[1])); Real aax = 0.5_rt*(axm+axp); Real aay = 0.5_rt*(aym+ayp); From 51db5ece14372d6f770531054a52edd4abd07b23 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Sat, 7 Sep 2024 19:08:34 -0500 Subject: [PATCH 18/18] amrex::Stack (#4139) Move the Stack class used in Parser to its own header so that it can be used by others. The Stack class has a fixed maximum size. This is useful for traversing a tree on GPU, because recursive functions do not work well in device code.
--- Src/Base/AMReX_Stack.H | 24 ++++++++++++++++++++++++ Src/Base/CMakeLists.txt | 1 + Src/Base/Make.package | 2 ++ Src/Base/Parser/AMReX_IParser_Exe.H | 15 ++------------- Src/Base/Parser/AMReX_Parser_Exe.H | 15 ++------------- 5 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 Src/Base/AMReX_Stack.H diff --git a/Src/Base/AMReX_Stack.H b/Src/Base/AMReX_Stack.H new file mode 100644 index 00000000000..8e9e83f8682 --- /dev/null +++ b/Src/Base/AMReX_Stack.H @@ -0,0 +1,24 @@ +#ifndef AMREX_STACK_H_ +#define AMREX_STACK_H_ + +namespace amrex { + +template +struct Stack +{ +public: + constexpr void push (T v) { m_data[m_size++] = v; } + constexpr void pop () { --m_size; } + [[nodiscard]] constexpr bool empty () const { return m_size == 0; } + [[nodiscard]] constexpr int size () const { return m_size; } + [[nodiscard]] constexpr T const& top () const { return m_data[m_size-1]; } + [[nodiscard]] constexpr T & top () { return m_data[m_size-1]; } + [[nodiscard]] constexpr T operator[] (int i) const { return m_data[i]; } +private: + T m_data[N]; + int m_size = 0; +}; + +} + +#endif diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 0436ad032e4..3a19b917c35 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -31,6 +31,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_parmparse_fi.cpp AMReX_ParmParse.H AMReX_Functional.H + AMReX_Stack.H AMReX_String.H AMReX_String.cpp AMReX_Utility.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index b009ebf7d65..361ca079388 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -23,6 +23,8 @@ C$(AMREX_BASE)_sources += AMReX_PODVector.cpp C$(AMREX_BASE)_headers += AMReX_BlockMutex.H C$(AMREX_BASE)_sources += AMReX_BlockMutex.cpp +C$(AMREX_BASE)_headers += AMReX_Stack.H + C$(AMREX_BASE)_headers += AMReX_String.H C$(AMREX_BASE)_sources += AMReX_String.cpp diff --git a/Src/Base/Parser/AMReX_IParser_Exe.H b/Src/Base/Parser/AMReX_IParser_Exe.H index e7e41c44d48..a68fa32981e 
100644 --- a/Src/Base/Parser/AMReX_IParser_Exe.H +++ b/Src/Base/Parser/AMReX_IParser_Exe.H @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -226,24 +227,12 @@ struct alignas(8) IParserExeJUMP { int offset; }; -template -struct IParserStack -{ - long long m_data[N]; - int m_size = 0; - constexpr void push (long long v) { m_data[m_size++] = v; } - constexpr void pop () { --m_size; } - [[nodiscard]] constexpr long long const& top () const { return m_data[m_size-1]; } - [[nodiscard]] constexpr long long & top () { return m_data[m_size-1]; } - [[nodiscard]] constexpr long long operator[] (int i) const { return m_data[i]; } -}; - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE long long iparser_exe_eval (const char* p, long long const* x) { if (p == nullptr) { return std::numeric_limits::max(); } - IParserStack pstack; + Stack pstack; while (*((iparser_exe_t*)p) != IPARSER_EXE_NULL) { switch (*((iparser_exe_t*)p)) { diff --git a/Src/Base/Parser/AMReX_Parser_Exe.H b/Src/Base/Parser/AMReX_Parser_Exe.H index 37a0b89da78..a5427e6e65c 100644 --- a/Src/Base/Parser/AMReX_Parser_Exe.H +++ b/Src/Base/Parser/AMReX_Parser_Exe.H @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -217,24 +218,12 @@ struct alignas(8) ParserExeJUMP { int offset; }; -template -struct ParserStack -{ - double m_data[N]; - int m_size = 0; - constexpr void push (double v) { m_data[m_size++] = v; } - constexpr void pop () { --m_size; } - [[nodiscard]] constexpr double const& top () const { return m_data[m_size-1]; } - [[nodiscard]] constexpr double & top () { return m_data[m_size-1]; } - [[nodiscard]] constexpr double operator[] (int i) const { return m_data[i]; } -}; - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double parser_exe_eval (const char* p, double const* x) { if (p == nullptr) { return std::numeric_limits::max(); } - ParserStack pstack; + Stack pstack; while (*((parser_exe_t*)p) != PARSER_EXE_NULL) { // NOLINT switch (*((parser_exe_t*)p)) {