From b78849057c0f533466da6b517a59d2eddc99c1a8 Mon Sep 17 00:00:00 2001
From: Bruce Perry <Bruce.Perry@nrel.gov>
Date: Mon, 2 Sep 2024 11:18:21 -0600
Subject: [PATCH 01/18] EB: don't abort for no-op case in unsupported
 addFineLevels functions (#4123)

## Summary

The `addFineLevels` function is not supported for EB2 for chk_file and
stl geometries. However, it may be called in some trivial cases
where it is adding 0 levels, in which case it is a no-op. There is no
reason to abort in those cases.

## Additional background

For PeleC, a work-around was put in to not call the function in the
trivial cases (https://github.com/AMReX-Combustion/PeleC/pull/771). I
was thinking about adding the same work around to address the same thing
in PeleLMeX (https://github.com/AMReX-Combustion/PeleLMeX/issues/407),
but maybe it would be better to simply allow the function to be called
in trivial cases. If there's a reason not to do this, I'll just put the
workaround in for PeleLMeX.
---
 Src/EB/AMReX_EB2_IndexSpace_STL.cpp        | 7 +++++--
 Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp
index 662aaf14dd6..70e3b492d82 100644
--- a/Src/EB/AMReX_EB2_IndexSpace_STL.cpp
+++ b/Src/EB/AMReX_EB2_IndexSpace_STL.cpp
@@ -83,9 +83,12 @@ IndexSpaceSTL::getGeometry (const Box& dom) const
 }
 
 void
-IndexSpaceSTL::addFineLevels (int /*num_new_fine_levels*/)
+IndexSpaceSTL::addFineLevels (int num_new_fine_levels)
 {
-    amrex::Abort("IndexSpaceSTL::addFineLevels: todo");
+    // No-op unless levels are being added (num_new_fine_levels > 0); adding levels is still a TODO
+    if (num_new_fine_levels > 0) {
+        amrex::Abort("IndexSpaceSTL::addFineLevels: todo");
+    }
 }
 
 void
diff --git a/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp b/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp
index cd811d73688..966b5c424e1 100644
--- a/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp
+++ b/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp
@@ -78,9 +78,12 @@ IndexSpaceChkptFile::getGeometry (const Box& dom) const
 }
 
 void
-IndexSpaceChkptFile::addFineLevels (int /*num_new_fine_levels*/)
+IndexSpaceChkptFile::addFineLevels (int num_new_fine_levels)
 {
-    amrex::Abort("IndexSpaceChkptFile::addFineLevels: not supported");
+    // No-op unless levels are being added (num_new_fine_levels > 0); adding levels is not supported
+    if (num_new_fine_levels > 0) {
+        amrex::Abort("IndexSpaceChkptFile::addFineLevels: not supported");
+    }
 }
 
 void

From a5896383e793b60bd6d1b0174b00722cb3d2a2eb Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Mon, 2 Sep 2024 10:48:03 -0700
Subject: [PATCH 02/18] `AMREX_DEVICE_PRINTF`: Host (#4116)

In AMReX, device means host if compiled for CPUs.
Add support for printf debugging on CPUs for `AMREX_DEVICE_PRINTF`.

Previously, the macro was undefined on host code paths, except in the
special case of SYCL compilation, where it worked.
---
 Src/Base/AMReX_GpuPrint.H | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Src/Base/AMReX_GpuPrint.H b/Src/Base/AMReX_GpuPrint.H
index 9e029ea8835..37f74f30bfc 100644
--- a/Src/Base/AMReX_GpuPrint.H
+++ b/Src/Base/AMReX_GpuPrint.H
@@ -2,8 +2,6 @@
 #define AMREX_GPU_PRINT_H_
 #include <AMReX_Config.H>
 
-#if !defined(__APPLE__)
-
 #include <cstdio>
 
 #ifdef AMREX_USE_SYCL
@@ -23,7 +21,9 @@
 #  define AMREX_DEVICE_PRINTF(...) std::printf(__VA_ARGS__);
 #elif defined(AMREX_USE_HIP)
 #  define AMREX_DEVICE_PRINTF(...) ::printf(__VA_ARGS__);
+#else  // CPU build: "device" code runs on the host, so forward to std::printf
+#  define AMREX_DEVICE_PRINTF(...) { /* (...) so a format-only call compiles */ \
+      std::printf(__VA_ARGS__); }
 #endif
 
-#endif  // !defined(__APPLE__)
 #endif  // AMREX_GPU_PRINT_H_

From 778e782acfd5412a21e7b4d7c100e43a735536f6 Mon Sep 17 00:00:00 2001
From: Youngjun Lee <youngjunlee7@gmail.com>
Date: Mon, 2 Sep 2024 12:50:14 -0500
Subject: [PATCH 03/18] Fix Fortran interface compilation issue using
 `nvfortran` (#4115)

## Summary
This PR will fix the compilation issue for Fortran interfaces when using
`nvfortran`, reported in #4111.

## Additional background
The new `module procedure` interfaces for `final` subroutines are
introduced to bypass the `nvfortran` compilation error. This change may
not be needed according to the Fortran standard, but it effectively
resolves compilation errors using `nvfortran` with
`-DAMReX_FORTRAN_INTERFACES=ON`.
---
 .github/workflows/cuda.yml                           |  1 +
 Src/Base/AMReX_parmparse_mod.F90                     |  4 ++++
 Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90  |  4 ++++
 Src/F_Interfaces/Base/AMReX_boxarray_mod.F90         |  4 ++++
 Src/F_Interfaces/Base/AMReX_distromap_mod.F90        |  4 ++++
 Src/F_Interfaces/Base/AMReX_fab_mod.F90              |  4 ++++
 Src/F_Interfaces/Base/AMReX_geometry_mod.F90         |  4 ++++
 Src/F_Interfaces/Base/AMReX_multifab_mod.F90         | 12 ++++++++++++
 Src/F_Interfaces/Base/AMReX_physbc_mod.F90           |  4 ++++
 .../LinearSolvers/AMReX_abeclaplacian_mod.F90        |  4 ++++
 .../LinearSolvers/AMReX_multigrid_mod.F90            |  4 ++++
 Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 |  4 ++++
 .../Particle/AMReX_particlecontainer_mod.F90         |  4 ++++
 13 files changed, 57 insertions(+)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 50d45846f02..9e96aefac5e 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -154,6 +154,7 @@ jobs:
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_TEST_TYPE=Small                      \
             -DAMReX_FORTRAN=ON                           \
+            -DAMReX_FORTRAN_INTERFACES=ON                \
             -DAMReX_GPU_BACKEND=CUDA                     \
             -DCMAKE_C_COMPILER=$(which nvc)              \
             -DCMAKE_CXX_COMPILER=$(which nvc++)          \
diff --git a/Src/Base/AMReX_parmparse_mod.F90 b/Src/Base/AMReX_parmparse_mod.F90
index 2d33b4da40d..e7a8748503b 100644
--- a/Src/Base/AMReX_parmparse_mod.F90
+++ b/Src/Base/AMReX_parmparse_mod.F90
@@ -241,6 +241,10 @@ subroutine amrex_parmparse_add_stringarr (pp, name, v, n) bind(c)
      end subroutine amrex_parmparse_add_stringarr
   end interface
 
+  interface amrex_parmparse_destroy
+     module procedure amrex_parmparse_destroy
+  end interface amrex_parmparse_destroy
+
 contains
 
   subroutine amrex_parmparse_build (pp, name)
diff --git a/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90 b/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90
index 3e40c9cd517..21937b57b10 100644
--- a/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90
+++ b/Src/F_Interfaces/AmrCore/AMReX_fluxregister_mod.F90
@@ -103,6 +103,10 @@ subroutine amrex_fi_fluxregister_overwrite (fr, flxs, scale, geom) bind(c)
      end subroutine amrex_fi_fluxregister_overwrite
   end interface
 
+  interface amrex_fluxregister_destroy
+     module procedure amrex_fluxregister_destroy
+  end interface amrex_fluxregister_destroy
+
 contains
 
   subroutine amrex_fluxregister_build (fr, ba, dm, ref_ratio, fine_lev, ncomp)
diff --git a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90 b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90
index b156a8ae996..0181c6cfb9c 100644
--- a/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_boxarray_mod.F90
@@ -45,6 +45,10 @@ module amrex_boxarray_module
      module procedure amrex_boxarray_print
   end interface amrex_print
 
+  interface amrex_boxarray_destroy
+     module procedure amrex_boxarray_destroy
+  end interface amrex_boxarray_destroy
+
   ! interfaces to cpp functions
 
   interface
diff --git a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90 b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90
index 26316ffab21..adbb91b4421 100644
--- a/Src/F_Interfaces/Base/AMReX_distromap_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_distromap_mod.F90
@@ -34,6 +34,10 @@ module amrex_distromap_module
      module procedure amrex_distromap_print
   end interface amrex_print
 
+  interface amrex_distromap_destroy
+     module procedure amrex_distromap_destroy
+  end interface amrex_distromap_destroy
+
   ! interfaces to cpp functions
 
   interface
diff --git a/Src/F_Interfaces/Base/AMReX_fab_mod.F90 b/Src/F_Interfaces/Base/AMReX_fab_mod.F90
index d36e8f4ea87..b76e8645ea3 100644
--- a/Src/F_Interfaces/Base/AMReX_fab_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_fab_mod.F90
@@ -42,6 +42,10 @@ module amrex_fab_module
      module procedure amrex_fab_build_install
   end interface amrex_fab_build
 
+  interface amrex_fab_destroy
+     module procedure amrex_fab_destroy
+  end interface amrex_fab_destroy
+
 contains
 
   ! Build a fab, allocate own memory
diff --git a/Src/F_Interfaces/Base/AMReX_geometry_mod.F90 b/Src/F_Interfaces/Base/AMReX_geometry_mod.F90
index d65a8f6d980..7492b80c7b2 100644
--- a/Src/F_Interfaces/Base/AMReX_geometry_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_geometry_mod.F90
@@ -76,6 +76,10 @@ subroutine amrex_fi_geometry_get_intdomain (geom,lo,hi) bind(c)
      end subroutine amrex_fi_geometry_get_intdomain
   end interface
 
+  interface amrex_geometry_destroy
+     module procedure amrex_geometry_destroy
+  end interface amrex_geometry_destroy
+
 contains
 
   subroutine amrex_geometry_finalize ()
diff --git a/Src/F_Interfaces/Base/AMReX_multifab_mod.F90 b/Src/F_Interfaces/Base/AMReX_multifab_mod.F90
index 81ce03b9f3d..93f2c4be628 100644
--- a/Src/F_Interfaces/Base/AMReX_multifab_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_multifab_mod.F90
@@ -101,6 +101,10 @@ module amrex_multifab_module
      module procedure amrex_multifab_build_a
   end interface amrex_multifab_build
 
+  interface amrex_multifab_destroy
+    module procedure amrex_multifab_destroy
+  end interface amrex_multifab_destroy
+
   type, public   :: amrex_imultifab
      logical               :: owner = .false.
      type   (c_ptr)        :: p     =  c_null_ptr
@@ -129,6 +133,10 @@ module amrex_multifab_module
      module procedure amrex_imultifab_build_a
   end interface amrex_imultifab_build
 
+  interface amrex_imultifab_destroy
+    module procedure amrex_imultifab_destroy
+  end interface amrex_imultifab_destroy
+
   type, public :: amrex_mfiter
      type(c_ptr)      :: p       = c_null_ptr
      integer ,private :: counter = -1
@@ -159,6 +167,10 @@ module amrex_multifab_module
      module procedure amrex_mfiter_build_badm_s
   end interface amrex_mfiter_build
 
+  interface amrex_mfiter_destroy
+    module procedure amrex_mfiter_destroy
+  end interface amrex_mfiter_destroy
+
   ! interfaces to c++ functions
 
   interface
diff --git a/Src/F_Interfaces/Base/AMReX_physbc_mod.F90 b/Src/F_Interfaces/Base/AMReX_physbc_mod.F90
index 6831fe81514..1f707575a83 100644
--- a/Src/F_Interfaces/Base/AMReX_physbc_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_physbc_mod.F90
@@ -47,6 +47,10 @@ subroutine amrex_fi_delete_physbc (pbc) bind(c)
      end subroutine amrex_fi_delete_physbc
   end interface
 
+  interface amrex_physbc_destroy
+    module procedure amrex_physbc_destroy
+  end interface amrex_physbc_destroy
+
 contains
 
   subroutine amrex_physbc_build (pbc, fill, geom)
diff --git a/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90 b/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90
index 5c9425168ec..d90a2faeae4 100644
--- a/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90
+++ b/Src/F_Interfaces/LinearSolvers/AMReX_abeclaplacian_mod.F90
@@ -58,6 +58,10 @@ subroutine amrex_fi_abeclap_set_bcoeffs (abeclap, amrlev, beta) bind(c)
      end subroutine amrex_fi_abeclap_set_bcoeffs
   end interface
 
+  interface amrex_abeclaplacian_destroy
+     module procedure amrex_abeclaplacian_destroy
+  end interface amrex_abeclaplacian_destroy
+
 contains
 
   subroutine amrex_abeclaplacian_assign (dst, src)
diff --git a/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90 b/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90
index bc7ae734d01..5ea57e5196a 100644
--- a/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90
+++ b/Src/F_Interfaces/LinearSolvers/AMReX_multigrid_mod.F90
@@ -154,6 +154,10 @@ subroutine amrex_fi_multigrid_set_final_fill_bc (mg, f) bind(c)
      end subroutine amrex_fi_multigrid_set_final_fill_bc
   end interface
 
+  interface amrex_multigrid_destroy
+     module procedure amrex_multigrid_destroy
+  end interface amrex_multigrid_destroy
+
 contains
 
   subroutine amrex_multigrid_assign (dst, src)
diff --git a/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90 b/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90
index 3690419f22d..33a02336af1 100644
--- a/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90
+++ b/Src/F_Interfaces/LinearSolvers/AMReX_poisson_mod.F90
@@ -35,6 +35,10 @@ subroutine amrex_fi_delete_linop (linop) bind(c)
      end subroutine amrex_fi_delete_linop
   end interface
 
+  interface amrex_poisson_destroy
+     module procedure amrex_poisson_destroy
+  end interface amrex_poisson_destroy
+
 contains
 
   subroutine amrex_poisson_assign (dst, src)
diff --git a/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90 b/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90
index bdb4c9a2310..4def2e000f9 100644
--- a/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90
+++ b/Src/F_Interfaces/Particle/AMReX_particlecontainer_mod.F90
@@ -163,6 +163,10 @@ end subroutine amrex_fi_num_particles_i
 
   end interface
 
+  interface amrex_particlecontainer_destroy
+     module procedure amrex_particlecontainer_destroy
+  end interface amrex_particlecontainer_destroy
+
 contains
 
   subroutine amrex_particlecontainer_build (pc, amrcore)

From a31abb532dd8f1c62806fd093a864ac009e0e041 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com>
Date: Mon, 2 Sep 2024 20:04:41 +0200
Subject: [PATCH 04/18] TinyProfiler with BArena and PArena (#4113)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds the capability to profile BArena and PArena with
TinyProfiler. Previously, only CArena was profiled. Note that some
allocations are still not profiled when running on CPU because
`amrex::DefaultAllocator` and `amrex::PODVector` use `std::allocator<T>`
instead of `amrex::ArenaAllocator<T>`.
---
 Src/Base/AMReX_Arena.H          | 37 ++++++++++++++++++++++++++-
 Src/Base/AMReX_Arena.cpp        | 45 +++++++++++++++++++++++++++++++--
 Src/Base/AMReX_BArena.cpp       |  5 +++-
 Src/Base/AMReX_CArena.H         | 12 ---------
 Src/Base/AMReX_CArena.cpp       | 39 ++++++----------------------
 Src/Base/AMReX_PArena.cpp       |  2 ++
 Src/Base/AMReX_TinyProfiler.H   |  2 +-
 Src/Base/AMReX_TinyProfiler.cpp |  5 ++--
 8 files changed, 97 insertions(+), 50 deletions(-)

diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
index 2a6cbb25a08..51b5d983fdf 100644
--- a/Src/Base/AMReX_Arena.H
+++ b/Src/Base/AMReX_Arena.H
@@ -4,9 +4,21 @@
 
 #include <AMReX_BLassert.H>
 #include <AMReX_INT.H>
+
+#ifdef AMREX_TINY_PROFILING
+#include <AMReX_TinyProfiler.H>
+#else
+namespace amrex {
+    struct MemStat {};
+}
+#endif
+
 #include <cstddef>
 #include <cstdlib>
 #include <limits>
+#include <map>
+#include <mutex>
+#include <unordered_map>
 #include <utility>
 
 namespace amrex {
@@ -156,7 +168,7 @@ public:
      * \brief Add this Arena to the list of Arenas that are profiled by TinyProfiler.
      * \param memory_name The name of this arena in the TinyProfiler output.
      */
-    virtual void registerForProfiling (const std::string& memory_name);
+    void registerForProfiling (const std::string& memory_name);
 
 #ifdef AMREX_USE_GPU
     //! Is this GPU stream ordered memory allocator?
@@ -199,6 +211,29 @@ protected:
     virtual std::size_t freeUnused_protected () { return 0; }
     void* allocate_system (std::size_t nbytes);
     void deallocate_system (void* p, std::size_t nbytes);
+
+    struct ArenaProfiler {
+        //! If this arena is profiled by TinyProfiler
+        bool m_do_profiling = false;
+        //! Mutex for the profiling
+        std::mutex m_arena_profiler_mutex;
+        //! Data structure used for profiling with TinyProfiler
+        std::map<std::string, MemStat> m_profiling_stats;
+        //! Track the currently allocated memory, not used by CArena
+        std::unordered_map<void*, std::pair<MemStat*, std::size_t>> m_currently_allocated;
+
+        ~ArenaProfiler ();
+        ArenaProfiler () noexcept = default;
+        ArenaProfiler (const ArenaProfiler& rhs) = delete;
+        ArenaProfiler (ArenaProfiler&& rhs) = delete;
+        ArenaProfiler& operator= (const ArenaProfiler& rhs) = delete;
+        ArenaProfiler& operator= (ArenaProfiler&& rhs) = delete;
+
+        void profile_alloc (void* ptr, std::size_t nbytes);
+
+        void profile_free (void* ptr);
+
+    } m_profiler;
 };
 
 }
diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp
index ce4ece3b643..c2de5464574 100644
--- a/Src/Base/AMReX_Arena.cpp
+++ b/Src/Base/AMReX_Arena.cpp
@@ -117,9 +117,13 @@ Arena::hasFreeDeviceMemory (std::size_t)
 }
 
 void
-Arena::registerForProfiling (const std::string&)
+Arena::registerForProfiling ([[maybe_unused]] const std::string& memory_name)
 {
-    amrex::Abort("Profiling is not implemented for this type of Arena");
+#ifdef AMREX_TINY_PROFILING
+    AMREX_ALWAYS_ASSERT(m_profiler.m_do_profiling == false);
+    m_profiler.m_do_profiling =
+        TinyProfiler::RegisterArena(memory_name, m_profiler.m_profiling_stats);
+#endif
 }
 
 std::size_t
@@ -330,6 +334,7 @@ Arena::Initialize ()
     }
 
     the_async_arena = new PArena(the_async_arena_release_threshold);
+    the_async_arena->registerForProfiling("Async Memory");
 
 #ifdef AMREX_USE_GPU
     if (the_arena->isDevice()) {
@@ -403,6 +408,7 @@ Arena::Initialize ()
     }
 
     the_cpu_arena = The_BArena();
+    the_cpu_arena->registerForProfiling("Cpu Memory");
 
     // Initialize the null arena
     auto* null_arena = The_Null_Arena();
@@ -654,4 +660,39 @@ The_Comms_Arena ()
     }
 }
 
+Arena::ArenaProfiler::~ArenaProfiler () {
+#ifdef AMREX_TINY_PROFILING
+    if (m_do_profiling) {
+        TinyProfiler::DeregisterArena(m_profiling_stats);
+    }
+#endif
+}
+
+void Arena::ArenaProfiler::profile_alloc ([[maybe_unused]] void* ptr,
+                                          [[maybe_unused]] std::size_t nbytes) {
+#ifdef AMREX_TINY_PROFILING
+    if (m_do_profiling) {
+        std::lock_guard<std::mutex> lock(m_arena_profiler_mutex);
+        MemStat* stat = TinyProfiler::memory_alloc(nbytes, m_profiling_stats);
+        if (stat) {
+            m_currently_allocated.insert({ptr, {stat, nbytes}});
+        }
+    }
+#endif
+}
+
+void Arena::ArenaProfiler::profile_free ([[maybe_unused]] void* ptr) {
+#ifdef AMREX_TINY_PROFILING
+    if (m_do_profiling) {
+        std::lock_guard<std::mutex> lock(m_arena_profiler_mutex);
+        auto it = m_currently_allocated.find(ptr);
+        if (it != m_currently_allocated.end()) {
+            auto [stat, nbytes] = it->second;
+            TinyProfiler::memory_free(nbytes, stat);
+            m_currently_allocated.erase(it);
+        }
+    }
+#endif
+}
+
 }
diff --git a/Src/Base/AMReX_BArena.cpp b/Src/Base/AMReX_BArena.cpp
index c22affa687a..054e64b854c 100644
--- a/Src/Base/AMReX_BArena.cpp
+++ b/Src/Base/AMReX_BArena.cpp
@@ -3,12 +3,15 @@
 void*
 amrex::BArena::alloc (std::size_t sz_)
 {
-    return std::malloc(sz_);
+    void* pt = std::malloc(sz_);
+    m_profiler.profile_alloc(pt, sz_);
+    return pt;
 }
 
 void
 amrex::BArena::free (void* pt)
 {
+    m_profiler.profile_free(pt);
     std::free(pt);
 }
 
diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H
index 9547bc92f21..bc46d008241 100644
--- a/Src/Base/AMReX_CArena.H
+++ b/Src/Base/AMReX_CArena.H
@@ -16,8 +16,6 @@
 
 namespace amrex {
 
-struct MemStat;
-
 /**
 * \brief A Concrete Class for Dynamic Memory Management using first fit.
 * This is a coalescing memory manager.  It allocates (possibly) large
@@ -75,12 +73,6 @@ public:
      */
     [[nodiscard]] bool hasFreeDeviceMemory (std::size_t sz) final;
 
-    /**
-     * \brief Add this Arena to the list of Arenas that are profiled by TinyProfiler.
-     * \param memory_name The name of this arena in the TinyProfiler output.
-     */
-    void registerForProfiling (const std::string& memory_name) final;
-
     //! The current amount of heap space used by the CArena object.
     std::size_t heap_space_used () const noexcept;
 
@@ -191,10 +183,6 @@ protected:
     std::size_t m_used{0};
     //! The amount of memory given out via alloc().
     std::size_t m_actually_used{0};
-    //! If this arena is profiled by TinyProfiler
-    bool m_do_profiling = false;
-    //! Data structure used for profiling with TinyProfiler
-    std::map<std::string, MemStat> m_profiling_stats;
 
 
     std::mutex carena_mutex;
diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp
index 42987f47a86..bc5297f4d52 100644
--- a/Src/Base/AMReX_CArena.cpp
+++ b/Src/Base/AMReX_CArena.cpp
@@ -5,14 +5,6 @@
 #include <AMReX_MFIter.H>
 #include <AMReX_ParallelReduce.H>
 
-#ifdef AMREX_TINY_PROFILING
-#include <AMReX_TinyProfiler.H>
-#else
-namespace amrex {
-    struct MemStat {};
-}
-#endif
-
 #include <utility>
 #include <cstring>
 #include <iostream>
@@ -32,12 +24,6 @@ CArena::~CArena ()
     for (auto const& a : m_alloc) {
         deallocate_system(a.first, a.second);
     }
-
-#ifdef AMREX_TINY_PROFILING
-    if (m_do_profiling) {
-        TinyProfiler::DeregisterArena(m_profiling_stats);
-    }
-#endif
 }
 
 void*
@@ -53,8 +39,8 @@ CArena::alloc_protected (std::size_t nbytes)
 {
     MemStat* stat = nullptr;
 #ifdef AMREX_TINY_PROFILING
-    if (m_do_profiling) {
-        stat = TinyProfiler::memory_alloc(nbytes, m_profiling_stats);
+    if (m_profiler.m_do_profiling) {
+        stat = TinyProfiler::memory_alloc(nbytes, m_profiler.m_profiling_stats);
     }
 #endif
 
@@ -173,10 +159,10 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax)
                     free_node.size(left_size);
                 }
 #ifdef AMREX_TINY_PROFILING
-                if (m_do_profiling) {
+                if (m_profiler.m_do_profiling) {
                     TinyProfiler::memory_free(busy_it->size(), busy_it->mem_stat());
                     auto* stat = TinyProfiler::memory_alloc(new_size,
-                                                            m_profiling_stats);
+                                                            m_profiler.m_profiling_stats);
                     const_cast<Node&>(*busy_it).mem_stat(stat);
                 }
 #endif
@@ -186,10 +172,10 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax)
             } else if (total_size >= szmin) {
                 m_freelist.erase(next_it);
 #ifdef AMREX_TINY_PROFILING
-                if (m_do_profiling) {
+                if (m_profiler.m_do_profiling) {
                     TinyProfiler::memory_free(busy_it->size(), busy_it->mem_stat());
                     auto* stat = TinyProfiler::memory_alloc(total_size,
-                                                            m_profiling_stats);
+                                                            m_profiler.m_profiling_stats);
                     const_cast<Node&>(*busy_it).mem_stat(stat);
                 }
 #endif
@@ -255,9 +241,9 @@ CArena::shrink_in_place (void* pt, std::size_t new_size)
         m_actually_used -= leftover_size;
 
 #ifdef AMREX_TINY_PROFILING
-        if (m_do_profiling) {
+        if (m_profiler.m_do_profiling) {
             TinyProfiler::memory_free(old_size, busy_it->mem_stat());
-            auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats);
+            auto* stat = TinyProfiler::memory_alloc(new_size, m_profiler.m_profiling_stats);
             const_cast<Node&>(*busy_it).mem_stat(stat);
         }
 #endif
@@ -431,15 +417,6 @@ CArena::hasFreeDeviceMemory (std::size_t sz)
     }
 }
 
-void
-CArena::registerForProfiling ([[maybe_unused]] const std::string& memory_name)
-{
-#ifdef AMREX_TINY_PROFILING
-    m_do_profiling = true;
-    TinyProfiler::RegisterArena(memory_name, m_profiling_stats);
-#endif
-}
-
 std::size_t
 CArena::heap_space_used () const noexcept
 {
diff --git a/Src/Base/AMReX_PArena.cpp b/Src/Base/AMReX_PArena.cpp
index 36155f3d32c..bbe2717ab07 100644
--- a/Src/Base/AMReX_PArena.cpp
+++ b/Src/Base/AMReX_PArena.cpp
@@ -62,6 +62,7 @@ PArena::alloc (std::size_t nbytes)
             AMREX_HIP_SAFE_CALL(hipMallocAsync(&p, nbytes, m_pool, Gpu::gpuStream()));,
             AMREX_CUDA_SAFE_CALL(cudaMallocAsync(&p, nbytes, m_pool, Gpu::gpuStream()));
         )
+        m_profiler.profile_alloc(p, nbytes);
         return p;
     } else
 #endif
@@ -93,6 +94,7 @@ PArena::free (void* p)
 
 #if defined (AMREX_GPU_STREAM_ALLOC_SUPPORT)
     if (Gpu::Device::memoryPoolsSupported()) {
+        m_profiler.profile_free(p);
         AMREX_HIP_OR_CUDA(
             AMREX_HIP_SAFE_CALL(hipFreeAsync(p, Gpu::gpuStream()));,
             AMREX_CUDA_SAFE_CALL(cudaFreeAsync(p, Gpu::gpuStream()));
diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H
index 0228949beb1..9a0731175bd 100644
--- a/Src/Base/AMReX_TinyProfiler.H
+++ b/Src/Base/AMReX_TinyProfiler.H
@@ -57,7 +57,7 @@ public:
     static void MemoryInitialize () noexcept;
     static void MemoryFinalize (bool bFlushing = false) noexcept;
 
-    static void RegisterArena (const std::string& memory_name,
+    static bool RegisterArena (const std::string& memory_name,
                                std::map<std::string, MemStat>& memstats) noexcept;
 
     static void DeregisterArena (std::map<std::string, MemStat>& memstats) noexcept;
diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp
index db922745784..fab9e38aa82 100644
--- a/Src/Base/AMReX_TinyProfiler.cpp
+++ b/Src/Base/AMReX_TinyProfiler.cpp
@@ -490,14 +490,15 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept
     if(os) { os->precision(oldprec); }
 }
 
-void
+bool
 TinyProfiler::RegisterArena (const std::string& memory_name,
                              std::map<std::string, MemStat>& memstats) noexcept
 {
-    if (!memprof_enabled) { return; }
+    if (!memprof_enabled) { return false; }
 
     all_memstats.push_back(&memstats);
     all_memnames.push_back(memory_name);
+    return true;
 }
 
 void

From de4dc974dda7feead09d4cf03cc7250bf80d5bc2 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <64009254+AlexanderSinn@users.noreply.github.com>
Date: Tue, 3 Sep 2024 00:33:11 +0200
Subject: [PATCH 05/18] CTOParallelFor with BoxND / add AnyCTO (#4109)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

This PR adds support for BoxND to CTOParallelFor by adding the AnyCTO
function which can be used to implement compile time options with any
kernel launching function such as ParallelFor, ParallelForRNG, launch,
etc.

I'm not sure if AnyCTO is a good name, are there other suggestions?

## Additional background

AnyCTO Examples:
``` C++
    int A_runtime_option = ...;
    int B_runtime_option = ...;
    enum A_options : int { A0, A1, A2, A3 };
    enum B_options : int { B0, B1 };
    AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                    CompileTimeOptions<B0,B1>>{},
        {A_runtime_option, B_runtime_option},
        [&](auto cto_func){
            ParallelForRNG(N, cto_func);
        },
        [=] AMREX_GPU_DEVICE (int i, const RandomEngine& engine,
                              auto A_control, auto B_control)
        {
            ...
            if constexpr (A_control.value == A0) {
                ...
            } else if constexpr (A_control.value == A1) {
                ...
            } else if constexpr (A_control.value == A2) {
                ...
            } else {
                ...
            }
            if constexpr (A_control.value != A3 && B_control.value == B1) {
                ...
            }
            ...
        }
    );


    constexpr int nthreads_per_block = ...;
    int nblocks = ...;
    AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                    CompileTimeOptions<B0,B1>>{},
        {A_runtime_option, B_runtime_option},
        [&](auto cto_func){
            launch<nthreads_per_block>(nblocks, Gpu::gpuStream(), cto_func);
        },
        [=] AMREX_GPU_DEVICE (auto A_control, auto B_control){
            ...
        }
    );
```
Additionally, .GetOptions() can be used to use the compile time options
in the function that launches the kernel:
```C++
    int nthreads_per_block = ...;
    AnyCTO(TypeList<CompileTimeOptions<128,256,512,1024>>{},
        {nthreads_per_block},
        [&](auto cto_func){
            constexpr std::array<int, 1> ctos = cto_func.GetOptions();
            constexpr int c_nthreads_per_block = ctos[0];
            ParallelFor<c_nthreads_per_block>(N, cto_func);
        },
        [=] AMREX_GPU_DEVICE (int i, auto){
            ...
        }
    );


    BoxND<6> box6D = ...;
    int dims_needed = ...;
    AnyCTO(TypeList<CompileTimeOptions<1,2,3,4,5,6>>{},
        {dims_needed},
        [&](auto cto_func){
            constexpr std::array<int, 1> ctos = cto_func.GetOptions();
            constexpr int c_dims_needed = ctos[0];
            const auto box = BoxShrink<c_dims_needed>(box6D);
            ParallelFor(box, cto_func);
        },
        [=] AMREX_GPU_DEVICE (auto intvect, auto) -> decltype(void(intvect.size())) {
            ...
        }
    );
```
---
 Src/Base/AMReX_CTOParallelForImpl.H | 261 ++++++++++++++++++----------
 1 file changed, 174 insertions(+), 87 deletions(-)

diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H
index 35e0ec3be7b..8f7e8ce567f 100644
--- a/Src/Base/AMReX_CTOParallelForImpl.H
+++ b/Src/Base/AMReX_CTOParallelForImpl.H
@@ -3,7 +3,7 @@
 
 #include <AMReX_BLassert.H>
 #include <AMReX_Box.H>
-#include <AMReX_Tuple.H>
+#include <AMReX_TypeList.H>
 
 #include <array>
 #include <type_traits>
@@ -18,7 +18,7 @@ namespace amrex {
 
 template <int... ctr>
 struct CompileTimeOptions {
-    // TypeList is defined in AMReX_Tuple.H
+    // TypeList is defined in AMReX_TypeList.H
     using list_type = TypeList<std::integral_constant<int, ctr>...>;
 };
 
@@ -26,65 +26,42 @@ struct CompileTimeOptions {
 
 namespace detail
 {
-    template <int MT, typename T, class F, typename... As>
-    std::enable_if_t<std::is_integral_v<T> || std::is_same_v<T,Box>, bool>
-    ParallelFor_helper2 (T const& N, F const& f, TypeList<As...>,
-                         std::array<int,sizeof...(As)> const& runtime_options)
-    {
-        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
-            if constexpr (std::is_integral_v<T>) {
-                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (T i) noexcept
-                {
-                    f(i, As{}...);
-                });
-            } else {
-                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
-                {
-                    f(i, j, k, As{}...);
-                });
-            }
-            return true;
-        } else {
-            return false;
+    template<class F, int... ctr>
+    struct CTOWrapper {
+        F f;
+
+        template<class... Args>
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        auto operator() (Args... args) const noexcept
+            -> decltype(f(args..., std::integral_constant<int, ctr>{}...)) {
+            return f(args..., std::integral_constant<int, ctr>{}...);
         }
-    }
 
-    template <int MT, typename T, class F, typename... As>
-    std::enable_if_t<std::is_integral_v<T>, bool>
-    ParallelFor_helper2 (Box const& box, T ncomp, F const& f, TypeList<As...>,
-                         std::array<int,sizeof...(As)> const& runtime_options)
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        static constexpr
+        std::array<int, sizeof...(ctr)> GetOptions () noexcept {
+            return {ctr...};
+        }
+    };
+
+    template <class L, class F, typename... As>
+    bool
+    AnyCTO_helper2 (const L& l, const F& f, TypeList<As...>,
+                    std::array<int,sizeof...(As)> const& runtime_options)
     {
         if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
-            ParallelFor<MT>(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept
-            {
-                f(i, j, k, n, As{}...);
-            });
+            l(CTOWrapper<F, As::value...>{f});
             return true;
         } else {
             return false;
         }
     }
 
-    template <int MT, typename T, class F, typename... PPs, typename RO>
-    std::enable_if_t<std::is_integral_v<T> || std::is_same_v<T,Box>>
-    ParallelFor_helper1 (T const& N, F const& f, TypeList<PPs...>,
-                         RO const& runtime_options)
-    {
-        bool found_option = (false || ... ||
-                             ParallelFor_helper2<MT>(N, f,
-                                                     PPs{}, runtime_options));
-        amrex::ignore_unused(found_option);
-        AMREX_ASSERT(found_option);
-    }
-
-    template <int MT, typename T, class F, typename... PPs, typename RO>
-    std::enable_if_t<std::is_integral_v<T>>
-    ParallelFor_helper1 (Box const& box, T ncomp, F const& f, TypeList<PPs...>,
-                         RO const& runtime_options)
+    template <class L, class F, typename... PPs, typename RO>
+    void
+    AnyCTO_helper1 (const L& l, const F& f, TypeList<PPs...>, RO const& runtime_options)
     {
-        bool found_option = (false || ... ||
-                             ParallelFor_helper2<MT>(box, ncomp, f,
-                                                     PPs{}, runtime_options));
+        bool found_option = (false || ... || AnyCTO_helper2(l, f, PPs{}, runtime_options));
         amrex::ignore_unused(found_option);
         AMREX_ASSERT(found_option);
     }
@@ -92,51 +69,161 @@ namespace detail
 
 #endif
 
-template <int MT, typename T, class F, typename... CTOs>
-std::enable_if_t<std::is_integral_v<T>>
-ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+/**
+ * \brief Compile time optimization of kernels with run time options.
+ *
+ * This is a generalized version of ParallelFor with CTOs that can support any function that
+ * takes in one lambda to launch a GPU kernel such as ParallelFor, ParallelForRNG, launch, etc.
+ * It uses fold expression to generate kernel launches for all combinations
+ * of the run time options.  The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance.  In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelForRNGs
+ * for all combinations of the run time parameters.
+ \verbatim
+    int A_runtime_option = ...;
+    int B_runtime_option = ...;
+    enum A_options : int { A0, A1, A2, A3 };
+    enum B_options : int { B0, B1 };
+    AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                    CompileTimeOptions<B0,B1>>{},
+        {A_runtime_option, B_runtime_option},
+        [&](auto cto_func){
+            ParallelForRNG(N, cto_func);
+        },
+        [=] AMREX_GPU_DEVICE (int i, const RandomEngine& engine,
+                              auto A_control, auto B_control)
+        {
+            ...
+            if constexpr (A_control.value == A0) {
+                ...
+            } else if constexpr (A_control.value == A1) {
+                ...
+            } else if constexpr (A_control.value == A2) {
+                ...
+            } else {
+                ...
+            }
+            if constexpr (A_control.value != A3 && B_control.value == B1) {
+                ...
+            }
+            ...
+        }
+    );
+
+    constexpr int nthreads_per_block = ...;
+    int nblocks = ...;
+    AnyCTO(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
+                    CompileTimeOptions<B0,B1>>{},
+        {A_runtime_option, B_runtime_option},
+        [&](auto cto_func){
+            launch<nthreads_per_block>(nblocks, Gpu::gpuStream(), cto_func);
+        },
+        [=] AMREX_GPU_DEVICE (auto A_control, auto B_control){
+            ...
+        }
+    );
+ \endverbatim
+ * The static member function cto_func.GetOptions() can be used to obtain the runtime_options
+ * passed into AnyCTO, but at compile time. This enables some advanced use cases,
+ * such as changing the number of threads per block or the dimensionality of ParallelFor at runtime.
+ * For the second example -> decltype(void(intvect.size())) is necessary to
+ * disambiguate IntVectND<1> and int for the first argument of the kernel function.
+ \verbatim
+    int nthreads_per_block = ...;
+    AnyCTO(TypeList<CompileTimeOptions<128,256,512,1024>>{},
+        {nthreads_per_block},
+        [&](auto cto_func){
+            constexpr std::array<int, 1> ctos = cto_func.GetOptions();
+            constexpr int c_nthreads_per_block = ctos[0];
+            ParallelFor<c_nthreads_per_block>(N, cto_func);
+        },
+        [=] AMREX_GPU_DEVICE (int i, auto){
+            ...
+        }
+    );
+
+    BoxND<6> box6D = ...;
+    int dims_needed = ...;
+    AnyCTO(TypeList<CompileTimeOptions<1,2,3,4,5,6>>{},
+        {dims_needed},
+        [&](auto cto_func){
+            constexpr std::array<int, 1> ctos = cto_func.GetOptions();
+            constexpr int c_dims_needed = ctos[0];
+            const auto box = BoxShrink<c_dims_needed>(box6D);
+            ParallelFor(box, cto_func);
+        },
+        [=] AMREX_GPU_DEVICE (auto intvect, auto) -> decltype(void(intvect.size())) {
+            ...
+        }
+    );
+ \endverbatim
+
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if. Alternatively, the constexpr if can be replaced with a regular if.
+ * Compilers can still perform the same optimizations since the condition is known at compile time.
+ * The data type for the parameters is int.
+ *
+ * \param list_of_compile_time_options list of all possible values of the parameters.
+ * \param runtime_options the run time parameters.
+ * \param l a callable object containing a CPU function that launches the provided GPU kernel.
+ * \param f a callable object containing the GPU kernel with optimizations.
+ */
+template <class L, class F, typename... CTOs>
+void AnyCTO ([[maybe_unused]] TypeList<CTOs...> list_of_compile_time_options,
              std::array<int,sizeof...(CTOs)> const& runtime_options,
-             T N, F&& f)
+             L&& l, F&& f)
 {
 #if (__cplusplus >= 201703L)
-    detail::ParallelFor_helper1<MT>(N, std::forward<F>(f),
-                                    CartesianProduct(typename CTOs::list_type{}...),
-                                    runtime_options);
+    detail::AnyCTO_helper1(std::forward<L>(l), std::forward<F>(f),
+                           CartesianProduct(typename CTOs::list_type{}...),
+                           runtime_options);
 #else
-    amrex::ignore_unused(N, f, runtime_options);
+    amrex::ignore_unused(runtime_options, l, f);
     static_assert(std::is_integral<F>::value, "This requires C++17");
 #endif
 }
 
-template <int MT, class F, typename... CTOs>
-void ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+template <int MT, typename T, class F, typename... CTOs>
+std::enable_if_t<std::is_integral_v<T>>
+ParallelFor (TypeList<CTOs...> ctos,
+             std::array<int,sizeof...(CTOs)> const& runtime_options,
+             T N, F&& f)
+{
+    AnyCTO(ctos, runtime_options,
+        [&](auto cto_func){
+            ParallelFor<MT>(N, cto_func);
+        },
+        std::forward<F>(f)
+    );
+}
+
+template <int MT, class F, int dim, typename... CTOs>
+void ParallelFor (TypeList<CTOs...> ctos,
                   std::array<int,sizeof...(CTOs)> const& runtime_options,
-                  Box const& box, F&& f)
+                  BoxND<dim> const& box, F&& f)
 {
-#if (__cplusplus >= 201703L)
-    detail::ParallelFor_helper1<MT>(box, std::forward<F>(f),
-                                    CartesianProduct(typename CTOs::list_type{}...),
-                                    runtime_options);
-#else
-    amrex::ignore_unused(box, f, runtime_options);
-    static_assert(std::is_integral<F>::value, "This requires C++17");
-#endif
+    AnyCTO(ctos, runtime_options,
+        [&](auto cto_func){
+            ParallelFor<MT>(box, cto_func);
+        },
+        std::forward<F>(f)
+    );
 }
 
-template <int MT, typename T, class F, typename... CTOs>
+template <int MT, typename T, class F, int dim, typename... CTOs>
 std::enable_if_t<std::is_integral_v<T>>
-ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
+ParallelFor (TypeList<CTOs...> ctos,
              std::array<int,sizeof...(CTOs)> const& runtime_options,
-             Box const& box, T ncomp, F&& f)
+             BoxND<dim> const& box, T ncomp, F&& f)
 {
-#if (__cplusplus >= 201703L)
-    detail::ParallelFor_helper1<MT>(box, ncomp, std::forward<F>(f),
-                                    CartesianProduct(typename CTOs::list_type{}...),
-                                    runtime_options);
-#else
-    amrex::ignore_unused(box, ncomp, f, runtime_options);
-    static_assert(std::is_integral<F>::value, "This requires C++17");
-#endif
+    AnyCTO(ctos, runtime_options,
+        [&](auto cto_func){
+            ParallelFor<MT>(box, ncomp, cto_func);
+        },
+        std::forward<F>(f)
+    );
 }
 
 /**
@@ -164,7 +251,7 @@ ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
              ...
          } else if constexpr (A_control.value == A2) {
              ...
-         else {
+         } else {
              ...
          }
          if constexpr (A_control.value != A3 && B_control.value == B1) {
@@ -218,7 +305,7 @@ ParallelFor (TypeList<CTOs...> ctos,
              ...
          } else if constexpr (A_control.value == A2) {
              ...
-         else {
+         } else {
              ...
          }
          if constexpr (A_control.value != A3 && B_control.value == B1) {
@@ -237,10 +324,10 @@ ParallelFor (TypeList<CTOs...> ctos,
  * \param box      a Box specifying the 3D for loop's range.
  * \param f        a callable object taking three integers and working on the given cell.
  */
-template <class F, typename... CTOs>
+template <class F, int dim, typename... CTOs>
 void ParallelFor (TypeList<CTOs...> ctos,
                   std::array<int,sizeof...(CTOs)> const& option,
-                  Box const& box, F&& f)
+                  BoxND<dim> const& box, F&& f)
 {
     ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, std::forward<F>(f));
 }
@@ -271,7 +358,7 @@ void ParallelFor (TypeList<CTOs...> ctos,
              ...
          } else if constexpr (A_control.value == A2) {
              ...
-         else {
+         } else {
              ...
          }
          if constexpr (A_control.value != A3 && B_control.value == B1) {
@@ -291,11 +378,11 @@ void ParallelFor (TypeList<CTOs...> ctos,
  * \param ncomp  an integer specifying the range for iteration over components.
  * \param f      a callable object taking three integers and working on the given cell.
  */
-template <typename T, class F, typename... CTOs>
+template <typename T, class F, int dim, typename... CTOs>
 std::enable_if_t<std::is_integral_v<T>>
 ParallelFor (TypeList<CTOs...> ctos,
              std::array<int,sizeof...(CTOs)> const& option,
-             Box const& box, T ncomp, F&& f)
+             BoxND<dim> const& box, T ncomp, F&& f)
 {
     ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, ncomp, std::forward<F>(f));
 }

From 8627fbe22d527a00bc477244dc7c99e9c29090a0 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Wed, 4 Sep 2024 12:00:06 -0500
Subject: [PATCH 06/18] ArenaProfiler: Fix clang-tidy warning (#4128)

When TINY_PROFILING is off, ~ArenaProfiler is trivial. So clang-tidy
wants us to use `= default`.
---
 Src/Base/AMReX_Arena.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp
index c2de5464574..2320ab8c664 100644
--- a/Src/Base/AMReX_Arena.cpp
+++ b/Src/Base/AMReX_Arena.cpp
@@ -660,14 +660,21 @@ The_Comms_Arena ()
     }
 }
 
-Arena::ArenaProfiler::~ArenaProfiler () {
 #ifdef AMREX_TINY_PROFILING
+
+Arena::ArenaProfiler::~ArenaProfiler ()
+{
     if (m_do_profiling) {
         TinyProfiler::DeregisterArena(m_profiling_stats);
     }
-#endif
 }
 
+#else
+
+Arena::ArenaProfiler::~ArenaProfiler () = default;
+
+#endif
+
 void Arena::ArenaProfiler::profile_alloc ([[maybe_unused]] void* ptr,
                                           [[maybe_unused]] std::size_t nbytes) {
 #ifdef AMREX_TINY_PROFILING

From dea9bb1b6dfaac0b295e1a62480b9afcb8b9ca15 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Wed, 4 Sep 2024 12:07:26 -0500
Subject: [PATCH 07/18] TinyProfiler: A few updates (#4102)

* Make it safe for Initialize and Finalize to be called multiple times.

* Delay the reading of tiny_profiler.output_file so that if the user
wants to override the default, they do not have to do it by passing a
function to amrex::Initialize.

* Update the documentation, thanks to @ax3l's suggestion.
---
 .../source/RuntimeParameters.rst              |  4 +-
 Src/Base/AMReX_TinyProfiler.H                 |  1 +
 Src/Base/AMReX_TinyProfiler.cpp               | 71 +++++++++++++------
 3 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/Docs/sphinx_documentation/source/RuntimeParameters.rst b/Docs/sphinx_documentation/source/RuntimeParameters.rst
index c57c74c8fb1..4e9f4196804 100644
--- a/Docs/sphinx_documentation/source/RuntimeParameters.rst
+++ b/Docs/sphinx_documentation/source/RuntimeParameters.rst
@@ -1234,5 +1234,5 @@ enabled.
 
    If this parameter is empty, the output of tiny profiling is dumped on the
    default out stream of AMReX. If it's not empty, it specifies the file
-   name for the output. Note that ``/dev/null`` is a special name that mean
-   a null file.
+   name for the output. Note that ``/dev/null`` is a special name that means
+   no output.
diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H
index 9a0731175bd..3c7f2da7758 100644
--- a/Src/Base/AMReX_TinyProfiler.H
+++ b/Src/Base/AMReX_TinyProfiler.H
@@ -143,6 +143,7 @@ private:
     static bool memprof_enabled;
     static std::string output_file;
 
+    static std::string const& get_output_file ();
     static void PrintStats (std::map<std::string,Stats>& regstats, double dt_max,
                             std::ostream* os);
     static void PrintMemStats (std::map<std::string, MemStat>& memstats,
diff --git a/Src/Base/AMReX_TinyProfiler.cpp b/Src/Base/AMReX_TinyProfiler.cpp
index fab9e38aa82..5b4a5ef3b44 100644
--- a/Src/Base/AMReX_TinyProfiler.cpp
+++ b/Src/Base/AMReX_TinyProfiler.cpp
@@ -56,6 +56,8 @@ std::string TinyProfiler::output_file;
 
 namespace {
     constexpr char mainregion[] = "main";
+    bool finalized = false;
+    bool memprof_finalized = false;
 }
 
 TinyProfiler::TinyProfiler (std::string funcname) noexcept
@@ -318,23 +320,14 @@ TinyProfiler::Initialize () noexcept
         pp.queryAdd("print_threshold", print_threshold);
 
         pp.queryAdd("enabled", enabled);
-        pp.queryAdd("output_file", output_file);
     }
 
     if (!enabled) { return; }
 
-    if (ParallelDescriptor::IOProcessor()) {
-        static bool first = true;
-        if (first && !output_file.empty() && output_file != "/dev/null") {
-            if (FileSystem::Exists(output_file)) {
-                FileSystem::Remove(output_file);
-            }
-            first = false;
-        }
-    }
-
     regionstack.emplace_back(mainregion);
     t_init = amrex::second();
+
+    finalized = false;
 }
 
 void
@@ -352,6 +345,8 @@ TinyProfiler::MemoryInitialize () noexcept
 #ifdef AMREX_USE_OMP
     mem_stack_thread_private.resize(omp_get_max_threads());
 #endif
+
+    memprof_finalized = false;
 }
 
 void
@@ -359,7 +354,6 @@ TinyProfiler::Finalize (bool bFlushing) noexcept
 {
     if (!enabled) { return; }
 
-    static bool finalized = false;
     if (!bFlushing) {                // If flushing, don't make this the last time!
         if (finalized) {
             return;
@@ -388,12 +382,13 @@ TinyProfiler::Finalize (bool bFlushing) noexcept
     std::ostream* os = nullptr;
     std::streamsize oldprec = 0;
     if (ParallelDescriptor::IOProcessor()) {
-        if (output_file.empty()) {
+        auto const& ofile = get_output_file();
+        if (ofile.empty()) {
             os = &(amrex::OutStream());
-        } else if (output_file != "/dev/null") {
-            ofs.open(output_file, std::ios_base::app);
+        } else if (ofile != "/dev/null") {
+            ofs.open(ofile, std::ios_base::app);
             if (!ofs.is_open()) {
-                amrex::Error("TinyProfiler failed to open "+output_file);
+                amrex::Error("TinyProfiler failed to open "+ofile);
             }
             os = static_cast<std::ostream*>(&ofs);
         }
@@ -439,6 +434,12 @@ TinyProfiler::Finalize (bool bFlushing) noexcept
         }
     }
 
+    if (!bFlushing) {
+        regionstack.clear();
+        ttstack.clear();
+        statsmap.clear();
+    }
+
     if(os) { os->precision(oldprec); }
 }
 
@@ -449,12 +450,11 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept
 
     // This function must be called BEFORE the profiled arenas are deleted
 
-    static bool finalized = false;
     if (!bFlushing) {                // If flushing, don't make this the last time!
-        if (finalized) {
+        if (memprof_finalized) {
             return;
         } else {
-            finalized = true;
+            memprof_finalized = true;
         }
     }
 
@@ -467,12 +467,13 @@ TinyProfiler::MemoryFinalize (bool bFlushing) noexcept
     std::ostream* os = nullptr;
     std::streamsize oldprec = 0;
     if (ParallelDescriptor::IOProcessor()) {
-        if (output_file.empty()) {
+        auto const& ofile = get_output_file();
+        if (ofile.empty()) {
             os = &(amrex::OutStream());
-        } else if (output_file != "/dev/null") {
-            ofs.open(output_file, std::ios_base::app);
+        } else if (ofile != "/dev/null") {
+            ofs.open(ofile, std::ios_base::app);
             if (!ofs.is_open()) {
-                amrex::Error("TinyProfiler failed to open "+output_file);
+                amrex::Error("TinyProfiler failed to open "+ofile);
             }
             os = static_cast<std::ostream*>(&ofs);
         }
@@ -951,4 +952,28 @@ TinyProfiler::PrintCallStack (std::ostream& os)
     }
 }
 
+std::string const&
+TinyProfiler::get_output_file ()
+{
+    // Instead of reading it only once, we could try to read the parameter
+    // every time. But I am not sure how useful that might be.
+    static bool first = true;
+    if (first) {
+        first = false;
+
+        amrex::ParmParse pp("tiny_profiler");
+        pp.query("output_file", output_file);
+
+        if (ParallelDescriptor::IOProcessor()) {
+            if (!output_file.empty() && output_file != "/dev/null") {
+                if (FileSystem::Exists(output_file)) {
+                    FileSystem::Remove(output_file);
+                }
+            }
+        }
+    }
+
+    return output_file;
+}
+
 }

From 65d10a1a7592ecf7e81a92d5c5863a0d1f1e3662 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Wed, 4 Sep 2024 12:08:40 -0500
Subject: [PATCH 08/18] Fortran Interfaces: Add new average down functions
 (#4124)

Add average down function for cell-centered data without volume
weighting.

Add average down function for nodal data.
---
 .../Base/AMReX_multifabutil_fi.cpp            | 10 +++++--
 .../Base/AMReX_multifabutil_mod.F90           | 26 ++++++++++++++++++-
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp b/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp
index d59c2d02053..61d0385276b 100644
--- a/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp
+++ b/Src/F_Interfaces/Base/AMReX_multifabutil_fi.cpp
@@ -5,12 +5,18 @@ using namespace amrex;
 extern "C"
 {
     void amrex_fi_average_down (const MultiFab* S_fine, MultiFab* S_crse,
-                             const Geometry* fgeom, const Geometry* cgeom,
-                             int scomp, int ncomp, int rr)
+                                const Geometry* fgeom, const Geometry* cgeom,
+                                int scomp, int ncomp, int rr)
     {
         amrex::average_down(*S_fine, *S_crse, *fgeom, *cgeom, scomp, ncomp, rr);
     }
 
+    void amrex_fi_average_down_cell_node (const MultiFab* S_fine, MultiFab* S_crse,
+                                          int scomp, int ncomp, int rr)
+    {
+        amrex::average_down(*S_fine, *S_crse, scomp, ncomp, rr);
+    }
+
     void amrex_fi_average_down_faces (MultiFab const* fmf[], MultiFab* cmf[],
                                       Geometry const* cgeom, int scomp, int ncomp,
                                       int rr)
diff --git a/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90 b/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90
index 9575b217a73..643f661649f 100644
--- a/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90
+++ b/Src/F_Interfaces/Base/AMReX_multifabutil_mod.F90
@@ -8,7 +8,11 @@ module amrex_multifabutil_module
   implicit none
   private
 
-  public :: amrex_average_down, amrex_average_down_faces, amrex_average_cellcenter_to_face
+  public :: amrex_average_down, & ! volume weighted average down of cell data
+       &    amrex_average_down_cell,  & ! average down of cell data
+       &    amrex_average_down_node,  & ! average down of nodal data
+       &    amrex_average_down_faces, & ! average down of face data
+       &    amrex_average_cellcenter_to_face ! average from cell centers to faces
 
   interface
      subroutine amrex_fi_average_down (fmf, cmf, fgeom, cgeom, scomp, ncomp, rr) bind(c)
@@ -18,6 +22,13 @@ subroutine amrex_fi_average_down (fmf, cmf, fgeom, cgeom, scomp, ncomp, rr) bind
        integer(c_int), value :: scomp, ncomp, rr
      end subroutine amrex_fi_average_down
 
+     subroutine amrex_fi_average_down_cell_node (fmf, cmf, scomp, ncomp, rr) bind(c)
+       import
+       implicit none
+       type(c_ptr), value :: fmf, cmf
+       integer(c_int), value :: scomp, ncomp, rr
+     end subroutine amrex_fi_average_down_cell_node
+
      subroutine amrex_fi_average_down_faces (fmf, cmf, cgeom, scomp, ncomp, rr) bind(c)
        import
        implicit none
@@ -45,6 +56,19 @@ subroutine amrex_average_down (fmf, cmf, fgeom, cgeom, scomp, ncomp, rr)
     call amrex_fi_average_down(fmf%p, cmf%p, fgeom%p, cgeom%p, scomp-1, ncomp, rr)
   end subroutine amrex_average_down
 
+  subroutine amrex_average_down_cell (fmf, cmf, scomp, ncomp, rr)
+    type(amrex_multifab), intent(in   ) :: fmf
+    type(amrex_multifab), intent(inout) :: cmf
+    integer, intent(in) :: scomp, ncomp, rr
+    call amrex_fi_average_down_cell_node(fmf%p, cmf%p, scomp-1, ncomp, rr)
+  end subroutine amrex_average_down_cell
+
+  subroutine amrex_average_down_node (fmf, cmf, scomp, ncomp, rr)
+    type(amrex_multifab), intent(in   ) :: fmf
+    type(amrex_multifab), intent(inout) :: cmf
+    integer, intent(in) :: scomp, ncomp, rr
+    call amrex_fi_average_down_cell_node(fmf%p, cmf%p, scomp-1, ncomp, rr)
+  end subroutine amrex_average_down_node
 
   subroutine amrex_average_down_faces (fmf, cmf, cgeom, scomp, ncomp, rr)
     type(amrex_multifab), intent(in   ) :: fmf(amrex_spacedim)

From 61b91f9e52a13718384be076b02627068721f619 Mon Sep 17 00:00:00 2001
From: Bruce Perry <Bruce.Perry@nrel.gov>
Date: Wed, 4 Sep 2024 14:00:29 -0600
Subject: [PATCH 09/18] use perl instead of sed in style checks for portability
 to MacOS (#4127)

## Summary

`sed -i` does not function the same for GNU and BSD sed and a simple
portable command between the two is apparently not possible. Therefore,
the style check scripts do not work by default when running on Macs,
unless the user installs `gsed` and aliases `sed` to run that instead.
For portability, `perl` can be used instead of `sed`.

## Additional background

Proposed solution taken from here:
https://stackoverflow.com/questions/4247068/sed-command-with-i-option-failing-on-mac-but-works-on-linux
---
 .github/workflows/style/check_tabs.sh                 | 2 +-
 .github/workflows/style/check_trailing_whitespaces.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/style/check_tabs.sh b/.github/workflows/style/check_tabs.sh
index f4418644e3a..e8bb9f14bc3 100755
--- a/.github/workflows/style/check_tabs.sh
+++ b/.github/workflows/style/check_tabs.sh
@@ -20,7 +20,7 @@ find . -type d \( -name .git \
                     -a ! -name "*.lex.h" -a ! -name "*.lex.nolint.H" \) \
                \) \
     -exec grep -Iq . {} \; \
-    -exec sed -i 's/\t/\ \ \ \ /g' {} +
+    -exec perl -i -pe's/\t/\ \ \ \ /g' {} +
 
 gitdiff=`git diff`
 
diff --git a/.github/workflows/style/check_trailing_whitespaces.sh b/.github/workflows/style/check_trailing_whitespaces.sh
index a0f941f8c2b..a4ece8a0ee4 100755
--- a/.github/workflows/style/check_trailing_whitespaces.sh
+++ b/.github/workflows/style/check_trailing_whitespaces.sh
@@ -20,7 +20,7 @@ find . -type d \( -name .git \
                     -a ! -name "*.lex.h" -a ! -name "*.lex.nolint.H" \) \
                \) \
     -exec grep -Iq . {} \; \
-    -exec sed -i 's/[[:blank:]]\+$//g' {} +
+    -exec perl -i -pe's/[[:blank:]]+$//g' {} +
 
 gitdiff=`git diff`
 

From c88b2b51c1ce54dd7a6ca87b77ffe4402a92e58f Mon Sep 17 00:00:00 2001
From: Bruce Perry <Bruce.Perry@nrel.gov>
Date: Wed, 4 Sep 2024 14:15:37 -0600
Subject: [PATCH 10/18] Capability adds for ParmParse enum (#4119)

## Summary

- The ParmParse functions for AMReX Enums added in #4069 did not carry
the optional arguments found in other ParmParse functions. Those are
added so that, for example, querying an enum has the same interface as
querying other types.
- These functions shouldn't modify the ParmParse object, so they are
made `const`.
- The `prefixedName` function is made a public member instead of a
protected member.

## Additional background

The use case for these changes is
https://github.com/erf-model/ERF/pull/1772
---
 Src/Base/AMReX_ParmParse.H   | 62 ++++++++++++++++++++++++++----------
 Src/Base/AMReX_ParmParse.cpp | 34 ++++++++++----------
 2 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H
index cc9588793da..d49b2cac5d7 100644
--- a/Src/Base/AMReX_ParmParse.H
+++ b/Src/Base/AMReX_ParmParse.H
@@ -1192,14 +1192,18 @@ public:
      */
     template <typename T, typename ET = amrex_enum_traits<T>,
               std::enable_if_t<ET::value,int> = 0>
-    int query (const char* name, T& ref)
+    int query (const char* name, T& ref, int ival = FIRST) const
     {
         std::string s;
-        int exist = this->query(name, s);
+        int exist = this->query(name, s, ival);
         if (exist) {
             try {
                 ref = amrex::getEnum<T>(s);
             } catch (...) {
+                if (amrex::Verbose() > 0 ) {
+                    amrex::Print() << "amrex::ParmParse::query (input name: "
+                                   << this->prefixedName(name) << "):\n";
+                }
                 throw;
             }
         }
@@ -1216,13 +1220,17 @@ public:
      */
     template <typename T, typename ET = amrex_enum_traits<T>,
               std::enable_if_t<ET::value,int> = 0>
-    void get (const char* name, T& ref)
+    void get (const char* name, T& ref, int ival = FIRST) const
     {
         std::string s;
-        this->get(name, s);
+        this->get(name, s, ival);
         try {
             ref = amrex::getEnum<T>(s);
         } catch (...) {
+            if (amrex::Verbose() > 0 ) {
+                amrex::Print() << "amrex::ParmParse::get (input name:  "
+                               << this->prefixedName(name) << "):\n";
+            }
             throw;
         }
     }
@@ -1230,14 +1238,25 @@ public:
     //! Query an array of enum values using given name.
     template <typename T, typename ET = amrex_enum_traits<T>,
               std::enable_if_t<ET::value,int> = 0>
-    int queryarr (const char* name, std::vector<T>& ref)
+    int queryarr (const char* name,
+                  std::vector<T>& ref,
+                  int start_ix = FIRST,
+                  int num_val = ALL) const
     {
         std::vector<std::string> s;
-        int exist = this->queryarr(name, s);
+        int exist = this->queryarr(name, s, start_ix, num_val);
         if (exist) {
             ref.resize(s.size());
             for (std::size_t i = 0; i < s.size(); ++i) {
-                ref[i] = amrex::getEnum<T>(s[i]);
+                try {
+                    ref[i] = amrex::getEnum<T>(s[i]);
+                }  catch (...) {
+                    if (amrex::Verbose() > 0 ) {
+                        amrex::Print() << "amrex::ParmParse::queryarr (input name:  "
+                                       << this->prefixedName(name) << "):\n";
+                    }
+                    throw;
+                }
             }
         }
         return exist;
@@ -1246,13 +1265,24 @@ public:
     //! Get an array of enum values using given name.
     template <typename T, typename ET = amrex_enum_traits<T>,
               std::enable_if_t<ET::value,int> = 0>
-    void getarr (const char* name, std::vector<T>& ref)
+    void getarr (const char* name,
+                 std::vector<T>& ref,
+                 int start_ix = FIRST,
+                 int num_val = ALL) const
     {
         std::vector<std::string> s;
-        this->getarr(name, s);
+        this->getarr(name, s, start_ix, num_val);
         ref.resize(s.size());
         for (std::size_t i = 0; i < s.size(); ++i) {
-            ref[i] = amrex::getEnum<T>(s[i]);
+            try {
+                ref[i] = amrex::getEnum<T>(s[i]);
+            }  catch (...) {
+                if (amrex::Verbose() > 0 ) {
+                    amrex::Print() << "amrex::ParmParse::getarr (input name:  "
+                                   << this->prefixedName(name) << "):\n";
+                }
+                throw;
+            }
         }
     }
 
@@ -1268,10 +1298,10 @@ public:
      */
     template <typename T, typename ET = amrex_enum_traits<T>,
               std::enable_if_t<ET::value,int> = 0>
-    int query_enum_case_insensitive (const char* name, T& ref)
+    int query_enum_case_insensitive (const char* name, T& ref, int ival = FIRST) const
     {
         std::string s;
-        int exist = this->query(name, s);
+        int exist = this->query(name, s, ival);
         if (exist) {
             s = amrex::toLower(s);
             auto const& enum_names = amrex::getEnumNameStrings<T>();
@@ -1303,9 +1333,9 @@ public:
      */
     template <typename T, typename ET = amrex_enum_traits<T>,
               std::enable_if_t<ET::value,int> = 0>
-    void get_enum_case_insensitive (const char* name, T& ref)
+    void get_enum_case_insensitive (const char* name, T& ref, int ival = FIRST) const
     {
-        int exist = this->query_enum_case_insensitive(name, ref);
+        int exist = this->query_enum_case_insensitive(name, ref, ival);
         if (!exist) {
             std::string msg("get_enum_case_insensitive(\"");
             msg.append(name).append("\",").append(amrex::getEnumClassName<T>())
@@ -1387,10 +1417,10 @@ public:
 
     static std::string ParserPrefix;
 
-protected:
-
     [[nodiscard]] std::string prefixedName (const std::string_view& str) const;
 
+protected:
+
     std::string m_prefix; // Prefix used in keyword search
     std::string m_parser_prefix; // Prefix used by Parser
     Table* m_table;
diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp
index df1e18e9b08..572d9013da8 100644
--- a/Src/Base/AMReX_ParmParse.cpp
+++ b/Src/Base/AMReX_ParmParse.cpp
@@ -1283,7 +1283,7 @@ ParmParse::query (const char* name,
 }
 
 void
-ParmParse::add (const char* name,
+ParmParse::add (const char* name, // NOLINT(readability-make-member-function-const)
                 const bool  val)
 {
     saddval(prefixedName(name),val);
@@ -1315,7 +1315,7 @@ ParmParse::query (const char* name, int& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const int val)
+ParmParse::add (const char* name, const int val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1349,7 +1349,7 @@ ParmParse::queryarr (const char* name, std::vector<int>& ref, int start_ix,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<int>& ref)
+ParmParse::addarr (const char* name, const std::vector<int>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1381,7 +1381,7 @@ ParmParse::query (const char* name, long& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name,
+ParmParse::add (const char* name, // NOLINT(readability-make-member-function-const)
                 const long  val)
 {
     saddval(prefixedName(name),val);
@@ -1416,7 +1416,7 @@ ParmParse::queryarr (const char* name, std::vector<long>& ref, int start_ix,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<long>& ref)
+ParmParse::addarr (const char* name, const std::vector<long>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1447,7 +1447,7 @@ ParmParse::query (const char* name, long long& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const long long val)
+ParmParse::add (const char* name, const long long val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1481,7 +1481,7 @@ ParmParse::queryarr (const char* name, std::vector<long long>& ref, int start_ix
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<long long>& ref)
+ParmParse::addarr (const char* name, const std::vector<long long>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1512,7 +1512,7 @@ ParmParse::query (const char* name, float& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const float val)
+ParmParse::add (const char* name, const float val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1546,7 +1546,7 @@ ParmParse::queryarr (const char* name, std::vector<float>& ref, int start_ix,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<float>& ref)
+ParmParse::addarr (const char* name, const std::vector<float>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1579,7 +1579,7 @@ ParmParse::query (const char* name, double& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const double val)
+ParmParse::add (const char* name, const double val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1613,7 +1613,7 @@ ParmParse::queryarr (const char* name, std::vector<double>& ref, int start_ix,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<double>& ref)
+ParmParse::addarr (const char* name, const std::vector<double>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1646,7 +1646,7 @@ ParmParse::query (const char* name, std::string& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const std::string& val)
+ParmParse::add (const char* name, const std::string& val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1680,7 +1680,7 @@ ParmParse::queryarr (const char* name, std::vector<std::string>& ref,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<std::string>& ref)
+ParmParse::addarr (const char* name, const std::vector<std::string>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1713,7 +1713,7 @@ ParmParse::query (const char* name, IntVect& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const IntVect& val)
+ParmParse::add (const char* name, const IntVect& val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1747,7 +1747,7 @@ ParmParse::queryarr (const char* name, std::vector<IntVect>& ref,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<IntVect>& ref)
+ParmParse::addarr (const char* name, const std::vector<IntVect>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }
@@ -1778,7 +1778,7 @@ ParmParse::query (const char* name, Box& ref, int ival) const
 }
 
 void
-ParmParse::add (const char* name, const Box& val)
+ParmParse::add (const char* name, const Box& val) // NOLINT(readability-make-member-function-const)
 {
     saddval(prefixedName(name),val);
 }
@@ -1812,7 +1812,7 @@ ParmParse::queryarr (const char* name, std::vector<Box>& ref,
 }
 
 void
-ParmParse::addarr (const char* name, const std::vector<Box>& ref)
+ParmParse::addarr (const char* name, const std::vector<Box>& ref) // NOLINT(readability-make-member-function-const)
 {
     saddarr(prefixedName(name),ref);
 }

From 41353f6ab5cb7048ccaf491cdee978ad5e993ac1 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Wed, 4 Sep 2024 20:54:12 -0500
Subject: [PATCH 11/18] MLMG: Minimum domain width (#4129)

By default, the minimum domain width at the coarsest multigrid level is
2 and 4, for non-EB and EB runs, respectively. Previously, it was set to
4 for runs compiled with EB support but without EB at run time. So the
results of those runs would be different from runs not compiled with EB
support. This is not a correctness issue. Nevertheless, for the sake of
consistency, we set the minimum domain width to 4 only if there is
non-trivial EB at run time.
---
 Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H |  2 +-
 Src/LinearSolvers/MLMG/AMReX_MLLinOp.H         | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
index 9b56b8049df..7b5fb069423 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H
@@ -1281,7 +1281,7 @@ MLABecLaplacianT<MF>::supportNSolve () const
     bool support = false;
     if (this->m_overset_mask[0][0]) {
         if (this->m_geom[0].back().Domain().coarsenable(MLLinOp::mg_coarsen_ratio,
-                                                        MLLinOp::mg_domain_min_width)
+                                                        this->mg_domain_min_width)
             && this->m_grids[0].back().coarsenable(MLLinOp::mg_coarsen_ratio, MLLinOp::mg_box_min_width))
         {
             support = true;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
index 3cc623b761e..ce6a8b53335 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
@@ -570,11 +570,7 @@ protected:
 
     static constexpr int mg_coarsen_ratio = 2;
     static constexpr int mg_box_min_width = 2;
-#ifdef AMREX_USE_EB
-    static constexpr int mg_domain_min_width = 4;
-#else
-    static constexpr int mg_domain_min_width = 2;
-#endif
+    int mg_domain_min_width = 2;
 
     LPInfo info;
 
@@ -803,6 +799,15 @@ MLLinOpT<MF>::defineGrids (const Vector<Geometry>& a_geom,
 {
     BL_PROFILE("MLLinOp::defineGrids()");
 
+#ifdef AMREX_USE_EB
+    if ( ! a_factory.empty() ) {
+        auto const* ebf = dynamic_cast<EBFArrayBoxFactory const*>(a_factory[0]);
+        if (ebf && !(ebf->isAllRegular())) { // Has non-trivial EB
+            mg_domain_min_width = 4;
+        }
+    }
+#endif
+
     m_num_amr_levels = 0;
     for (int amrlev = 0; amrlev < a_geom.size(); amrlev++) {
         if (!a_grids[amrlev].empty()) {

From 216ce6f37de4b65be57fc1006b3457b4fc318e03 Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Thu, 5 Sep 2024 09:05:19 -0700
Subject: [PATCH 12/18] `ParmParse`: Prefix to `FILE` (#4126)

## Summary

For CI/CD workflows and out-of-source tests we often want to include
dependent inputs files via `FILE = <filename>`. For development, we
prefer to run in temporary run directories but want to avoid having to
copy over the latest inputs file from a source directory (mostly to
avoid confusion between source and copy and to enable rapid development
cycles).

Now, the environment variable `AMREX_INPUTS_FILE_PREFIX` can be set to
prefix every `FILE = <filename>` with a custom path. We will use this in
the CTests integration of WarpX.

## Additional background

CC @EZoni
https://github.com/ECP-WarpX/WarpX/pull/5068
---
 Src/Base/AMReX_ParmParse.H   |  3 +++
 Src/Base/AMReX_ParmParse.cpp | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H
index d49b2cac5d7..274858ed253 100644
--- a/Src/Base/AMReX_ParmParse.H
+++ b/Src/Base/AMReX_ParmParse.H
@@ -60,6 +60,9 @@ class RealVect;
 // '\n's. The "FILE = <filename>" definition is special.  Rather than just
 // adding this entry to the database, it reads the contents of <filename>
 // into the database.
+// For CI/CD workflows and out-of-source tests, the environment variable
+// AMREX_INPUTS_FILE_PREFIX can be set to prefix every FILE = <filename>
+// with a custom path.
 //
 // ParmParse stores all entries in a static table which is built the
 // first time a ParmParse object is constructed (usually in main()).
diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp
index 572d9013da8..767bd2a4d0d 100644
--- a/Src/Base/AMReX_ParmParse.cpp
+++ b/Src/Base/AMReX_ParmParse.cpp
@@ -9,6 +9,7 @@
 
 #include <algorithm>
 #include <cctype>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 #include <numeric>
@@ -407,6 +408,19 @@ read_file (const char* fname, ParmParse::Table& tab)
     //
     if ( fname != nullptr && fname[0] != 0 )
     {
+        std::string filename = fname;
+
+        // optional prefix to search files in
+        char const *amrex_inputs_file_prefix_c = std::getenv("AMREX_INPUTS_FILE_PREFIX");
+        if (amrex_inputs_file_prefix_c != nullptr && amrex_inputs_file_prefix_c[0] != '\0') {
+            // non-empty directory prefix (empty is ignored: back() on "" is undefined); add a trailing "/" if missing
+            auto amrex_inputs_file_prefix = std::string(amrex_inputs_file_prefix_c);
+            if (amrex_inputs_file_prefix.back() != '/') {
+                amrex_inputs_file_prefix += "/";
+            }
+            filename = amrex_inputs_file_prefix + filename;
+        }
+
 #ifdef AMREX_USE_MPI
         if (ParallelDescriptor::Communicator() == MPI_COMM_NULL)
         {
@@ -415,7 +429,6 @@ read_file (const char* fname, ParmParse::Table& tab)
 #endif
 
         Vector<char> fileCharPtr;
-        std::string filename = fname;
         ParallelDescriptor::ReadAndBcastFile(filename, fileCharPtr);
 
         std::istringstream is(fileCharPtr.data());

From 08eed9562a53c9929e5f522a669649028bac548f Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Thu, 5 Sep 2024 13:41:38 -0500
Subject: [PATCH 13/18] EB Boundary Area: Fix issues for anisotropic cell size
 (#4131)

* In 2D, the scaling is incorrect. For example, if dx >> dy and the
boundary is parallel to the x-direction, it would produce inf for the
scaled boundary area.

* In 3D, the scaling has been modified so that it's easy to convert from
the dimensionless boundary area to physical units.
---
 Docs/sphinx_documentation/source/EB.rst | 17 +++++++++++++++++
 Src/EB/AMReX_EB2_2D_C.cpp               |  3 +--
 Src/EB/AMReX_EB2_3D_C.cpp               |  4 +++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/Docs/sphinx_documentation/source/EB.rst b/Docs/sphinx_documentation/source/EB.rst
index 9b847a3259c..2c78856f937 100644
--- a/Docs/sphinx_documentation/source/EB.rst
+++ b/Docs/sphinx_documentation/source/EB.rst
@@ -271,6 +271,12 @@ following data:
     // embedded boundary centroid
     const MultiCutFab& getBndryCent () const;
 
+    // embedded boundary normal direction
+    const MultiCutFab& getBndryNormal () const;
+
+    // embedded boundary surface area
+    const MultiCutFab& getBndryArea () const;
+
     // area fractions
     Array<const MultiCutFab*,AMREX_SPACEDIM> getAreaFrac () const;
 
@@ -291,6 +297,17 @@ following data:
   of the data is in the range of :math:`[-0.5,0.5]`, based on each
   cell's local coordinates with respect to the regular cell's center.
 
+- **Boundary normal** is in a :cpp:`MultiCutFab` with ``AMREX_SPACEDIM``
+  components representing the unit vector pointing toward the covered part.
+
+- **Boundary area** is in a :cpp:`MultiCutFab` with a single component
+  representing the dimensionless boundary area. When the cell is isotropic
+  (i.e., :math:`\Delta x = \Delta y = \Delta z`), it's trivial to convert it
+  to physical units. If the cell size is anisotropic, the conversion
+  requires multiplying by a factor of :math:`\sqrt{(n_x \Delta y \Delta
+  z)^2 + (n_y \Delta x \Delta z)^2 + (n_z \Delta x \Delta y)^2}`, where
+  :math:`n` is the boundary normal vector.
+
 - **Face centroid** is in a :cpp:`MultiCutFab` with ``AMREX_SPACEDIM`` components.
   Each component of the data is in the range of :math:`[-0.5,0.5]`, based on
   each cell's local coordinates with respect to the embedded boundary.
diff --git a/Src/EB/AMReX_EB2_2D_C.cpp b/Src/EB/AMReX_EB2_2D_C.cpp
index b2bbde200c5..b99b5559c77 100644
--- a/Src/EB/AMReX_EB2_2D_C.cpp
+++ b/Src/EB/AMReX_EB2_2D_C.cpp
@@ -30,8 +30,7 @@ void set_eb_data (const int i, const int j,
     const Real apnorm = std::hypot(daxp,dayp) + 1.e-30_rt*std::sqrt(dx[0]*dx[1]);
     const Real nx = daxp * (1.0_rt/apnorm);
     const Real ny = dayp * (1.0_rt/apnorm);
-    const Real bareascaling = std::sqrt( (nx*dx[0])*(nx*dx[0]) +
-            (ny*dx[1])*(ny*dx[1]) );
+    const Real bareascaling = std::sqrt(Math::powi<2>(nx*dx[1]) + Math::powi<2>(ny*dx[0]));
 
     const Real nxabs = std::abs(nx);
     const Real nyabs = std::abs(ny);
diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp
index 73170adaeeb..2a366e587ab 100644
--- a/Src/EB/AMReX_EB2_3D_C.cpp
+++ b/Src/EB/AMReX_EB2_3D_C.cpp
@@ -101,7 +101,9 @@ void set_eb_data (const int i, const int j, const int k,
     bnorm(i,j,k,0) = nx;
     bnorm(i,j,k,1) = ny;
     bnorm(i,j,k,2) = nz;
-    barea(i,j,k) = (nx*dapx/(dx[1]*dx[2]) + ny*dapy/(dx[0]*dx[2]) + nz*dapz/(dx[0]*dx[1]));
+    barea(i,j,k) = (nx*dapx + ny*dapy + nz*dapz) / (Math::powi<2>(nx*dx[1]*dx[2]) +
+                                                    Math::powi<2>(ny*dx[0]*dx[2]) +
+                                                    Math::powi<2>(nz*dx[0]*dx[1]));
 
     Real aax = 0.5_rt*(axm+axp);
     Real aay = 0.5_rt*(aym+ayp);

From 13d20a21103b53d7942c3d41b247e51ca695b1f3 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Thu, 5 Sep 2024 14:25:02 -0500
Subject: [PATCH 14/18] Add iMultiFab::sum that returns the sum over a region
 (#4132)

---
 Src/Base/AMReX_iMultiFab.H   |  5 ++++
 Src/Base/AMReX_iMultiFab.cpp | 45 ++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/Src/Base/AMReX_iMultiFab.H b/Src/Base/AMReX_iMultiFab.H
index eb1e350433a..39896975b7b 100644
--- a/Src/Base/AMReX_iMultiFab.H
+++ b/Src/Base/AMReX_iMultiFab.H
@@ -164,6 +164,11 @@ public:
     */
     [[nodiscard]] Long sum (int comp, int nghost = 0, bool local = false) const;
 
+    /**
+    * \brief Returns the sum of component "comp" in the given "region" -- no ghost cells are included.
+    */
+    [[nodiscard]] Long sum (Box const& region, int comp = 0, bool local = false) const;
+
     /**
     * \brief Adds the scalar value val to the value of each cell in the
     * specified subregion of the iMultiFab.  The subregion consists
diff --git a/Src/Base/AMReX_iMultiFab.cpp b/Src/Base/AMReX_iMultiFab.cpp
index 090f1ca96d6..c78e3b4a367 100644
--- a/Src/Base/AMReX_iMultiFab.cpp
+++ b/Src/Base/AMReX_iMultiFab.cpp
@@ -431,6 +431,51 @@ iMultiFab::sum (int comp, int nghost, bool local) const
     return sm;
 }
 
+Long
+iMultiFab::sum (Box const& region, int comp, bool local) const
+{
+    BL_PROFILE("iMultiFab::sum(region)");
+
+    Long sm = 0;
+
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion())
+    {
+        auto const& ma = this->const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Long>{}, *this, IntVect(0),
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Long>
+        {
+            return (region.contains(i,j,k)) ? static_cast<Long>(ma[box_no](i,j,k,comp)) : Long(0);
+        });
+    }
+    else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(*this,true); mfi.isValid(); ++mfi)
+        {
+            const Box& bx = mfi.tilebox() & region;
+            if (bx.ok()) {
+                Array4<int const> const& fab = this->const_array(mfi);
+                auto tmp = Long(0);
+                AMREX_LOOP_3D(bx, i, j, k,
+                {
+                    tmp += fab(i,j,k,comp);
+                });
+                sm += tmp;
+            }
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
 namespace {
 
 IntVect

From ad118ded0068fee42c829e2ae8656872e1764dd6 Mon Sep 17 00:00:00 2001
From: Andrew Myers <atmyers2@gmail.com>
Date: Thu, 5 Sep 2024 13:14:53 -0700
Subject: [PATCH 15/18] Fix unused variables in IntVect (#4133)

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate
---
 Src/Base/AMReX_IntVect.H | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Src/Base/AMReX_IntVect.H b/Src/Base/AMReX_IntVect.H
index 6452f29b036..64457f123ed 100644
--- a/Src/Base/AMReX_IntVect.H
+++ b/Src/Base/AMReX_IntVect.H
@@ -55,8 +55,11 @@ public:
             static constexpr unsigned shift1 = sizeof(size_t)>=8 ? 20 : 10;
             static constexpr unsigned shift2 = sizeof(size_t)>=8 ? 40 : 20;
             if constexpr (dim == 1) {
+                amrex::ignore_unused(shift1);
+                amrex::ignore_unused(shift2);
                 return static_cast<std::size_t>(vec[0]);
             } else if constexpr (dim == 2) {
+                amrex::ignore_unused(shift2);
                 return static_cast<std::size_t>(vec[0]) ^
                       (static_cast<std::size_t>(vec[1]) << shift1);
             } else if constexpr (dim == 3) {

From b454719b10b8d73307852788e54cc62db99a6814 Mon Sep 17 00:00:00 2001
From: Andrew Myers <atmyers2@gmail.com>
Date: Thu, 5 Sep 2024 13:40:29 -0700
Subject: [PATCH 16/18] Use BL_PROFILE instead of BL_PROFILE_VAR to time in
 knapsack()swap (#4134)

This timer was using `BL_PROFILE_VAR` without an associated
`BL_PROFILE_STOP`, resulting in unused variable warnings. It seems to me
that we can just use `BL_PROFILE` here and rely on scope to stop the
timer.

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [ ] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate
---
 Src/Base/AMReX_DistributionMapping.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp
index cfe16b346a1..45304cf5e04 100644
--- a/Src/Base/AMReX_DistributionMapping.cpp
+++ b/Src/Base/AMReX_DistributionMapping.cpp
@@ -651,7 +651,7 @@ knapsack (const std::vector<Long>&         wgts,
     if (efficiency < max_efficiency && do_full_knapsack
         && wblv.size() > 1 && wblv.begin()->size() > 1)
     {
-        BL_PROFILE_VAR("knapsack()swap", swap);
+        BL_PROFILE("knapsack()swap");
 top: ;
 
         if (efficiency < max_efficiency && wblv.begin()->size() > 1)
@@ -980,7 +980,7 @@ DistributionMapping::KnapSackProcessorMap (const DistributionMapping& olddm,
             new_efficiency = avg_weight / max_weight;
 
             if (new_efficiency < max_efficiency && wblv.size() > 1) {
-                BL_PROFILE_VAR("knapsack()swap", swap);
+                BL_PROFILE("knapsack()swap");
 
                 std::sort(wblv.begin(), wblv.end());
 

From 87de52ef071811629120df6247c53e0e6840c4ec Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Fri, 6 Sep 2024 08:52:12 -0500
Subject: [PATCH 17/18] Fix a bug in #4131 (#4138)

Missing sqrt.
---
 Src/EB/AMReX_EB2_3D_C.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp
index 2a366e587ab..2d02e53bdc7 100644
--- a/Src/EB/AMReX_EB2_3D_C.cpp
+++ b/Src/EB/AMReX_EB2_3D_C.cpp
@@ -101,9 +101,9 @@ void set_eb_data (const int i, const int j, const int k,
     bnorm(i,j,k,0) = nx;
     bnorm(i,j,k,1) = ny;
     bnorm(i,j,k,2) = nz;
-    barea(i,j,k) = (nx*dapx + ny*dapy + nz*dapz) / (Math::powi<2>(nx*dx[1]*dx[2]) +
-                                                    Math::powi<2>(ny*dx[0]*dx[2]) +
-                                                    Math::powi<2>(nz*dx[0]*dx[1]));
+    barea(i,j,k) = (nx*dapx + ny*dapy + nz*dapz) / std::sqrt(Math::powi<2>(nx*dx[1]*dx[2]) +
+                                                             Math::powi<2>(ny*dx[0]*dx[2]) +
+                                                             Math::powi<2>(nz*dx[0]*dx[1]));
 
     Real aax = 0.5_rt*(axm+axp);
     Real aay = 0.5_rt*(aym+ayp);

From 51db5ece14372d6f770531054a52edd4abd07b23 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Sat, 7 Sep 2024 19:08:34 -0500
Subject: [PATCH 18/18] amrex::Stack (#4139)

Move the Stack class used in Parser to its own header so that it can be
used by others. The Stack class has a fixed maximum size. This is useful
for traversing a tree on GPU, because recursive functions do not work
well in device code.
---
 Src/Base/AMReX_Stack.H              | 24 ++++++++++++++++++++++++
 Src/Base/CMakeLists.txt             |  1 +
 Src/Base/Make.package               |  2 ++
 Src/Base/Parser/AMReX_IParser_Exe.H | 15 ++-------------
 Src/Base/Parser/AMReX_Parser_Exe.H  | 15 ++-------------
 5 files changed, 31 insertions(+), 26 deletions(-)
 create mode 100644 Src/Base/AMReX_Stack.H

diff --git a/Src/Base/AMReX_Stack.H b/Src/Base/AMReX_Stack.H
new file mode 100644
index 00000000000..8e9e83f8682
--- /dev/null
+++ b/Src/Base/AMReX_Stack.H
@@ -0,0 +1,24 @@
+#ifndef AMREX_STACK_H_
+#define AMREX_STACK_H_
+
+namespace amrex {
+
+template <typename T, int N>
+struct Stack
+{
+public:
+    constexpr void push (T v) { m_data[m_size++] = v; }
+    constexpr void pop () { --m_size; }
+    [[nodiscard]] constexpr bool empty () const { return m_size == 0; }
+    [[nodiscard]] constexpr int size () const { return m_size; }
+    [[nodiscard]] constexpr T const& top () const { return m_data[m_size-1]; }
+    [[nodiscard]] constexpr T      & top ()       { return m_data[m_size-1]; }
+    [[nodiscard]] constexpr T operator[] (int i) const { return m_data[i]; }
+private:
+    T m_data[N];
+    int m_size = 0;
+};
+
+}
+
+#endif
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
index 0436ad032e4..3a19b917c35 100644
--- a/Src/Base/CMakeLists.txt
+++ b/Src/Base/CMakeLists.txt
@@ -31,6 +31,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        AMReX_parmparse_fi.cpp
        AMReX_ParmParse.H
        AMReX_Functional.H
+       AMReX_Stack.H
        AMReX_String.H
        AMReX_String.cpp
        AMReX_Utility.H
diff --git a/Src/Base/Make.package b/Src/Base/Make.package
index b009ebf7d65..361ca079388 100644
--- a/Src/Base/Make.package
+++ b/Src/Base/Make.package
@@ -23,6 +23,8 @@ C$(AMREX_BASE)_sources += AMReX_PODVector.cpp
 C$(AMREX_BASE)_headers += AMReX_BlockMutex.H
 C$(AMREX_BASE)_sources += AMReX_BlockMutex.cpp
 
+C$(AMREX_BASE)_headers += AMReX_Stack.H
+
 C$(AMREX_BASE)_headers += AMReX_String.H
 C$(AMREX_BASE)_sources += AMReX_String.cpp
 
diff --git a/Src/Base/Parser/AMReX_IParser_Exe.H b/Src/Base/Parser/AMReX_IParser_Exe.H
index e7e41c44d48..a68fa32981e 100644
--- a/Src/Base/Parser/AMReX_IParser_Exe.H
+++ b/Src/Base/Parser/AMReX_IParser_Exe.H
@@ -3,6 +3,7 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_IParser_Y.H>
+#include <AMReX_Stack.H>
 #include <AMReX_Vector.H>
 
 #include <limits>
@@ -226,24 +227,12 @@ struct alignas(8) IParserExeJUMP {
     int offset;
 };
 
-template <int N>
-struct IParserStack
-{
-    long long m_data[N];
-    int m_size = 0;
-    constexpr void push (long long v) { m_data[m_size++] = v; }
-    constexpr void pop () { --m_size; }
-    [[nodiscard]] constexpr long long const& top () const { return m_data[m_size-1]; }
-    [[nodiscard]] constexpr long long      & top ()       { return m_data[m_size-1]; }
-    [[nodiscard]] constexpr long long operator[] (int i) const { return m_data[i]; }
-};
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 long long iparser_exe_eval (const char* p, long long const* x)
 {
     if (p == nullptr) { return std::numeric_limits<long long>::max(); }
 
-    IParserStack<AMREX_IPARSER_STACK_SIZE> pstack;
+    Stack<long long, AMREX_IPARSER_STACK_SIZE> pstack;
     while (*((iparser_exe_t*)p) != IPARSER_EXE_NULL) {
         switch (*((iparser_exe_t*)p))
         {
diff --git a/Src/Base/Parser/AMReX_Parser_Exe.H b/Src/Base/Parser/AMReX_Parser_Exe.H
index 37a0b89da78..a5427e6e65c 100644
--- a/Src/Base/Parser/AMReX_Parser_Exe.H
+++ b/Src/Base/Parser/AMReX_Parser_Exe.H
@@ -3,6 +3,7 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_Parser_Y.H>
+#include <AMReX_Stack.H>
 #include <AMReX_Vector.H>
 
 #include <limits>
@@ -217,24 +218,12 @@ struct alignas(8) ParserExeJUMP {
     int offset;
 };
 
-template <int N>
-struct ParserStack
-{
-    double m_data[N];
-    int m_size = 0;
-    constexpr void push (double v) { m_data[m_size++] = v; }
-    constexpr void pop () { --m_size; }
-    [[nodiscard]] constexpr double const& top () const { return m_data[m_size-1]; }
-    [[nodiscard]] constexpr double      & top ()       { return m_data[m_size-1]; }
-    [[nodiscard]] constexpr double operator[] (int i) const { return m_data[i]; }
-};
-
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 double parser_exe_eval (const char* p, double const* x)
 {
     if (p == nullptr) { return std::numeric_limits<double>::max(); }
 
-    ParserStack<AMREX_PARSER_STACK_SIZE> pstack;
+    Stack<double, AMREX_PARSER_STACK_SIZE> pstack;
     while (*((parser_exe_t*)p) != PARSER_EXE_NULL) { // NOLINT
         switch (*((parser_exe_t*)p))
         {