diff --git a/Src/Boundary/Make.package b/Src/Boundary/Make.package index 7dae7ec913f..009591b2b47 100644 --- a/Src/Boundary/Make.package +++ b/Src/Boundary/Make.package @@ -1,3 +1,5 @@ +ifndef AMREX_BOUNDARY_MAKE + AMREX_BOUNDARY_MAKE := 1 CEXE_sources += AMReX_Mask.cpp AMReX_MultiMask.cpp AMReX_LO_BCTYPES.cpp @@ -27,3 +29,4 @@ endif VPATH_LOCATIONS += $(AMREX_HOME)/Src/Boundary INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Boundary +endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H index 4f982e07752..91d02257396 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_1D_K.H @@ -194,7 +194,7 @@ void mlndlap_normalize_aa (int i, int j, int k, Array4 const& x, mlndlap_normalize_ha(i,j,k,x,sx,msk,dxinv); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlndlap_jacobi_ha (int i, int, int, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sx, Array4 const& msk, @@ -208,7 +208,7 @@ void mlndlap_jacobi_ha (int i, int, int, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sx, Array4 const& msk, @@ -216,7 +216,7 @@ void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -235,7 +235,7 @@ void mlndlap_jacobi_aa (int i, int j, int k, Array4 const& sol, Real Ax, mlndlap_jacobi_ha(i,j,k,sol,Ax,rhs,sig,msk,dxinv); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -243,7 +243,7 @@ void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -256,12 +256,12 @@ void mlndlap_jacobi_c (int i, int, int, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept { - amrex::LoopConcurrent(bx, [=] (int i, int, int) noexcept + amrex::LoopConcurrentOnCpu(bx, [&] (int i, int, int) noexcept { if (msk(i,0,0)) { sol(i,0,0) = Real(0.0); @@ -272,7 +272,7 @@ void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& rhs, Array4 const& sx, @@ -281,7 +281,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, { Real fac = dxinv[0]*dxinv[0]; - amrex::Loop(bx, [=] (int i, int, int) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int, int) noexcept { if (msk(i,0,0)) { sol(i,0,0) = Real(0.0); @@ -295,7 +295,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sx, @@ -305,7 +305,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, mlndlap_gauss_seidel_ha(bx,sol,rhs,sx,msk,dxinv); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Array4 const& rhs, Real sig, Array4 const& msk, @@ -313,7 +313,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, { Real fac = dxinv[0]*dxinv[0]; - amrex::Loop(bx, [=] (int i, int, int) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int, int) noexcept { if (msk(i,0,0)) { sol(i,0,0) = Real(0.0); @@ -327,7 +327,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_with_line_solve_aa(Box const&, Array4 const&, Array4 const&, Array4 const&, Array4 const&, GpuArray const&) noexcept @@ -556,7 +556,7 @@ Real mlndlap_adotx_sten (int /*i*/, int /*j*/, int /*k*/, Array4 con Array4 const&, Array4 const&) noexcept { return Real(0.0); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_sten (Box const&, Array4 const&, Array4 const&, Array4 const&, @@ -575,6 +575,72 @@ void mlndlap_restriction_rap (int /*i*/, int /*j*/, int /*k*/, Array4 cons Array4 const&) noexcept {} +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +int mlndlap_color (int i, int, int) +{ + return i%2; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_ha (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sx, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,0,0)) { + sol(i,0,0) = Real(0.0); + } else { + Real fac = dxinv[0]*dxinv[0]; + + Real s0 = Real(-1.0) * fac * (sx(i-1,0,0)+sx(i,0,0)); + Real Ax = sol(i-1,0,0)*fac*sx(i-1,0,0) + + sol(i+1,0,0)*fac*sx(i ,0,0) + + sol(i ,0,0)*s0; + sol(i,0,0) += (rhs(i,0,0) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_aa (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sx, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + mlndlap_gscolor_ha(i,j,k,sol,rhs,sx,msk,dxinv,color); +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_c (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Real sig, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,0,0)) { + sol(i,0,0) = Real(0.0); + } else { + Real fac = dxinv[0]*dxinv[0]; + + Real s0 = Real(-2.0) * fac * sig; + Real Ax = sol(i-1,0,0)*fac*sig + + sol(i+1,0,0)*fac*sig + + sol(i ,0,0)*s0; + sol(i,0,0) += (rhs(i,0,0) - Ax) / s0; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_sten (int, int, int, Array4 const&, + Array4 const&, + Array4 const&, + Array4 const&, int) noexcept +{} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H index 372215f5d73..05f02aaa927 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_2D_K.H @@ -418,7 +418,7 @@ void mlndlap_normalize_aa (int i, int j, int k, Array4 const& x, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& msk, @@ -436,7 +436,7 @@ void mlndlap_jacobi_ha (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& msk, @@ -445,7 +445,7 @@ void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -472,7 +472,7 @@ void mlndlap_jacobi_aa (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -487,14 +487,14 @@ void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept { Real fac = -Real(2.0/6.0)*(dxinv[0]*dxinv[0] + dxinv[1]*dxinv[1]); - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + amrex::LoopConcurrentOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -505,14 +505,14 @@ void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept { Real fac = -Real(2.0/6.0)*(dxinv[0]*dxinv[0] + dxinv[1]*dxinv[1]); - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + amrex::LoopConcurrentOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -523,7 +523,7 @@ void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& msk, @@ -533,7 +533,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -570,7 +570,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -583,7 +583,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Real f2xmy = Real(2.0)*facx - facy; Real fmx2y = Real(2.0)*facy - facx; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -614,7 +614,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Array4 const& rhs, Real sig, Array4 const& msk, @@ -627,7 +627,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Real f2xmy = Real(2.0)*facx - facy; Real fmx2y = Real(2.0)*facy - facx; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -658,7 +658,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_FORCE_INLINE void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D& c_ls, Array1D& r_ls, Array1D& u_ls, Array1D& gam, int ilen ) noexcept @@ -677,7 +677,7 @@ void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Arra } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -1819,6 +1819,21 @@ void mlndlap_stencil_rap (int i, int j, int, Array4 const& csten, csten(i,j,k,3) = Real(0.5)*(cross1+cross2); } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlndlap_adotx_sten_doit (int i, int j, int k, Array4 const& x, + Array4 const& sten) noexcept +{ + return x(i-1,j-1,k)*sten(i-1,j-1,k,3) + + x(i ,j-1,k)*sten(i ,j-1,k,2) + + x(i+1,j-1,k)*sten(i ,j-1,k,3) + + x(i-1,j ,k)*sten(i-1,j ,k,1) + + x(i ,j ,k)*sten(i ,j ,k,0) + + x(i+1,j ,k)*sten(i ,j ,k,1) + + x(i-1,j+1,k)*sten(i-1,j ,k,3) + + x(i ,j+1,k)*sten(i ,j ,k,2) + + x(i+1,j+1,k)*sten(i ,j ,k,3); +} + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, Array4 const& sten, Array4 const& msk) noexcept @@ -1826,40 +1841,33 @@ Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, if (msk(i,j,k)) { return Real(0.0); } else { - return x(i-1,j-1,k)*sten(i-1,j-1,k,3) - + x(i ,j-1,k)*sten(i ,j-1,k,2) - + x(i+1,j-1,k)*sten(i ,j-1,k,3) - + x(i-1,j ,k)*sten(i-1,j ,k,1) - + x(i ,j ,k)*sten(i ,j ,k,0) - + x(i+1,j ,k)*sten(i ,j ,k,1) - + x(i-1,j+1,k)*sten(i-1,j ,k,3) - + x(i ,j+1,k)*sten(i ,j ,k,2) - + x(i+1,j+1,k)*sten(i ,j ,k,3); + return mlndlap_adotx_sten_doit(i,j,k,x,sten); } } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_gauss_seidel_sten (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk) noexcept +{ + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else if (sten(i,j,k,0) != Real(0.0)) { + Real Ax = mlndlap_adotx_sten_doit(i,j,k,sol,sten); + sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,0); + } +} + +inline void mlndlap_gauss_seidel_sten (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sten, Array4 const& msk) noexcept { - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + AMREX_LOOP_3D(bx, i, j, k, { - if (msk(i,j,k)) { - sol(i,j,k) = Real(0.0); - } else if (sten(i,j,k,0) != Real(0.0)) { - Real Ax = sol(i-1,j-1,k)*sten(i-1,j-1,k,3) - + sol(i ,j-1,k)*sten(i ,j-1,k,2) - + sol(i+1,j-1,k)*sten(i ,j-1,k,3) - + sol(i-1,j ,k)*sten(i-1,j ,k,1) - + sol(i ,j ,k)*sten(i ,j ,k,0) - + sol(i+1,j ,k)*sten(i ,j ,k,1) - + sol(i-1,j+1,k)*sten(i-1,j ,k,3) - + sol(i ,j+1,k)*sten(i ,j ,k,2) - + sol(i+1,j+1,k)*sten(i ,j ,k,3); - sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,0); - } + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); }); } @@ -3536,5 +3544,154 @@ void mlndlap_fillijmat_cs_gpu (const int ps, const int i, const int j, const int #endif +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +int mlndlap_color (int i, int j, int) +{ + return (i%2) + (j%2)*2; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_ha (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sx, + Array4 const& sy, Array4 const& msk, + GpuArray const& dxinv, int color, + bool is_rz) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; + + Real s0 = Real(-2.0)*(facx*(sx(i-1,j-1,k)+sx(i,j-1,k)+sx(i-1,j,k)+sx(i,j,k)) + +facy*(sy(i-1,j-1,k)+sy(i,j-1,k)+sy(i-1,j,k)+sy(i,j,k))); + + Real Ax = sol(i-1,j-1,k)*(facx*sx(i-1,j-1,k)+facy*sy(i-1,j-1,k)) + + sol(i+1,j-1,k)*(facx*sx(i ,j-1,k)+facy*sy(i ,j-1,k)) + + sol(i-1,j+1,k)*(facx*sx(i-1,j ,k)+facy*sy(i-1,j ,k)) + + sol(i+1,j+1,k)*(facx*sx(i ,j ,k)+facy*sy(i ,j ,k)) + + sol(i-1,j,k)*(Real(2.0)*facx*(sx(i-1,j-1,k)+sx(i-1,j,k)) + - facy*(sy(i-1,j-1,k)+sy(i-1,j,k))) + + sol(i+1,j,k)*(Real(2.0)*facx*(sx(i ,j-1,k)+sx(i ,j,k)) + - facy*(sy(i ,j-1,k)+sy(i ,j,k))) + + sol(i,j-1,k)*( -facx*(sx(i-1,j-1,k)+sx(i,j-1,k)) + +Real(2.0)*facy*(sy(i-1,j-1,k)+sy(i,j-1,k))) + + sol(i,j+1,k)*( -facx*(sx(i-1,j ,k)+sx(i,j ,k)) + +Real(2.0)*facy*(sy(i-1,j ,k)+sy(i,j ,k))) + + sol(i,j,k)*s0; + + if (is_rz) { + Real fp = facy / static_cast(2*i+1); + Real fm = facy / static_cast(2*i-1); + Real frzlo = fm*sy(i-1,j-1,k)-fp*sy(i,j-1,k); + Real frzhi = fm*sy(i-1,j ,k)-fp*sy(i,j ,k); + s0 += - frzhi - frzlo; + Ax += frzhi*(sol(i,j+1,k)-sol(i,j,k)) + + frzlo*(sol(i,j-1,k)-sol(i,j,k)); + } + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_aa (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sig, + Array4 const& msk, + GpuArray const& dxinv, int color, + bool is_rz) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; + Real fxy = facx + facy; + Real f2xmy = Real(2.0)*facx - facy; + Real fmx2y = Real(2.0)*facy - facx; + + Real s0 = (-Real(2.0))*fxy*(sig(i-1,j-1,k)+sig(i,j-1,k)+sig(i-1,j,k)+sig(i,j,k)); + Real Ax = sol(i-1,j-1,k)*fxy*sig(i-1,j-1,k) + + sol(i+1,j-1,k)*fxy*sig(i ,j-1,k) + + sol(i-1,j+1,k)*fxy*sig(i-1,j ,k) + + sol(i+1,j+1,k)*fxy*sig(i ,j ,k) + + sol(i-1,j,k)*f2xmy*(sig(i-1,j-1,k)+sig(i-1,j,k)) + + sol(i+1,j,k)*f2xmy*(sig(i ,j-1,k)+sig(i ,j,k)) + + sol(i,j-1,k)*fmx2y*(sig(i-1,j-1,k)+sig(i,j-1,k)) + + sol(i,j+1,k)*fmx2y*(sig(i-1,j ,k)+sig(i,j ,k)) + + sol(i,j,k)*s0; + + if (is_rz) { + Real fp = facy / static_cast(2*i+1); + Real fm = facy / static_cast(2*i-1); + Real frzlo = fm*sig(i-1,j-1,k)-fp*sig(i,j-1,k); + Real frzhi = fm*sig(i-1,j ,k)-fp*sig(i,j ,k); + s0 += - frzhi - frzlo; + Ax += frzhi*(sol(i,j+1,k)-sol(i,j,k)) + + frzlo*(sol(i,j-1,k)-sol(i,j,k)); + } + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_c (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Real sig, + Array4 const& msk, + GpuArray const& dxinv, int color, + bool is_rz) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/6.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/6.0)*dxinv[1]*dxinv[1]; + Real fxy = facx + facy; + Real f2xmy = Real(2.0)*facx - facy; + Real fmx2y = Real(2.0)*facy - facx; + + Real s0 = (-Real(2.0))*fxy*Real(4.); + Real Ax = sol(i-1,j-1,k)*fxy + + sol(i+1,j-1,k)*fxy + + sol(i-1,j+1,k)*fxy + + sol(i+1,j+1,k)*fxy + + sol(i-1,j,k)*f2xmy*Real(2.) + + sol(i+1,j,k)*f2xmy*Real(2.) + + sol(i,j-1,k)*fmx2y*Real(2.) + + sol(i,j+1,k)*fmx2y*Real(2.) + + sol(i,j,k)*s0; + + if (is_rz) { + Real fp = facy / static_cast(2*i+1); + Real fm = facy / static_cast(2*i-1); + Real frzlo = fm-fp; + Real frzhi = fm-fp; + s0 += - frzhi - frzlo; + Ax += frzhi*(sol(i,j+1,k)-sol(i,j,k)) + + frzlo*(sol(i,j-1,k)-sol(i,j,k)); + } + + sol(i,j,k) += (rhs(i,j,k) - Ax*sig) / (s0*sig); + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_sten (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H index f6b94e7c526..5d31de02711 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLap_3D_K.H @@ -988,7 +988,7 @@ void mlndlap_normalize_aa (int i, int j, int k, Array4 const& x, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& sz, @@ -1011,7 +1011,7 @@ void mlndlap_jacobi_ha (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& sz, @@ -1021,7 +1021,7 @@ void mlndlap_jacobi_ha (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Real Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1055,7 +1055,7 @@ void mlndlap_jacobi_aa (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_DEVICE AMREX_FORCE_INLINE void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1072,7 +1072,7 @@ void mlndlap_jacobi_c (int i, int j, int k, Array4 const& sol, Real Ax, } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Array4 const& sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1081,7 +1081,7 @@ void mlndlap_jacobi_aa (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Array4 const& Ax, Array4 const& rhs, Real sig, Array4 const& msk, GpuArray const& dxinv) noexcept @@ -1102,7 +1102,7 @@ void mlndlap_jacobi_c (Box const& bx, Array4 const& sol, Array4 const& sol, Array4 const& sol, Array4 const& rhs, Array4 const& sx, Array4 const& sy, Array4 const& sz, @@ -1124,7 +1124,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -1220,7 +1220,7 @@ void mlndlap_gauss_seidel_ha (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -1237,7 +1237,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -1277,7 +1277,7 @@ void mlndlap_gauss_seidel_aa (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Array4 const& rhs, Real sig, Array4 const& msk, @@ -1294,7 +1294,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; - amrex::Loop(bx, [=] (int i, int j, int k) noexcept + amrex::LoopOnCpu(bx, [&] (int i, int j, int k) noexcept { if (msk(i,j,k)) { sol(i,j,k) = Real(0.0); @@ -1333,7 +1333,7 @@ void mlndlap_gauss_seidel_c (Box const& bx, Array4 const& sol, }); } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_FORCE_INLINE void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D& c_ls, Array1D& r_ls, Array1D& u_ls, Array1D& gam, int ilen ) noexcept @@ -1352,7 +1352,7 @@ void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Arra } } -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +inline void mlndlap_gauss_seidel_with_line_solve_aa (Box const& bx, Array4 const& sol, Array4 const& rhs, Array4 const& sig, Array4 const& msk, @@ -5460,101 +5460,83 @@ void mlndlap_stencil_rap (int i, int j, int k, Array4 const& csten, } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, - Array4 const& sten, Array4 const& msk) noexcept +Real mlndlap_adotx_sten_doit (int i, int j, int k, Array4 const& x, + Array4 const& sten) noexcept { using namespace nodelap_detail; + return x(i ,j ,k ) * sten(i ,j ,k ,ist_000) + // + + x(i-1,j ,k ) * sten(i-1,j ,k ,ist_p00) + + x(i+1,j ,k ) * sten(i ,j ,k ,ist_p00) + // + + x(i ,j-1,k ) * sten(i ,j-1,k ,ist_0p0) + + x(i ,j+1,k ) * sten(i ,j ,k ,ist_0p0) + // + + x(i ,j ,k-1) * sten(i ,j ,k-1,ist_00p) + + x(i ,j ,k+1) * sten(i ,j ,k ,ist_00p) + // + + x(i-1,j-1,k ) * sten(i-1,j-1,k ,ist_pp0) + + x(i+1,j-1,k ) * sten(i ,j-1,k ,ist_pp0) + + x(i-1,j+1,k ) * sten(i-1,j ,k ,ist_pp0) + + x(i+1,j+1,k ) * sten(i ,j ,k ,ist_pp0) + // + + x(i-1,j ,k-1) * sten(i-1,j ,k-1,ist_p0p) + + x(i+1,j ,k-1) * sten(i ,j ,k-1,ist_p0p) + + x(i-1,j ,k+1) * sten(i-1,j ,k ,ist_p0p) + + x(i+1,j ,k+1) * sten(i ,j ,k ,ist_p0p) + // + + x(i ,j-1,k-1) * sten(i ,j-1,k-1,ist_0pp) + + x(i ,j+1,k-1) * sten(i ,j ,k-1,ist_0pp) + + x(i ,j-1,k+1) * sten(i ,j-1,k ,ist_0pp) + + x(i ,j+1,k+1) * sten(i ,j ,k ,ist_0pp) + // + + x(i-1,j-1,k-1) * sten(i-1,j-1,k-1,ist_ppp) + + x(i+1,j-1,k-1) * sten(i ,j-1,k-1,ist_ppp) + + x(i-1,j+1,k-1) * sten(i-1,j ,k-1,ist_ppp) + + x(i+1,j+1,k-1) * sten(i ,j ,k-1,ist_ppp) + + x(i-1,j-1,k+1) * sten(i-1,j-1,k ,ist_ppp) + + x(i+1,j-1,k+1) * sten(i ,j-1,k ,ist_ppp) + + x(i-1,j+1,k+1) * sten(i-1,j ,k ,ist_ppp) + + x(i+1,j+1,k+1) * sten(i ,j ,k ,ist_ppp); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlndlap_adotx_sten (int i, int j, int k, Array4 const& x, + Array4 const& sten, Array4 const& msk) noexcept +{ if (msk(i,j,k)) { return Real(0.0); } else { - return x(i ,j ,k ) * sten(i ,j ,k ,ist_000) - // - + x(i-1,j ,k ) * sten(i-1,j ,k ,ist_p00) - + x(i+1,j ,k ) * sten(i ,j ,k ,ist_p00) - // - + x(i ,j-1,k ) * sten(i ,j-1,k ,ist_0p0) - + x(i ,j+1,k ) * sten(i ,j ,k ,ist_0p0) - // - + x(i ,j ,k-1) * sten(i ,j ,k-1,ist_00p) - + x(i ,j ,k+1) * sten(i ,j ,k ,ist_00p) - // - + x(i-1,j-1,k ) * sten(i-1,j-1,k ,ist_pp0) - + x(i+1,j-1,k ) * sten(i ,j-1,k ,ist_pp0) - + x(i-1,j+1,k ) * sten(i-1,j ,k ,ist_pp0) - + x(i+1,j+1,k ) * sten(i ,j ,k ,ist_pp0) - // - + x(i-1,j ,k-1) * sten(i-1,j ,k-1,ist_p0p) - + x(i+1,j ,k-1) * sten(i ,j ,k-1,ist_p0p) - + x(i-1,j ,k+1) * sten(i-1,j ,k ,ist_p0p) - + x(i+1,j ,k+1) * sten(i ,j ,k ,ist_p0p) - // - + x(i ,j-1,k-1) * sten(i ,j-1,k-1,ist_0pp) - + x(i ,j+1,k-1) * sten(i ,j ,k-1,ist_0pp) - + x(i ,j-1,k+1) * sten(i ,j-1,k ,ist_0pp) - + x(i ,j+1,k+1) * sten(i ,j ,k ,ist_0pp) - // - + x(i-1,j-1,k-1) * sten(i-1,j-1,k-1,ist_ppp) - + x(i+1,j-1,k-1) * sten(i ,j-1,k-1,ist_ppp) - + x(i-1,j+1,k-1) * sten(i-1,j ,k-1,ist_ppp) - + x(i+1,j+1,k-1) * sten(i ,j ,k-1,ist_ppp) - + x(i-1,j-1,k+1) * sten(i-1,j-1,k ,ist_ppp) - + x(i+1,j-1,k+1) * sten(i ,j-1,k ,ist_ppp) - + x(i-1,j+1,k+1) * sten(i-1,j ,k ,ist_ppp) - + x(i+1,j+1,k+1) * sten(i ,j ,k ,ist_ppp); + return mlndlap_adotx_sten_doit(i,j,k,x,sten); } } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mlndlap_gauss_seidel_sten (Box const& bx, Array4 const& sol, +void mlndlap_gauss_seidel_sten (int i, int j, int k, Array4 const& sol, Array4 const& rhs, Array4 const& sten, Array4 const& msk) noexcept { using namespace nodelap_detail; - amrex::LoopConcurrent(bx, [=] (int i, int j, int k) noexcept + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else if (sten(i,j,k,ist_000) != Real(0.0)) { + Real Ax = mlndlap_adotx_sten_doit(i,j,k,sol,sten); + sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,ist_000); + } +} + +inline +void mlndlap_gauss_seidel_sten (Box const& bx, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk) noexcept +{ + AMREX_LOOP_3D(bx, i, j, k, { - if (msk(i,j,k)) { - sol(i,j,k) = Real(0.0); - } else if (sten(i,j,k,ist_000) != Real(0.0)) { - Real Ax = sol(i ,j ,k ) * sten(i ,j ,k ,ist_000) - // - + sol(i-1,j ,k ) * sten(i-1,j ,k ,ist_p00) - + sol(i+1,j ,k ) * sten(i ,j ,k ,ist_p00) - // - + sol(i ,j-1,k ) * sten(i ,j-1,k ,ist_0p0) - + sol(i ,j+1,k ) * sten(i ,j ,k ,ist_0p0) - // - + sol(i ,j ,k-1) * sten(i ,j ,k-1,ist_00p) - + sol(i ,j ,k+1) * sten(i ,j ,k ,ist_00p) - // - + sol(i-1,j-1,k ) * sten(i-1,j-1,k ,ist_pp0) - + sol(i+1,j-1,k ) * sten(i ,j-1,k ,ist_pp0) - + sol(i-1,j+1,k ) * sten(i-1,j ,k ,ist_pp0) - + sol(i+1,j+1,k ) * sten(i ,j ,k ,ist_pp0) - // - + sol(i-1,j ,k-1) * sten(i-1,j ,k-1,ist_p0p) - + sol(i+1,j ,k-1) * sten(i ,j ,k-1,ist_p0p) - + sol(i-1,j ,k+1) * sten(i-1,j ,k ,ist_p0p) - + sol(i+1,j ,k+1) * sten(i ,j ,k ,ist_p0p) - // - + sol(i ,j-1,k-1) * sten(i ,j-1,k-1,ist_0pp) - + sol(i ,j+1,k-1) * sten(i ,j ,k-1,ist_0pp) - + sol(i ,j-1,k+1) * sten(i ,j-1,k ,ist_0pp) - + sol(i ,j+1,k+1) * sten(i ,j ,k ,ist_0pp) - // - + sol(i-1,j-1,k-1) * sten(i-1,j-1,k-1,ist_ppp) - + sol(i+1,j-1,k-1) * sten(i ,j-1,k-1,ist_ppp) - + sol(i-1,j+1,k-1) * sten(i-1,j ,k-1,ist_ppp) - + sol(i+1,j+1,k-1) * sten(i ,j ,k-1,ist_ppp) - + sol(i-1,j-1,k+1) * sten(i-1,j-1,k ,ist_ppp) - + sol(i+1,j-1,k+1) * sten(i ,j-1,k ,ist_ppp) - + sol(i-1,j+1,k+1) * sten(i-1,j ,k ,ist_ppp) - + sol(i+1,j+1,k+1) * sten(i ,j ,k ,ist_ppp); - - sol(i,j,k) += (rhs(i,j,k) - Ax) / sten(i,j,k,ist_000); - } + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); }); } @@ -10887,5 +10869,239 @@ void mlndlap_fillijmat_cs_gpu (const int ps, const int i, const int j, const int #endif +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +int mlndlap_color (int i, int j, int k) +{ + return (i%2) + (j%2)*2 + (k%2)*4; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_ha (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sx, + Array4 const& sy, Array4 const& sz, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/36.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; + Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; + + Real s0 = Real(-4.0)*(facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)+sx(i-1,j,k-1)+sx(i,j,k-1) + +sx(i-1,j-1,k )+sx(i,j-1,k )+sx(i-1,j,k )+sx(i,j,k )) + +facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)+sy(i-1,j,k-1)+sy(i,j,k-1) + +sy(i-1,j-1,k )+sy(i,j-1,k )+sy(i-1,j,k )+sy(i,j,k )) + +facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1)+sz(i-1,j,k-1)+sz(i,j,k-1) + +sz(i-1,j-1,k )+sz(i,j-1,k )+sz(i-1,j,k )+sz(i,j,k ))); + Real Ax = sol(i,j,k)*s0 + + sol(i-1,j-1,k-1)*(facx*sx(i-1,j-1,k-1) + +facy*sy(i-1,j-1,k-1) + +facz*sz(i-1,j-1,k-1)) + + sol(i+1,j-1,k-1)*(facx*sx(i ,j-1,k-1) + +facy*sy(i ,j-1,k-1) + +facz*sz(i ,j-1,k-1)) + + sol(i-1,j+1,k-1)*(facx*sx(i-1,j ,k-1) + +facy*sy(i-1,j ,k-1) + +facz*sz(i-1,j ,k-1)) + + sol(i+1,j+1,k-1)*(facx*sx(i ,j ,k-1) + +facy*sy(i ,j ,k-1) + +facz*sz(i ,j ,k-1)) + + sol(i-1,j-1,k+1)*(facx*sx(i-1,j-1,k ) + +facy*sy(i-1,j-1,k ) + +facz*sz(i-1,j-1,k )) + + sol(i+1,j-1,k+1)*(facx*sx(i ,j-1,k ) + +facy*sy(i ,j-1,k ) + +facz*sz(i ,j-1,k )) + + sol(i-1,j+1,k+1)*(facx*sx(i-1,j ,k ) + +facy*sy(i-1,j ,k ) + +facz*sz(i-1,j ,k )) + + sol(i+1,j+1,k+1)*(facx*sx(i ,j ,k ) + +facy*sy(i ,j ,k ) + +facz*sz(i ,j ,k )) + +sol(i ,j-1,k-1)*( -facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)) + +Real(2.0)*facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)) + +Real(2.0)*facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1))) + +sol(i ,j+1,k-1)*( -facx*(sx(i-1,j ,k-1)+sx(i,j ,k-1)) + +Real(2.0)*facy*(sy(i-1,j ,k-1)+sy(i,j ,k-1)) + +Real(2.0)*facz*(sz(i-1,j ,k-1)+sz(i,j ,k-1))) + +sol(i ,j-1,k+1)*( -facx*(sx(i-1,j-1,k )+sx(i,j-1,k )) + +Real(2.0)*facy*(sy(i-1,j-1,k )+sy(i,j-1,k )) + +Real(2.0)*facz*(sz(i-1,j-1,k )+sz(i,j-1,k ))) + +sol(i ,j+1,k+1)*( -facx*(sx(i-1,j ,k )+sx(i,j ,k )) + +Real(2.0)*facy*(sy(i-1,j ,k )+sy(i,j ,k )) + +Real(2.0)*facz*(sz(i-1,j ,k )+sz(i,j ,k ))) + +sol(i-1,j ,k-1)*( Real(2.0)*facx*(sx(i-1,j-1,k-1)+sx(i-1,j,k-1)) + -facy*(sy(i-1,j-1,k-1)+sy(i-1,j,k-1)) + +Real(2.0)*facz*(sz(i-1,j-1,k-1)+sz(i-1,j,k-1))) + +sol(i+1,j ,k-1)*( Real(2.0)*facx*(sx(i ,j-1,k-1)+sx(i ,j,k-1)) + -facy*(sy(i ,j-1,k-1)+sy(i ,j,k-1)) + +Real(2.0)*facz*(sz(i ,j-1,k-1)+sz(i ,j,k-1))) + +sol(i-1,j ,k+1)*( Real(2.0)*facx*(sx(i-1,j-1,k )+sx(i-1,j,k )) + -facy*(sy(i-1,j-1,k )+sy(i-1,j,k )) + +Real(2.0)*facz*(sz(i-1,j-1,k )+sz(i-1,j,k ))) + +sol(i+1,j ,k+1)*( Real(2.0)*facx*(sx(i ,j-1,k )+sx(i ,j,k )) + -facy*(sy(i ,j-1,k )+sy(i ,j,k )) + +Real(2.0)*facz*(sz(i ,j-1,k )+sz(i ,j,k ))) + +sol(i-1,j-1,k )*( Real(2.0)*facx*(sx(i-1,j-1,k-1)+sx(i-1,j-1,k)) + +Real(2.0)*facy*(sy(i-1,j-1,k-1)+sy(i-1,j-1,k)) + -facz*(sz(i-1,j-1,k-1)+sz(i-1,j-1,k))) + +sol(i+1,j-1,k )*( Real(2.0)*facx*(sx(i ,j-1,k-1)+sx(i ,j-1,k)) + +Real(2.0)*facy*(sy(i ,j-1,k-1)+sy(i ,j-1,k)) + -facz*(sz(i ,j-1,k-1)+sz(i ,j-1,k))) + +sol(i-1,j+1,k )*( Real(2.0)*facx*(sx(i-1,j ,k-1)+sx(i-1,j ,k)) + +Real(2.0)*facy*(sy(i-1,j ,k-1)+sy(i-1,j ,k)) + -facz*(sz(i-1,j ,k-1)+sz(i-1,j ,k))) + +sol(i+1,j+1,k )*( Real(2.0)*facx*(sx(i ,j ,k-1)+sx(i ,j ,k)) + +Real(2.0)*facy*(sy(i ,j ,k-1)+sy(i ,j ,k)) + -facz*(sz(i ,j ,k-1)+sz(i ,j ,k))) + + Real(2.0)*sol(i-1,j,k)*(Real(2.0)*facx*(sx(i-1,j-1,k-1)+sx(i-1,j,k-1)+sx(i-1,j-1,k)+sx(i-1,j,k)) + -facy*(sy(i-1,j-1,k-1)+sy(i-1,j,k-1)+sy(i-1,j-1,k)+sy(i-1,j,k)) + -facz*(sz(i-1,j-1,k-1)+sz(i-1,j,k-1)+sz(i-1,j-1,k)+sz(i-1,j,k))) + + Real(2.0)*sol(i+1,j,k)*(Real(2.0)*facx*(sx(i ,j-1,k-1)+sx(i ,j,k-1)+sx(i ,j-1,k)+sx(i ,j,k)) + -facy*(sy(i ,j-1,k-1)+sy(i ,j,k-1)+sy(i ,j-1,k)+sy(i ,j,k)) + -facz*(sz(i ,j-1,k-1)+sz(i ,j,k-1)+sz(i ,j-1,k)+sz(i ,j,k))) + + Real(2.0)*sol(i,j-1,k)*( -facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)+sx(i-1,j-1,k)+sx(i,j-1,k)) + +Real(2.0)*facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)+sy(i-1,j-1,k)+sy(i,j-1,k)) + -facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1)+sz(i-1,j-1,k)+sz(i,j-1,k))) + + Real(2.0)*sol(i,j+1,k)*( -facx*(sx(i-1,j ,k-1)+sx(i,j ,k-1)+sx(i-1,j ,k)+sx(i,j ,k)) + +Real(2.0)*facy*(sy(i-1,j ,k-1)+sy(i,j ,k-1)+sy(i-1,j ,k)+sy(i,j ,k)) + -facz*(sz(i-1,j ,k-1)+sz(i,j ,k-1)+sz(i-1,j ,k)+sz(i,j ,k))) + + Real(2.0)*sol(i,j,k-1)*( -facx*(sx(i-1,j-1,k-1)+sx(i,j-1,k-1)+sx(i-1,j,k-1)+sx(i,j,k-1)) + -facy*(sy(i-1,j-1,k-1)+sy(i,j-1,k-1)+sy(i-1,j,k-1)+sy(i,j,k-1)) + +Real(2.0)*facz*(sz(i-1,j-1,k-1)+sz(i,j-1,k-1)+sz(i-1,j,k-1)+sz(i,j,k-1))) + + Real(2.0)*sol(i,j,k+1)*( -facx*(sx(i-1,j-1,k )+sx(i,j-1,k )+sx(i-1,j,k )+sx(i,j,k )) + -facy*(sy(i-1,j-1,k )+sy(i,j-1,k )+sy(i-1,j,k )+sy(i,j,k )) + +Real(2.0)*facz*(sz(i-1,j-1,k )+sz(i,j-1,k )+sz(i-1,j,k )+sz(i,j,k ))); + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_aa (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Array4 const& sig, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/36.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; + Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; + Real fxyz = facx + facy + facz; + Real fmx2y2z = -facx + Real(2.0)*facy + Real(2.0)*facz; + Real f2xmy2z = Real(2.0)*facx - facy + Real(2.0)*facz; + Real f2x2ymz = Real(2.0)*facx + Real(2.0)*facy - facz; + Real f4xm2ym2z = Real(4.0)*facx - Real(2.0)*facy - Real(2.0)*facz; + Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; + Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; + + Real s0 = Real(-4.0)*fxyz*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)+sig(i-1,j,k-1)+sig(i,j,k-1) + +sig(i-1,j-1,k )+sig(i,j-1,k )+sig(i-1,j,k )+sig(i,j,k )); + Real Ax = sol(i,j,k)*s0 + + fxyz*(sol(i-1,j-1,k-1)*sig(i-1,j-1,k-1) + + sol(i+1,j-1,k-1)*sig(i ,j-1,k-1) + + sol(i-1,j+1,k-1)*sig(i-1,j ,k-1) + + sol(i+1,j+1,k-1)*sig(i ,j ,k-1) + + sol(i-1,j-1,k+1)*sig(i-1,j-1,k ) + + sol(i+1,j-1,k+1)*sig(i ,j-1,k ) + + sol(i-1,j+1,k+1)*sig(i-1,j ,k ) + + sol(i+1,j+1,k+1)*sig(i ,j ,k )) + + fmx2y2z*(sol(i ,j-1,k-1)*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)) + + sol(i ,j+1,k-1)*(sig(i-1,j ,k-1)+sig(i,j ,k-1)) + + sol(i ,j-1,k+1)*(sig(i-1,j-1,k )+sig(i,j-1,k )) + + sol(i ,j+1,k+1)*(sig(i-1,j ,k )+sig(i,j ,k ))) + + f2xmy2z*(sol(i-1,j ,k-1)*(sig(i-1,j-1,k-1)+sig(i-1,j,k-1)) + + sol(i+1,j ,k-1)*(sig(i ,j-1,k-1)+sig(i ,j,k-1)) + + sol(i-1,j ,k+1)*(sig(i-1,j-1,k )+sig(i-1,j,k )) + + sol(i+1,j ,k+1)*(sig(i ,j-1,k )+sig(i ,j,k ))) + + f2x2ymz*(sol(i-1,j-1,k )*(sig(i-1,j-1,k-1)+sig(i-1,j-1,k)) + + sol(i+1,j-1,k )*(sig(i ,j-1,k-1)+sig(i ,j-1,k)) + + sol(i-1,j+1,k )*(sig(i-1,j ,k-1)+sig(i-1,j ,k)) + + sol(i+1,j+1,k )*(sig(i ,j ,k-1)+sig(i ,j ,k))) + + f4xm2ym2z*(sol(i-1,j,k)*(sig(i-1,j-1,k-1)+sig(i-1,j,k-1)+sig(i-1,j-1,k)+sig(i-1,j,k)) + + sol(i+1,j,k)*(sig(i ,j-1,k-1)+sig(i ,j,k-1)+sig(i ,j-1,k)+sig(i ,j,k))) + + fm2x4ym2z*(sol(i,j-1,k)*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)+sig(i-1,j-1,k)+sig(i,j-1,k)) + + sol(i,j+1,k)*(sig(i-1,j ,k-1)+sig(i,j ,k-1)+sig(i-1,j ,k)+sig(i,j ,k))) + + fm2xm2y4z*(sol(i,j,k-1)*(sig(i-1,j-1,k-1)+sig(i,j-1,k-1)+sig(i-1,j,k-1)+sig(i,j,k-1)) + + sol(i,j,k+1)*(sig(i-1,j-1,k )+sig(i,j-1,k )+sig(i-1,j,k )+sig(i,j,k ))); + + sol(i,j,k) += (rhs(i,j,k) - Ax) / s0; + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_c (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, Real sig, + Array4 const& msk, + GpuArray const& dxinv, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + if (msk(i,j,k)) { + sol(i,j,k) = Real(0.0); + } else { + Real facx = Real(1.0/36.0)*dxinv[0]*dxinv[0]; + Real facy = Real(1.0/36.0)*dxinv[1]*dxinv[1]; + Real facz = Real(1.0/36.0)*dxinv[2]*dxinv[2]; + Real fxyz = facx + facy + facz; + Real fmx2y2z = -facx + Real(2.0)*facy + Real(2.0)*facz; + Real f2xmy2z = Real(2.0)*facx - facy + Real(2.0)*facz; + Real f2x2ymz = Real(2.0)*facx + Real(2.0)*facy - facz; + Real f4xm2ym2z = Real(4.0)*facx - Real(2.0)*facy - Real(2.0)*facz; + Real fm2x4ym2z = -Real(2.0)*facx + Real(4.0)*facy - Real(2.0)*facz; + Real fm2xm2y4z = -Real(2.0)*facx - Real(2.0)*facy + Real(4.0)*facz; + + Real s0 = Real(-4.0)*fxyz*Real(8.); + Real Ax = sol(i,j,k)*s0 + + fxyz*(sol(i-1,j-1,k-1) + + sol(i+1,j-1,k-1) + + sol(i-1,j+1,k-1) + + sol(i+1,j+1,k-1) + + sol(i-1,j-1,k+1) + + sol(i+1,j-1,k+1) + + sol(i-1,j+1,k+1) + + sol(i+1,j+1,k+1)) + + fmx2y2z*(sol(i ,j-1,k-1)*Real(2.) + + sol(i ,j+1,k-1)*Real(2.) + + sol(i ,j-1,k+1)*Real(2.) + + sol(i ,j+1,k+1)*Real(2.)) + + f2xmy2z*(sol(i-1,j ,k-1)*Real(2.) + + sol(i+1,j ,k-1)*Real(2.) + + sol(i-1,j ,k+1)*Real(2.) + + sol(i+1,j ,k+1)*Real(2.)) + + f2x2ymz*(sol(i-1,j-1,k )*Real(2.) + + sol(i+1,j-1,k )*Real(2.) + + sol(i-1,j+1,k )*Real(2.) + + sol(i+1,j+1,k )*Real(2.)) + + f4xm2ym2z*(sol(i-1,j,k)*Real(4.) + + sol(i+1,j,k)*Real(4.)) + + fm2x4ym2z*(sol(i,j-1,k)*Real(4.) + + sol(i,j+1,k)*Real(4.)) + + fm2xm2y4z*(sol(i,j,k-1)*Real(4.) + + sol(i,j,k+1)*Real(4.)); + + sol(i,j,k) += (rhs(i,j,k) - Ax*sig) / (s0*sig); + } + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mlndlap_gscolor_sten (int i, int j, int k, Array4 const& sol, + Array4 const& rhs, + Array4 const& sten, + Array4 const& msk, int color) noexcept +{ + if (mlndlap_color(i,j,k) == color) { + mlndlap_gauss_seidel_sten(i,j,k,sol,rhs,sten,msk); + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp index da90f84e829..8e490f30348 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp @@ -356,95 +356,38 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& const iMultiFab& dmsk = *m_dirichlet_mask[amrlev][mglev]; #ifdef AMREX_USE_GPU - if (Gpu::inLaunchRegion()) + auto const& solarr_ma = sol.arrays(); + auto const& rhsarr_ma = rhs.const_arrays(); + auto const& dmskarr_ma = dmsk.const_arrays(); +#else + bool regular_coarsening = true; + if (amrlev == 0 && mglev > 0) { - auto solarr_ma = sol.arrays(); - auto rhsarr_ma = rhs.const_arrays(); - auto dmskarr_ma = dmsk.const_arrays(); - if (m_coarsening_strategy == CoarseningStrategy::RAP) - { - auto starr_ma = stencil->const_arrays(); - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) - { - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Real Ax = mlndlap_adotx_sten(i,j,k,solarr_ma[box_no],starr_ma[box_no],dmskarr_ma[box_no]); - mlndlap_jacobi_sten(i,j,k,solarr_ma[box_no],Ax,rhsarr_ma[box_no],starr_ma[box_no],dmskarr_ma[box_no]); - }); - } - } - else if (sigma[0] == nullptr) - { - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) - { - Real const_sigma = m_const_sigma; - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Real Ax = mlndlap_adotx_c(i,j,k,solarr_ma[box_no],const_sigma,dmskarr_ma[box_no], -#if (AMREX_SPACEDIM == 2) - is_rz, -#endif - dxinvarr); - mlndlap_jacobi_c(i,j,k, solarr_ma[box_no], Ax, rhsarr_ma[box_no], const_sigma, - dmskarr_ma[box_no], dxinvarr); - }); - } - } - else if ((m_use_harmonic_average && mglev > 0) || m_use_mapped) - { - AMREX_D_TERM(MultiArray4 const& sxarr_ma = sigma[0]->const_arrays();, - MultiArray4 const& syarr_ma = sigma[1]->const_arrays();, - MultiArray4 const& szarr_ma = sigma[2]->const_arrays();); - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) - { - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Real Ax = mlndlap_adotx_ha(i,j,k,solarr_ma[box_no],AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), dmskarr_ma[box_no], -#if (AMREX_SPACEDIM == 2) - is_rz, + regular_coarsening = mg_coarsen_ratio_vec[mglev-1] == mg_coarsen_ratio; + } + if (sigma[0] == nullptr) { + AMREX_ALWAYS_ASSERT(regular_coarsening); + } #endif - dxinvarr); - mlndlap_jacobi_ha(i,j,k, solarr_ma[box_no], Ax, rhsarr_ma[box_no], AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), - dmskarr_ma[box_no], dxinvarr); - }); - } - } - else + + if (m_use_gauss_seidel) + { + if (m_coarsening_strategy == CoarseningStrategy::RAP) { - auto sarr_ma = sigma[0]->const_arrays(); - for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { - ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + auto const& starr_ma = stencil->const_arrays(); + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) { - Real Ax = mlndlap_adotx_aa(i,j,k,solarr_ma[box_no],sarr_ma[box_no],dmskarr_ma[box_no], -#if (AMREX_SPACEDIM == 2) - is_rz, -#endif - dxinvarr); - mlndlap_jacobi_aa(i,j,k, solarr_ma[box_no], Ax, rhsarr_ma[box_no], sarr_ma[box_no], - dmskarr_ma[box_no], dxinvarr); - }); - } - } - - Gpu::streamSynchronize(); - if (m_smooth_num_sweeps > 1) { nodalSync(amrlev, mglev, sol); } - } - else // cpu + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_sten(i,j,k,solarr_ma[box_no],rhsarr_ma[box_no], + starr_ma[box_no],dmskarr_ma[box_no],color); + }); + } + } else #endif - { - bool regular_coarsening = true; - if (amrlev == 0 && mglev > 0) - { - regular_coarsening = mg_coarsen_ratio_vec[mglev-1] == mg_coarsen_ratio; - } - if (sigma[0] == nullptr) { - AMREX_ALWAYS_ASSERT(regular_coarsening); - } - - if (m_use_gauss_seidel) - { - if (m_coarsening_strategy == CoarseningStrategy::RAP) { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -462,9 +405,27 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } - else if (sigma[0] == nullptr) + } + else if (sigma[0] == nullptr) + { + Real const_sigma = m_const_sigma; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_c(i,j,k, solarr_ma[box_no], rhsarr_ma[box_no], + const_sigma, dmskarr_ma[box_no], dxinvarr, color +#if (AMREX_SPACEDIM == 2) + ,is_rz +#endif + ); + }); + } + } else +#endif { - Real const_sigma = m_const_sigma; #ifdef AMREX_USE_OMP #pragma omp parallel #endif @@ -485,8 +446,32 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } - else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + } + else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + AMREX_D_TERM(MultiArray4 const& sxarr_ma = sigma[0]->const_arrays();, + MultiArray4 const& syarr_ma = sigma[1]->const_arrays();, + MultiArray4 const& szarr_ma = sigma[2]->const_arrays();); + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_ha(i,j,k, solarr_ma[box_no], rhsarr_ma[box_no], + AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), + dmskarr_ma[box_no], dxinvarr, color +#if (AMREX_SPACEDIM == 2) + ,is_rz +#endif + ); + }); + } + } else +#endif + { + #ifdef AMREX_USE_OMP #pragma omp parallel #endif @@ -511,51 +496,94 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } - else + } + else + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& sarr_ma = sigma[0]->const_arrays(); + for (int color = 0; color < AMREX_D_TERM(2,*2,*2); ++color) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_gscolor_aa(i,j,k, solarr_ma[box_no], rhsarr_ma[box_no], + sarr_ma[box_no], dmskarr_ma[box_no], dxinvarr, color +#if (AMREX_SPACEDIM == 2) + ,is_rz +#endif + ); + }); + } + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel #endif for (MFIter mfi(sol); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); Array4 const& sarr = sigma[0]->const_array(mfi); Array4 const& solarr = sol.array(mfi); Array4 const& rhsarr = rhs.const_array(mfi); Array4 const& dmskarr = dmsk.const_array(mfi); +#ifndef AMREX_USE_GPU if ( regular_coarsening ) +#endif { for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) { mlndlap_gauss_seidel_aa(bx, solarr, rhsarr, sarr, dmskarr, dxinvarr #if (AMREX_SPACEDIM == 2) - ,is_rz + ,is_rz #endif - ); + ); } - } else { + } +#ifndef AMREX_USE_GPU + else { for (int ns = 0; ns < m_smooth_num_sweeps; ++ns) { mlndlap_gauss_seidel_with_line_solve_aa(bx, solarr, rhsarr, sarr, dmskarr, dxinvarr #if (AMREX_SPACEDIM == 2) - ,is_rz + ,is_rz #endif ); } } +#endif } } - - nodalSync(amrlev, mglev, sol); } - else - { - MultiFab Ax(sol.boxArray(), sol.DistributionMap(), 1, 0); - Fapply(amrlev, mglev, Ax, sol); - if (m_coarsening_strategy == CoarseningStrategy::RAP) + Gpu::streamSynchronize(); + nodalSync(amrlev, mglev, sol); + } + else + { + MultiFab Ax(sol.boxArray(), sol.DistributionMap(), 1, 0); + Fapply(amrlev, mglev, Ax, sol); + +#ifdef AMREX_USE_GPU + auto const& Axarr_ma = Ax.const_arrays(); +#endif + + if (m_coarsening_strategy == CoarseningStrategy::RAP) + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& starr_ma = stencil->const_arrays(); + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_sten(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k), + rhsarr_ma[box_no],starr_ma[box_no], + dmskarr_ma[box_no]); + }); + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -572,9 +600,22 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& mlndlap_jacobi_sten(bx,solarr,Axarr,rhsarr,stenarr,dmskarr); } } - else if (sigma[0] == nullptr) + } + else if (sigma[0] == nullptr) + { + Real const_sigma = m_const_sigma; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_c(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k), + rhsarr_ma[box_no],const_sigma, + dmskarr_ma[box_no], dxinvarr); + }); + } else +#endif { - Real const_sigma = m_const_sigma; #ifdef AMREX_USE_OMP #pragma omp parallel #endif @@ -590,7 +631,23 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& dmskarr, dxinvarr); } } - else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + } + else if ( (m_use_harmonic_average && mglev > 0) || m_use_mapped ) + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + AMREX_D_TERM(MultiArray4 const& sxarr_ma = sigma[0]->const_arrays();, + MultiArray4 const& syarr_ma = sigma[1]->const_arrays();, + MultiArray4 const& szarr_ma = sigma[2]->const_arrays();); + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_ha(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k),rhsarr_ma[box_no], + AMREX_D_DECL(sxarr_ma[box_no],syarr_ma[box_no],szarr_ma[box_no]), + dmskarr_ma[box_no], dxinvarr); + }); + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -610,7 +667,21 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& dmskarr, dxinvarr); } } - else + } + else + { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + auto const& sarr_ma = sigma[0]->const_arrays(); + ParallelFor(sol, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + mlndlap_jacobi_aa(i,j,k,solarr_ma[box_no],Axarr_ma[box_no](i,j,k), + rhsarr_ma[box_no],sarr_ma[box_no], + dmskarr_ma[box_no], dxinvarr); + }); + } else +#endif { #ifdef AMREX_USE_OMP #pragma omp parallel @@ -629,6 +700,8 @@ MLNodeLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& } } } + + Gpu::streamSynchronize(); } } diff --git a/Src/LinearSolvers/MLMG/Make.package b/Src/LinearSolvers/MLMG/Make.package index a8f267d4c26..940f7f88f7a 100644 --- a/Src/LinearSolvers/MLMG/Make.package +++ b/Src/LinearSolvers/MLMG/Make.package @@ -96,4 +96,6 @@ endif VPATH_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/MLMG +include $(AMREX_HOME)/Src/Boundary/Make.package + endif