diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H index fbf324d6c98..29f8fd9f7e9 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_1D_K.H @@ -157,6 +157,69 @@ void abec_gsrb_os (int i, int, int, int n, Array4 const& phi, Array4 template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi (int i, int, int, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, + Array4 const& bX, + Array4 const& m0, + Array4 const& m1, + Array4 const& f0, + Array4 const& f1, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0,n) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0,n) : T(0.0); + + T delta = dhx*(bX(i,0,0,n)*cf0 + bX(i+1,0,0,n)*cf1); + + T gamma = alpha*a(i,0,0) + + dhx*( bX(i,0,0,n) + bX(i+1,0,0,n) ); + + phi(i,0,0,n) += T(2.0/3.0) * (rhs(i,0,0,n) - Ax(i,0,0,n)) / (gamma - delta); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi_os (int i, int, int, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, + Array4 const& bX, + Array4 const& m0, + Array4 const& m1, + Array4 const& f0, + Array4 const& f1, + Array4 const& osm, + Box const& vbox) noexcept +{ + if (osm(i,0,0) == 0) { + phi(i,0,0) = T(0.0); + } else { + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0,n) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0,n) : T(0.0); + + T delta = dhx*(bX(i,0,0,n)*cf0 + bX(i+1,0,0,n)*cf1); + + T gamma = alpha*a(i,0,0) + + dhx*( bX(i,0,0,n) + bX(i+1,0,0,n) ); + + phi(i,0,0,n) += T(2.0/3.0) * (rhs(i,0,0,n) - Ax(i,0,0,n)) / (gamma - delta); + } +} + +template +AMREX_FORCE_INLINE void abec_gsrb_with_line_solve ( Box const& /*box*/, Array4 const& /*phi*/, Array4 const& /*rhs*/, T /*alpha*/, Array4 const& /*a*/, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H index 9184a755151..2beecd9d422 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_2D_K.H @@ -230,6 +230,81 @@ void abec_gsrb_os (int i, int j, int, int n, Array4 const& phi, Array4 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi (int i, int j, int, int n, Array4 const& phi, + Array4 const& rhs, Array4< T const> const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, + Array4 const& bX, Array4 const& bY, + Array4 const& m0, Array4 const& m2, + Array4 const& m1, Array4 const& m3, + Array4 const& f0, Array4 const& f2, + Array4 const& f1, Array4 const& f3, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0,n) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0,n) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0,n) : T(0.0); + + T delta = dhx*(bX(i,j,0,n)*cf0 + bX(i+1,j,0,n)*cf2) + + dhy*(bY(i,j,0,n)*cf1 + bY(i,j+1,0,n)*cf3); + + T gamma = alpha*a(i,j,0) + + dhx*( bX(i,j,0,n) + bX(i+1,j,0,n) ) + + dhy*( bY(i,j,0,n) + bY(i,j+1,0,n) ); + + phi(i,j,0,n) += T(2.0/3.0) * (rhs(i,j,0,n) - Ax(i,j,0,n)) / (gamma - delta); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi_os (int i, int j, int, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, + Array4 const& bX, Array4 const& bY, + Array4 const& m0, Array4 const& m2, + Array4 const& m1, Array4 const& m3, + Array4 const& f0, Array4 const& f2, + Array4 const& f1, Array4 const& f3, + Array4 const& osm, + Box const& vbox) noexcept +{ + if (osm(i,j,0) == 0) { + phi(i,j,0,n) = T(0.0); + } else { + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0,n) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0,n) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0,n) : T(0.0); + + T delta = dhx*(bX(i,j,0,n)*cf0 + bX(i+1,j,0,n)*cf2) + + dhy*(bY(i,j,0,n)*cf1 + bY(i,j+1,0,n)*cf3); + + T gamma = alpha*a(i,j,0) + + dhx*( bX(i,j,0,n) + bX(i+1,j,0,n) ) + + dhy*( bY(i,j,0,n) + bY(i,j+1,0,n) ); + + phi(i,j,0,n) += T(2.0/3.0) * (rhs(i,j,0,n) - Ax(i,j,0,n)) / (gamma - delta); + } +} + +template +AMREX_FORCE_INLINE void abec_gsrb_with_line_solve ( Box const& box, Array4 const& phi, Array4 const& rhs, T alpha, Array4 const& a, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H index 7d6cca59b49..bb5172396cf 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLap_3D_K.H @@ -329,6 +329,106 @@ void abec_gsrb_os (int i, int j, int k, int n, template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi (int i, int j, int k, int n, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, T dhz, + Array4 const& bX, Array4 const& bY, + Array4 const& bZ, + Array4 const& m0, Array4 const& m2, + Array4 const& m4, + Array4 const& m1, Array4 const& m3, + Array4 const& m5, + Array4 const& f0, Array4 const& f2, + Array4 const& f4, + Array4 const& f1, Array4 const& f3, + Array4 const& f5, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k,n) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z,n) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k,n) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k,n) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z,n) : T(0.0); + + T gamma = alpha*a(i,j,k) + + dhx*(bX(i,j,k,n)+bX(i+1,j,k,n)) + + dhy*(bY(i,j,k,n)+bY(i,j+1,k,n)) + + dhz*(bZ(i,j,k,n)+bZ(i,j,k+1,n)); + + T g_m_d = gamma + - (dhx*(bX(i,j,k,n)*cf0 + bX(i+1,j,k,n)*cf3) + + dhy*(bY(i,j,k,n)*cf1 + bY(i,j+1,k,n)*cf4) + + dhz*(bZ(i,j,k,n)*cf2 + bZ(i,j,k+1,n)*cf5)); + + phi(i,j,k,n) += T(2.0/3.0) * (rhs(i,j,k,n) - Ax(i,j,k,n)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void abec_jacobi_os (int i, int j, int k, int n, + Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, + T alpha, Array4 const& a, + T dhx, T dhy, T dhz, + Array4 const& bX, Array4 const& bY, + Array4 const& bZ, + Array4 const& m0, Array4 const& m2, + Array4 const& m4, + Array4 const& m1, Array4 const& m3, + Array4 const& m5, + Array4 const& f0, Array4 const& f2, + Array4 const& f4, + Array4 const& f1, Array4 const& f3, + Array4 const& f5, + Array4 const& osm, + Box const& vbox) noexcept +{ + if (osm(i,j,k) == 0) { + phi(i,j,k,n) = T(0.0); + } else { + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k,n) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k,n) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z,n) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k,n) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k,n) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z,n) : T(0.0); + + T gamma = alpha*a(i,j,k) + + dhx*(bX(i,j,k,n)+bX(i+1,j,k,n)) + + dhy*(bY(i,j,k,n)+bY(i,j+1,k,n)) + + dhz*(bZ(i,j,k,n)+bZ(i,j,k+1,n)); + + T g_m_d = gamma + - (dhx*(bX(i,j,k,n)*cf0 + bX(i+1,j,k,n)*cf3) + + dhy*(bY(i,j,k,n)*cf1 + bY(i,j+1,k,n)*cf4) + + dhz*(bZ(i,j,k,n)*cf2 + bZ(i,j,k+1,n)*cf5)); + + phi(i,j,k,n) += T(2.0/3.0) * (rhs(i,j,k,n) - Ax(i,j,k,n)) / g_m_d; + } +} + +template +AMREX_FORCE_INLINE void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D& c_ls, Array1D& r_ls, Array1D& u_ls, Array1D& gam, int ilen ) noexcept @@ -348,7 +448,7 @@ void tridiagonal_solve (Array1D& a_ls, Array1D& b_ls, Array1D -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +AMREX_FORCE_INLINE void abec_gsrb_with_line_solve ( Box const& box, Array4 const& phi, Array4 const& rhs, T alpha, Array4 const& a, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H index 0f90a6ead17..cba682834ca 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.H @@ -864,6 +864,12 @@ MLABecLaplacianT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in regular_coarsening = this->mg_coarsen_ratio_vec[mglev-1] == this->mg_coarsen_ratio; } + MultiFab Ax; + if (! this->m_use_gauss_seidel && regular_coarsening) { // jacobi + Ax.define(sol.boxArray(), sol.DistributionMap(), sol.nComp(), 0); + Fapply(amrlev, mglev, Ax, sol); + } + const MF& acoef = m_a_coeffs[amrlev][mglev]; AMREX_ALWAYS_ASSERT(acoef.nGrowVect() == 0); AMREX_D_TERM(const MF& bxcoef = m_b_coeffs[amrlev][mglev][0];, @@ -939,40 +945,76 @@ MLABecLaplacianT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in if (this->m_overset_mask[amrlev][mglev]) { const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, IntVect(0), nc, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept - { - Box vbx(ama[box_no]); - abec_gsrb_os(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), - AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), - AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), - AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), - AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), - osmma[box_no], vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_gsrb_os(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + osmma[box_no], vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_jacobi_os(i,j,k,n, solnma[box_no], rhsma[box_no], axma[box_no], + alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + osmma[box_no], vbx); + }); + } } else if (regular_coarsening) { - ParallelFor(sol, IntVect(0), nc, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept - { - Box vbx(ama[box_no]); - abec_gsrb(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), - AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), - AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), - AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), - AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_gsrb(i,j,k,n, solnma[box_no], rhsma[box_no], alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, IntVect(0), nc, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) noexcept + { + Box vbx(ama[box_no]); + abec_jacobi(i,j,k,n, solnma[box_no], rhsma[box_no], axma[box_no], + alpha, ama[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxma[box_no],byma[box_no],bzma[box_no]), + AMREX_D_DECL(m0ma[box_no],m2ma[box_no],m4ma[box_no]), + AMREX_D_DECL(m1ma[box_no],m3ma[box_no],m5ma[box_no]), + AMREX_D_DECL(f0ma[box_no],f2ma[box_no],f4ma[box_no]), + AMREX_D_DECL(f1ma[box_no],f3ma[box_no],f5ma[box_no]), + vbx); + }); + } } Gpu::streamSynchronize(); } else #endif { MFItInfo mfi_info; - if (Gpu::notInLaunchRegion()) { mfi_info.EnableTiling().SetDynamic(true); } + mfi_info.EnableTiling().SetDynamic(true); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) @@ -1013,43 +1055,71 @@ MLABecLaplacianT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, in if (this->m_overset_mask[amrlev][mglev]) { const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D(tbx, nc, i, j, k, n, - { - abec_gsrb_os(i,j,k,n, solnfab, rhsfab, alpha, afab, - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxfab, byfab, bzfab), - AMREX_D_DECL(m0,m2,m4), - AMREX_D_DECL(m1,m3,m5), - AMREX_D_DECL(f0fab,f2fab,f4fab), - AMREX_D_DECL(f1fab,f3fab,f5fab), - osm, vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_gsrb_os(i,j,k,n, solnfab, rhsfab, alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + osm, vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_jacobi_os(i,j,k,n, solnfab, rhsfab, axfab, + alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + osm, vbx); + }); + } } else if (regular_coarsening) { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D(tbx, nc, i, j, k, n, - { - abec_gsrb(i,j,k,n, solnfab, rhsfab, alpha, afab, - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxfab, byfab, bzfab), - AMREX_D_DECL(m0,m2,m4), - AMREX_D_DECL(m1,m3,m5), - AMREX_D_DECL(f0fab,f2fab,f4fab), - AMREX_D_DECL(f1fab,f3fab,f5fab), - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_gsrb(i,j,k,n, solnfab, rhsfab, alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_LOOP_4D(tbx, nc, i, j, k, n, + { + abec_jacobi(i,j,k,n, solnfab, rhsfab, axfab, + alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + vbx); + }); + } } else { - Gpu::LaunchSafeGuard lsg(false); // xxxxx gpu todo // line solve does not with with GPU - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, - { - abec_gsrb_with_line_solve(thread_box, solnfab, rhsfab, alpha, afab, - AMREX_D_DECL(dhx, dhy, dhz), - AMREX_D_DECL(bxfab, byfab, bzfab), - AMREX_D_DECL(m0,m2,m4), - AMREX_D_DECL(m1,m3,m5), - AMREX_D_DECL(f0fab,f2fab,f4fab), - AMREX_D_DECL(f1fab,f3fab,f5fab), - vbx, redblack, nc); - }); + abec_gsrb_with_line_solve(tbx, solnfab, rhsfab, alpha, afab, + AMREX_D_DECL(dhx, dhy, dhz), + AMREX_D_DECL(bxfab, byfab, bzfab), + AMREX_D_DECL(m0,m2,m4), + AMREX_D_DECL(m1,m3,m5), + AMREX_D_DECL(f0fab,f2fab,f4fab), + AMREX_D_DECL(f1fab,f3fab,f5fab), + vbx, redblack, nc); } } } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H index 783d17bbac0..e04e16f8bd6 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H @@ -55,6 +55,8 @@ public: } void update () override; + void setGaussSeidel (bool flag) noexcept { m_use_gauss_seidel = flag; } + virtual bool isCrossStencil () const { return true; } virtual bool isTensorOp () const { return false; } @@ -201,6 +203,8 @@ protected: mutable Vector> m_fluxreg; + bool m_use_gauss_seidel = true; // use red-black Gauss-Seidel by default + private: void defineAuxData (); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H index 0a5b90a7bab..630f685f49c 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H @@ -367,6 +367,12 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb { BL_PROFILE("MLPoisson::Fsmooth()"); + MultiFab Ax; + if (! this->m_use_gauss_seidel) { // jacobi + Ax.define(sol.boxArray(), sol.DistributionMap(), sol.nComp(), 0); + Fapply(amrlev, mglev, Ax, sol); + } + const auto& undrrelxr = this->m_undrrelxr[amrlev][mglev]; const auto& maskvals = this->m_maskvals [amrlev][mglev]; @@ -443,122 +449,130 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb #endif #endif -#if (AMREX_SPACEDIM == 1) if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], - osmma[box_no], dhx, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - vbx, redblack); - }); - } else if (this->m_has_metric_term) { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_m(i, j, k, solnma[box_no], rhsma[box_no], dhx, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - vbx, redblack, - dx, probxlo); - }); - } else { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], dhx, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - vbx, redblack); - }); - } + if (this->m_use_gauss_seidel) { + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], + osmma[box_no], AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], #endif - -#if (AMREX_SPACEDIM == 2) - if (this->m_overset_mask[amrlev][mglev]) { - AMREX_ASSERT(!this->m_has_metric_term); - const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], - osmma[box_no], dhx, dhy, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - vbx, redblack); - }); - } else if (this->m_has_metric_term) { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_m(i, j, k, solnma[box_no], rhsma[box_no], dhx, dhy, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - vbx, redblack, - dx, probxlo); - }); - } else { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], dhx, dhy, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - vbx, redblack); - }); +#endif + vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_jacobi_os(i, j, k, solnma[box_no], rhsma[box_no], + axma[box_no], osmma[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], +#endif +#endif + vbx); + }); + } } +#if (AMREX_SPACEDIM < 3) + else if (this->m_has_metric_term) { + if (this->m_use_gauss_seidel) { + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_gsrb_m(i, j, k, solnma[box_no], rhsma[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], #endif - -#if (AMREX_SPACEDIM == 3) - if (this->m_overset_mask[amrlev][mglev]) { - AMREX_ASSERT(!this->m_has_metric_term); - const auto& osmma = this->m_overset_mask[amrlev][mglev]->const_arrays(); - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb_os(i, j, k, solnma[box_no], rhsma[box_no], - osmma[box_no], dhx, dhy, dhz, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - f4ma[box_no], m4ma[box_no], - f5ma[box_no], m5ma[box_no], - vbx, redblack); - }); - } else { - ParallelFor(sol, - [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept - { - Box vbx(rhsma[box_no]); - mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], dhx, dhy, dhz, - f0ma[box_no], m0ma[box_no], - f1ma[box_no], m1ma[box_no], - f2ma[box_no], m2ma[box_no], - f3ma[box_no], m3ma[box_no], - f4ma[box_no], m4ma[box_no], - f5ma[box_no], m5ma[box_no], - vbx, redblack); - }); + vbx, redblack, + dx, probxlo); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_jacobi_m(i, j, k, solnma[box_no], rhsma[box_no], + axma[box_no], AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#endif + vbx, dx, probxlo); + }); + } } #endif + else { + if (this->m_use_gauss_seidel) { + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_gsrb(i, j, k, solnma[box_no], rhsma[box_no], + AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], +#endif +#endif + vbx, redblack); + }); + } else { + const auto& axma = Ax.const_arrays(); + ParallelFor(sol, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + Box vbx(rhsma[box_no]); + mlpoisson_jacobi(i, j, k, solnma[box_no], rhsma[box_no], + axma[box_no], AMREX_D_DECL(dhx, dhy, dhz), + f0ma[box_no], m0ma[box_no], + f1ma[box_no], m1ma[box_no], +#if (AMREX_SPACEDIM > 1) + f2ma[box_no], m2ma[box_no], + f3ma[box_no], m3ma[box_no], +#if (AMREX_SPACEDIM > 2) + f4ma[box_no], m4ma[box_no], + f5ma[box_no], m5ma[box_no], +#endif +#endif + vbx); + }); + } + } } else #endif { @@ -598,30 +612,64 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, - f0fab, m0, - f1fab, m1, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, + f0fab, m0, + f1fab, m1, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_os(i, j, k, solnfab, rhsfab, axfab, + osm, dhx, + f0fab, m0, + f1fab, m1, + vbx); + }); + } } else if (this->m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, - f0fab, m0, - f1fab, m1, - vbx, redblack, - dx, probxlo); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, + f0fab, m0, + f1fab, m1, + vbx, redblack, + dx, probxlo); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_m(i, j, k, solnfab, rhsfab, axfab, dhx, + f0fab, m0, + f1fab, m1, + vbx, dx, probxlo); + }); + } } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, - f0fab, m0, - f1fab, m1, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, + f0fab, m0, + f1fab, m1, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi(i, j, k, solnfab, rhsfab, axfab, dhx, + f0fab, m0, + f1fab, m1, + vbx); + }); + } } #endif @@ -629,55 +677,110 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_os(i, j, k, solnfab, rhsfab, axfab, + osm, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx); + }); + } } else if (this->m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, dhy, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - vbx, redblack, - dx, probxlo); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_m(i, j, k, solnfab, rhsfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, redblack, + dx, probxlo); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_m(i, j, k, solnfab, rhsfab, axfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, dx, probxlo); + }); + } } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi(i, j, k, solnfab, rhsfab, axfab, dhx, dhy, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + vbx); + }); + } } - #endif #if (AMREX_SPACEDIM == 3) if (this->m_overset_mask[amrlev][mglev]) { AMREX_ASSERT(!this->m_has_metric_term); const auto& osm = this->m_overset_mask[amrlev][mglev]->const_array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, dhz, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - f4fab, m4, - f5fab, m5, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb_os(i, j, k, solnfab, rhsfab, osm, dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi_os(i, j, k, solnfab, rhsfab, axfab, + osm, dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx); + }); + } } else if (this->hasHiddenDimension()) { Box const& tbx_2d = this->compactify(tbx); Box const& vbx_2d = this->compactify(vbx); @@ -691,27 +794,58 @@ MLPoissonT::Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redb const auto& m1_2d = this->compactify(this->get_d1(m0,m1,m2)); const auto& m2_2d = this->compactify(this->get_d0(m3,m4,m5)); const auto& m3_2d = this->compactify(this->get_d1(m3,m4,m5)); - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx_2d, i, j, k, - { - TwoD::mlpoisson_gsrb(i, j, k, solnfab_2d, rhsfab_2d, dh0, dh1, - f0fab_2d, m0_2d, - f1fab_2d, m1_2d, - f2fab_2d, m2_2d, - f3fab_2d, m3_2d, - vbx_2d, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx_2d, i, j, k, + { + TwoD::mlpoisson_gsrb(i, j, k, solnfab_2d, rhsfab_2d, dh0, dh1, + f0fab_2d, m0_2d, + f1fab_2d, m1_2d, + f2fab_2d, m2_2d, + f3fab_2d, m3_2d, + vbx_2d, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + const auto& axfab_2d = this->compactify(axfab); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx_2d, i, j, k, + { + TwoD::mlpoisson_jacobi(i, j, k, solnfab_2d, rhsfab_2d, + axfab_2d, dh0, dh1, + f0fab_2d, m0_2d, + f1fab_2d, m1_2d, + f2fab_2d, m2_2d, + f3fab_2d, m3_2d, + vbx_2d); + }); + } } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, - { - mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, dhz, - f0fab, m0, - f1fab, m1, - f2fab, m2, - f3fab, m3, - f4fab, m4, - f5fab, m5, - vbx, redblack); - }); + if (this->m_use_gauss_seidel) { + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_gsrb(i, j, k, solnfab, rhsfab, dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx, redblack); + }); + } else { + const auto& axfab = Ax.const_array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D ( tbx, i, j, k, + { + mlpoisson_jacobi(i, j, k, solnfab, rhsfab, axfab, + dhx, dhy, dhz, + f0fab, m0, + f1fab, m1, + f2fab, m2, + f3fab, m3, + f4fab, m4, + f5fab, m5, + vbx); + }); + } } #endif } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H index 071e97b4ea7..59257cf8b1c 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_1D_K.H @@ -188,6 +188,81 @@ void mlpoisson_gsrb_m (int i, int, int, Array4 const& phi, Array4 co } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi (int i, int, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T gamma = -dhx*T(2.0); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf1); + + phi(i,0,0) += T(2.0/3.0) * (rhs(i,0,0) - Ax(i,0,0)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_os (int i, int, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, Array4 const& osm, T dhx, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + if (osm(i,0,0) == 0) { + phi(i,0,0) = T(0.0); + } else { + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0) : T(0.0); + + T gamma = -dhx*T(2.0); + T g_m_d = gamma + dhx*(cf0+cf1); + + phi(i,0,0) += T(2.0/3.0) * (rhs(i,0,0) - Ax(i,0,0)) / g_m_d; + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_m (int i, int, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Box const& vbox, T dx, T probxlo) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,0,0) > 0) + ? f0(vlo.x,0,0) : T(0.0); + T cf1 = (i == vhi.x && m1(vhi.x+1,0,0) > 0) + ? f1(vhi.x,0,0) : T(0.0); + + T rel = (probxlo + i *dx) * (probxlo + i *dx); + T rer = (probxlo +(i+1)*dx) * (probxlo +(i+1)*dx); + + T gamma = -dhx*(rel+rer); + + T g_m_d = gamma + dhx*(rel*cf0+rer*cf1); + + phi(i,0,0) += T(2.0/3.0) * (rhs(i,0,0) - Ax(i,0,0)) / g_m_d; +} + template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlpoisson_normalize (int i, int, int, Array4 const& x, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H index 9604de38feb..5feba2a0066 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_2D_K.H @@ -305,6 +305,102 @@ void mlpoisson_gsrb_m (int i, int j, int, Array4 const& phi, Array4 } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi (int i, int j, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, T dhy, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T gamma = T(-2.0)*(dhx+dhy); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf2) + dhy*(cf1+cf3); + + phi(i,j,0) += T(2.0/3.0) * (rhs(i,j,0) - Ax(i,j,0)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_os (int i, int j, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, Array4 const& osm, + T dhx, T dhy, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T gamma = T(-2.0)*(dhx+dhy); + + if (osm(i,j,0) == 0) { + phi(i,j,0) = T(0.0); + } else { + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf2) + dhy*(cf1+cf3); + + phi(i,j,0) += T(2.0/3.0) * (rhs(i,j,0) - Ax(i,j,0)) / g_m_d; + } +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_m (int i, int j, int, Array4 const& phi, Array4 const& rhs, + Array4 const& Ax, T dhx, T dhy, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Box const& vbox, T dx, T probxlo) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,0) > 0) + ? f0(vlo.x,j,0) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,0) > 0) + ? f1(i,vlo.y,0) : T(0.0); + T cf2 = (i == vhi.x && m2(vhi.x+1,j,0) > 0) + ? f2(vhi.x,j,0) : T(0.0); + T cf3 = (j == vhi.y && m3(i,vhi.y+1,0) > 0) + ? f3(i,vhi.y,0) : T(0.0); + + T rel = probxlo + i*dx; + T rer = probxlo +(i+1)*dx; + T rc = probxlo + (i+T(0.5))*dx; + + T gamma = -dhx*(rel+rer) - T(2.0)*dhy*rc; + + T g_m_d = gamma + dhx*(rel*cf0+rer*cf2) + dhy*rc*(cf1+cf3); + + phi(i,j,0) += T(2.0/3.0) * (rhs(i,j,0) - Ax(i,j,0)) / g_m_d; +} + template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlpoisson_normalize (int i, int j, int, Array4 const& x, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H index e0823294d03..fa23bc4b6dd 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson_3D_K.H @@ -245,6 +245,84 @@ void mlpoisson_gsrb_os (int i, int j, int k, Array4 const& phi, } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi (int i, int j, int k, Array4 const& phi, + Array4 const& rhs, Array4 const& Ax, + T dhx, T dhy, T dhz, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Array4 const& f4, Array4 const& m4, + Array4 const& f5, Array4 const& m5, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + const T gamma = T(-2.)*(dhx+dhy+dhz); + + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf3) + dhy*(cf1+cf4) + dhz*(cf2+cf5); + + phi(i,j,k) += T(2.0/3.0) * (rhs(i,j,k) - Ax(i,j,k)) / g_m_d; +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlpoisson_jacobi_os (int i, int j, int k, Array4 const& phi, + Array4 const& rhs, + Array4 const& Ax, + Array4 const& osm, + T dhx, T dhy, T dhz, + Array4 const& f0, Array4 const& m0, + Array4 const& f1, Array4 const& m1, + Array4 const& f2, Array4 const& m2, + Array4 const& f3, Array4 const& m3, + Array4 const& f4, Array4 const& m4, + Array4 const& f5, Array4 const& m5, + Box const& vbox) noexcept +{ + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + + const T gamma = T(-2.)*(dhx+dhy+dhz); + + if (osm(i,j,k) == 0) { + phi(i,j,k) = T(0.0); + } else { + T cf0 = (i == vlo.x && m0(vlo.x-1,j,k) > 0) + ? f0(vlo.x,j,k) : T(0.0); + T cf1 = (j == vlo.y && m1(i,vlo.y-1,k) > 0) + ? f1(i,vlo.y,k) : T(0.0); + T cf2 = (k == vlo.z && m2(i,j,vlo.z-1) > 0) + ? f2(i,j,vlo.z) : T(0.0); + T cf3 = (i == vhi.x && m3(vhi.x+1,j,k) > 0) + ? f3(vhi.x,j,k) : T(0.0); + T cf4 = (j == vhi.y && m4(i,vhi.y+1,k) > 0) + ? f4(i,vhi.y,k) : T(0.0); + T cf5 = (k == vhi.z && m5(i,j,vhi.z+1) > 0) + ? f5(i,j,vhi.z) : T(0.0); + + T g_m_d = gamma + dhx*(cf0+cf3) + dhy*(cf1+cf4) + dhz*(cf2+cf5); + + phi(i,j,k) += T(2.0/3.0) * (rhs(i,j,k) - Ax(i,j,k)) / g_m_d; + } +} + } #endif diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H index 5834e9dc624..eb93bf2836c 100644 --- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H +++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.H @@ -61,6 +61,7 @@ private: bool semicoarsening = false; int max_coarsening_level = 30; int max_semicoarsening_level = 0; + bool use_gauss_seidel = true; // true: red-black, false: jacobi bool use_hypre = false; bool use_petsc = false; diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp index 27cd0b7a4b4..7a7647ce93a 100644 --- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp +++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp @@ -62,9 +62,10 @@ MyTest::solvePoisson () if (composite_solve) { - MLPoisson mlpoisson(geom, grids, dmap, info); + mlpoisson.setGaussSeidel(use_gauss_seidel); + mlpoisson.setMaxOrder(linop_maxorder); // This is a 3d problem with Dirichlet BC @@ -105,6 +106,8 @@ MyTest::solvePoisson () { MLPoisson mlpoisson({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + mlpoisson.setGaussSeidel(use_gauss_seidel); + mlpoisson.setMaxOrder(linop_maxorder); // This is a 3d problem with Dirichlet BC @@ -163,6 +166,8 @@ MyTest::solveABecLaplacian () MLABecLaplacian mlabec(geom, grids, dmap, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); mlabec.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet, @@ -220,6 +225,8 @@ MyTest::solveABecLaplacian () { MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); mlabec.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet, @@ -290,6 +297,8 @@ MyTest::solveABecLaplacianInhomNeumann () MLABecLaplacian mlabec(geom, grids, dmap, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); // This is a 3d problem with inhomogeneous Neumann BC @@ -349,6 +358,8 @@ MyTest::solveABecLaplacianInhomNeumann () { MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + mlabec.setGaussSeidel(use_gauss_seidel); + mlabec.setMaxOrder(linop_maxorder); // This is a 3d problem with inhomogeneous Neumann BC @@ -547,6 +558,8 @@ MyTest::readParameters () pp.query("max_coarsening_level", max_coarsening_level); pp.query("max_semicoarsening_level", max_semicoarsening_level); + pp.query("use_gauss_seidel", use_gauss_seidel); + pp.query("use_gmres", use_gmres); AMREX_ALWAYS_ASSERT(use_gmres == false || prob_type == 2);