From a726bd30735fd50956edf9c8cfc59bb6ac398b02 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 4 Apr 2024 18:05:18 +0100
Subject: [PATCH 1/4] Updating README-sycl.md to capture the 3.5 modifications (#16)

* Updating README-sycl.md to capture the 3.5 modifications

* Update README-sycl.md

Co-authored-by: aacostadiaz

* Remove the sgemm_nt_1_sycl PoC (#15)

* Remove sgemm_nt_1 PoC

* Fix build issues

* Fix code style format

* Remove ENABLE_NVPTX flag

* Update include/cute/util/debug.hpp

Co-authored-by: Mehdi Goli

* Cosmetic

---------

Co-authored-by: Mehdi Goli

* Applying the comments

---------

Co-authored-by: aacostadiaz
---
 README-sycl.md | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/README-sycl.md b/README-sycl.md
index 36467e58f7..ea41236438 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -17,14 +17,24 @@ resources for GPUs.
 
 Currently, only one example works on NVIDIA SM 80.
 
-## Building with SYCL support
-
-To build CUTLASS SYCL support you need the latest version of DPC++ compiler,
-you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
+## Requirements
+
+To build CUTLASS SYCL support you need the latest version of DPC++ compiler, you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
 or build the compiler from source.
-In either case, make sure to enable the NVIDIA plugin so you can build applications
+For the latter, make sure to enable the NVIDIA plugin so you can build applications
 for NVIDIA GPUs.
+
+I see, in that case let's not call it plugins, which confuses with the Plugins available on the codeplay's website to people who are completely new to SYCL,
+
+we can phrase it as -
+
+Suggested change
+In either case, make sure to enable the NVIDIA plugin so you can build applications
+To build CUTLASS with SYCL support, install the latest DPC++ compiler with the CUDA backend enabled, either by building from source as described [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda) , or by downloading the [nightly releases](https://github.com/intel/llvm/releases)
+
+
+## Building with SYCL support
 
 Once you have your compiler installed, you need to point the `CMAKE_CUDA_HOST_COMPILER` flag
 to the clang++ provided by it. This enables the compilation of SYCL sources
 without altering the current NVCC path.
@@ -44,18 +54,29 @@ make -G Ninja \
 
 # Running the example
 
+## CuTe
 Currently, you can build the CuTe Tutorial using the following command:
 
 ```
-ninja sgemm_nt_1_sycl
+ninja [EXAMPLE_NAME]_sycl
 ```
 
 You can run it like this from your build directory
 
 ```
-LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/sgemm_nt_1_sycl
+LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/[EXAMPLE_NAME]_sycl
 ```
 
+## CUTLASS Example
+Currently, the example `14_amper_tf32_tensorop_gemm` has been implemented for SYCL on Nvidia Ampere architecture.
+You can build this from your build directory by running :
+```
+ninja 14_ampere_tf32_tensorop_gemm_cute
+```
+You can run it like this from your build directory
+```
+NVIDIA_TF32_OVERRIDE=1 LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm_cute
+```
 
 # References
 
 [1] https://www.khronos.org/sycl/
From 84e730f339038152f2c0bf7208af6e286d3fdbe7 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 4 Apr 2024 18:07:24 +0100
Subject: [PATCH 2/4] Revert "Updating README-sycl.md to capture the 3.5 modifications (#16)" (#17)

This reverts commit a726bd30735fd50956edf9c8cfc59bb6ac398b02.
---
 README-sycl.md | 35 +++++++++++----------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/README-sycl.md b/README-sycl.md
index ea41236438..36467e58f7 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -17,24 +17,14 @@ resources for GPUs.
 
 Currently, only one example works on NVIDIA SM 80.
 
-## Requirements
-
-To build CUTLASS SYCL support you need the latest version of DPC++ compiler, you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
-or build the compiler from source.
-For the latter, make sure to enable the NVIDIA plugin so you can build applications
-for NVIDIA GPUs.
-
-
-I see, in that case let's not call it plugins, which confuses with the Plugins available on the codeplay's website to people who are completely new to SYCL,
-
-we can phrase it as -
+## Building with SYCL support
 
-Suggested change
+To build CUTLASS SYCL support you need the latest version of DPC++ compiler,
+you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
+or build the compiler from source.
 In either case, make sure to enable the NVIDIA plugin so you can build applications
-To build CUTLASS with SYCL support, install the latest DPC++ compiler with the CUDA backend enabled, either by building from source as described [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda) , or by downloading the [nightly releases](https://github.com/intel/llvm/releases)
-
+for NVIDIA GPUs.
 
-## Building with SYCL support
 Once you have your compiler installed, you need to point the `CMAKE_CUDA_HOST_COMPILER` flag
 to the clang++ provided by it. This enables the compilation of SYCL sources
 without altering the current NVCC path.
@@ -54,29 +44,18 @@ make -G Ninja \
 
 # Running the example
 
-## CuTe
 Currently, you can build the CuTe Tutorial using the following command:
 
 ```
-ninja [EXAMPLE_NAME]_sycl
+ninja sgemm_nt_1_sycl
 ```
 
 You can run it like this from your build directory
 
 ```
-LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/[EXAMPLE_NAME]_sycl
+LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/sgemm_nt_1_sycl
 ```
 
-## CUTLASS Example
-Currently, the example `14_amper_tf32_tensorop_gemm` has been implemented for SYCL on Nvidia Ampere architecture.
-You can build this from your build directory by running :
-```
-ninja 14_ampere_tf32_tensorop_gemm_cute
-```
-You can run it like this from your build directory
-```
-NVIDIA_TF32_OVERRIDE=1 LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm_cute
-```
 
 # References
 
 [1] https://www.khronos.org/sycl/

From 274e4abfb25b4e6206c2e6425751d5e407581d21 Mon Sep 17 00:00:00 2001
From: "atharva.dubey"
Date: Fri, 24 May 2024 17:30:04 +0100
Subject: [PATCH 3/4] fix batch indexing

---
 include/cutlass/gemm/kernel/intel_pvc_gemm.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
index 90c2014a01..acd84bbe30 100644
--- a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
+++ b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
@@ -222,11 +222,11 @@ class GemmUniversal<
     const int n_coord = (BlockIdxY() * num_sg + thread_idx / SubgroupSize) * get<1>(subgroup_shape);
     const int l_coord = BlockIdxZ();
 
-    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, l_coord),
+    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, _),
                                                                   make_shape(_1{}, K, L),
                                                                   make_stride(Int{}, _1{}));
 
-    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, l_coord),
+    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, _),
                                                                   make_shape(K, Int{}, L),
                                                                   make_stride(_1{}, Int{}));
 
@@ -260,7 +260,7 @@ class GemmUniversal<
     );
     auto gmem_tiled_copy_c = make_xe_2d_copy(make_tensor(params.epilogue.ptr_D, make_shape(M, N, L), params.epilogue.dD));
 
-    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, l_coord),
+    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, _),
                                                   make_shape(Int{}, Int{}, L),
                                                   make_stride(Int{}, Int{}));
 
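For context on the `_` introduced above: it is CuTe's wildcard coordinate (`cute::_`). In CuTe slicing, an integer coordinate pins a tensor mode to a single slice, while `_` keeps that mode free in the resulting view, so passing `_` as the batch coordinate leaves the batch dimension to be covered by the `L` extent carried in the shape argument. Below is a minimal, host-only sketch of that slicing behaviour; it uses only public CuTe calls, is not the PVC copy path (`get_pvc_tensor` is internal to this fork), and the extents 4, 6, 3 are placeholders standing in for M, N and the batch count L.

```
// Host-only CuTe sketch with placeholder extents: an integer coordinate pins
// a mode to one slice, the wildcard `_` keeps the mode free.
#include <cute/tensor.hpp>

int main() {
  using namespace cute;

  float data[4 * 6 * 3] = {};  // backing storage for a (4, 6, 3) tensor
  Tensor t = make_tensor(&data[0],
                         make_layout(make_shape(Int<4>{}, Int<6>{}, Int<3>{})));

  Tensor batch2 = t(_, _, 2);  // rank-2 view: batch mode pinned to slice 2
  Tensor full   = t(_, _, _);  // rank-3 view: batch mode still free

  CUTE_STATIC_ASSERT_V(rank(batch2) == Int<2>{});  // batch mode dropped
  CUTE_STATIC_ASSERT_V(rank(full)   == Int<3>{});  // batch mode kept
  return 0;
}
```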
From 2192aba32e372af21f4fbeba5c752edd709b747d Mon Sep 17 00:00:00 2001
From: "atharva.dubey"
Date: Mon, 27 May 2024 11:30:41 +0100
Subject: [PATCH 4/4] change _ to 0

---
 include/cutlass/gemm/kernel/intel_pvc_gemm.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
index acd84bbe30..24517f916a 100644
--- a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
+++ b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
@@ -222,11 +222,11 @@ class GemmUniversal<
     const int n_coord = (BlockIdxY() * num_sg + thread_idx / SubgroupSize) * get<1>(subgroup_shape);
     const int l_coord = BlockIdxZ();
 
-    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, _),
+    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, 0),
                                                                   make_shape(_1{}, K, L),
                                                                   make_stride(Int{}, _1{}));
 
-    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, _),
+    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, 0),
                                                                   make_shape(K, Int{}, L),
                                                                   make_stride(_1{}, Int{}));
 
@@ -260,7 +260,7 @@ class GemmUniversal<
     );
     auto gmem_tiled_copy_c = make_xe_2d_copy(make_tensor(params.epilogue.ptr_D, make_shape(M, N, L), params.epilogue.dD));
 
-    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, _),
+    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, 0),
                                                   make_shape(Int{}, Int{}, L),
                                                   make_stride(Int{}, Int{}));
 
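The follow-up patch settles on a literal `0` instead of the wildcard: the coordinate names a concrete starting batch, and the batch extent still travels in the shape via `L`. Below is a host-only sketch of that pattern, again with placeholder extents and plain CuTe indexing rather than `get_pvc_tensor`, whose stepping logic this diff alone does not show.

```
// Host-only CuTe sketch with placeholder extents: start at batch 0 and walk
// all L slices explicitly, instead of pinning the batch mode to one index.
#include <cute/tensor.hpp>

int main() {
  using namespace cute;

  float data[4 * 6 * 3] = {};
  Tensor t = make_tensor(&data[0],
                         make_layout(make_shape(Int<4>{}, Int<6>{}, Int<3>{})));

  int m_coord = 1, n_coord = 2;  // stand-ins for the block coordinates
  for (int l = 0; l < size<2>(t); ++l) {  // l runs over the full batch extent
    t(m_coord, n_coord, l) = 1.0f;        // touch (m_coord, n_coord) in batch l
  }
  return 0;
}
```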