From a726bd30735fd50956edf9c8cfc59bb6ac398b02 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 4 Apr 2024 18:05:18 +0100
Subject: [PATCH 1/4] Updating README-sycl.md to capture the 3.5 modifications (#16)

* Updating README-sycl.md to capture the 3.5 modifications

* Update README-sycl.md

Co-authored-by: aacostadiaz

* Remove the sgemm_nt_1_sycl PoC (#15)

* Remove sgemm_nt_1 PoC

* Fix build issues

* Fix code style format

* Remove ENABLE_NVPTX flag

* Update include/cute/util/debug.hpp

Co-authored-by: Mehdi Goli

* Cosmetic

---------

Co-authored-by: Mehdi Goli

* Applying the comments

---------

Co-authored-by: aacostadiaz
---
 README-sycl.md | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/README-sycl.md b/README-sycl.md
index 36467e58f7..ea41236438 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -17,14 +17,24 @@ resources for GPUs.
 
 Currently, only one example works on NVIDIA SM 80.
 
-## Building with SYCL support
-
-To build CUTLASS SYCL support you need the latest version of DPC++ compiler,
-you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
+## Requirements
+
+To build CUTLASS SYCL support you need the latest version of DPC++ compiler, you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
 or build the compiler from source.
-In either case, make sure to enable the NVIDIA plugin so you can build applications
+For the latter, make sure to enable the NVIDIA plugin so you can build applications
 for NVIDIA GPUs.
+
+I see, in that case let's not call it plugins, which confuses with the Plugins available on the codeplay's website to people who are completely new to SYCL,
+
+we can phrase it as -
+
+Suggested change
+In either case, make sure to enable the NVIDIA plugin so you can build applications
+To build CUTLASS with SYCL support, install the latest DPC++ compiler with the CUDA backend enabled, either by building from source as described [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda) , or by downloading the [nightly releases](https://github.com/intel/llvm/releases)
+
+
+## Building with SYCL support
 
 Once you have your compiler installed, you need to point the `CMAKE_CUDA_HOST_COMPILER` flag
 to the clang++ provided by it. This enables the compilation of SYCL sources
 without altering the current NVCC path.
@@ -44,18 +54,29 @@ make -G Ninja \
 
 # Running the example
 
+## CuTe
 Currently, you can build the CuTe Tutorial using the following command:
 
 ```
-ninja sgemm_nt_1_sycl
+ninja [EXAMPLE_NAME]_sycl
 ```
 
 You can run it like this from your build directory
 
 ```
-LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/sgemm_nt_1_sycl
+LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/[EXAMPLE_NAME]_sycl
 ```
 
+## CUTLASS Example
+Currently, the example `14_amper_tf32_tensorop_gemm` has been implemented for SYCL on Nvidia Ampere architecture.
+You can build this from your build directory by running :
+```
+ninja 14_ampere_tf32_tensorop_gemm_cute
+```
+You can run it like this from your build directory
+```
+NVIDIA_TF32_OVERRIDE=1 LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm_cute
+```
 
 # References
 
 [1] https://www.khronos.org/sycl/
From 84e730f339038152f2c0bf7208af6e286d3fdbe7 Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 4 Apr 2024 18:07:24 +0100
Subject: [PATCH 2/4] Revert "Updating README-sycl.md to capture the 3.5 modifications (#16)" (#17)

This reverts commit a726bd30735fd50956edf9c8cfc59bb6ac398b02.
---
 README-sycl.md | 35 +++++++++++----------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/README-sycl.md b/README-sycl.md
index ea41236438..36467e58f7 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -17,24 +17,14 @@ resources for GPUs.
 
 Currently, only one example works on NVIDIA SM 80.
 
-## Requirements
-
-To build CUTLASS SYCL support you need the latest version of DPC++ compiler, you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
-or build the compiler from source.
-For the latter, make sure to enable the NVIDIA plugin so you can build applications
-for NVIDIA GPUs.
-
-
-I see, in that case let's not call it plugins, which confuses with the Plugins available on the codeplay's website to people who are completely new to SYCL,
-
-we can phrase it as -
+## Building with SYCL support
 
-Suggested change
+To build CUTLASS SYCL support you need the latest version of DPC++ compiler,
+you can either use a recent [nighly build](https://github.com/intel/llvm/releases)
+or build the compiler from source.
 In either case, make sure to enable the NVIDIA plugin so you can build applications
-To build CUTLASS with SYCL support, install the latest DPC++ compiler with the CUDA backend enabled, either by building from source as described [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda) , or by downloading the [nightly releases](https://github.com/intel/llvm/releases)
-
+for NVIDIA GPUs.
 
-## Building with SYCL support
 Once you have your compiler installed, you need to point the `CMAKE_CUDA_HOST_COMPILER` flag
 to the clang++ provided by it. This enables the compilation of SYCL sources
 without altering the current NVCC path.
@@ -54,29 +44,18 @@ make -G Ninja \
 
 # Running the example
 
-## CuTe
 Currently, you can build the CuTe Tutorial using the following command:
 
 ```
-ninja [EXAMPLE_NAME]_sycl
+ninja sgemm_nt_1_sycl
 ```
 
 You can run it like this from your build directory
 
 ```
-LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/[EXAMPLE_NAME]_sycl
+LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/cute/tutorial/sgemm_nt_1_sycl
 ```
 
-## CUTLASS Example
-Currently, the example `14_amper_tf32_tensorop_gemm` has been implemented for SYCL on Nvidia Ampere architecture.
-You can build this from your build directory by running :
-```
-ninja 14_ampere_tf32_tensorop_gemm_cute
-```
-You can run it like this from your build directory
-```
-NVIDIA_TF32_OVERRIDE=1 LD_LIBRARY_PATH=/path/to/sycl/install/lib ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm_cute
-```
 
 # References
 
 [1] https://www.khronos.org/sycl/

From 274e4abfb25b4e6206c2e6425751d5e407581d21 Mon Sep 17 00:00:00 2001
From: "atharva.dubey"
Date: Fri, 24 May 2024 17:30:04 +0100
Subject: [PATCH 3/4] fix batch indexing

---
 include/cutlass/gemm/kernel/intel_pvc_gemm.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
index 90c2014a01..acd84bbe30 100644
--- a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
+++ b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
@@ -222,11 +222,11 @@ class GemmUniversal<
     const int n_coord = (BlockIdxY() * num_sg + thread_idx / SubgroupSize) * get<1>(subgroup_shape);
     const int l_coord = BlockIdxZ();
 
-    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, l_coord),
+    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, _),
                                                                   make_shape(_1{}, K, L),
                                                                   make_stride(Int{}, _1{}));
 
-    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, l_coord),
+    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, _),
                                                                   make_shape(K, Int{}, L),
                                                                   make_stride(_1{}, Int{}));
 
@@ -260,7 +260,7 @@ class GemmUniversal<
     );
     auto gmem_tiled_copy_c = make_xe_2d_copy(make_tensor(params.epilogue.ptr_D, make_shape(M, N, L), params.epilogue.dD));
 
-    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, l_coord),
+    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, _),
                                                   make_shape(Int{}, Int{}, L),
                                                   make_stride(Int{}, Int{}));
 
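For context on the `_` introduced above: it is CuTe's wildcard coordinate (`cute::_`). In CuTe slicing, an integer coordinate pins a tensor mode to a single slice, while `_` keeps that mode free in the resulting view, so passing `_` as the batch coordinate leaves the batch dimension to be covered by the `L` extent carried in the shape argument. Below is a minimal, host-only sketch of that slicing behaviour; it uses only public CuTe calls, is not the PVC copy path (`get_pvc_tensor` is internal to this fork), and the extents 4, 6, 3 are placeholders standing in for M, N and the batch count L.

```
// Host-only CuTe sketch with placeholder extents: an integer coordinate pins
// a mode to one slice, the wildcard `_` keeps the mode free.
#include <cute/tensor.hpp>

int main() {
  using namespace cute;

  float data[4 * 6 * 3] = {};  // backing storage for a (4, 6, 3) tensor
  Tensor t = make_tensor(&data[0],
                         make_layout(make_shape(Int<4>{}, Int<6>{}, Int<3>{})));

  Tensor batch2 = t(_, _, 2);  // rank-2 view: batch mode pinned to slice 2
  Tensor full   = t(_, _, _);  // rank-3 view: batch mode still free

  CUTE_STATIC_ASSERT_V(rank(batch2) == Int<2>{});  // batch mode dropped
  CUTE_STATIC_ASSERT_V(rank(full)   == Int<3>{});  // batch mode kept
  return 0;
}
```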
From 2192aba32e372af21f4fbeba5c752edd709b747d Mon Sep 17 00:00:00 2001
From: "atharva.dubey"
Date: Mon, 27 May 2024 11:30:41 +0100
Subject: [PATCH 4/4] change _ to 0

---
 include/cutlass/gemm/kernel/intel_pvc_gemm.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
index acd84bbe30..24517f916a 100644
--- a/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
+++ b/include/cutlass/gemm/kernel/intel_pvc_gemm.hpp
@@ -222,11 +222,11 @@ class GemmUniversal<
     const int n_coord = (BlockIdxY() * num_sg + thread_idx / SubgroupSize) * get<1>(subgroup_shape);
     const int l_coord = BlockIdxZ();
 
-    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, _),
+    Tensor tAi = params.mainloop.gmem_tiled_copy_a.get_pvc_tensor(make_coord(m_coord, 0, 0),
                                                                   make_shape(_1{}, K, L),
                                                                   make_stride(Int{}, _1{}));
 
-    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, _),
+    Tensor tBi = params.mainloop.gmem_tiled_copy_b.get_pvc_tensor(make_coord(0, n_coord, 0),
                                                                   make_shape(K, Int{}, L),
                                                                   make_stride(_1{}, Int{}));
 
@@ -260,7 +260,7 @@ class GemmUniversal<
     );
     auto gmem_tiled_copy_c = make_xe_2d_copy(make_tensor(params.epilogue.ptr_D, make_shape(M, N, L), params.epilogue.dD));
 
-    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, _),
+    Tensor tCi = gmem_tiled_copy_c.get_pvc_tensor(make_coord(m_coord, n_coord, 0),
                                                   make_shape(Int{}, Int{}, L),
                                                   make_stride(Int{}, Int{}));
 
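The follow-up patch settles on a literal `0` instead of the wildcard: the coordinate names a concrete starting batch, and the batch extent still travels in the shape via `L`. Below is a host-only sketch of that pattern, again with placeholder extents and plain CuTe indexing rather than `get_pvc_tensor`, whose stepping logic this diff alone does not show.

```
// Host-only CuTe sketch with placeholder extents: start at batch 0 and walk
// all L slices explicitly, instead of pinning the batch mode to one index.
#include <cute/tensor.hpp>

int main() {
  using namespace cute;

  float data[4 * 6 * 3] = {};
  Tensor t = make_tensor(&data[0],
                         make_layout(make_shape(Int<4>{}, Int<6>{}, Int<3>{})));

  int m_coord = 1, n_coord = 2;  // stand-ins for the block coordinates
  for (int l = 0; l < size<2>(t); ++l) {  // l runs over the full batch extent
    t(m_coord, n_coord, l) = 1.0f;        // touch (m_coord, n_coord) in batch l
  }
  return 0;
}
```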