From d3924feb09e7a3652d7fe1b2c1671fcaaa99f711 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Wed, 18 Sep 2024 06:36:39 -0700
Subject: [PATCH 1/8] add tests for matrix size be runtime dimension

---
 ...l_k_cache_dimensions_function_argument.cpp |   0
 ...16_fill_k_cache_runtime_dimension_impl.hpp | 465 ++++++++++++++++++
 ..._matrix_bf16_fill_k_cache_runtime_impl.hpp | 244 +++++++++
 3 files changed, 709 insertions(+)
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp
new file mode 100644
index 0000000000000..2a5f6345327be
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp
@@ -0,0 +1,465 @@
+//------------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------------===//
+
+#include <random>
+#include <sycl/usm.hpp>
+
+#ifdef SLM
+#include "slm_utils.hpp"
+#endif
+
+// number of test iterations
+constexpr unsigned int testIterations = 100;
+// start recording time after X iterations
+constexpr unsigned int recordThresh = 10;
+
+#ifndef MATRIX_SIZE
+#define MATRIX_SIZE 256
+#endif
+
+#ifdef MANUAL_UNROLL
+template <class T, T... inds, class F>
+static constexpr void loop(std::integer_sequence<T, inds...>, F &&f) {
+  (f(std::integral_constant<T, inds>{}), ...); // C++17 fold expression
+}
+
+template <class T, T count, class F>
+static constexpr void manually_unroll_loop(F &&f) {
+  loop(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+#endif
+
+template <size_t TM, size_t TN, size_t TK> class MatMul;
+
+template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
+          size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
+          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
+  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
+  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
+  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
+
+  // throw error if padding needed
+  assert(colsA == rowsB);
+  assert(rowsA % TM == 0);
+  assert(colsA % TK == 0);
+  assert(colsB % TN == 0);
+  // submit main kernel
+  std::chrono::high_resolution_clock::time_point start =
+      std::chrono::high_resolution_clock::now();
+
+  q.submit([&](handler &h) {
+#ifdef SLM
+    local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
+    local_accessor<TOperand, 2> tileB{
+        {KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
+#endif
+
+    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
+        nd_range<2>{global, cachelocal},
+        // loop global
+        // loop localrange
+        [=](nd_item<2> it)
+#ifdef SG_SZ
+            [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif // SG_SZ
+        {
+          // sg::load and sg::store expect decorations to be ON
+          auto pA =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(A);
+          auto pB =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(B);
+          auto pC =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(C);
+          auto m2 = it.get_group(0);
+          auto n2 = it.get_group(1);
+          auto m1 = it.get_local_id(0);
+          auto n1 = it.get_local_id(1) / sgSize;
+          auto sg = it.get_sub_group();
+#ifdef PREFETCH
+          size_t sgId = sg.get_group_id()[0];
+          // There are MCache2/MCache1 x NCache2/NCache1 subgroups: NumSGs
+          // PVC case: this is 8x4 subgroups
+          // BKM for PVC is to use prefetch of 8x32 for each subgroup
+          constexpr size_t prefRow = 8;
+          constexpr size_t prefCol = 32;
+      // All the SGs of one workgroup prefetch MCache2xKCache2 of A
+      // All the SGs of one workgroup prefetch KCache2xNCache2 of B
+      // PVC case: 256x32 of A and 32x256 of B
+      // For both A and B: each subgroup performs a prefetch of
+      // prefRow rows and prefCol cols at a time
+      // For A, the subgroups are distributed along the row dimension:
+      // PVC: A layed as MCache2/prefRow (256/32)
+      // For B: the subgroups are distributed along the column dimension
+      // PVC: NCache2/prefCol = 256/32 = 8 SGs on the column dimension and
+      // KCache2/prefRow = 32/8 = 4 SGs on the row dimension
+#ifdef VNNI
+          // In the VNNI case, each subgroup still gets prefRow x prefCol
+          // In the PVC case: subgroups distribution become
+          // (NCache2*2)/prefCol = 512/32 = 16 SGs on the column dimension and
+          // (KCache2/2)/prefRow = 16/8 = 2 SGs on the row dimension
+          // pm1B and pn1B are used to identify the distribution of subgroups
+          // along the workgroup prefetch for B matrix. For A matrix, sgId is
+          // enough.
+          size_t pm1B = sgId / 16;   // prefetch m1 (sgId/16)
+          size_t pn1B = sgId & 0x15; // prefetch n1 (sgId%16)
+#else                                // VNNI
+          size_t pm1B = sgId / 8;   // prefetch m1 (sgId/8)
+          size_t pn1B = sgId & 0x7; // prefetch n1 (sgId%8)
+#endif                               // VNNI
+          constexpr size_t prefDistance = 3;
+          for (int p = 0; p < prefDistance; p++)
+            joint_matrix_prefetch<prefRow, prefCol>(
+                sg, A + (m2 * MCache2 + sgId * prefRow) * colsA + p * prefCol,
+                colsA, layout::row_major,
+                syclex::properties{syclex::prefetch_hint_L1});
+
+          for (int p = 0; p < prefDistance; p++)
+            joint_matrix_prefetch<prefRow, prefCol>(
+                sg,
+                B +
+                    (p * (KCache2 / vnniFactor) + pm1B * prefRow) * colsB *
+                        vnniFactor +
+                    (n2 * NCache2 * vnniFactor + pn1B * prefCol),
+                colsB * vnniFactor, layout::row_major,
+                syclex::properties{syclex::prefetch_hint_L1});
+#endif // PREFETCH
+
+          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
+              tC[MCache1 / TM][NCache1 / TN]
+#ifdef INIT_LIST
+              = {}; // default initialization of all array elements
+#else               // INIT_LIST
+              ; // no initialization
+#endif              // INIT_LIST
+
+#ifdef MANUAL_UNROLL
+          manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
+            manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
+#else  // MANUAL_UNROLL
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+#endif // MANUAL_UNROLL
+              joint_matrix_fill(sg, tC[m][n], 0);
+#ifdef MANUAL_UNROLL
+            });
+          });
+#else  // MANUAL_UNROLL
+            }
+          }
+#endif // MANUAL_UNROLL
+
+#ifdef SLM
+          constexpr unsigned int SGs =
+              (MCache2 / MCache1) * (NCache2 / NCache1);
+#endif // SLM
+          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+#ifdef SLM
+            slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
+                           SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
+            it.barrier(access::fence_space::local_space);
+#endif // SLM
+            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
+                tA[MCache1 / TM][KCache2 / KCache1]
+#ifdef INIT_LIST
+                = {}; // default initialization of all array elements
+#else                 // INIT_LIST
+                ; // no initialization
+#endif                // INIT_LIST
+#ifdef VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::ext_intel_packed>
+                tB[NCache1 / TN][KCache2 / KCache1]
+#else  // VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::row_major>
+                tB[NCache1 / TN][KCache2 / KCache1]
+#endif // VNNI
+#ifdef INIT_LIST
+                = {}; // default initialization of all array elements
+#else                 // INIT_LIST
+                ; // no initialization
+#endif                // INIT_LIST
+
+#ifdef MANUAL_UNROLL
+            manually_unroll_loop<unsigned int, KCache2 / KCache1>([&](auto k1) {
+#else  // MANUAL_UNROLL
+            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
+#endif // MANUAL_UNROLL
+       // physical layer
+              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
+#ifdef MANUAL_UNROLL
+              manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
+#else  // MANUAL_UNROLL
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+#endif // MANUAL_UNROLL
+#ifdef SLM
+                joint_matrix_load(sg, tA[m][k1],
+                                  tileA.template get_multi_ptr<
+                                      sycl::access::decorated::no>() +
+                                      (m1 * MCache1 + m * TM) * KCache2 +
+                                      k1 * TK,
+                                  KCache2);
+#else // SLM
+#ifdef OOB
+                ext::intel::experimental::matrix::joint_matrix_load_checked(
+                    sg, tA[m][k1], pA, colsA, rowsA, colsA,
+                    m2 * MCache2 + m1 * MCache1 + m * TM, k * TK);
+#else  // OOB
+                joint_matrix_load(
+                    sg, tA[m][k1],
+                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
+                        k * TK,
+                    colsA);
+#endif // OOB
+#endif // SLM
+#ifdef MANUAL_UNROLL
+              }); // m
+#else             // MANUAL_UNROLL
+              } // m
+#endif            // MANUAL_UNROLL
+#ifdef MANUAL_UNROLL
+              manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
+#else  // MANUAL_UNROLL
+              for (unsigned int n = 0; n < NCache1 / TN; n++) {
+#endif // MANUAL_UNROLL
+#ifdef SLM
+                joint_matrix_load(sg, tB[n][k1],
+                                  tileB.template get_multi_ptr<
+                                      sycl::access::decorated::no>() +
+                                      (k1 * TK / vnniFactor) *
+                                          (NCache2 * vnniFactor) +
+                                      (n1 * NCache1 + n * TN) * vnniFactor,
+                                  NCache2 * vnniFactor);
+#else // SLM
+#ifdef OOB
+                ext::intel::experimental::matrix::joint_matrix_load_checked(
+                    sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
+                    colsB * vnniFactor, k * TK / vnniFactor,
+                    (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
+#else  // OOB
+                joint_matrix_load(
+                    sg, tB[n][k1],
+                    pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
+                        (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
+                    colsB * vnniFactor);
+#endif // OOB
+#endif // SLM
+#ifdef MANUAL_UNROLL
+              }); // n
+#else             // MANUAL_UNROLL
+              } // n
+#endif            // MANUAL_UNROLL
+#ifdef MANUAL_UNROLL
+              manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
+#else  // MANUAL_UNROLL
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+#endif // MANUAL_UNROLL
+#ifdef MANUAL_UNROLL
+                manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
+#else // MANUAL_UNROLL
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
+
+#endif // MANUAL_UNROLL
+                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
+                                   tC[m][n]);
+#ifdef MANUAL_UNROLL
+                }); // n
+              });   // m
+            });     // for k1
+#else               // MANUAL_UNROLL
+                } // n
+              } // m
+            } // k1
+#endif              // MANUAL_UNROLL
+#ifdef SLM
+            it.barrier(access::fence_space::local_space);
+#endif // SLM
+#ifdef PREFETCH
+            auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
+                                    (k2 + prefDistance) * prefCol;
+            if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) <
+                (MATRIX_SIZE * MATRIX_SIZE))
+              joint_matrix_prefetch<prefRow, prefCol>(
+                  sg, A + prefetch_offsetA, colsA, layout::row_major,
+                  syclex::properties{syclex::prefetch_hint_L1});
+
+            auto prefetch_offsetB =
+                ((k2 + prefDistance) * (KCache2 / vnniFactor) +
+                 pm1B * prefRow) *
+                    (colsB)*vnniFactor +
+                (n2 * NCache2 * vnniFactor + pn1B * prefCol);
+            if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) +
+                 prefCol) < (MATRIX_SIZE * MATRIX_SIZE))
+              joint_matrix_prefetch<prefRow, prefCol>(
+                  sg, B + prefetch_offsetB, colsB * vnniFactor,
+                  layout::row_major,
+                  syclex::properties{syclex::prefetch_hint_L1});
+#endif // PREFETCH
+          } // for k2
+#ifdef MANUAL_UNROLL
+          manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
+#else  // MANUAL_UNROLL
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+#endif // MANUAL_UNROLL
+#ifdef MANUAL_UNROLL
+            manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
+#else  // MANUAL_UNROLL
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+#endif // MANUAL_UNROLL
+#ifdef OOB
+              ext::intel::experimental::matrix::joint_matrix_store_checked(
+                  sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB,
+                  m2 * MCache2 + m1 * MCache1 + m * TM,
+                  n2 * NCache2 + n1 * NCache1 + n * TN);
+#else  // OOB
+              joint_matrix_store(
+                  sg, tC[m][n],
+                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
+                      (n2 * NCache2 + n1 * NCache1 + n * TN),
+                  colsB, layout::row_major);
+#endif // OOB
+#ifdef MANUAL_UNROLL
+            }); // n
+          });   // m
+#else           // MANUAL_UNROLL
+            } // n
+          } // m
+#endif          // MANUAL_UNROLL
+        });     // parallel_for
+  });           // queue.submit
+
+  if (i == testIterations - 1)
+    q.wait();
+  std::chrono::duration<double, std::milli> duration =
+      std::chrono::high_resolution_clock::now() - start;
+
+  return duration.count();
+}
+
+template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
+          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+void test() {
+  assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
+         "invalid matrix size");
+  assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
+         (MATRIX_SIZE % TK) == 0 &&
+         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
+
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  queue q;
+  T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+  TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+
+  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
+  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
+
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
+                                        MATRIX_SIZE);
+
+#ifdef VNNI
+  T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor);
+  free(B, q);
+  B = vnniB;
+#endif
+
+  // run testIterations time, aggregate and calculate average run time
+  double totalDuration = 0;
+  for (unsigned int i = 0; i < testIterations; i++) {
+    double duration =
+        joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
+                     vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                     KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);
+    if (i >= recordThresh) {
+      totalDuration += duration;
+    }
+  }
+
+  assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
+
+  double msecPerMatrixMul =
+      totalDuration / static_cast<double>(testIterations - recordThresh);
+  double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) /
+                  (msecPerMatrixMul / 1000.f);
+
+  std::cout << "DONE for size " << MATRIX_SIZE << std::endl;
+  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
+
+  free(A, q);
+  free(B, q);
+  free(C, q);
+  free(refC, q);
+}
+
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  constexpr size_t MCache1 = 32;
+  constexpr size_t MCache2 = 256;
+  constexpr size_t NCache2 = 256;
+  constexpr size_t KCache2 = 32;
+
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else  // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      constexpr size_t NCache1 = 32;
+      constexpr size_t KCache1 = 32;
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      constexpr size_t NCache1 = 4 * /*TN*/ 16;
+      constexpr size_t KCache1 = 16;
+      // test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
+      //      NCache1, KCache1, MCache2, NCache2, KCache2>();
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combination are not currently supported for subgroup size = 32 in
+      // // IGC
+      // test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+      //      MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      constexpr size_t NCache1 = 4 * /*TN*/ 8;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
+      // MCache1,
+      //      NCache1, KCache1, MCache2, NCache2, KCache2>();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp
new file mode 100644
index 0000000000000..fae8dbadee3c7
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp
@@ -0,0 +1,244 @@
+//------------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------------===//
+
+#include <random>
+#include <sycl/usm.hpp>
+#include <sycl/stream.hpp>
+
+// number of test iterations
+constexpr unsigned int testIterations = 100;
+// start recording time after X iterations
+constexpr unsigned int recordThresh = 10;
+
+#ifdef MANUAL_UNROLL
+template <class T, T... inds, class F>
+static constexpr void loop(std::integer_sequence<T, inds...>, F &&f) {
+  (f(std::integral_constant<T, inds>{}), ...); // C++17 fold expression
+}
+
+template <class T, T count, class F>
+static constexpr void manually_unroll_loop(F &&f) {
+  loop(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+#endif
+
+template <size_t TM, size_t TN, size_t TK> class MatMul;
+
+template <size_t VNNI,
+          typename TOperand, typename TResult, size_t TM, size_t TN, size_t TK,
+          size_t MCache1, size_t NCache1, size_t KCache1, size_t MCache2,
+          size_t NCache2, size_t KCache2>
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
+  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
+  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
+  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
+
+  // throw error if padding needed
+  assert(colsA == rowsB);
+  assert(rowsA % TM == 0);
+  assert(colsA % TK == 0);
+  assert(colsB % TN == 0);
+  // submit main kernel
+  std::chrono::high_resolution_clock::time_point start =
+      std::chrono::high_resolution_clock::now();
+
+  q.submit([&](handler &h) {
+    sycl::stream os { 5000, 5000, h};
+    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
+        nd_range<2>{global, cachelocal},
+        // loop global
+        // loop localrange
+        [=](nd_item<2> it)
+#ifdef SG_SZ
+            [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
+          auto pA =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(A);
+          auto pB =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(B);
+          auto pC =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(C);
+          auto m2 = it.get_group(0);
+          auto n2 = it.get_group(1);
+          auto m1 = it.get_local_id(0);
+          auto n1 = it.get_local_id(1) / sgSize;
+          auto sg = it.get_sub_group();
+          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
+              tC[MCache1 / TM][NCache1 / TN];
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_fill(sg, tC[m][n], 0);
+            }
+          }
+
+          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
+                tA[MCache1 / TM][KCache2 / KCache1];
+
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::ext_intel_packed>
+                tB[NCache1 / TN][KCache2 / KCache1]; 
+
+            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
+              // physical layer
+              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                joint_matrix_load(
+                    sg, tA[m][k1],
+                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
+                        k * TK,
+                    colsA);
+              } // m
+
+              for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                joint_matrix_load(sg, tB[n][k1],
+                                  pB + (k * TK / VNNI) * (colsB * VNNI) +
+                                      (n2 * NCache2 + n1 * NCache1 + n * TN) *
+                                          VNNI,
+                                  colsB * VNNI);
+              } // n
+              
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
+                                   tC[m][n]);
+                } // n
+              } // m
+            } // k1
+          } // for k2
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_store(
+                  sg, tC[m][n],
+                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
+                      (n2 * NCache2 + n1 * NCache1 + n * TN),
+                  colsB, layout::row_major);
+            } // n
+          } // m
+// #endif
+        }); // parallel_for
+  });       // queue.submit
+
+  if (i == testIterations - 1)
+    q.wait();
+  std::chrono::duration<double, std::milli> duration =
+      std::chrono::high_resolution_clock::now() - start;
+
+  return duration.count();
+}
+
+template <typename T, typename TResult, size_t VNNI, size_t TM, size_t TN,
+          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+void test(size_t MATRIX_SIZE) {
+  assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
+         "invalid matrix size");
+  assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
+         (MATRIX_SIZE % TK) == 0 &&
+         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
+
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  queue q;
+  T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+  TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+
+  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
+  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
+  matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI);
+
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
+                                        MATRIX_SIZE);
+
+  // run testIterations time, aggregate and calculate average run time
+  double totalDuration = 0;
+  for (unsigned int i = 0; i < testIterations; i++) {
+    double duration =
+        joint_matmul<VNNI,
+                     T, TResult, TM, TN, TK, MCache1, NCache1, KCache1, MCache2,
+                     NCache2, KCache2>(A, vnniB, C, q, i, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE);
+    if (i >= recordThresh) {
+      totalDuration += duration;
+    }
+  }
+
+  assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
+
+  double msecPerMatrixMul =
+      totalDuration / static_cast<double>(testIterations - recordThresh);
+  double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) /
+                  (msecPerMatrixMul / 1000.f);
+
+  std::cout << "DONE for size " << MATRIX_SIZE << std::endl;
+  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
+
+  free(A, q);
+  free(B, q);
+  free(vnniB, q);
+  free(C, q);
+  free(refC, q);
+}
+
+int main(int argc, char *argv[]) {
+  size_t MATRIX_SIZE;
+  MATRIX_SIZE = std::stoul(argv[1]);
+
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  constexpr size_t MCache1 = 32;
+  constexpr size_t MCache2 = 256;
+  constexpr size_t NCache2 = 256;
+  constexpr size_t KCache2 = 32;
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      constexpr size_t NCache1 = 32;
+      constexpr size_t KCache1 = 32;
+
+      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      constexpr size_t NCache1 = 4 * /*TN*/ 16;
+      constexpr size_t KCache1 = 16;
+
+      // test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16>();
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combination are not currently supported for subgroup size = 32 in
+      // IGC
+      // test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16>();
+      test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16>();
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      constexpr size_t NCache1 = 4 * /*TN*/ 8;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16>();
+      break;
+    }
+  }
+  return 0;
+}

From 4e6593fc7f4ffb614a262fb73a247394d94efd94 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Wed, 18 Sep 2024 13:27:13 -0700
Subject: [PATCH 2/8] update the arg_dim test to reuse the bf16_cache test

---
 ...joint_matrix_bf16_fill_k_cache_arg_dim.cpp |  22 +
 ...l_k_cache_dimensions_function_argument.cpp |   0
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   |  19 +
 ...16_fill_k_cache_runtime_dimension_impl.hpp | 465 ------------------
 ..._matrix_bf16_fill_k_cache_runtime_impl.hpp | 244 ---------
 5 files changed, 41 insertions(+), 709 deletions(-)
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
 delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp
 delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp
 delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
new file mode 100644
index 0000000000000..aa2540b3781b7
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
@@ -0,0 +1,22 @@
+//==--- joint_matrix_bf16_fill_k_cache_OOB.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+
+// https://jira.devtools.intel.com/browse/GSD-9716
+// XFAIL: arch-intel_gpu_pvc
+
+// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM -DVNNI
+// RUN: %{run} %t_arg_dim_vnni.out
+
+// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM
+// RUN: %{run} %t_arg_dim.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index b561bd073038a..4584aedbe3d01 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -36,11 +36,23 @@ static constexpr void manually_unroll_loop(F &&f) {
 
 template <size_t TM, size_t TN, size_t TK> class MatMul;
 
+#ifdef ARG_DIM
+template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
+          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+#else // ARG_DIM
 template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
           size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
           size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
+#endif // ARG_DIM
+
+#ifdef ARG_DIM
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
+#else  // ARG_DIM
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
+#endif // ARG_DIM
+
   size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
   range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
   range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
@@ -381,10 +393,17 @@ void test() {
   // run testIterations time, aggregate and calculate average run time
   double totalDuration = 0;
   for (unsigned int i = 0; i < testIterations; i++) {
+#ifdef ARG_DIM
+    double duration =
+        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                     KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i, 
+                     MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE);
+#else // ARG_DIM
     double duration =
         joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                      vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
                      KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);
+#endif // ARG_DIM
     if (i >= recordThresh) {
       totalDuration += duration;
     }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp
deleted file mode 100644
index 2a5f6345327be..0000000000000
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp
+++ /dev/null
@@ -1,465 +0,0 @@
-//------------------------------------------------------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-------------------------------------------------------------------------===//
-
-#include <random>
-#include <sycl/usm.hpp>
-
-#ifdef SLM
-#include "slm_utils.hpp"
-#endif
-
-// number of test iterations
-constexpr unsigned int testIterations = 100;
-// start recording time after X iterations
-constexpr unsigned int recordThresh = 10;
-
-#ifndef MATRIX_SIZE
-#define MATRIX_SIZE 256
-#endif
-
-#ifdef MANUAL_UNROLL
-template <class T, T... inds, class F>
-static constexpr void loop(std::integer_sequence<T, inds...>, F &&f) {
-  (f(std::integral_constant<T, inds>{}), ...); // C++17 fold expression
-}
-
-template <class T, T count, class F>
-static constexpr void manually_unroll_loop(F &&f) {
-  loop(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
-}
-#endif
-
-template <size_t TM, size_t TN, size_t TK> class MatMul;
-
-template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
-          size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
-          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
-          size_t MCache2, size_t NCache2, size_t KCache2>
-double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
-  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
-  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
-  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
-
-  // throw error if padding needed
-  assert(colsA == rowsB);
-  assert(rowsA % TM == 0);
-  assert(colsA % TK == 0);
-  assert(colsB % TN == 0);
-  // submit main kernel
-  std::chrono::high_resolution_clock::time_point start =
-      std::chrono::high_resolution_clock::now();
-
-  q.submit([&](handler &h) {
-#ifdef SLM
-    local_accessor<TOperand, 2> tileA{{MCache2, KCache2}, h};
-    local_accessor<TOperand, 2> tileB{
-        {KCache2 / vnniFactor, NCache2 * vnniFactor}, h};
-#endif
-
-    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
-        nd_range<2>{global, cachelocal},
-        // loop global
-        // loop localrange
-        [=](nd_item<2> it)
-#ifdef SG_SZ
-            [[intel::reqd_sub_group_size(SG_SZ)]]
-#endif // SG_SZ
-        {
-          // sg::load and sg::store expect decorations to be ON
-          auto pA =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::yes>(A);
-          auto pB =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::yes>(B);
-          auto pC =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::yes>(C);
-          auto m2 = it.get_group(0);
-          auto n2 = it.get_group(1);
-          auto m1 = it.get_local_id(0);
-          auto n1 = it.get_local_id(1) / sgSize;
-          auto sg = it.get_sub_group();
-#ifdef PREFETCH
-          size_t sgId = sg.get_group_id()[0];
-          // There are MCache2/MCache1 x NCache2/NCache1 subgroups: NumSGs
-          // PVC case: this is 8x4 subgroups
-          // BKM for PVC is to use prefetch of 8x32 for each subgroup
-          constexpr size_t prefRow = 8;
-          constexpr size_t prefCol = 32;
-      // All the SGs of one workgroup prefetch MCache2xKCache2 of A
-      // All the SGs of one workgroup prefetch KCache2xNCache2 of B
-      // PVC case: 256x32 of A and 32x256 of B
-      // For both A and B: each subgroup performs a prefetch of
-      // prefRow rows and prefCol cols at a time
-      // For A, the subgroups are distributed along the row dimension:
-      // PVC: A layed as MCache2/prefRow (256/32)
-      // For B: the subgroups are distributed along the column dimension
-      // PVC: NCache2/prefCol = 256/32 = 8 SGs on the column dimension and
-      // KCache2/prefRow = 32/8 = 4 SGs on the row dimension
-#ifdef VNNI
-          // In the VNNI case, each subgroup still gets prefRow x prefCol
-          // In the PVC case: subgroups distribution become
-          // (NCache2*2)/prefCol = 512/32 = 16 SGs on the column dimension and
-          // (KCache2/2)/prefRow = 16/8 = 2 SGs on the row dimension
-          // pm1B and pn1B are used to identify the distribution of subgroups
-          // along the workgroup prefetch for B matrix. For A matrix, sgId is
-          // enough.
-          size_t pm1B = sgId / 16;   // prefetch m1 (sgId/16)
-          size_t pn1B = sgId & 0x15; // prefetch n1 (sgId%16)
-#else                                // VNNI
-          size_t pm1B = sgId / 8;   // prefetch m1 (sgId/8)
-          size_t pn1B = sgId & 0x7; // prefetch n1 (sgId%8)
-#endif                               // VNNI
-          constexpr size_t prefDistance = 3;
-          for (int p = 0; p < prefDistance; p++)
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg, A + (m2 * MCache2 + sgId * prefRow) * colsA + p * prefCol,
-                colsA, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-
-          for (int p = 0; p < prefDistance; p++)
-            joint_matrix_prefetch<prefRow, prefCol>(
-                sg,
-                B +
-                    (p * (KCache2 / vnniFactor) + pm1B * prefRow) * colsB *
-                        vnniFactor +
-                    (n2 * NCache2 * vnniFactor + pn1B * prefCol),
-                colsB * vnniFactor, layout::row_major,
-                syclex::properties{syclex::prefetch_hint_L1});
-#endif // PREFETCH
-
-          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
-              tC[MCache1 / TM][NCache1 / TN]
-#ifdef INIT_LIST
-              = {}; // default initialization of all array elements
-#else               // INIT_LIST
-              ; // no initialization
-#endif              // INIT_LIST
-
-#ifdef MANUAL_UNROLL
-          manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
-            manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
-#else  // MANUAL_UNROLL
-          for (unsigned int m = 0; m < MCache1 / TM; m++) {
-            for (unsigned int n = 0; n < NCache1 / TN; n++) {
-#endif // MANUAL_UNROLL
-              joint_matrix_fill(sg, tC[m][n], 0);
-#ifdef MANUAL_UNROLL
-            });
-          });
-#else  // MANUAL_UNROLL
-            }
-          }
-#endif // MANUAL_UNROLL
-
-#ifdef SLM
-          constexpr unsigned int SGs =
-              (MCache2 / MCache1) * (NCache2 / NCache1);
-#endif // SLM
-          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
-#ifdef SLM
-            slm_read_write<colsA, colsB, MCache2, NCache2, KCache2, vnniFactor,
-                           SGs>(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize);
-            it.barrier(access::fence_space::local_space);
-#endif // SLM
-            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
-                tA[MCache1 / TM][KCache2 / KCache1]
-#ifdef INIT_LIST
-                = {}; // default initialization of all array elements
-#else                 // INIT_LIST
-                ; // no initialization
-#endif                // INIT_LIST
-#ifdef VNNI
-            joint_matrix<sub_group, TOperand, use::b, TK, TN,
-                         layout::ext_intel_packed>
-                tB[NCache1 / TN][KCache2 / KCache1]
-#else  // VNNI
-            joint_matrix<sub_group, TOperand, use::b, TK, TN,
-                         layout::row_major>
-                tB[NCache1 / TN][KCache2 / KCache1]
-#endif // VNNI
-#ifdef INIT_LIST
-                = {}; // default initialization of all array elements
-#else                 // INIT_LIST
-                ; // no initialization
-#endif                // INIT_LIST
-
-#ifdef MANUAL_UNROLL
-            manually_unroll_loop<unsigned int, KCache2 / KCache1>([&](auto k1) {
-#else  // MANUAL_UNROLL
-            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
-#endif // MANUAL_UNROLL
-       // physical layer
-              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
-#ifdef MANUAL_UNROLL
-              manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
-#else  // MANUAL_UNROLL
-              for (unsigned int m = 0; m < MCache1 / TM; m++) {
-#endif // MANUAL_UNROLL
-#ifdef SLM
-                joint_matrix_load(sg, tA[m][k1],
-                                  tileA.template get_multi_ptr<
-                                      sycl::access::decorated::no>() +
-                                      (m1 * MCache1 + m * TM) * KCache2 +
-                                      k1 * TK,
-                                  KCache2);
-#else // SLM
-#ifdef OOB
-                ext::intel::experimental::matrix::joint_matrix_load_checked(
-                    sg, tA[m][k1], pA, colsA, rowsA, colsA,
-                    m2 * MCache2 + m1 * MCache1 + m * TM, k * TK);
-#else  // OOB
-                joint_matrix_load(
-                    sg, tA[m][k1],
-                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
-                        k * TK,
-                    colsA);
-#endif // OOB
-#endif // SLM
-#ifdef MANUAL_UNROLL
-              }); // m
-#else             // MANUAL_UNROLL
-              } // m
-#endif            // MANUAL_UNROLL
-#ifdef MANUAL_UNROLL
-              manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
-#else  // MANUAL_UNROLL
-              for (unsigned int n = 0; n < NCache1 / TN; n++) {
-#endif // MANUAL_UNROLL
-#ifdef SLM
-                joint_matrix_load(sg, tB[n][k1],
-                                  tileB.template get_multi_ptr<
-                                      sycl::access::decorated::no>() +
-                                      (k1 * TK / vnniFactor) *
-                                          (NCache2 * vnniFactor) +
-                                      (n1 * NCache1 + n * TN) * vnniFactor,
-                                  NCache2 * vnniFactor);
-#else // SLM
-#ifdef OOB
-                ext::intel::experimental::matrix::joint_matrix_load_checked(
-                    sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
-                    colsB * vnniFactor, k * TK / vnniFactor,
-                    (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor);
-#else  // OOB
-                joint_matrix_load(
-                    sg, tB[n][k1],
-                    pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
-                        (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
-                    colsB * vnniFactor);
-#endif // OOB
-#endif // SLM
-#ifdef MANUAL_UNROLL
-              }); // n
-#else             // MANUAL_UNROLL
-              } // n
-#endif            // MANUAL_UNROLL
-#ifdef MANUAL_UNROLL
-              manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
-#else  // MANUAL_UNROLL
-              for (unsigned int m = 0; m < MCache1 / TM; m++) {
-#endif // MANUAL_UNROLL
-#ifdef MANUAL_UNROLL
-                manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
-#else // MANUAL_UNROLL
-                for (unsigned int n = 0; n < NCache1 / TN; n++) {
-
-#endif // MANUAL_UNROLL
-                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
-                                   tC[m][n]);
-#ifdef MANUAL_UNROLL
-                }); // n
-              });   // m
-            });     // for k1
-#else               // MANUAL_UNROLL
-                } // n
-              } // m
-            } // k1
-#endif              // MANUAL_UNROLL
-#ifdef SLM
-            it.barrier(access::fence_space::local_space);
-#endif // SLM
-#ifdef PREFETCH
-            auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
-                                    (k2 + prefDistance) * prefCol;
-            if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) <
-                (MATRIX_SIZE * MATRIX_SIZE))
-              joint_matrix_prefetch<prefRow, prefCol>(
-                  sg, A + prefetch_offsetA, colsA, layout::row_major,
-                  syclex::properties{syclex::prefetch_hint_L1});
-
-            auto prefetch_offsetB =
-                ((k2 + prefDistance) * (KCache2 / vnniFactor) +
-                 pm1B * prefRow) *
-                    (colsB)*vnniFactor +
-                (n2 * NCache2 * vnniFactor + pn1B * prefCol);
-            if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) +
-                 prefCol) < (MATRIX_SIZE * MATRIX_SIZE))
-              joint_matrix_prefetch<prefRow, prefCol>(
-                  sg, B + prefetch_offsetB, colsB * vnniFactor,
-                  layout::row_major,
-                  syclex::properties{syclex::prefetch_hint_L1});
-#endif // PREFETCH
-          } // for k2
-#ifdef MANUAL_UNROLL
-          manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
-#else  // MANUAL_UNROLL
-          for (unsigned int m = 0; m < MCache1 / TM; m++) {
-#endif // MANUAL_UNROLL
-#ifdef MANUAL_UNROLL
-            manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
-#else  // MANUAL_UNROLL
-            for (unsigned int n = 0; n < NCache1 / TN; n++) {
-#endif // MANUAL_UNROLL
-#ifdef OOB
-              ext::intel::experimental::matrix::joint_matrix_store_checked(
-                  sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB,
-                  m2 * MCache2 + m1 * MCache1 + m * TM,
-                  n2 * NCache2 + n1 * NCache1 + n * TN);
-#else  // OOB
-              joint_matrix_store(
-                  sg, tC[m][n],
-                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
-                      (n2 * NCache2 + n1 * NCache1 + n * TN),
-                  colsB, layout::row_major);
-#endif // OOB
-#ifdef MANUAL_UNROLL
-            }); // n
-          });   // m
-#else           // MANUAL_UNROLL
-            } // n
-          } // m
-#endif          // MANUAL_UNROLL
-        });     // parallel_for
-  });           // queue.submit
-
-  if (i == testIterations - 1)
-    q.wait();
-  std::chrono::duration<double, std::milli> duration =
-      std::chrono::high_resolution_clock::now() - start;
-
-  return duration.count();
-}
-
-template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
-          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
-          size_t MCache2, size_t NCache2, size_t KCache2>
-void test() {
-  assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
-         "invalid matrix size");
-  assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
-         (MATRIX_SIZE % TK) == 0 &&
-         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
-
-  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
-            << " [TM x TN x TK]" << std::endl;
-
-  queue q;
-  T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
-  TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
-
-  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
-  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
-
-  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
-                                        MATRIX_SIZE);
-
-#ifdef VNNI
-  T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor);
-  free(B, q);
-  B = vnniB;
-#endif
-
-  // run testIterations time, aggregate and calculate average run time
-  double totalDuration = 0;
-  for (unsigned int i = 0; i < testIterations; i++) {
-    double duration =
-        joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
-                     vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
-                     KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);
-    if (i >= recordThresh) {
-      totalDuration += duration;
-    }
-  }
-
-  assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
-
-  double msecPerMatrixMul =
-      totalDuration / static_cast<double>(testIterations - recordThresh);
-  double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) /
-                  (msecPerMatrixMul / 1000.f);
-
-  std::cout << "DONE for size " << MATRIX_SIZE << std::endl;
-  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
-
-  free(A, q);
-  free(B, q);
-  free(C, q);
-  free(refC, q);
-}
-
-int main() {
-  queue q;
-  std::vector<combination> combinations =
-      q.get_device()
-          .get_info<sycl::ext::oneapi::experimental::info::device::
-                        matrix_combinations>();
-
-  constexpr size_t MCache1 = 32;
-  constexpr size_t MCache2 = 256;
-  constexpr size_t NCache2 = 256;
-  constexpr size_t KCache2 = 32;
-
-#ifdef VNNI
-  constexpr unsigned int VnniFactor = 2;
-#else  // VNNI
-  constexpr unsigned int VnniFactor = 1;
-#endif // VNNI
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      constexpr size_t NCache1 = 32;
-      constexpr size_t KCache1 = 32;
-      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      constexpr size_t NCache1 = 4 * /*TN*/ 16;
-      constexpr size_t KCache1 = 16;
-      // test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
-      //      NCache1, KCache1, MCache2, NCache2, KCache2>();
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      // These combination are not currently supported for subgroup size = 32 in
-      // // IGC
-      // test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
-      //      MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
-      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      constexpr size_t NCache1 = 4 * /*TN*/ 8;
-      constexpr size_t KCache1 = 16;
-
-      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>();
-      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
-      // MCache1,
-      //      NCache1, KCache1, MCache2, NCache2, KCache2>();
-      break;
-    }
-  }
-  return 0;
-}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp
deleted file mode 100644
index fae8dbadee3c7..0000000000000
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp
+++ /dev/null
@@ -1,244 +0,0 @@
-//------------------------------------------------------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-------------------------------------------------------------------------===//
-
-#include <random>
-#include <sycl/usm.hpp>
-#include <sycl/stream.hpp>
-
-// number of test iterations
-constexpr unsigned int testIterations = 100;
-// start recording time after X iterations
-constexpr unsigned int recordThresh = 10;
-
-#ifdef MANUAL_UNROLL
-template <class T, T... inds, class F>
-static constexpr void loop(std::integer_sequence<T, inds...>, F &&f) {
-  (f(std::integral_constant<T, inds>{}), ...); // C++17 fold expression
-}
-
-template <class T, T count, class F>
-static constexpr void manually_unroll_loop(F &&f) {
-  loop(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
-}
-#endif
-
-template <size_t TM, size_t TN, size_t TK> class MatMul;
-
-template <size_t VNNI,
-          typename TOperand, typename TResult, size_t TM, size_t TN, size_t TK,
-          size_t MCache1, size_t NCache1, size_t KCache1, size_t MCache2,
-          size_t NCache2, size_t KCache2>
-double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
-  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
-  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
-  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
-
-  // throw error if padding needed
-  assert(colsA == rowsB);
-  assert(rowsA % TM == 0);
-  assert(colsA % TK == 0);
-  assert(colsB % TN == 0);
-  // submit main kernel
-  std::chrono::high_resolution_clock::time_point start =
-      std::chrono::high_resolution_clock::now();
-
-  q.submit([&](handler &h) {
-    sycl::stream os { 5000, 5000, h};
-    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
-        nd_range<2>{global, cachelocal},
-        // loop global
-        // loop localrange
-        [=](nd_item<2> it)
-#ifdef SG_SZ
-            [[intel::reqd_sub_group_size(SG_SZ)]]
-#endif
-        {
-          auto pA =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(A);
-          auto pB =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(B);
-          auto pC =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(C);
-          auto m2 = it.get_group(0);
-          auto n2 = it.get_group(1);
-          auto m1 = it.get_local_id(0);
-          auto n1 = it.get_local_id(1) / sgSize;
-          auto sg = it.get_sub_group();
-          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
-              tC[MCache1 / TM][NCache1 / TN];
-
-          for (unsigned int m = 0; m < MCache1 / TM; m++) {
-            for (unsigned int n = 0; n < NCache1 / TN; n++) {
-              joint_matrix_fill(sg, tC[m][n], 0);
-            }
-          }
-
-          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
-            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
-                tA[MCache1 / TM][KCache2 / KCache1];
-
-            joint_matrix<sub_group, TOperand, use::b, TK, TN,
-                         layout::ext_intel_packed>
-                tB[NCache1 / TN][KCache2 / KCache1]; 
-
-            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
-              // physical layer
-              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
-              for (unsigned int m = 0; m < MCache1 / TM; m++) {
-                joint_matrix_load(
-                    sg, tA[m][k1],
-                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
-                        k * TK,
-                    colsA);
-              } // m
-
-              for (unsigned int n = 0; n < NCache1 / TN; n++) {
-                joint_matrix_load(sg, tB[n][k1],
-                                  pB + (k * TK / VNNI) * (colsB * VNNI) +
-                                      (n2 * NCache2 + n1 * NCache1 + n * TN) *
-                                          VNNI,
-                                  colsB * VNNI);
-              } // n
-              
-              for (unsigned int m = 0; m < MCache1 / TM; m++) {
-                for (unsigned int n = 0; n < NCache1 / TN; n++) {
-                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
-                                   tC[m][n]);
-                } // n
-              } // m
-            } // k1
-          } // for k2
-
-          for (unsigned int m = 0; m < MCache1 / TM; m++) {
-            for (unsigned int n = 0; n < NCache1 / TN; n++) {
-              joint_matrix_store(
-                  sg, tC[m][n],
-                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
-                      (n2 * NCache2 + n1 * NCache1 + n * TN),
-                  colsB, layout::row_major);
-            } // n
-          } // m
-// #endif
-        }); // parallel_for
-  });       // queue.submit
-
-  if (i == testIterations - 1)
-    q.wait();
-  std::chrono::duration<double, std::milli> duration =
-      std::chrono::high_resolution_clock::now() - start;
-
-  return duration.count();
-}
-
-template <typename T, typename TResult, size_t VNNI, size_t TM, size_t TN,
-          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
-          size_t MCache2, size_t NCache2, size_t KCache2>
-void test(size_t MATRIX_SIZE) {
-  assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
-         "invalid matrix size");
-  assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
-         (MATRIX_SIZE % TK) == 0 &&
-         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
-
-  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
-            << " [TM x TN x TK]" << std::endl;
-
-  queue q;
-  T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
-  TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
-
-  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
-  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
-  matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI);
-
-  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
-                                        MATRIX_SIZE);
-
-  // run testIterations time, aggregate and calculate average run time
-  double totalDuration = 0;
-  for (unsigned int i = 0; i < testIterations; i++) {
-    double duration =
-        joint_matmul<VNNI,
-                     T, TResult, TM, TN, TK, MCache1, NCache1, KCache1, MCache2,
-                     NCache2, KCache2>(A, vnniB, C, q, i, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE);
-    if (i >= recordThresh) {
-      totalDuration += duration;
-    }
-  }
-
-  assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
-
-  double msecPerMatrixMul =
-      totalDuration / static_cast<double>(testIterations - recordThresh);
-  double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) /
-                  (msecPerMatrixMul / 1000.f);
-
-  std::cout << "DONE for size " << MATRIX_SIZE << std::endl;
-  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
-
-  free(A, q);
-  free(B, q);
-  free(vnniB, q);
-  free(C, q);
-  free(refC, q);
-}
-
-int main(int argc, char *argv[]) {
-  size_t MATRIX_SIZE;
-  MATRIX_SIZE = std::stoul(argv[1]);
-
-  queue q;
-  std::vector<combination> combinations =
-      q.get_device()
-          .get_info<sycl::ext::oneapi::experimental::info::device::
-                        matrix_combinations>();
-
-  constexpr size_t MCache1 = 32;
-  constexpr size_t MCache2 = 256;
-  constexpr size_t NCache2 = 256;
-  constexpr size_t KCache2 = 32;
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      constexpr size_t NCache1 = 32;
-      constexpr size_t KCache1 = 32;
-
-      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32>();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      constexpr size_t NCache1 = 4 * /*TN*/ 16;
-      constexpr size_t KCache1 = 16;
-
-      // test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16>();
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      // These combination are not currently supported for subgroup size = 32 in
-      // IGC
-      // test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16>();
-      test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16>();
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      constexpr size_t NCache1 = 4 * /*TN*/ 8;
-      constexpr size_t KCache1 = 16;
-
-      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16>();
-      break;
-    }
-  }
-  return 0;
-}

From b56a34ce1142ddb858db37a802558d91c4c650de Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Wed, 18 Sep 2024 14:19:20 -0700
Subject: [PATCH 3/8] update the test to address the commets

---
 ...joint_matrix_bf16_fill_k_cache_arg_dim.cpp |  8 ++--
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   | 40 +++++++++----------
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
index aa2540b3781b7..5caf08a5f6bfc 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
@@ -1,4 +1,4 @@
-//==--- joint_matrix_bf16_fill_k_cache_OOB.cpp  - DPC++ joint_matrix--------==//
+//==--- joint_matrix_bf16_fill_k_cache_arg_dim.cpp  - DPC++ joint_matrix--------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,11 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-ext_intel_matrix
-
-// https://jira.devtools.intel.com/browse/GSD-9716
 // XFAIL: arch-intel_gpu_pvc
 
-// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM -DVNNI
+// RUN: %{build} -o %t_arg_dim_vnni.out -ffp-model=precise -DARG_DIM -DVNNI
 // RUN: %{run} %t_arg_dim_vnni.out
 
 // RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM
@@ -19,4 +17,4 @@
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
-#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
\ No newline at end of file
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 4584aedbe3d01..47cfab5506187 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -36,22 +36,19 @@ static constexpr void manually_unroll_loop(F &&f) {
 
 template <size_t TM, size_t TN, size_t TK> class MatMul;
 
-#ifdef ARG_DIM
-template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
-          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
-          size_t MCache2, size_t NCache2, size_t KCache2>
-#else // ARG_DIM
-template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
+template <
+#ifndef ARG_DIM
+          size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
+#endif // ARG_DIM
           size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
           size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
-#endif // ARG_DIM
 
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
 #ifdef ARG_DIM
-double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
-#else  // ARG_DIM
-double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
-#endif // ARG_DIM
+                   , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
+#endif // ARG_DIM                   
+                  ) {
 
   size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
   range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
@@ -393,17 +390,20 @@ void test() {
   // run testIterations time, aggregate and calculate average run time
   double totalDuration = 0;
   for (unsigned int i = 0; i < testIterations; i++) {
+
+  double duration =
+        joint_matmul<        
+#ifndef ARG_DIM
+                    MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
+#endif // ARG_DIM
+                    vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                    KCache1, MCache2, NCache2, KCache2>
+                    (A, B, C, q, i
 #ifdef ARG_DIM
-    double duration =
-        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
-                     KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i, 
-                     MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE);
-#else // ARG_DIM
-    double duration =
-        joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
-                     vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
-                     KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);
+                    , MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE
 #endif // ARG_DIM
+                    );
+
     if (i >= recordThresh) {
       totalDuration += duration;
     }

From a184cbc9cc0dcfe82c7ed1e2ad0e35ccbdd97a87 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Thu, 19 Sep 2024 15:07:08 -0700
Subject: [PATCH 4/8] add test for runtime matrix dimension

---
 ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp |  22 ++
 ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 249 ++++++++++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
 create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
new file mode 100644
index 0000000000000..30ba7db3d227e
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -0,0 +1,22 @@
+//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+
+// https://jira.devtools.intel.com/browse/GSD-9716
+// XFAIL: arch-intel_gpu_pvc
+
+// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
+// RUN: %{run} %t_runtime_dim_vnni.out
+
+// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
+// RUN: %{run} %t_runtime_dim.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
new file mode 100644
index 0000000000000..cbfb4de8f18aa
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -0,0 +1,249 @@
+//------------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------------===//
+
+#include <random>
+#include <sycl/usm.hpp>
+
+// number of test iterations
+constexpr unsigned int testIterations = 100;
+// start recording time after X iterations
+constexpr unsigned int recordThresh = 10;
+
+template <size_t TM, size_t TN, size_t TK> class MatMul;
+
+template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
+          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, 
+                    size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
+
+  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
+  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
+  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
+
+  // throw error if padding needed
+  assert(colsA == rowsB);
+  assert(rowsA % TM == 0);
+  assert(colsA % TK == 0);
+  assert(colsB % TN == 0);
+  // submit main kernel
+  std::chrono::high_resolution_clock::time_point start =
+      std::chrono::high_resolution_clock::now();
+
+  q.submit([&](handler &h) {
+    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
+        nd_range<2>{global, cachelocal},
+        // loop global
+        // loop localrange
+        [=](nd_item<2> it)
+        {
+          // sg::load and sg::store expect decorations to be ON
+          auto pA =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(A);
+          auto pB =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(B);
+          auto pC =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::yes>(C);
+          auto m2 = it.get_group(0);
+          auto n2 = it.get_group(1);
+          auto m1 = it.get_local_id(0);
+          auto n1 = it.get_local_id(1) / sgSize;
+          auto sg = it.get_sub_group();
+
+          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
+              tC[MCache1 / TM][NCache1 / TN];
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_fill(sg, tC[m][n], 0);
+            }
+          }
+
+          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
+                tA[MCache1 / TM][KCache2 / KCache1];
+#ifdef VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::ext_intel_packed>
+                tB[NCache1 / TN][KCache2 / KCache1];
+#else  // VNNI
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
+                         layout::row_major>
+                tB[NCache1 / TN][KCache2 / KCache1];
+#endif // VNNI
+
+            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
+              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
+                for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                    joint_matrix_load(
+                        sg, tA[m][k1],
+                        pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
+                            k * TK,
+                        colsA);
+                }
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                    joint_matrix_load(
+                        sg, tB[n][k1],
+                        pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
+                            (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
+                        colsB * vnniFactor);
+                } // n
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
+                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
+                                   tC[m][n]);
+                } // n
+              } // m
+            } // k1
+          } // for k2
+
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
+              joint_matrix_store(
+                  sg, tC[m][n],
+                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
+                      (n2 * NCache2 + n1 * NCache1 + n * TN),
+                  colsB, layout::row_major);
+            } // n
+          } // m
+        });     // parallel_for
+  });           // queue.submit
+
+  if (i == testIterations - 1)
+    q.wait();
+  std::chrono::duration<double, std::milli> duration =
+      std::chrono::high_resolution_clock::now() - start;
+
+  return duration.count();
+}
+
+template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
+          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+void test(size_t matrix_size) {
+  assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
+         "invalid matrix size");
+  assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
+         (matrix_size % TK) == 0 &&
+         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
+
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  queue q;
+  T *A = malloc_shared<T>(matrix_size * matrix_size, q);
+  T *B = malloc_shared<T>(matrix_size * matrix_size, q);
+  TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
+  TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);
+
+  matrix_rand<T>(matrix_size, matrix_size, A, T(1));
+  matrix_rand<T>(matrix_size, matrix_size, B, T(1));
+
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
+                                        matrix_size);
+
+#ifdef VNNI
+  T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
+  matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
+  free(B, q);
+  B = vnniB;
+#endif
+
+  // run testIterations time, aggregate and calculate average run time
+  double totalDuration = 0;
+  for (unsigned int i = 0; i < testIterations; i++) {
+
+  double duration =
+        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                    KCache1, MCache2, NCache2, KCache2>
+                    (A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);
+
+    if (i >= recordThresh) {
+      totalDuration += duration;
+    }
+  }
+
+  assert(matrix_compare(matrix_size, matrix_size, C, refC));
+
+  double msecPerMatrixMul =
+      totalDuration / static_cast<double>(testIterations - recordThresh);
+  double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
+                  (msecPerMatrixMul / 1000.f);
+
+  std::cout << "DONE for size " << matrix_size << std::endl;
+  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
+
+  free(A, q);
+  free(B, q);
+  free(C, q);
+  free(refC, q);
+}
+
+int main(int argc, char *argv[]) {
+  size_t matrix_size;
+  matrix_size = std::stoul(argv[1]);
+
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  constexpr size_t MCache1 = 32;
+  constexpr size_t MCache2 = 256;
+  constexpr size_t NCache2 = 256;
+  constexpr size_t KCache2 = 32;
+
+#ifdef VNNI
+  constexpr unsigned int VnniFactor = 2;
+#else  // VNNI
+  constexpr unsigned int VnniFactor = 1;
+#endif // VNNI
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      constexpr size_t NCache1 = 32;
+      constexpr size_t KCache1 = 32;
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      constexpr size_t NCache1 = 4 * /*TN*/ 16;
+      constexpr size_t KCache1 = 16;
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combination are not currently supported for subgroup size = 32 in
+      // IGC
+      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      constexpr size_t NCache1 = 4 * /*TN*/ 8;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
+      // MCache1,
+      //      NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      break;
+    }
+  }
+  return 0;
+}

From 11aa43c26883b62205c89e01ae9f9f9eb7f677d4 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Fri, 20 Sep 2024 08:11:10 -0700
Subject: [PATCH 5/8] reuse joint_matmul from
 joint_matrix_bf16_fill_k_cache_impl.hpp in
 joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp

---
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   |  10 +-
 ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp |   6 +-
 ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 127 ++----------------
 3 files changed, 21 insertions(+), 122 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 47cfab5506187..b4a902a5b2281 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -14,9 +14,9 @@
 #endif
 
 // number of test iterations
-constexpr unsigned int testIterations = 100;
+extern constexpr unsigned int testIterations = 100;
 // start recording time after X iterations
-constexpr unsigned int recordThresh = 10;
+extern constexpr unsigned int recordThresh = 10;
 
 #ifndef MATRIX_SIZE
 #define MATRIX_SIZE 256
@@ -46,9 +46,9 @@ template <
 
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
 #ifdef ARG_DIM
-                   , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
+       , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
 #endif // ARG_DIM                   
-                  ) {
+       ) {
 
   size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
   range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
@@ -355,6 +355,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
   return duration.count();
 }
 
+#ifndef EXCLUDE_MAIN_TEST
 template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
           size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
@@ -482,3 +483,4 @@ int main() {
   }
   return 0;
 }
+#endif //EXCLUDE_MAIN_TEST
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
index 30ba7db3d227e..4c9766bc1751f 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -18,5 +18,9 @@
 
 // -ffp-model=precise is added to not depend on compiler defaults.
 
+#define EXCLUDE_MAIN_TEST 1
+#define ARG_DIM 1
+
 #include "common.hpp"
-#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
\ No newline at end of file
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
+#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
index cbfb4de8f18aa..495185f46fbe4 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -8,123 +8,8 @@
 
 #include <random>
 #include <sycl/usm.hpp>
-
-// number of test iterations
-constexpr unsigned int testIterations = 100;
-// start recording time after X iterations
-constexpr unsigned int recordThresh = 10;
-
 template <size_t TM, size_t TN, size_t TK> class MatMul;
 
-template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
-          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
-          size_t MCache2, size_t NCache2, size_t KCache2>
-double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, 
-                    size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
-
-  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
-  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
-  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
-
-  // throw error if padding needed
-  assert(colsA == rowsB);
-  assert(rowsA % TM == 0);
-  assert(colsA % TK == 0);
-  assert(colsB % TN == 0);
-  // submit main kernel
-  std::chrono::high_resolution_clock::time_point start =
-      std::chrono::high_resolution_clock::now();
-
-  q.submit([&](handler &h) {
-    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
-        nd_range<2>{global, cachelocal},
-        // loop global
-        // loop localrange
-        [=](nd_item<2> it)
-        {
-          // sg::load and sg::store expect decorations to be ON
-          auto pA =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::yes>(A);
-          auto pB =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::yes>(B);
-          auto pC =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::yes>(C);
-          auto m2 = it.get_group(0);
-          auto n2 = it.get_group(1);
-          auto m1 = it.get_local_id(0);
-          auto n1 = it.get_local_id(1) / sgSize;
-          auto sg = it.get_sub_group();
-
-          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
-              tC[MCache1 / TM][NCache1 / TN];
-
-          for (unsigned int m = 0; m < MCache1 / TM; m++) {
-            for (unsigned int n = 0; n < NCache1 / TN; n++) {
-              joint_matrix_fill(sg, tC[m][n], 0);
-            }
-          }
-
-          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
-            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
-                tA[MCache1 / TM][KCache2 / KCache1];
-#ifdef VNNI
-            joint_matrix<sub_group, TOperand, use::b, TK, TN,
-                         layout::ext_intel_packed>
-                tB[NCache1 / TN][KCache2 / KCache1];
-#else  // VNNI
-            joint_matrix<sub_group, TOperand, use::b, TK, TN,
-                         layout::row_major>
-                tB[NCache1 / TN][KCache2 / KCache1];
-#endif // VNNI
-
-            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
-              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
-                for (unsigned int m = 0; m < MCache1 / TM; m++) {
-                    joint_matrix_load(
-                        sg, tA[m][k1],
-                        pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
-                            k * TK,
-                        colsA);
-                }
-                for (unsigned int n = 0; n < NCache1 / TN; n++) {
-                    joint_matrix_load(
-                        sg, tB[n][k1],
-                        pB + (k * TK / vnniFactor) * (colsB * vnniFactor) +
-                            (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor,
-                        colsB * vnniFactor);
-                } // n
-              for (unsigned int m = 0; m < MCache1 / TM; m++) {
-                for (unsigned int n = 0; n < NCache1 / TN; n++) {
-                  joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
-                                   tC[m][n]);
-                } // n
-              } // m
-            } // k1
-          } // for k2
-
-          for (unsigned int m = 0; m < MCache1 / TM; m++) {
-            for (unsigned int n = 0; n < NCache1 / TN; n++) {
-              joint_matrix_store(
-                  sg, tC[m][n],
-                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
-                      (n2 * NCache2 + n1 * NCache1 + n * TN),
-                  colsB, layout::row_major);
-            } // n
-          } // m
-        });     // parallel_for
-  });           // queue.submit
-
-  if (i == testIterations - 1)
-    q.wait();
-  std::chrono::duration<double, std::milli> duration =
-      std::chrono::high_resolution_clock::now() - start;
-
-  return duration.count();
-}
-
 template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
           size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
@@ -164,7 +49,8 @@ void test(size_t matrix_size) {
   double duration =
         joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
                     KCache1, MCache2, NCache2, KCache2>
-                    (A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size);
+                    (A, B, C, q, i, 
+                    matrix_size, matrix_size, matrix_size, matrix_size);
 
     if (i >= recordThresh) {
       totalDuration += duration;
@@ -189,7 +75,14 @@ void test(size_t matrix_size) {
 
 int main(int argc, char *argv[]) {
   size_t matrix_size;
-  matrix_size = std::stoul(argv[1]);
+
+  // Check for command line argument
+  if (argc == 2) {
+    matrix_size = std::stoul(argv[1]);
+  } else {
+    std::cerr << "Usage: ./program matrix_size\n";
+    return 1; // Error if no argument
+  }
 
   queue q;
   std::vector<combination> combinations =

From beb49164183305765ec811f5d4c3f3715085b029 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Fri, 20 Sep 2024 08:16:37 -0700
Subject: [PATCH 6/8] remove extern for testIterations and recordThresh

---
 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index b4a902a5b2281..cfbf93dcddbed 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -14,9 +14,9 @@
 #endif
 
 // number of test iterations
-extern constexpr unsigned int testIterations = 100;
+constexpr unsigned int testIterations = 100;
 // start recording time after X iterations
-extern constexpr unsigned int recordThresh = 10;
+constexpr unsigned int recordThresh = 10;
 
 #ifndef MATRIX_SIZE
 #define MATRIX_SIZE 256

From 694db15dc371169a57644f864741a635d463bc8c Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Fri, 20 Sep 2024 09:41:32 -0700
Subject: [PATCH 7/8] remove the link to jira

---
 .../Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp       | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
index 4c9766bc1751f..b55894fe2e858 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-ext_intel_matrix
-
-// https://jira.devtools.intel.com/browse/GSD-9716
 // XFAIL: arch-intel_gpu_pvc
 
 // RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI

From 4b67488cc3f6f710d4af11793d2d27f75e619325 Mon Sep 17 00:00:00 2001
From: "Zhang, Yixing" <yixing.zhang@intel.com>
Date: Fri, 20 Sep 2024 14:04:59 -0700
Subject: [PATCH 8/8] remove
 joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp

---
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   |  96 ++++++------
 ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp |   8 +-
 ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 142 ------------------
 3 files changed, 56 insertions(+), 190 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index cfbf93dcddbed..db8ddafba61a1 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -37,17 +37,16 @@ static constexpr void manually_unroll_loop(F &&f) {
 template <size_t TM, size_t TN, size_t TK> class MatMul;
 
 template <
-#ifndef ARG_DIM
+#if !defined(ARG_DIM) && !defined(RUNTIME_DIM)
           size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
-#endif // ARG_DIM
+#endif // ARG_DIM, RUNTIME_DIM
           size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
           size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
-
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
-#ifdef ARG_DIM
+#if defined(ARG_DIM) || defined(RUNTIME_DIM)
        , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
-#endif // ARG_DIM                   
+#endif // ARG_DIM, RUNTIME_DIM
        ) {
 
   size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
@@ -296,8 +295,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
 #ifdef PREFETCH
             auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA +
                                     (k2 + prefDistance) * prefCol;
-            if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) <
-                (MATRIX_SIZE * MATRIX_SIZE))
+            if ((prefetch_offsetA + (prefRow * colsA) + prefCol) <
+                (rowsA * colsA))
               joint_matrix_prefetch<prefRow, prefCol>(
                   sg, A + prefetch_offsetA, colsA, layout::row_major,
                   syclex::properties{syclex::prefetch_hint_L1});
@@ -307,8 +306,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
                  pm1B * prefRow) *
                     (colsB)*vnniFactor +
                 (n2 * NCache2 * vnniFactor + pn1B * prefCol);
-            if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) +
-                 prefCol) < (MATRIX_SIZE * MATRIX_SIZE))
+            if ((prefetch_offsetB + (prefRow * colsA * vnniFactor) +
+                 prefCol) < (rowsA * colsA))
               joint_matrix_prefetch<prefRow, prefCol>(
                   sg, B + prefetch_offsetB, colsB * vnniFactor,
                   layout::row_major,
@@ -355,35 +354,34 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
   return duration.count();
 }
 
-#ifndef EXCLUDE_MAIN_TEST
 template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
           size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
-void test() {
-  assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
+void test(size_t matrix_size) {
+  assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
          "invalid matrix size");
-  assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
-         (MATRIX_SIZE % TK) == 0 &&
+  assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
+         (matrix_size % TK) == 0 &&
          "invalid matrix size detected: not a multiple of <TM,TN,TK>");
 
   std::cout << "Testing: " << TM << " x " << TN << " x " << TK
             << " [TM x TN x TK]" << std::endl;
 
   queue q;
-  T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
-  TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+  T *A = malloc_shared<T>(matrix_size * matrix_size, q);
+  T *B = malloc_shared<T>(matrix_size * matrix_size, q);
+  TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
+  TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);
 
-  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
-  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
+  matrix_rand<T>(matrix_size, matrix_size, A, T(1));
+  matrix_rand<T>(matrix_size, matrix_size, B, T(1));
 
-  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
-                                        MATRIX_SIZE);
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
+                                        matrix_size);
 
 #ifdef VNNI
-  T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
-  matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor);
+  T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
+  matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
   free(B, q);
   B = vnniB;
 #endif
@@ -394,15 +392,15 @@ void test() {
 
   double duration =
         joint_matmul<        
-#ifndef ARG_DIM
+#if !defined(ARG_DIM) && !defined(RUNTIME_DIM)
                     MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
-#endif // ARG_DIM
+#endif // ARG_DIM, RUNTIME_DIM
                     vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
                     KCache1, MCache2, NCache2, KCache2>
                     (A, B, C, q, i
-#ifdef ARG_DIM
-                    , MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE
-#endif // ARG_DIM
+#if defined(ARG_DIM) || defined(RUNTIME_DIM)
+                    , matrix_size, matrix_size, matrix_size, matrix_size
+#endif // ARG_DIM, RUNTIME_DIM
                     );
 
     if (i >= recordThresh) {
@@ -410,14 +408,14 @@ void test() {
     }
   }
 
-  assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
+  assert(matrix_compare(matrix_size, matrix_size, C, refC));
 
   double msecPerMatrixMul =
       totalDuration / static_cast<double>(testIterations - recordThresh);
-  double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) /
+  double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
                   (msecPerMatrixMul / 1000.f);
 
-  std::cout << "DONE for size " << MATRIX_SIZE << std::endl;
+  std::cout << "DONE for size " << matrix_size << std::endl;
   std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
 
   free(A, q);
@@ -426,7 +424,23 @@ void test() {
   free(refC, q);
 }
 
-int main() {
+int main(
+#ifdef RUNTIME_DIM
+  int argc, char *argv[]
+#endif //RUNTIME_DIM
+  ) {
+
+size_t matrix_size = MATRIX_SIZE;
+#ifdef RUNTIME_DIM
+  // Check for command line argument
+  if (argc == 2) {
+    matrix_size = std::stoul(argv[1]);
+  } else {
+    std::cerr << "Usage: ./program matrix_size\n";
+    return 1; // Error if no argument
+  }
+#endif //RUNTIME_DIM
+
   queue q;
   std::vector<combination> combinations =
       q.get_device()
@@ -449,7 +463,7 @@ int main() {
       constexpr size_t NCache1 = 32;
       constexpr size_t KCache1 = 32;
       test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
       break;
     }
 
@@ -457,14 +471,14 @@ int main() {
       constexpr size_t NCache1 = 4 * /*TN*/ 16;
       constexpr size_t KCache1 = 16;
       test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>();
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
 #if (!defined(SG_SZ) || SG_SZ != 32)
       // These combination are not currently supported for subgroup size = 32 in
       // IGC
       test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
       test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>();
+           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
 #endif
       break;
     }
@@ -474,13 +488,11 @@ int main() {
       constexpr size_t KCache1 = 16;
 
       test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>();
-      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
-      // MCache1,
-      //      NCache1, KCache1, MCache2, NCache2, KCache2>();
+           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
+      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16, MCache1,
+      //      NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
       break;
     }
   }
   return 0;
 }
-#endif //EXCLUDE_MAIN_TEST
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
index b55894fe2e858..857c47b04ed56 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -8,17 +8,13 @@
 // REQUIRES: aspect-ext_intel_matrix
 // XFAIL: arch-intel_gpu_pvc
 
-// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
+// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DRUNTIME_DIM -DVNNI
 // RUN: %{run} %t_runtime_dim_vnni.out
 
-// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
+// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise -DRUNTIME_DIM
 // RUN: %{run} %t_runtime_dim.out
 
 // -ffp-model=precise is added to not depend on compiler defaults.
 
-#define EXCLUDE_MAIN_TEST 1
-#define ARG_DIM 1
-
 #include "common.hpp"
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
-#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
deleted file mode 100644
index 495185f46fbe4..0000000000000
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//------------------------------------------------------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-------------------------------------------------------------------------===//
-
-#include <random>
-#include <sycl/usm.hpp>
-template <size_t TM, size_t TN, size_t TK> class MatMul;
-
-template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
-          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
-          size_t MCache2, size_t NCache2, size_t KCache2>
-void test(size_t matrix_size) {
-  assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
-         "invalid matrix size");
-  assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
-         (matrix_size % TK) == 0 &&
-         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
-
-  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
-            << " [TM x TN x TK]" << std::endl;
-
-  queue q;
-  T *A = malloc_shared<T>(matrix_size * matrix_size, q);
-  T *B = malloc_shared<T>(matrix_size * matrix_size, q);
-  TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
-  TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);
-
-  matrix_rand<T>(matrix_size, matrix_size, A, T(1));
-  matrix_rand<T>(matrix_size, matrix_size, B, T(1));
-
-  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
-                                        matrix_size);
-
-#ifdef VNNI
-  T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
-  matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
-  free(B, q);
-  B = vnniB;
-#endif
-
-  // run testIterations time, aggregate and calculate average run time
-  double totalDuration = 0;
-  for (unsigned int i = 0; i < testIterations; i++) {
-
-  double duration =
-        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
-                    KCache1, MCache2, NCache2, KCache2>
-                    (A, B, C, q, i, 
-                    matrix_size, matrix_size, matrix_size, matrix_size);
-
-    if (i >= recordThresh) {
-      totalDuration += duration;
-    }
-  }
-
-  assert(matrix_compare(matrix_size, matrix_size, C, refC));
-
-  double msecPerMatrixMul =
-      totalDuration / static_cast<double>(testIterations - recordThresh);
-  double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
-                  (msecPerMatrixMul / 1000.f);
-
-  std::cout << "DONE for size " << matrix_size << std::endl;
-  std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;
-
-  free(A, q);
-  free(B, q);
-  free(C, q);
-  free(refC, q);
-}
-
-int main(int argc, char *argv[]) {
-  size_t matrix_size;
-
-  // Check for command line argument
-  if (argc == 2) {
-    matrix_size = std::stoul(argv[1]);
-  } else {
-    std::cerr << "Usage: ./program matrix_size\n";
-    return 1; // Error if no argument
-  }
-
-  queue q;
-  std::vector<combination> combinations =
-      q.get_device()
-          .get_info<sycl::ext::oneapi::experimental::info::device::
-                        matrix_combinations>();
-
-  constexpr size_t MCache1 = 32;
-  constexpr size_t MCache2 = 256;
-  constexpr size_t NCache2 = 256;
-  constexpr size_t KCache2 = 32;
-
-#ifdef VNNI
-  constexpr unsigned int VnniFactor = 2;
-#else  // VNNI
-  constexpr unsigned int VnniFactor = 1;
-#endif // VNNI
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      constexpr size_t NCache1 = 32;
-      constexpr size_t KCache1 = 32;
-      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      constexpr size_t NCache1 = 4 * /*TN*/ 16;
-      constexpr size_t KCache1 = 16;
-      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      // These combination are not currently supported for subgroup size = 32 in
-      // IGC
-      test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
-           MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      constexpr size_t NCache1 = 4 * /*TN*/ 8;
-      constexpr size_t KCache1 = 16;
-
-      test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
-           NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-      // test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
-      // MCache1,
-      //      NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-      break;
-    }
-  }
-  return 0;
-}