intel · YixingZhang007 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 19, 2024
@@ -0,0 +1,22 @@
+//==--- joint_matrix_bf16_fill_k_cache_OOB.cpp  - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+
+// https://jira.devtools.intel.com/browse/GSD-9716
+// XFAIL: arch-intel_gpu_pvc
+
+// RUN: %{build} -o %t_arg_dim_vnni.out -ffp-model=precise -DARG_DIM -DVNNI
+// RUN: %{run} %t_arg_dim_vnni.out
+
+// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM
+// RUN: %{run} %t_arg_dim.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+
+#include "common.hpp"
+#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
@@ -36,11 +36,23 @@ static constexpr void manually_unroll_loop(F &&f) {
 
 template <size_t TM, size_t TN, size_t TK> class MatMul;
 
+#ifdef ARG_DIM
+template <size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
+          size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+#else // ARG_DIM
 template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
           size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
           size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
           size_t MCache2, size_t NCache2, size_t KCache2>
+#endif // ARG_DIM
+
+#ifdef ARG_DIM
+double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) {
+#else  // ARG_DIM
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
+#endif // ARG_DIM
+
   size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
   range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
   range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
@@ -381,10 +393,17 @@ void test() {
   // run testIterations time, aggregate and calculate average run time
   double totalDuration = 0;
   for (unsigned int i = 0; i < testIterations; i++) {
+#ifdef ARG_DIM
+    double duration =
+        joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
+                     KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i, 
+                     MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE);
+#else // ARG_DIM
     double duration =
         joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                      vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
                      KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);
+#endif // ARG_DIM
     if (i >= recordThresh) {
       totalDuration += duration;
     }