diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
new file mode 100644
index 0000000000000..5caf08a5f6bfc
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
@@ -0,0 +1,20 @@
+//==--- joint_matrix_bf16_fill_k_cache_arg_dim.cpp - DPC++ joint_matrix--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+// XFAIL: arch-intel_gpu_pvc
+
+// RUN: %{build} -o %t_arg_dim_vnni.out -ffp-model=precise -DARG_DIM -DVNNI
+// RUN: %{run} %t_arg_dim_vnni.out
+
+// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM
+// RUN: %{run} %t_arg_dim.out
+
+// -ffp-model=precise is added to not depend on compiler defaults.
+ +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index b561bd073038a..db8ddafba61a1 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -36,11 +36,19 @@ static constexpr void manually_unroll_loop(F &&f) { template class MatMul; -template -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i +#if defined(ARG_DIM) || defined(RUNTIME_DIM) + , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB +#endif // ARG_DIM, RUNTIME_DIM + ) { + size_t sgSize = get_sg_size>(q); range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; @@ -287,8 +295,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { #ifdef PREFETCH auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA + (k2 + prefDistance) * prefCol; - if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) < - (MATRIX_SIZE * MATRIX_SIZE)) + if ((prefetch_offsetA + (prefRow * colsA) + prefCol) < + (rowsA * colsA)) joint_matrix_prefetch( sg, A + prefetch_offsetA, colsA, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); @@ -298,8 +306,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { pm1B * prefRow) * (colsB)*vnniFactor + (n2 * NCache2 * vnniFactor + pn1B * prefCol); - if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) + - prefCol) < (MATRIX_SIZE * MATRIX_SIZE)) + if ((prefetch_offsetB + (prefRow * colsA * vnniFactor) + + prefCol) < (rowsA * colsA)) joint_matrix_prefetch( sg, B + prefetch_offsetB, colsB * vnniFactor, layout::row_major, @@ -349,31 +357,31 @@ double joint_matmul(TOperand *A, TOperand *B, TResult 
*C, queue &q, int i) { template -void test() { - assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && +void test(size_t matrix_size) { + assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN && "invalid matrix size"); - assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && - (MATRIX_SIZE % TK) == 0 && + assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 && + (matrix_size % TK) == 0 && "invalid matrix size detected: not a multiple of "); std::cout << "Testing: " << TM << " x " << TN << " x " << TK << " [TM x TN x TK]" << std::endl; queue q; - T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *A = malloc_shared(matrix_size * matrix_size, q); + T *B = malloc_shared(matrix_size * matrix_size, q); + TResult *C = malloc_shared(matrix_size * matrix_size, q); + TResult *refC = malloc_shared(matrix_size * matrix_size, q); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); + matrix_rand(matrix_size, matrix_size, A, T(1)); + matrix_rand(matrix_size, matrix_size, B, T(1)); - matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, - MATRIX_SIZE); + matrix_multiply_ref(A, B, refC, matrix_size, matrix_size, + matrix_size); #ifdef VNNI - T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor); + T *vnniB = malloc_shared(matrix_size * matrix_size, q); + matrix_vnni(matrix_size, matrix_size, B, vnniB, vnniFactor); free(B, q); B = vnniB; #endif @@ -381,23 +389,33 @@ void test() { // run testIterations time, aggregate and calculate average run time double totalDuration = 0; for (unsigned int i = 0; i < testIterations; i++) { - double duration = - joint_matmul(A, B, C, q, i); + + double duration = + joint_matmul< +#if !defined(ARG_DIM) && 
!defined(RUNTIME_DIM) + MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, +#endif // ARG_DIM, RUNTIME_DIM + vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1, + KCache1, MCache2, NCache2, KCache2> + (A, B, C, q, i +#if defined(ARG_DIM) || defined(RUNTIME_DIM) + , matrix_size, matrix_size, matrix_size, matrix_size +#endif // ARG_DIM, RUNTIME_DIM + ); + if (i >= recordThresh) { totalDuration += duration; } } - assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); + assert(matrix_compare(matrix_size, matrix_size, C, refC)); double msecPerMatrixMul = totalDuration / static_cast(testIterations - recordThresh); - double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / + double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) / (msecPerMatrixMul / 1000.f); - std::cout << "DONE for size " << MATRIX_SIZE << std::endl; + std::cout << "DONE for size " << matrix_size << std::endl; std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; free(A, q); @@ -406,7 +424,23 @@ void test() { free(refC, q); } -int main() { +int main( +#ifdef RUNTIME_DIM + int argc, char *argv[] +#endif //RUNTIME_DIM + ) { + +size_t matrix_size = MATRIX_SIZE; +#ifdef RUNTIME_DIM + // Check for command line argument + if (argc == 2) { + matrix_size = std::stoul(argv[1]); + } else { + std::cerr << "Usage: ./program matrix_size\n"; + return 1; // Error if no argument + } +#endif //RUNTIME_DIM + queue q; std::vector combinations = q.get_device() @@ -429,7 +463,7 @@ int main() { constexpr size_t NCache1 = 32; constexpr size_t KCache1 = 32; test(); + MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); break; } @@ -437,14 +471,14 @@ int main() { constexpr size_t NCache1 = 4 * /*TN*/ 16; constexpr size_t KCache1 = 16; test(); + NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); #if (!defined(SG_SZ) || SG_SZ != 32) // These combination are not currently supported for subgroup size = 32 in // IGC test(); + MCache1, NCache1, KCache1, 
MCache2, NCache2, KCache2>(matrix_size); test(); + MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); #endif break; } @@ -454,10 +488,9 @@ int main() { constexpr size_t KCache1 = 16; test(); - // test(); + NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); + // test(matrix_size); break; } } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp new file mode 100644 index 0000000000000..857c47b04ed56 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp @@ -0,0 +1,20 @@ +//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// XFAIL: arch-intel_gpu_pvc + +// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DRUNTIME_DIM -DVNNI +// RUN: %{run} %t_runtime_dim_vnni.out + +// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise -DRUNTIME_DIM +// RUN: %{run} %t_runtime_dim.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp"