From d3924feb09e7a3652d7fe1b2c1671fcaaa99f711 Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Wed, 18 Sep 2024 06:36:39 -0700 Subject: [PATCH 1/8] add tests for matrix size be runtime dimension --- ...l_k_cache_dimensions_function_argument.cpp | 0 ...16_fill_k_cache_runtime_dimension_impl.hpp | 465 ++++++++++++++++++ ..._matrix_bf16_fill_k_cache_runtime_impl.hpp | 244 +++++++++ 3 files changed, 709 insertions(+) create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp new file mode 100644 index 0000000000000..2a5f6345327be --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp @@ -0,0 +1,465 @@ +//------------------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------------===// + +#include +#include + +#ifdef SLM +#include "slm_utils.hpp" +#endif + +// number of test iterations +constexpr unsigned int testIterations = 100; +// start recording time after X iterations +constexpr unsigned int recordThresh = 10; + +#ifndef MATRIX_SIZE +#define MATRIX_SIZE 256 +#endif + +#ifdef MANUAL_UNROLL +template +static constexpr void loop(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); // C++17 fold expression +} + +template +static constexpr void manually_unroll_loop(F &&f) { + loop(std::make_integer_sequence{}, std::forward(f)); +} +#endif + +template class MatMul; + +template +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { + size_t sgSize = get_sg_size>(q); + range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; + range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; + + // throw error if padding needed + assert(colsA == rowsB); + assert(rowsA % TM == 0); + assert(colsA % TK == 0); + assert(colsB % TN == 0); + // submit main kernel + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + + q.submit([&](handler &h) { +#ifdef SLM + local_accessor tileA{{MCache2, KCache2}, h}; + local_accessor tileB{ + {KCache2 / vnniFactor, NCache2 * vnniFactor}, h}; +#endif + + h.parallel_for>( // cache layer#1 + nd_range<2>{global, cachelocal}, + // loop global + // loop localrange + [=](nd_item<2> it) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif // SG_SZ + { + // sg::load and sg::store expect decorations to be ON + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + auto m2 = it.get_group(0); + auto n2 = it.get_group(1); + auto m1 = it.get_local_id(0); + auto n1 = it.get_local_id(1) / sgSize; + auto sg = it.get_sub_group(); +#ifdef PREFETCH + size_t sgId = sg.get_group_id()[0]; + // There are MCache2/MCache1 x NCache2/NCache1 subgroups: NumSGs + // PVC case: this is 8x4 subgroups + // BKM for PVC is to use prefetch of 8x32 for each subgroup + constexpr size_t prefRow = 8; + constexpr size_t prefCol = 32; + // All the SGs of one workgroup prefetch MCache2xKCache2 of A + // All the SGs of one workgroup prefetch KCache2xNCache2 of B + // PVC case: 256x32 of A and 32x256 of B + // For both A and B: each subgroup performs a prefetch of + // prefRow rows and prefCol cols at a time + // For A, the subgroups are distributed along the row dimension: + // PVC: A layed as MCache2/prefRow (256/32) + // For B: the subgroups are distributed along the column dimension + // PVC: NCache2/prefCol = 256/32 = 8 SGs on the column dimension and + // KCache2/prefRow = 32/8 = 4 SGs on the row dimension +#ifdef VNNI + // In the VNNI case, each subgroup still gets prefRow x prefCol + // In the PVC case: subgroups distribution become + // (NCache2*2)/prefCol = 512/32 = 16 SGs on the column dimension and + // (KCache2/2)/prefRow = 16/8 = 2 SGs on the row dimension + // pm1B and pn1B are used to identify the distribution of subgroups + // along the workgroup prefetch for B matrix. For A matrix, sgId is + // enough. + size_t pm1B = sgId / 16; // prefetch m1 (sgId/16) + size_t pn1B = sgId & 0x15; // prefetch n1 (sgId%16) +#else // VNNI + size_t pm1B = sgId / 8; // prefetch m1 (sgId/8) + size_t pn1B = sgId & 0x7; // prefetch n1 (sgId%8) +#endif // VNNI + constexpr size_t prefDistance = 3; + for (int p = 0; p < prefDistance; p++) + joint_matrix_prefetch( + sg, A + (m2 * MCache2 + sgId * prefRow) * colsA + p * prefCol, + colsA, layout::row_major, + syclex::properties{syclex::prefetch_hint_L1}); + + for (int p = 0; p < prefDistance; p++) + joint_matrix_prefetch( + sg, + B + + (p * (KCache2 / vnniFactor) + pm1B * prefRow) * colsB * + vnniFactor + + (n2 * NCache2 * vnniFactor + pn1B * prefCol), + colsB * vnniFactor, layout::row_major, + syclex::properties{syclex::prefetch_hint_L1}); +#endif // PREFETCH + + joint_matrix + tC[MCache1 / TM][NCache1 / TN] +#ifdef INIT_LIST + = {}; // default initialization of all array elements +#else // INIT_LIST + ; // no initialization +#endif // INIT_LIST + +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { + manually_unroll_loop([&](auto n) { +#else // MANUAL_UNROLL + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { +#endif // MANUAL_UNROLL + joint_matrix_fill(sg, tC[m][n], 0); +#ifdef MANUAL_UNROLL + }); + }); +#else // MANUAL_UNROLL + } + } +#endif // MANUAL_UNROLL + +#ifdef SLM + constexpr unsigned int SGs = + (MCache2 / MCache1) * (NCache2 / NCache1); +#endif // SLM + for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { +#ifdef SLM + slm_read_write(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize); + it.barrier(access::fence_space::local_space); +#endif // SLM + joint_matrix + tA[MCache1 / TM][KCache2 / KCache1] +#ifdef INIT_LIST + = {}; // default initialization of all array elements +#else // INIT_LIST + ; // no initialization +#endif // INIT_LIST +#ifdef VNNI + joint_matrix + tB[NCache1 / TN][KCache2 / KCache1] +#else // VNNI + joint_matrix + tB[NCache1 / TN][KCache2 / KCache1] +#endif // VNNI +#ifdef INIT_LIST + = {}; // default initialization of all array elements +#else // INIT_LIST + ; // no initialization +#endif // INIT_LIST + +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto k1) { +#else // MANUAL_UNROLL + for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { +#endif // MANUAL_UNROLL + // physical layer + unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { +#else // MANUAL_UNROLL + for (unsigned int m = 0; m < MCache1 / TM; m++) { +#endif // MANUAL_UNROLL +#ifdef SLM + joint_matrix_load(sg, tA[m][k1], + tileA.template get_multi_ptr< + sycl::access::decorated::no>() + + (m1 * MCache1 + m * TM) * KCache2 + + k1 * TK, + KCache2); +#else // SLM +#ifdef OOB + ext::intel::experimental::matrix::joint_matrix_load_checked( + sg, tA[m][k1], pA, colsA, rowsA, colsA, + m2 * MCache2 + m1 * MCache1 + m * TM, k * TK); +#else // OOB + joint_matrix_load( + sg, tA[m][k1], + pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + + k * TK, + colsA); +#endif // OOB +#endif // SLM +#ifdef MANUAL_UNROLL + }); // m +#else // MANUAL_UNROLL + } // m +#endif // MANUAL_UNROLL +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto n) { +#else // MANUAL_UNROLL + for (unsigned int n = 0; n < NCache1 / TN; n++) { +#endif // MANUAL_UNROLL +#ifdef SLM + joint_matrix_load(sg, tB[n][k1], + tileB.template get_multi_ptr< + sycl::access::decorated::no>() + + (k1 * TK / vnniFactor) * + (NCache2 * vnniFactor) + + (n1 * NCache1 + n * TN) * vnniFactor, + NCache2 * vnniFactor); +#else // SLM +#ifdef OOB + ext::intel::experimental::matrix::joint_matrix_load_checked( + sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor, + colsB * vnniFactor, k * TK / vnniFactor, + (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor); +#else // OOB + joint_matrix_load( + sg, tB[n][k1], + pB + (k * TK / vnniFactor) * (colsB * vnniFactor) + + (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor, + colsB * vnniFactor); +#endif // OOB +#endif // SLM +#ifdef MANUAL_UNROLL + }); // n +#else // MANUAL_UNROLL + } // n +#endif // MANUAL_UNROLL +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { +#else // MANUAL_UNROLL + for (unsigned int m = 0; m < MCache1 / TM; m++) { +#endif // MANUAL_UNROLL +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto n) { +#else // MANUAL_UNROLL + for (unsigned int n = 0; n < NCache1 / TN; n++) { + +#endif // MANUAL_UNROLL + joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], + tC[m][n]); +#ifdef MANUAL_UNROLL + }); // n + }); // m + }); // for k1 +#else // MANUAL_UNROLL + } // n + } // m + } // k1 +#endif // MANUAL_UNROLL +#ifdef SLM + it.barrier(access::fence_space::local_space); +#endif // SLM +#ifdef PREFETCH + auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA + + (k2 + prefDistance) * prefCol; + if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) < + (MATRIX_SIZE * MATRIX_SIZE)) + joint_matrix_prefetch( + sg, A + prefetch_offsetA, colsA, layout::row_major, + syclex::properties{syclex::prefetch_hint_L1}); + + auto prefetch_offsetB = + ((k2 + prefDistance) * (KCache2 / vnniFactor) + + pm1B * prefRow) * + (colsB)*vnniFactor + + (n2 * NCache2 * vnniFactor + pn1B * prefCol); + if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) + + prefCol) < (MATRIX_SIZE * MATRIX_SIZE)) + joint_matrix_prefetch( + sg, B + prefetch_offsetB, colsB * vnniFactor, + layout::row_major, + syclex::properties{syclex::prefetch_hint_L1}); +#endif // PREFETCH + } // for k2 +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto m) { +#else // MANUAL_UNROLL + for (unsigned int m = 0; m < MCache1 / TM; m++) { +#endif // MANUAL_UNROLL +#ifdef MANUAL_UNROLL + manually_unroll_loop([&](auto n) { +#else // MANUAL_UNROLL + for (unsigned int n = 0; n < NCache1 / TN; n++) { +#endif // MANUAL_UNROLL +#ifdef OOB + ext::intel::experimental::matrix::joint_matrix_store_checked( + sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB, + m2 * MCache2 + m1 * MCache1 + m * TM, + n2 * NCache2 + n1 * NCache1 + n * TN); +#else // OOB + joint_matrix_store( + sg, tC[m][n], + pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + + (n2 * NCache2 + n1 * NCache1 + n * TN), + colsB, layout::row_major); +#endif // OOB +#ifdef MANUAL_UNROLL + }); // n + }); // m +#else // MANUAL_UNROLL + } // n + } // m +#endif // MANUAL_UNROLL + }); // parallel_for + }); // queue.submit + + if (i == testIterations - 1) + q.wait(); + std::chrono::duration duration = + std::chrono::high_resolution_clock::now() - start; + + return duration.count(); +} + +template +void test() { + assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && + "invalid matrix size"); + assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && + (MATRIX_SIZE % TK) == 0 && + "invalid matrix size detected: not a multiple of "); + + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; + + queue q; + T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); + + matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, + MATRIX_SIZE); + +#ifdef VNNI + T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor); + free(B, q); + B = vnniB; +#endif + + // run testIterations time, aggregate and calculate average run time + double totalDuration = 0; + for (unsigned int i = 0; i < testIterations; i++) { + double duration = + joint_matmul(A, B, C, q, i); + if (i >= recordThresh) { + totalDuration += duration; + } + } + + assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); + + double msecPerMatrixMul = + totalDuration / static_cast(testIterations - recordThresh); + double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / + (msecPerMatrixMul / 1000.f); + + std::cout << "DONE for size " << MATRIX_SIZE << std::endl; + std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; + + free(A, q); + free(B, q); + free(C, q); + free(refC, q); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + constexpr size_t MCache1 = 32; + constexpr size_t MCache2 = 256; + constexpr size_t NCache2 = 256; + constexpr size_t KCache2 = 32; + +#ifdef VNNI + constexpr unsigned int VnniFactor = 2; +#else // VNNI + constexpr unsigned int VnniFactor = 1; +#endif // VNNI + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + constexpr size_t NCache1 = 32; + constexpr size_t KCache1 = 32; + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + constexpr size_t NCache1 = 4 * /*TN*/ 16; + constexpr size_t KCache1 = 16; + // test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combination are not currently supported for subgroup size = 32 in + // // IGC + // test(); + test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + constexpr size_t NCache1 = 4 * /*TN*/ 8; + constexpr size_t KCache1 = 16; + + test(); + // test(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp new file mode 100644 index 0000000000000..fae8dbadee3c7 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp @@ -0,0 +1,244 @@ +//------------------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------------===// + +#include +#include +#include + +// number of test iterations +constexpr unsigned int testIterations = 100; +// start recording time after X iterations +constexpr unsigned int recordThresh = 10; + +#ifdef MANUAL_UNROLL +template +static constexpr void loop(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); // C++17 fold expression +} + +template +static constexpr void manually_unroll_loop(F &&f) { + loop(std::make_integer_sequence{}, std::forward(f)); +} +#endif + +template class MatMul; + +template +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { + size_t sgSize = get_sg_size>(q); + range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; + range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; + + // throw error if padding needed + assert(colsA == rowsB); + assert(rowsA % TM == 0); + assert(colsA % TK == 0); + assert(colsB % TN == 0); + // submit main kernel + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + + q.submit([&](handler &h) { + sycl::stream os { 5000, 5000, h}; + h.parallel_for>( // cache layer#1 + nd_range<2>{global, cachelocal}, + // loop global + // loop localrange + [=](nd_item<2> it) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + auto m2 = it.get_group(0); + auto n2 = it.get_group(1); + auto m1 = it.get_local_id(0); + auto n1 = it.get_local_id(1) / sgSize; + auto sg = it.get_sub_group(); + joint_matrix + tC[MCache1 / TM][NCache1 / TN]; + + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_fill(sg, tC[m][n], 0); + } + } + + for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { + joint_matrix + tA[MCache1 / TM][KCache2 / KCache1]; + + joint_matrix + tB[NCache1 / TN][KCache2 / KCache1]; + + for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { + // physical layer + unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; + for (unsigned int m = 0; m < MCache1 / TM; m++) { + joint_matrix_load( + sg, tA[m][k1], + pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + + k * TK, + colsA); + } // m + + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_load(sg, tB[n][k1], + pB + (k * TK / VNNI) * (colsB * VNNI) + + (n2 * NCache2 + n1 * NCache1 + n * TN) * + VNNI, + colsB * VNNI); + } // n + + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], + tC[m][n]); + } // n + } // m + } // k1 + } // for k2 + + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_store( + sg, tC[m][n], + pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + + (n2 * NCache2 + n1 * NCache1 + n * TN), + colsB, layout::row_major); + } // n + } // m +// #endif + }); // parallel_for + }); // queue.submit + + if (i == testIterations - 1) + q.wait(); + std::chrono::duration duration = + std::chrono::high_resolution_clock::now() - start; + + return duration.count(); +} + +template +void test(size_t MATRIX_SIZE) { + assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && + "invalid matrix size"); + assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && + (MATRIX_SIZE % TK) == 0 && + "invalid matrix size detected: not a multiple of "); + + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; + + queue q; + T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); + matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); + matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI); + + matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, + MATRIX_SIZE); + + // run testIterations time, aggregate and calculate average run time + double totalDuration = 0; + for (unsigned int i = 0; i < testIterations; i++) { + double duration = + joint_matmul(A, vnniB, C, q, i, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE); + if (i >= recordThresh) { + totalDuration += duration; + } + } + + assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); + + double msecPerMatrixMul = + totalDuration / static_cast(testIterations - recordThresh); + double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / + (msecPerMatrixMul / 1000.f); + + std::cout << "DONE for size " << MATRIX_SIZE << std::endl; + std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; + + free(A, q); + free(B, q); + free(vnniB, q); + free(C, q); + free(refC, q); +} + +int main(int argc, char *argv[]) { + size_t MATRIX_SIZE; + MATRIX_SIZE = std::stoul(argv[1]); + + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + constexpr size_t MCache1 = 32; + constexpr size_t MCache2 = 256; + constexpr size_t NCache2 = 256; + constexpr size_t KCache2 = 32; + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + constexpr size_t NCache1 = 32; + constexpr size_t KCache1 = 32; + + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + constexpr size_t NCache1 = 4 * /*TN*/ 16; + constexpr size_t KCache1 = 16; + + // test(); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combination are not currently supported for subgroup size = 32 in + // IGC + // test(); + test(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + constexpr size_t NCache1 = 4 * /*TN*/ 8; + constexpr size_t KCache1 = 16; + + test(); + break; + } + } + return 0; +} From 4e6593fc7f4ffb614a262fb73a247394d94efd94 Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Wed, 18 Sep 2024 13:27:13 -0700 Subject: [PATCH 2/8] update the arg_dim test to reuse the bf16_cache test --- ...joint_matrix_bf16_fill_k_cache_arg_dim.cpp | 22 + ...l_k_cache_dimensions_function_argument.cpp | 0 .../joint_matrix_bf16_fill_k_cache_impl.hpp | 19 + ...16_fill_k_cache_runtime_dimension_impl.hpp | 465 ------------------ ..._matrix_bf16_fill_k_cache_runtime_impl.hpp | 244 --------- 5 files changed, 41 insertions(+), 709 deletions(-) create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp new file mode 100644 index 0000000000000..aa2540b3781b7 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp @@ -0,0 +1,22 @@ +//==--- joint_matrix_bf16_fill_k_cache_OOB.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// https://jira.devtools.intel.com/browse/GSD-9716 +// XFAIL: arch-intel_gpu_pvc + +// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM -DVNNI +// RUN: %{run} %t_arg_dim_vnni.out + +// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM +// RUN: %{run} %t_arg_dim.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_dimensions_function_argument.cpp deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index b561bd073038a..4584aedbe3d01 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -36,11 +36,23 @@ static constexpr void manually_unroll_loop(F &&f) { template class MatMul; +#ifdef ARG_DIM +template +#else // ARG_DIM template +#endif // ARG_DIM + +#ifdef ARG_DIM +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { +#else // ARG_DIM double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { +#endif // ARG_DIM + size_t sgSize = get_sg_size>(q); range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; @@ -381,10 +393,17 @@ void test() { // run testIterations time, aggregate and calculate average run time double totalDuration = 0; for (unsigned int i = 0; i < testIterations; i++) { +#ifdef ARG_DIM + double duration = + joint_matmul(A, B, C, q, i, + MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE); +#else // ARG_DIM double duration = joint_matmul(A, B, C, q, i); +#endif // ARG_DIM if (i >= recordThresh) { totalDuration += duration; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp deleted file mode 100644 index 2a5f6345327be..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dimension_impl.hpp +++ /dev/null @@ -1,465 +0,0 @@ -//------------------------------------------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===-------------------------------------------------------------------------===// - -#include -#include - -#ifdef SLM -#include "slm_utils.hpp" -#endif - -// number of test iterations -constexpr unsigned int testIterations = 100; -// start recording time after X iterations -constexpr unsigned int recordThresh = 10; - -#ifndef MATRIX_SIZE -#define MATRIX_SIZE 256 -#endif - -#ifdef MANUAL_UNROLL -template -static constexpr void loop(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); // C++17 fold expression -} - -template -static constexpr void manually_unroll_loop(F &&f) { - loop(std::make_integer_sequence{}, std::forward(f)); -} -#endif - -template class MatMul; - -template -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { - size_t sgSize = get_sg_size>(q); - range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; - range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; - - // throw error if padding needed - assert(colsA == rowsB); - assert(rowsA % TM == 0); - assert(colsA % TK == 0); - assert(colsB % TN == 0); - // submit main kernel - std::chrono::high_resolution_clock::time_point start = - std::chrono::high_resolution_clock::now(); - - q.submit([&](handler &h) { -#ifdef SLM - local_accessor tileA{{MCache2, KCache2}, h}; - local_accessor tileB{ - {KCache2 / vnniFactor, NCache2 * vnniFactor}, h}; -#endif - - h.parallel_for>( // cache layer#1 - nd_range<2>{global, cachelocal}, - // loop global - // loop localrange - [=](nd_item<2> it) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif // SG_SZ - { - // sg::load and sg::store expect decorations to be ON - auto pA = - address_space_cast(A); - auto pB = - address_space_cast(B); - auto pC = - address_space_cast(C); - auto m2 = it.get_group(0); - auto n2 = it.get_group(1); - auto m1 = it.get_local_id(0); - auto n1 = it.get_local_id(1) / sgSize; - auto sg = it.get_sub_group(); -#ifdef PREFETCH - size_t sgId = sg.get_group_id()[0]; - // There are MCache2/MCache1 x NCache2/NCache1 subgroups: NumSGs - // PVC case: this is 8x4 subgroups - // BKM for PVC is to use prefetch of 8x32 for each subgroup - constexpr size_t prefRow = 8; - constexpr size_t prefCol = 32; - // All the SGs of one workgroup prefetch MCache2xKCache2 of A - // All the SGs of one workgroup prefetch KCache2xNCache2 of B - // PVC case: 256x32 of A and 32x256 of B - // For both A and B: each subgroup performs a prefetch of - // prefRow rows and prefCol cols at a time - // For A, the subgroups are distributed along the row dimension: - // PVC: A layed as MCache2/prefRow (256/32) - // For B: the subgroups are distributed along the column dimension - // PVC: NCache2/prefCol = 256/32 = 8 SGs on the column dimension and - // KCache2/prefRow = 32/8 = 4 SGs on the row dimension -#ifdef VNNI - // In the VNNI case, each subgroup still gets prefRow x prefCol - // In the PVC case: subgroups distribution become - // (NCache2*2)/prefCol = 512/32 = 16 SGs on the column dimension and - // (KCache2/2)/prefRow = 16/8 = 2 SGs on the row dimension - // pm1B and pn1B are used to identify the distribution of subgroups - // along the workgroup prefetch for B matrix. For A matrix, sgId is - // enough. - size_t pm1B = sgId / 16; // prefetch m1 (sgId/16) - size_t pn1B = sgId & 0x15; // prefetch n1 (sgId%16) -#else // VNNI - size_t pm1B = sgId / 8; // prefetch m1 (sgId/8) - size_t pn1B = sgId & 0x7; // prefetch n1 (sgId%8) -#endif // VNNI - constexpr size_t prefDistance = 3; - for (int p = 0; p < prefDistance; p++) - joint_matrix_prefetch( - sg, A + (m2 * MCache2 + sgId * prefRow) * colsA + p * prefCol, - colsA, layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); - - for (int p = 0; p < prefDistance; p++) - joint_matrix_prefetch( - sg, - B + - (p * (KCache2 / vnniFactor) + pm1B * prefRow) * colsB * - vnniFactor + - (n2 * NCache2 * vnniFactor + pn1B * prefCol), - colsB * vnniFactor, layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); -#endif // PREFETCH - - joint_matrix - tC[MCache1 / TM][NCache1 / TN] -#ifdef INIT_LIST - = {}; // default initialization of all array elements -#else // INIT_LIST - ; // no initialization -#endif // INIT_LIST - -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { - manually_unroll_loop([&](auto n) { -#else // MANUAL_UNROLL - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { -#endif // MANUAL_UNROLL - joint_matrix_fill(sg, tC[m][n], 0); -#ifdef MANUAL_UNROLL - }); - }); -#else // MANUAL_UNROLL - } - } -#endif // MANUAL_UNROLL - -#ifdef SLM - constexpr unsigned int SGs = - (MCache2 / MCache1) * (NCache2 / NCache1); -#endif // SLM - for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { -#ifdef SLM - slm_read_write(pA, pB, tileA, tileB, sg, k2, m2, n2, sgSize); - it.barrier(access::fence_space::local_space); -#endif // SLM - joint_matrix - tA[MCache1 / TM][KCache2 / KCache1] -#ifdef INIT_LIST - = {}; // default initialization of all array elements -#else // INIT_LIST - ; // no initialization -#endif // INIT_LIST -#ifdef VNNI - joint_matrix - tB[NCache1 / TN][KCache2 / KCache1] -#else // VNNI - joint_matrix - tB[NCache1 / TN][KCache2 / KCache1] -#endif // VNNI -#ifdef INIT_LIST - = {}; // default initialization of all array elements -#else // INIT_LIST - ; // no initialization -#endif // INIT_LIST - -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto k1) { -#else // MANUAL_UNROLL - for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { -#endif // MANUAL_UNROLL - // physical layer - unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { -#else // MANUAL_UNROLL - for (unsigned int m = 0; m < MCache1 / TM; m++) { -#endif // MANUAL_UNROLL -#ifdef SLM - joint_matrix_load(sg, tA[m][k1], - tileA.template get_multi_ptr< - sycl::access::decorated::no>() + - (m1 * MCache1 + m * TM) * KCache2 + - k1 * TK, - KCache2); -#else // SLM -#ifdef OOB - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, tA[m][k1], pA, colsA, rowsA, colsA, - m2 * MCache2 + m1 * MCache1 + m * TM, k * TK); -#else // OOB - joint_matrix_load( - sg, tA[m][k1], - pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + - k * TK, - colsA); -#endif // OOB -#endif // SLM -#ifdef MANUAL_UNROLL - }); // m -#else // MANUAL_UNROLL - } // m -#endif // MANUAL_UNROLL -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { -#else // MANUAL_UNROLL - for (unsigned int n = 0; n < NCache1 / TN; n++) { -#endif // MANUAL_UNROLL -#ifdef SLM - joint_matrix_load(sg, tB[n][k1], - tileB.template get_multi_ptr< - sycl::access::decorated::no>() + - (k1 * TK / vnniFactor) * - (NCache2 * vnniFactor) + - (n1 * NCache1 + n * TN) * vnniFactor, - NCache2 * vnniFactor); -#else // SLM -#ifdef OOB - ext::intel::experimental::matrix::joint_matrix_load_checked( - sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor, - colsB * vnniFactor, k * TK / vnniFactor, - (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor); -#else // OOB - joint_matrix_load( - sg, tB[n][k1], - pB + (k * TK / vnniFactor) * (colsB * vnniFactor) + - (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor, - colsB * vnniFactor); -#endif // OOB -#endif // SLM -#ifdef MANUAL_UNROLL - }); // n -#else // MANUAL_UNROLL - } // n -#endif // MANUAL_UNROLL -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { -#else // MANUAL_UNROLL - for (unsigned int m = 0; m < MCache1 / TM; m++) { -#endif // MANUAL_UNROLL -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { -#else // MANUAL_UNROLL - for (unsigned int n = 0; n < NCache1 / TN; n++) { - -#endif // MANUAL_UNROLL - joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], - tC[m][n]); -#ifdef MANUAL_UNROLL - }); // n - }); // m - }); // for k1 -#else // MANUAL_UNROLL - } // n - } // m - } // k1 -#endif // MANUAL_UNROLL -#ifdef SLM - it.barrier(access::fence_space::local_space); -#endif // SLM -#ifdef PREFETCH - auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA + - (k2 + prefDistance) * prefCol; - if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) < - (MATRIX_SIZE * MATRIX_SIZE)) - joint_matrix_prefetch( - sg, A + prefetch_offsetA, colsA, layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); - - auto prefetch_offsetB = - ((k2 + prefDistance) * (KCache2 / vnniFactor) + - pm1B * prefRow) * - (colsB)*vnniFactor + - (n2 * NCache2 * vnniFactor + pn1B * prefCol); - if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) + - prefCol) < (MATRIX_SIZE * MATRIX_SIZE)) - joint_matrix_prefetch( - sg, B + prefetch_offsetB, colsB * vnniFactor, - layout::row_major, - syclex::properties{syclex::prefetch_hint_L1}); -#endif // PREFETCH - } // for k2 -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto m) { -#else // MANUAL_UNROLL - for (unsigned int m = 0; m < MCache1 / TM; m++) { -#endif // MANUAL_UNROLL -#ifdef MANUAL_UNROLL - manually_unroll_loop([&](auto n) { -#else // MANUAL_UNROLL - for (unsigned int n = 0; n < NCache1 / TN; n++) { -#endif // MANUAL_UNROLL -#ifdef OOB - ext::intel::experimental::matrix::joint_matrix_store_checked( - sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB, - m2 * MCache2 + m1 * MCache1 + m * TM, - n2 * NCache2 + n1 * NCache1 + n * TN); -#else // OOB - joint_matrix_store( - sg, tC[m][n], - pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + - (n2 * NCache2 + n1 * NCache1 + n * TN), - colsB, layout::row_major); -#endif // OOB -#ifdef MANUAL_UNROLL - }); // n - }); // m -#else // MANUAL_UNROLL - } // n - } // m -#endif // MANUAL_UNROLL - }); // parallel_for - }); // queue.submit - - if (i == testIterations - 1) - q.wait(); - std::chrono::duration duration = - std::chrono::high_resolution_clock::now() - start; - - return duration.count(); -} - -template -void test() { - assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && - "invalid matrix size"); - assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && - (MATRIX_SIZE % TK) == 0 && - "invalid matrix size detected: not a multiple of "); - - std::cout << "Testing: " << TM << " x " << TN << " x " << TK - << " [TM x TN x TK]" << std::endl; - - queue q; - T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); - - matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, - MATRIX_SIZE); - -#ifdef VNNI - T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor); - free(B, q); - B = vnniB; -#endif - - // run testIterations time, aggregate and calculate average run time - double totalDuration = 0; - for (unsigned int i = 0; i < testIterations; i++) { - double duration = - joint_matmul(A, B, C, q, i); - if (i >= recordThresh) { - totalDuration += duration; - } - } - - assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); - - double msecPerMatrixMul = - totalDuration / static_cast(testIterations - recordThresh); - double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / - (msecPerMatrixMul / 1000.f); - - std::cout << "DONE for size " << MATRIX_SIZE << std::endl; - std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; - - free(A, q); - free(B, q); - free(C, q); - free(refC, q); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - constexpr size_t MCache1 = 32; - constexpr size_t MCache2 = 256; - constexpr size_t NCache2 = 256; - constexpr size_t KCache2 = 32; - -#ifdef VNNI - constexpr unsigned int VnniFactor = 2; -#else // VNNI - constexpr unsigned int VnniFactor = 1; -#endif // VNNI - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - constexpr size_t NCache1 = 32; - constexpr size_t KCache1 = 32; - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - constexpr size_t NCache1 = 4 * /*TN*/ 16; - constexpr size_t KCache1 = 16; - // test(); -#if (!defined(SG_SZ) || SG_SZ != 32) - // These combination are not currently supported for subgroup size = 32 in - // // IGC - // test(); - test(); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - constexpr size_t NCache1 = 4 * /*TN*/ 8; - constexpr size_t KCache1 = 16; - - test(); - // test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp deleted file mode 100644 index fae8dbadee3c7..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_impl.hpp +++ /dev/null @@ -1,244 +0,0 @@ -//------------------------------------------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===-------------------------------------------------------------------------===// - -#include -#include -#include - -// number of test iterations -constexpr unsigned int testIterations = 100; -// start recording time after X iterations -constexpr unsigned int recordThresh = 10; - -#ifdef MANUAL_UNROLL -template -static constexpr void loop(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); // C++17 fold expression -} - -template -static constexpr void manually_unroll_loop(F &&f) { - loop(std::make_integer_sequence{}, std::forward(f)); -} -#endif - -template class MatMul; - -template -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { - size_t sgSize = get_sg_size>(q); - range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; - range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; - - // throw error if padding needed - assert(colsA == rowsB); - assert(rowsA % TM == 0); - assert(colsA % TK == 0); - assert(colsB % TN == 0); - // submit main kernel - std::chrono::high_resolution_clock::time_point start = - std::chrono::high_resolution_clock::now(); - - q.submit([&](handler &h) { - sycl::stream os { 5000, 5000, h}; - h.parallel_for>( // cache layer#1 - nd_range<2>{global, cachelocal}, - // loop global - // loop localrange - [=](nd_item<2> it) -#ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] -#endif - { - auto pA = - address_space_cast(A); - auto pB = - address_space_cast(B); - auto pC = - address_space_cast(C); - auto m2 = it.get_group(0); - auto n2 = it.get_group(1); - auto m1 = it.get_local_id(0); - auto n1 = it.get_local_id(1) / sgSize; - auto sg = it.get_sub_group(); - joint_matrix - tC[MCache1 / TM][NCache1 / TN]; - - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_fill(sg, tC[m][n], 0); - } - } - - for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { - joint_matrix - tA[MCache1 / TM][KCache2 / KCache1]; - - joint_matrix - tB[NCache1 / TN][KCache2 / KCache1]; - - for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { - // physical layer - unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; - for (unsigned int m = 0; m < MCache1 / TM; m++) { - joint_matrix_load( - sg, tA[m][k1], - pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + - k * TK, - colsA); - } // m - - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_load(sg, tB[n][k1], - pB + (k * TK / VNNI) * (colsB * VNNI) + - (n2 * NCache2 + n1 * NCache1 + n * TN) * - VNNI, - colsB * VNNI); - } // n - - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], - tC[m][n]); - } // n - } // m - } // k1 - } // for k2 - - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_store( - sg, tC[m][n], - pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + - (n2 * NCache2 + n1 * NCache1 + n * TN), - colsB, layout::row_major); - } // n - } // m -// #endif - }); // parallel_for - }); // queue.submit - - if (i == testIterations - 1) - q.wait(); - std::chrono::duration duration = - std::chrono::high_resolution_clock::now() - start; - - return duration.count(); -} - -template -void test(size_t MATRIX_SIZE) { - assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && - "invalid matrix size"); - assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && - (MATRIX_SIZE % TK) == 0 && - "invalid matrix size detected: not a multiple of "); - - std::cout << "Testing: " << TM << " x " << TN << " x " << TK - << " [TM x TN x TK]" << std::endl; - - queue q; - T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); - matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI); - - matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, - MATRIX_SIZE); - - // run testIterations time, aggregate and calculate average run time - double totalDuration = 0; - for (unsigned int i = 0; i < testIterations; i++) { - double duration = - joint_matmul(A, vnniB, C, q, i, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE); - if (i >= recordThresh) { - totalDuration += duration; - } - } - - assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); - - double msecPerMatrixMul = - totalDuration / static_cast(testIterations - recordThresh); - double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / - (msecPerMatrixMul / 1000.f); - - std::cout << "DONE for size " << MATRIX_SIZE << std::endl; - std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; - - free(A, q); - free(B, q); - free(vnniB, q); - free(C, q); - free(refC, q); -} - -int main(int argc, char *argv[]) { - size_t MATRIX_SIZE; - MATRIX_SIZE = std::stoul(argv[1]); - - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - constexpr size_t MCache1 = 32; - constexpr size_t MCache2 = 256; - constexpr size_t NCache2 = 256; - constexpr size_t KCache2 = 32; - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - constexpr size_t NCache1 = 32; - constexpr size_t KCache1 = 32; - - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - constexpr size_t NCache1 = 4 * /*TN*/ 16; - constexpr size_t KCache1 = 16; - - // test(); -#if (!defined(SG_SZ) || SG_SZ != 32) - // These combination are not currently supported for subgroup size = 32 in - // IGC - // test(); - test(); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - constexpr size_t NCache1 = 4 * /*TN*/ 8; - constexpr size_t KCache1 = 16; - - test(); - break; - } - } - return 0; -} From b56a34ce1142ddb858db37a802558d91c4c650de Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Wed, 18 Sep 2024 14:19:20 -0700 Subject: [PATCH 3/8] update the test to address the commets --- ...joint_matrix_bf16_fill_k_cache_arg_dim.cpp | 8 ++-- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 40 +++++++++---------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp index aa2540b3781b7..5caf08a5f6bfc 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp @@ -1,4 +1,4 @@ -//==--- joint_matrix_bf16_fill_k_cache_OOB.cpp - DPC++ joint_matrix--------==// +//==--- joint_matrix_bf16_fill_k_cache_arg_dim.cpp - DPC++ joint_matrix--------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,9 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix - -// https://jira.devtools.intel.com/browse/GSD-9716 // XFAIL: arch-intel_gpu_pvc -// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM -DVNNI +// RUN: %{build} -o %t_arg_dim_vnni.out -ffp-model=precise -DARG_DIM -DVNNI // RUN: %{run} %t_arg_dim_vnni.out // RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM @@ -19,4 +17,4 @@ // -ffp-model=precise is added to not depend on compiler defaults. #include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_impl.hpp" \ No newline at end of file +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 4584aedbe3d01..47cfab5506187 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -36,22 +36,19 @@ static constexpr void manually_unroll_loop(F &&f) { template class MatMul; -#ifdef ARG_DIM -template -#else // ARG_DIM -template -#endif // ARG_DIM +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i #ifdef ARG_DIM -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { -#else // ARG_DIM -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { -#endif // ARG_DIM + , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB +#endif // ARG_DIM + ) { size_t sgSize = get_sg_size>(q); range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; @@ -393,17 +390,20 @@ void test() { // run testIterations time, aggregate and calculate average run time double totalDuration = 0; for (unsigned int i = 0; i < testIterations; i++) { + + double duration = + joint_matmul< +#ifndef ARG_DIM + MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, +#endif // ARG_DIM + vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1, + KCache1, MCache2, NCache2, KCache2> + (A, B, C, q, i #ifdef ARG_DIM - double duration = - joint_matmul(A, B, C, q, i, - MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE); -#else // ARG_DIM - double duration = - joint_matmul(A, B, C, q, i); + , MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE #endif // ARG_DIM + ); + if (i >= recordThresh) { totalDuration += duration; } From a184cbc9cc0dcfe82c7ed1e2ad0e35ccbdd97a87 Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Thu, 19 Sep 2024 15:07:08 -0700 Subject: [PATCH 4/8] add test for runtime matrix dimension --- ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp | 22 ++ ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 249 ++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp new file mode 100644 index 0000000000000..30ba7db3d227e --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp @@ -0,0 +1,22 @@ +//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// https://jira.devtools.intel.com/browse/GSD-9716 +// XFAIL: arch-intel_gpu_pvc + +// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI +// RUN: %{run} %t_runtime_dim_vnni.out + +// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise +// RUN: %{run} %t_runtime_dim.out + +// -ffp-model=precise is added to not depend on compiler defaults. + +#include "common.hpp" +#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp" \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp new file mode 100644 index 0000000000000..cbfb4de8f18aa --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp @@ -0,0 +1,249 @@ +//------------------------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------------===// + +#include +#include + +// number of test iterations +constexpr unsigned int testIterations = 100; +// start recording time after X iterations +constexpr unsigned int recordThresh = 10; + +template class MatMul; + +template +double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, + size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { + + size_t sgSize = get_sg_size>(q); + range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; + range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; + + // throw error if padding needed + assert(colsA == rowsB); + assert(rowsA % TM == 0); + assert(colsA % TK == 0); + assert(colsB % TN == 0); + // submit main kernel + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + + q.submit([&](handler &h) { + h.parallel_for>( // cache layer#1 + nd_range<2>{global, cachelocal}, + // loop global + // loop localrange + [=](nd_item<2> it) + { + // sg::load and sg::store expect decorations to be ON + auto pA = + address_space_cast(A); + auto pB = + address_space_cast(B); + auto pC = + address_space_cast(C); + auto m2 = it.get_group(0); + auto n2 = it.get_group(1); + auto m1 = it.get_local_id(0); + auto n1 = it.get_local_id(1) / sgSize; + auto sg = it.get_sub_group(); + + joint_matrix + tC[MCache1 / TM][NCache1 / TN]; + + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_fill(sg, tC[m][n], 0); + } + } + + for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { + joint_matrix + tA[MCache1 / TM][KCache2 / KCache1]; +#ifdef VNNI + joint_matrix + tB[NCache1 / TN][KCache2 / KCache1]; +#else // VNNI + joint_matrix + tB[NCache1 / TN][KCache2 / KCache1]; +#endif // VNNI + + for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { + unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; + for (unsigned int m = 0; m < MCache1 / TM; m++) { + joint_matrix_load( + sg, tA[m][k1], + pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + + k * TK, + colsA); + } + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_load( + sg, tB[n][k1], + pB + (k * TK / vnniFactor) * (colsB * vnniFactor) + + (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor, + colsB * vnniFactor); + } // n + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], + tC[m][n]); + } // n + } // m + } // k1 + } // for k2 + + for (unsigned int m = 0; m < MCache1 / TM; m++) { + for (unsigned int n = 0; n < NCache1 / TN; n++) { + joint_matrix_store( + sg, tC[m][n], + pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + + (n2 * NCache2 + n1 * NCache1 + n * TN), + colsB, layout::row_major); + } // n + } // m + }); // parallel_for + }); // queue.submit + + if (i == testIterations - 1) + q.wait(); + std::chrono::duration duration = + std::chrono::high_resolution_clock::now() - start; + + return duration.count(); +} + +template +void test(size_t matrix_size) { + assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN && + "invalid matrix size"); + assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 && + (matrix_size % TK) == 0 && + "invalid matrix size detected: not a multiple of "); + + std::cout << "Testing: " << TM << " x " << TN << " x " << TK + << " [TM x TN x TK]" << std::endl; + + queue q; + T *A = malloc_shared(matrix_size * matrix_size, q); + T *B = malloc_shared(matrix_size * matrix_size, q); + TResult *C = malloc_shared(matrix_size * matrix_size, q); + TResult *refC = malloc_shared(matrix_size * matrix_size, q); + + matrix_rand(matrix_size, matrix_size, A, T(1)); + matrix_rand(matrix_size, matrix_size, B, T(1)); + + matrix_multiply_ref(A, B, refC, matrix_size, matrix_size, + matrix_size); + +#ifdef VNNI + T *vnniB = malloc_shared(matrix_size * matrix_size, q); + matrix_vnni(matrix_size, matrix_size, B, vnniB, vnniFactor); + free(B, q); + B = vnniB; +#endif + + // run testIterations time, aggregate and calculate average run time + double totalDuration = 0; + for (unsigned int i = 0; i < testIterations; i++) { + + double duration = + joint_matmul + (A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size); + + if (i >= recordThresh) { + totalDuration += duration; + } + } + + assert(matrix_compare(matrix_size, matrix_size, C, refC)); + + double msecPerMatrixMul = + totalDuration / static_cast(testIterations - recordThresh); + double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) / + (msecPerMatrixMul / 1000.f); + + std::cout << "DONE for size " << matrix_size << std::endl; + std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; + + free(A, q); + free(B, q); + free(C, q); + free(refC, q); +} + +int main(int argc, char *argv[]) { + size_t matrix_size; + matrix_size = std::stoul(argv[1]); + + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + constexpr size_t MCache1 = 32; + constexpr size_t MCache2 = 256; + constexpr size_t NCache2 = 256; + constexpr size_t KCache2 = 32; + +#ifdef VNNI + constexpr unsigned int VnniFactor = 2; +#else // VNNI + constexpr unsigned int VnniFactor = 1; +#endif // VNNI + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + constexpr size_t NCache1 = 32; + constexpr size_t KCache1 = 32; + test(matrix_size); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + constexpr size_t NCache1 = 4 * /*TN*/ 16; + constexpr size_t KCache1 = 16; + test(matrix_size); +#if (!defined(SG_SZ) || SG_SZ != 32) + // These combination are not currently supported for subgroup size = 32 in + // IGC + test(matrix_size); + test(matrix_size); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + constexpr size_t NCache1 = 4 * /*TN*/ 8; + constexpr size_t KCache1 = 16; + + test(matrix_size); + // test(matrix_size); + break; + } + } + return 0; +} From 11aa43c26883b62205c89e01ae9f9f9eb7f677d4 Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Fri, 20 Sep 2024 08:11:10 -0700 Subject: [PATCH 5/8] reuse joint_matmul from joint_matrix_bf16_fill_k_cache_impl.hpp in joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp --- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 10 +- ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp | 6 +- ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 127 ++---------------- 3 files changed, 21 insertions(+), 122 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 47cfab5506187..b4a902a5b2281 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -14,9 +14,9 @@ #endif // number of test iterations -constexpr unsigned int testIterations = 100; +extern constexpr unsigned int testIterations = 100; // start recording time after X iterations -constexpr unsigned int recordThresh = 10; +extern constexpr unsigned int recordThresh = 10; #ifndef MATRIX_SIZE #define MATRIX_SIZE 256 @@ -46,9 +46,9 @@ template < double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i #ifdef ARG_DIM - , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB + , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB #endif // ARG_DIM - ) { + ) { size_t sgSize = get_sg_size>(q); range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; @@ -355,6 +355,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i return duration.count(); } +#ifndef EXCLUDE_MAIN_TEST template @@ -482,3 +483,4 @@ int main() { } return 0; } +#endif //EXCLUDE_MAIN_TEST diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp index 30ba7db3d227e..4c9766bc1751f 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp @@ -18,5 +18,9 @@ // -ffp-model=precise is added to not depend on compiler defaults. +#define EXCLUDE_MAIN_TEST 1 +#define ARG_DIM 1 + #include "common.hpp" -#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp" \ No newline at end of file +#include "joint_matrix_bf16_fill_k_cache_impl.hpp" +#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp index cbfb4de8f18aa..495185f46fbe4 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp @@ -8,123 +8,8 @@ #include #include - -// number of test iterations -constexpr unsigned int testIterations = 100; -// start recording time after X iterations -constexpr unsigned int recordThresh = 10; - template class MatMul; -template -double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i, - size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { - - size_t sgSize = get_sg_size>(q); - range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize}; - range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize}; - - // throw error if padding needed - assert(colsA == rowsB); - assert(rowsA % TM == 0); - assert(colsA % TK == 0); - assert(colsB % TN == 0); - // submit main kernel - std::chrono::high_resolution_clock::time_point start = - std::chrono::high_resolution_clock::now(); - - q.submit([&](handler &h) { - h.parallel_for>( // cache layer#1 - nd_range<2>{global, cachelocal}, - // loop global - // loop localrange - [=](nd_item<2> it) - { - // sg::load and sg::store expect decorations to be ON - auto pA = - address_space_cast(A); - auto pB = - address_space_cast(B); - auto pC = - address_space_cast(C); - auto m2 = it.get_group(0); - auto n2 = it.get_group(1); - auto m1 = it.get_local_id(0); - auto n1 = it.get_local_id(1) / sgSize; - auto sg = it.get_sub_group(); - - joint_matrix - tC[MCache1 / TM][NCache1 / TN]; - - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_fill(sg, tC[m][n], 0); - } - } - - for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) { - joint_matrix - tA[MCache1 / TM][KCache2 / KCache1]; -#ifdef VNNI - joint_matrix - tB[NCache1 / TN][KCache2 / KCache1]; -#else // VNNI - joint_matrix - tB[NCache1 / TN][KCache2 / KCache1]; -#endif // VNNI - - for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) { - unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK; - for (unsigned int m = 0; m < MCache1 / TM; m++) { - joint_matrix_load( - sg, tA[m][k1], - pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA + - k * TK, - colsA); - } - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_load( - sg, tB[n][k1], - pB + (k * TK / vnniFactor) * (colsB * vnniFactor) + - (n2 * NCache2 + n1 * NCache1 + n * TN) * vnniFactor, - colsB * vnniFactor); - } // n - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1], - tC[m][n]); - } // n - } // m - } // k1 - } // for k2 - - for (unsigned int m = 0; m < MCache1 / TM; m++) { - for (unsigned int n = 0; n < NCache1 / TN; n++) { - joint_matrix_store( - sg, tC[m][n], - pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB + - (n2 * NCache2 + n1 * NCache1 + n * TN), - colsB, layout::row_major); - } // n - } // m - }); // parallel_for - }); // queue.submit - - if (i == testIterations - 1) - q.wait(); - std::chrono::duration duration = - std::chrono::high_resolution_clock::now() - start; - - return duration.count(); -} - template @@ -164,7 +49,8 @@ void test(size_t matrix_size) { double duration = joint_matmul - (A, B, C, q, i, matrix_size, matrix_size, matrix_size, matrix_size); + (A, B, C, q, i, + matrix_size, matrix_size, matrix_size, matrix_size); if (i >= recordThresh) { totalDuration += duration; @@ -189,7 +75,14 @@ void test(size_t matrix_size) { int main(int argc, char *argv[]) { size_t matrix_size; - matrix_size = std::stoul(argv[1]); + + // Check for command line argument + if (argc == 2) { + matrix_size = std::stoul(argv[1]); + } else { + std::cerr << "Usage: ./program matrix_size\n"; + return 1; // Error if no argument + } queue q; std::vector combinations = From beb49164183305765ec811f5d4c3f3715085b029 Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Fri, 20 Sep 2024 08:16:37 -0700 Subject: [PATCH 6/8] remove extern for testIterations and recordThresh --- sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index b4a902a5b2281..cfbf93dcddbed 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -14,9 +14,9 @@ #endif // number of test iterations -extern constexpr unsigned int testIterations = 100; +constexpr unsigned int testIterations = 100; // start recording time after X iterations -extern constexpr unsigned int recordThresh = 10; +constexpr unsigned int recordThresh = 10; #ifndef MATRIX_SIZE #define MATRIX_SIZE 256 From 694db15dc371169a57644f864741a635d463bc8c Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Fri, 20 Sep 2024 09:41:32 -0700 Subject: [PATCH 7/8] remove the link to jira --- .../Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp index 4c9766bc1751f..b55894fe2e858 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp @@ -6,8 +6,6 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix - -// https://jira.devtools.intel.com/browse/GSD-9716 // XFAIL: arch-intel_gpu_pvc // RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI From 4b67488cc3f6f710d4af11793d2d27f75e619325 Mon Sep 17 00:00:00 2001 From: "Zhang, Yixing" Date: Fri, 20 Sep 2024 14:04:59 -0700 Subject: [PATCH 8/8] remove joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp --- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 96 ++++++------ ...t_matrix_bf16_fill_k_cache_runtime_dim.cpp | 8 +- ...rix_bf16_fill_k_cache_runtime_dim_impl.hpp | 142 ------------------ 3 files changed, 56 insertions(+), 190 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index cfbf93dcddbed..db8ddafba61a1 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -37,17 +37,16 @@ static constexpr void manually_unroll_loop(F &&f) { template class MatMul; template < -#ifndef ARG_DIM +#if !defined(ARG_DIM) && !defined(RUNTIME_DIM) size_t rowsA, size_t colsA, size_t rowsB, size_t colsB, -#endif // ARG_DIM +#endif // ARG_DIM, RUNTIME_DIM size_t vnniFactor, typename TOperand, typename TResult, size_t TM, size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1, size_t MCache2, size_t NCache2, size_t KCache2> - double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i -#ifdef ARG_DIM +#if defined(ARG_DIM) || defined(RUNTIME_DIM) , size_t rowsA, size_t colsA, size_t rowsB, size_t colsB -#endif // ARG_DIM +#endif // ARG_DIM, RUNTIME_DIM ) { size_t sgSize = get_sg_size>(q); @@ -296,8 +295,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i #ifdef PREFETCH auto prefetch_offsetA = (m2 * MCache2 + sgId * prefRow) * colsA + (k2 + prefDistance) * prefCol; - if ((prefetch_offsetA + (prefRow * MATRIX_SIZE) + prefCol) < - (MATRIX_SIZE * MATRIX_SIZE)) + if ((prefetch_offsetA + (prefRow * colsA) + prefCol) < + (rowsA * colsA)) joint_matrix_prefetch( sg, A + prefetch_offsetA, colsA, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); @@ -307,8 +306,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i pm1B * prefRow) * (colsB)*vnniFactor + (n2 * NCache2 * vnniFactor + pn1B * prefCol); - if ((prefetch_offsetB + (prefRow * MATRIX_SIZE * vnniFactor) + - prefCol) < (MATRIX_SIZE * MATRIX_SIZE)) + if ((prefetch_offsetB + (prefRow * colsA * vnniFactor) + + prefCol) < (rowsA * colsA)) joint_matrix_prefetch( sg, B + prefetch_offsetB, colsB * vnniFactor, layout::row_major, @@ -355,35 +354,34 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i return duration.count(); } -#ifndef EXCLUDE_MAIN_TEST template -void test() { - assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN && +void test(size_t matrix_size) { + assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN && "invalid matrix size"); - assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 && - (MATRIX_SIZE % TK) == 0 && + assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 && + (matrix_size % TK) == 0 && "invalid matrix size detected: not a multiple of "); std::cout << "Testing: " << TM << " x " << TN << " x " << TK << " [TM x TN x TK]" << std::endl; queue q; - T *A = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - T *B = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *C = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - TResult *refC = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); + T *A = malloc_shared(matrix_size * matrix_size, q); + T *B = malloc_shared(matrix_size * matrix_size, q); + TResult *C = malloc_shared(matrix_size * matrix_size, q); + TResult *refC = malloc_shared(matrix_size * matrix_size, q); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, A, T(1)); - matrix_rand(MATRIX_SIZE, MATRIX_SIZE, B, T(1)); + matrix_rand(matrix_size, matrix_size, A, T(1)); + matrix_rand(matrix_size, matrix_size, B, T(1)); - matrix_multiply_ref(A, B, refC, MATRIX_SIZE, MATRIX_SIZE, - MATRIX_SIZE); + matrix_multiply_ref(A, B, refC, matrix_size, matrix_size, + matrix_size); #ifdef VNNI - T *vnniB = malloc_shared(MATRIX_SIZE * MATRIX_SIZE, q); - matrix_vnni(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, vnniFactor); + T *vnniB = malloc_shared(matrix_size * matrix_size, q); + matrix_vnni(matrix_size, matrix_size, B, vnniB, vnniFactor); free(B, q); B = vnniB; #endif @@ -394,15 +392,15 @@ void test() { double duration = joint_matmul< -#ifndef ARG_DIM +#if !defined(ARG_DIM) && !defined(RUNTIME_DIM) MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, -#endif // ARG_DIM +#endif // ARG_DIM, RUNTIME_DIM vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1, KCache1, MCache2, NCache2, KCache2> (A, B, C, q, i -#ifdef ARG_DIM - , MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE -#endif // ARG_DIM +#if defined(ARG_DIM) || defined(RUNTIME_DIM) + , matrix_size, matrix_size, matrix_size, matrix_size +#endif // ARG_DIM, RUNTIME_DIM ); if (i >= recordThresh) { @@ -410,14 +408,14 @@ void test() { } } - assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC)); + assert(matrix_compare(matrix_size, matrix_size, C, refC)); double msecPerMatrixMul = totalDuration / static_cast(testIterations - recordThresh); - double gflops = (2.f * MATRIX_SIZE * MATRIX_SIZE * MATRIX_SIZE * 1.0e-9f) / + double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) / (msecPerMatrixMul / 1000.f); - std::cout << "DONE for size " << MATRIX_SIZE << std::endl; + std::cout << "DONE for size " << matrix_size << std::endl; std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; free(A, q); @@ -426,7 +424,23 @@ void test() { free(refC, q); } -int main() { +int main( +#ifdef RUNTIME_DIM + int argc, char *argv[] +#endif //RUNTIME_DIM + ) { + +size_t matrix_size = MATRIX_SIZE; +#ifdef RUNTIME_DIM + // Check for command line argument + if (argc == 2) { + matrix_size = std::stoul(argv[1]); + } else { + std::cerr << "Usage: ./program matrix_size\n"; + return 1; // Error if no argument + } +#endif //RUNTIME_DIM + queue q; std::vector combinations = q.get_device() @@ -449,7 +463,7 @@ int main() { constexpr size_t NCache1 = 32; constexpr size_t KCache1 = 32; test(); + MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); break; } @@ -457,14 +471,14 @@ int main() { constexpr size_t NCache1 = 4 * /*TN*/ 16; constexpr size_t KCache1 = 16; test(); + NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); #if (!defined(SG_SZ) || SG_SZ != 32) // These combination are not currently supported for subgroup size = 32 in // IGC test(); + MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); test(); + MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); #endif break; } @@ -474,13 +488,11 @@ int main() { constexpr size_t KCache1 = 16; test(); - // test(); + NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size); + // test(matrix_size); break; } } return 0; } -#endif //EXCLUDE_MAIN_TEST diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp index b55894fe2e858..857c47b04ed56 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp @@ -8,17 +8,13 @@ // REQUIRES: aspect-ext_intel_matrix // XFAIL: arch-intel_gpu_pvc -// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI +// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DRUNTIME_DIM -DVNNI // RUN: %{run} %t_runtime_dim_vnni.out -// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise +// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise -DRUNTIME_DIM // RUN: %{run} %t_runtime_dim.out // -ffp-model=precise is added to not depend on compiler defaults. -#define EXCLUDE_MAIN_TEST 1 -#define ARG_DIM 1 - #include "common.hpp" #include "joint_matrix_bf16_fill_k_cache_impl.hpp" -#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp deleted file mode 100644 index 495185f46fbe4..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp +++ /dev/null @@ -1,142 +0,0 @@ -//------------------------------------------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===-------------------------------------------------------------------------===// - -#include -#include -template class MatMul; - -template -void test(size_t matrix_size) { - assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN && - "invalid matrix size"); - assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 && - (matrix_size % TK) == 0 && - "invalid matrix size detected: not a multiple of "); - - std::cout << "Testing: " << TM << " x " << TN << " x " << TK - << " [TM x TN x TK]" << std::endl; - - queue q; - T *A = malloc_shared(matrix_size * matrix_size, q); - T *B = malloc_shared(matrix_size * matrix_size, q); - TResult *C = malloc_shared(matrix_size * matrix_size, q); - TResult *refC = malloc_shared(matrix_size * matrix_size, q); - - matrix_rand(matrix_size, matrix_size, A, T(1)); - matrix_rand(matrix_size, matrix_size, B, T(1)); - - matrix_multiply_ref(A, B, refC, matrix_size, matrix_size, - matrix_size); - -#ifdef VNNI - T *vnniB = malloc_shared(matrix_size * matrix_size, q); - matrix_vnni(matrix_size, matrix_size, B, vnniB, vnniFactor); - free(B, q); - B = vnniB; -#endif - - // run testIterations time, aggregate and calculate average run time - double totalDuration = 0; - for (unsigned int i = 0; i < testIterations; i++) { - - double duration = - joint_matmul - (A, B, C, q, i, - matrix_size, matrix_size, matrix_size, matrix_size); - - if (i >= recordThresh) { - totalDuration += duration; - } - } - - assert(matrix_compare(matrix_size, matrix_size, C, refC)); - - double msecPerMatrixMul = - totalDuration / static_cast(testIterations - recordThresh); - double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) / - (msecPerMatrixMul / 1000.f); - - std::cout << "DONE for size " << matrix_size << std::endl; - std::cout << "GOPS is " << gflops << " Gop/s" << std::endl; - - free(A, q); - free(B, q); - free(C, q); - free(refC, q); -} - -int main(int argc, char *argv[]) { - size_t matrix_size; - - // Check for command line argument - if (argc == 2) { - matrix_size = std::stoul(argv[1]); - } else { - std::cerr << "Usage: ./program matrix_size\n"; - return 1; // Error if no argument - } - - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - constexpr size_t MCache1 = 32; - constexpr size_t MCache2 = 256; - constexpr size_t NCache2 = 256; - constexpr size_t KCache2 = 32; - -#ifdef VNNI - constexpr unsigned int VnniFactor = 2; -#else // VNNI - constexpr unsigned int VnniFactor = 1; -#endif // VNNI - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - constexpr size_t NCache1 = 32; - constexpr size_t KCache1 = 32; - test(matrix_size); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - constexpr size_t NCache1 = 4 * /*TN*/ 16; - constexpr size_t KCache1 = 16; - test(matrix_size); -#if (!defined(SG_SZ) || SG_SZ != 32) - // These combination are not currently supported for subgroup size = 32 in - // IGC - test(matrix_size); - test(matrix_size); -#endif - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - constexpr size_t NCache1 = 4 * /*TN*/ 8; - constexpr size_t KCache1 = 16; - - test(matrix_size); - // test(matrix_size); - break; - } - } - return 0; -}