
add tests for matrix size as a runtime dimension #15429

Draft · wants to merge 8 commits into base: sycl
20 changes: 20 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp
@@ -0,0 +1,20 @@
//==--- joint_matrix_bf16_fill_k_cache_arg_dim.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_arg_dim_vnni.out -ffp-model=precise -DARG_DIM -DVNNI
// RUN: %{run} %t_arg_dim_vnni.out

// RUN: %{build} -o %t_arg_dim.out -ffp-model=precise -DARG_DIM
// RUN: %{run} %t_arg_dim.out

// -ffp-model=precise is used so the test does not depend on compiler
// floating-point defaults.

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
33 changes: 27 additions & 6 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -36,11 +36,20 @@ static constexpr void manually_unroll_loop(F &&f) {

template <size_t TM, size_t TN, size_t TK> class MatMul;

template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
template <
#ifndef ARG_DIM
size_t rowsA, size_t colsA, size_t rowsB, size_t colsB,
#endif // ARG_DIM
size_t vnniFactor, typename TOperand, typename TResult, size_t TM,
size_t TN, size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {

double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i
#ifdef ARG_DIM
, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB
#endif // ARG_DIM
) {

size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
@@ -346,6 +355,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
return duration.count();
}

#ifndef EXCLUDE_MAIN_TEST
template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
@@ -381,10 +391,20 @@ void test() {
// run testIterations time, aggregate and calculate average run time
double totalDuration = 0;
for (unsigned int i = 0; i < testIterations; i++) {
double duration =
joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>(A, B, C, q, i);

double duration =
joint_matmul<
#ifndef ARG_DIM
MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
#endif // ARG_DIM
vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>
(A, B, C, q, i
#ifdef ARG_DIM
, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE
#endif // ARG_DIM
);

if (i >= recordThresh) {
totalDuration += duration;
}
@@ -463,3 +483,4 @@ int main() {
}
return 0;
}
#endif //EXCLUDE_MAIN_TEST
26 changes: 26 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
@@ -0,0 +1,26 @@
//==--- joint_matrix_bf16_fill_k_cache_runtime_dim.cpp - DPC++ joint_matrix--------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix

// https://jira.devtools.intel.com/browse/GSD-9716
// XFAIL: arch-intel_gpu_pvc

// RUN: %{build} -o %t_runtime_dim_vnni.out -ffp-model=precise -DVNNI
// RUN: %{run} %t_runtime_dim_vnni.out

// RUN: %{build} -o %t_runtime_dim.out -ffp-model=precise
// RUN: %{run} %t_runtime_dim.out

// -ffp-model=precise is used so the test does not depend on compiler
// floating-point defaults.

#define EXCLUDE_MAIN_TEST 1
#define ARG_DIM 1

#include "common.hpp"
#include "joint_matrix_bf16_fill_k_cache_impl.hpp"
#include "joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp"
142 changes: 142 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim_impl.hpp
@@ -0,0 +1,142 @@
//------------------------------------------------------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===-------------------------------------------------------------------------===//

#include <random>
#include <sycl/usm.hpp>
template <size_t TM, size_t TN, size_t TK> class MatMul;

template <typename T, typename TResult, size_t vnniFactor, size_t TM, size_t TN,
size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
size_t MCache2, size_t NCache2, size_t KCache2>
void test(size_t matrix_size) {
@YuriPlyakhin (Contributor) commented on Sep 20, 2024:
I suggest not to introduce this file with mostly duplicated code.
Instead, can we add one more macro in the sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp, like

#if runtime_matrix_size
    void test(size_t matrix_size) {
#else
    void test() {
        constexpr size_t matrix_size = MATRIX_SIZE;
#endif

and propagate the change up to main and down to the kernel call?
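For illustration only, here is a minimal self-contained sketch of the pattern being suggested; the macro name RUNTIME_MATRIX_SIZE and the default size are placeholders, not the test's actual identifiers:

#include <cstddef>
#include <iostream>

#ifndef RUNTIME_MATRIX_SIZE
constexpr std::size_t MATRIX_SIZE = 256; // placeholder compile-time default
#endif

#ifdef RUNTIME_MATRIX_SIZE
void test(std::size_t matrix_size) {
#else
void test() {
  constexpr std::size_t matrix_size = MATRIX_SIZE;
#endif
  // The body is shared; only how matrix_size is bound differs between builds.
  std::cout << "matrix_size = " << matrix_size << "\n";
}

int main() {
#ifdef RUNTIME_MATRIX_SIZE
  test(1024); // size supplied at runtime
#else
  test();     // size fixed at compile time
#endif
  return 0;
}

The same translation unit compiles either way, so the shared test body is written once and only the binding of matrix_size changes.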

@YixingZhang007 (Contributor, Author) replied:

Thanks for the suggestion! I have modified main to include the following:

int main(
#ifdef RUNTIME_DIM
  int argc, char *argv[]
#endif //RUNTIME_DIM
  ) {

size_t matrix_size = MATRIX_SIZE;
#ifdef RUNTIME_DIM
  // Check for command line argument
  if (argc == 2) {
    matrix_size = std::stoul(argv[1]);
  } else {
    std::cerr << "Usage: ./program matrix_size\n";
    return 1; // Error if no argument
  }
#endif //RUNTIME_DIM

and modified test as below:

void test(size_t matrix_size) 

This helps eliminate duplicate code for calling the test function in main. Please let me know if you'd prefer another approach instead (having two interfaces for the test function and different calls to test in main).
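As a rough end-to-end sketch of that flow (all names below are illustrative stand-ins, not the test's real templates or kernel), the size parsed in main is simply threaded through test into the matmul entry point as ordinary runtime arguments:

#include <cstddef>
#include <iostream>
#include <string>

// Stand-in for the real SYCL kernel launcher: in the runtime-dimension
// build, the four matrix dimensions arrive as function arguments rather
// than template parameters.
double joint_matmul_stub(std::size_t rowsA, std::size_t colsA,
                         std::size_t rowsB, std::size_t colsB) {
  return static_cast<double>(rowsA * colsB); // placeholder "duration"
}

void test(std::size_t matrix_size) {
  joint_matmul_stub(matrix_size, matrix_size, matrix_size, matrix_size);
}

int main(int argc, char *argv[]) {
  std::size_t matrix_size = 256; // stand-in for MATRIX_SIZE
#ifdef RUNTIME_DIM
  // Runtime-dimension build: the size must come from the command line.
  if (argc == 2) {
    matrix_size = std::stoul(argv[1]);
  } else {
    std::cerr << "Usage: ./program matrix_size\n";
    return 1;
  }
#endif // RUNTIME_DIM
  test(matrix_size);
  return 0;
}

Under this scheme the runtime-dimension binary would be invoked as, e.g., ./program 1024, while the default build ignores the command line entirely.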

assert(matrix_size >= TM && matrix_size >= TK && matrix_size >= TN &&
"invalid matrix size");
assert((matrix_size % TM) == 0 && (matrix_size % TN) == 0 &&
(matrix_size % TK) == 0 &&
"invalid matrix size detected: not a multiple of <TM,TN,TK>");

std::cout << "Testing: " << TM << " x " << TN << " x " << TK
<< " [TM x TN x TK]" << std::endl;

queue q;
T *A = malloc_shared<T>(matrix_size * matrix_size, q);
T *B = malloc_shared<T>(matrix_size * matrix_size, q);
TResult *C = malloc_shared<TResult>(matrix_size * matrix_size, q);
TResult *refC = malloc_shared<TResult>(matrix_size * matrix_size, q);

matrix_rand<T>(matrix_size, matrix_size, A, T(1));
matrix_rand<T>(matrix_size, matrix_size, B, T(1));

matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, matrix_size, matrix_size,
matrix_size);

#ifdef VNNI
T *vnniB = malloc_shared<T>(matrix_size * matrix_size, q);
matrix_vnni<T>(matrix_size, matrix_size, B, vnniB, vnniFactor);
free(B, q);
B = vnniB;
#endif

// run testIterations time, aggregate and calculate average run time
double totalDuration = 0;
for (unsigned int i = 0; i < testIterations; i++) {

double duration =
joint_matmul<vnniFactor, T, TResult, TM, TN, TK, MCache1, NCache1,
KCache1, MCache2, NCache2, KCache2>
(A, B, C, q, i,
matrix_size, matrix_size, matrix_size, matrix_size);

if (i >= recordThresh) {
totalDuration += duration;
}
}

assert(matrix_compare(matrix_size, matrix_size, C, refC));

double msecPerMatrixMul =
totalDuration / static_cast<double>(testIterations - recordThresh);
double gflops = (2.f * matrix_size * matrix_size * matrix_size * 1.0e-9f) /
(msecPerMatrixMul / 1000.f);

std::cout << "DONE for size " << matrix_size << std::endl;
std::cout << "GOPS is " << gflops << " Gop/s" << std::endl;

free(A, q);
free(B, q);
free(C, q);
free(refC, q);
}

int main(int argc, char *argv[]) {
size_t matrix_size;

// Check for command line argument
if (argc == 2) {
matrix_size = std::stoul(argv[1]);
} else {
std::cerr << "Usage: ./program matrix_size\n";
return 1; // Error if no argument
}

queue q;
std::vector<combination> combinations =
q.get_device()
.get_info<sycl::ext::oneapi::experimental::info::device::
matrix_combinations>();

constexpr size_t MCache1 = 32;
constexpr size_t MCache2 = 256;
constexpr size_t NCache2 = 256;
constexpr size_t KCache2 = 32;

#ifdef VNNI
constexpr unsigned int VnniFactor = 2;
#else // VNNI
constexpr unsigned int VnniFactor = 1;
#endif // VNNI

for (unsigned int i = 0; i < combinations.size(); i++) {
if (combinations[i].nsize == 0) { // Intel AMX
constexpr size_t NCache1 = 32;
constexpr size_t KCache1 = 32;
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}

if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
constexpr size_t NCache1 = 4 * /*TN*/ 16;
constexpr size_t KCache1 = 16;
test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#if (!defined(SG_SZ) || SG_SZ != 32)
// These combinations are not currently supported for subgroup size = 32
// in IGC
test<bfloat16, float, VnniFactor, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
#endif
break;
}

if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
constexpr size_t NCache1 = 4 * /*TN*/ 8;
constexpr size_t KCache1 = 16;

test<bfloat16, float, VnniFactor, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1,
NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
// test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 32, /*TK*/ 16,
// MCache1,
// NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
break;
}
}
return 0;
}