Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove caching effects in the Benchmarks #136

Open
wants to merge 4 commits into
base: sycl-develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 59 additions & 14 deletions benchmarks/common/benchmark_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,21 @@

using namespace cute;

namespace cutlass {
/// Returns the size in bytes of the device's last-level cache (LLC).
///
/// Used by the benchmark runner to size the pool of rotating input
/// buffers so that successive iterations do not hit a warm cache.
///
/// - SYCL build: queries the default queue's device for
///   `global_mem_cache_size` (the global memory cache on that device).
/// - CUDA build: reports the L2 cache size of device 0.
///
/// @return LLC size in bytes for the active device.
/// @throws std::runtime_error if the CUDA device query fails.
///
/// NOTE: `inline` is required here — this function is *defined* in a
/// header, so without it every translation unit including this header
/// would emit its own definition, violating the ODR and causing
/// multiple-definition link errors.
inline std::size_t get_llc_size() {
#if defined(CUTLASS_ENABLE_SYCL)
  return syclcompat::get_default_queue().get_device().get_info<sycl::info::device::global_mem_cache_size>();
#else
  cudaDeviceProp prop_struct;
  auto result = cudaGetDeviceProperties(&prop_struct, 0);
  if (result != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(result));
  }
  return static_cast<std::size_t>(prop_struct.l2CacheSize);
#endif
}
}

namespace cutlass::benchmark {

///////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -158,6 +173,8 @@ struct BenchmarkRunnerGemm {

using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;

int32_t count;

//
// Data members
//
Expand All @@ -170,9 +187,9 @@ struct BenchmarkRunnerGemm {

uint64_t seed;

DeviceAllocation<ElementA> block_A;
DeviceAllocation<ElementB> block_B;
DeviceAllocation<ElementC> block_C;
std::vector<DeviceAllocation<ElementA>> block_A;
std::vector<DeviceAllocation<ElementB>> block_B;
std::vector<DeviceAllocation<ElementC>> block_C;
DeviceAllocation<ElementOutput> block_D;
DeviceAllocation<ElementOutput> block_ref_D;

Expand All @@ -185,9 +202,9 @@ struct BenchmarkRunnerGemm {
bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
auto [M, N, K, L] = problem_size;

TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
TensorRef ref_A(block_A[0].get(), LayoutA::packed({M, K}));
TensorRef ref_B(block_B[0].get(), LayoutB::packed({K, N}));
TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N}));
TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));

reference::device::GemmComplex(
Expand Down Expand Up @@ -231,15 +248,28 @@ struct BenchmarkRunnerGemm {
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));

block_A.reset(M * K * L);
block_B.reset(K * N * L);
block_C.reset(M * N * L);
std::size_t mem_occupied_ABC = (M * K * L * sizeof(ElementA)) + (K * N * L * sizeof(ElementB)) +
(M * N * L * sizeof(ElementC));
count = std::ceil(static_cast<float>(cutlass::get_llc_size()) / static_cast<float>(mem_occupied_ABC));

for(int i=0; i < count; i++) {
block_A.emplace_back();
block_B.emplace_back();
block_C.emplace_back();
}

for (int i=0; i < count; i++) {
block_A[i].reset(M * K * L);
block_B[i].reset(K * N * L);
block_C[i].reset(M * N * L);
initialize_block(block_A[i], seed + i);
initialize_block(block_B[i], seed + i);
initialize_block(block_C[i], seed + i);
}

block_D.reset(M * N * L);
block_ref_D.reset(M * N * L);

initialize_block(block_A, seed + 2023);
initialize_block(block_B, seed + 2022);
initialize_block(block_C, seed + 2021);
}

void run(::benchmark::State& state, const Options& options, const KernelHardwareInfo& hw_info) {
Expand All @@ -250,8 +280,8 @@ struct BenchmarkRunnerGemm {
typename Gemm::GemmKernel::Arguments arguments{
gemm::GemmUniversalMode::kGemm,
problem_size,
{block_A.get(), stride_A, block_B.get(), stride_B},
{{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
{block_A[0].get(), stride_A, block_B[0].get(), stride_B},
{{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D},
hw_info
};

Expand Down Expand Up @@ -284,14 +314,29 @@ struct BenchmarkRunnerGemm {
(options.k * options.n) * sizeof(ElementB) +
(options.beta != 0 ? 2 : 1) * (options.m * options.n) * sizeof(ElementC)) * 1e-9) *
options.l;

int32_t counter = 0;
initialize_counters(state);
for(auto _ : state) {
state.PauseTiming();
int input_num = std::max(int(0), (counter % count) - 1);
typename Gemm::GemmKernel::Arguments arguments{
gemm::GemmUniversalMode::kGemm,
problem_size,
{block_A[input_num].get(), stride_A, block_B[input_num].get(), stride_B},
{{options.alpha, options.beta}, block_C[input_num].get(), stride_C, block_D.get(), stride_D},
hw_info
};
gemm_op.initialize(arguments, workspace.get());
state.ResumeTiming();

GPU_Clock timer;
timer.start();
gemm_op.run();
auto ms_elapsed = timer.milliseconds();
update_counters(state, ms_elapsed, tflop, giga_bytes_transferred);
state.SetIterationTime(ms_elapsed / 1000);
counter++;
}
finalize_counters(state, tflop, giga_bytes_transferred);
}
Expand Down