From 25a88ce2683dcc97ab825b608a7f4b8306072b6a Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 9 Sep 2024 13:02:38 +0100 Subject: [PATCH 1/3] use memset to invalidate cache --- benchmarks/common/benchmark_runner.hpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 7bac43119..242534b85 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -56,6 +56,19 @@ using namespace cute; +namespace cutlass { + void memset(void* ptr, int val, std::size_t num_bytes) { + #if defined(CUTLASS_ENABLE_SYCL) + syclcompat::memset(ptr, val, num_bytes); + #else + auto cuda_result = cudaMemset(ptr, val, num_bytes); + if (cuda_result != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(cuda_result)); + } + #endif + } +} + namespace cutlass::benchmark { /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -284,8 +297,18 @@ struct BenchmarkRunnerGemm { (options.k * options.n) * sizeof(ElementB) + (options.beta != 0 ? 2 : 1) * (options.m * options.n) * sizeof(ElementC)) * 1e-9) * options.l; - initialize_counters(state); + + int32_t counter = 0; for(auto _ : state) { + + state.PauseTiming(); + // Invalidate LLC by changing the data in the global pointer to random data, as verification is not required + // initialize_block is not being used beacuse it would otherwise be too slow. + cutlass::memset(block_A.get(), 3 * counter + 1, block_A.size() * sizeof(ElementA)); + cutlass::memset(block_B.get(), 3 * counter + 2, block_B.size() * sizeof(ElementB)); + cutlass::memset(block_C.get(), 3 * counter + 3, block_C.size() * sizeof(ElementC)); + state.ResumeTiming(); + GPU_Clock timer; timer.start(); gemm_op.run(); From 590bebf80a41e9f636b174fb918abdc9c4fcd16e Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 9 Sep 2024 14:37:04 +0100 Subject: [PATCH 2/3] add back mistakenly deleted initialize_counters call --- benchmarks/common/benchmark_runner.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 242534b85..0751355a7 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -299,6 +299,7 @@ struct BenchmarkRunnerGemm { options.l; int32_t counter = 0; + initialize_counters(state); for(auto _ : state) { state.PauseTiming(); @@ -315,6 +316,7 @@ struct BenchmarkRunnerGemm { auto ms_elapsed = timer.milliseconds(); update_counters(state, ms_elapsed, tflop, giga_bytes_transferred); state.SetIterationTime(ms_elapsed / 1000); + counter++; } finalize_counters(state, tflop, giga_bytes_transferred); } From e308b9ac17a0525b0a4b14487306160c10bd8127 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Tue, 10 Sep 2024 14:32:14 +0100 Subject: [PATCH 3/3] use proper data and multiple allocations for avoiding hot cache --- benchmarks/common/benchmark_runner.hpp | 72 ++++++++++++++++---------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 0751355a7..bc727b4b5 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -57,14 +57,16 @@ using namespace cute; namespace cutlass { - void memset(void* ptr, int val, std::size_t num_bytes) { + std::size_t get_llc_size() { #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::memset(ptr, val, num_bytes); + return syclcompat::get_default_queue().get_device().get_info(); #else - auto cuda_result = cudaMemset(ptr, val, num_bytes); - if (cuda_result != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(cuda_result)); - } + cudaDeviceProp prop_struct; + auto result = cudaGetDeviceProperties(&prop_struct, 0); + if (result != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(result)); + } + return static_cast(prop_struct.l2CacheSize); #endif } } @@ -171,6 +173,8 @@ struct BenchmarkRunnerGemm { using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + int32_t count; + // // Data members // @@ -183,9 +187,9 @@ struct BenchmarkRunnerGemm { uint64_t seed; - DeviceAllocation block_A; - DeviceAllocation block_B; - DeviceAllocation block_C; + std::vector> block_A; + std::vector> block_B; + std::vector> block_C; DeviceAllocation block_D; DeviceAllocation block_ref_D; @@ -198,9 +202,9 @@ struct BenchmarkRunnerGemm { bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { auto [M, N, K, L] = problem_size; - TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); - TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); - TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + TensorRef ref_A(block_A[0].get(), LayoutA::packed({M, K})); + TensorRef ref_B(block_B[0].get(), LayoutB::packed({K, N})); + TensorRef ref_C(block_C[0].get(), LayoutC::packed({M, N})); TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); reference::device::GemmComplex( @@ -244,15 +248,28 @@ struct BenchmarkRunnerGemm { stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - block_A.reset(M * K * L); - block_B.reset(K * N * L); - block_C.reset(M * N * L); + std::size_t mem_occupied_ABC = (M * K * L * sizeof(ElementA)) + (K * N * L * sizeof(ElementB)) + + (M * N * L * sizeof(ElementC)); + count = std::ceil(static_cast(cutlass::get_llc_size()) / static_cast(mem_occupied_ABC)); + + for(int i=0; i < count; i++) { + block_A.emplace_back(); + block_B.emplace_back(); + block_C.emplace_back(); + } + + for (int i=0; i < count; i++) { + block_A[i].reset(M * K * L); + block_B[i].reset(K * N * L); + block_C[i].reset(M * N * L); + initialize_block(block_A[i], seed + i); + initialize_block(block_B[i], seed + i); + initialize_block(block_C[i], seed + i); + } + block_D.reset(M * N * L); block_ref_D.reset(M * N * L); - initialize_block(block_A, seed + 2023); - initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); } void run(::benchmark::State& state, const Options& options, const KernelHardwareInfo& hw_info) { @@ -263,8 +280,8 @@ struct BenchmarkRunnerGemm { typename Gemm::GemmKernel::Arguments arguments{ gemm::GemmUniversalMode::kGemm, problem_size, - {block_A.get(), stride_A, block_B.get(), stride_B}, - {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + {block_A[0].get(), stride_A, block_B[0].get(), stride_B}, + {{options.alpha, options.beta}, block_C[0].get(), stride_C, block_D.get(), stride_D}, hw_info }; @@ -301,13 +318,16 @@ struct BenchmarkRunnerGemm { int32_t counter = 0; initialize_counters(state); for(auto _ : state) { - state.PauseTiming(); - // Invalidate LLC by changing the data in the global pointer to random data, as verification is not required - // initialize_block is not being used beacuse it would otherwise be too slow. - cutlass::memset(block_A.get(), 3 * counter + 1, block_A.size() * sizeof(ElementA)); - cutlass::memset(block_B.get(), 3 * counter + 2, block_B.size() * sizeof(ElementB)); - cutlass::memset(block_C.get(), 3 * counter + 3, block_C.size() * sizeof(ElementC)); + int input_num = std::max(int(0), (counter % count) - 1); + typename Gemm::GemmKernel::Arguments arguments{ + gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A[input_num].get(), stride_A, block_B[input_num].get(), stride_B}, + {{options.alpha, options.beta}, block_C[input_num].get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + gemm_op.initialize(arguments, workspace.get()); state.ResumeTiming(); GPU_Clock timer;