Skip to content

Commit

Permalink
Rebase including USM implementation
Browse files Browse the repository at this point in the history
Fix rebasing issue and include tests and benchmarks for USM.
  • Loading branch information
s-Nick committed Sep 12, 2023
1 parent 71699f3 commit 4839e9a
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 121 deletions.
62 changes: 45 additions & 17 deletions benchmark/portblas/extension/omatcopy_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
index_t stride_a_mul, index_t stride_b_mul, index_t batch_size,
Expand Down Expand Up @@ -55,15 +55,21 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
batch_size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

// Host-side input and output matrices, initialised with random data.
std::vector<scalar_t> m_a =
blas_benchmark::utils::random_data<scalar_t>(size_a);
std::vector<scalar_t> m_b =
blas_benchmark::utils::random_data<scalar_t>(size_b);

auto m_a_gpu = blas::make_sycl_iterator_buffer<scalar_t>(m_a, size_a);
auto m_b_gpu = blas::make_sycl_iterator_buffer(m_b, size_b);
auto m_a_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_a, q);
auto m_b_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_b, q);

auto copy_a = blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, size_a);
auto copy_b = blas::helper::copy_to_device(q, m_b.data(), m_b_gpu, size_b);

sb_handle.wait({copy_a, copy_b});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
Expand All @@ -76,14 +82,19 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,

std::vector<scalar_t> m_b_temp = m_b;
{
auto m_b_temp_gpu =
blas::make_sycl_iterator_buffer<scalar_t>(m_b_temp, size_b);
auto m_b_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_b, q);
auto copy_tmp = blas::helper::copy_to_device<scalar_t>(
q, m_b_temp.data(), m_b_temp_gpu, size_b);

auto event = blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu,
lda, stride_a, m_b_temp_gpu, ldb,
stride_b, batch_size);
stride_b, batch_size, {copy_tmp});
sb_handle.wait(event);
auto copy_res = blas::helper::copy_to_host<scalar_t>(
q, m_b_temp_gpu, m_b_temp.data(), size_b);

sb_handle.wait();
sb_handle.wait(copy_res);
blas::helper::deallocate<mem_alloc>(m_b_temp_gpu, q);
}

std::ostringstream err_stream;
Expand Down Expand Up @@ -130,15 +141,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
auto omatcopy_batch_params =
blas_benchmark::utils::get_matcopy_batch_params<scalar_t>(args);
blas::helper::deallocate<mem_alloc>(m_a_gpu, q);
blas::helper::deallocate<mem_alloc>(m_b_gpu, q);
}

for (auto p : omatcopy_batch_params) {
template <typename scalar_t, typename blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<matcopy_batch_param_t<scalar_t>> params) {
for (auto p : params) {
std::string ts;
index_t m, n, lda_mul, ldb_mul, stride_a_mul, stride_b_mul, batch_size;
scalar_t alpha;
Expand All @@ -151,21 +163,37 @@ void register_benchmark(blas_benchmark::Args& args,
index_t lda_mul, index_t ldb_mul, index_t stride_a_mul,
index_t stride_b_mul, index_t batch_size,
bool* success) {
run<scalar_t>(st, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul,
stride_a_mul, stride_b_mul, batch_size, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, t, m, n, alpha, lda_mul,
ldb_mul, stride_a_mul, stride_b_mul, batch_size,
success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<
blas_benchmark::utils::ExtensionOp::omatcopy_batch, scalar_t,
index_t>(ts, m, n, alpha, lda_mul, ldb_mul, stride_a_mul,
stride_b_mul, batch_size)
stride_b_mul, batch_size, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul,
stride_a_mul, stride_b_mul, batch_size, success)
->UseRealTime();
}
}

/**
 * @brief Registers the omatcopy_batch benchmarks for each available memory
 * allocation type.
 *
 * The parameter sets are obtained once from @p args and then forwarded to the
 * allocation-specific register_benchmark overload: unconditionally for
 * buffers, and additionally for USM when SB_ENABLE_USM is defined.
 *
 * @tparam scalar_t Scalar type the benchmarks are instantiated for
 * @param args Benchmark arguments the parameter sets are derived from
 * @param sb_handle_ptr Handle pointer forwarded to the registration overload
 * @param success Flag pointer forwarded to the registration overload
 */
template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  namespace bench_utils = blas_benchmark::utils;
  using alloc_t = blas::helper::AllocType;

  // Read the parameter sets a single time; every memory type reuses them.
  const auto params = bench_utils::get_matcopy_batch_params<scalar_t>(args);

  register_benchmark<scalar_t, alloc_t::buffer>(
      sb_handle_ptr, success, bench_utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  register_benchmark<scalar_t, alloc_t::usm>(
      sb_handle_ptr, success, bench_utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
Expand Down
11 changes: 6 additions & 5 deletions common/include/common/benchmark_names.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,16 @@ template <ExtensionOp op, typename scalar_t, typename index_t>
inline typename std::enable_if<op == ExtensionOp::omatcopy, std::string>::type
get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul,
index_t ldb_mul, std::string mem_type) {
return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul);
return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul,
mem_type);
}

template <ExtensionOp op, typename scalar_t, typename index_t>
inline typename std::enable_if<op == ExtensionOp::omatcopy2, std::string>::type
get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul,
index_t ldb_mul, index_t inc_a, index_t inc_b, std::string mem_type) {
return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul,
inc_a, inc_b);
inc_a, inc_b, mem_type);
}

template <ExtensionOp op, typename scalar_t, typename index_t>
Expand All @@ -262,17 +263,17 @@ get_name(std::string trans_a, std::string trans_b, int m, int n, scalar_t alpha,
scalar_t beta, index_t lda_mul, index_t ldb_mul, index_t ldc_mul,
std::string mem_type) {
return internal::get_name<op, scalar_t>(trans_a, trans_b, m, n, alpha, beta,
lda_mul, ldb_mul, ldc_mul);
lda_mul, ldb_mul, ldc_mul, mem_type);
}
template <ExtensionOp op, typename scalar_t, typename index_t>
inline typename std::enable_if<op == ExtensionOp::omatcopy_batch,
std::string>::type
get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul,
index_t ldb_mul, index_t stride_a_mul, index_t stride_b_mul,
index_t batch_size) {
index_t batch_size, std::string mem_type) {
return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul,
stride_a_mul, stride_b_mul,
batch_size);
batch_size, mem_type);
}

template <ExtensionOp op, typename scalar_t, typename index_t>
Expand Down
123 changes: 57 additions & 66 deletions include/interface/extension_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,52 +61,49 @@ namespace internal {
*/
template <bool in_place, typename sb_handle_t, typename element_t,
typename index_t, typename in_t, typename out_t>
typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans,
index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in,
index_t inc_in, out_t out_memory,
index_t ld_out, index_t inc_out,
const typename sb_handle_t::event_t& _dependencies);
typename sb_handle_t::event_t _matcopy(
sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
index_t ld_out, index_t inc_out,
const typename sb_handle_t::event_t& _dependencies);

template <typename sb_handle_t, typename element_t, typename index_t,
typename container_0_t, typename container_1_t, typename container_2_t>
typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a,
char trans_b, index_t m, index_t n,
element_t alpha, container_0_t a,
index_t lda, element_t beta,
container_1_t b, index_t ldb,
container_2_t c, index_t ldc,
const typename sb_handle_t::event_t& _dependencies);
typename container_0_t, typename container_1_t,
typename container_2_t>
typename sb_handle_t::event_t _omatadd(
sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n,
element_t alpha, container_0_t a, index_t lda, element_t beta,
container_1_t b, index_t ldb, container_2_t c, index_t ldc,
const typename sb_handle_t::event_t& _dependencies);

template <bool in_place, typename element_t, typename sb_handle_t,
typename index_t, typename in_t, typename out_t>
typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
index_t n, in_t A, index_t ld_a,
out_t B, index_t ld_b,
const typename sb_handle_t::event_t& _dependencies);
typename sb_handle_t::event_t _transpose(
sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B,
index_t ld_b, const typename sb_handle_t::event_t& _dependencies);

template <bool in_place, typename sb_handle_t, typename element_t,
typename index_t, typename in_t, typename out_t>
typename sb_handle_t::event_t _matcopy_batch(
sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory,
index_t ld_out, index_t stride_out, index_t batch_size);
index_t ld_out, index_t stride_out, index_t batch_size,
const typename sb_handle_t::event_t& _dependencies);

template <uint32_t TileSize, int TilePerWG, typename sb_handle_t,
typename element_t, typename index_t, typename in_t, typename out_t>
typename sb_handle_t::event_t _matcopy_batch_impl(
sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, in_t memory,
index_t ld_in, index_t in_stride, out_t out_memory, index_t ld_out,
index_t out_stride, index_t batch_size);
index_t out_stride, index_t batch_size,
const typename sb_handle_t::event_t& _dependencies);

template <typename operator_t, typename element_t, typename sb_handle_t,
typename input_t, typename output_t, typename index_t>
typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle,
input_t buffer_in, index_t ld,
output_t buffer_out, index_t rows,
index_t cols,
reduction_dim_t reduction_dim,
const typename sb_handle_t::event_t& _dependencies);
typename sb_handle_t::event_t _reduction(
sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out,
index_t rows, index_t cols, reduction_dim_t reduction_dim,
const typename sb_handle_t::event_t& _dependencies);

template <int Tile_size, int wg_size, int cl_size, bool local_memory,
typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down Expand Up @@ -150,15 +147,13 @@ typename sb_handle_t::event_t _transpose_add_impl(
*/
template <typename sb_handle_t, typename element_t, typename index_t,
typename in_t, typename out_t>
typename sb_handle_t::event_t _omatcopy(sb_handle_t& sb_handle, char trans,
index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in,
out_t out_memory, index_t ld_out,
const typename sb_handle_t::event_t& _dependencies = {}) {
return internal::_matcopy<false>(sb_handle, trans, m, n, alpha, in_memory,
ld_in, static_cast<index_t>(1), out_memory,
ld_out, static_cast<index_t>(1),
_dependencies);
typename sb_handle_t::event_t _omatcopy(
sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in, out_t out_memory, index_t ld_out,
const typename sb_handle_t::event_t& _dependencies = {}) {
return internal::_matcopy<false>(
sb_handle, trans, m, n, alpha, in_memory, ld_in, static_cast<index_t>(1),
out_memory, ld_out, static_cast<index_t>(1), _dependencies);
}

/**
Expand Down Expand Up @@ -187,12 +182,11 @@ typename sb_handle_t::event_t _omatcopy(sb_handle_t& sb_handle, char trans,
*/
template <typename sb_handle_t, typename element_t, typename index_t,
typename in_t, typename out_t>
typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans,
index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in,
index_t inc_in, out_t out_memory,
index_t ld_out, index_t inc_out,
const typename sb_handle_t::event_t& _dependencies = {}) {
typename sb_handle_t::event_t _omatcopy2(
sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
index_t ld_out, index_t inc_out,
const typename sb_handle_t::event_t& _dependencies = {}) {
return internal::_matcopy<false>(sb_handle, trans, m, n, alpha, in_memory,
ld_in, inc_in, out_memory, ld_out, inc_out,
_dependencies);
Expand Down Expand Up @@ -220,14 +214,13 @@ typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans,
* @param ldc Matrix C leading dimension
*/
template <typename sb_handle_t, typename element_t, typename index_t,
typename container_0_t, typename container_1_t, typename container_2_t>
typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a,
char trans_b, index_t m, index_t n,
element_t alpha, container_0_t A,
index_t lda, element_t beta,
container_1_t B, index_t ldb,
container_2_t C, index_t ldc,
const typename sb_handle_t::event_t& _dependencies = {}) {
typename container_0_t, typename container_1_t,
typename container_2_t>
typename sb_handle_t::event_t _omatadd(
sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n,
element_t alpha, container_0_t A, index_t lda, element_t beta,
container_1_t B, index_t ldb, container_2_t C, index_t ldc,
const typename sb_handle_t::event_t& _dependencies = {}) {
return internal::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, A, lda,
beta, B, ldb, C, ldc, _dependencies);
}
Expand Down Expand Up @@ -289,10 +282,11 @@ template <typename sb_handle_t, typename element_t, typename index_t,
typename sb_handle_t::event_t _omatcopy_batch(
sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory,
index_t ld_out, index_t stride_out, index_t batch_size) {
index_t ld_out, index_t stride_out, index_t batch_size,
const typename sb_handle_t::event_t& _dependencies = {}) {
return internal::_matcopy_batch<false>(
sb_handle, trans, m, n, alpha, in_memory, ld_in, stride_in, out_memory,
ld_out, stride_out, batch_size);
ld_out, stride_out, batch_size, _dependencies);
}

namespace extension {
Expand All @@ -316,10 +310,9 @@ namespace extension {
*/
template <typename element_t, typename sb_handle_t, typename index_t,
typename in_t, typename out_t>
typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
index_t n, in_t A, index_t ld_in,
index_t ld_out,
const typename sb_handle_t::event_t& _dependencies = {}) {
typename sb_handle_t::event_t _transpose(
sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_in,
index_t ld_out, const typename sb_handle_t::event_t& _dependencies = {}) {
return blas::internal::_transpose<true, element_t>(sb_handle, m, n, A, ld_in,
A, ld_out, _dependencies);
}
Expand All @@ -345,24 +338,22 @@ typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
*/
template <typename element_t, typename sb_handle_t, typename index_t,
typename in_t, typename out_t>
typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
index_t n, in_t A, index_t ld_a,
out_t B, index_t ld_b,
const typename sb_handle_t::event_t& _dependencies = {}) {
typename sb_handle_t::event_t _transpose(
sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B,
index_t ld_b, const typename sb_handle_t::event_t& _dependencies = {}) {
return blas::internal::_transpose<false, element_t>(sb_handle, m, n, A, ld_a,
B, ld_b, _dependencies);
}

template <typename operator_t, typename element_t, typename sb_handle_t,
typename input_t, typename output_t, typename index_t>
typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle,
input_t buffer_in, index_t ld,
output_t buffer_out, index_t rows,
index_t cols,
reduction_dim_t reduction_dim,
const typename sb_handle_t::event_t& _dependencies = {}) {
typename sb_handle_t::event_t _reduction(
sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out,
index_t rows, index_t cols, reduction_dim_t reduction_dim,
const typename sb_handle_t::event_t& _dependencies = {}) {
return blas::internal::_reduction<operator_t, element_t>(
sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim, _dependencies);
sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim,
_dependencies);
}

} // namespace extension
Expand Down
Loading

0 comments on commit 4839e9a

Please sign in to comment.