Skip to content

Commit

Permalink
Merge pull request #1234 from pika-org/fewer-cublas-cusolver-handles
Browse files Browse the repository at this point in the history
Use fewer cuBLAS/SOLVER handles in `cuda_pool`
  • Loading branch information
msimberg committed Sep 10, 2024
2 parents 4b50501 + 75d9ecc commit c47f209
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 85 deletions.
32 changes: 18 additions & 14 deletions libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,11 @@ namespace pika::cuda::experimental {
struct cublas_handles_holder
{
std::size_t const concurrency;
std::vector<cublas_handle> unsynchronized_handles;
std::atomic<std::size_t> synchronized_handle_index;
std::vector<cublas_handle> synchronized_handles;
std::atomic<std::size_t> handle_index;
std::vector<cublas_handle> handles;
std::vector<std::mutex> handle_mutexes;

PIKA_EXPORT cublas_handles_holder();
PIKA_EXPORT explicit cublas_handles_holder(std::size_t num_handles);
cublas_handles_holder(cublas_handles_holder&&) = delete;
cublas_handles_holder(cublas_handles_holder const&) = delete;
cublas_handles_holder& operator=(cublas_handles_holder&&) = delete;
Expand All @@ -103,12 +102,11 @@ namespace pika::cuda::experimental {
struct cusolver_handles_holder
{
std::size_t const concurrency;
std::vector<cusolver_handle> unsynchronized_handles;
std::atomic<std::size_t> synchronized_handle_index;
std::vector<cusolver_handle> synchronized_handles;
std::atomic<std::size_t> handle_index;
std::vector<cusolver_handle> handles;
std::vector<std::mutex> handle_mutexes;

PIKA_EXPORT cusolver_handles_holder();
PIKA_EXPORT explicit cusolver_handles_holder(std::size_t num_handles);
cusolver_handles_holder(cusolver_handles_holder&&) = delete;
cusolver_handles_holder(cusolver_handles_holder const&) = delete;
cusolver_handles_holder& operator=(cusolver_handles_holder&&) = delete;
Expand All @@ -126,7 +124,8 @@ namespace pika::cuda::experimental {
cusolver_handles_holder cusolver_handles;

PIKA_EXPORT pool_data(int device, std::size_t num_normal_priority_streams_per_thread,
std::size_t num_high_priority_streams_per_thread, unsigned int flags);
std::size_t num_high_priority_streams_per_thread, unsigned int flags,
std::size_t num_cublas_handles, std::size_t num_cusolver_handles);
pool_data(pool_data&&) = delete;
pool_data(pool_data const&) = delete;
pool_data& operator=(pool_data&&) = delete;
Expand All @@ -138,7 +137,8 @@ namespace pika::cuda::experimental {
public:
PIKA_EXPORT explicit cuda_pool(int device = 0,
std::size_t num_normal_priority_streams_per_thread = 3,
std::size_t num_high_priority_streams_per_thread = 3, unsigned int flags = 0);
std::size_t num_high_priority_streams_per_thread = 3, unsigned int flags = 0,
std::size_t num_cublas_handles = 16, std::size_t num_cusolver_handles = 16);
PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE
cuda_pool(cuda_pool&&) = default;
PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE
Expand Down Expand Up @@ -179,14 +179,18 @@ struct fmt::formatter<pika::cuda::experimental::cuda_pool> : fmt::formatter<std:
auto format(pika::cuda::experimental::cuda_pool const& pool, FormatContext& ctx) const
{
bool valid{pool.data};
auto high_priority_streams =
auto num_high_priority_streams =
valid ? pool.data->high_priority_streams.num_streams_per_thread : 0;
auto normal_priority_streams =
auto num_normal_priority_streams =
valid ? pool.data->normal_priority_streams.num_streams_per_thread : 0;
auto num_cublas_handles = valid ? pool.data->cublas_handles.handles.size() : 0;
auto num_cusolver_handles = valid ? pool.data->cusolver_handles.handles.size() : 0;
return fmt::formatter<std::string>::format(
fmt::format("cuda_pool({}, num_high_priority_streams_per_thread = {}, "
"num_normal_priority_streams_per_thread = {})",
fmt::ptr(pool.data.get()), high_priority_streams, normal_priority_streams),
"num_normal_priority_streams_per_thread = {}, num_cublas_handles = {}, "
"num_cusolver_handles = {})",
fmt::ptr(pool.data.get()), num_high_priority_streams, num_normal_priority_streams,
num_cublas_handles, num_cusolver_handles),
ctx);
}
};
101 changes: 30 additions & 71 deletions libs/pika/async_cuda/src/cuda_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,15 @@ namespace pika::cuda::experimental {
return streams[global_stream_index];
}

// Builds a fixed-size pool of cuBLAS handles, each paired with its own mutex.
// Handles are later handed out round-robin via the atomic handle_index.
// `concurrency` records the worker-thread count (runtime threads if a pika
// runtime exists, otherwise hardware concurrency).
cuda_pool::cublas_handles_holder::cublas_handles_holder(std::size_t num_handles)
  : concurrency(pika::detail::get_runtime_ptr() ? pika::get_num_worker_threads() :
        pika::threads::detail::hardware_concurrency())
  , handle_index{0}
  , handles()
  , handle_mutexes(num_handles)
{
    // Construct the handles eagerly so get_locked_handle never allocates;
    // reserve + emplace_back avoids any copy/move of cublas_handle.
    handles.reserve(num_handles);
    for (std::size_t i = 0; i < num_handles; ++i) { handles.emplace_back(); }
}

locked_cublas_handle::locked_cublas_handle(
Expand All @@ -77,48 +71,26 @@ namespace pika::cuda::experimental {
// Picks a cuBLAS handle round-robin from the pool, locks its mutex, binds it
// to the given stream and pointer mode, and returns it together with the held
// lock.  The caller has exclusive use of the handle for the lifetime of the
// returned locked_cublas_handle.
locked_cublas_handle cuda_pool::cublas_handles_holder::get_locked_handle(
    cuda_stream const& stream, cublasPointerMode_t pointer_mode)
{
    // handle_index is atomic, so concurrent callers spread across handles;
    // the modulo keeps the index in range as the counter grows (wraparound of
    // the counter itself is harmless for distribution purposes).
    auto const i = handle_index++ % handles.size();

    std::unique_lock lock{handle_mutexes[i]};

    // Configure the handle under the lock so the stream/pointer-mode pair
    // stays consistent for this caller.
    auto& handle = handles[i];
    handle.set_stream(stream);
    handle.set_pointer_mode(pointer_mode);

    return locked_cublas_handle(handle, std::move(lock));
}

// Builds a fixed-size pool of cuSOLVER handles, each paired with its own
// mutex, mirroring cublas_handles_holder.  Handles are handed out round-robin
// via the atomic handle_index.
cuda_pool::cusolver_handles_holder::cusolver_handles_holder(std::size_t num_handles)
  : concurrency(pika::detail::get_runtime_ptr() ? pika::get_num_worker_threads() :
        pika::threads::detail::hardware_concurrency())
  , handle_index{0}
  , handles()
  , handle_mutexes(num_handles)
{
    // Construct the handles eagerly so get_locked_handle never allocates;
    // reserve + emplace_back avoids any copy/move of cusolver_handle.
    handles.reserve(num_handles);
    for (std::size_t i = 0; i < num_handles; ++i) { handles.emplace_back(); }
}

locked_cusolver_handle::locked_cusolver_handle(
Expand All @@ -133,47 +105,34 @@ namespace pika::cuda::experimental {
// Picks a cuSOLVER handle round-robin from the pool, locks its mutex, binds
// it to the given stream, and returns it together with the held lock.  The
// caller has exclusive use of the handle for the lifetime of the returned
// locked_cusolver_handle.
locked_cusolver_handle cuda_pool::cusolver_handles_holder::get_locked_handle(
    cuda_stream const& stream)
{
    // Atomic round-robin index keeps concurrent callers spread across the
    // fixed set of handles.
    auto const i = handle_index++ % handles.size();

    std::unique_lock lock{handle_mutexes[i]};

    // Bind the stream under the lock so it cannot be changed by another
    // caller before this one uses the handle.
    auto& handle = handles[i];
    handle.set_stream(stream);

    return {handle, std::move(lock)};
}

// Aggregates all per-device pool state: the normal- and high-priority stream
// pools plus the shared cuBLAS/cuSOLVER handle pools, sized by the new
// num_cublas_handles / num_cusolver_handles parameters.
cuda_pool::pool_data::pool_data(int device, std::size_t num_normal_priority_streams_per_thread,
    std::size_t num_high_priority_streams_per_thread, unsigned int flags,
    std::size_t num_cublas_handles, std::size_t num_cusolver_handles)
  : device(device)
  , normal_priority_streams(device, num_normal_priority_streams_per_thread,
        pika::execution::thread_priority::normal, flags)
  , high_priority_streams(device, num_high_priority_streams_per_thread,
        pika::execution::thread_priority::high, flags)
  , cublas_handles(num_cublas_handles)
  , cusolver_handles(num_cusolver_handles)
{
}

// Public constructor: allocates the shared pool_data, forwarding the stream
// counts, stream flags, and the cuBLAS/cuSOLVER handle-pool sizes.  Ownership
// is shared so copies of the cuda_pool refer to the same device state.
cuda_pool::cuda_pool(int device, std::size_t num_normal_priority_streams_per_thread,
    std::size_t num_high_priority_streams_per_thread, unsigned int flags,
    std::size_t num_cublas_handles, std::size_t num_cusolver_handles)
  : data(std::make_shared<pool_data>(device, num_normal_priority_streams_per_thread,
        num_high_priority_streams_per_thread, flags, num_cublas_handles, num_cusolver_handles))
{
}

Expand Down

0 comments on commit c47f209

Please sign in to comment.