From 0ac0cec8d183110baa9a876073b7588132fb0d0d Mon Sep 17 00:00:00 2001 From: Ivet Galabova Date: Mon, 26 Aug 2024 14:07:23 +0300 Subject: [PATCH] adding cuda parts of pdlp: part one --- CMakeLists.txt | 13 ++ FindCUDAConf.cmake | 33 +++ src/CMakeLists.txt | 5 + src/pdlp/cupdlp/CMakeLists.txt | 14 ++ src/pdlp/cupdlp/cuda/CMakeLists.txt | 30 +++ src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu | 121 +++++++++++ src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh | 148 +++++++++++++ src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu | 209 +++++++++++++++++++ src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh | 98 +++++++++ src/pdlp/cupdlp/cuda/test_cublas.c | 152 ++++++++++++++ src/pdlp/cupdlp/cuda/test_cuda_linalg.c | 79 +++++++ 11 files changed, 902 insertions(+) create mode 100644 FindCUDAConf.cmake create mode 100644 src/pdlp/cupdlp/CMakeLists.txt create mode 100644 src/pdlp/cupdlp/cuda/CMakeLists.txt create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh create mode 100644 src/pdlp/cupdlp/cuda/test_cublas.c create mode 100644 src/pdlp/cupdlp/cuda/test_cuda_linalg.c diff --git a/CMakeLists.txt b/CMakeLists.txt index aa59302874..7402c24bba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,19 @@ endif() # emscripten option(EMSCRIPTEN_HTML "Emscripten HTML output" OFF) +option(CUPDLP_GPU "Build pdlp with nvidia" OFF) +message(STATUS "Build pdlp with nvidia: ${CUPDLP_GPU}") + +if (CUPDLP_GPU) + list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) + message(NOTICE "Set build with CUDA ${BUILD_CUDA}") + include(FindCUDAConf.cmake) + set(CUPLDP_CPU OFF) +else() + set(CUPLDP_CPU ON) + set(CUDA_LIBRARY-NOTFOUND true) +endif() + if (BUILD_CXX) # Default Build Type to be Release get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) diff --git a/FindCUDAConf.cmake b/FindCUDAConf.cmake new file mode 100644 index 0000000000..b6697918da --- /dev/null +++ b/FindCUDAConf.cmake @@ -0,0 +1,33 @@ + +set(CUDA_LIBRARY-NOTFOUND, OFF) +message(NOTICE "Finding CUDA environment") +message(NOTICE " - CUDA Home detected at $ENV{CUDA_HOME}") +set(CMAKE_CUDA_ARCHITECTURES "all") +set(CMAKE_CUDA_PATH "$ENV{CUDA_HOME}") +set(CMAKE_CUDA_COMPILER "${CMAKE_CUDA_PATH}/bin/nvcc") + +enable_language(CUDA) + +find_library(CUDA_LIBRARY_ART + NAMES cudart + HINTS "${CMAKE_CUDA_PATH}/lib64/" + REQUIRED +) +find_library(CUDA_LIBRARY_SPS + NAMES cusparse + HINTS "${CMAKE_CUDA_PATH}/lib64/" + REQUIRED +) +find_library(CUDA_LIBRARY_BLS + NAMES cublas + HINTS "${CMAKE_CUDA_PATH}/lib64/" + REQUIRED +) +if (${CUDA_LIBRARY-NOTFOUND}) + message(WARNING " - CUDA Libraries not detected at $ENV{CUDA_HOME}") +else () + message(NOTICE " - CUDA Libraries detected at $ENV{CUDA_HOME}") + set(CUDA_LIBRARY ${CUDA_LIBRARY_ART} ${CUDA_LIBRARY_SPS} ${CUDA_LIBRARY_BLS}) + message(NOTICE " - :${CUDA_LIBRARY}") +endif () + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 69a78869db..35bc8890b6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -179,6 +179,11 @@ else() target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file}) target_include_directories(highs PRIVATE ${include_dirs}) + # Optional Cuda + if (CUPDLP_GPU) + add_subdirectory(cupdlp) + endif() + if(MSVC) list(APPEND highs_compile_opts "/bigobj" # Allow big object diff --git a/src/pdlp/cupdlp/CMakeLists.txt b/src/pdlp/cupdlp/CMakeLists.txt new file mode 100644 index 0000000000..911f775419 --- /dev/null +++ b/src/pdlp/cupdlp/CMakeLists.txt @@ -0,0 +1,14 @@ +if (${CUDA_LIBRARY-NOTFOUND}) + message(NOTICE "- CPU version PDLP") +# target_compile_definitions(cupdlp +# PUBLIC +# -DCUPDLP_CPU +# ) + target_link_libraries(highs m) +else() + add_subdirectory(cuda) + message(NOTICE "- GPU version PDLP") + target_include_directories(highs PUBLIC "/usr/local/cuda/include") + target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY} m) + set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +endif () diff --git a/src/pdlp/cupdlp/cuda/CMakeLists.txt b/src/pdlp/cupdlp/cuda/CMakeLists.txt new file mode 100644 index 0000000000..1e2f87ea25 --- /dev/null +++ b/src/pdlp/cupdlp/cuda/CMakeLists.txt @@ -0,0 +1,30 @@ +enable_language(CXX CUDA) + +add_library(cudalin SHARED + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh + ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu +) + +set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +target_include_directories(cudalin PUBLIC "/usr/local/cuda/include") +target_compile_definitions(cudalin + PUBLIC + # If the debug configuration pass the DEBUG define to the compiler + $<$:HIGHS_DEBUG> +) + +target_link_libraries(cudalin ${CUDA_LIBRARY} m) + +# add a test +add_executable(testcudalin test_cuda_linalg.c) +add_executable(testcublas test_cublas.c) + +set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +#target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda) +target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY}) + +set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY}) + diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu new file mode 100644 index 0000000000..519408b7c0 --- /dev/null +++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu @@ -0,0 +1,121 @@ +#include "cupdlp_cuda_kernels.cuh" + +dim3 cuda_gridsize(cupdlp_int n) { + cupdlp_int k = (n - 1) / CUPDLP_BLOCK_SIZE + 1; + cupdlp_int x = k; + cupdlp_int y = 1; + if (x > 65535) { + x = ceil(sqrt(k)); + y = (n - 1) / (x * CUPDLP_BLOCK_SIZE) + 1; + } + dim3 d = {x, y, 1}; + return d; +} + +__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] *= y[i]; +} + +__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] /= y[i]; +} + +__global__ void element_wise_projlb_kernel(cupdlp_float *x, + const cupdlp_float *lb, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = x[i] < lb[i] ? lb[i] : x[i]; +} + +__global__ void element_wise_projub_kernel(cupdlp_float *x, + const cupdlp_float *ub, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = x[i] > ub[i] ? ub[i] : x[i]; +} + +__global__ void element_wise_projSamelb_kernel(cupdlp_float *x, + const cupdlp_float lb, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = x[i] <= lb ? lb : x[i]; +} + +__global__ void element_wise_projSameub_kernel(cupdlp_float *x, + const cupdlp_float ub, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = x[i] >= ub ? ub : x[i]; +} + +__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb, + const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) haslb[i] = lb[i] > bound ? 1.0 : 0.0; +} + +__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub, + const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) hasub[i] = ub[i] < bound ? 1.0 : 0.0; +} + +__global__ void element_wise_filterlb_kernal(cupdlp_float *x, + const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = lb[i] > bound ? lb[i] : 0.0; +} + +__global__ void element_wise_filterub_kernal(cupdlp_float *x, + const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = ub[i] < bound ? ub[i] : 0.0; +} + +__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) x[i] = val; +} + +//xUpdate = x - dPrimalStep * (cost - ATy) +__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate, + const cupdlp_float *x, + const cupdlp_float *cost, + const cupdlp_float *ATy, + const cupdlp_float dPrimalStep, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) xUpdate[i] = x[i] - dPrimalStep * (cost[i] - ATy[i]); +} + +//yUpdate = y + dDualStep * (b -2AxUpdate + Ax) +__global__ void dual_grad_step_kernal(cupdlp_float *yUpdate, + const cupdlp_float *y, + const cupdlp_float *b, + const cupdlp_float *Ax, + const cupdlp_float *AxUpdate, + const cupdlp_float dDualStep, + const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) yUpdate[i] = y[i] + dDualStep * (b[i] - 2 * AxUpdate[i] + Ax[i]); +} + +// z = x - y +__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x, + const cupdlp_float *y, const cupdlp_int len) { + cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) z[i] = x[i] - y[i]; +} \ No newline at end of file diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh new file mode 100644 index 0000000000..339a5da064 --- /dev/null +++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh @@ -0,0 +1,148 @@ +#ifndef CUPDLP_CUDA_KERNALS_H +#define CUPDLP_CUDA_KERNALS_H + +#include "cuda_runtime.h" +#define CUPDLP_BLOCK_SIZE 512 + +#ifndef SFLOAT +#ifdef DLONG +typedef long long cupdlp_int; +#else +typedef int cupdlp_int; +#endif +typedef double cupdlp_float; +#define CudaComputeType CUDA_R_64F +#else +#define CudaComputeType CUDA_R_32F +#endif + +#define CHECK_CUDA(func) \ + { \ + cudaError_t status = (func); \ + if (status != cudaSuccess) { \ + printf("CUDA API failed at line %d of %s with error: %s (%d)\n", \ + __LINE__, __FILE__, cudaGetErrorString(status), status); \ + return EXIT_FAILURE; \ + } \ + } + +#define CHECK_CUSPARSE(func) \ + { \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d of %s with error: %s (%d)\n", \ + __LINE__, __FILE__, cusparseGetErrorString(status), status); \ + return EXIT_FAILURE; \ + } \ + } + +#define CHECK_CUBLAS(func) \ + { \ + cublasStatus_t status = (func); \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + printf("CUBLAS API failed at line %d of %s with error: %s (%d)\n", \ + __LINE__, __FILE__, cublasGetStatusString(status), status); \ + return EXIT_FAILURE; \ + } \ + } + +#define CUPDLP_FREE_VEC(x) \ + { \ + cudaFree(x); \ + x = cupdlp_NULL; \ + } + +#define CUPDLP_COPY_VEC(dst, src, type, size) \ + cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault) + +#define CUPDLP_INIT_VEC(var, size) \ + { \ + cusparseStatus_t status = \ + cudaMalloc((void **)&var, (size) * sizeof(typeof(*var))); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \ + cusparseGetErrorString(status), status); \ + goto exit_cleanup; \ + } \ + } +#define CUPDLP_INIT_ZERO_VEC(var, size) \ + { \ + cusparseStatus_t status = \ + cudaMalloc((void **)&var, (size) * sizeof(typeof(*var))); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \ + cusparseGetErrorString(status), status); \ + goto exit_cleanup; \ + } \ + status = cudaMemset(var, 0, (size) * sizeof(typeof(*var))); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \ + cusparseGetErrorString(status), status); \ + goto exit_cleanup; \ + } \ + } +#define CUPDLP_ZERO_VEC(var, type, size) \ + cudaMemset(var, 0, sizeof(type) * (size)) + +dim3 cuda_gridsize(cupdlp_int n); + +__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len); + +__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len); + +__global__ void element_wise_projlb_kernel(cupdlp_float *x, + const cupdlp_float *lb, + const cupdlp_int len); + +__global__ void element_wise_projub_kernel(cupdlp_float *x, + const cupdlp_float *ub, + const cupdlp_int len); + +__global__ void element_wise_projSamelb_kernel(cupdlp_float *x, + const cupdlp_float lb, + const cupdlp_int len); + +__global__ void element_wise_projSameub_kernel(cupdlp_float *x, + const cupdlp_float ub, + const cupdlp_int len); + +__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb, + const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len); + +__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub, + const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len); + +__global__ void element_wise_filterlb_kernal(cupdlp_float *x, + const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len); + +__global__ void element_wise_filterub_kernal(cupdlp_float *x, + const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len); + +__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val, + const cupdlp_int len); + +__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate, + const cupdlp_float *x, + const cupdlp_float *cost, + const cupdlp_float *ATy, + const cupdlp_float dPrimalStep, + const cupdlp_int len); + +__global__ void dual_grad_step_kernal( + cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b, + const cupdlp_float *Ax, const cupdlp_float *AxUpdate, + const cupdlp_float dDualStep, const cupdlp_int len); + +__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x, + const cupdlp_float *y, const cupdlp_int len); +#endif \ No newline at end of file diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu new file mode 100644 index 0000000000..332b417899 --- /dev/null +++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu @@ -0,0 +1,209 @@ +#include "cupdlp_cudalinalg.cuh" + +extern "C" cupdlp_int cuda_alloc_MVbuffer( + cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc, + cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx, + cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecATy, void **dBuffer) { + cudaDataType computeType = CUDA_R_32F; +#ifndef SFLOAT + computeType = CUDA_R_64F; +#endif + + size_t AxBufferSize = 0; + size_t ATyBufferSize = 0; + cupdlp_float alpha = 1.0; + cupdlp_float beta = 0.0; + // cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; + cusparseSpMVAlg_t alg = CUSPARSE_SPMV_CSR_ALG2; //deterministic + + // get the buffer size needed by csr Ax + CHECK_CUSPARSE(cusparseSpMV_bufferSize( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, cuda_csr, vecX, &beta, + vecAx, computeType, alg, &AxBufferSize)) + + // get the buffer size needed by csc ATy + CHECK_CUSPARSE(cusparseSpMV_bufferSize( + handle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, cuda_csc, vecY, &beta, + vecATy, computeType, alg, &ATyBufferSize)) + + size_t bufferSize = + (AxBufferSize > ATyBufferSize) ? AxBufferSize : ATyBufferSize; + + // allocate an external buffer if needed + CHECK_CUDA(cudaMalloc(dBuffer, bufferSize)) + + return EXIT_SUCCESS; +} + +extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csc, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecAx, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta) { + // hAx = alpha * Acsc * hX + beta * hAx + + cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; + cudaDataType computeType = CUDA_R_32F; +#ifndef SFLOAT + computeType = CUDA_R_64F; +#endif + + CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecX, &beta, vecAx, + // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)) + computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer)) + + return EXIT_SUCCESS; +} + +extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csr, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecAx, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta) { + // hAx = alpha * Acsc * hX + beta * hAx + + cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; + cudaDataType computeType = CUDA_R_32F; +#ifndef SFLOAT + computeType = CUDA_R_64F; +#endif + + CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecX, &beta, vecAx, + // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)) + computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer)) + + return EXIT_SUCCESS; +} + +extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csc, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecATy, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta) { + // hATy = alpha * Acsr^T * hY + beta * hATy + cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE; + cudaDataType computeType = CUDA_R_32F; +#ifndef SFLOAT + computeType = CUDA_R_64F; +#endif + + CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecY, &beta, vecATy, + // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)) + computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer)) + + return EXIT_SUCCESS; +} + +extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csr, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecATy, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta) { + // hATy = alpha * Acsr^T * hY + beta * hATy + cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE; + cudaDataType computeType = CUDA_R_32F; +#ifndef SFLOAT + computeType = CUDA_R_64F; +#endif + + CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecY, &beta, vecATy, + // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)) + computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer)) + + return EXIT_SUCCESS; +} + +extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub, + const cupdlp_int len) { + element_wise_projSameub_kernel<<>>( + x, ub, len); +} + +extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb, + const cupdlp_int len) { + element_wise_projSamelb_kernel<<>>( + x, lb, len); +} + +extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub, + const cupdlp_int len) { + element_wise_projub_kernel<<>>(x, ub, + len); +} + +extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb, + const cupdlp_int len) { + element_wise_projlb_kernel<<>>(x, lb, + len); +} + +extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len) { + element_wise_div_kernel<<>>(x, y, len); +} + +extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len) { + element_wise_dot_kernel<<>>(x, y, len); +} + +extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len) { + element_wise_initHaslb_kernal<<>>( + haslb, lb, bound, len); +} + +extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len) { + element_wise_initHasub_kernal<<>>( + hasub, ub, bound, len); +} + +extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len) { + element_wise_filterlb_kernal<<>>( + x, lb, bound, len); +} + +extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len) { + element_wise_filterub_kernal<<>>( + x, ub, bound, len); +} + +extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val, + const cupdlp_int len) { + init_cuda_vec_kernal<<>>(x, val, len); +} + +extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate, + const cupdlp_float *x, + const cupdlp_float *cost, + const cupdlp_float *ATy, + const cupdlp_float dPrimalStep, + const cupdlp_int len) { + primal_grad_step_kernal<<>>( + xUpdate, x, cost, ATy, dPrimalStep, len); +} + +extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b, + const cupdlp_float *Ax, const cupdlp_float *AxUpdate, + const cupdlp_float dDualStep, const cupdlp_int len) { + dual_grad_step_kernal<<>>( + yUpdate, y, b, Ax, AxUpdate, dDualStep, len); +} + +extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x, + const cupdlp_float *y, const cupdlp_int len) +{ + naive_sub_kernal<<>>(z, x, y, len); +} \ No newline at end of file diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh new file mode 100644 index 0000000000..e0103b9228 --- /dev/null +++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh @@ -0,0 +1,98 @@ +#ifndef CUPDLP_CUDA_LINALG_H +#define CUPDLP_CUDA_LINALG_H + +#include // cublas +#include // cudaMalloc, cudaMemcpy, etc. +#include // cusparseSpMV + +#include "cupdlp_cuda_kernels.cuh" + +#ifdef __cplusplus +extern "C" { +#endif + +#include // printf +#include // EXIT_FAILURE + +// #include "../cupdlp_defs.h" +// #include "../glbopts.h" +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus + +extern "C" cupdlp_int cuda_alloc_MVbuffer( + cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc, + cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx, + cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecATy, void **dBuffer); + +extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csc, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecAx, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta); +extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csr, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecAx, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta); +extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csc, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecATy, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta); +extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle, + cusparseSpMatDescr_t cuda_csr, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecATy, void *dBuffer, + const cupdlp_float alpha, + const cupdlp_float beta); + +extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub, + const cupdlp_int len); +extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb, + const cupdlp_int len); +extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub, + const cupdlp_int len); +extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb, + const cupdlp_int len); +extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len); +extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y, + const cupdlp_int len); +extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len); +extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len); +extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb, + const cupdlp_float bound, + const cupdlp_int len); +extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub, + const cupdlp_float bound, + const cupdlp_int len); +extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val, + const cupdlp_int len); + +extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate, const cupdlp_float *x, + const cupdlp_float *cost, + const cupdlp_float *ATy, + const cupdlp_float dPrimalStep, + const cupdlp_int len); + +extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y, + const cupdlp_float *b, const cupdlp_float *Ax, + const cupdlp_float *AxUpdate, + const cupdlp_float dDualStep, + const cupdlp_int len); + +extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x, + const cupdlp_float *y, const cupdlp_int len); +#endif +#endif \ No newline at end of file diff --git a/src/pdlp/cupdlp/cuda/test_cublas.c b/src/pdlp/cupdlp/cuda/test_cublas.c new file mode 100644 index 0000000000..4e1a52b85e --- /dev/null +++ b/src/pdlp/cupdlp/cuda/test_cublas.c @@ -0,0 +1,152 @@ +#include "cupdlp_cuda_kernels.cuh" +#include "cupdlp_cudalinalg.cuh" + +void use_cublas(cublasHandle_t cublashandle) { + cupdlp_int len = 10; + // cupdlp_int len = 1<<10; + + // int N = 1<<20; + + // alloc and init host vec memory + cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float)); + cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float)); + for (cupdlp_int i = 0; i < len; i++) { + h_vec1[i] = 1.0; + h_vec2[i] = i; + // h_vec1[i] = 1.0; + // h_vec2[i] = 2.0; + } + + // alloc and init device vec memory + cupdlp_float *d_vec1; + cupdlp_float *d_vec2; + cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float)); + // cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float), + // cudaMemcpyHostToDevice); + + cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float)); + cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float), + cudaMemcpyHostToDevice); + + // init cublas handle + // cublasHandle_t cublashandle; + // CHECK_CUBLAS(cublasCreate(&cublashandle)); + + cupdlp_float result; + // call nrm2 + CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result)); + + // print result + printf("2-norm is :%f\n", result); + + // copy result back to host + // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float), + // cudaMemcpyDeviceToHost); + // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float), + // cudaMemcpyDeviceToHost); + // cudaError_t errSync = cudaGetLastError(); + // cudaError_t errAsync = cudaDeviceSynchronize(); + // if (errSync != cudaSuccess) + // printf("Sync kernel error: %s\n", cudaGetErrorString(errSync)); + // if (errAsync != cudaSuccess) + // printf("Async kernel error: %s\n", cudaGetErrorString(errAsync)); + + // // print result + // for (cupdlp_int i = 0; i < len; i++) { + // printf("%f\n", h_vec1[i]); + // // printf("%f\n", h_vec2[i]); + // } + + // destroy cublas handle + // CHECK_CUBLAS(cublasDestroy(cublashandle)); + + // free memory + free(h_vec1); + free(h_vec2); + cudaFree(d_vec1); + cudaFree(d_vec2); +} +int main() { + // try cupdlp_edot_cuda + + // int nDevices; + + // cudaGetDeviceCount(&nDevices); + // for (int i = 0; i < nDevices; i++) { + // cudaDeviceProp prop; + // cudaGetDeviceProperties(&prop, i); + // printf("Device Number: %d\n", i); + // printf(" Device name: %s\n", prop.name); + // printf(" Memory Clock Rate (KHz): %d\n", + // prop.memoryClockRate); + // printf(" Memory Bus Width (bits): %d\n", + // prop.memoryBusWidth); + // printf(" Peak Memory Bandwidth (GB/s): %f\n\n", + // 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) + // / 1.0e6); + // } + + // cupdlp_int len = 10; + // // cupdlp_int len = 1<<10; + + // // int N = 1<<20; + + // // alloc and init host vec memory + // cupdlp_float *h_vec1 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float)); + // cupdlp_float *h_vec2 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float)); + // for (cupdlp_int i = 0; i < len; i++) { + // h_vec1[i] = 1.0; + // h_vec2[i] = i; + // // h_vec1[i] = 1.0; + // // h_vec2[i] = 2.0; + // } + + // // alloc and init device vec memory + // cupdlp_float *d_vec1; + // cupdlp_float *d_vec2; + // cudaMalloc((void **) &d_vec1, len * sizeof(cupdlp_float)); + // // cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float), + // // cudaMemcpyHostToDevice); + + // cudaMalloc((void **) &d_vec2, len * sizeof(cupdlp_float)); + // cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float), + // cudaMemcpyHostToDevice); + + // init cublas handle + cublasHandle_t cublashandle; + CHECK_CUBLAS(cublasCreate(&cublashandle)); + use_cublas(cublashandle); + // cupdlp_float result; + // call nrm2 + // CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result)); + + // print result + // printf("2-norm is :%f\n", result); + + // copy result back to host + // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float), + // cudaMemcpyDeviceToHost); + // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float), + // cudaMemcpyDeviceToHost); + // cudaError_t errSync = cudaGetLastError(); + // cudaError_t errAsync = cudaDeviceSynchronize(); + // if (errSync != cudaSuccess) + // printf("Sync kernel error: %s\n", cudaGetErrorString(errSync)); + // if (errAsync != cudaSuccess) + // printf("Async kernel error: %s\n", cudaGetErrorString(errAsync)); + + // // print result + // for (cupdlp_int i = 0; i < len; i++) { + // printf("%f\n", h_vec1[i]); + // // printf("%f\n", h_vec2[i]); + // } + + // destroy cublas handle + CHECK_CUBLAS(cublasDestroy(cublashandle)); + + // // free memory + // free(h_vec1); + // free(h_vec2); + // cudaFree(d_vec1); + // cudaFree(d_vec2); +} \ No newline at end of file diff --git a/src/pdlp/cupdlp/cuda/test_cuda_linalg.c b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c new file mode 100644 index 0000000000..ad19bbe701 --- /dev/null +++ b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c @@ -0,0 +1,79 @@ +#include "cupdlp_cuda_kernels.cuh" +#include "cupdlp_cudalinalg.cuh" + +int main() { + // try cupdlp_edot_cuda + + int nDevices; + + cudaGetDeviceCount(&nDevices); + // for (int i = 0; i < nDevices; i++) { + // cudaDeviceProp prop; + // cudaGetDeviceProperties(&prop, i); + // printf("Device Number: %d\n", i); + // printf(" Device name: %s\n", prop.name); + // printf(" Memory Clock Rate (KHz): %d\n", + // prop.memoryClockRate); + // printf(" Memory Bus Width (bits): %d\n", + // prop.memoryBusWidth); + // printf(" Peak Memory Bandwidth (GB/s): %f\n\n", + // 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) + // / 1.0e6); + // } + + cupdlp_int len = 10; + // cupdlp_int len = 1<<10; + + // int N = 1<<20; + + // alloc and init host vec memory + cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float)); + cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float)); + for (cupdlp_int i = 0; i < len; i++) { + h_vec1[i] = i; + h_vec2[i] = i; + // h_vec1[i] = 1.0; + // h_vec2[i] = 2.0; + } + + // alloc and init device vec memory + cupdlp_float *d_vec1; + cupdlp_float *d_vec2; + cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float)); + cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float)); + cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float), + cudaMemcpyHostToDevice); + cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float), + cudaMemcpyHostToDevice); + + // call cupdlp_edot_cuda + cupdlp_edot_cuda(d_vec1, d_vec2, len); + // element_wise_dot_kernel<<<(len+255)/256, 256>>>(d_vec1, d_vec2, len); + // saxpy<<<(len+255)/256, 256>>>(len, 2.0f, d_vec1, d_vec2); + // element_wise_dot<<>>(d_vec1, d_vec2, + // len); + + // copy result back to host + cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float), + cudaMemcpyDeviceToHost); + cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float), + cudaMemcpyDeviceToHost); + cudaError_t errSync = cudaGetLastError(); + cudaError_t errAsync = cudaDeviceSynchronize(); + if (errSync != cudaSuccess) + printf("Sync kernel error: %s\n", cudaGetErrorString(errSync)); + if (errAsync != cudaSuccess) + printf("Async kernel error: %s\n", cudaGetErrorString(errAsync)); + + // print result + for (cupdlp_int i = 0; i < len; i++) { + printf("%f\n", h_vec1[i]); + // printf("%f\n", h_vec2[i]); + } + + // free memory + free(h_vec1); + free(h_vec2); + cudaFree(d_vec1); + cudaFree(d_vec2); +} \ No newline at end of file