From 0ac0cec8d183110baa9a876073b7588132fb0d0d Mon Sep 17 00:00:00 2001
From: Ivet Galabova <galabovaa@gmail.com>
Date: Mon, 26 Aug 2024 14:07:23 +0300
Subject: [PATCH] adding cuda parts of pdlp: part one

---
 CMakeLists.txt                               |  13 ++
 FindCUDAConf.cmake                           |  33 +++
 src/CMakeLists.txt                           |   5 +
 src/pdlp/cupdlp/CMakeLists.txt               |  14 ++
 src/pdlp/cupdlp/cuda/CMakeLists.txt          |  30 +++
 src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu  | 121 +++++++++++
 src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh | 148 +++++++++++++
 src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu    | 209 +++++++++++++++++++
 src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh   |  98 +++++++++
 src/pdlp/cupdlp/cuda/test_cublas.c           | 152 ++++++++++++++
 src/pdlp/cupdlp/cuda/test_cuda_linalg.c      |  79 +++++++
 11 files changed, 902 insertions(+)
 create mode 100644 FindCUDAConf.cmake
 create mode 100644 src/pdlp/cupdlp/CMakeLists.txt
 create mode 100644 src/pdlp/cupdlp/cuda/CMakeLists.txt
 create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
 create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
 create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
 create mode 100644 src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
 create mode 100644 src/pdlp/cupdlp/cuda/test_cublas.c
 create mode 100644 src/pdlp/cupdlp/cuda/test_cuda_linalg.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa59302874..7402c24bba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,6 +74,19 @@ endif()
 # emscripten
 option(EMSCRIPTEN_HTML "Emscripten HTML output" OFF)
 
+option(CUPDLP_GPU "Build pdlp with nvidia" OFF)
+message(STATUS "Build pdlp with nvidia: ${CUPDLP_GPU}")
+
+if (CUPDLP_GPU)
+    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+    message(NOTICE "Set build with CUDA ${BUILD_CUDA}")
+    include(FindCUDAConf.cmake)
+    set(CUPLDP_CPU OFF)
+else()
+    set(CUPLDP_CPU ON)
+    set(CUDA_LIBRARY-NOTFOUND true)
+endif()
+
 if (BUILD_CXX)
   # Default Build Type to be Release
   get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
diff --git a/FindCUDAConf.cmake b/FindCUDAConf.cmake
new file mode 100644
index 0000000000..b6697918da
--- /dev/null
+++ b/FindCUDAConf.cmake
@@ -0,0 +1,33 @@
+
+set(CUDA_LIBRARY-NOTFOUND, OFF)
+message(NOTICE "Finding CUDA environment")
+message(NOTICE "    - CUDA Home detected at $ENV{CUDA_HOME}")
+set(CMAKE_CUDA_ARCHITECTURES "all")
+set(CMAKE_CUDA_PATH "$ENV{CUDA_HOME}")
+set(CMAKE_CUDA_COMPILER "${CMAKE_CUDA_PATH}/bin/nvcc")
+
+enable_language(CUDA)
+
+find_library(CUDA_LIBRARY_ART
+        NAMES cudart
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+find_library(CUDA_LIBRARY_SPS
+        NAMES cusparse
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+find_library(CUDA_LIBRARY_BLS
+        NAMES cublas
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+if (${CUDA_LIBRARY-NOTFOUND})
+    message(WARNING "    - CUDA Libraries not detected at $ENV{CUDA_HOME}")
+else ()
+    message(NOTICE "    - CUDA Libraries detected at $ENV{CUDA_HOME}")
+    set(CUDA_LIBRARY ${CUDA_LIBRARY_ART} ${CUDA_LIBRARY_SPS} ${CUDA_LIBRARY_BLS})
+    message(NOTICE "    -   :${CUDA_LIBRARY}")
+endif ()
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 69a78869db..35bc8890b6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -179,6 +179,11 @@ else()
   target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file})
   target_include_directories(highs PRIVATE ${include_dirs})
 
+  # Optional Cuda 
+  if (CUPDLP_GPU)
+    add_subdirectory(cupdlp)
+  endif()
+
   if(MSVC)
     list(APPEND highs_compile_opts
       "/bigobj" # Allow big object
diff --git a/src/pdlp/cupdlp/CMakeLists.txt b/src/pdlp/cupdlp/CMakeLists.txt
new file mode 100644
index 0000000000..911f775419
--- /dev/null
+++ b/src/pdlp/cupdlp/CMakeLists.txt
@@ -0,0 +1,14 @@
+if (${CUDA_LIBRARY-NOTFOUND})
+    message(NOTICE "- CPU version PDLP")
+#     target_compile_definitions(cupdlp
+#             PUBLIC
+#             -DCUPDLP_CPU
+#     )
+    target_link_libraries(highs m)
+else()
+    add_subdirectory(cuda)
+    message(NOTICE "- GPU version PDLP")
+    target_include_directories(highs PUBLIC "/usr/local/cuda/include")
+    target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY} m)
+    set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif ()
diff --git a/src/pdlp/cupdlp/cuda/CMakeLists.txt b/src/pdlp/cupdlp/cuda/CMakeLists.txt
new file mode 100644
index 0000000000..1e2f87ea25
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/CMakeLists.txt
@@ -0,0 +1,30 @@
+enable_language(CXX CUDA)
+
+add_library(cudalin SHARED
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu
+)
+
+set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_include_directories(cudalin PUBLIC "/usr/local/cuda/include")
+target_compile_definitions(cudalin
+        PUBLIC
+        # If the debug configuration pass the DEBUG define to the compiler
+        $<$<CONFIG:Debug>:HIGHS_DEBUG>
+)
+
+target_link_libraries(cudalin ${CUDA_LIBRARY} m)
+
+# add a test
+add_executable(testcudalin test_cuda_linalg.c)
+add_executable(testcublas test_cublas.c)
+
+set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda)
+target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY})
+
+set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY})
+
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
new file mode 100644
index 0000000000..519408b7c0
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
@@ -0,0 +1,121 @@
+#include "cupdlp_cuda_kernels.cuh"
+
+dim3 cuda_gridsize(cupdlp_int n) {
+  cupdlp_int k = (n - 1) / CUPDLP_BLOCK_SIZE + 1;
+  cupdlp_int x = k;
+  cupdlp_int y = 1;
+  if (x > 65535) {
+    x = ceil(sqrt(k));
+    y = (n - 1) / (x * CUPDLP_BLOCK_SIZE) + 1;
+  }
+  dim3 d = {x, y, 1};
+  return d;
+}
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] *= y[i];
+}
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] /= y[i];
+}
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] < lb[i] ? lb[i] : x[i];
+}
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] > ub[i] ? ub[i] : x[i];
+}
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] <= lb ? lb : x[i];
+}
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] >= ub ? ub : x[i];
+}
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) haslb[i] = lb[i] > bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) hasub[i] = ub[i] < bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = lb[i] > bound ? lb[i] : 0.0;
+}
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = ub[i] < bound ? ub[i] : 0.0;
+}
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = val;
+}
+
+//xUpdate = x - dPrimalStep * (cost - ATy)
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) xUpdate[i] = x[i] - dPrimalStep * (cost[i] - ATy[i]);
+}
+
+//yUpdate = y + dDualStep * (b -2AxUpdate + Ax)
+__global__ void dual_grad_step_kernal(cupdlp_float *yUpdate,
+                                      const cupdlp_float *y,
+                                      const cupdlp_float *b,
+                                      const cupdlp_float *Ax,
+                                      const cupdlp_float *AxUpdate,
+                                      const cupdlp_float dDualStep,
+                                      const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; 
+  if (i < len) yUpdate[i] = y[i] + dDualStep * (b[i] - 2 * AxUpdate[i] + Ax[i]);
+}
+
+// z = x - y
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                  const cupdlp_float *y, const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) z[i] = x[i] - y[i];
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
new file mode 100644
index 0000000000..339a5da064
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
@@ -0,0 +1,148 @@
+#ifndef CUPDLP_CUDA_KERNALS_H
+#define CUPDLP_CUDA_KERNALS_H
+
+#include "cuda_runtime.h"
+#define CUPDLP_BLOCK_SIZE 512
+
+#ifndef SFLOAT
+#ifdef DLONG
+typedef long long cupdlp_int;
+#else
+typedef int cupdlp_int;
+#endif
+typedef double cupdlp_float;
+#define CudaComputeType CUDA_R_64F
+#else
+#define CudaComputeType CUDA_R_32F
+#endif
+
+#define CHECK_CUDA(func)                                               \
+  {                                                                    \
+    cudaError_t status = (func);                                       \
+    if (status != cudaSuccess) {                                       \
+      printf("CUDA API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cudaGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                             \
+    }                                                                  \
+  }
+
+#define CHECK_CUSPARSE(func)                                               \
+  {                                                                        \
+    cusparseStatus_t status = (func);                                      \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
+      printf("CUSPARSE API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cusparseGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                                 \
+    }                                                                      \
+  }
+
+#define CHECK_CUBLAS(func)                                               \
+  {                                                                      \
+    cublasStatus_t status = (func);                                      \
+    if (status != CUBLAS_STATUS_SUCCESS) {                               \
+      printf("CUBLAS API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cublasGetStatusString(status), status); \
+      return EXIT_FAILURE;                                               \
+    }                                                                    \
+  }
+
+#define CUPDLP_FREE_VEC(x) \
+  {                        \
+    cudaFree(x);           \
+    x = cupdlp_NULL;       \
+  }
+
+#define CUPDLP_COPY_VEC(dst, src, type, size) \
+  cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
+
+#define CUPDLP_INIT_VEC(var, size)                                             \
+  {                                                                            \
+    cusparseStatus_t status =                                                  \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));              \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
+      printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cusparseGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                       \
+    }                                                                          \
+  }
+#define CUPDLP_INIT_ZERO_VEC(var, size)                                        \
+  {                                                                            \
+    cusparseStatus_t status =                                                  \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));              \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
+      printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cusparseGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                       \
+    }                                                                          \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));                \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
+      printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cusparseGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                       \
+    }                                                                          \
+  }
+#define CUPDLP_ZERO_VEC(var, type, size) \
+  cudaMemset(var, 0, sizeof(type) * (size))
+
+dim3 cuda_gridsize(cupdlp_int n);
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len);
+
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len);
+
+__global__ void dual_grad_step_kernal(
+    cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b,
+    const cupdlp_float *Ax, const cupdlp_float *AxUpdate,
+    const cupdlp_float dDualStep, const cupdlp_int len);
+
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                 const cupdlp_float *y, const cupdlp_int len);
+#endif
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
new file mode 100644
index 0000000000..332b417899
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
@@ -0,0 +1,209 @@
+#include "cupdlp_cudalinalg.cuh"
+
+extern "C" cupdlp_int cuda_alloc_MVbuffer(
+    cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc,
+    cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx,
+    cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY,
+    cusparseDnVecDescr_t vecATy, void **dBuffer) {
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  size_t AxBufferSize = 0;
+  size_t ATyBufferSize = 0;
+  cupdlp_float alpha = 1.0;
+  cupdlp_float beta = 0.0;
+  // cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
+  cusparseSpMVAlg_t alg = CUSPARSE_SPMV_CSR_ALG2; //deterministic
+
+  // get the buffer size needed by csr Ax
+  CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, cuda_csr, vecX, &beta,
+      vecAx, computeType, alg, &AxBufferSize))
+
+  // get the buffer size needed by csc ATy
+  CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, cuda_csc, vecY, &beta,
+      vecATy, computeType, alg, &ATyBufferSize))
+
+  size_t bufferSize =
+      (AxBufferSize > ATyBufferSize) ? AxBufferSize : ATyBufferSize;
+
+  // allocate an external buffer if needed
+  CHECK_CUDA(cudaMalloc(dBuffer, bufferSize))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csc,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta) {
+  // hAx = alpha * Acsc * hX + beta * hAx
+
+  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecX, &beta, vecAx,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csr,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta) {
+  // hAx = alpha * Acsc * hX + beta * hAx
+
+  cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecX, &beta, vecAx,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csc,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta) {
+  // hATy = alpha * Acsr^T * hY + beta * hATy
+  cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csc, vecY, &beta, vecATy,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csr,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta) {
+  // hATy = alpha * Acsr^T * hY + beta * hATy
+  cusparseOperation_t op = CUSPARSE_OPERATION_TRANSPOSE;
+  cudaDataType computeType = CUDA_R_32F;
+#ifndef SFLOAT
+  computeType = CUDA_R_64F;
+#endif
+
+  CHECK_CUSPARSE(cusparseSpMV(handle, op, &alpha, cuda_csr, vecY, &beta, vecATy,
+                              // computeType, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer))
+                              computeType, CUSPARSE_SPMV_CSR_ALG2, dBuffer))
+
+  return EXIT_SUCCESS;
+}
+
+extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub,
+                                       const cupdlp_int len) {
+  element_wise_projSameub_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, ub, len);
+}
+
+extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb,
+                                       const cupdlp_int len) {
+  element_wise_projSamelb_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, lb, len);
+}
+
+extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                   const cupdlp_int len) {
+  element_wise_projub_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, ub,
+                                                                        len);
+}
+
+extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                   const cupdlp_int len) {
+  element_wise_projlb_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, lb,
+                                                                        len);
+}
+
+extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len) {
+  element_wise_div_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, y, len);
+}
+
+extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len) {
+  element_wise_dot_kernel<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, y, len);
+}
+
+extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len) {
+  element_wise_initHaslb_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      haslb, lb, bound, len);
+}
+
+extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len) {
+  element_wise_initHasub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      hasub, ub, bound, len);
+}
+
+extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len) {
+  element_wise_filterlb_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, lb, bound, len);
+}
+
+extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len) {
+  element_wise_filterub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+      x, ub, bound, len);
+}
+
+extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val,
+                                    const cupdlp_int len) {
+  init_cuda_vec_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(x, val, len);
+}
+
+extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len) {
+    primal_grad_step_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+        xUpdate, x, cost, ATy, dPrimalStep, len);
+}
+
+extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b,
+    const cupdlp_float *Ax, const cupdlp_float *AxUpdate,
+    const cupdlp_float dDualStep, const cupdlp_int len) {
+      dual_grad_step_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(
+          yUpdate, y, b, Ax, AxUpdate, dDualStep, len);
+}
+
+extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x,
+                                  const cupdlp_float *y, const cupdlp_int len)
+{
+   naive_sub_kernal<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(z, x, y, len);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
new file mode 100644
index 0000000000..e0103b9228
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
@@ -0,0 +1,98 @@
+#ifndef CUPDLP_CUDA_LINALG_H
+#define CUPDLP_CUDA_LINALG_H
+
+#include <cublas_v2.h>         // cublas
+#include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cusparse.h>          // cusparseSpMV
+
+#include "cupdlp_cuda_kernels.cuh"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>   // printf
+#include <stdlib.h>  // EXIT_FAILURE
+
+// #include "../cupdlp_defs.h"
+// #include "../glbopts.h"
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+extern "C" cupdlp_int cuda_alloc_MVbuffer(
+    cusparseHandle_t handle, cusparseSpMatDescr_t cuda_csc,
+    cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecAx,
+    cusparseSpMatDescr_t cuda_csr, cusparseDnVecDescr_t vecY,
+    cusparseDnVecDescr_t vecATy, void **dBuffer);
+
+extern "C" cupdlp_int cuda_csc_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csc,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csr_Ax(cusparseHandle_t handle,
+                                  cusparseSpMatDescr_t cuda_csr,
+                                  cusparseDnVecDescr_t vecX,
+                                  cusparseDnVecDescr_t vecAx, void *dBuffer,
+                                  const cupdlp_float alpha,
+                                  const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csc_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csc,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta);
+extern "C" cupdlp_int cuda_csr_ATy(cusparseHandle_t handle,
+                                   cusparseSpMatDescr_t cuda_csr,
+                                   cusparseDnVecDescr_t vecY,
+                                   cusparseDnVecDescr_t vecATy, void *dBuffer,
+                                   const cupdlp_float alpha,
+                                   const cupdlp_float beta);
+
+extern "C" void cupdlp_projSameub_cuda(cupdlp_float *x, const cupdlp_float ub,
+                                       const cupdlp_int len);
+extern "C" void cupdlp_projSamelb_cuda(cupdlp_float *x, const cupdlp_float lb,
+                                       const cupdlp_int len);
+extern "C" void cupdlp_projub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                   const cupdlp_int len);
+extern "C" void cupdlp_projlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                   const cupdlp_int len);
+extern "C" void cupdlp_ediv_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len);
+extern "C" void cupdlp_edot_cuda(cupdlp_float *x, const cupdlp_float *y,
+                                 const cupdlp_int len);
+extern "C" void cupdlp_haslb_cuda(cupdlp_float *haslb, const cupdlp_float *lb,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len);
+extern "C" void cupdlp_hasub_cuda(cupdlp_float *hasub, const cupdlp_float *ub,
+                                  const cupdlp_float bound,
+                                  const cupdlp_int len);
+extern "C" void cupdlp_filterlb_cuda(cupdlp_float *x, const cupdlp_float *lb,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len);
+extern "C" void cupdlp_filterub_cuda(cupdlp_float *x, const cupdlp_float *ub,
+                                     const cupdlp_float bound,
+                                     const cupdlp_int len);
+extern "C" void cupdlp_initvec_cuda(cupdlp_float *x, const cupdlp_float val,
+                                    const cupdlp_int len);
+
+extern "C" void cupdlp_pgrad_cuda(cupdlp_float *xUpdate, const cupdlp_float *x,
+                                  const cupdlp_float *cost,
+                                  const cupdlp_float *ATy,
+                                  const cupdlp_float dPrimalStep,
+                                  const cupdlp_int len);
+
+extern "C" void cupdlp_dgrad_cuda(cupdlp_float *yUpdate, const cupdlp_float *y,
+                                  const cupdlp_float *b, const cupdlp_float *Ax,
+                                  const cupdlp_float *AxUpdate,
+                                  const cupdlp_float dDualStep,
+                                  const cupdlp_int len);
+
+extern "C" void cupdlp_sub_cuda(cupdlp_float *z, const cupdlp_float *x,
+                                const cupdlp_float *y, const cupdlp_int len);
+#endif
+#endif
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/test_cublas.c b/src/pdlp/cupdlp/cuda/test_cublas.c
new file mode 100644
index 0000000000..4e1a52b85e
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/test_cublas.c
@@ -0,0 +1,152 @@
+#include "cupdlp_cuda_kernels.cuh"
+#include "cupdlp_cudalinalg.cuh"
+
+void use_cublas(cublasHandle_t cublashandle) {
+  cupdlp_int len = 10;
+  // cupdlp_int len = 1<<10;
+
+  // int N = 1<<20;
+
+  // alloc and init host vec memory
+  cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  for (cupdlp_int i = 0; i < len; i++) {
+    h_vec1[i] = 1.0;
+    h_vec2[i] = i;
+    // h_vec1[i] = 1.0;
+    // h_vec2[i] = 2.0;
+  }
+
+  // alloc and init device vec memory
+  cupdlp_float *d_vec1;
+  cupdlp_float *d_vec2;
+  cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float));
+  // cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyHostToDevice);
+
+  cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  // init cublas handle
+  // cublasHandle_t cublashandle;
+  // CHECK_CUBLAS(cublasCreate(&cublashandle));
+
+  cupdlp_float result;
+  // call nrm2
+  CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result));
+
+  // print result
+  printf("2-norm is :%f\n", result);
+
+  // copy result back to host
+  // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaError_t errSync = cudaGetLastError();
+  // cudaError_t errAsync = cudaDeviceSynchronize();
+  // if (errSync != cudaSuccess)
+  //     printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  // if (errAsync != cudaSuccess)
+  //     printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // // print result
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //     printf("%f\n", h_vec1[i]);
+  //     // printf("%f\n", h_vec2[i]);
+  // }
+
+  // destroy cublas handle
+  //  CHECK_CUBLAS(cublasDestroy(cublashandle));
+
+  // free memory
+  free(h_vec1);
+  free(h_vec2);
+  cudaFree(d_vec1);
+  cudaFree(d_vec2);
+}
+int main() {
+  // try cupdlp_edot_cuda
+
+  // int nDevices;
+
+  // cudaGetDeviceCount(&nDevices);
+  //    for (int i = 0; i < nDevices; i++) {
+  //        cudaDeviceProp prop;
+  //        cudaGetDeviceProperties(&prop, i);
+  //        printf("Device Number: %d\n", i);
+  //        printf("  Device name: %s\n", prop.name);
+  //        printf("  Memory Clock Rate (KHz): %d\n",
+  //               prop.memoryClockRate);
+  //        printf("  Memory Bus Width (bits): %d\n",
+  //               prop.memoryBusWidth);
+  //        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+  //               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8)
+  //               / 1.0e6);
+  //    }
+
+  // cupdlp_int len = 10;
+  // // cupdlp_int len = 1<<10;
+
+  // // int N = 1<<20;
+
+  // // alloc and init host vec memory
+  // cupdlp_float *h_vec1 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float));
+  // cupdlp_float *h_vec2 = (cupdlp_float *) malloc(len * sizeof(cupdlp_float));
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //     h_vec1[i] = 1.0;
+  //     h_vec2[i] = i;
+  //     // h_vec1[i] = 1.0;
+  //     // h_vec2[i] = 2.0;
+  // }
+
+  // // alloc and init device vec memory
+  // cupdlp_float *d_vec1;
+  // cupdlp_float *d_vec2;
+  // cudaMalloc((void **) &d_vec1, len * sizeof(cupdlp_float));
+  // // cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+  // //            cudaMemcpyHostToDevice);
+
+  // cudaMalloc((void **) &d_vec2, len * sizeof(cupdlp_float));
+  // cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyHostToDevice);
+
+  // init cublas handle
+  cublasHandle_t cublashandle;
+  CHECK_CUBLAS(cublasCreate(&cublashandle));
+  use_cublas(cublashandle);
+  // cupdlp_float result;
+  // call nrm2
+  // CHECK_CUBLAS(cublasDnrm2(cublashandle, len, d_vec1, 1, &result));
+
+  // print result
+  // printf("2-norm is :%f\n", result);
+
+  // copy result back to host
+  // cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+  //            cudaMemcpyDeviceToHost);
+  // cudaError_t errSync = cudaGetLastError();
+  // cudaError_t errAsync = cudaDeviceSynchronize();
+  // if (errSync != cudaSuccess)
+  //     printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  // if (errAsync != cudaSuccess)
+  //     printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // // print result
+  // for (cupdlp_int i = 0; i < len; i++) {
+  //     printf("%f\n", h_vec1[i]);
+  //     // printf("%f\n", h_vec2[i]);
+  // }
+
+  // destroy cublas handle
+  CHECK_CUBLAS(cublasDestroy(cublashandle));
+
+  // // free memory
+  // free(h_vec1);
+  // free(h_vec2);
+  // cudaFree(d_vec1);
+  // cudaFree(d_vec2);
+}
\ No newline at end of file
diff --git a/src/pdlp/cupdlp/cuda/test_cuda_linalg.c b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c
new file mode 100644
index 0000000000..ad19bbe701
--- /dev/null
+++ b/src/pdlp/cupdlp/cuda/test_cuda_linalg.c
@@ -0,0 +1,79 @@
+#include "cupdlp_cuda_kernels.cuh"
+#include "cupdlp_cudalinalg.cuh"
+
+int main() {
+  // try cupdlp_edot_cuda
+
+  int nDevices;
+
+  cudaGetDeviceCount(&nDevices);
+  //    for (int i = 0; i < nDevices; i++) {
+  //        cudaDeviceProp prop;
+  //        cudaGetDeviceProperties(&prop, i);
+  //        printf("Device Number: %d\n", i);
+  //        printf("  Device name: %s\n", prop.name);
+  //        printf("  Memory Clock Rate (KHz): %d\n",
+  //               prop.memoryClockRate);
+  //        printf("  Memory Bus Width (bits): %d\n",
+  //               prop.memoryBusWidth);
+  //        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
+  //               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8)
+  //               / 1.0e6);
+  //    }
+
+  cupdlp_int len = 10;
+  // cupdlp_int len = 1<<10;
+
+  // int N = 1<<20;
+
+  // alloc and init host vec memory
+  cupdlp_float *h_vec1 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  cupdlp_float *h_vec2 = (cupdlp_float *)malloc(len * sizeof(cupdlp_float));
+  for (cupdlp_int i = 0; i < len; i++) {
+    h_vec1[i] = i;
+    h_vec2[i] = i;
+    // h_vec1[i] = 1.0;
+    // h_vec2[i] = 2.0;
+  }
+
+  // alloc and init device vec memory
+  cupdlp_float *d_vec1;
+  cupdlp_float *d_vec2;
+  cudaMalloc((void **)&d_vec1, len * sizeof(cupdlp_float));
+  cudaMalloc((void **)&d_vec2, len * sizeof(cupdlp_float));
+  cudaMemcpy(d_vec1, h_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(d_vec2, h_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyHostToDevice);
+
+  // call cupdlp_edot_cuda
+  cupdlp_edot_cuda(d_vec1, d_vec2, len);
+  // element_wise_dot_kernel<<<(len+255)/256, 256>>>(d_vec1, d_vec2, len);
+  // saxpy<<<(len+255)/256, 256>>>(len, 2.0f, d_vec1, d_vec2);
+  // element_wise_dot<<<cuda_gridsize(len), CUPDLP_BLOCK_SIZE>>>(d_vec1, d_vec2,
+  // len);
+
+  // copy result back to host
+  cudaMemcpy(h_vec1, d_vec1, len * sizeof(cupdlp_float),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(h_vec2, d_vec2, len * sizeof(cupdlp_float),
+             cudaMemcpyDeviceToHost);
+  cudaError_t errSync = cudaGetLastError();
+  cudaError_t errAsync = cudaDeviceSynchronize();
+  if (errSync != cudaSuccess)
+    printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
+  if (errAsync != cudaSuccess)
+    printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
+
+  // print result
+  for (cupdlp_int i = 0; i < len; i++) {
+    printf("%f\n", h_vec1[i]);
+    // printf("%f\n", h_vec2[i]);
+  }
+
+  // free memory
+  free(h_vec1);
+  free(h_vec2);
+  cudaFree(d_vec1);
+  cudaFree(d_vec2);
+}
\ No newline at end of file