adding cuda parts of pdlp: part one

ERGO-Code · Aug 26, 2024 · 0ac0cec · 0ac0cec
1 parent 7ce96cb
commit 0ac0cec
Show file tree

Hide file tree

Showing 11 changed files with 902 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -74,6 +74,19 @@ endif()
 # emscripten
 option(EMSCRIPTEN_HTML "Emscripten HTML output" OFF)
 
+option(CUPDLP_GPU "Build pdlp with nvidia" OFF)
+message(STATUS "Build pdlp with nvidia: ${CUPDLP_GPU}")
+
+if (CUPDLP_GPU)
+    list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+    message(NOTICE "Set build with CUDA ${BUILD_CUDA}")
+    include(FindCUDAConf.cmake)
+    set(CUPLDP_CPU OFF)
+else()
+    set(CUPLDP_CPU ON)
+    set(CUDA_LIBRARY-NOTFOUND true)
+endif()
+
 if (BUILD_CXX)
   # Default Build Type to be Release
   get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)

diff --git a/FindCUDAConf.cmake b/FindCUDAConf.cmake
@@ -0,0 +1,33 @@
+
+set(CUDA_LIBRARY-NOTFOUND, OFF)
+message(NOTICE "Finding CUDA environment")
+message(NOTICE "    - CUDA Home detected at $ENV{CUDA_HOME}")
+set(CMAKE_CUDA_ARCHITECTURES "all")
+set(CMAKE_CUDA_PATH "$ENV{CUDA_HOME}")
+set(CMAKE_CUDA_COMPILER "${CMAKE_CUDA_PATH}/bin/nvcc")
+
+enable_language(CUDA)
+
+find_library(CUDA_LIBRARY_ART
+        NAMES cudart
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+find_library(CUDA_LIBRARY_SPS
+        NAMES cusparse
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+find_library(CUDA_LIBRARY_BLS
+        NAMES cublas
+        HINTS "${CMAKE_CUDA_PATH}/lib64/"
+        REQUIRED
+)
+if (${CUDA_LIBRARY-NOTFOUND})
+    message(WARNING "    - CUDA Libraries not detected at $ENV{CUDA_HOME}")
+else ()
+    message(NOTICE "    - CUDA Libraries detected at $ENV{CUDA_HOME}")
+    set(CUDA_LIBRARY ${CUDA_LIBRARY_ART} ${CUDA_LIBRARY_SPS} ${CUDA_LIBRARY_BLS})
+    message(NOTICE "    -   :${CUDA_LIBRARY}")
+endif ()
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -179,6 +179,11 @@ else()
   target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file})
   target_include_directories(highs PRIVATE ${include_dirs})
 
+  # Optional Cuda 
+  if (CUPDLP_GPU)
+    add_subdirectory(cupdlp)
+  endif()
+
   if(MSVC)
     list(APPEND highs_compile_opts
       "/bigobj" # Allow big object

diff --git a/src/pdlp/cupdlp/CMakeLists.txt b/src/pdlp/cupdlp/CMakeLists.txt
@@ -0,0 +1,14 @@
+if (${CUDA_LIBRARY-NOTFOUND})
+    message(NOTICE "- CPU version PDLP")
+#     target_compile_definitions(cupdlp
+#             PUBLIC
+#             -DCUPDLP_CPU
+#     )
+    target_link_libraries(highs m)
+else()
+    add_subdirectory(cuda)
+    message(NOTICE "- GPU version PDLP")
+    target_include_directories(highs PUBLIC "/usr/local/cuda/include")
+    target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY} m)
+    set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif ()
diff --git a/src/pdlp/cupdlp/cuda/CMakeLists.txt b/src/pdlp/cupdlp/cuda/CMakeLists.txt
@@ -0,0 +1,30 @@
+enable_language(CXX CUDA)
+
+add_library(cudalin SHARED
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cu
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cuda_kernels.cuh
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cuh
+       ${CUPDLP_INCLUDE_DIR}/cuda/cupdlp_cudalinalg.cu
+)
+
+set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_include_directories(cudalin PUBLIC "/usr/local/cuda/include")
+target_compile_definitions(cudalin
+        PUBLIC
+        # If the debug configuration pass the DEBUG define to the compiler
+        $<$<CONFIG:Debug>:HIGHS_DEBUG>
+)
+
+target_link_libraries(cudalin ${CUDA_LIBRARY} m)
+
+# add a test
+add_executable(testcudalin test_cuda_linalg.c)
+add_executable(testcublas test_cublas.c)
+
+set_target_properties(testcudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#target_include_directories(cudalinalg PRIVATE ${CUPDLP_INCLUDE_DIR}/cuda)
+target_link_libraries(testcudalin PRIVATE cudalin ${CUDA_LIBRARY})
+
+set_target_properties(testcublas PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(testcublas PRIVATE cudalin ${CUDA_LIBRARY})
+
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
@@ -0,0 +1,121 @@
+#include "cupdlp_cuda_kernels.cuh"
+
+dim3 cuda_gridsize(cupdlp_int n) {
+  cupdlp_int k = (n - 1) / CUPDLP_BLOCK_SIZE + 1;
+  cupdlp_int x = k;
+  cupdlp_int y = 1;
+  if (x > 65535) {
+    x = ceil(sqrt(k));
+    y = (n - 1) / (x * CUPDLP_BLOCK_SIZE) + 1;
+  }
+  dim3 d = {x, y, 1};
+  return d;
+}
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] *= y[i];
+}
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] /= y[i];
+}
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] < lb[i] ? lb[i] : x[i];
+}
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] > ub[i] ? ub[i] : x[i];
+}
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] <= lb ? lb : x[i];
+}
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = x[i] >= ub ? ub : x[i];
+}
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) haslb[i] = lb[i] > bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) hasub[i] = ub[i] < bound ? 1.0 : 0.0;
+}
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = lb[i] > bound ? lb[i] : 0.0;
+}
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = ub[i] < bound ? ub[i] : 0.0;
+}
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) x[i] = val;
+}
+
+//xUpdate = x - dPrimalStep * (cost - ATy)
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) xUpdate[i] = x[i] - dPrimalStep * (cost[i] - ATy[i]);
+}
+
+//yUpdate = y + dDualStep * (b -2AxUpdate + Ax)
+__global__ void dual_grad_step_kernal(cupdlp_float *yUpdate,
+                                      const cupdlp_float *y,
+                                      const cupdlp_float *b,
+                                      const cupdlp_float *Ax,
+                                      const cupdlp_float *AxUpdate,
+                                      const cupdlp_float dDualStep,
+                                      const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x; 
+  if (i < len) yUpdate[i] = y[i] + dDualStep * (b[i] - 2 * AxUpdate[i] + Ax[i]);
+}
+
+// z = x - y
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                  const cupdlp_float *y, const cupdlp_int len) {
+  cupdlp_int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < len) z[i] = x[i] - y[i];
+}
diff --git a/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh b/src/pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
@@ -0,0 +1,148 @@
+#ifndef CUPDLP_CUDA_KERNALS_H
+#define CUPDLP_CUDA_KERNALS_H
+
+#include "cuda_runtime.h"
+#define CUPDLP_BLOCK_SIZE 512
+
+#ifndef SFLOAT
+#ifdef DLONG
+typedef long long cupdlp_int;
+#else
+typedef int cupdlp_int;
+#endif
+typedef double cupdlp_float;
+#define CudaComputeType CUDA_R_64F
+#else
+#define CudaComputeType CUDA_R_32F
+#endif
+
+#define CHECK_CUDA(func)                                               \
+  {                                                                    \
+    cudaError_t status = (func);                                       \
+    if (status != cudaSuccess) {                                       \
+      printf("CUDA API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cudaGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                             \
+    }                                                                  \
+  }
+
+#define CHECK_CUSPARSE(func)                                               \
+  {                                                                        \
+    cusparseStatus_t status = (func);                                      \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
+      printf("CUSPARSE API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cusparseGetErrorString(status), status);  \
+      return EXIT_FAILURE;                                                 \
+    }                                                                      \
+  }
+
+#define CHECK_CUBLAS(func)                                               \
+  {                                                                      \
+    cublasStatus_t status = (func);                                      \
+    if (status != CUBLAS_STATUS_SUCCESS) {                               \
+      printf("CUBLAS API failed at line %d of %s with error: %s (%d)\n", \
+             __LINE__, __FILE__, cublasGetStatusString(status), status); \
+      return EXIT_FAILURE;                                               \
+    }                                                                    \
+  }
+
+#define CUPDLP_FREE_VEC(x) \
+  {                        \
+    cudaFree(x);           \
+    x = cupdlp_NULL;       \
+  }
+
+#define CUPDLP_COPY_VEC(dst, src, type, size) \
+  cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDefault)
+
+#define CUPDLP_INIT_VEC(var, size)                                             \
+  {                                                                            \
+    cusparseStatus_t status =                                                  \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));              \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
+      printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cusparseGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                       \
+    }                                                                          \
+  }
+#define CUPDLP_INIT_ZERO_VEC(var, size)                                        \
+  {                                                                            \
+    cusparseStatus_t status =                                                  \
+        cudaMalloc((void **)&var, (size) * sizeof(typeof(*var)));              \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
+      printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cusparseGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                       \
+    }                                                                          \
+    status = cudaMemset(var, 0, (size) * sizeof(typeof(*var)));                \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
+      printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \
+             cusparseGetErrorString(status), status);                          \
+      goto exit_cleanup;                                                       \
+    }                                                                          \
+  }
+#define CUPDLP_ZERO_VEC(var, type, size) \
+  cudaMemset(var, 0, sizeof(type) * (size))
+
+dim3 cuda_gridsize(cupdlp_int n);
+
+__global__ void element_wise_dot_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_div_kernel(cupdlp_float *x, const cupdlp_float *y,
+                                        const cupdlp_int len);
+
+__global__ void element_wise_projlb_kernel(cupdlp_float *x,
+                                           const cupdlp_float *lb,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projub_kernel(cupdlp_float *x,
+                                           const cupdlp_float *ub,
+                                           const cupdlp_int len);
+
+__global__ void element_wise_projSamelb_kernel(cupdlp_float *x,
+                                               const cupdlp_float lb,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_projSameub_kernel(cupdlp_float *x,
+                                               const cupdlp_float ub,
+                                               const cupdlp_int len);
+
+__global__ void element_wise_initHaslb_kernal(cupdlp_float *haslb,
+                                              const cupdlp_float *lb,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_initHasub_kernal(cupdlp_float *hasub,
+                                              const cupdlp_float *ub,
+                                              const cupdlp_float bound,
+                                              const cupdlp_int len);
+
+__global__ void element_wise_filterlb_kernal(cupdlp_float *x,
+                                             const cupdlp_float *lb,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void element_wise_filterub_kernal(cupdlp_float *x,
+                                             const cupdlp_float *ub,
+                                             const cupdlp_float bound,
+                                             const cupdlp_int len);
+
+__global__ void init_cuda_vec_kernal(cupdlp_float *x, const cupdlp_float val,
+                                     const cupdlp_int len);
+
+__global__ void primal_grad_step_kernal(cupdlp_float *xUpdate,
+                                        const cupdlp_float *x,
+                                        const cupdlp_float *cost,
+                                        const cupdlp_float *ATy,
+                                        const cupdlp_float dPrimalStep,
+                                        const cupdlp_int len);
+
+__global__ void dual_grad_step_kernal(
+    cupdlp_float *yUpdate, const cupdlp_float *y, const cupdlp_float *b,
+    const cupdlp_float *Ax, const cupdlp_float *AxUpdate,
+    const cupdlp_float dDualStep, const cupdlp_int len);
+
+__global__ void naive_sub_kernal(cupdlp_float *z, const cupdlp_float *x,
+                                 const cupdlp_float *y, const cupdlp_int len);
+#endif