Skip to content

Commit

Permalink
feat(autoware_tensorrt_yolox): add GPU - CUDA device option (autoware…
Browse files Browse the repository at this point in the history
…foundation#8245)

* init CUDA device option

Signed-off-by: ismetatabay <[email protected]>

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Batuhan Beytekin <[email protected]>
  • Loading branch information
2 people authored and batuhanbeytekin committed Sep 2, 2024
1 parent 43694f0 commit 1a8038c
Show file tree
Hide file tree
Showing 11 changed files with 87 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@
profile_per_layer: false # If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose.
clip_value: 6.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration.
preprocess_on_gpu: true # If true, pre-processing is performed on GPU.
gpu_id: 0 # GPU ID to select CUDA Device
calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization.
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@
profile_per_layer: false # If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose.
clip_value: 0.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration.
preprocess_on_gpu: true # If true, pre-processing is performed on GPU.
gpu_id: 0 # GPU ID to select CUDA Device
calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization.
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,27 @@ class TrtYoloX
const std::string & model_path, const std::string & precision, const int num_class = 8,
const float score_threshold = 0.3, const float nms_threshold = 0.7,
const tensorrt_common::BuildConfig build_config = tensorrt_common::BuildConfig(),
const bool use_gpu_preprocess = false, std::string calibration_image_list_file = std::string(),
const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "",
const bool use_gpu_preprocess = false, const uint8_t gpu_id = 0,
std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0,
[[maybe_unused]] const std::string & cache_dir = "",
const tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = "");
/**
* @brief Deconstruct TrtYoloX
*/
~TrtYoloX();

/**
 * @brief Select the CUDA device used for inference
 * @param[in] gpu_id GPU ID of the CUDA device to select
*/
bool setCudaDeviceId(const uint8_t gpu_id);

/**
 * @brief Return whether the CUDA device was initialized successfully
*/
bool isGPUInitialized() const { return is_gpu_initialized_; }

/**
* @brief run inference including pre-process and post-process
* @param[out] objects results for object detection
Expand Down Expand Up @@ -267,7 +279,7 @@ class TrtYoloX
size_t out_elem_num_per_batch_;
CudaUniquePtr<float[]> out_prob_d_;

StreamUniquePtr stream_{makeCudaStream()};
StreamUniquePtr stream_;

int32_t max_detections_;
std::vector<float> scales_;
Expand All @@ -280,6 +292,10 @@ class TrtYoloX

// flag whether preprocess are performed on GPU
bool use_gpu_preprocess_;
// GPU id for inference
const uint8_t gpu_id_;
// flag for gpu initialization
bool is_gpu_initialized_;
// host buffer for preprocessing on GPU
CudaUniquePtrHost<unsigned char[]> image_buf_h_;
// device buffer for preprocessing on GPU
Expand Down
Original file line number Diff line number Diff line change
@@ -1,69 +1,45 @@
<launch>
  <!--
    Multi-camera YOLOX launcher: starts up to eight yolox.launch.xml instances,
    one per image topic. image_number selects how many instances run; instance N
    subscribes to image_rawN, publishes to roisN, and runs on the CUDA device
    given by gpu_id_image_rawN (default 0), allowing detectors to be spread
    across multiple GPUs.
  -->
  <!-- Input image topics and the CUDA device each detector should use -->
<arg name="image_raw0" default="/image_raw0"/>
<arg name="gpu_id_image_raw0" default="0"/>

<arg name="image_raw1" default=""/>
<arg name="gpu_id_image_raw1" default="0"/>

<arg name="image_raw2" default=""/>
<arg name="gpu_id_image_raw2" default="0"/>

<arg name="image_raw3" default=""/>
<arg name="gpu_id_image_raw3" default="0"/>

<arg name="image_raw4" default=""/>
<arg name="gpu_id_image_raw4" default="0"/>

<arg name="image_raw5" default=""/>
<arg name="gpu_id_image_raw5" default="0"/>

<arg name="image_raw6" default=""/>
<arg name="gpu_id_image_raw6" default="0"/>

<arg name="image_raw7" default=""/>
<arg name="gpu_id_image_raw7" default="0"/>

  <!-- Number of detector instances to start (1-8) and the output topic base name -->
<arg name="image_number" default="1"/>
<arg name="output_topic" default="rois"/>

  <!-- Conditionally start one detector per requested image; instance N is
       included only when image_number >= N+1 (string comparison on the arg). -->
<include if="$(eval &quot;'$(var image_number)'>='1'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw0)"/>
<arg name="output/objects" value="rois0"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw0)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='2'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw1)"/>
<arg name="output/objects" value="rois1"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw1)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='3'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw2)"/>
<arg name="output/objects" value="rois2"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw2)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='4'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw3)"/>
<arg name="output/objects" value="rois3"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw3)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='5'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw4)"/>
<arg name="output/objects" value="rois4"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw4)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='6'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw5)"/>
<arg name="output/objects" value="rois5"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw5)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='7'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw6)"/>
<arg name="output/objects" value="rois6"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw6)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='8'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw7)"/>
<arg name="output/objects" value="rois7"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw7)"/>
</include>
</launch>
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@
"default": true,
"description": "If true, pre-processing is performed on GPU."
},
"gpu_id": {
"type": "integer",
"default": 0,
"description": "GPU ID for selecting CUDA device"
},
"calibration_image_list_path": {
"type": "string",
"default": "",
Expand All @@ -88,7 +93,8 @@
"quantize_last_layer",
"profile_per_layer",
"clip_value",
"preprocess_on_gpu"
"preprocess_on_gpu",
"gpu_id"
]
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@
"default": true,
"description": "If true, pre-processing is performed on GPU."
},
"gpu_id": {
"type": "integer",
"default": 0,
"description": "GPU ID for selecting CUDA device"
},
"calibration_image_list_path": {
"type": "string",
"default": "",
Expand All @@ -88,7 +93,8 @@
"quantize_last_layer",
"profile_per_layer",
"clip_value",
"preprocess_on_gpu"
"preprocess_on_gpu",
"gpu_id"
]
}
},
Expand Down
30 changes: 25 additions & 5 deletions perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,16 +156,25 @@ namespace autoware::tensorrt_yolox
TrtYoloX::TrtYoloX(
const std::string & model_path, const std::string & precision, const int num_class,
const float score_threshold, const float nms_threshold, tensorrt_common::BuildConfig build_config,
const bool use_gpu_preprocess, std::string calibration_image_list_path, const double norm_factor,
[[maybe_unused]] const std::string & cache_dir, const tensorrt_common::BatchConfig & batch_config,
const size_t max_workspace_size, const std::string & color_map_path)
const bool use_gpu_preprocess, const uint8_t gpu_id, std::string calibration_image_list_path,
const double norm_factor, [[maybe_unused]] const std::string & cache_dir,
const tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size,
const std::string & color_map_path)
: gpu_id_(gpu_id), is_gpu_initialized_(false)
{
if (!setCudaDeviceId(gpu_id_)) {
return;
}
is_gpu_initialized_ = true;

src_width_ = -1;
src_height_ = -1;
norm_factor_ = norm_factor;
batch_size_ = batch_config[2];
multitask_ = 0;
sematic_color_map_ = get_seg_colormap(color_map_path);
stream_ = makeCudaStream();

if (precision == "int8") {
if (build_config.clip_value <= 0.0) {
if (calibration_image_list_path.empty()) {
Expand Down Expand Up @@ -217,7 +226,6 @@ TrtYoloX::TrtYoloX(
calibrator.reset(
new tensorrt_yolox::Int8MinMaxCalibrator(stream, calibration_table, norm_factor_));
}

trt_common_ = std::make_unique<tensorrt_common::TrtCommon>(
model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config);
} else {
Expand Down Expand Up @@ -262,7 +270,6 @@ TrtYoloX::TrtYoloX(
throw std::runtime_error{s.str()};
*/
}

// GPU memory allocation
const auto input_dims = trt_common_->getBindingDimensions(0);
const auto input_size =
Expand Down Expand Up @@ -341,6 +348,16 @@ TrtYoloX::~TrtYoloX()
}
}

bool TrtYoloX::setCudaDeviceId(const uint8_t gpu_id)
{
cudaError_t cuda_err = cudaSetDevice(gpu_id);
if (cuda_err != cudaSuccess) {
return false;
} else {
return true;
}
}

void TrtYoloX::initPreprocessBuffer(int width, int height)
{
// if size of source input has been changed...
Expand Down Expand Up @@ -534,6 +551,9 @@ bool TrtYoloX::doInference(
const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
[[maybe_unused]] std::vector<cv::Mat> & color_masks)
{
if (!setCudaDeviceId(gpu_id_)) {
return false;
}
if (!trt_common_->isInitialized()) {
return false;
}
Expand Down
10 changes: 9 additions & 1 deletion perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
const bool preprocess_on_gpu = this->declare_parameter<bool>("preprocess_on_gpu");
const std::string calibration_image_list_path =
this->declare_parameter<std::string>("calibration_image_list_path");
const uint8_t gpu_id = this->declare_parameter<uint8_t>("gpu_id");

std::string color_map_path = this->declare_parameter<std::string>("color_map_path");

Expand Down Expand Up @@ -93,9 +94,16 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)

trt_yolox_ = std::make_unique<tensorrt_yolox::TrtYoloX>(
model_path, precision, label_map_.size(), score_threshold, nms_threshold, build_config,
preprocess_on_gpu, calibration_image_list_path, norm_factor, cache_dir, batch_config,
preprocess_on_gpu, gpu_id, calibration_image_list_path, norm_factor, cache_dir, batch_config,
max_workspace_size, color_map_path);

if (!trt_yolox_->isGPUInitialized()) {
RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id);
rclcpp::shutdown();
return;
}
RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id);

timer_ =
rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYoloXNode::onConnect, this));

Expand Down
15 changes: 8 additions & 7 deletions perception/autoware_traffic_light_fine_detector/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@ Based on the camera image and the global ROI array detected by `map_based_detect

### Node Parameters

| Name | Type | Default Value | Description |
| -------------------------- | ------ | --------------------------- | ------------------------------------------------------------------ |
| `data_path` | string | "$(env HOME)/autoware_data" | packages data and artifacts directory path |
| `fine_detector_model_path` | string | "" | The onnx file name for yolo model |
| `fine_detector_label_path` | string | "" | The label file with label names for detected objects written on it |
| `fine_detector_precision` | string | "fp32" | The inference mode: "fp32", "fp16" |
| `approximate_sync` | bool | false | Flag for whether to use approximate sync policy |
| Name | Type | Default Value | Description |
| -------------------------- | ------- | --------------------------- | ------------------------------------------------------------------ |
| `data_path` | string | "$(env HOME)/autoware_data" | packages data and artifacts directory path |
| `fine_detector_model_path` | string | "" | The onnx file name for yolo model |
| `fine_detector_label_path` | string | "" | The label file with label names for detected objects written on it |
| `fine_detector_precision` | string | "fp32" | The inference mode: "fp32", "fp16" |
| `approximate_sync` | bool | false | Flag for whether to use approximate sync policy |
| `gpu_id` | integer | 0 | ID for selecting the CUDA GPU device |

## Assumptions / Known limits

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
fine_detector_precision: fp16
fine_detector_score_thresh: 0.3
fine_detector_nms_thresh: 0.65
gpu_id: 0
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt
std::string model_path = declare_parameter("fine_detector_model_path", "");
std::string label_path = declare_parameter("fine_detector_label_path", "");
std::string precision = declare_parameter("fine_detector_precision", "fp32");
const uint8_t gpu_id = declare_parameter("gpu_id", 0);
// Objects with a score lower than this value will be ignored.
// This threshold will be ignored if specified model contains EfficientNMS_TRT module in it
score_thresh_ = declare_parameter("fine_detector_score_thresh", 0.3);
Expand All @@ -86,7 +87,14 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt

trt_yolox_ = std::make_unique<autoware::tensorrt_yolox::TrtYoloX>(
model_path, precision, num_class, score_thresh_, nms_threshold, build_config, cuda_preprocess,
calib_image_list, scale, cache_dir, batch_config);
gpu_id, calib_image_list, scale, cache_dir, batch_config);

if (!trt_yolox_->isGPUInitialized()) {
RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id);
rclcpp::shutdown();
return;
}
RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id);

using std::chrono_literals::operator""ms;
timer_ = rclcpp::create_timer(
Expand Down

0 comments on commit 1a8038c

Please sign in to comment.