diff --git a/perception/autoware_tensorrt_yolox/config/yolox_s_plus_opt.param.yaml b/perception/autoware_tensorrt_yolox/config/yolox_s_plus_opt.param.yaml index d963074e51840..df01e2cb6eec6 100644 --- a/perception/autoware_tensorrt_yolox/config/yolox_s_plus_opt.param.yaml +++ b/perception/autoware_tensorrt_yolox/config/yolox_s_plus_opt.param.yaml @@ -36,4 +36,5 @@ profile_per_layer: false # If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose. clip_value: 6.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration. preprocess_on_gpu: true # If true, pre-processing is performed on GPU. + gpu_id: 0 # GPU ID to select CUDA Device calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization. diff --git a/perception/autoware_tensorrt_yolox/config/yolox_tiny.param.yaml b/perception/autoware_tensorrt_yolox/config/yolox_tiny.param.yaml index e1ece63a4940f..e8b5b38503c9b 100644 --- a/perception/autoware_tensorrt_yolox/config/yolox_tiny.param.yaml +++ b/perception/autoware_tensorrt_yolox/config/yolox_tiny.param.yaml @@ -36,4 +36,5 @@ profile_per_layer: false # If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose. clip_value: 0.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration. preprocess_on_gpu: true # If true, pre-processing is performed on GPU. 
+ gpu_id: 0 # GPU ID to select CUDA Device calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization. diff --git a/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp b/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp index 6610476dc33fc..6fba801d1072b 100644 --- a/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp +++ b/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp @@ -89,8 +89,9 @@ class TrtYoloX const std::string & model_path, const std::string & precision, const int num_class = 8, const float score_threshold = 0.3, const float nms_threshold = 0.7, const tensorrt_common::BuildConfig build_config = tensorrt_common::BuildConfig(), - const bool use_gpu_preprocess = false, std::string calibration_image_list_file = std::string(), - const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "", + const bool use_gpu_preprocess = false, const uint8_t gpu_id = 0, + std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0, + [[maybe_unused]] const std::string & cache_dir = "", const tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = ""); /** @@ -98,6 +99,17 @@ class TrtYoloX */ ~TrtYoloX(); + /** + * @brief select the CUDA device for inference + * @param[in] gpu_id GPU ID + */ + bool setCudaDeviceId(const uint8_t gpu_id); + + /** + * @brief return a flag indicating whether the GPU was initialized successfully + */ + bool isGPUInitialized() const { return is_gpu_initialized_; } + /** * @brief run inference including pre-process and post-process * @param[out] objects results for object detection @@ -267,7 +279,7 @@ class TrtYoloX size_t out_elem_num_per_batch_; CudaUniquePtr out_prob_d_; - StreamUniquePtr stream_{makeCudaStream()}; + StreamUniquePtr stream_;
int32_t max_detections_; std::vector scales_; @@ -280,6 +292,10 @@ class TrtYoloX // flag whether preprocess are performed on GPU bool use_gpu_preprocess_; + // GPU id for inference + const uint8_t gpu_id_; + // flag for gpu initialization + bool is_gpu_initialized_; // host buffer for preprocessing on GPU CudaUniquePtrHost image_buf_h_; // device buffer for preprocessing on GPU diff --git a/perception/autoware_tensorrt_yolox/launch/multiple_yolox.launch.xml b/perception/autoware_tensorrt_yolox/launch/multiple_yolox.launch.xml index a039d25222490..a7952b12414b1 100644 --- a/perception/autoware_tensorrt_yolox/launch/multiple_yolox.launch.xml +++ b/perception/autoware_tensorrt_yolox/launch/multiple_yolox.launch.xml @@ -1,69 +1,45 @@ - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/perception/autoware_tensorrt_yolox/schema/yolox_s_plus_opt.schema.json b/perception/autoware_tensorrt_yolox/schema/yolox_s_plus_opt.schema.json index ce1ad6c2d0caf..a55158be80b7c 100644 --- a/perception/autoware_tensorrt_yolox/schema/yolox_s_plus_opt.schema.json +++ b/perception/autoware_tensorrt_yolox/schema/yolox_s_plus_opt.schema.json @@ -70,6 +70,11 @@ "default": true, "description": "If true, pre-processing is performed on GPU." }, + "gpu_id": { + "type": "integer", + "default": 0, + "description": "GPU ID for selecting CUDA device" + }, "calibration_image_list_path": { "type": "string", "default": "", @@ -88,7 +93,8 @@ "quantize_last_layer", "profile_per_layer", "clip_value", - "preprocess_on_gpu" + "preprocess_on_gpu", + "gpu_id" ] } }, diff --git a/perception/autoware_tensorrt_yolox/schema/yolox_tiny.schema.json b/perception/autoware_tensorrt_yolox/schema/yolox_tiny.schema.json index f47b28e47a3f8..707cd393d6828 100644 --- a/perception/autoware_tensorrt_yolox/schema/yolox_tiny.schema.json +++ b/perception/autoware_tensorrt_yolox/schema/yolox_tiny.schema.json @@ -70,6 +70,11 @@ "default": true, "description": "If true, pre-processing is performed on GPU." 
}, + "gpu_id": { + "type": "integer", + "default": 0, + "description": "GPU ID for selecting CUDA device" + }, "calibration_image_list_path": { "type": "string", "default": "", @@ -88,7 +93,8 @@ "quantize_last_layer", "profile_per_layer", "clip_value", - "preprocess_on_gpu" + "preprocess_on_gpu", + "gpu_id" ] } }, diff --git a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp index 675fee64812a2..b6da363a9a237 100644 --- a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp +++ b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp @@ -156,16 +156,25 @@ namespace autoware::tensorrt_yolox TrtYoloX::TrtYoloX( const std::string & model_path, const std::string & precision, const int num_class, const float score_threshold, const float nms_threshold, tensorrt_common::BuildConfig build_config, - const bool use_gpu_preprocess, std::string calibration_image_list_path, const double norm_factor, - [[maybe_unused]] const std::string & cache_dir, const tensorrt_common::BatchConfig & batch_config, - const size_t max_workspace_size, const std::string & color_map_path) + const bool use_gpu_preprocess, const uint8_t gpu_id, std::string calibration_image_list_path, + const double norm_factor, [[maybe_unused]] const std::string & cache_dir, + const tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, + const std::string & color_map_path) +: gpu_id_(gpu_id), is_gpu_initialized_(false) { + if (!setCudaDeviceId(gpu_id_)) { + return; + } + is_gpu_initialized_ = true; + src_width_ = -1; src_height_ = -1; norm_factor_ = norm_factor; batch_size_ = batch_config[2]; multitask_ = 0; sematic_color_map_ = get_seg_colormap(color_map_path); + stream_ = makeCudaStream(); + if (precision == "int8") { if (build_config.clip_value <= 0.0) { if (calibration_image_list_path.empty()) { @@ -217,7 +226,6 @@ TrtYoloX::TrtYoloX( calibrator.reset( new tensorrt_yolox::Int8MinMaxCalibrator(stream, 
calibration_table, norm_factor_)); } - trt_common_ = std::make_unique( model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config); } else { @@ -262,7 +270,6 @@ TrtYoloX::TrtYoloX( throw std::runtime_error{s.str()}; */ } - // GPU memory allocation const auto input_dims = trt_common_->getBindingDimensions(0); const auto input_size = @@ -341,6 +348,16 @@ TrtYoloX::~TrtYoloX() } } +bool TrtYoloX::setCudaDeviceId(const uint8_t gpu_id) +{ + cudaError_t cuda_err = cudaSetDevice(gpu_id); + if (cuda_err != cudaSuccess) { + return false; + } else { + return true; + } +} + void TrtYoloX::initPreprocessBuffer(int width, int height) { // if size of source input has been changed... @@ -534,6 +551,9 @@ bool TrtYoloX::doInference( const std::vector & images, ObjectArrays & objects, std::vector & masks, [[maybe_unused]] std::vector & color_masks) { + if (!setCudaDeviceId(gpu_id_)) { + return false; + } if (!trt_common_->isInitialized()) { return false; } diff --git a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp index c613e7d1df52f..2455411618ceb 100644 --- a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp +++ b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp @@ -57,6 +57,7 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) const bool preprocess_on_gpu = this->declare_parameter("preprocess_on_gpu"); const std::string calibration_image_list_path = this->declare_parameter("calibration_image_list_path"); + const uint8_t gpu_id = this->declare_parameter("gpu_id"); std::string color_map_path = this->declare_parameter("color_map_path"); @@ -93,9 +94,16 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) trt_yolox_ = std::make_unique( model_path, precision, label_map_.size(), score_threshold, nms_threshold, build_config, - preprocess_on_gpu, calibration_image_list_path, norm_factor, cache_dir, 
batch_config, + preprocess_on_gpu, gpu_id, calibration_image_list_path, norm_factor, cache_dir, batch_config, max_workspace_size, color_map_path); + if (!trt_yolox_->isGPUInitialized()) { + RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id); + rclcpp::shutdown(); + return; + } + RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id); + timer_ = rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYoloXNode::onConnect, this)); diff --git a/perception/autoware_traffic_light_fine_detector/README.md b/perception/autoware_traffic_light_fine_detector/README.md index 9e69c22fdc17b..35b55c84aa087 100644 --- a/perception/autoware_traffic_light_fine_detector/README.md +++ b/perception/autoware_traffic_light_fine_detector/README.md @@ -51,13 +51,14 @@ Based on the camera image and the global ROI array detected by `map_based_detect ### Node Parameters -| Name | Type | Default Value | Description | -| -------------------------- | ------ | --------------------------- | ------------------------------------------------------------------ | -| `data_path` | string | "$(env HOME)/autoware_data" | packages data and artifacts directory path | -| `fine_detector_model_path` | string | "" | The onnx file name for yolo model | -| `fine_detector_label_path` | string | "" | The label file with label names for detected objects written on it | -| `fine_detector_precision` | string | "fp32" | The inference mode: "fp32", "fp16" | -| `approximate_sync` | bool | false | Flag for whether to ues approximate sync policy | +| Name | Type | Default Value | Description | +| -------------------------- | ------- | --------------------------- | ------------------------------------------------------------------ | +| `data_path` | string | "$(env HOME)/autoware_data" | packages data and artifacts directory path | +| `fine_detector_model_path` | string | "" | The onnx file name for yolo model | +| `fine_detector_label_path` | string | "" | 
The label file with label names for detected objects written on it | +| `fine_detector_precision` | string | "fp32" | The inference mode: "fp32", "fp16" | +| `approximate_sync` | bool | false | Flag for whether to use approximate sync policy | +| `gpu_id` | integer | 0 | ID for selecting the CUDA GPU device | ## Assumptions / Known limits diff --git a/perception/autoware_traffic_light_fine_detector/config/traffic_light_fine_detector.param.yaml b/perception/autoware_traffic_light_fine_detector/config/traffic_light_fine_detector.param.yaml index 0265d0c4c7abb..36d4413472a0b 100644 --- a/perception/autoware_traffic_light_fine_detector/config/traffic_light_fine_detector.param.yaml +++ b/perception/autoware_traffic_light_fine_detector/config/traffic_light_fine_detector.param.yaml @@ -5,3 +5,4 @@ fine_detector_precision: fp16 fine_detector_score_thresh: 0.3 fine_detector_nms_thresh: 0.65 + gpu_id: 0 diff --git a/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp b/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp index fec26b762dfe9..5963efe611c00 100644 --- a/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp +++ b/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp @@ -60,6 +60,7 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt std::string model_path = declare_parameter("fine_detector_model_path", ""); std::string label_path = declare_parameter("fine_detector_label_path", ""); std::string precision = declare_parameter("fine_detector_precision", "fp32"); + const uint8_t gpu_id = declare_parameter("gpu_id", 0); // Objects with a score lower than this value will be ignored.
// This threshold will be ignored if specified model contains EfficientNMS_TRT module in it score_thresh_ = declare_parameter("fine_detector_score_thresh", 0.3); @@ -86,7 +87,14 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt trt_yolox_ = std::make_unique( model_path, precision, num_class, score_thresh_, nms_threshold, build_config, cuda_preprocess, - calib_image_list, scale, cache_dir, batch_config); + gpu_id, calib_image_list, scale, cache_dir, batch_config); + + if (!trt_yolox_->isGPUInitialized()) { + RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id); + rclcpp::shutdown(); + return; + } + RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id); using std::chrono_literals::operator""ms; timer_ = rclcpp::create_timer(