Skip to content

Commit

Permalink
feat(autoware_tensorrt_yolox): add GPU - CUDA device option (autoware…
Browse files Browse the repository at this point in the history
…foundation#8245)

* init CUDA device option

Signed-off-by: ismetatabay <[email protected]>

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Batuhan Beytekin <[email protected]>
  • Loading branch information
2 people authored and batuhanbeytekin committed Sep 2, 2024
1 parent 43694f0 commit 1a8038c
Show file tree
Hide file tree
Showing 11 changed files with 87 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@
profile_per_layer: false # If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose.
clip_value: 6.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration.
preprocess_on_gpu: true # If true, pre-processing is performed on GPU.
gpu_id: 0 # GPU ID to select CUDA Device
calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization.
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@
profile_per_layer: false # If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose.
clip_value: 0.0 # If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration.
preprocess_on_gpu: true # If true, pre-processing is performed on GPU.
gpu_id: 0 # GPU ID to select CUDA Device
calibration_image_list_path: "" # Path to a file which contains path to images. Those images will be used for int8 quantization.
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,27 @@ class TrtYoloX
const std::string & model_path, const std::string & precision, const int num_class = 8,
const float score_threshold = 0.3, const float nms_threshold = 0.7,
const tensorrt_common::BuildConfig build_config = tensorrt_common::BuildConfig(),
const bool use_gpu_preprocess = false, std::string calibration_image_list_file = std::string(),
const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "",
const bool use_gpu_preprocess = false, const uint8_t gpu_id = 0,
std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0,
[[maybe_unused]] const std::string & cache_dir = "",
const tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = "");
/**
* @brief Deconstruct TrtYoloX
*/
~TrtYoloX();

/**
 * @brief Select the CUDA device used for inference
 * @param[in] gpu_id GPU ID of the CUDA device to select
*/
bool setCudaDeviceId(const uint8_t gpu_id);

/**
 * @brief Return whether the CUDA device was initialized successfully
*/
bool isGPUInitialized() const { return is_gpu_initialized_; }

/**
* @brief run inference including pre-process and post-process
* @param[out] objects results for object detection
Expand Down Expand Up @@ -267,7 +279,7 @@ class TrtYoloX
size_t out_elem_num_per_batch_;
CudaUniquePtr<float[]> out_prob_d_;

StreamUniquePtr stream_{makeCudaStream()};
StreamUniquePtr stream_;

int32_t max_detections_;
std::vector<float> scales_;
Expand All @@ -280,6 +292,10 @@ class TrtYoloX

// flag whether preprocess are performed on GPU
bool use_gpu_preprocess_;
// GPU id for inference
const uint8_t gpu_id_;
// flag for gpu initialization
bool is_gpu_initialized_;
// host buffer for preprocessing on GPU
CudaUniquePtrHost<unsigned char[]> image_buf_h_;
// device buffer for preprocessing on GPU
Expand Down
Original file line number Diff line number Diff line change
@@ -1,69 +1,45 @@
<launch>
  <!--
    Multi-camera YOLOX launcher: starts up to eight yolox.launch.xml instances,
    one per image topic. image_number selects how many instances run; instance N
    subscribes to image_rawN, publishes to roisN, and runs on the CUDA device
    given by gpu_id_image_rawN (default 0), allowing detectors to be spread
    across multiple GPUs.
  -->
  <!-- Input image topics and the CUDA device each detector should use -->
<arg name="image_raw0" default="/image_raw0"/>
<arg name="gpu_id_image_raw0" default="0"/>

<arg name="image_raw1" default=""/>
<arg name="gpu_id_image_raw1" default="0"/>

<arg name="image_raw2" default=""/>
<arg name="gpu_id_image_raw2" default="0"/>

<arg name="image_raw3" default=""/>
<arg name="gpu_id_image_raw3" default="0"/>

<arg name="image_raw4" default=""/>
<arg name="gpu_id_image_raw4" default="0"/>

<arg name="image_raw5" default=""/>
<arg name="gpu_id_image_raw5" default="0"/>

<arg name="image_raw6" default=""/>
<arg name="gpu_id_image_raw6" default="0"/>

<arg name="image_raw7" default=""/>
<arg name="gpu_id_image_raw7" default="0"/>

  <!-- Number of detector instances to start (1-8) and the output topic base name -->
<arg name="image_number" default="1"/>
<arg name="output_topic" default="rois"/>

  <!-- Conditionally start one detector per requested image; instance N is
       included only when image_number >= N+1 (string comparison on the arg). -->
<include if="$(eval &quot;'$(var image_number)'>='1'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw0)"/>
<arg name="output/objects" value="rois0"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw0)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='2'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw1)"/>
<arg name="output/objects" value="rois1"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw1)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='3'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw2)"/>
<arg name="output/objects" value="rois2"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw2)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='4'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw3)"/>
<arg name="output/objects" value="rois3"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw3)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='5'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw4)"/>
<arg name="output/objects" value="rois4"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw4)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='6'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw5)"/>
<arg name="output/objects" value="rois5"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw5)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='7'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw6)"/>
<arg name="output/objects" value="rois6"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw6)"/>
</include>
<include if="$(eval &quot;'$(var image_number)'>='8'&quot;)" file="$(find-pkg-share autoware_tensorrt_yolox)/launch/yolox.launch.xml">
<arg name="input/image" value="$(var image_raw7)"/>
<arg name="output/objects" value="rois7"/>
<arg name="gpu_id" value="$(var gpu_id_image_raw7)"/>
</include>
</launch>
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@
"default": true,
"description": "If true, pre-processing is performed on GPU."
},
"gpu_id": {
"type": "integer",
"default": 0,
"description": "GPU ID for selecting CUDA device"
},
"calibration_image_list_path": {
"type": "string",
"default": "",
Expand All @@ -88,7 +93,8 @@
"quantize_last_layer",
"profile_per_layer",
"clip_value",
"preprocess_on_gpu"
"preprocess_on_gpu",
"gpu_id"
]
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@
"default": true,
"description": "If true, pre-processing is performed on GPU."
},
"gpu_id": {
"type": "integer",
"default": 0,
"description": "GPU ID for selecting CUDA device"
},
"calibration_image_list_path": {
"type": "string",
"default": "",
Expand All @@ -88,7 +93,8 @@
"quantize_last_layer",
"profile_per_layer",
"clip_value",
"preprocess_on_gpu"
"preprocess_on_gpu",
"gpu_id"
]
}
},
Expand Down
30 changes: 25 additions & 5 deletions perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,16 +156,25 @@ namespace autoware::tensorrt_yolox
TrtYoloX::TrtYoloX(
const std::string & model_path, const std::string & precision, const int num_class,
const float score_threshold, const float nms_threshold, tensorrt_common::BuildConfig build_config,
const bool use_gpu_preprocess, std::string calibration_image_list_path, const double norm_factor,
[[maybe_unused]] const std::string & cache_dir, const tensorrt_common::BatchConfig & batch_config,
const size_t max_workspace_size, const std::string & color_map_path)
const bool use_gpu_preprocess, const uint8_t gpu_id, std::string calibration_image_list_path,
const double norm_factor, [[maybe_unused]] const std::string & cache_dir,
const tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size,
const std::string & color_map_path)
: gpu_id_(gpu_id), is_gpu_initialized_(false)
{
if (!setCudaDeviceId(gpu_id_)) {
return;
}
is_gpu_initialized_ = true;

src_width_ = -1;
src_height_ = -1;
norm_factor_ = norm_factor;
batch_size_ = batch_config[2];
multitask_ = 0;
sematic_color_map_ = get_seg_colormap(color_map_path);
stream_ = makeCudaStream();

if (precision == "int8") {
if (build_config.clip_value <= 0.0) {
if (calibration_image_list_path.empty()) {
Expand Down Expand Up @@ -217,7 +226,6 @@ TrtYoloX::TrtYoloX(
calibrator.reset(
new tensorrt_yolox::Int8MinMaxCalibrator(stream, calibration_table, norm_factor_));
}

trt_common_ = std::make_unique<tensorrt_common::TrtCommon>(
model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config);
} else {
Expand Down Expand Up @@ -262,7 +270,6 @@ TrtYoloX::TrtYoloX(
throw std::runtime_error{s.str()};
*/
}

// GPU memory allocation
const auto input_dims = trt_common_->getBindingDimensions(0);
const auto input_size =
Expand Down Expand Up @@ -341,6 +348,16 @@ TrtYoloX::~TrtYoloX()
}
}

bool TrtYoloX::setCudaDeviceId(const uint8_t gpu_id)
{
cudaError_t cuda_err = cudaSetDevice(gpu_id);
if (cuda_err != cudaSuccess) {
return false;
} else {
return true;
}
}

void TrtYoloX::initPreprocessBuffer(int width, int height)
{
// if size of source input has been changed...
Expand Down Expand Up @@ -534,6 +551,9 @@ bool TrtYoloX::doInference(
const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
[[maybe_unused]] std::vector<cv::Mat> & color_masks)
{
if (!setCudaDeviceId(gpu_id_)) {
return false;
}
if (!trt_common_->isInitialized()) {
return false;
}
Expand Down
10 changes: 9 additions & 1 deletion perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
const bool preprocess_on_gpu = this->declare_parameter<bool>("preprocess_on_gpu");
const std::string calibration_image_list_path =
this->declare_parameter<std::string>("calibration_image_list_path");
const uint8_t gpu_id = this->declare_parameter<uint8_t>("gpu_id");

std::string color_map_path = this->declare_parameter<std::string>("color_map_path");

Expand Down Expand Up @@ -93,9 +94,16 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)

trt_yolox_ = std::make_unique<tensorrt_yolox::TrtYoloX>(
model_path, precision, label_map_.size(), score_threshold, nms_threshold, build_config,
preprocess_on_gpu, calibration_image_list_path, norm_factor, cache_dir, batch_config,
preprocess_on_gpu, gpu_id, calibration_image_list_path, norm_factor, cache_dir, batch_config,
max_workspace_size, color_map_path);

if (!trt_yolox_->isGPUInitialized()) {
RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id);
rclcpp::shutdown();
return;
}
RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id);

timer_ =
rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYoloXNode::onConnect, this));

Expand Down
15 changes: 8 additions & 7 deletions perception/autoware_traffic_light_fine_detector/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@ Based on the camera image and the global ROI array detected by `map_based_detect

### Node Parameters

| Name | Type | Default Value | Description |
| -------------------------- | ------ | --------------------------- | ------------------------------------------------------------------ |
| `data_path` | string | "$(env HOME)/autoware_data" | packages data and artifacts directory path |
| `fine_detector_model_path` | string | "" | The onnx file name for yolo model |
| `fine_detector_label_path` | string | "" | The label file with label names for detected objects written on it |
| `fine_detector_precision` | string | "fp32" | The inference mode: "fp32", "fp16" |
| `approximate_sync` | bool | false | Flag for whether to use approximate sync policy |
| Name | Type | Default Value | Description |
| -------------------------- | ------- | --------------------------- | ------------------------------------------------------------------ |
| `data_path` | string | "$(env HOME)/autoware_data" | packages data and artifacts directory path |
| `fine_detector_model_path` | string | "" | The onnx file name for yolo model |
| `fine_detector_label_path` | string | "" | The label file with label names for detected objects written on it |
| `fine_detector_precision` | string | "fp32" | The inference mode: "fp32", "fp16" |
| `approximate_sync` | bool | false | Flag for whether to use approximate sync policy |
| `gpu_id` | integer | 0 | ID for selecting the CUDA GPU device |

## Assumptions / Known limits

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
fine_detector_precision: fp16
fine_detector_score_thresh: 0.3
fine_detector_nms_thresh: 0.65
gpu_id: 0
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt
std::string model_path = declare_parameter("fine_detector_model_path", "");
std::string label_path = declare_parameter("fine_detector_label_path", "");
std::string precision = declare_parameter("fine_detector_precision", "fp32");
const uint8_t gpu_id = declare_parameter("gpu_id", 0);
// Objects with a score lower than this value will be ignored.
// This threshold will be ignored if specified model contains EfficientNMS_TRT module in it
score_thresh_ = declare_parameter("fine_detector_score_thresh", 0.3);
Expand All @@ -86,7 +87,14 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt

trt_yolox_ = std::make_unique<autoware::tensorrt_yolox::TrtYoloX>(
model_path, precision, num_class, score_thresh_, nms_threshold, build_config, cuda_preprocess,
calib_image_list, scale, cache_dir, batch_config);
gpu_id, calib_image_list, scale, cache_dir, batch_config);

if (!trt_yolox_->isGPUInitialized()) {
RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id);
rclcpp::shutdown();
return;
}
RCLCPP_INFO(this->get_logger(), "GPU %d is selected for the inference!", gpu_id);

using std::chrono_literals::operator""ms;
timer_ = rclcpp::create_timer(
Expand Down

0 comments on commit 1a8038c

Please sign in to comment.