diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 17ac4f3f94f57..3367eb9ad71b5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -236,7 +236,6 @@ system/mrm_comfortable_stop_operator/** makoto.kurihara@tier4.jp tomohito.ando@t system/mrm_emergency_stop_operator/** makoto.kurihara@tier4.jp tomohito.ando@tier4.jp system/mrm_handler/** makoto.kurihara@tier4.jp ryuta.kambe@tier4.jp tetsuhiro.kawaguchi@tier4.jp system/system_diagnostic_monitor/** isamu.takagi@tier4.jp -system/system_error_monitor/** fumihito.ito@tier4.jp system/system_monitor/** fumihito.ito@tier4.jp tetsuhiro.kawaguchi@tier4.jp system/topic_state_monitor/** ryohsuke.mitsudome@tier4.jp system/velodyne_monitor/** fumihito.ito@tier4.jp diff --git a/common/tier4_system_rviz_plugin/src/mrm_summary_overlay_display.cpp b/common/tier4_system_rviz_plugin/src/mrm_summary_overlay_display.cpp index c0db8cc86450b..b0ba99c98b154 100644 --- a/common/tier4_system_rviz_plugin/src/mrm_summary_overlay_display.cpp +++ b/common/tier4_system_rviz_plugin/src/mrm_summary_overlay_display.cpp @@ -74,7 +74,7 @@ void insertNewlines(std::string & str, const size_t max_line_length) std::optional generateMrmMessage(diagnostic_msgs::msg::DiagnosticStatus diag_status) { - if (diag_status.hardware_id == "" || diag_status.hardware_id == "system_error_monitor") { + if (diag_status.hardware_id == "") { return std::nullopt; } else if (diag_status.level == diagnostic_msgs::msg::DiagnosticStatus::ERROR) { std::string msg = "- ERROR: " + diag_status.name + " (" + diag_status.message + ")"; diff --git a/launch/tier4_system_launch/README.md b/launch/tier4_system_launch/README.md index 76cea5a575c5d..440843bf1c3f5 100644 --- a/launch/tier4_system_launch/README.md +++ b/launch/tier4_system_launch/README.md @@ -23,5 +23,3 @@ Note that you should provide parameter paths as `PACKAGE_param_path`. The list o ... ``` - -The sensing configuration parameters used in system_error_monitor are loaded from "config/diagnostic_aggregator/sensor_kit.param.yaml" in the "`SENSOR_MODEL`\_description" package. diff --git a/launch/tier4_system_launch/launch/system.launch.xml b/launch/tier4_system_launch/launch/system.launch.xml index 07f6ae8d4bdc2..19e5120d605e2 100644 --- a/launch/tier4_system_launch/launch/system.launch.xml +++ b/launch/tier4_system_launch/launch/system.launch.xml @@ -6,10 +6,6 @@ - - - - diff --git a/launch/tier4_system_launch/package.xml b/launch/tier4_system_launch/package.xml index 673596fad9972..881a65effe2c0 100644 --- a/launch/tier4_system_launch/package.xml +++ b/launch/tier4_system_launch/package.xml @@ -12,7 +12,6 @@ autoware_cmake component_state_monitor - system_error_monitor system_monitor ament_lint_auto diff --git a/system/system_error_monitor/CMakeLists.txt b/system/system_error_monitor/CMakeLists.txt deleted file mode 100644 index 91e9e20146327..0000000000000 --- a/system/system_error_monitor/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -cmake_minimum_required(VERSION 3.14) -project(system_error_monitor) - -find_package(autoware_cmake REQUIRED) -autoware_package() - -ament_auto_add_library(${PROJECT_NAME} SHARED - src/system_error_monitor_core.cpp -) - -rclcpp_components_register_node(${PROJECT_NAME} - PLUGIN "AutowareErrorMonitor" - EXECUTABLE ${PROJECT_NAME}_node) - -ament_auto_package(INSTALL_TO_SHARE - launch - config -) diff --git a/system/system_error_monitor/README.md b/system/system_error_monitor/README.md deleted file mode 100644 index e765239f30f4d..0000000000000 --- a/system/system_error_monitor/README.md +++ /dev/null @@ -1,145 +0,0 @@ -# system_error_monitor - -## Purpose - -Autoware Error Monitor has two main functions. - -1. It is to judge the system hazard level from the aggregated diagnostic information of each module of Autoware. -2. It enables automatic recovery from the emergency state. - -## Inner-workings / Algorithms - -### State Transition - -```plantuml -@startuml -hide empty description - -state "Not Emergency" as N -state "Emergency" as E - -[*] --> N - -N -down-> E : found critical errors -E -up-> N : emergency status is cleared - -note right of E - If the emergency hold is enabled, a service call is required to clear the emergency status. - If not, the emergency status will automatically clear when errors recover. -end note - -@enduml -``` - -### updateEmergencyHoldingCondition Flow Chart - -```plantuml -@startuml -title updateEmergencyHoldingCondition Flow Chart - - -(*) --> "Check emergency holding condition" -note right : emergency_holding is initialized with false - -if "" then - -->[false] "emergency is not held" - else - -->[true] "Check use emergency hold condition" -endif - -if "" then - -->[true] "emergency is held" - else - -->[false] "Check emergency condition" -endif - -if "" then - -->[false] "emergency is not held" - else - -->[true] "Check auto recovery setting" -endif - -if "" then - -->[not approved auto recovery] "emergency is held" - else - -->[approved auto recovery] "Check Emergency Duration" -endif - -if "" then - -->[within recovery timeout] "emergency is not held" - else - -->[over recovery timeout] "Check Control Mode" -endif - -if "" then - -->[autonomous driving] "emergency is held" - else - -->[manual driving] "Check use emergency hold in manual driving condition" -endif - -if "" then - -->[true] "emergency is held" - else - -->[false] "emergency is not held" -endif - -@enduml -``` - -## Inputs / Outputs - -### Input - -| Name | Type | Description | -| ---------------------------- | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | -| `/diagnostics_agg` | `diagnostic_msgs::msg::DiagnosticArray` | Diagnostic information aggregated based [diagnostic_aggregator setting](./config/diagnostic_aggregator) is used to | -| `/autoware/state` | `autoware_system_msgs::msg::AutowareState` | Required to ignore error during Route, Planning and Finalizing. | -| `/control/current_gate_mode` | `tier4_control_msgs::msg::GateMode` | Required to select the appropriate module from `autonomous_driving` or `external_control` | -| `/vehicle/control_mode` | `autoware_vehicle_msgs::msg::ControlModeReport` | Required to not hold emergency during manual driving | - -### Output - -| Name | Type | Description | -| --------------------------------- | ------------------------------------------------ | ------------------------------------------------------------------------------------ | -| `/system/emergency/hazard_status` | `autoware_system_msgs::msg::HazardStatusStamped` | HazardStatus contains system hazard level, emergency hold status and failure details | -| `/diagnostics_err` | `diagnostic_msgs::msg::DiagnosticArray` | This has the same contents as HazardStatus. This is used for visualization | - -## Parameters - -### Node Parameters - -| Name | Type | Default Value | Explanation | -| ---------------------------- | ------ | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ignore_missing_diagnostics` | bool | `false` | If this parameter is turned off, it will be ignored if required modules have not been received. | -| `add_leaf_diagnostics` | bool | `true` | Required to use children diagnostics. | -| `diag_timeout_sec` | double | `1.0` (sec) | If required diagnostic is not received for a `diag_timeout_sec`, the diagnostic state become STALE state. | -| `data_ready_timeout` | double | `30.0` | If input topics required for system_error_monitor are not available for `data_ready_timeout` seconds, autoware_state will translate to emergency state. | -| `data_heartbeat_timeout` | double | `1.0` | If input topics required for system_error_monitor are not no longer subscribed for `data_heartbeat_timeout` seconds, autoware_state will translate to emergency state. | - -### Core Parameters - -| Name | Type | Default Value | Explanation | -| -------------------------------------- | ------ | ------------- | ----------------------------------------------------------------------------------------------------- | -| `hazard_recovery_timeout` | double | `5.0` | The vehicle can recovery to normal driving if emergencies disappear during `hazard_recovery_timeout`. | -| `use_emergency_hold` | bool | `false` | If it is false, the vehicle will return to normal as soon as emergencies disappear. | -| `use_emergency_hold_in_manual_driving` | bool | `false` | If this parameter is turned off, emergencies will be ignored during manual driving. | -| `emergency_hazard_level` | int | `2` | If hazard_level is more than emergency_hazard_level, autoware state will translate to emergency state | - -### YAML format for system_error_monitor - -The parameter key should be filled with the hierarchical diagnostics output by diagnostic_aggregator. Parameters prefixed with `required_modules.autonomous_driving` are for autonomous driving. Parameters with the `required_modules.remote_control` prefix are for remote control. If the value is `default`, the default value will be set. - -| Key | Type | Default Value | Explanation | -| ------------------------------------------------------------------- | ------ | ------------- | ---------------------------------------------------------------------------------------------------------- | -| `required_modules.autonomous_driving.DIAGNOSTIC_NAME.sf_at` | string | `"none"` | Diagnostic level where it becomes Safe Fault. Available options are `"none"`, `"warn"`, `"error"`. | -| `required_modules.autonomous_driving.DIAGNOSTIC_NAME.lf_at` | string | `"warn"` | Diagnostic level where it becomes Latent Fault. Available options are `"none"`, `"warn"`, `"error"`. | -| `required_modules.autonomous_driving.DIAGNOSTIC_NAME.spf_at` | string | `"error"` | Diagnostic level where it becomes Single Point Fault. Available options are `"none"`, `"warn"`, `"error"`. | -| `required_modules.autonomous_driving.DIAGNOSTIC_NAME.auto_recovery` | string | `"true"` | Determines whether the system will automatically recover when it recovers from an error. | -| `required_modules.remote_control.DIAGNOSTIC_NAME.sf_at` | string | `"none"` | Diagnostic level where it becomes Safe Fault. Available options are `"none"`, `"warn"`, `"error"`. | -| `required_modules.remote_control.DIAGNOSTIC_NAME.lf_at` | string | `"warn"` | Diagnostic level where it becomes Latent Fault. Available options are `"none"`, `"warn"`, `"error"`. | -| `required_modules.remote_control.DIAGNOSTIC_NAME.spf_at` | string | `"error"` | Diagnostic level where it becomes Single Point Fault. Available options are `"none"`, `"warn"`, `"error"`. | -| `required_modules.remote_control.DIAGNOSTIC_NAME.auto_recovery` | string | `"true"` | Determines whether the system will automatically recover when it recovers from an error. | - -## Assumptions / Known limits - -TBD. diff --git a/system/system_error_monitor/config/diagnostic_aggregator/_empty.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/_empty.param.yaml deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/control.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/control.param.yaml deleted file mode 100644 index 9da5b14780dd1..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/control.param.yaml +++ /dev/null @@ -1,121 +0,0 @@ -/**: - ros__parameters: - control: - type: diagnostic_aggregator/AnalyzerGroup - path: control - analyzers: - autonomous_emergency_braking: - type: diagnostic_aggregator/AnalyzerGroup - path: autoware_autonomous_emergency_braking - analyzers: - performance_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: performance_monitoring - analyzers: - emergency_stop: - type: diagnostic_aggregator/GenericAnalyzer - path: emergency_stop - contains: [": aeb_emergency_stop"] - timeout: 1.0 - - control_command_gate: - type: diagnostic_aggregator/AnalyzerGroup - path: control_command_gate - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - heartbeat: - type: diagnostic_aggregator/GenericAnalyzer - path: heartbeat - contains: ["vehicle_cmd_gate: heartbeat"] - timeout: 1.0 - - autonomous_driving: - type: diagnostic_aggregator/AnalyzerGroup - path: autonomous_driving - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": control_topic_status"] - timeout: 1.0 - - performance_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: performance_monitoring - analyzers: - lane_departure: - type: diagnostic_aggregator/GenericAnalyzer - path: lane_departure - contains: [": lane_departure"] - timeout: 1.0 - - trajectory_deviation: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_deviation - contains: [": trajectory_deviation"] - timeout: 1.0 - - control_state: - type: diagnostic_aggregator/GenericAnalyzer - path: control_state - contains: [": control_state"] - timeout: 1.0 - - control_validator: - type: diagnostic_aggregator/GenericAnalyzer - path: autoware_control_validator - contains: [": control_validation_max_distance_deviation"] - timeout: 1.0 - - external_control: - type: diagnostic_aggregator/AnalyzerGroup - path: external_control - analyzers: - external_command_selector: - type: diagnostic_aggregator/AnalyzerGroup - path: external_command_selector - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - heartbeat: - type: diagnostic_aggregator/GenericAnalyzer - path: heartbeat - contains: ["external_cmd_selector: heartbeat"] - timeout: 1.0 - - local_external_control: - type: diagnostic_aggregator/AnalyzerGroup - path: local_external_control - analyzers: - device_connection: - type: diagnostic_aggregator/AnalyzerGroup - path: device_connection - analyzers: - joy_controller_connection: - type: diagnostic_aggregator/GenericAnalyzer - path: joy_controller_connection - contains: [": joy_controller_connection"] - timeout: 1.0 - - remote_external_control: - type: diagnostic_aggregator/AnalyzerGroup - path: remote_external_control - analyzers: - network_connection: - type: diagnostic_aggregator/AnalyzerGroup - path: network_connection - analyzers: - remote_control_topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: remote_control_topic_status - contains: [": remote_control_topic_status"] - timeout: 1.0 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/localization.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/localization.param.yaml deleted file mode 100644 index ff6a85d150ee8..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/localization.param.yaml +++ /dev/null @@ -1,47 +0,0 @@ -/**: - ros__parameters: - localization: - type: diagnostic_aggregator/AnalyzerGroup - path: localization - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": localization_topic_status"] - timeout: 1.0 - - performance_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: performance_monitoring - analyzers: - scan_matching_status: - type: diagnostic_aggregator/GenericAnalyzer - path: scan_matching_status - contains: ["ndt_scan_matcher: scan_matching_status"] - timeout: 1.0 - - localization_error_ellipse: - type: diagnostic_aggregator/GenericAnalyzer - path: localization_error_ellipse - contains: ["localization_error_monitor: ellipse_error_status"] - timeout: 1.0 - - localization_stability: - type: diagnostic_aggregator/GenericAnalyzer - path: localization_stability - contains: ["localization: pose_instability_detector"] - timeout: 1.0 - - # This diagnostic should ideally be avoided in terms of Fault Tree Analysis (FTA) compatibility. - # However, we may need this since the localization accuracy is still not reliable enough and may produce - # false positives. Thus, NOTE that this diagnostic should be removed in the future when the localization accuracy - # is reliable enough. - sensor_fusion_status: - type: diagnostic_aggregator/GenericAnalyzer - path: sensor_fusion_status - contains: ["localization: ekf_localizer"] - timeout: 1.0 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/map.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/map.param.yaml deleted file mode 100644 index fa218ca5f08fb..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/map.param.yaml +++ /dev/null @@ -1,22 +0,0 @@ -/**: - ros__parameters: - map: - type: diagnostic_aggregator/AnalyzerGroup - path: map - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": map_topic_status"] - timeout: 1.0 - - # TODO(Tier IV): Support this diagnostics - # route_validation: - # type: diagnostic_aggregator/GenericAnalyzer - # path: route_validation - # contains: [": route_validation"] - # timeout: 0.0 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/perception.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/perception.param.yaml deleted file mode 100644 index 1f897def54f7a..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/perception.param.yaml +++ /dev/null @@ -1,15 +0,0 @@ -/**: - ros__parameters: - perception: - type: diagnostic_aggregator/AnalyzerGroup - path: perception - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": perception_topic_status"] - timeout: 1.0 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/planning.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/planning.param.yaml deleted file mode 100644 index aad8a4ffffac4..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/planning.param.yaml +++ /dev/null @@ -1,95 +0,0 @@ -/**: - ros__parameters: - planning: - type: diagnostic_aggregator/AnalyzerGroup - path: planning - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": planning_topic_status"] - timeout: 1.0 - - performance_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: performance_monitoring - analyzers: - trajectory_validation: - type: diagnostic_aggregator/AnalyzerGroup - path: trajectory_validation - analyzers: - trajectory_size: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_size - contains: [": trajectory_validation_size"] - timeout: 1.0 - - trajectory_finite_value: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_finite - contains: [": trajectory_validation_finite"] - timeout: 1.0 - - trajectory_interval: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_interval - contains: [": trajectory_validation_interval"] - timeout: 1.0 - - trajectory_curvature: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_curvature - contains: [": trajectory_validation_curvature"] - timeout: 1.0 - - trajectory_sharp_angle: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_relative_angle - contains: [": trajectory_validation_relative_angle"] - timeout: 1.0 - - trajectory_relative_angle: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_relative_angle - contains: [": trajectory_validation_relative_angle"] - timeout: 1.0 - - trajectory_lateral_acceleration: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_lateral_acceleration - contains: [": trajectory_validation_lateral_acceleration"] - timeout: 1.0 - - trajectory_acceleration: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_acceleration - contains: [": trajectory_validation_acceleration"] - timeout: 1.0 - - trajectory_deceleration: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_deceleration - contains: [": trajectory_validation_deceleration"] - timeout: 1.0 - - trajectory_steering: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_steering - contains: [": trajectory_validation_steering"] - timeout: 1.0 - - trajectory_steering_rate: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_steering_rate - contains: [": trajectory_validation_steering_rate"] - timeout: 1.0 - - trajectory_velocity_deviation: - type: diagnostic_aggregator/GenericAnalyzer - path: trajectory_validation_velocity_deviation - contains: [": trajectory_validation_velocity_deviation"] - timeout: 1.0 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/sensing.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/sensing.param.yaml deleted file mode 100644 index 27dc4518d4f30..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/sensing.param.yaml +++ /dev/null @@ -1,2 +0,0 @@ -/**: - ros__parameters: {} diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml deleted file mode 100644 index f6e13012957aa..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ /dev/null @@ -1,249 +0,0 @@ -/**: - ros__parameters: - system: - type: diagnostic_aggregator/AnalyzerGroup - path: system - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": system_topic_status"] - timeout: 1.0 - - emergency_stop_operation: - type: diagnostic_aggregator/GenericAnalyzer - path: emergency_stop_operation - contains: [": emergency_stop_operation"] - timeout: 1.0 - - service_log_checker: - type: diagnostic_aggregator/GenericAnalyzer - path: service_log_checker - contains: ["service_log_checker"] - timeout: 5.0 - - duplicated_node_checker: - type: diagnostic_aggregator/GenericAnalyzer - path: duplicated_node_checker - contains: ["duplicated_node_checker"] - timeout: 5.0 - - resource_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: resource_monitoring - analyzers: - clock: - type: diagnostic_aggregator/AnalyzerGroup - path: clock - analyzers: - clock_offset: - type: diagnostic_aggregator/GenericAnalyzer - path: clock_offset - contains: [": NTP Offset"] - timeout: 10.0 - - cpu: - type: diagnostic_aggregator/AnalyzerGroup - path: cpu - analyzers: - temperature: - type: diagnostic_aggregator/GenericAnalyzer - path: temperature - contains: [": CPU Temperature"] - timeout: 3.0 - - usage: - type: diagnostic_aggregator/GenericAnalyzer - path: usage - contains: [": CPU Usage"] - timeout: 3.0 - - thermal_throttling: - type: diagnostic_aggregator/GenericAnalyzer - path: thermal_throttling - contains: [": CPU Thermal Throttling"] - timeout: 3.0 - - frequency: - type: diagnostic_aggregator/GenericAnalyzer - path: frequency - contains: [": CPU Frequency"] - timeout: 3.0 - - load_average: - type: diagnostic_aggregator/GenericAnalyzer - path: load_average - contains: [": CPU Load Average"] - timeout: 3.0 - - gpu: - type: diagnostic_aggregator/AnalyzerGroup - path: gpu - analyzers: - temperature: - type: diagnostic_aggregator/GenericAnalyzer - path: temperature - contains: [": GPU Temperature"] - timeout: 3.0 - - usage: - type: diagnostic_aggregator/GenericAnalyzer - path: gpu_usage - contains: [": GPU Usage"] - timeout: 3.0 - - memory_usage: - type: diagnostic_aggregator/GenericAnalyzer - path: memory_usage - contains: [": GPU Memory Usage"] - timeout: 3.0 - - thermal_throttling: - type: diagnostic_aggregator/GenericAnalyzer - path: thermal_throttling - contains: [": GPU Thermal Throttling"] - timeout: 3.0 - - frequency: - type: diagnostic_aggregator/GenericAnalyzer - path: frequency - contains: [": GPU Frequency"] - timeout: 3.0 - - memory: - type: diagnostic_aggregator/AnalyzerGroup - path: memory - analyzers: - usage: - type: diagnostic_aggregator/GenericAnalyzer - path: usage - contains: [": Memory Usage"] - timeout: 3.0 - - network: - type: diagnostic_aggregator/AnalyzerGroup - path: network - analyzers: - network_usage: - type: diagnostic_aggregator/GenericAnalyzer - path: network_usage - contains: [": Network Usage"] - timeout: 3.0 - - network_traffic: - type: diagnostic_aggregator/GenericAnalyzer - path: network_traffic - contains: [": Network Traffic"] - timeout: 3.0 - - network_crc_error: - type: diagnostic_aggregator/GenericAnalyzer - path: network_crc_error - contains: [": Network CRC Error"] - timeout: 3.0 - - ip_packet_reassembles_failed: - type: diagnostic_aggregator/GenericAnalyzer - path: ip_packet_reassembles_failed - contains: [": IP Packet Reassembles Failed"] - timeout: 3.0 - - storage: - type: diagnostic_aggregator/AnalyzerGroup - path: storage - analyzers: - temperature: - type: diagnostic_aggregator/GenericAnalyzer - path: temperature - contains: [": HDD Temperature"] - timeout: 3.0 - - recovered_error: - type: diagnostic_aggregator/GenericAnalyzer - path: recovered_error - contains: [": HDD RecoveredError"] - timeout: 3.0 - - read_data_rate: - type: diagnostic_aggregator/GenericAnalyzer - path: read_data_rate - contains: [": HDD ReadDataRate"] - timeout: 3.0 - - write_data_rate: - type: diagnostic_aggregator/GenericAnalyzer - path: write_data_rate - contains: [": HDD WriteDataRate"] - timeout: 3.0 - - read_iops: - type: diagnostic_aggregator/GenericAnalyzer - path: read_iops - contains: [": HDD ReadIOPS"] - timeout: 3.0 - - write_iops: - type: diagnostic_aggregator/GenericAnalyzer - path: write_iops - contains: [": HDD WriteIOPS"] - timeout: 3.0 - - usage: - type: diagnostic_aggregator/GenericAnalyzer - path: usage - contains: [": HDD Usage"] - timeout: 3.0 - - power_on_hours: - type: diagnostic_aggregator/GenericAnalyzer - path: power_on_hours - contains: [": HDD PowerOnHours"] - timeout: 3.0 - - total_data_written: - type: diagnostic_aggregator/GenericAnalyzer - path: total_data_written - contains: [": HDD TotalDataWritten"] - timeout: 3.0 - - connection: - type: diagnostic_aggregator/GenericAnalyzer - path: connection - contains: [": HDD Connection"] - timeout: 3.0 - - process: - type: diagnostic_aggregator/AnalyzerGroup - path: process - analyzers: - high_load: - type: diagnostic_aggregator/GenericAnalyzer - path: high_load - contains: [": High-load"] - timeout: 3.0 - - high_mem: - type: diagnostic_aggregator/GenericAnalyzer - path: high_mem - contains: [": High-mem"] - timeout: 3.0 - - tasks_summary: - type: diagnostic_aggregator/GenericAnalyzer - path: tasks_summary - contains: [": Tasks Summary"] - timeout: 3.0 - - hardware: - type: diagnostic_aggregator/AnalyzerGroup - path: voltage - analyzers: - cmos_battery: - type: diagnostic_aggregator/GenericAnalyzer - path: cmos_battery - contains: [": CMOS Battery Status"] - timeout: 3.0 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/vehicle.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/vehicle.param.yaml deleted file mode 100644 index 45ede1a4de000..0000000000000 --- a/system/system_error_monitor/config/diagnostic_aggregator/vehicle.param.yaml +++ /dev/null @@ -1,16 +0,0 @@ -/**: - ros__parameters: - vehicle: - type: diagnostic_aggregator/AnalyzerGroup - path: vehicle - analyzers: - node_alive_monitoring: - type: diagnostic_aggregator/AnalyzerGroup - path: node_alive_monitoring - analyzers: - # TODO(Tier IV): Consider splitting sensor input and control command output - topic_status: - type: diagnostic_aggregator/GenericAnalyzer - path: topic_status - contains: [": vehicle_topic_status"] - timeout: 1.0 diff --git a/system/system_error_monitor/config/system_error_monitor.param.yaml b/system/system_error_monitor/config/system_error_monitor.param.yaml deleted file mode 100644 index 63fdbd0081af6..0000000000000 --- a/system/system_error_monitor/config/system_error_monitor.param.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Description: -# name: diag name -# sf_at: diag level where it becomes Safe Fault -# lf_at: diag level where it becomes Latent Fault -# spf_at: diag level where it becomes Single Point Fault -# auto_recovery: Determines whether the system will automatically recover when it recovers from an error. -# -# Note: -# empty-value for sf_at, lf_at and spf_at is "none" -# default values are: -# sf_at: "none" -# lf_at: "warn" -# spf_at: "error" -# auto_recovery: "true" ---- -/**: - ros__parameters: - required_modules: - autonomous_driving: - /autoware/control/autonomous_driving/node_alive_monitoring: default - /autoware/control/autonomous_driving/performance_monitoring/lane_departure: default - /autoware/control/control_command_gate/node_alive_monitoring: default - /autoware/control/autoware_autonomous_emergency_braking/performance_monitoring/emergency_stop: { sf_at: "none", lf_at: "warn", spf_at: "error", auto_recovery: "false"} - - /autoware/localization/node_alive_monitoring: default - /autoware/localization/performance_monitoring/scan_matching_status: { sf_at: "warn", lf_at: "none", spf_at: "none" } - /autoware/localization/performance_monitoring/localization_error_ellipse: default - - /autoware/map/node_alive_monitoring: default - - /autoware/perception/node_alive_monitoring: default - - /autoware/planning/node_alive_monitoring: default - /autoware/planning/performance_monitoring/trajectory_validation: default - - # /autoware/sensing/node_alive_monitoring: default - - /autoware/system/node_alive_monitoring: default - /autoware/system/emergency_stop_operation: default - /autoware/system/service_log_checker: { sf_at: "warn", lf_at: "none", spf_at: "none" } - /autoware/system/resource_monitoring: { sf_at: "warn", lf_at: "none", spf_at: "none" } - /autoware/system/duplicated_node_checker: default - - /autoware/vehicle/node_alive_monitoring: default - - external_control: - /autoware/control/control_command_gate/node_alive_monitoring: default - /autoware/control/autoware_autonomous_emergency_braking/performance_monitoring/emergency_stop: { sf_at: "none", lf_at: "warn", spf_at: "error", auto_recovery: "false"} - /autoware/control/external_control/external_command_selector/node_alive_monitoring: default - - /autoware/system/node_alive_monitoring: default - /autoware/system/emergency_stop_operation: default - /autoware/system/service_log_checker: { sf_at: "warn", lf_at: "none", spf_at: "none" } - - /autoware/vehicle/node_alive_monitoring: default diff --git a/system/system_error_monitor/config/system_error_monitor.planning_simulation.param.yaml b/system/system_error_monitor/config/system_error_monitor.planning_simulation.param.yaml deleted file mode 100644 index c9ab89a8fce96..0000000000000 --- a/system/system_error_monitor/config/system_error_monitor.planning_simulation.param.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Description: -# name: diag name -# sf_at: diag level where it becomes Safe Fault -# lf_at: diag level where it becomes Latent Fault -# spf_at: diag level where it becomes Single Point Fault -# auto_recovery: Determines whether the system will automatically recover when it recovers from an error. -# -# Note: -# empty-value for sf_at, lf_at and spf_at is "none" -# default values are: -# sf_at: "none" -# lf_at: "warn" -# spf_at: "error" -# auto_recovery: "true" ---- -/**: - ros__parameters: - required_modules: - autonomous_driving: - /autoware/control/autonomous_driving/node_alive_monitoring: default - /autoware/control/autonomous_driving/performance_monitoring/lane_departure: default - /autoware/control/control_command_gate/node_alive_monitoring: default - /autoware/control/autoware_autonomous_emergency_braking/performance_monitoring/emergency_stop: { sf_at: "none", lf_at: "warn", spf_at: "error", auto_recovery: "false"} - - /autoware/localization/node_alive_monitoring: default - # /autoware/localization/performance_monitoring/scan_matching_status: { sf_at: "warn", lf_at: "none", spf_at: "none" } - # /autoware/localization/performance_monitoring/localization_error_ellipse: default - - /autoware/map/node_alive_monitoring: default - - /autoware/perception/node_alive_monitoring: default - - /autoware/planning/node_alive_monitoring: default - /autoware/planning/performance_monitoring/trajectory_validation: default - - # /autoware/sensing/node_alive_monitoring: default - - /autoware/system/node_alive_monitoring: default - /autoware/system/emergency_stop_operation: default - /autoware/system/service_log_checker: { sf_at: "warn", lf_at: "none", spf_at: "none" } - /autoware/system/duplicated_node_checker: default - # /autoware/system/resource_monitoring: { sf_at: "warn", lf_at: "error", spf_at: "none" } - # /autoware/system/duplicated_node_checker: default - - /autoware/vehicle/node_alive_monitoring: default - - external_control: - /autoware/control/control_command_gate/node_alive_monitoring: default - /autoware/control/autoware_autonomous_emergency_braking/performance_monitoring/emergency_stop: { sf_at: "none", lf_at: "warn", spf_at: "error", auto_recovery: "false"} - /autoware/control/external_control/external_command_selector/node_alive_monitoring: default - - /autoware/system/node_alive_monitoring: default - /autoware/system/emergency_stop_operation: default - /autoware/system/service_log_checker: { sf_at: "warn", lf_at: "none", spf_at: "none" } - - /autoware/vehicle/node_alive_monitoring: default diff --git a/system/system_error_monitor/include/system_error_monitor/diagnostics_filter.hpp b/system/system_error_monitor/include/system_error_monitor/diagnostics_filter.hpp deleted file mode 100644 index cbca3ed1c477c..0000000000000 --- a/system/system_error_monitor/include/system_error_monitor/diagnostics_filter.hpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2020 Tier IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SYSTEM_ERROR_MONITOR__DIAGNOSTICS_FILTER_HPP_ -#define SYSTEM_ERROR_MONITOR__DIAGNOSTICS_FILTER_HPP_ - -#include - -#include -#include -#include - -namespace diagnostics_filter -{ -inline std::string splitStringByLastSlash(const std::string & str) -{ - const auto last_slash = str.find_last_of("/"); - - if (last_slash == std::string::npos) { - return ""; - } - - return str.substr(0, last_slash); -} - -inline bool isChild( - const diagnostic_msgs::msg::DiagnosticStatus & child, - const diagnostic_msgs::msg::DiagnosticStatus & parent) -{ - auto name = splitStringByLastSlash(child.name); - while (name != "") { - if (name == parent.name) { - return true; - } - - name = splitStringByLastSlash(name); - } - - return false; -} - -inline bool isLeaf( - const std::unordered_set & diag_name_set, - const diagnostic_msgs::msg::DiagnosticStatus & diag) -{ - return diag_name_set.count(diag.name) == 0; -} - -inline std::unordered_set createDiagNameSet( - const std::vector & diagnostics) -{ - std::unordered_set diag_name_set; - - for (const auto & diag : diagnostics) { - diag_name_set.insert(splitStringByLastSlash(diag.name)); - } - - return diag_name_set; -} - -inline std::vector extractLeafDiagnostics( - const std::vector & diagnostics) -{ - const auto diag_name_set = createDiagNameSet(diagnostics); - - std::vector leaf_diagnostics; - for (const auto & diag : diagnostics) { - if (isLeaf(diag_name_set, diag)) { - leaf_diagnostics.emplace_back(diag); - } - } - - return leaf_diagnostics; -} - -inline std::vector extractLeafChildrenDiagnostics( - const diagnostic_msgs::msg::DiagnosticStatus & parent, - const std::vector & diagnostics) -{ - std::vector leaf_children_diagnostics; - for (const auto & diag : extractLeafDiagnostics(diagnostics)) { - if (isChild(diag, parent)) { - leaf_children_diagnostics.emplace_back(diag); - } - } - - return leaf_children_diagnostics; -} - -} // namespace diagnostics_filter - -#endif // SYSTEM_ERROR_MONITOR__DIAGNOSTICS_FILTER_HPP_ diff --git a/system/system_error_monitor/include/system_error_monitor/system_error_monitor_core.hpp b/system/system_error_monitor/include/system_error_monitor/system_error_monitor_core.hpp deleted file mode 100644 index dbb1db027b149..0000000000000 --- a/system/system_error_monitor/include/system_error_monitor/system_error_monitor_core.hpp +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2020 Tier IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SYSTEM_ERROR_MONITOR__SYSTEM_ERROR_MONITOR_CORE_HPP_ -#define SYSTEM_ERROR_MONITOR__SYSTEM_ERROR_MONITOR_CORE_HPP_ - -#include "autoware/universe_utils/ros/logger_level_configure.hpp" - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -struct DiagStamped -{ - std_msgs::msg::Header header; - diagnostic_msgs::msg::DiagnosticStatus status; -}; - -using DiagBuffer = std::deque; - -struct DiagConfig -{ - std::string name; - std::string sf_at; - std::string lf_at; - std::string spf_at; - bool auto_recovery; -}; - -using RequiredModules = std::vector; - -struct KeyName -{ - static constexpr const char * autonomous_driving = "autonomous_driving"; - static constexpr const char * external_control = "external_control"; -}; - -class AutowareErrorMonitor : public rclcpp::Node -{ -public: - explicit AutowareErrorMonitor(const rclcpp::NodeOptions & options); - -private: - // Parameter - struct Parameters - { - int update_rate; - bool ignore_missing_diagnostics; - bool add_leaf_diagnostics; - double data_ready_timeout; - double data_heartbeat_timeout; - double diag_timeout_sec; - double hazard_recovery_timeout; - int emergency_hazard_level; - bool use_emergency_hold; - bool use_emergency_hold_in_manual_driving; - }; - - Parameters params_{}; - - rclcpp::Time emergency_state_switch_time_; - rclcpp::Time initialized_time_; - autoware_system_msgs::msg::HazardStatus hazard_status_{}; - std::unordered_map required_modules_map_; - std::string current_mode_; - - void loadRequiredModules(const std::string & key); - - // Timer - rclcpp::TimerBase::SharedPtr timer_; - - bool isDataReady(); - bool isDataHeartbeatTimeout(); - void onTimer(); - - // Subscriber - rclcpp::Subscription::SharedPtr sub_diag_array_; - rclcpp::Subscription::SharedPtr sub_autoware_state_; - rclcpp::Subscription::SharedPtr sub_current_gate_mode_; - rclcpp::Subscription::SharedPtr sub_control_mode_; - void onAutowareState(const autoware_system_msgs::msg::AutowareState::ConstSharedPtr msg); - void onCurrentGateMode(const tier4_control_msgs::msg::GateMode::ConstSharedPtr msg); - void onControlMode(const autoware_vehicle_msgs::msg::ControlModeReport::ConstSharedPtr msg); - void onDiagArray(const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr msg); - - const size_t diag_buffer_size_ = 100; - std::unordered_map diag_buffer_map_; - diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr diag_array_; - autoware_system_msgs::msg::AutowareState::ConstSharedPtr autoware_state_; - tier4_control_msgs::msg::GateMode::ConstSharedPtr current_gate_mode_; - autoware_vehicle_msgs::msg::ControlModeReport::ConstSharedPtr control_mode_; - - // Publisher - rclcpp::Publisher::SharedPtr pub_hazard_status_; - rclcpp::Publisher::SharedPtr pub_diagnostics_err_; - void publishHazardStatus(const autoware_system_msgs::msg::HazardStatus & hazard_status); - - // Service - rclcpp::Service::SharedPtr srv_clear_emergency_; - bool onClearEmergencyService( - [[maybe_unused]] std_srvs::srv::Trigger::Request::SharedPtr request, - std_srvs::srv::Trigger::Response::SharedPtr response); - - // for Heartbeat - rclcpp::Time diag_array_stamp_; - rclcpp::Time autoware_state_stamp_; - rclcpp::Time current_gate_mode_stamp_; - rclcpp::Time control_mode_stamp_; - - // Algorithm - boost::optional getLatestDiag(const std::string & diag_name) const; - uint8_t getHazardLevel(const DiagConfig & required_module, const int diag_level) const; - void appendHazardDiag( - const DiagConfig & required_module, const diagnostic_msgs::msg::DiagnosticStatus & diag, - autoware_system_msgs::msg::HazardStatus * hazard_status) const; - autoware_system_msgs::msg::HazardStatus judgeHazardStatus() const; - void updateHazardStatus(); - void updateTimeoutHazardStatus(); - bool canAutoRecovery() const; - bool isEmergencyHoldingRequired() const; - - void loggingErrors(const autoware_system_msgs::msg::HazardStatus & diag_array); - - std::unique_ptr logger_configure_; -}; - -#endif // SYSTEM_ERROR_MONITOR__SYSTEM_ERROR_MONITOR_CORE_HPP_ diff --git a/system/system_error_monitor/launch/system_error_monitor.launch.xml b/system/system_error_monitor/launch/system_error_monitor.launch.xml deleted file mode 100644 index 9eb805b0eb2a6..0000000000000 --- a/system/system_error_monitor/launch/system_error_monitor.launch.xml +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/system/system_error_monitor/launch/system_error_monitor_node.launch.xml b/system/system_error_monitor/launch/system_error_monitor_node.launch.xml deleted file mode 100644 index f5ed927c7d6ca..0000000000000 --- a/system/system_error_monitor/launch/system_error_monitor_node.launch.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/system/system_error_monitor/package.xml b/system/system_error_monitor/package.xml deleted file mode 100644 index 2dda4e462b98b..0000000000000 --- a/system/system_error_monitor/package.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - system_error_monitor - 0.1.1 - The system_error_monitor package in ROS 2 - Fumihito Ito - Apache License 2.0 - - ament_cmake_auto - autoware_cmake - - autoware_system_msgs - autoware_universe_utils - autoware_vehicle_msgs - diagnostic_msgs - fmt - rclcpp - rclcpp_components - std_srvs - tier4_control_msgs - - diagnostic_aggregator - rqt_robot_monitor - - ament_lint_auto - autoware_lint_common - - - ament_cmake - - diff --git a/system/system_error_monitor/src/system_error_monitor_core.cpp b/system/system_error_monitor/src/system_error_monitor_core.cpp deleted file mode 100644 index ef892bd01ff50..0000000000000 --- a/system/system_error_monitor/src/system_error_monitor_core.cpp +++ /dev/null @@ -1,722 +0,0 @@ -// Copyright 2020 Tier IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#define FMT_HEADER_ONLY -#include "system_error_monitor/diagnostics_filter.hpp" -#include "system_error_monitor/system_error_monitor_core.hpp" - -#include - -namespace -{ -enum class DebugLevel { DEBUG, INFO, WARN, ERROR, FATAL }; - -template -void logThrottledNamed( - const std::string & logger_name, const rclcpp::Clock::SharedPtr clock, const double duration_ms, - const std::string & message) -{ - static std::unordered_map last_output_time; - if (last_output_time.count(logger_name) != 0) { - const auto time_from_last_output = clock->now() - last_output_time.at(logger_name); - if (time_from_last_output.seconds() * 1000.0 < duration_ms) { - return; - } - } - - last_output_time[logger_name] = clock->now(); - if constexpr (debug_level == DebugLevel::DEBUG) { - RCLCPP_DEBUG(rclcpp::get_logger(logger_name), message.c_str()); - } else if constexpr (debug_level == DebugLevel::INFO) { - RCLCPP_INFO(rclcpp::get_logger(logger_name), message.c_str()); - } else if constexpr (debug_level == DebugLevel::WARN) { - RCLCPP_WARN(rclcpp::get_logger(logger_name), message.c_str()); - } else if constexpr (debug_level == DebugLevel::ERROR) { - RCLCPP_ERROR(rclcpp::get_logger(logger_name), message.c_str()); - } else if constexpr (debug_level == DebugLevel::FATAL) { - RCLCPP_FATAL(rclcpp::get_logger(logger_name), message.c_str()); - } -} - -std::vector split(const std::string & str, const char delim) -{ - std::vector elems; - std::stringstream ss(str); - std::string item; - while (std::getline(ss, item, delim)) { - elems.push_back(item); - } - return elems; -} - -int str2level(const std::string & level_str) -{ - using diagnostic_msgs::msg::DiagnosticStatus; - using std::regex_constants::icase; - - if (std::regex_match(level_str, std::regex("warn", icase))) { - return DiagnosticStatus::WARN; - } - if (std::regex_match(level_str, std::regex("error", icase))) { - return DiagnosticStatus::ERROR; - } - if (std::regex_match(level_str, std::regex("stale", icase))) { - return DiagnosticStatus::STALE; - } - - throw std::runtime_error(fmt::format("invalid level: {}", level_str)); -} - -bool isOverLevel(const int & diag_level, const std::string & failure_level_str) -{ - if (failure_level_str == "none") { - return false; - } - - return diag_level >= str2level(failure_level_str); -} - -std::vector & getTargetDiagnosticsRef( - const int hazard_level, autoware_system_msgs::msg::HazardStatus * hazard_status) -{ - using autoware_system_msgs::msg::HazardStatus; - - if (hazard_level == HazardStatus::NO_FAULT) { - return hazard_status->diag_no_fault; - } - if (hazard_level == HazardStatus::SAFE_FAULT) { - return hazard_status->diag_safe_fault; - } - if (hazard_level == HazardStatus::LATENT_FAULT) { - return hazard_status->diag_latent_fault; - } - if (hazard_level == HazardStatus::SINGLE_POINT_FAULT) { - return hazard_status->diag_single_point_fault; - } - - throw std::runtime_error(fmt::format("invalid hazard level: {}", hazard_level)); -} - -diagnostic_msgs::msg::DiagnosticArray convertHazardStatusToDiagnosticArray( - rclcpp::Clock::SharedPtr clock, const autoware_system_msgs::msg::HazardStatus & hazard_status) -{ - using diagnostic_msgs::msg::DiagnosticStatus; - - diagnostic_msgs::msg::DiagnosticArray diag_array; - diag_array.header.stamp = clock->now(); - - const auto decorateDiag = [](const auto & hazard_diag, const std::string & label) { - auto diag = hazard_diag; - - diag.message = label + diag.message; - - return diag; - }; - - for (const auto & hazard_diag : hazard_status.diag_no_fault) { - diag_array.status.push_back(decorateDiag(hazard_diag, "[No Fault]")); - } - for (const auto & hazard_diag : hazard_status.diag_safe_fault) { - diag_array.status.push_back(decorateDiag(hazard_diag, "[Safe Fault]")); - } - for (const auto & hazard_diag : hazard_status.diag_latent_fault) { - diag_array.status.push_back(decorateDiag(hazard_diag, "[Latent Fault]")); - } - for (const auto & hazard_diag : hazard_status.diag_single_point_fault) { - diag_array.status.push_back(decorateDiag(hazard_diag, "[Single Point Fault]")); - } - - return diag_array; -} - -std::set getErrorModules( - const autoware_system_msgs::msg::HazardStatus & hazard_status, const int emergency_hazard_level) -{ - std::set error_modules; - using autoware_system_msgs::msg::HazardStatus; - if (emergency_hazard_level <= HazardStatus::SINGLE_POINT_FAULT) { - for (const auto & diag_spf : hazard_status.diag_single_point_fault) { - error_modules.insert(diag_spf.name); - } - } - if (emergency_hazard_level <= HazardStatus::LATENT_FAULT) { - for (const auto & diag_lf : hazard_status.diag_latent_fault) { - error_modules.insert(diag_lf.name); - } - } - if (emergency_hazard_level <= HazardStatus::SAFE_FAULT) { - for (const auto & diag_sf : hazard_status.diag_safe_fault) { - error_modules.insert(diag_sf.name); - } - } - - return error_modules; -} - -int isInNoFaultCondition( - const autoware_system_msgs::msg::AutowareState & autoware_state, - const tier4_control_msgs::msg::GateMode & current_gate_mode) -{ - using autoware_system_msgs::msg::AutowareState; - using tier4_control_msgs::msg::GateMode; - - const auto is_in_autonomous_ignore_state = - (autoware_state.state == AutowareState::INITIALIZING) || - (autoware_state.state == AutowareState::WAITING_FOR_ROUTE) || - (autoware_state.state == AutowareState::PLANNING) || - (autoware_state.state == AutowareState::FINALIZING); - - if (current_gate_mode.data == GateMode::AUTO && is_in_autonomous_ignore_state) { - return true; - } - - const auto is_in_external_ignore_state = (autoware_state.state == AutowareState::INITIALIZING) || - (autoware_state.state == AutowareState::FINALIZING); - - if (current_gate_mode.data == GateMode::EXTERNAL && is_in_external_ignore_state) { - return true; - } - - return false; -} -} // namespace - -AutowareErrorMonitor::AutowareErrorMonitor(const rclcpp::NodeOptions & options) -: Node( - "system_error_monitor", - rclcpp::NodeOptions(options).automatically_declare_parameters_from_overrides(true)), - diag_array_stamp_(0, 0, this->get_clock()->get_clock_type()), - autoware_state_stamp_(0, 0, this->get_clock()->get_clock_type()), - current_gate_mode_stamp_(0, 0, this->get_clock()->get_clock_type()), - control_mode_stamp_(0, 0, this->get_clock()->get_clock_type()) -{ - // Parameter - get_parameter_or("update_rate", params_.update_rate, 10); - get_parameter_or("ignore_missing_diagnostics", params_.ignore_missing_diagnostics, false); - get_parameter_or("add_leaf_diagnostics", params_.add_leaf_diagnostics, true); - get_parameter_or("data_ready_timeout", params_.data_ready_timeout, 30.0); - get_parameter_or("data_heartbeat_timeout", params_.data_heartbeat_timeout, 1.0); - get_parameter_or("diag_timeout_sec", params_.diag_timeout_sec, 1.0); - get_parameter_or("hazard_recovery_timeout", params_.hazard_recovery_timeout, 5.0); - get_parameter_or( - "emergency_hazard_level", params_.emergency_hazard_level, - autoware_system_msgs::msg::HazardStatus::LATENT_FAULT); - get_parameter_or("use_emergency_hold", params_.use_emergency_hold, false); - get_parameter_or( - "use_emergency_hold_in_manual_driving", params_.use_emergency_hold_in_manual_driving, false); - - loadRequiredModules(KeyName::autonomous_driving); - loadRequiredModules(KeyName::external_control); - - using std::placeholders::_1; - using std::placeholders::_2; - // Subscriber - sub_diag_array_ = create_subscription( - "input/diag_array", rclcpp::QoS{1}, std::bind(&AutowareErrorMonitor::onDiagArray, this, _1)); - sub_current_gate_mode_ = create_subscription( - "~/input/current_gate_mode", rclcpp::QoS{1}, - std::bind(&AutowareErrorMonitor::onCurrentGateMode, this, _1)); - sub_autoware_state_ = create_subscription( - "~/input/autoware_state", rclcpp::QoS{1}, - std::bind(&AutowareErrorMonitor::onAutowareState, this, _1)); - sub_control_mode_ = create_subscription( - "~/input/control_mode", rclcpp::QoS{1}, - std::bind(&AutowareErrorMonitor::onControlMode, this, _1)); - - // Publisher - pub_hazard_status_ = create_publisher( - "~/output/hazard_status", rclcpp::QoS{1}); - pub_diagnostics_err_ = create_publisher( - "~/output/diagnostics_err", rclcpp::QoS{1}); - - // Service - srv_clear_emergency_ = this->create_service( - "service/clear_emergency", - std::bind(&AutowareErrorMonitor::onClearEmergencyService, this, _1, _2)); - - // Initialize - autoware_vehicle_msgs::msg::ControlModeReport vehicle_state_report; - vehicle_state_report.mode = autoware_vehicle_msgs::msg::ControlModeReport::MANUAL; - control_mode_ = - std::make_shared(vehicle_state_report); - - // Timer - initialized_time_ = this->now(); - const auto period_ns = rclcpp::Rate(params_.update_rate).period(); - timer_ = rclcpp::create_timer( - this, get_clock(), period_ns, std::bind(&AutowareErrorMonitor::onTimer, this)); - - logger_configure_ = std::make_unique(this); -} - -void AutowareErrorMonitor::loadRequiredModules(const std::string & key) -{ - const uint64_t depth = 3; - const auto param_names = - this->list_parameters({std::string("required_modules.") + key}, depth).names; - - if (param_names.empty()) { - throw std::runtime_error(fmt::format("no parameter found: required_modules.{}", key)); - } - - // Load module names from parameter key - std::set module_names; - RequiredModules required_modules; - - for (const auto & param_name : param_names) { - // Example of param_name: required_modules.key.module - // or required_modules.key.module.parameter - const auto split_names = split(param_name, '.'); - const auto & param_required_modules = split_names.at(0); - const auto & param_key = split_names.at(1); - const auto & param_module = split_names.at(2); - const auto module_name_with_prefix = - fmt::format("{0}.{1}.{2}", param_required_modules, param_key, param_module); - - // Skip duplicate parameter - if (module_names.count(module_name_with_prefix) != 0) { - continue; - } - module_names.insert(module_name_with_prefix); - - // Load diag level - const auto sf_key = module_name_with_prefix + std::string(".sf_at"); - std::string sf_at; - this->get_parameter_or(sf_key, sf_at, std::string("none")); - - const auto lf_key = module_name_with_prefix + std::string(".lf_at"); - std::string lf_at; - this->get_parameter_or(lf_key, lf_at, std::string("warn")); - - const auto spf_key = module_name_with_prefix + std::string(".spf_at"); - std::string spf_at; - this->get_parameter_or(spf_key, spf_at, std::string("error")); - - // auto_recovery - const auto auto_recovery_key = module_name_with_prefix + std::string(".auto_recovery"); - std::string auto_recovery_approval_str; - this->get_parameter_or(auto_recovery_key, auto_recovery_approval_str, std::string("true")); - - // Convert auto_recovery_approval_str to bool - bool auto_recovery_approval{}; - std::istringstream(auto_recovery_approval_str) >> std::boolalpha >> auto_recovery_approval; - - required_modules.push_back({param_module, sf_at, lf_at, spf_at, auto_recovery_approval}); - } - - required_modules_map_.insert(std::make_pair(key, required_modules)); -} - -void AutowareErrorMonitor::onDiagArray( - const diagnostic_msgs::msg::DiagnosticArray::ConstSharedPtr msg) -{ - diag_array_ = msg; - - const auto & header = msg->header; - - for (const auto & diag : msg->status) { - if (diag_buffer_map_.count(diag.name) == 0) { - diag_buffer_map_.insert(std::make_pair(diag.name, DiagBuffer{})); - } - - auto & diag_buffer = diag_buffer_map_.at(diag.name); - diag_buffer.push_back(DiagStamped{header, diag}); - - while (diag_buffer.size() > diag_buffer_size_) { - diag_buffer.pop_front(); - } - } - - // for Heartbeat - diag_array_stamp_ = this->now(); -} - -void AutowareErrorMonitor::onCurrentGateMode( - const tier4_control_msgs::msg::GateMode::ConstSharedPtr msg) -{ - current_gate_mode_ = msg; - - // for Heartbeat - current_gate_mode_stamp_ = this->now(); -} - -void AutowareErrorMonitor::onAutowareState( - const autoware_system_msgs::msg::AutowareState::ConstSharedPtr msg) -{ - autoware_state_ = msg; - - // for Heartbeat - autoware_state_stamp_ = this->now(); -} - -void AutowareErrorMonitor::onControlMode( - const autoware_vehicle_msgs::msg::ControlModeReport::ConstSharedPtr msg) -{ - control_mode_ = msg; - - // for Heartbeat - control_mode_stamp_ = this->now(); -} - -bool AutowareErrorMonitor::isDataReady() -{ - if (!diag_array_) { - RCLCPP_INFO_THROTTLE(get_logger(), *get_clock(), 5000, "waiting for diag_array msg..."); - return false; - } - - if (!current_gate_mode_) { - RCLCPP_INFO_THROTTLE(get_logger(), *get_clock(), 5000, "waiting for current_gate_mode msg..."); - return false; - } - - if (!autoware_state_) { - RCLCPP_INFO_THROTTLE(get_logger(), *get_clock(), 5000, "waiting for autoware_state msg..."); - return false; - } - - if (!control_mode_) { - RCLCPP_INFO_THROTTLE( - get_logger(), *get_clock(), 5000, "waiting for vehicle_state_report msg..."); - return false; - } - return true; -} - -bool AutowareErrorMonitor::isDataHeartbeatTimeout() -{ - auto isTimeout = [this](const rclcpp::Time & last_stamp, const double threshold) { - if (last_stamp.get_clock_type() != this->now().get_clock_type()) { - RCLCPP_ERROR_THROTTLE(get_logger(), *get_clock(), 5000, "clock type is different..."); - return false; - } - const auto time_diff = this->now() - last_stamp; - return time_diff.seconds() > threshold; - }; - - if (isTimeout(diag_array_stamp_, params_.data_heartbeat_timeout)) { - RCLCPP_ERROR_THROTTLE(get_logger(), *get_clock(), 5000, "diag_array msg is timeout..."); - return true; - } - - if (isTimeout(current_gate_mode_stamp_, params_.data_heartbeat_timeout)) { - RCLCPP_ERROR_THROTTLE(get_logger(), *get_clock(), 5000, "current_gate_mode msg is timeout..."); - return true; - } - - if (isTimeout(autoware_state_stamp_, params_.data_heartbeat_timeout)) { - RCLCPP_ERROR_THROTTLE(get_logger(), *get_clock(), 5000, "autoware_state msg is timeout..."); - return true; - } - - if (isTimeout(control_mode_stamp_, params_.data_heartbeat_timeout)) { - RCLCPP_ERROR_THROTTLE( - get_logger(), *get_clock(), 5000, "vehicle_state_report msg is timeout..."); - return true; - } - - return false; -} - -void AutowareErrorMonitor::onTimer() -{ - if (!isDataReady()) { - if ((this->now() - initialized_time_).seconds() > params_.data_ready_timeout) { - RCLCPP_WARN_THROTTLE( - get_logger(), *get_clock(), std::chrono::milliseconds(5000).count(), - "input data is timeout"); - updateTimeoutHazardStatus(); - publishHazardStatus(hazard_status_); - } - return; - } - - if (isDataHeartbeatTimeout()) { - updateTimeoutHazardStatus(); - publishHazardStatus(hazard_status_); - return; - } - - current_mode_ = current_gate_mode_->data == tier4_control_msgs::msg::GateMode::AUTO - ? KeyName::autonomous_driving - : KeyName::external_control; - - updateHazardStatus(); - publishHazardStatus(hazard_status_); -} - -boost::optional AutowareErrorMonitor::getLatestDiag( - const std::string & diag_name) const -{ - if (diag_buffer_map_.count(diag_name) == 0) { - return {}; - } - - const auto & diag_buffer = diag_buffer_map_.at(diag_name); - - if (diag_buffer.empty()) { - return {}; - } - - return diag_buffer.back(); -} - -uint8_t AutowareErrorMonitor::getHazardLevel( - const DiagConfig & required_module, const int diag_level) const -{ - using autoware_system_msgs::msg::HazardStatus; - - if (isOverLevel(diag_level, required_module.spf_at)) { - return HazardStatus::SINGLE_POINT_FAULT; - } - if (isOverLevel(diag_level, required_module.lf_at)) { - return HazardStatus::LATENT_FAULT; - } - if (isOverLevel(diag_level, required_module.sf_at)) { - return HazardStatus::SAFE_FAULT; - } - - return HazardStatus::NO_FAULT; -} - -void AutowareErrorMonitor::appendHazardDiag( - const DiagConfig & required_module, const diagnostic_msgs::msg::DiagnosticStatus & hazard_diag, - autoware_system_msgs::msg::HazardStatus * hazard_status) const -{ - const auto hazard_level = getHazardLevel(required_module, hazard_diag.level); - - auto & target_diagnostics_ref = getTargetDiagnosticsRef(hazard_level, hazard_status); - target_diagnostics_ref.push_back(hazard_diag); - - if (params_.add_leaf_diagnostics) { - for (const auto & diag : - diagnostics_filter::extractLeafChildrenDiagnostics(hazard_diag, diag_array_->status)) { - target_diagnostics_ref.push_back(diag); - } - } - - hazard_status->level = std::max(hazard_status->level, hazard_level); -} - -autoware_system_msgs::msg::HazardStatus AutowareErrorMonitor::judgeHazardStatus() const -{ - using autoware_system_msgs::msg::HazardStatus; - using diagnostic_msgs::msg::DiagnosticStatus; - - autoware_system_msgs::msg::HazardStatus hazard_status; - for (const auto & required_module : required_modules_map_.at(current_mode_)) { - const auto & diag_name = required_module.name; - const auto latest_diag = getLatestDiag(diag_name); - - // no diag found - if (!latest_diag) { - if (!params_.ignore_missing_diagnostics) { - DiagnosticStatus missing_diag; - - missing_diag.name = diag_name; - missing_diag.hardware_id = "system_error_monitor"; - missing_diag.level = DiagnosticStatus::STALE; - missing_diag.message = "no diag found"; - - appendHazardDiag(required_module, missing_diag, &hazard_status); - } - - continue; - } - - // diag level high - { - appendHazardDiag(required_module, latest_diag->status, &hazard_status); - } - - // diag timeout - { - const auto time_diff = this->now() - latest_diag->header.stamp; - if (time_diff.seconds() > params_.diag_timeout_sec) { - DiagnosticStatus timeout_diag = latest_diag->status; - timeout_diag.level = DiagnosticStatus::STALE; - timeout_diag.message = "timeout"; - - appendHazardDiag(required_module, timeout_diag, &hazard_status); - } - } - } - - // Ignore error when vehicle is not ready to start - if (isInNoFaultCondition(*autoware_state_, *current_gate_mode_)) { - hazard_status.level = autoware_system_msgs::msg::HazardStatus::NO_FAULT; - } - - return hazard_status; -} - -void AutowareErrorMonitor::updateHazardStatus() -{ - const bool prev_emergency_status = hazard_status_.emergency; - - // Create hazard status based on diagnostics - if (!hazard_status_.emergency_holding) { - const auto current_hazard_status = judgeHazardStatus(); - hazard_status_.level = current_hazard_status.level; - hazard_status_.diag_no_fault = current_hazard_status.diag_no_fault; - hazard_status_.diag_safe_fault = current_hazard_status.diag_safe_fault; - hazard_status_.diag_latent_fault = current_hazard_status.diag_latent_fault; - hazard_status_.diag_single_point_fault = current_hazard_status.diag_single_point_fault; - } - - // Update emergency status - { - hazard_status_.emergency = hazard_status_.level >= params_.emergency_hazard_level; - if (hazard_status_.emergency != prev_emergency_status) { - emergency_state_switch_time_ = this->now(); - } - } - - // Update emergency_holding condition - if (params_.use_emergency_hold) { - hazard_status_.emergency_holding = isEmergencyHoldingRequired(); - } -} - -void AutowareErrorMonitor::updateTimeoutHazardStatus() -{ - const bool prev_emergency_status = hazard_status_.emergency; - - hazard_status_.level = autoware_system_msgs::msg::HazardStatus::SINGLE_POINT_FAULT; - hazard_status_.emergency = true; - - if (hazard_status_.emergency != prev_emergency_status) { - emergency_state_switch_time_ = this->now(); - } - - hazard_status_.diag_no_fault = std::vector(); - hazard_status_.diag_safe_fault = std::vector(); - hazard_status_.diag_latent_fault = std::vector(); - - diagnostic_msgs::msg::DiagnosticStatus diag; - diag.name = "system_error_monitor/input_data_timeout"; - diag.hardware_id = "system_error_monitor"; - diag.level = diagnostic_msgs::msg::DiagnosticStatus::ERROR; - hazard_status_.diag_single_point_fault = - std::vector{diag}; - - // Update emergency_holding condition - if (params_.use_emergency_hold) { - const auto emergency_duration = (this->now() - emergency_state_switch_time_).seconds(); - hazard_status_.emergency_holding = emergency_duration > params_.hazard_recovery_timeout; - } -} - -bool AutowareErrorMonitor::canAutoRecovery() const -{ - const auto error_modules = getErrorModules(hazard_status_, params_.emergency_hazard_level); - for (const auto & required_module : required_modules_map_.at(current_mode_)) { - if (required_module.auto_recovery) { - continue; - } - if (error_modules.count(required_module.name) != 0) { - return false; - } - } - return true; -} - -bool AutowareErrorMonitor::isEmergencyHoldingRequired() const -{ - // Does not change holding status until emergency_holding is cleared by service call - if (hazard_status_.emergency_holding) { - return true; - } - - if (!hazard_status_.emergency) { - return false; - } - - // Don't hold status if emergency duration within recovery timeout - const auto emergency_duration = (this->now() - emergency_state_switch_time_).seconds(); - const auto within_recovery_timeout = emergency_duration < params_.hazard_recovery_timeout; - if (within_recovery_timeout && canAutoRecovery()) { - return false; - } - - // Don't hold status during manual driving - const bool is_manual_driving = - (control_mode_->mode == autoware_vehicle_msgs::msg::ControlModeReport::MANUAL); - const auto no_hold_condition = - (!params_.use_emergency_hold_in_manual_driving && is_manual_driving); - if (no_hold_condition) { - return false; - } - - return true; -} - -void AutowareErrorMonitor::publishHazardStatus( - const autoware_system_msgs::msg::HazardStatus & hazard_status) -{ - autoware_system_msgs::msg::HazardStatusStamped hazard_status_stamped; - hazard_status_stamped.stamp = this->now(); - hazard_status_stamped.status = hazard_status; - pub_hazard_status_->publish(hazard_status_stamped); - - loggingErrors(hazard_status_stamped.status); - - pub_diagnostics_err_->publish( - convertHazardStatusToDiagnosticArray(this->get_clock(), hazard_status_stamped.status)); -} - -bool AutowareErrorMonitor::onClearEmergencyService( - [[maybe_unused]] std_srvs::srv::Trigger::Request::SharedPtr request, - std_srvs::srv::Trigger::Response::SharedPtr response) -{ - hazard_status_.emergency_holding = false; - updateHazardStatus(); - response->success = true; - response->message = "Emergency Holding state was cleared."; - - return true; -} - -void AutowareErrorMonitor::loggingErrors( - const autoware_system_msgs::msg::HazardStatus & hazard_status) -{ - if ( - autoware_state_ && current_gate_mode_ && - isInNoFaultCondition(*autoware_state_, *current_gate_mode_)) { - RCLCPP_DEBUG(get_logger(), "Autoware is in no-fault condition."); - return; - } - - for (const auto & hazard_diag : hazard_status.diag_latent_fault) { - const std::string logger_name = "system_error_monitor " + hazard_diag.name; - logThrottledNamed( - logger_name, get_clock(), 5000, "[Latent Fault]: " + hazard_diag.message); - } - for (const auto & hazard_diag : hazard_status.diag_single_point_fault) { - const std::string logger_name = "system_error_monitor " + hazard_diag.name; - logThrottledNamed( - logger_name, get_clock(), 5000, "[Single Point Fault]: " + hazard_diag.message); - } -} - -#include -RCLCPP_COMPONENTS_REGISTER_NODE(AutowareErrorMonitor)