From 186204b2717386b9e2179d57d499607036b0fba7 Mon Sep 17 00:00:00 2001
From: Benjamin Dos Santos
Date: Thu, 21 Jan 2021 20:44:40 +0100
Subject: [PATCH] refactor: cleanup mysql related rules

---
Review notes (patch commentary, not part of the commit message):

- MysqlTooManyConnections / MysqlHighThreadsRunning: the submitted
  expressions wrapped mysql_global_status_threads_connected and
  mysql_global_status_threads_running in rate(...[1m]). Both metrics are
  gauges (current thread counts), so rate() yields a per-second delta
  rather than a usage level and the 80% / 60% thresholds were never
  compared against real usage. The expressions below use the gauge value
  directly; the `avg by (instance)` aggregation is kept so the resulting
  label set is unchanged.
- The three replication alerts are gated on
  mysql_slave_status_master_server_id > 0 so they only evaluate on
  instances that report a master server id.
- Whitespace was reconstructed from a collapsed copy of this patch: the
  YAML indentation and the wrap point of the first context string in the
  second hunk are best guesses. Verify against the target file before
  applying.
- The `index` line still carries the original blob ids; regenerate the
  patch with `git format-patch` after amending if exact ids matter.

 rules/mysql.yml | 77 +++++++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 38 deletions(-)

diff --git a/rules/mysql.yml b/rules/mysql.yml
index 14dd775..74b95ba 100644
--- a/rules/mysql.yml
+++ b/rules/mysql.yml
@@ -1,14 +1,6 @@
 groups:
   - name: mysql
     rules:
-      - record: mysql_slave_lag_seconds
-        expr: mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay
-      - record: mysql_heartbeat_lag_seconds
-        expr: mysql_heartbeat_now_timestamp_seconds - mysql_heartbeat_stored_timestamp_seconds
-      - record: job:mysql_transactions:rate5m
-        expr:
-          sum(rate(mysql_global_status_commands_total{command=~"(commit|rollback)"}[5m]))
-          WITHOUT (command)
       - alert: MySQLGaleraNotReady
         expr: mysql_global_status_wsrep_ready != 1
         for: 5m
@@ -41,58 +33,67 @@ groups:
             "{{$labels.job}} on {{$labels.instance}} is a donor (hotbackup) and
             is falling behind (queue size {{$value}})."
           summary: xtradb cluster donor node falling behind
-      - alert: MySQLReplicationNotRunning
-        expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0
-        for: 2m
-        labels:
-          severity: page
-        annotations:
-          description: Slave replication (IO or SQL) has been down for more than 2 minutes.
-          summary: Slave replication is not running
-      - alert: MySQLReplicationLag
-        expr: (mysql_slave_lag_seconds > 30) and ON(instance) (predict_linear(mysql_slave_lag_seconds[5m], 60 * 2) > 0)
+      - alert: MysqlSlaveReplicationLag
+        expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
         for: 1m
         labels:
           severity: page
         annotations:
-          description: The mysql slave replication has fallen behind and is not recovering
-          summary: MySQL slave replication is lagging
-      - alert: MySQLHeartbeatReplicationLag
-        expr: (mysql_heartbeat_lag_seconds > 30) and ON(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
-        for: 1m
-        labels:
-          severity: page
-        annotations:
-          description: The mysql slave replication has fallen behind and is not recovering
-          summary: MySQL slave replication is lagging
-      - alert: MySQLInnoDBLogWaits
+          summary: "MySQL Slave replication lag (instance {{ $labels.instance }})"
+          description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+      - alert: MysqlInnodbLogWaits
         expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
+        for: 0m
         labels:
           severity: warning
         annotations:
-          description: The innodb logs are waiting for disk at a rate of {{$value}} / second
-          summary: MySQL innodb log writes stalling
+          summary: "MySQL InnoDB log waits (instance {{ $labels.instance }})"
+          description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
       - alert: MysqlSlowQueries
-        expr: mysql_global_status_slow_queries > 0
-        for: 5m
+        expr: increase(mysql_global_status_slow_queries[1m]) > 0
+        for: 2m
         labels:
           severity: warning
         annotations:
           summary: "MySQL slow queries (instance {{ $labels.instance }})"
-          description: "MySQL server is having some slow queries.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+          description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
       - alert: MysqlTooManyConnections
-        expr: (mysql_global_status_threads_connected / mysql_global_variables_max_connections) * 100 > 80
-        for: 5m
+        expr: avg by (instance) (mysql_global_status_threads_connected) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
+        for: 2m
         labels:
           severity: page
         annotations:
           summary: "MySQL too many connections (instance {{ $labels.instance }})"
           description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+      - alert: MysqlHighThreadsRunning
+        expr: avg by (instance) (mysql_global_status_threads_running) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "MySQL high threads running (instance {{ $labels.instance }})"
+          description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+      - alert: MysqlSlaveIoThreadNotRunning
+        expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
+        for: 0m
+        labels:
+          severity: page
+        annotations:
+          summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})"
+          description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+      - alert: MysqlSlaveSqlThreadNotRunning
+        expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
+        for: 0m
+        labels:
+          severity: page
+        annotations:
+          summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})"
+          description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
       - alert: MysqlRestarted
         expr: mysql_global_status_uptime < 60
-        for: 5m
+        for: 0m
         labels:
-          severity: warning
+          severity: info
         annotations:
           summary: "MySQL restarted (instance {{ $labels.instance }})"
           description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"