Skip to content

Commit

Permalink
chore: update mimir rules
Browse files Browse the repository at this point in the history
  • Loading branch information
bdossantos committed Sep 18, 2023
1 parent 6a72d2f commit d7e492c
Showing 1 changed file with 34 additions and 30 deletions.
64 changes: 34 additions & 30 deletions rules/mimir_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ groups:
The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors
expr: |
100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m]))
/
sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m]))
> 1
for: 15m
labels:
Expand All @@ -33,7 +33,7 @@ groups:
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
expr: |
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
>
2.5
for: 15m
Expand Down Expand Up @@ -117,11 +117,20 @@ groups:
# MimirIngesterRestarts: warning-severity alert about repeated restarts within
# a 30-minute window.
# NOTE(review): this span is a rendered diff whose +/- markers were stripped,
# so two variants of `message` and of `expr` appear back to back. Presumably
# the quoted message and the `changes(process_start_time_seconds...)`
# expression are the removed lines, and the unquoted message plus the
# `kube_pod_container_status_restarts_total` expression are the added ones —
# confirm against the commit before copying this rule.
# The parenthesised expression fires when the summed 30m increase of restarts
# for containers matching "(ingester|mimir-write)" is >= 2 per
# (cluster, namespace, instance) AND cortex_build_info has at least one series
# for the same label set.
# NOTE(review): kube_pod_container_status_restarts_total is normally exported
# by kube-state-metrics; verify that its `instance` label identifies the Mimir
# pod in this environment, otherwise the `and` join would never match.
- alert: MimirIngesterRestarts
annotations:
message:
'{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
$value }} times in the last 30 mins.'
Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts
expr: |
changes(process_start_time_seconds{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}[30m]) >= 2
(
sum by(cluster, namespace, instance) (
increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m])
)
>= 2
)
and
(
count by(cluster, namespace, instance) (cortex_build_info) > 0
)
labels:
severity: warning
- alert: MimirKVStoreFailure
Expand Down Expand Up @@ -187,6 +196,21 @@ groups:
for: 1h
labels:
severity: warning
# MimirIngestedDataTooFarInTheFuture: warning-severity alert raised when
# cortex_ingester_tsdb_head_max_timestamp_seconds is more than 60*60 seconds
# (one hour) ahead of time(), sustained for 5 minutes.
# The value is aggregated with max per (cluster, namespace, instance).
# The `and ... > 0` clause limits the comparison to series whose max timestamp
# is positive; presumably 0 means the TSDB head holds no samples yet — TODO
# confirm against the metric's documentation.
- alert: MimirIngestedDataTooFarInTheFuture
annotations:
message:
Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
}} has ingested samples with timestamps more than 1h in the future.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture
expr: |
max by(cluster, namespace, instance) (
cortex_ingester_tsdb_head_max_timestamp_seconds - time()
and
cortex_ingester_tsdb_head_max_timestamp_seconds > 0
) > 60*60
for: 5m
labels:
severity: warning
- alert: MimirRingMembersMismatch
annotations:
message: |
Expand Down Expand Up @@ -350,26 +374,6 @@ groups:
severity: critical
- name: mimir-provisioning
rules:
# MimirProvisioningTooManyActiveSeries: warning-severity alert raised when the
# average of cortex_ingester_memory_series per (cluster, namespace) stays above
# 1.6e6 (1.6 million) for 2 hours.
# NOTE(review): the enclosing hunk shrinks from 26 to 6 lines, so this rule
# appears to be one of the definitions removed by the commit — confirm before
# relying on it.
- alert: MimirProvisioningTooManyActiveSeries
annotations:
message: |
The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries
expr: |
avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6
for: 2h
labels:
severity: warning
# MimirProvisioningTooManyWrites: warning-severity alert raised when the
# average of
# cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m
# per (cluster, namespace) stays above 80e3 (80,000) for 15 minutes.
# The referenced series is presumably a recording rule defined elsewhere (its
# name follows the level:metric:operation convention) — verify it exists.
# NOTE(review): the enclosing hunk shrinks from 26 to 6 lines, so this rule
# appears to be one of the definitions removed by the commit — confirm before
# relying on it.
- alert: MimirProvisioningTooManyWrites
annotations:
message: |
Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanywrites
expr: |
avg by (cluster, namespace) (cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m) > 80e3
for: 15m
labels:
severity: warning
- alert: MimirAllocatingTooMuchMemory
annotations:
message: |
Expand Down Expand Up @@ -476,7 +480,7 @@ groups:
}} sees incorrect number of gossip members.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch
expr: |
avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(alertmanager|compactor|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
for: 15m
labels:
severity: warning
Expand Down Expand Up @@ -615,9 +619,9 @@ groups:
}} has not shipped any block in the last 4 hours.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks
expr: |
(min by(cluster, namespace, instance) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4)
(min by(cluster, namespace, instance) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4)
and
(max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time) > 0)
(max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
Expand All @@ -637,7 +641,7 @@ groups:
}} has not shipped any block in the last 4 hours.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
expr: |
(max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time) == 0)
(max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0)
and
(max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
for: 4h
Expand Down

0 comments on commit d7e492c

Please sign in to comment.