Commit b03b5aa4 authored by Andrew Newdigate

Switch all Prometheus alerts to use the same S1, C1 prioritisation scheme as the incident management framework
parent 34064409
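
The change itself is mechanical: every rule keeps its routing labels (pager, channel, and so on) and only the severity value moves from the old descriptive names to the incident framework's S-levels, with critical becoming s1, error becoming s2 or s3, and warn becoming s4 throughout this diff. A minimal sketch of what one rule looks like after the switch, using a hypothetical alert name, expression and runbook path purely for illustration:

groups:
- name: example-service.rules
  rules:
  - alert: ExampleServiceDown            # hypothetical alert, not part of this commit
    expr: up{job="example-service"} == 0 # placeholder expression
    for: 5m
    labels:
      pager: pagerduty                   # paging still hangs off this label, unchanged
      severity: s1                       # was: severity: critical
    annotations:
      title: The example service is down
      runbook: troubleshooting/example-service.md

One consequence worth noting: any Alertmanager routes, silences or receivers that still match on the old values (severity="critical", severity="warn") would presumably need the same s1–s4 translation, otherwise alerts keep firing but stop being routed where they should go.
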
......@@ -18,7 +18,7 @@ groups:
expr: node_load1{environment="gprd",job="node",type="git"} > 1
for: 1m
labels:
severity: critical
severity: s1
annotations:
title: "Emergency Alert"
description: |
......
......@@ -7,7 +7,7 @@ groups:
labels:
pager: pagerduty
service: alertmanager
severity: critical
severity: s1
annotations:
title: Alertmanager is failing to send notifications
runbook: troubleshooting/alertmanager-notification-failures.md
......
......@@ -5,7 +5,7 @@ groups:
expr: chef_client_error == 1
for: 5h
labels:
severity: warn
severity: s4
annotations:
description: Check failed chef executions on host {{ $labels.fqdn }} in https://prometheus.gitlab.com/graph?g0.range_input=8w&g0.expr=chef_client_error+%3D%3D+1&g0.tab=1
runbook: troubleshooting/chef.md
......@@ -13,7 +13,7 @@ groups:
- alert: ChefClientStale
expr: time() - chef_client_last_run_timestamp_seconds > 5 * 3600 + time() % 3600
labels:
severity: warn
severity: s4
annotations:
description: Last Chef run for {{ $labels.fqdn }} was over {{ $value | humanizeDuration }} ago
runbook: troubleshooting/chef.md
......@@ -22,7 +22,7 @@ groups:
expr: count(chef_client_error == 1) / count(chef_client_error) > .25
for: 5m
labels:
severity: critical
severity: s1
pager: pagerduty
annotations:
description: |
......
......@@ -6,7 +6,7 @@ groups:
for: 10m
labels:
channel: production
severity: critical
severity: s1
pager: pagerduty
annotations:
description: ProcessCommitWorker sidekiq jobs are piling up for the last 10
......
......@@ -6,7 +6,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
title: The public dashboard page is down
description: dashboards.gitlab.com is down. Investigate the root cause by logging into the host.
......@@ -8,7 +8,7 @@ groups:
unless
deadman_15m_checkin
labels:
severity: warn
severity: s4
annotations:
title: "Availability data for the deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) is missing"
description: |
......@@ -25,7 +25,7 @@ groups:
unless
deadman_30m_checkin
labels:
severity: warn
severity: s4
annotations:
title: "Availability data for the deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) is missing"
description: |
......@@ -42,7 +42,7 @@ groups:
unless
deadman_6h_checkin
labels:
severity: warn
severity: s4
annotations:
title: "Availability data for the deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) is missing"
description: |
......@@ -59,7 +59,7 @@ groups:
unless
deadman_1d_checkin
labels:
severity: warn
severity: s4
annotations:
title: "Availability data for the deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) is missing"
description: |
......@@ -76,7 +76,7 @@ groups:
unless
deadman_1d6h_checkin
labels:
severity: warn
severity: s4
annotations:
title: "Availability data for the deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) is missing"
description: |
......@@ -92,7 +92,7 @@ groups:
max(deadman_15m_checkin) by (environment, tier, type, resource) < time() - 900
labels:
# pager: pagerduty
severity: warn
severity: s4
annotations:
title: "Deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) has expired"
description: |
......@@ -108,7 +108,7 @@ groups:
max(deadman_30m_checkin) by (environment, tier, type, resource) < time() - 1800
labels:
# pager: pagerduty
severity: warn
severity: s4
annotations:
title: "Deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) has expired"
description: |
......@@ -124,7 +124,7 @@ groups:
max(deadman_6h_checkin) by (environment, tier, type, resource) < time() - 21600
labels:
# pager: pagerduty
severity: warn
severity: s4
annotations:
title: "Deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) has expired"
description: |
......@@ -140,7 +140,7 @@ groups:
max(deadman_1d_checkin) by (environment, tier, type, resource) < time() - 86400
labels:
# pager: pagerduty
severity: warn
severity: s4
annotations:
title: "Deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) has expired"
description: |
......@@ -156,7 +156,7 @@ groups:
max(deadman_1d6h_checkin) by (environment, tier, type, resource) < time() - 108000
labels:
# pager: pagerduty
severity: warn
severity: s4
annotations:
title: "Deadman switch of `{{ $labels.resource }}` ({{ $labels.tier }}/{{ $labels.type }}) has expired"
description: |
......
......@@ -6,7 +6,7 @@ groups:
for: 10m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: The development site is returning status codes other than 200,
this usually means that a nightly deployment went wrong and took the site
......
......@@ -15,7 +15,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: Rails is returning 5xx errors at a high rate for {{ $labels.type }} . Traffic is impacted and users are likely seeing 500 errors.
runbook: troubleshooting/high-error-rate.md
......@@ -24,7 +24,7 @@ groups:
expr: environment_type:rails_request_errors:ratio * 100 > .5
for: 30s
labels:
severity: warn
severity: s4
annotations:
description: Rails is returning 5xx errors at a high rate for {{ $labels.type }} . Traffic is impacted and users are likely seeing 500 errors.
runbook: troubleshooting/high-error-rate.md
......
......@@ -7,7 +7,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: |
{{ $labels.type }} has lost redundancy. Only {{ $value }}% of servers are online.
......@@ -19,7 +19,7 @@ groups:
> 0
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: |
{{ $labels.type }} have no instances online to serve traffic.
......
......@@ -5,7 +5,7 @@ groups:
expr: up{job="fluentd"} == 0
for: 5m
labels:
severity: warn
severity: s4
annotations:
title: td-agent is down on {{$labels.instance}}
description: |
......
......@@ -10,7 +10,7 @@ groups:
for: 10m
labels:
rules_domain: general
severity: warn
severity: s4
annotations:
title: "Availability data for the `{{ $labels.component }}` component of the `{{ $labels.type }}` service is missing"
description: |
......@@ -31,7 +31,7 @@ groups:
for: 10m
labels:
rules_domain: general
severity: warn
severity: s4
annotations:
title: "Operation rate data for the `{{ $labels.component }}` component of the `{{ $labels.type }}` service is missing"
description: |
......@@ -52,7 +52,7 @@ groups:
for: 10m
labels:
rules_domain: general
severity: warn
severity: s4
annotations:
title: "Apdex for the `{{ $labels.component }}` component of the `{{ $labels.type }}` service is missing"
description: |
......@@ -73,7 +73,7 @@ groups:
for: 10m
labels:
rules_domain: general
severity: warn
severity: s4
annotations:
title: "Error rate data for the `{{ $labels.component }}` component of the `{{ $labels.type }}` service is missing"
description: |
......
......@@ -11,7 +11,7 @@ groups:
labels:
rules_domain: general
metric: gitlab_service_apdex:ratio
severity: warn
severity: s4
period: 5m
bound: lower
annotations:
......@@ -43,7 +43,7 @@ groups:
labels:
rules_domain: general
metric: gitlab_service_errors:ratio
severity: warn
severity: s4
period: 5m
bound: lower
annotations:
......@@ -79,7 +79,7 @@ groups:
labels:
rules_domain: general
metric: gitlab_service_availability:ratio
severity: warn
severity: s4
period: 5m
bound: lower
threshold_value: "0.75"
......@@ -111,7 +111,8 @@ groups:
labels:
rules_domain: general
metric: gitlab_service_availability:ratio
severity: error
severity: s2
pager: pagerduty
period: 5m
bound: lower
threshold_value: "0.5"
......@@ -154,7 +155,7 @@ groups:
labels:
rules_domain: general
metric: gitlab_service_ops:rate
severity: warn
severity: s4
period: 5m
bound: upper
threshold_sigma: "3"
......@@ -194,7 +195,7 @@ groups:
labels:
rules_domain: general
metric: gitlab_service_ops:rate
severity: warn
severity: s4
period: 5m
bound: lower
threshold_sigma: "3"
......
......@@ -6,7 +6,7 @@ groups:
for: 5m
labels:
channel: production
severity: warn
severity: s4
annotations:
description: '{{ $labels.fqdn }} is more than {{$value}}%.
We need to build new file servers or rebalance repos.'
......
......@@ -29,7 +29,7 @@ groups:
labels:
channel: gitaly
pager: pagerduty
severity: critical
severity: s1
annotations:
description: Gitaly {{$labels.grpc_code}} error rate for the last 5 minutes is over 2 for {{$labels.grpc_method}}.
Check Gitaly logs and consider disabling that method.
......@@ -41,7 +41,7 @@ groups:
labels:
channel: gitaly
pager: pagerduty
severity: critical
severity: s1
annotations:
description: Gitaly error rate for the last 5 minutes is over 5 on {{$labels.instance}}.
Check Gitaly logs and consider disabling it on that host.
......@@ -58,7 +58,7 @@ groups:
for: 5m
labels:
channel: gitaly
severity: warn
severity: s4
annotations:
description: >
The {{$labels.grpc_code}} error rate on {{ $labels.grpc_method }} is outside normal
......@@ -123,7 +123,7 @@ groups:
for: 5m
labels:
channel: gitaly
severity: warn
severity: s4
annotations:
description: The error rate on the {{ $labels.grpc_method }} endpoint is outside
normal values over a 12 hour period (95% confidence). Check https://dashboards.gitlab.net/dashboard/db/gitaly-feature-status?var-method={{
......@@ -145,7 +145,7 @@ groups:
labels:
channel: gitaly
pager: pagerduty
severity: critical
severity: s1
annotations:
description: Gitaly has been marked as down for the past minute on {{$labels.instance}}.
Check Gitaly logs and restart the process if necessary
......@@ -163,7 +163,7 @@ groups:
labels:
channel: gitaly
pager: pagerduty
severity: critical
severity: s1
annotations:
description: 'Gitaly has been using more than 50% of total available CPU on
{{$labels.fqdn}} for the past minute. This may affect the stability of the
......@@ -181,7 +181,7 @@ groups:
labels:
channel: gitaly
pager: pagerduty
severity: critical
severity: s1
annotations:
description: During a deployment, two distinct versions of Gitaly may be running
alongside one another, but this should not be the case for more than 30m.
......@@ -201,7 +201,7 @@ groups:
labels:
channel: gitaly
pager: pagerduty
severity: critical
severity: s1
annotations:
description: Three or more versions of Gitaly are currently running alongside
one another in production. This should never occur and indicates serious deployment
......@@ -218,7 +218,7 @@ groups:
for: 5m
labels:
channel: g_gitaly
severity: warn
severity: s4
annotations:
description: |
Reach out to the Gitaly team, ensure logging is set properly.
......@@ -230,40 +230,13 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: Reach out to the Gitaly team and mitigate the problem by restarting
gitaly on the affected node if it's causing a partial outage.
runbook: troubleshooting/gitaly-latency.md
title: Gitaly latency on {{ $labels.fqdn }} has been over 1m during the last 5m
- alert: gitaly_lock_acquisition_rates
expr: |
1 - (
sum(rate(gitaly_rate_limiting_acquiring_seconds_bucket{le="60"}[10m])) by (environment, tier, type, stage, fqdn, grpc_method)
/
sum(rate(gitaly_rate_limiting_acquiring_seconds_bucket{le="+Inf"}[10m])) by (environment, tier, type, stage, fqdn, grpc_method)
) > 0.1
for: 10m
labels:
rules_domain: general
type: gitaly
metric: gitaly_rate_limiting_acquiring_seconds_bucket
severity: error
period: 10m
annotations:
title: "More than 10% of Gitaly {{ $labels.grpc_method }} requests to {{ $labels.fqdn }} are queueing for more than 60s"
description: |
A very high proportion of Gitaly {{ $labels.grpc_method }} requests to {{ $labels.fqdn }} are being queued up for an extended period.
This issue is likely to be immediately client-impacting.
grafana_dashboard_id: "VBaSC9aik/gitaly-rate-limiting-alerting"
grafana_panel_id: "2"
grafana_variables: "environment,stage,grpc_method"
grafana_min_zoom_hours: 6
runbook: "troubleshooting/gitaly-rate-limiting.md"
promql_template_1: 'histogram_quantile(0.90, sum(rate(gitaly_rate_limiting_acquiring_seconds_bucket{stage="$stage",grpc_method="$grpc_method"}[1m])) by (fqdn, grpc_method, le))'
- alert: gitaly_lock_acquisition_rates
expr: |
1 - (
......@@ -276,8 +249,7 @@ groups:
rules_domain: general
type: gitaly
metric: gitaly_rate_limiting_acquiring_seconds_bucket
pager: pagerduty
severity: critical
severity: s4
period: 10m
annotations:
title: "More than 20% of Gitaly {{ $labels.grpc_method }} requests to {{ $labels.fqdn }} are queueing for more than 60s"
......
......@@ -7,7 +7,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: 'The number of pending builds for projects with shared runners will be
too high in 1h: {{$value | printf "%.2f" }}'
......@@ -21,7 +21,7 @@ groups:
for: 1m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: 'Number of pending jobs per namespace too high: {{$value}}'
description: 'Number of pending jobs for namespace {{$labels.namespace}} is too high: {{$value}}.
......@@ -33,7 +33,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: 'Number of running jobs per namespace too high: {{$value}}'
description: 'Number of running jobs for namespace {{$labels.namespace}} running on regular Shared Runners is too high: {{$value}}.
......@@ -45,7 +45,7 @@ groups:
for: 10m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: 'Number of running jobs per namespace too high: {{$value}}'
description: 'Number of running jobs for namespace {{$labels.namespace}} running on gitlab-org Shared Runners is too high: {{$value}}.
......@@ -57,7 +57,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: 'Number of builds running on shared runners is too low: {{$value}}'
description: "Number of builds running on shared runners for the last 5 minutes
......@@ -69,7 +69,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: "{{ $labels.job }} runners are using 85% of concurrent limit for more than 5 minutes."
description: 'This may suggest problems with our autoscaled machines fleet OR
......@@ -80,7 +80,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: "{{ $labels.job }} runners are using 95% of concurrent limit for more than 5 minutes."
description: 'This may suggest problems with our autoscaled machines fleet OR
......@@ -91,7 +91,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: Runners manager is down on {{ $labels.instance }}
description: 'This impacts CI execution builds, consider tweeting: !tweet ''Builds
......@@ -105,7 +105,7 @@ groups:
for: 1m
labels:
channel: ci-cd
severity: warn
severity: s3
annotations:
title: 'Machine creation rate for runners is too high: {{$value | printf "%.2f" }}'
description: 'Machine creation rate for the last 1 minute is at least {{$value}}
......@@ -118,7 +118,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: Runners cache service {{ $labels.instance }} on {{ $labels.fqdn }} has been down for more than 5 minutes.
description: 'This impacts CI execution builds, consider tweeting: !tweet ''CI
......@@ -131,7 +131,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: Runners cache nginx service on {{ $labels.fqdn }} has been down for more than 5 minutes.
description: 'This impacts CI execution builds, consider tweeting: !tweet ''CI
......@@ -144,7 +144,7 @@ groups:
for: 20m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: Number of established connections for {{ $labels.instance }} is too high
description: 'This impacts CI execution builds, consider tweeting: !tweet ''CI
......@@ -158,7 +158,7 @@ groups:
for: 10m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: Number of used file descriptors on {{ $labels.instance }} is too high
description: '{{ $labels.instance }} is using more than 80% of available FDs
......@@ -170,7 +170,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: CI Consul+Prometheus cluster is degraded, {{ $labels.instance }} is down
description: 'One or more of hosts from CI Consul+Prometheus cluster is down: {{ $labels.instance }}.'
......@@ -183,7 +183,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: GCP Quota usage of {{ $labels.quota }} is too high
description: |
......@@ -199,7 +199,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: GCP Quota usage of {{ $labels.quota }} is near limit
description: |
......@@ -215,7 +215,7 @@ groups:
for: 45m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: Namespace with constant number of long running jobs with repeated commands
description: |
......@@ -233,7 +233,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: Job queue duration performance for '0 - 10s' is too low
description: |
......@@ -253,7 +253,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: 90% of request queued on Workhorse is longer than 30s
description: |
......@@ -267,7 +267,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: warn
severity: s4
annotations:
title: 'Too many trace archiving failures: {{$value}}'
description: |
......@@ -282,7 +282,7 @@ groups:
for: 5m
labels:
channel: ci-cd
severity: error
severity: s3
annotations:
title: 'Sidekiq queues for CI/CD are growing: {{$value}}'
description: |
......
......@@ -5,7 +5,7 @@ groups:
expr: status:rails_requests_completed_seconds:p95{type="web", status="200", stage="main"} > 2.0
for: 1m
labels:
severity: warn
severity: s4
annotations:
description: This might be causing a slowdown on the site and/or affecting users.
Please check the Triage Dashboard in Grafana.
......@@ -15,7 +15,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: This might be causing a slowdown on the site and/or affecting users.
Please check the Triage Dashboard in Grafana.
......@@ -24,7 +24,7 @@ groups:
expr: status:rails_requests_completed_seconds:p95{type="api", status="200", stage="main"} > 1.5
for: 1m
labels:
severity: warn
severity: s4
annotations:
description: This might be causing a slowdown on the site and/or affecting users.
Please check the Triage Dashboard in Grafana.
......@@ -34,7 +34,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: This might be causing a slowdown on the site and/or affecting users.
Please check the Triage Dashboard in Grafana.
......@@ -43,7 +43,7 @@ groups:
expr: status:rails_requests_completed_seconds:p95{type="git", status="200", stage="main"} > 4.5
for: 1m
labels:
severity: warn
severity: s4
annotations:
description: This might be causing a slowdown on the site and/or affecting users.
Please check the Triage Dashboard in Grafana.
......@@ -52,7 +52,7 @@ groups:
expr: status:rails_requests_completed_seconds:p95{type="git", status="200", stage="main"} > 4.5
for: 5m
labels:
severity: critical
severity: s1
pager: pagerduty
annotations:
description: This might be causing a slowdown on the site and/or affecting users.
......
......@@ -5,7 +5,7 @@ groups:
expr: instance:rails_no_repository_for_path:rate1m >= 10
for: 1m
labels:
severity: critical
severity: s1
pager: pagerduty
annotations:
description: This usually means that we lost an NFS mount somewhere in the fleet,
......
......@@ -6,7 +6,7 @@ groups:
for: 2m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: GitLab.com is down for more than 2 minutes! Consider !tweet 'GitLab.com
is currently down, we are investigating the root cause, apologies for the
......@@ -18,7 +18,7 @@ groups:
for: 2m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: GitLab.com is down for more than 2 minutes! Consider !tweet 'GitLab.com
is currently down, we are investigating the root cause, apologies for the
......@@ -30,7 +30,7 @@ groups:
for: 2m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: www.GitLab.com is down for more than 1 minute! Consider !tweet 'GitLab.com
is currently down, we are investigating the root cause, apologies for the
......
......@@ -10,7 +10,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: WALE syncs to GCS might not be working. Please follow the runbook
to review the problem.
......@@ -21,7 +21,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: WALE basebackup syncs to GCS might not be working. Please follow the runbook
to review the problem.
......
......@@ -10,7 +10,7 @@ groups:
for: 5m
labels:
pager: pagerduty
severity: critical
severity: s1
annotations:
description: walg syncs to GCS might not be working. Please follow the runbook
to review the problem.
......@@ -21,7 +21,7 @@ groups:
for: 5m