From d231961c094f1de2904e07456ef927afb2e4198c Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Thu, 6 Jun 2024 12:34:21 +0200 Subject: [PATCH 1/7] feat: zoekt service definition --- metrics-catalog/services/all.jsonnet | 1 + metrics-catalog/services/zoekt.jsonnet | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 metrics-catalog/services/zoekt.jsonnet diff --git a/metrics-catalog/services/all.jsonnet b/metrics-catalog/services/all.jsonnet index 26a1042167..969b21dcd8 100644 --- a/metrics-catalog/services/all.jsonnet +++ b/metrics-catalog/services/all.jsonnet @@ -74,4 +74,5 @@ import 'web.jsonnet', import 'websockets.jsonnet', import 'woodhouse.jsonnet', + import 'zoekt.jsonnet', ] diff --git a/metrics-catalog/services/zoekt.jsonnet b/metrics-catalog/services/zoekt.jsonnet new file mode 100644 index 0000000000..a79d9a550e --- /dev/null +++ b/metrics-catalog/services/zoekt.jsonnet @@ -0,0 +1,12 @@ +local metricsCatalog = import 'servicemetrics/metrics.libsonnet'; +local rateMetric = metricsCatalog.rateMetric; + +metricsCatalog.serviceDefinition({ + type: 'zoekt', + tier: 'inf', + monitoringThresholds: { + apdexScore: 0.999, + errorRatio: 0.999, + }, + serviceLevelIndicators: {}, +}) -- GitLab From fb3f964cd913c4e65199d3976b057ec8aabd4db8 Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Thu, 6 Jun 2024 12:46:12 +0200 Subject: [PATCH 2/7] chore: make generate --- dashboards/zoekt/main.dashboard.jsonnet | 6 + .../autogenerated-saturation.yml | 28 +- ...tlab-gprd-gprd-zoekt-saturation-alerts.yml | 628 ++++++++++++++++++ ...ab-gprd-gprd-zoekt-saturation-metadata.yml | 78 +++ ...ated-gitlab-gprd-gprd-zoekt-saturation.yml | 145 ++++ ...zoekt-service-anomaly-detection-alerts.yml | 79 +++ ...rd-zoekt-service_ops_anomaly_detection.yml | 38 ++ ...lab-gprd-gprd-zoekt-zoekt-service-slos.yml | 18 + ...tlab-gstg-gstg-zoekt-saturation-alerts.yml | 628 ++++++++++++++++++ ...ab-gstg-gstg-zoekt-saturation-metadata.yml | 78 +++ ...ated-gitlab-gstg-gstg-zoekt-saturation.yml | 145 ++++ ...zoekt-service-anomaly-detection-alerts.yml | 79 +++ ...tg-zoekt-service_ops_anomaly_detection.yml | 38 ++ ...lab-gstg-gstg-zoekt-zoekt-service-slos.yml | 18 + thanos-rules/autogenerated-service-slos.yml | 10 + 15 files changed, 2002 insertions(+), 14 deletions(-) create mode 100644 dashboards/zoekt/main.dashboard.jsonnet create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service-anomaly-detection-alerts.yml create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service_ops_anomaly_detection.yml create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-zoekt-service-slos.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml create mode 100644 
mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service-anomaly-detection-alerts.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service_ops_anomaly_detection.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-zoekt-service-slos.yml diff --git a/dashboards/zoekt/main.dashboard.jsonnet b/dashboards/zoekt/main.dashboard.jsonnet new file mode 100644 index 0000000000..16472d53f1 --- /dev/null +++ b/dashboards/zoekt/main.dashboard.jsonnet @@ -0,0 +1,6 @@ +// This file is autogenerated using scripts/generate-service-dashboards +// Please feel free to customize this file. +local serviceDashboard = import 'gitlab-dashboards/service_dashboard.libsonnet'; + +serviceDashboard.overview('zoekt') +.overviewTrailer() diff --git a/legacy-prometheus-rules/autogenerated-saturation.yml b/legacy-prometheus-rules/autogenerated-saturation.yml index fd03514a2a..ca319dc3c2 100644 --- a/legacy-prometheus-rules/autogenerated-saturation.yml +++ b/legacy-prometheus-rules/autogenerated-saturation.yml @@ -79,7 +79,7 @@ groups: clamp_min( clamp_max( 1 - avg by (environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[5m]) + rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[5m]) ) , 1) @@ -94,9 +94,9 @@ groups: clamp_min( clamp_max( 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} + node_filesystem_files_free{fstype=~"(ext.|xfs)", 
type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} / - node_filesystem_files{fstype=~"(ext.|xfs)", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} + node_filesystem_files{fstype=~"(ext.|xfs)", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} ) , 1) @@ -111,7 +111,7 @@ groups: clamp_min( clamp_max( ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", 
type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} ) , 1) @@ -523,7 +523,7 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - instance:node_memory_utilization:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} or instance:node_memory_utilisation:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} + instance:node_memory_utilization:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} or instance:node_memory_utilisation:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} , 1) , @@ -578,9 +578,9 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[1m]) + 
max_over_time(node_nf_conntrack_entries{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[1m]) / - node_nf_conntrack_entries_limit{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} + node_nf_conntrack_entries_limit{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} , 1) , @@ -593,7 +593,7 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - avg without (cpu) (rate(node_schedstat_waiting_seconds_total{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[1h])) + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[1h])) , 1) , @@ -607,15 +607,15 @@ groups: clamp_min( clamp_max( ( - 
process_open_fds{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse"} + process_open_fds{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse|zoekt"} / - process_max_fds{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse"} + process_max_fds{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse|zoekt"} ) or ( - 
ruby_file_descriptors{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse"} + ruby_file_descriptors{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse|zoekt"} / - ruby_process_max_fds{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse"} + ruby_process_max_fds{type=~"ai-assisted|api|atlantis|camoproxy|ci-runners|cloudflare|consul|customersdot|external-dns|frontend|git|gitaly|google-cloud-storage|internal-api|istio|jaeger|kas|logging|mailgun|mailroom|memorystore|monitoring|nginx|ops-gitlab-net|packagecloud|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|plantuml|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|runway|search|sentry|sidekiq|vault|web-pages|web|websockets|woodhouse|zoekt"} ) , 1) @@ -1294,7 +1294,7 @@ groups: clamp_min( clamp_max( 1 - avg by (environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", 
type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[5m]) + rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[5m]) ) , 1) @@ -1361,7 +1361,7 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[5m])) + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[5m])) , 1) , diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml new file mode 100644 index 0000000000..191ec4be99 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml @@ -0,0 +1,628 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. 
USE ./mimir-rules-jsonnet/saturation.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: GitLab Component Saturation Statistics + interval: 5m + rules: + - record: gitlab_component_saturation:ratio_quantile95_1w + expr: quantile_over_time(0.95, gitlab_component_saturation:ratio{env="gprd",type="zoekt"}[1w]) + - record: gitlab_component_saturation:ratio_quantile99_1w + expr: quantile_over_time(0.99, gitlab_component_saturation:ratio{env="gprd",type="zoekt"}[1w]) + - record: gitlab_component_saturation:ratio_quantile95_1h + expr: quantile_over_time(0.95, gitlab_component_saturation:ratio{env="gprd",type="zoekt"}[1h]) + - record: gitlab_component_saturation:ratio_quantile99_1h + expr: quantile_over_time(0.99, gitlab_component_saturation:ratio{env="gprd",type="zoekt"}[1h]) + - record: gitlab_component_saturation:ratio_avg_1h + expr: avg_over_time(gitlab_component_saturation:ratio{env="gprd",type="zoekt"}[1h]) +- name: GitLab Saturation Alerts + interval: 1m + rules: + - alert: component_saturation_slo_out_of_bounds:cpu + for: 5m + annotations: + title: The Average Service CPU Utilization resource of the {{ $labels.type }} + service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is + close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Average Service CPU Utilization resource: + + This resource measures average CPU utilization across an all cores in a service fleet. If it is becoming saturated, it may indicate that the fleet needs horizontal or vertical scaling. + grafana_dashboard_id: alerts-sat_cpu + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "1465724101" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s3 + expr: | + gitlab_component_saturation:ratio{component="cpu",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="cpu"} + - alert: component_saturation_slo_out_of_bounds:disk_inodes + for: 15m + annotations: + title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Disk inode Utilization per Device per Node resource: + + Disk inode utilization per device per node. 
+ + If this is too high, its possible that a directory is filling up with files. Consider logging in an checking temp directories for large numbers of files + grafana_dashboard_id: alerts-sat_disk_inodes + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "39965907" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + gitlab_component_saturation:ratio{component="disk_inodes",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} + - alert: ComponentResourceRunningOut_disk_inodes + for: 15m + annotations: + title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) is on track to hit capacity within + 6h + description: | + This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. + + Details of the Disk inode Utilization per Device per Node resource: + + Disk inode utilization per device per node. + + If this is too high, its possible that a directory is filling up with files. 
Consider logging in an checking temp directories for large numbers of files + grafana_dashboard_id: alerts-sat_disk_inodes + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "39965907" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + linear_prediction_saturation_alert: 6h + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + predict_linear(gitlab_component_saturation:ratio{component="disk_inodes",env="gprd",type="zoekt"}[6h], 21600) + > on (component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} + - alert: component_saturation_slo_out_of_bounds:disk_space + for: 15m + annotations: + title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Disk Space Utilization per Device per Node resource: + + Disk space utilization per device per node. 
+ grafana_dashboard_id: alerts-sat_disk_space + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "2661375984" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + gitlab_component_saturation:ratio{component="disk_space",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} + - alert: ComponentResourceRunningOut_disk_space + for: 15m + annotations: + title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) is on track to hit capacity within + 6h + description: | + This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. + + Details of the Disk Space Utilization per Device per Node resource: + + Disk space utilization per device per node. 
+ grafana_dashboard_id: alerts-sat_disk_space + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "2661375984" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + linear_prediction_saturation_alert: 6h + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + predict_linear(gitlab_component_saturation:ratio{component="disk_space",env="gprd",type="zoekt"}[6h], 21600) + > on (component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} + - alert: component_saturation_slo_out_of_bounds:memory + for: 5m + annotations: + title: The Memory Utilization per Node resource of the {{ $labels.type }} service + ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to + its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Memory Utilization per Node resource: + + Memory utilization per device per node. 
+ grafana_dashboard_id: alerts-sat_memory + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_memory?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "1955556769" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + gitlab_component_saturation:ratio{component="memory",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="memory"} + - alert: component_saturation_slo_out_of_bounds:nf_conntrack_entries + for: 5m + annotations: + title: The conntrack Entries per Node resource of the {{ $labels.type }} service + ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to + its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the conntrack Entries per Node resource: + + Netfilter connection tracking table utilization per node. + + When saturated, new connection attempts (incoming SYN packets) are dropped with no reply, leaving clients to slowly retry (and typically fail again) over the next several seconds. When packets are being dropped due to this condition, kernel will log the event as: "nf_conntrack: table full, dropping packet". 
+ grafana_dashboard_id: alerts-sat_conntrack + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_conntrack?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "503581002" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, instance) ( + clamp_min( + clamp_max( + max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) + / + node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, instance) ( + clamp_min( + clamp_max( + max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) + / + node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s3 + expr: | + gitlab_component_saturation:ratio{component="nf_conntrack_entries",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="nf_conntrack_entries"} + - alert: component_saturation_slo_out_of_bounds:node_schedstat_waiting + for: 90m + annotations: + title: The Node Scheduler Waiting Time resource of the {{ $labels.type }} service + ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to + its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Node Scheduler Waiting Time resource: + + Measures the amount of scheduler waiting time that processes are waiting to be scheduled, according to [`CPU Scheduling Metrics`](https://www.robustperception.io/cpu-scheduling-metrics-from-the-node-exporter). + + A high value indicates that a node has more processes to be run than CPU time available to handle them, and may lead to degraded responsiveness and performance from the application. + + Additionally, it may indicate that the fleet is under-provisioned. 
+ grafana_dashboard_id: alerts-sat_node_schedstat_waiting + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_node_schedstat_waiting?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "1415313189" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, shard) ( + clamp_min( + clamp_max( + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, shard) ( + clamp_min( + clamp_max( + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + gitlab_component_saturation:ratio{component="node_schedstat_waiting",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="node_schedstat_waiting"} + - alert: component_saturation_slo_out_of_bounds:open_fds + for: 5m + annotations: + title: The Open file descriptor utilization per instance resource of the {{ + $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding + SLO and is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Open file descriptor utilization per instance resource: + + Open file descriptor utilization per instance. + + Saturation on file descriptor limits may indicate a resource-descriptor leak in the application. + + As a temporary fix, you may want to consider restarting the affected process. 
+ grafana_dashboard_id: alerts-sat_open_fds + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_open_fds?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "1001792825" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, job, instance) ( + clamp_min( + clamp_max( + ( + process_open_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + or + ( + ruby_file_descriptors{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + ruby_process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, job, instance) ( + clamp_min( + clamp_max( + ( + process_open_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + or + ( + ruby_file_descriptors{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + ruby_process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + gitlab_component_saturation:ratio{component="open_fds",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="open_fds"} + - alert: component_saturation_slo_out_of_bounds:shard_cpu + for: 5m + annotations: + title: The Average CPU Utilization per Shard resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Average CPU Utilization per Shard resource: + + This resource measures average CPU utilization across an all cores in a shard of a service fleet. If it is becoming saturated, it may indicate that the shard needs horizontal or vertical scaling. 
+ grafana_dashboard_id: alerts-sat_shard_cpu + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_shard_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "1472933476" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s3 + expr: | + gitlab_component_saturation:ratio{component="shard_cpu",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="shard_cpu"} + - alert: component_saturation_slo_out_of_bounds:single_node_cpu + for: 10m + annotations: + title: The Average CPU Utilization per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Average CPU Utilization per Node resource: + + Average CPU utilization per Node. + + If average CPU is saturated, it may indicate that a fleet is in need to horizontal or vertical scaling. It may also indicate imbalances in load in a fleet. 
+ grafana_dashboard_id: alerts-sat_single_node_cpu + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_single_node_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_panel_id: "3372411356" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m])) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m])) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + gitlab_component_saturation:ratio{component="single_node_cpu",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="single_node_cpu"} diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml new file mode 100644 index 0000000000..fd74401a84 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml @@ -0,0 +1,78 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/saturation.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: GitLab Component Saturation Max SLOs + interval: 5m + rules: + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: cpu + expr: "0.8" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: cpu + expr: "0.9" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: disk_inodes + expr: "0.75" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: disk_inodes + expr: "0.8" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: disk_space + expr: "0.85" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: disk_space + expr: "0.9" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: memory + expr: "0.9" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: memory + expr: "0.98" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: nf_conntrack_entries + expr: "0.95" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: nf_conntrack_entries + expr: "0.98" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: node_schedstat_waiting + expr: "0.1" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: node_schedstat_waiting + expr: "0.15" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: open_fds + expr: "0.8" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: open_fds + expr: "0.9" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: shard_cpu + 
expr: "0.85" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: shard_cpu + expr: "0.95" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: single_node_cpu + expr: "0.9" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: single_node_cpu + expr: "0.95" diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml new file mode 100644 index 0000000000..ff447b39c1 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml @@ -0,0 +1,145 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/saturation.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: Saturation Rules (autogenerated) + interval: 1m + rules: + - record: gitlab_component_saturation:ratio + labels: + component: cpu + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (env, environment, tier, type, stage, shard) ( + rate(node_cpu_seconds_total{mode="idle", env="gprd",type="zoekt"}[5m]) + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: disk_inodes + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", env="gprd",type="zoekt"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", env="gprd",type="zoekt"} + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: disk_space + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", env="gprd",type="zoekt"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", env="gprd",type="zoekt"} + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: memory + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + instance:node_memory_utilization:ratio{env="gprd",type="zoekt"} or instance:node_memory_utilisation:ratio{env="gprd",type="zoekt"} + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: nf_conntrack_entries + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + max_over_time(node_nf_conntrack_entries{env="gprd",type="zoekt"}[1m]) + / + node_nf_conntrack_entries_limit{env="gprd",type="zoekt"} + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: node_schedstat_waiting + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{env="gprd",type="zoekt"}[1h])) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: open_fds + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + ( + process_open_fds{env="gprd",type="zoekt"} + / + process_max_fds{env="gprd",type="zoekt"} + ) + or + ( + ruby_file_descriptors{env="gprd",type="zoekt"} + / + ruby_process_max_fds{env="gprd",type="zoekt"} + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: shard_cpu + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (env, environment, tier, type, stage, shard, shard) 
( + rate(node_cpu_seconds_total{mode="idle", env="gprd",type="zoekt"}[5m]) + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: single_node_cpu + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", env="gprd",type="zoekt"}[5m])) + , + 1) + , + 0) + ) diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service-anomaly-detection-alerts.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service-anomaly-detection-alerts.yml new file mode 100644 index 0000000000..937aad313d --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service-anomaly-detection-alerts.yml @@ -0,0 +1,79 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/service-anomaly-detection-alerts.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: zoekt - service_ops_anomaly_detection + rules: + - alert: service_ops_out_of_bounds_upper_5m + for: 5m + annotations: + title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage + }}` stage) is receiving more requests than normal' + description: | + The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) is receiving more requests than normal. This is often caused by user generated traffic, sometimes abuse. It can also be cause by application changes that lead to higher operations rates or from retries in the event of errors. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause. + grafana_dashboard_id: general-service/service-platform-metrics + grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "12" + grafana_panel_id: "2633741645" + grafana_variables: environment,type,stage + link1_title: Definition + link1_url: https://gitlab.com/gitlab-com/runbooks/blob/master/docs/monitoring/definition-service-ops-rate.md + promql_template_1: gitlab_service_ops:rate{environment="$environment", type="$type", + stage="$stage"} + promql_template_2: gitlab_component_ops:rate{environment="$environment", type="$type", + stage="$stage"} + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + ( + ( + (gitlab_service_ops:rate{env="gprd",monitor="global",type="zoekt"} - gitlab_service_ops:rate:prediction{env="gprd",monitor="global",type="zoekt"}) / + gitlab_service_ops:rate:stddev_over_time_1w{env="gprd",monitor="global",type="zoekt"} + ) + > + 3 + ) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} + - alert: service_ops_out_of_bounds_lower_5m + for: 5m + annotations: + title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage + }}` stage) is receiving fewer requests than normal' + description: | + The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) is receiving fewer requests than normal. This is often caused by a failure in an upstream service - for example, an upstream load balancer rejected all incoming traffic. In many cases, this is as serious or more serious than a traffic spike. 
Check upstream services for errors that may be leading to traffic flow issues in downstream services. + grafana_dashboard_id: general-service/service-platform-metrics + grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "12" + grafana_panel_id: "2633741645" + grafana_variables: environment,type,stage + link1_title: Definition + link1_url: https://gitlab.com/gitlab-com/runbooks/blob/master/docs/monitoring/definition-service-ops-rate.md + promql_template_1: gitlab_service_ops:rate{environment="$environment", type="$type", + stage="$stage"} + promql_template_2: gitlab_component_ops:rate{environment="$environment", type="$type", + stage="$stage"} + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + ( + ( + (gitlab_service_ops:rate{env="gprd",monitor="global",type="zoekt"} - gitlab_service_ops:rate:prediction{env="gprd",monitor="global",type="zoekt"}) / + gitlab_service_ops:rate:stddev_over_time_1w{env="gprd",monitor="global",type="zoekt"} + ) + < + -3 + ) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service_ops_anomaly_detection.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service_ops_anomaly_detection.yml new file mode 100644 index 0000000000..b821aa7213 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-service_ops_anomaly_detection.yml @@ -0,0 +1,38 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. 
USE ./mimir-rules-jsonnet/service-ops-anomaly-detection.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: 'zoekt operation rate weekly statistics: {"env": "gprd", "type": "zoekt"}' + interval: 5m + rules: + - record: gitlab_service_ops:rate:avg_over_time_1w + expr: | + avg_over_time(gitlab_service_ops:rate_5m{env="gprd",monitor="global",type="zoekt"}[1w]) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global",type="zoekt"} + - record: gitlab_service_ops:rate:stddev_over_time_1w + expr: | + stddev_over_time(gitlab_service_ops:rate_5m{env="gprd",monitor="global",type="zoekt"}[1w]) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global",type="zoekt"} +- name: 'zoekt ops rate weekly prediction values: {"env": "gprd", "type": "zoekt"}' + interval: 5m + rules: + - record: gitlab_service_ops:rate:prediction + expr: | + quantile(0.5, + label_replace( + gitlab_service_ops:rate_1h{env="gprd",monitor="global",type="zoekt"} offset 10050m # 1 week - 30mins + + delta(gitlab_service_ops:rate:avg_over_time_1w{env="gprd",monitor="global",type="zoekt"}[1w]) + , "p", "1w", "", "") + or + label_replace( + gitlab_service_ops:rate_1h{env="gprd",monitor="global",type="zoekt"} offset 20130m # 2 weeks - 30mins + + delta(gitlab_service_ops:rate:avg_over_time_1w{env="gprd",monitor="global",type="zoekt"}[2w]) + , "p", "2w", "", "") + or + label_replace( + gitlab_service_ops:rate_1h{env="gprd",monitor="global",type="zoekt"} offset 30210m # 3 weeks - 30mins + + delta(gitlab_service_ops:rate:avg_over_time_1w{env="gprd",monitor="global",type="zoekt"}[3w]) + , "p", "3w", "", "") + ) + without (p) diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-zoekt-service-slos.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-zoekt-service-slos.yml new file mode 100644 index 0000000000..c5401614d1 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-zoekt-service-slos.yml @@ -0,0 +1,18 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/service-slos.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: Autogenerated Service SLOs + interval: 5m + rules: + - record: slo:min:events:gitlab_service_apdex:ratio + labels: + monitor: global + tier: inf + type: zoekt + expr: "0.999000" + - record: slo:max:events:gitlab_service_errors:ratio + labels: + monitor: global + tier: inf + type: zoekt + expr: "0.001000" diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml new file mode 100644 index 0000000000..221970a5e4 --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml @@ -0,0 +1,628 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. 
USE ./mimir-rules-jsonnet/saturation.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: GitLab Component Saturation Statistics + interval: 5m + rules: + - record: gitlab_component_saturation:ratio_quantile95_1w + expr: quantile_over_time(0.95, gitlab_component_saturation:ratio{env="gstg",type="zoekt"}[1w]) + - record: gitlab_component_saturation:ratio_quantile99_1w + expr: quantile_over_time(0.99, gitlab_component_saturation:ratio{env="gstg",type="zoekt"}[1w]) + - record: gitlab_component_saturation:ratio_quantile95_1h + expr: quantile_over_time(0.95, gitlab_component_saturation:ratio{env="gstg",type="zoekt"}[1h]) + - record: gitlab_component_saturation:ratio_quantile99_1h + expr: quantile_over_time(0.99, gitlab_component_saturation:ratio{env="gstg",type="zoekt"}[1h]) + - record: gitlab_component_saturation:ratio_avg_1h + expr: avg_over_time(gitlab_component_saturation:ratio{env="gstg",type="zoekt"}[1h]) +- name: GitLab Saturation Alerts + interval: 1m + rules: + - alert: component_saturation_slo_out_of_bounds:cpu + for: 5m + annotations: + title: The Average Service CPU Utilization resource of the {{ $labels.type }} + service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is + close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Average Service CPU Utilization resource: + + This resource measures average CPU utilization across an all cores in a service fleet. If it is becoming saturated, it may indicate that the fleet needs horizontal or vertical scaling. + grafana_dashboard_id: alerts-sat_cpu + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "1465724101" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s3 + expr: | + gitlab_component_saturation:ratio{component="cpu",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="cpu"} + - alert: component_saturation_slo_out_of_bounds:disk_inodes + for: 15m + annotations: + title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Disk inode Utilization per Device per Node resource: + + Disk inode utilization per device per node. 
+ + If this is too high, its possible that a directory is filling up with files. Consider logging in an checking temp directories for large numbers of files + grafana_dashboard_id: alerts-sat_disk_inodes + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "39965907" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + gitlab_component_saturation:ratio{component="disk_inodes",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} + - alert: ComponentResourceRunningOut_disk_inodes + for: 15m + annotations: + title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) is on track to hit capacity within + 6h + description: | + This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. + + Details of the Disk inode Utilization per Device per Node resource: + + Disk inode utilization per device per node. + + If this is too high, its possible that a directory is filling up with files. 
Consider logging in an checking temp directories for large numbers of files + grafana_dashboard_id: alerts-sat_disk_inodes + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "39965907" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + linear_prediction_saturation_alert: 6h + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + predict_linear(gitlab_component_saturation:ratio{component="disk_inodes",env="gstg",type="zoekt"}[6h], 21600) + > on (component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} + - alert: component_saturation_slo_out_of_bounds:disk_space + for: 15m + annotations: + title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Disk Space Utilization per Device per Node resource: + + Disk space utilization per device per node. 
+ grafana_dashboard_id: alerts-sat_disk_space + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "2661375984" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + gitlab_component_saturation:ratio{component="disk_space",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} + - alert: ComponentResourceRunningOut_disk_space + for: 15m + annotations: + title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) is on track to hit capacity within + 6h + description: | + This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. + + Details of the Disk Space Utilization per Device per Node resource: + + Disk space utilization per device per node. 
+ grafana_dashboard_id: alerts-sat_disk_space + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "2661375984" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, device) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + linear_prediction_saturation_alert: 6h + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + predict_linear(gitlab_component_saturation:ratio{component="disk_space",env="gstg",type="zoekt"}[6h], 21600) + > on (component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} + - alert: component_saturation_slo_out_of_bounds:memory + for: 5m + annotations: + title: The Memory Utilization per Node resource of the {{ $labels.type }} service + ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to + its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Memory Utilization per Node resource: + + Memory utilization per device per node. 
+ grafana_dashboard_id: alerts-sat_memory + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_memory?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "1955556769" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + gitlab_component_saturation:ratio{component="memory",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="memory"} + - alert: component_saturation_slo_out_of_bounds:nf_conntrack_entries + for: 5m + annotations: + title: The conntrack Entries per Node resource of the {{ $labels.type }} service + ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to + its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the conntrack Entries per Node resource: + + Netfilter connection tracking table utilization per node. + + When saturated, new connection attempts (incoming SYN packets) are dropped with no reply, leaving clients to slowly retry (and typically fail again) over the next several seconds. When packets are being dropped due to this condition, kernel will log the event as: "nf_conntrack: table full, dropping packet". 
+ grafana_dashboard_id: alerts-sat_conntrack + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_conntrack?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "503581002" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, instance) ( + clamp_min( + clamp_max( + max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) + / + node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, instance) ( + clamp_min( + clamp_max( + max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) + / + node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s3 + expr: | + gitlab_component_saturation:ratio{component="nf_conntrack_entries",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="nf_conntrack_entries"} + - alert: component_saturation_slo_out_of_bounds:node_schedstat_waiting + for: 90m + annotations: + title: The Node Scheduler Waiting Time resource of the {{ $labels.type }} service + ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to + its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Node Scheduler Waiting Time resource: + + Measures the amount of scheduler waiting time that processes are waiting to be scheduled, according to [`CPU Scheduling Metrics`](https://www.robustperception.io/cpu-scheduling-metrics-from-the-node-exporter). + + A high value indicates that a node has more processes to be run than CPU time available to handle them, and may lead to degraded responsiveness and performance from the application. + + Additionally, it may indicate that the fleet is under-provisioned. 
+ grafana_dashboard_id: alerts-sat_node_schedstat_waiting + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_node_schedstat_waiting?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "1415313189" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn, shard) ( + clamp_min( + clamp_max( + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn, shard) ( + clamp_min( + clamp_max( + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + gitlab_component_saturation:ratio{component="node_schedstat_waiting",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="node_schedstat_waiting"} + - alert: component_saturation_slo_out_of_bounds:open_fds + for: 5m + annotations: + title: The Open file descriptor utilization per instance resource of the {{ + $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding + SLO and is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Open file descriptor utilization per instance resource: + + Open file descriptor utilization per instance. + + Saturation on file descriptor limits may indicate a resource-descriptor leak in the application. + + As a temporary fix, you may want to consider restarting the affected process. 
+ grafana_dashboard_id: alerts-sat_open_fds + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_open_fds?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "1001792825" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, job, instance) ( + clamp_min( + clamp_max( + ( + process_open_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + or + ( + ruby_file_descriptors{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + ruby_process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, job, instance) ( + clamp_min( + clamp_max( + ( + process_open_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + or + ( + ruby_file_descriptors{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + ruby_process_max_fds{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + pager: pagerduty + rules_domain: general + severity: s2 + expr: | + gitlab_component_saturation:ratio{component="open_fds",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="open_fds"} + - alert: component_saturation_slo_out_of_bounds:shard_cpu + for: 5m + annotations: + title: The Average CPU Utilization per Shard resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Average CPU Utilization per Shard resource: + + This resource measures average CPU utilization across an all cores in a shard of a service fleet. If it is becoming saturated, it may indicate that the shard needs horizontal or vertical scaling. 
+ grafana_dashboard_id: alerts-sat_shard_cpu + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_shard_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "1472933476" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, shard) ( + clamp_min( + clamp_max( + 1 - avg by (environment, tier, type, stage, shard, shard) ( + rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s3 + expr: | + gitlab_component_saturation:ratio{component="shard_cpu",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="shard_cpu"} + - alert: component_saturation_slo_out_of_bounds:single_node_cpu + for: 10m + annotations: + title: The Average CPU Utilization per Node resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. + description: | + This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. + + Details of the Average CPU Utilization per Node resource: + + Average CPU utilization per Node. + + If average CPU is saturated, it may indicate that a fleet is in need to horizontal or vertical scaling. It may also indicate imbalances in load in a fleet. 
+ grafana_dashboard_id: alerts-sat_single_node_cpu + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_single_node_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_panel_id: "3372411356" + grafana_variables: environment,type,stage + promql_query: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m])) + , + 1) + , + 0) + ) + promql_template_1: | + max by(environment, tier, type, stage, shard, fqdn) ( + clamp_min( + clamp_max( + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m])) + , + 1) + , + 0) + ) + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + gitlab_component_saturation:ratio{component="single_node_cpu",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="single_node_cpu"} diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml new file mode 100644 index 0000000000..fd74401a84 --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml @@ -0,0 +1,78 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/saturation.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: GitLab Component Saturation Max SLOs + interval: 5m + rules: + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: cpu + expr: "0.8" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: cpu + expr: "0.9" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: disk_inodes + expr: "0.75" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: disk_inodes + expr: "0.8" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: disk_space + expr: "0.85" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: disk_space + expr: "0.9" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: memory + expr: "0.9" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: memory + expr: "0.98" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: nf_conntrack_entries + expr: "0.95" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: nf_conntrack_entries + expr: "0.98" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: node_schedstat_waiting + expr: "0.1" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: node_schedstat_waiting + expr: "0.15" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: open_fds + expr: "0.8" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: open_fds + expr: "0.9" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: shard_cpu + 
expr: "0.85" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: shard_cpu + expr: "0.95" + - record: slo:max:soft:gitlab_component_saturation:ratio + labels: + component: single_node_cpu + expr: "0.9" + - record: slo:max:hard:gitlab_component_saturation:ratio + labels: + component: single_node_cpu + expr: "0.95" diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml new file mode 100644 index 0000000000..14e37fdc5d --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml @@ -0,0 +1,145 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/saturation.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: Saturation Rules (autogenerated) + interval: 1m + rules: + - record: gitlab_component_saturation:ratio + labels: + component: cpu + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (env, environment, tier, type, stage, shard) ( + rate(node_cpu_seconds_total{mode="idle", env="gstg",type="zoekt"}[5m]) + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: disk_inodes + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - ( + node_filesystem_files_free{fstype=~"(ext.|xfs)", env="gstg",type="zoekt"} + / + node_filesystem_files{fstype=~"(ext.|xfs)", env="gstg",type="zoekt"} + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: disk_space + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + ( + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", env="gstg",type="zoekt"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", env="gstg",type="zoekt"} + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: memory + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + instance:node_memory_utilization:ratio{env="gstg",type="zoekt"} or instance:node_memory_utilisation:ratio{env="gstg",type="zoekt"} + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: nf_conntrack_entries + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + max_over_time(node_nf_conntrack_entries{env="gstg",type="zoekt"}[1m]) + / + node_nf_conntrack_entries_limit{env="gstg",type="zoekt"} + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: node_schedstat_waiting + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{env="gstg",type="zoekt"}[1h])) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: open_fds + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + ( + process_open_fds{env="gstg",type="zoekt"} + / + process_max_fds{env="gstg",type="zoekt"} + ) + or + ( + ruby_file_descriptors{env="gstg",type="zoekt"} + / + ruby_process_max_fds{env="gstg",type="zoekt"} + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: shard_cpu + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + 1 - avg by (env, environment, tier, type, stage, shard, shard) 
( + rate(node_cpu_seconds_total{mode="idle", env="gstg",type="zoekt"}[5m]) + ) + , + 1) + , + 0) + ) + - record: gitlab_component_saturation:ratio + labels: + component: single_node_cpu + expr: | + max by(env, environment, tier, type, stage, shard) ( + clamp_min( + clamp_max( + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", env="gstg",type="zoekt"}[5m])) + , + 1) + , + 0) + ) diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service-anomaly-detection-alerts.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service-anomaly-detection-alerts.yml new file mode 100644 index 0000000000..b9c2f56dfc --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service-anomaly-detection-alerts.yml @@ -0,0 +1,79 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/service-anomaly-detection-alerts.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: zoekt - service_ops_anomaly_detection + rules: + - alert: service_ops_out_of_bounds_upper_5m + for: 5m + annotations: + title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage + }}` stage) is receiving more requests than normal' + description: | + The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) is receiving more requests than normal. This is often caused by user generated traffic, sometimes abuse. It can also be cause by application changes that lead to higher operations rates or from retries in the event of errors. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause. + grafana_dashboard_id: general-service/service-platform-metrics + grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "12" + grafana_panel_id: "2633741645" + grafana_variables: environment,type,stage + link1_title: Definition + link1_url: https://gitlab.com/gitlab-com/runbooks/blob/master/docs/monitoring/definition-service-ops-rate.md + promql_template_1: gitlab_service_ops:rate{environment="$environment", type="$type", + stage="$stage"} + promql_template_2: gitlab_component_ops:rate{environment="$environment", type="$type", + stage="$stage"} + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + ( + ( + (gitlab_service_ops:rate{env="gstg",monitor="global",type="zoekt"} - gitlab_service_ops:rate:prediction{env="gstg",monitor="global",type="zoekt"}) / + gitlab_service_ops:rate:stddev_over_time_1w{env="gstg",monitor="global",type="zoekt"} + ) + > + 3 + ) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} + - alert: service_ops_out_of_bounds_lower_5m + for: 5m + annotations: + title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage + }}` stage) is receiving fewer requests than normal' + description: | + The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) is receiving fewer requests than normal. This is often caused by a failure in an upstream service - for example, an upstream load balancer rejected all incoming traffic. In many cases, this is as serious or more serious than a traffic spike. 
Check upstream services for errors that may be leading to traffic flow issues in downstream services. + grafana_dashboard_id: general-service/service-platform-metrics + grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "12" + grafana_panel_id: "2633741645" + grafana_variables: environment,type,stage + link1_title: Definition + link1_url: https://gitlab.com/gitlab-com/runbooks/blob/master/docs/monitoring/definition-service-ops-rate.md + promql_template_1: gitlab_service_ops:rate{environment="$environment", type="$type", + stage="$stage"} + promql_template_2: gitlab_component_ops:rate{environment="$environment", type="$type", + stage="$stage"} + runbook: docs/{{ $labels.type }}/README.md + labels: + alert_type: cause + rules_domain: general + severity: s4 + expr: | + ( + ( + (gitlab_service_ops:rate{env="gstg",monitor="global",type="zoekt"} - gitlab_service_ops:rate:prediction{env="gstg",monitor="global",type="zoekt"}) / + gitlab_service_ops:rate:stddev_over_time_1w{env="gstg",monitor="global",type="zoekt"} + ) + < + -3 + ) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service_ops_anomaly_detection.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service_ops_anomaly_detection.yml new file mode 100644 index 0000000000..1c1e1feab1 --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-service_ops_anomaly_detection.yml @@ -0,0 +1,38 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. 
USE ./mimir-rules-jsonnet/service-ops-anomaly-detection.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: 'zoekt operation rate weekly statistics: {"env": "gstg", "type": "zoekt"}' + interval: 5m + rules: + - record: gitlab_service_ops:rate:avg_over_time_1w + expr: | + avg_over_time(gitlab_service_ops:rate_5m{env="gstg",monitor="global",type="zoekt"}[1w]) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global",type="zoekt"} + - record: gitlab_service_ops:rate:stddev_over_time_1w + expr: | + stddev_over_time(gitlab_service_ops:rate_5m{env="gstg",monitor="global",type="zoekt"}[1w]) + unless on(tier, type) + gitlab_service:mapping:disable_ops_rate_prediction{monitor="global",type="zoekt"} +- name: 'zoekt ops rate weekly prediction values: {"env": "gstg", "type": "zoekt"}' + interval: 5m + rules: + - record: gitlab_service_ops:rate:prediction + expr: | + quantile(0.5, + label_replace( + gitlab_service_ops:rate_1h{env="gstg",monitor="global",type="zoekt"} offset 10050m # 1 week - 30mins + + delta(gitlab_service_ops:rate:avg_over_time_1w{env="gstg",monitor="global",type="zoekt"}[1w]) + , "p", "1w", "", "") + or + label_replace( + gitlab_service_ops:rate_1h{env="gstg",monitor="global",type="zoekt"} offset 20130m # 2 weeks - 30mins + + delta(gitlab_service_ops:rate:avg_over_time_1w{env="gstg",monitor="global",type="zoekt"}[2w]) + , "p", "2w", "", "") + or + label_replace( + gitlab_service_ops:rate_1h{env="gstg",monitor="global",type="zoekt"} offset 30210m # 3 weeks - 30mins + + delta(gitlab_service_ops:rate:avg_over_time_1w{env="gstg",monitor="global",type="zoekt"}[3w]) + , "p", "3w", "", "") + ) + without (p) diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-zoekt-service-slos.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-zoekt-service-slos.yml new file mode 100644 index 0000000000..c5401614d1 --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-zoekt-service-slos.yml @@ -0,0 +1,18 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. 
USE ./mimir-rules-jsonnet/service-slos.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: Autogenerated Service SLOs + interval: 5m + rules: + - record: slo:min:events:gitlab_service_apdex:ratio + labels: + monitor: global + tier: inf + type: zoekt + expr: "0.999000" + - record: slo:max:events:gitlab_service_errors:ratio + labels: + monitor: global + tier: inf + type: zoekt + expr: "0.001000" diff --git a/thanos-rules/autogenerated-service-slos.yml b/thanos-rules/autogenerated-service-slos.yml index 5b40661342..f391786a05 100644 --- a/thanos-rules/autogenerated-service-slos.yml +++ b/thanos-rules/autogenerated-service-slos.yml @@ -3273,6 +3273,16 @@ groups: tier: sv type: woodhouse expr: "0.001000" + - record: slo:min:events:gitlab_service_apdex:ratio + labels: + tier: inf + type: zoekt + expr: "0.999000" + - record: slo:max:events:gitlab_service_errors:ratio + labels: + tier: inf + type: zoekt + expr: "0.001000" - record: gitlab_service:mapping:disable_ops_rate_prediction labels: tier: inf -- GitLab From 48ca531acd97cd68dd86fc5400ae4bc16aa595b9 Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Mon, 10 Jun 2024 16:11:57 +0200 Subject: [PATCH 3/7] feat(zoekt): skippedMaturityCriteria --- metrics-catalog/services/zoekt.jsonnet | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metrics-catalog/services/zoekt.jsonnet b/metrics-catalog/services/zoekt.jsonnet index a79d9a550e..5a99784501 100644 --- a/metrics-catalog/services/zoekt.jsonnet +++ b/metrics-catalog/services/zoekt.jsonnet @@ -9,4 +9,7 @@ metricsCatalog.serviceDefinition({ errorRatio: 0.999, }, serviceLevelIndicators: {}, + skippedMaturityCriteria: { + 'Structured logs available in Kibana': 'zoekt is an infrastructure component, developers do not interact with it', + }, }) -- GitLab From af9e6cf6e3b91ae2737264dfdf14df1facb90412 Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Mon, 10 Jun 2024 16:17:33 +0200 Subject: [PATCH 4/7] feat(zoekt): serviceDependencies --- metrics-catalog/services/api.jsonnet | 1 + metrics-catalog/services/sidekiq.jsonnet | 3 ++- metrics-catalog/services/web.jsonnet | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/metrics-catalog/services/api.jsonnet b/metrics-catalog/services/api.jsonnet index 082f2838c9..5fa980e799 100644 --- a/metrics-catalog/services/api.jsonnet +++ b/metrics-catalog/services/api.jsonnet @@ -61,6 +61,7 @@ metricsCatalog.serviceDefinition({ search: true, consul: true, 'google-cloud-storage': true, + zoekt: true, }, provisioning: { vms: false, diff --git a/metrics-catalog/services/sidekiq.jsonnet b/metrics-catalog/services/sidekiq.jsonnet index 6f0229efef..e4bca9f108 100644 --- a/metrics-catalog/services/sidekiq.jsonnet +++ b/metrics-catalog/services/sidekiq.jsonnet @@ -22,7 +22,7 @@ local baseSelector = { type: 'sidekiq' } + ignoredWorkers; metricsCatalog.serviceDefinition({ type: 'sidekiq', tier: 'sv', - tenants: [ 'gitlab-gprd', 'gitlab-gstg', 'gitlab-pre' ], + tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'], tags: ['rails'], // overrides monitoringThresholds for specific shards and SLIs @@ -82,6 +82,7 @@ metricsCatalog.serviceDefinition({ search: true, consul: true, 'google-cloud-storage': true, + zoekt: true, }, provisioning: { kubernetes: true, diff --git a/metrics-catalog/services/web.jsonnet b/metrics-catalog/services/web.jsonnet index 56c4e4a28f..9f26798af9 100644 --- a/metrics-catalog/services/web.jsonnet +++ b/metrics-catalog/services/web.jsonnet @@ -14,7 +14,7 @@ local 
railsSelector = { job: 'gitlab-rails', type: 'web' }; metricsCatalog.serviceDefinition({ type: 'web', tier: 'sv', - tenants: [ 'gitlab-gprd', 'gitlab-gstg', 'gitlab-pre' ], + tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'], tags: ['golang', 'rails', 'puma'], @@ -61,6 +61,7 @@ metricsCatalog.serviceDefinition({ search: true, consul: true, 'google-cloud-storage': true, + zoekt: true, }, recordingRuleMetrics: [ 'http_requests_total', -- GitLab From 276e7ba0af6b14d8ed2514984466c84fcbb01429 Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Wed, 12 Jun 2024 15:11:51 +0000 Subject: [PATCH 5/7] feat(zoekt): Kibana url for zoekt logs --- metrics-catalog/services/zoekt.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics-catalog/services/zoekt.jsonnet b/metrics-catalog/services/zoekt.jsonnet index 5a99784501..e3d6430c99 100644 --- a/metrics-catalog/services/zoekt.jsonnet +++ b/metrics-catalog/services/zoekt.jsonnet @@ -10,6 +10,6 @@ metricsCatalog.serviceDefinition({ }, serviceLevelIndicators: {}, skippedMaturityCriteria: { - 'Structured logs available in Kibana': 'zoekt is an infrastructure component, developers do not interact with it', + 'Structured logs available in Kibana': 'logs are available at https://log.gprd.gitlab.net/app/r/s/U9Av8, but not linked to SLIs as there are no SLIs for now.', }, }) -- GitLab From b16811adb7789f8f04d9ee20e691c0a61397540b Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Thu, 13 Jun 2024 17:14:22 +0200 Subject: [PATCH 6/7] feat(zoekt): kube resources --- metrics-catalog/services/zoekt.jsonnet | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/metrics-catalog/services/zoekt.jsonnet b/metrics-catalog/services/zoekt.jsonnet index e3d6430c99..7764076143 100644 --- a/metrics-catalog/services/zoekt.jsonnet +++ b/metrics-catalog/services/zoekt.jsonnet @@ -8,6 +8,26 @@ metricsCatalog.serviceDefinition({ apdexScore: 0.999, errorRatio: 0.999, }, + provisioning: { + kubernetes: true, + vms: false, + }, + kubeResources: { + 'gitlab-zoekt': { + kind: 'StatefulSet', + containers: [ + 'zoekt-indexer', + 'zoekt-webserver', + 'zoekt-internal-gateway', + ], + }, + 'gitlab-zoekt-gateway': { + kind: 'Deployment', + containers: [ + 'zoekt-external-gateway', + ], + }, + }, serviceLevelIndicators: {}, skippedMaturityCriteria: { 'Structured logs available in Kibana': 'logs are available at https://log.gprd.gitlab.net/app/r/s/U9Av8, but not linked to SLIs as there are no SLIs for now.', -- GitLab From d47250b7cef8b4cce624f62fb83ac646a1736c1b Mon Sep 17 00:00:00 2001 From: Hercules Merscher <hmerscher@gitlab.com> Date: Thu, 13 Jun 2024 17:38:34 +0200 Subject: [PATCH 7/7] chore: make generate --- ...ted-kube-state-metrics-recording-rules.yml | 188 ++++--- .../autogenerated-saturation.yml | 44 +- ...tlab-gprd-gprd-zoekt-kube-cause-alerts.yml | 39 ++ ...lab-gprd-gprd-zoekt-kube-state-metrics.yml | 446 ++++++++++++++++ ...tlab-gprd-gprd-zoekt-saturation-alerts.yml | 488 ++++-------------- ...ab-gprd-gprd-zoekt-saturation-metadata.yml | 62 +-- ...ated-gitlab-gprd-gprd-zoekt-saturation.yml | 96 ++-- ...tlab-gstg-gstg-zoekt-kube-cause-alerts.yml | 39 ++ ...lab-gstg-gstg-zoekt-kube-state-metrics.yml | 446 ++++++++++++++++ ...tlab-gstg-gstg-zoekt-saturation-alerts.yml | 488 ++++-------------- ...ab-gstg-gstg-zoekt-saturation-metadata.yml | 62 +-- ...ated-gitlab-gstg-gstg-zoekt-saturation.yml | 96 ++-- 12 files changed, 1415 insertions(+), 1079 deletions(-) create mode 100644 
mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-cause-alerts.yml create mode 100644 mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-state-metrics.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-cause-alerts.yml create mode 100644 mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-state-metrics.yml diff --git a/legacy-prometheus-rules/autogenerated-kube-state-metrics-recording-rules.yml b/legacy-prometheus-rules/autogenerated-kube-state-metrics-recording-rules.yml index fd195a01d9..9661ccc6fe 100644 --- a/legacy-prometheus-rules/autogenerated-kube-state-metrics-recording-rules.yml +++ b/legacy-prometheus-rules/autogenerated-kube-state-metrics-recording-rules.yml @@ -2216,6 +2216,68 @@ groups: "shard", "$0", "label_shard", ".*" ) ) +- name: 'kube-state-metrics-recording-rules: zoekt' + interval: 1m + rules: + - record: kube_pod_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard,label_deployment) ( + label_replace( + label_replace( + label_replace( + topk by(environment,cluster,pod) (1, kube_pod_labels{label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ), + "deployment", "$0", "label_deployment", ".*" + ) + ) + - record: kube_horizontalpodautoscaler_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels{label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) + - record: kube_ingress_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,ingress) (1, kube_ingress_labels{label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) + - record: kube_deployment_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,deployment) (1, kube_deployment_labels{label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) - name: 'kube-state-metrics-recording-rules: enriched label recording rules' interval: 1m rules: @@ -2224,376 +2286,376 @@ groups: container_start_time_seconds{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_cpu_cfs_periods_total:labeled expr: | container_cpu_cfs_periods_total{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_cpu_cfs_throttled_periods_total:labeled expr: | container_cpu_cfs_throttled_periods_total{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_cpu_cfs_throttled_seconds_total:labeled expr: | container_cpu_cfs_throttled_seconds_total{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_cpu_usage_seconds_total:labeled expr: | container_cpu_usage_seconds_total{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_memory_cache:labeled expr: | container_memory_cache{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_memory_rss:labeled expr: | container_memory_rss{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_memory_swap:labeled expr: | container_memory_swap{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_memory_usage_bytes:labeled expr: | container_memory_usage_bytes{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_memory_working_set_bytes:labeled expr: | container_memory_working_set_bytes{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_network_receive_bytes_total:labeled expr: | container_network_receive_bytes_total{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_network_transmit_bytes_total:labeled expr: | container_network_transmit_bytes_total{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_spec_cpu_period:labeled expr: | container_spec_cpu_period{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_spec_cpu_quota:labeled expr: | container_spec_cpu_quota{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_spec_cpu_shares:labeled expr: | container_spec_cpu_shares{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: container_spec_memory_limit_bytes:labeled expr: | container_spec_memory_limit_bytes{metrics_path="/metrics/cadvisor"} * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_resource_limits:labeled expr: | kube_pod_container_resource_limits * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_resource_requests:labeled expr: | kube_pod_container_resource_requests * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_last_terminated_reason:labeled expr: | kube_pod_container_status_last_terminated_reason * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_ready:labeled expr: | kube_pod_container_status_ready * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_restarts_total:labeled expr: | kube_pod_container_status_restarts_total * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_running:labeled expr: | kube_pod_container_status_running * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_terminated:labeled expr: | kube_pod_container_status_terminated * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_terminated_reason:labeled expr: | kube_pod_container_status_terminated_reason * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_waiting:labeled expr: | kube_pod_container_status_waiting * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_pod_container_status_waiting_reason:labeled expr: | kube_pod_container_status_waiting_reason * on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) - topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_spec_target_metric:labeled expr: | kube_horizontalpodautoscaler_spec_target_metric * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_status_condition:labeled expr: | kube_horizontalpodautoscaler_status_condition * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_status_current_replicas:labeled expr: | 
kube_horizontalpodautoscaler_status_current_replicas * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_status_desired_replicas:labeled expr: | kube_horizontalpodautoscaler_status_desired_replicas * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_metadata_generation:labeled expr: | kube_horizontalpodautoscaler_metadata_generation * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, 
kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_spec_max_replicas:labeled expr: | kube_horizontalpodautoscaler_spec_max_replicas * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_horizontalpodautoscaler_spec_min_replicas:labeled expr: | kube_horizontalpodautoscaler_spec_min_replicas * on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) - topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,horizontalpodautoscaler) (1, 
kube_horizontalpodautoscaler_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_node_status_capacity:labeled expr: | kube_node_status_capacity * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_node_status_allocatable:labeled expr: | kube_node_status_allocatable * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_node_status_condition:labeled expr: | kube_node_status_condition * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_schedstat_waiting_seconds_total:labeled expr: | node_schedstat_waiting_seconds_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_cpu_seconds_total:labeled expr: | node_cpu_seconds_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_network_transmit_bytes_total:labeled expr: | node_network_transmit_bytes_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_network_receive_bytes_total:labeled expr: | node_network_receive_bytes_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_disk_reads_completed_total:labeled expr: | node_disk_reads_completed_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_disk_writes_completed_total:labeled expr: | node_disk_writes_completed_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_disk_read_bytes_total:labeled expr: | node_disk_read_bytes_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_disk_written_bytes_total:labeled expr: | node_disk_written_bytes_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_disk_read_time_seconds_total:labeled expr: | node_disk_read_time_seconds_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_disk_write_time_seconds_total:labeled expr: | node_disk_write_time_seconds_total * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_load1:labeled expr: | node_load1 * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_load5:labeled expr: | node_load5 * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_load15:labeled expr: | node_load15 * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: node_vmstat_oom_kill:labeled expr: | node_vmstat_oom_kill * on(environment,cluster,node) group_left(tier,type,stage,shard) - topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: nginx_ingress_controller_requests:labeled expr: | nginx_ingress_controller_requests * on(environment,cluster,ingress) group_left(tier,type,stage,shard) - topk by (environment,cluster,ingress) (1, 
kube_ingress_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,ingress) (1, kube_ingress_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_status_replicas_unavailable:labeled expr: | kube_deployment_status_replicas_unavailable * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_status_replicas_updated:labeled expr: | kube_deployment_status_replicas_updated * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, 
kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_spec_paused:labeled expr: | kube_deployment_spec_paused * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_spec_replicas:labeled expr: | kube_deployment_spec_replicas * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_spec_strategy_rollingupdate_max_surge:labeled expr: | kube_deployment_spec_strategy_rollingupdate_max_surge * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, 
kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_spec_strategy_rollingupdate_max_unavailable:labeled expr: | kube_deployment_spec_strategy_rollingupdate_max_unavailable * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_status_condition:labeled expr: | kube_deployment_status_condition * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, 
kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_status_replicas_available:labeled expr: | kube_deployment_status_replicas_available * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_created:labeled expr: | kube_deployment_created * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_metadata_generation:labeled expr: | kube_deployment_metadata_generation * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, 
kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_status_observed_generation:labeled expr: | kube_deployment_status_observed_generation * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) - record: kube_deployment_status_replicas:labeled expr: | kube_deployment_status_replicas * on(environment,cluster,deployment) group_left(tier,type,stage,shard) - topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse"}) + topk by (environment,cluster,deployment) (1, 
kube_deployment_labels:labeled{type=~"ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|registry|sidekiq|vault|web|web-pages|websockets|woodhouse|zoekt"}) diff --git a/legacy-prometheus-rules/autogenerated-saturation.yml b/legacy-prometheus-rules/autogenerated-saturation.yml index ca319dc3c2..a4eb4b10bb 100644 --- a/legacy-prometheus-rules/autogenerated-saturation.yml +++ b/legacy-prometheus-rules/autogenerated-saturation.yml @@ -79,7 +79,7 @@ groups: clamp_min( clamp_max( 1 - avg by (environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[5m]) + rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[5m]) ) , 1) @@ -94,9 +94,9 @@ groups: clamp_min( clamp_max( 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} + node_filesystem_files_free{fstype=~"(ext.|xfs)", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} / - node_filesystem_files{fstype=~"(ext.|xfs)", 
type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} + node_filesystem_files{fstype=~"(ext.|xfs)", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} ) , 1) @@ -111,7 +111,7 @@ groups: clamp_min( clamp_max( ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} + 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} ) , 1) @@ -366,15 +366,15 @@ groups: clamp_max( ( sum by (environment, tier, type, stage, shard, 
pod, container) ( - rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"}[1h]) + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"}[1h]) ) unless on(environment, tier, type, stage, shard, pod, container) ( - container_spec_cpu_quota:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"} + container_spec_cpu_quota:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"} ) ) / sum by(environment, tier, type, stage, shard, pod, container) ( - kube_pod_container_resource_requests:labeled{container!="", container!="POD", resource="cpu", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"} + kube_pod_container_resource_requests:labeled{container!="", container!="POD", resource="cpu", 
type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"} ) , 1) @@ -389,13 +389,13 @@ groups: clamp_min( clamp_max( sum by (environment, tier, type, stage, shard, pod, container) ( - rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"}[5m]) + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"}[5m]) ) / sum by(environment, tier, type, stage, shard, pod, container) ( - container_spec_cpu_quota:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"} + container_spec_cpu_quota:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"} / - container_spec_cpu_period:labeled{container!="", container!="POD", 
type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"} + container_spec_cpu_period:labeled{container!="", container!="POD", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"} ) , 1) @@ -409,9 +409,9 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - container_memory_working_set_bytes:labeled{container!="", container!="POD", type=~"atlantis|camoproxy|consul|external-dns|istio|kas|kube|logging|mailroom|monitoring|nginx|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|vault|web-pages|woodhouse"} + container_memory_working_set_bytes:labeled{container!="", container!="POD", type=~"atlantis|camoproxy|consul|external-dns|istio|kas|kube|logging|mailroom|monitoring|nginx|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|vault|web-pages|woodhouse|zoekt"} / - (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", type=~"atlantis|camoproxy|consul|external-dns|istio|kas|kube|logging|mailroom|monitoring|nginx|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|vault|web-pages|woodhouse"} > 0) + (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", type=~"atlantis|camoproxy|consul|external-dns|istio|kas|kube|logging|mailroom|monitoring|nginx|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|vault|web-pages|woodhouse|zoekt"} > 0) , 1) , @@ -441,9 +441,9 @@ groups: clamp_min( clamp_max( avg by (environment, tier, type, stage, shard, pod, container)( - rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", 
type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"}[5m]) + rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"}[5m]) / - rate(container_cpu_cfs_periods_total:labeled{container!="", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse"}[5m]) + rate(container_cpu_cfs_periods_total:labeled{container!="", type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt"}[5m]) ) , 1) @@ -457,9 +457,9 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - kube_horizontalpodautoscaler_status_desired_replicas:labeled{type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} + 
kube_horizontalpodautoscaler_status_desired_replicas:labeled{type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} / - kube_horizontalpodautoscaler_spec_max_replicas:labeled{type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} + kube_horizontalpodautoscaler_spec_max_replicas:labeled{type=~"web|ai-assisted|api|atlantis|camoproxy|consul|external-dns|git|internal-api|istio|kas|kube|logging|mailroom|monitoring|nginx|ops-gitlab-net|packagecloud|plantuml|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sidekiq|vault|web-pages|websockets|woodhouse|zoekt", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} , 1) , @@ -523,7 +523,7 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - instance:node_memory_utilization:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} or instance:node_memory_utilisation:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} + 
instance:node_memory_utilization:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} or instance:node_memory_utilisation:ratio{type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} , 1) , @@ -578,9 +578,9 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[1m]) + max_over_time(node_nf_conntrack_entries{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[1m]) / - node_nf_conntrack_entries_limit{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"} + node_nf_conntrack_entries_limit{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"} , 1) , @@ -593,7 +593,7 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - 
avg without (cpu) (rate(node_schedstat_waiting_seconds_total{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[1h])) + avg without (cpu) (rate(node_schedstat_waiting_seconds_total{type=~"patroni|ci-runners|consul|customersdot|frontend|gitaly|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[1h])) , 1) , @@ -1294,7 +1294,7 @@ groups: clamp_min( clamp_max( 1 - avg by (environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[5m]) + rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[5m]) ) , 1) @@ -1361,7 +1361,7 @@ groups: max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages|zoekt"}[5m])) + avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", 
type=~"gitaly|ci-runners|consul|customersdot|frontend|jaeger|monitoring|patroni-ci|patroni-embedding|patroni-registry|patroni|pgbouncer-ci|pgbouncer-embedding|pgbouncer-registry|pgbouncer|postgres-archive|redis-cluster-cache|redis-cluster-chat-cache|redis-cluster-feature-flag|redis-cluster-queues-meta|redis-cluster-ratelimiting|redis-cluster-repo-cache|redis-cluster-shared-state|redis-db-load-balancing|redis-pubsub|redis-registry-cache|redis-sessions|redis-sidekiq|redis-tracechunks|redis|registry|sentry|web-pages"}[5m])) , 1) , diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-cause-alerts.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-cause-alerts.yml new file mode 100644 index 0000000000..73eb427571 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-cause-alerts.yml @@ -0,0 +1,39 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/kube-cause-alerts.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: kube_cause_alerts + rules: + - alert: KubeContainersWaitingInError + for: 20m + annotations: + title: Containers for the `{{ $labels.type }}` service, `{{ $labels.stage }}` + are unable to start. + description: | + More than 50% of the deployment's `maxSurge` setting consists of containers unable to start for reasons other than `ContainerCreating`. + grafana_dashboard_id: alerts-kube_containers_waiting/alerts-containers-waiting + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-kube_containers_waiting/alerts-containers-waiting?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }}&var-cluster={{ $labels.cluster }} + grafana_datasource_id: mimir-gitlab-gprd + grafana_min_zoom_hours: "6" + grafana_variables: environment,type,stage,cluster + labels: + alert_type: cause + pager: pagerduty + runbook: docs/kube/alerts/KubeContainersWaitingInError.md + severity: s2 + team: sre_reliability + expr: | + sum by (type, env, tier, stage, cluster) ( + kube_pod_container_status_waiting_reason:labeled{ + env="gprd",type="zoekt", + reason!="ContainerCreating", + } + ) + > 0 + >= on(type, env, tier, stage, cluster) ( + topk by(type, env, tier, stage, cluster) (1, + kube_deployment_spec_strategy_rollingupdate_max_surge:labeled{env="gprd",type="zoekt"} + ) + * 0.5 + ) diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-state-metrics.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-state-metrics.yml new file mode 100644 index 0000000000..41032f28d0 --- /dev/null +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-kube-state-metrics.yml @@ -0,0 +1,446 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. 
USE ./mimir-rules-jsonnet/kube-state-metrics-recording-rules.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: 'kube-state-metrics-recording-rules: zoekt' + interval: 1m + rules: + - record: kube_pod_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard,label_deployment) ( + label_replace( + label_replace( + label_replace( + topk by(environment,cluster,pod) (1, kube_pod_labels{env="gprd",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ), + "deployment", "$0", "label_deployment", ".*" + ) + ) + - record: kube_horizontalpodautoscaler_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels{env="gprd",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) + - record: kube_ingress_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,ingress) (1, kube_ingress_labels{env="gprd",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) + - record: kube_deployment_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,deployment) (1, kube_deployment_labels{env="gprd",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) +- name: 'kube-state-metrics-recording-rules: enriched label recording rules' + interval: 1m + rules: + - record: container_start_time_seconds:labeled + expr: | + container_start_time_seconds{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_cpu_cfs_periods_total:labeled + expr: | + container_cpu_cfs_periods_total{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_cpu_cfs_throttled_periods_total:labeled + expr: | + container_cpu_cfs_throttled_periods_total{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_cpu_cfs_throttled_seconds_total:labeled + expr: | + container_cpu_cfs_throttled_seconds_total{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_cpu_usage_seconds_total:labeled + expr: | + container_cpu_usage_seconds_total{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_memory_cache:labeled + expr: | + container_memory_cache{env="gprd",metrics_path="/metrics/cadvisor"} + * + 
on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_memory_rss:labeled + expr: | + container_memory_rss{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_memory_swap:labeled + expr: | + container_memory_swap{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_memory_usage_bytes:labeled + expr: | + container_memory_usage_bytes{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_memory_working_set_bytes:labeled + expr: | + container_memory_working_set_bytes{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_network_receive_bytes_total:labeled + expr: | + container_network_receive_bytes_total{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_network_transmit_bytes_total:labeled + expr: | + container_network_transmit_bytes_total{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_spec_cpu_period:labeled + expr: | + container_spec_cpu_period{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_spec_cpu_quota:labeled + expr: | + container_spec_cpu_quota{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_spec_cpu_shares:labeled + expr: | + container_spec_cpu_shares{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: container_spec_memory_limit_bytes:labeled + expr: | + container_spec_memory_limit_bytes{env="gprd",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_resource_limits:labeled + expr: | + kube_pod_container_resource_limits{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: 
kube_pod_container_resource_requests:labeled + expr: | + kube_pod_container_resource_requests{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_last_terminated_reason:labeled + expr: | + kube_pod_container_status_last_terminated_reason{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_ready:labeled + expr: | + kube_pod_container_status_ready{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_restarts_total:labeled + expr: | + kube_pod_container_status_restarts_total{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_running:labeled + expr: | + kube_pod_container_status_running{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_terminated:labeled + expr: | + kube_pod_container_status_terminated{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_terminated_reason:labeled + expr: | + kube_pod_container_status_terminated_reason{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_waiting:labeled + expr: | + kube_pod_container_status_waiting{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_pod_container_status_waiting_reason:labeled + expr: | + kube_pod_container_status_waiting_reason{env="gprd"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_spec_target_metric:labeled + expr: | + kube_horizontalpodautoscaler_spec_target_metric{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_status_condition:labeled + expr: | + kube_horizontalpodautoscaler_status_condition{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_status_current_replicas:labeled + expr: | + kube_horizontalpodautoscaler_status_current_replicas{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) 
group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_status_desired_replicas:labeled + expr: | + kube_horizontalpodautoscaler_status_desired_replicas{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_metadata_generation:labeled + expr: | + kube_horizontalpodautoscaler_metadata_generation{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_spec_max_replicas:labeled + expr: | + kube_horizontalpodautoscaler_spec_max_replicas{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_spec_min_replicas:labeled + expr: | + kube_horizontalpodautoscaler_spec_min_replicas{env="gprd"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_node_status_capacity:labeled + expr: | + kube_node_status_capacity{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_node_status_allocatable:labeled + expr: | + kube_node_status_allocatable{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_node_status_condition:labeled + expr: | + kube_node_status_condition{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_schedstat_waiting_seconds_total:labeled + expr: | + node_schedstat_waiting_seconds_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_cpu_seconds_total:labeled + expr: | + node_cpu_seconds_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_network_transmit_bytes_total:labeled + expr: | + node_network_transmit_bytes_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_network_receive_bytes_total:labeled + expr: | + node_network_receive_bytes_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_disk_reads_completed_total:labeled + expr: | + 
node_disk_reads_completed_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_disk_writes_completed_total:labeled + expr: | + node_disk_writes_completed_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_disk_read_bytes_total:labeled + expr: | + node_disk_read_bytes_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_disk_written_bytes_total:labeled + expr: | + node_disk_written_bytes_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_disk_read_time_seconds_total:labeled + expr: | + node_disk_read_time_seconds_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_disk_write_time_seconds_total:labeled + expr: | + node_disk_write_time_seconds_total{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_load1:labeled + expr: | + node_load1{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_load5:labeled + expr: | + node_load5{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_load15:labeled + expr: | + node_load15{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: node_vmstat_oom_kill:labeled + expr: | + node_vmstat_oom_kill{env="gprd"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gprd",type=~"zoekt"}) + - record: nginx_ingress_controller_requests:labeled + expr: | + nginx_ingress_controller_requests{env="gprd"} + * + on(environment,cluster,ingress) group_left(tier,type,stage,shard) + topk by (environment,cluster,ingress) (1, kube_ingress_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_status_replicas_unavailable:labeled + expr: | + kube_deployment_status_replicas_unavailable{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_status_replicas_updated:labeled + expr: | + kube_deployment_status_replicas_updated{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_spec_paused:labeled + expr: | + kube_deployment_spec_paused{env="gprd"} + * + on(environment,cluster,deployment) 
group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_spec_replicas:labeled + expr: | + kube_deployment_spec_replicas{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_spec_strategy_rollingupdate_max_surge:labeled + expr: | + kube_deployment_spec_strategy_rollingupdate_max_surge{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_spec_strategy_rollingupdate_max_unavailable:labeled + expr: | + kube_deployment_spec_strategy_rollingupdate_max_unavailable{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_status_condition:labeled + expr: | + kube_deployment_status_condition{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_status_replicas_available:labeled + expr: | + kube_deployment_status_replicas_available{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_created:labeled + expr: | + kube_deployment_created{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_metadata_generation:labeled + expr: | + kube_deployment_metadata_generation{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_status_observed_generation:labeled + expr: | + kube_deployment_status_observed_generation{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) + - record: kube_deployment_status_replicas:labeled + expr: | + kube_deployment_status_replicas{env="gprd"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gprd",type=~"zoekt"}) diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml index 191ec4be99..059e78629f 100644 --- a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-alerts.yml @@ -17,147 +17,40 @@ groups: - name: GitLab Saturation Alerts interval: 1m rules: - - alert: component_saturation_slo_out_of_bounds:cpu - for: 5m - annotations: - title: The Average Service CPU Utilization resource of the {{ $labels.type }} - service 
({{ $labels.stage }} stage) has a saturation exceeding SLO and is - close to its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Average Service CPU Utilization resource: - - This resource measures average CPU utilization across an all cores in a service fleet. If it is becoming saturated, it may indicate that the fleet needs horizontal or vertical scaling. - grafana_dashboard_id: alerts-sat_cpu - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gprd - grafana_min_zoom_hours: "6" - grafana_panel_id: "1465724101" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - rules_domain: general - severity: s3 - expr: | - gitlab_component_saturation:ratio{component="cpu",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="cpu"} - - alert: component_saturation_slo_out_of_bounds:disk_inodes + - alert: component_saturation_slo_out_of_bounds:kube_container_cpu_limit for: 15m annotations: - title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type + title: The Kube Container CPU over-utilization resource of the {{ $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to its capacity limit. description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the Disk inode Utilization per Device per Node resource: + Details of the Kube Container CPU over-utilization resource: - Disk inode utilization per device per node. + Kubernetes containers can have a limit configured on how much CPU they can consume in a burst. If we are at this limit, exceeding the allocated requested resources, we should consider revisiting the container's HPA configuration. - - If this is too high, its possible that a directory is filling up with files. Consider logging in an checking temp directories for large numbers of files - grafana_dashboard_id: alerts-sat_disk_inodes - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ + When a container is utilizing CPU resources up to its configured limit for extended periods of time, this could cause it and other running containers to be throttled.
+ grafana_dashboard_id: alerts-sat_kube_container_cpu_limit + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_cpu_limit?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gprd grafana_min_zoom_hours: "6" - grafana_panel_id: "39965907" + grafana_panel_id: "1262336683" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( + max by(environment, tier, type, stage, shard, pod, container) ( clamp_min( clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + sum by (environment, tier, type, stage, shard, pod, container) ( + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - pager: pagerduty - rules_domain: general - severity: s2 - expr: | - gitlab_component_saturation:ratio{component="disk_inodes",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} - - alert: ComponentResourceRunningOut_disk_inodes - for: 15m - annotations: - title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) is on track to hit capacity within - 6h - description: | - This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. - - Details of the Disk inode Utilization per Device per Node resource: - - Disk inode utilization per device per node. - - If this is too high, its possible that a directory is filling up with files. 
Consider logging in an checking temp directories for large numbers of files - grafana_dashboard_id: alerts-sat_disk_inodes - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gprd - grafana_min_zoom_hours: "6" - grafana_panel_id: "39965907" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + sum by(environment, tier, type, stage, shard, pod, container) ( + container_spec_cpu_quota:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_spec_cpu_period:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} ) , 1) @@ -165,13 +58,17 @@ groups: 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( + max by(environment, tier, type, stage, shard, pod, container) ( clamp_min( clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + sum by (environment, tier, type, stage, shard, pod, container) ( + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + / + sum by(environment, tier, type, stage, shard, pod, container) ( + container_spec_cpu_quota:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_spec_cpu_period:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} ) , 1) @@ -181,157 +78,50 @@ groups: runbook: docs/{{ $labels.type }}/README.md labels: alert_type: cause - linear_prediction_saturation_alert: 6h - pager: pagerduty rules_domain: general - severity: s2 + severity: s4 expr: | - predict_linear(gitlab_component_saturation:ratio{component="disk_inodes",env="gprd",type="zoekt"}[6h], 21600) - > on (component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} - - alert: component_saturation_slo_out_of_bounds:disk_space + gitlab_component_saturation:ratio{component="kube_container_cpu_limit",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_cpu_limit"} + - alert: component_saturation_slo_out_of_bounds:kube_container_memory for: 15m annotations: - title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type + title: The Kube Container Memory Utilization resource of the {{ $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to its capacity limit. 
description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the Disk Space Utilization per Device per Node resource: + Details of the Kube Container Memory Utilization resource: - Disk space utilization per device per node. - grafana_dashboard_id: alerts-sat_disk_space - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ + This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed. + grafana_dashboard_id: alerts-sat_kube_container_memory + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_memory?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gprd grafana_min_zoom_hours: "6" - grafana_panel_id: "2661375984" + grafana_panel_id: "172578411" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - pager: pagerduty - rules_domain: general - severity: s2 - expr: | - gitlab_component_saturation:ratio{component="disk_space",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} - - alert: ComponentResourceRunningOut_disk_space - for: 15m - annotations: - title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) is on track to hit capacity within - 6h - description: | - This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. - - Details of the Disk Space Utilization per Device per Node resource: - - Disk space utilization per device per node. 
- grafana_dashboard_id: alerts-sat_disk_space - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gprd - grafana_min_zoom_hours: "6" - grafana_panel_id: "2661375984" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - linear_prediction_saturation_alert: 6h - pager: pagerduty - rules_domain: general - severity: s2 - expr: | - predict_linear(gitlab_component_saturation:ratio{component="disk_space",env="gprd",type="zoekt"}[6h], 21600) - > on (component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} - - alert: component_saturation_slo_out_of_bounds:memory - for: 5m - annotations: - title: The Memory Utilization per Node resource of the {{ $labels.type }} service - ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to - its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Memory Utilization per Node resource: - - Memory utilization per device per node. 
- grafana_dashboard_id: alerts-sat_memory - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_memory?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gprd - grafana_min_zoom_hours: "6" - grafana_panel_id: "1955556769" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn) ( + max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0) , 1) , 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn) ( + max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0) , 1) , @@ -343,49 +133,57 @@ groups: rules_domain: general severity: s4 expr: | - gitlab_component_saturation:ratio{component="memory",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="memory"} - - alert: component_saturation_slo_out_of_bounds:nf_conntrack_entries - for: 5m + gitlab_component_saturation:ratio{component="kube_container_memory",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_memory"} + - alert: component_saturation_slo_out_of_bounds:kube_container_throttling + for: 10m annotations: - title: The conntrack Entries per Node resource of the {{ $labels.type }} service + title: The Kube container throttling resource of the {{ $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to its capacity limit. description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the conntrack Entries per Node resource: + Details of the Kube container throttling resource: - Netfilter connection tracking table utilization per node. + Kube container throttling - When saturated, new connection attempts (incoming SYN packets) are dropped with no reply, leaving clients to slowly retry (and typically fail again) over the next several seconds. When packets are being dropped due to this condition, kernel will log the event as: "nf_conntrack: table full, dropping packet". 
- grafana_dashboard_id: alerts-sat_conntrack - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_conntrack?from=now-6h/m&to=now-1m/m&var-environment={{ + A container will be throttled if it reaches the configured cpu limit for the horizontal pod autoscaler, or when other containers on the node are overutilizing the CPU. + + To get around this, consider increasing the limit for this workload, taking into consideration the requested resources. + grafana_dashboard_id: alerts-kube_container_throttling + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-kube_container_throttling?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gprd grafana_min_zoom_hours: "6" - grafana_panel_id: "503581002" + grafana_panel_id: "54512634" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, instance) ( + quantile by(environment, tier, type, stage, shard, pod, container) ( + 0.99, clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) - / - node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + avg by (environment, tier, type, stage, shard, pod, container)( + rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + / + rate(container_cpu_cfs_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) , 1) , 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, instance) ( + quantile by(environment, tier, type, stage, shard, pod, container) ( + 0.99, clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) - / - node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + avg by (environment, tier, type, stage, shard, pod, container)( + rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + / + rate(container_cpu_cfs_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) , 1) , @@ -397,60 +195,62 @@ groups: rules_domain: general severity: s3 expr: | - gitlab_component_saturation:ratio{component="nf_conntrack_entries",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="nf_conntrack_entries"} - - alert: component_saturation_slo_out_of_bounds:node_schedstat_waiting - for: 90m + gitlab_component_saturation:ratio{component="kube_container_throttling",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_throttling"} + - alert: component_saturation_slo_out_of_bounds:kube_horizontalpodautoscaler_desired_replicas + for: 25m annotations: - title: The Node Scheduler Waiting Time resource of the {{ $labels.type }} service - ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to - its capacity limit.
+ title: The Horizontal Pod Autoscaler Desired Replicas resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the Node Scheduler Waiting Time resource: - - Measures the amount of scheduler waiting time that processes are waiting to be scheduled, according to [`CPU Scheduling Metrics`](https://www.robustperception.io/cpu-scheduling-metrics-from-the-node-exporter). + Details of the Horizontal Pod Autoscaler Desired Replicas resource: - A high value indicates that a node has more processes to be run than CPU time available to handle them, and may lead to degraded responsiveness and performance from the application. + The [Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) automatically scales the number of Pods in a deployment based on metrics. - Additionally, it may indicate that the fleet is under-provisioned. - grafana_dashboard_id: alerts-sat_node_schedstat_waiting - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_node_schedstat_waiting?from=now-6h/m&to=now-1m/m&var-environment={{ + The Horizontal Pod Autoscaler has a configured upper maximum. When this limit is reached, the HPA will not increase the number of pods and other resource saturation (eg, CPU, memory) may occur. + grafana_dashboard_id: alerts-sat_kube_horizontalpodautoscaler + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_horizontalpodautoscaler?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gprd grafana_min_zoom_hours: "6" - grafana_panel_id: "1415313189" + grafana_panel_id: "351198712" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, shard) ( + max by(environment, tier, type, stage, shard, horizontalpodautoscaler, shard) ( clamp_min( clamp_max( - avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + kube_horizontalpodautoscaler_status_desired_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} + / + kube_horizontalpodautoscaler_spec_max_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} , 1) , 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, shard) ( + max by(environment, tier, type, stage, shard, horizontalpodautoscaler, shard) ( clamp_min( clamp_max( - avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + kube_horizontalpodautoscaler_status_desired_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} + / + 
kube_horizontalpodautoscaler_spec_max_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} , 1) , 0) ) - runbook: docs/{{ $labels.type }}/README.md + runbook: docs/kube/kubernetes.md#hpascalecapability labels: alert_type: cause rules_domain: general - severity: s4 + severity: s3 expr: | - gitlab_component_saturation:ratio{component="node_schedstat_waiting",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="node_schedstat_waiting"} + gitlab_component_saturation:ratio{component="kube_horizontalpodautoscaler_desired_replicas",env="gprd",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_horizontalpodautoscaler_desired_replicas"} - alert: component_saturation_slo_out_of_bounds:open_fds for: 5m annotations: @@ -524,105 +324,3 @@ groups: expr: | gitlab_component_saturation:ratio{component="open_fds",env="gprd",type="zoekt"} > on(component) group_left slo:max:hard:gitlab_component_saturation:ratio{component="open_fds"} - - alert: component_saturation_slo_out_of_bounds:shard_cpu - for: 5m - annotations: - title: The Average CPU Utilization per Shard resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and - is close to its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Average CPU Utilization per Shard resource: - - This resource measures average CPU utilization across an all cores in a shard of a service fleet. If it is becoming saturated, it may indicate that the shard needs horizontal or vertical scaling. 
- grafana_dashboard_id: alerts-sat_shard_cpu - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_shard_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gprd - grafana_min_zoom_hours: "6" - grafana_panel_id: "1472933476" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - rules_domain: general - severity: s3 - expr: | - gitlab_component_saturation:ratio{component="shard_cpu",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="shard_cpu"} - - alert: component_saturation_slo_out_of_bounds:single_node_cpu - for: 10m - annotations: - title: The Average CPU Utilization per Node resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and - is close to its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Average CPU Utilization per Node resource: - - Average CPU utilization per Node. - - If average CPU is saturated, it may indicate that a fleet is in need to horizontal or vertical scaling. It may also indicate imbalances in load in a fleet. 
- grafana_dashboard_id: alerts-sat_single_node_cpu - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_single_node_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gprd - grafana_min_zoom_hours: "6" - grafana_panel_id: "3372411356" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn) ( - clamp_min( - clamp_max( - avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m])) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn) ( - clamp_min( - clamp_max( - avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m])) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - rules_domain: general - severity: s4 - expr: | - gitlab_component_saturation:ratio{component="single_node_cpu",env="gprd",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="single_node_cpu"} diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml index fd74401a84..c4b506c3e4 100644 --- a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation-metadata.yml @@ -6,52 +6,44 @@ groups: rules: - record: slo:max:soft:gitlab_component_saturation:ratio labels: - component: cpu - expr: "0.8" + component: kube_container_cpu + expr: "0.95" - record: slo:max:hard:gitlab_component_saturation:ratio labels: - component: cpu - expr: "0.9" + component: kube_container_cpu + expr: "0.99" - record: slo:max:soft:gitlab_component_saturation:ratio labels: - component: disk_inodes - expr: "0.75" + component: kube_container_cpu_limit + expr: "0.9" - record: slo:max:hard:gitlab_component_saturation:ratio labels: - component: disk_inodes - expr: "0.8" + component: kube_container_cpu_limit + expr: "0.99" - record: slo:max:soft:gitlab_component_saturation:ratio labels: - component: disk_space - expr: "0.85" + component: kube_container_memory + expr: "0.8" - record: slo:max:hard:gitlab_component_saturation:ratio labels: - component: disk_space + component: kube_container_memory expr: "0.9" - record: slo:max:soft:gitlab_component_saturation:ratio labels: - component: memory - expr: "0.9" + component: kube_container_throttling + expr: "0.4" - record: slo:max:hard:gitlab_component_saturation:ratio labels: - component: memory - expr: "0.98" + component: kube_container_throttling + expr: "0.5" - record: slo:max:soft:gitlab_component_saturation:ratio labels: - component: nf_conntrack_entries - expr: "0.95" - - record: slo:max:hard:gitlab_component_saturation:ratio - labels: - component: nf_conntrack_entries - expr: "0.98" - - record: slo:max:soft:gitlab_component_saturation:ratio - labels: - component: node_schedstat_waiting - expr: "0.1" + component: kube_horizontalpodautoscaler_desired_replicas + expr: "0.9" - record: slo:max:hard:gitlab_component_saturation:ratio labels: - component: node_schedstat_waiting - expr: "0.15" + component: 
kube_horizontalpodautoscaler_desired_replicas + expr: "0.95" - record: slo:max:soft:gitlab_component_saturation:ratio labels: component: open_fds @@ -60,19 +52,3 @@ groups: labels: component: open_fds expr: "0.9" - - record: slo:max:soft:gitlab_component_saturation:ratio - labels: - component: shard_cpu - expr: "0.85" - - record: slo:max:hard:gitlab_component_saturation:ratio - labels: - component: shard_cpu - expr: "0.95" - - record: slo:max:soft:gitlab_component_saturation:ratio - labels: - component: single_node_cpu - expr: "0.9" - - record: slo:max:hard:gitlab_component_saturation:ratio - labels: - component: single_node_cpu - expr: "0.95" diff --git a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml index ff447b39c1..b64eba3ed6 100644 --- a/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml +++ b/mimir-rules/gitlab-gprd/gprd/zoekt/autogenerated-gitlab-gprd-gprd-zoekt-saturation.yml @@ -6,13 +6,23 @@ groups: rules: - record: gitlab_component_saturation:ratio labels: - component: cpu + component: kube_container_cpu expr: | - max by(env, environment, tier, type, stage, shard) ( + quantile by(env, environment, tier, type, stage, shard) ( + 0.99, clamp_min( clamp_max( - 1 - avg by (env, environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", env="gprd",type="zoekt"}[5m]) + ( + sum by (env, environment, tier, type, stage, shard, pod, container) ( + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", env="gprd",type="zoekt"}[1h]) + ) + unless on(env, environment, tier, type, stage, shard, pod, container) ( + container_spec_cpu_quota:labeled{container!="", container!="POD", env="gprd",type="zoekt"} + ) + ) + / + sum by(env, environment, tier, type, stage, shard, pod, container) ( + kube_pod_container_resource_requests:labeled{container!="", container!="POD", resource="cpu", env="gprd",type="zoekt"} ) , 1) @@ -21,15 +31,19 @@ groups: ) - record: gitlab_component_saturation:ratio labels: - component: disk_inodes + component: kube_container_cpu_limit expr: | max by(env, environment, tier, type, stage, shard) ( clamp_min( clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", env="gprd",type="zoekt"} + sum by (env, environment, tier, type, stage, shard, pod, container) ( + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", env="gprd",type="zoekt"}[5m]) + ) + / + sum by(env, environment, tier, type, stage, shard, pod, container) ( + container_spec_cpu_quota:labeled{container!="", container!="POD", env="gprd",type="zoekt"} / - node_filesystem_files{fstype=~"(ext.|xfs)", env="gprd",type="zoekt"} + container_spec_cpu_period:labeled{container!="", container!="POD", env="gprd",type="zoekt"} ) , 1) @@ -38,14 +52,14 @@ groups: ) - record: gitlab_component_saturation:ratio labels: - component: disk_space + component: kube_container_memory expr: | max by(env, environment, tier, type, stage, shard) ( clamp_min( clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", env="gprd",type="zoekt"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", env="gprd",type="zoekt"} - ) + container_memory_working_set_bytes:labeled{container!="", container!="POD", env="gprd",type="zoekt"} + / + (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", env="gprd",type="zoekt"} > 0) , 1) , @@ -53,12 +67,17 @@ groups: ) - record: 
gitlab_component_saturation:ratio labels: - component: memory + component: kube_container_throttling expr: | - max by(env, environment, tier, type, stage, shard) ( + quantile by(env, environment, tier, type, stage, shard) ( + 0.99, clamp_min( clamp_max( - instance:node_memory_utilization:ratio{env="gprd",type="zoekt"} or instance:node_memory_utilisation:ratio{env="gprd",type="zoekt"} + avg by (env, environment, tier, type, stage, shard, pod, container)( + rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", env="gprd",type="zoekt"}[5m]) + / + rate(container_cpu_cfs_periods_total:labeled{container!="", env="gprd",type="zoekt"}[5m]) + ) , 1) , @@ -66,27 +85,14 @@ groups: ) - record: gitlab_component_saturation:ratio labels: - component: nf_conntrack_entries + component: kube_horizontalpodautoscaler_desired_replicas expr: | max by(env, environment, tier, type, stage, shard) ( clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{env="gprd",type="zoekt"}[1m]) + kube_horizontalpodautoscaler_status_desired_replicas:labeled{env="gprd",type="zoekt", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} / - node_nf_conntrack_entries_limit{env="gprd",type="zoekt"} - , - 1) - , - 0) - ) - - record: gitlab_component_saturation:ratio - labels: - component: node_schedstat_waiting - expr: | - max by(env, environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - avg without (cpu) (rate(node_schedstat_waiting_seconds_total{env="gprd",type="zoekt"}[1h])) + kube_horizontalpodautoscaler_spec_max_replicas:labeled{env="gprd",type="zoekt", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} , 1) , @@ -115,31 +121,3 @@ groups: , 0) ) - - record: gitlab_component_saturation:ratio - labels: - component: shard_cpu - expr: | - max by(env, environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - 1 - avg by (env, environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", env="gprd",type="zoekt"}[5m]) - ) - , - 1) - , - 0) - ) - - record: gitlab_component_saturation:ratio - labels: - component: single_node_cpu - expr: | - max by(env, environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", env="gprd",type="zoekt"}[5m])) - , - 1) - , - 0) - ) diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-cause-alerts.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-cause-alerts.yml new file mode 100644 index 0000000000..23d0dfcfc8 --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-cause-alerts.yml @@ -0,0 +1,39 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/kube-cause-alerts.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: kube_cause_alerts + rules: + - alert: KubeContainersWaitingInError + for: 20m + annotations: + title: Containers for the `{{ $labels.type }}` service, `{{ $labels.stage }}` + are unable to start. + description: | + More than 50% of the deployment's `maxSurge` setting consists of containers unable to start for reasons other than `ContainerCreating`. 
+ grafana_dashboard_id: alerts-kube_containers_waiting/alerts-containers-waiting + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-kube_containers_waiting/alerts-containers-waiting?from=now-6h/m&to=now-1m/m&var-environment={{ + $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage + }}&var-cluster={{ $labels.cluster }} + grafana_datasource_id: mimir-gitlab-gstg + grafana_min_zoom_hours: "6" + grafana_variables: environment,type,stage,cluster + labels: + alert_type: cause + pager: pagerduty + runbook: docs/kube/alerts/KubeContainersWaitingInError.md + severity: s2 + team: sre_reliability + expr: | + sum by (type, env, tier, stage, cluster) ( + kube_pod_container_status_waiting_reason:labeled{ + env="gstg",type="zoekt", + reason!="ContainerCreating", + } + ) + > 0 + >= on(type, env, tier, stage, cluster) ( + topk by(type, env, tier, stage, cluster) (1, + kube_deployment_spec_strategy_rollingupdate_max_surge:labeled{env="gstg",type="zoekt"} + ) + * 0.5 + ) diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-state-metrics.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-state-metrics.yml new file mode 100644 index 0000000000..604b4799a5 --- /dev/null +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-kube-state-metrics.yml @@ -0,0 +1,446 @@ +# WARNING. DO NOT EDIT THIS FILE BY HAND. USE ./mimir-rules-jsonnet/kube-state-metrics-recording-rules.jsonnet TO GENERATE IT +# YOUR CHANGES WILL BE OVERRIDDEN +groups: +- name: 'kube-state-metrics-recording-rules: zoekt' + interval: 1m + rules: + - record: kube_pod_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard,label_deployment) ( + label_replace( + label_replace( + label_replace( + topk by(environment,cluster,pod) (1, kube_pod_labels{env="gstg",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ), + "deployment", "$0", "label_deployment", ".*" + ) + ) + - record: kube_horizontalpodautoscaler_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels{env="gstg",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) + - record: kube_ingress_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,ingress) (1, kube_ingress_labels{env="gstg",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) + - record: kube_deployment_labels:labeled + labels: + tier: inf + type: zoekt + expr: | + group without(label_stage,label_shard) ( + label_replace( + label_replace( + topk by(environment,cluster,deployment) (1, kube_deployment_labels{env="gstg",label_type="zoekt"}), + "stage", "$0", "label_stage", ".*" + ), + "shard", "$0", "label_shard", ".*" + ) + ) +- name: 'kube-state-metrics-recording-rules: enriched label recording rules' + interval: 1m + rules: + - record: container_start_time_seconds:labeled + expr: | + container_start_time_seconds{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, 
kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_cpu_cfs_periods_total:labeled + expr: | + container_cpu_cfs_periods_total{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_cpu_cfs_throttled_periods_total:labeled + expr: | + container_cpu_cfs_throttled_periods_total{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_cpu_cfs_throttled_seconds_total:labeled + expr: | + container_cpu_cfs_throttled_seconds_total{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_cpu_usage_seconds_total:labeled + expr: | + container_cpu_usage_seconds_total{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_memory_cache:labeled + expr: | + container_memory_cache{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_memory_rss:labeled + expr: | + container_memory_rss{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_memory_swap:labeled + expr: | + container_memory_swap{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_memory_usage_bytes:labeled + expr: | + container_memory_usage_bytes{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_memory_working_set_bytes:labeled + expr: | + container_memory_working_set_bytes{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_network_receive_bytes_total:labeled + expr: | + container_network_receive_bytes_total{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_network_transmit_bytes_total:labeled + expr: | + container_network_transmit_bytes_total{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_spec_cpu_period:labeled + expr: | + 
container_spec_cpu_period{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_spec_cpu_quota:labeled + expr: | + container_spec_cpu_quota{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_spec_cpu_shares:labeled + expr: | + container_spec_cpu_shares{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: container_spec_memory_limit_bytes:labeled + expr: | + container_spec_memory_limit_bytes{env="gstg",metrics_path="/metrics/cadvisor"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_resource_limits:labeled + expr: | + kube_pod_container_resource_limits{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_resource_requests:labeled + expr: | + kube_pod_container_resource_requests{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_last_terminated_reason:labeled + expr: | + kube_pod_container_status_last_terminated_reason{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_ready:labeled + expr: | + kube_pod_container_status_ready{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_restarts_total:labeled + expr: | + kube_pod_container_status_restarts_total{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_running:labeled + expr: | + kube_pod_container_status_running{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_terminated:labeled + expr: | + kube_pod_container_status_terminated{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_terminated_reason:labeled + expr: | + kube_pod_container_status_terminated_reason{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_waiting:labeled + expr: | + 
kube_pod_container_status_waiting{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_pod_container_status_waiting_reason:labeled + expr: | + kube_pod_container_status_waiting_reason{env="gstg"} + * + on(environment,cluster,pod) group_left(tier,type,stage,shard,deployment) + topk by (environment,cluster,pod) (1, kube_pod_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_spec_target_metric:labeled + expr: | + kube_horizontalpodautoscaler_spec_target_metric{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_status_condition:labeled + expr: | + kube_horizontalpodautoscaler_status_condition{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_status_current_replicas:labeled + expr: | + kube_horizontalpodautoscaler_status_current_replicas{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_status_desired_replicas:labeled + expr: | + kube_horizontalpodautoscaler_status_desired_replicas{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_metadata_generation:labeled + expr: | + kube_horizontalpodautoscaler_metadata_generation{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_spec_max_replicas:labeled + expr: | + kube_horizontalpodautoscaler_spec_max_replicas{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_horizontalpodautoscaler_spec_min_replicas:labeled + expr: | + kube_horizontalpodautoscaler_spec_min_replicas{env="gstg"} + * + on(environment,cluster,horizontalpodautoscaler) group_left(tier,type,stage,shard) + topk by (environment,cluster,horizontalpodautoscaler) (1, kube_horizontalpodautoscaler_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_node_status_capacity:labeled + expr: | + kube_node_status_capacity{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_node_status_allocatable:labeled + expr: | + kube_node_status_allocatable{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, 
kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_node_status_condition:labeled + expr: | + kube_node_status_condition{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_schedstat_waiting_seconds_total:labeled + expr: | + node_schedstat_waiting_seconds_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_cpu_seconds_total:labeled + expr: | + node_cpu_seconds_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_network_transmit_bytes_total:labeled + expr: | + node_network_transmit_bytes_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_network_receive_bytes_total:labeled + expr: | + node_network_receive_bytes_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_disk_reads_completed_total:labeled + expr: | + node_disk_reads_completed_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_disk_writes_completed_total:labeled + expr: | + node_disk_writes_completed_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_disk_read_bytes_total:labeled + expr: | + node_disk_read_bytes_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_disk_written_bytes_total:labeled + expr: | + node_disk_written_bytes_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_disk_read_time_seconds_total:labeled + expr: | + node_disk_read_time_seconds_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_disk_write_time_seconds_total:labeled + expr: | + node_disk_write_time_seconds_total{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_load1:labeled + expr: | + node_load1{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_load5:labeled + expr: | + node_load5{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_load15:labeled + expr: | + node_load15{env="gstg"} + * + 
on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: node_vmstat_oom_kill:labeled + expr: | + node_vmstat_oom_kill{env="gstg"} + * + on(environment,cluster,node) group_left(tier,type,stage,shard) + topk by (environment,cluster,node) (1, kube_node_labels:labeled{env="gstg",type=~"zoekt"}) + - record: nginx_ingress_controller_requests:labeled + expr: | + nginx_ingress_controller_requests{env="gstg"} + * + on(environment,cluster,ingress) group_left(tier,type,stage,shard) + topk by (environment,cluster,ingress) (1, kube_ingress_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_status_replicas_unavailable:labeled + expr: | + kube_deployment_status_replicas_unavailable{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_status_replicas_updated:labeled + expr: | + kube_deployment_status_replicas_updated{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_spec_paused:labeled + expr: | + kube_deployment_spec_paused{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_spec_replicas:labeled + expr: | + kube_deployment_spec_replicas{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_spec_strategy_rollingupdate_max_surge:labeled + expr: | + kube_deployment_spec_strategy_rollingupdate_max_surge{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_spec_strategy_rollingupdate_max_unavailable:labeled + expr: | + kube_deployment_spec_strategy_rollingupdate_max_unavailable{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_status_condition:labeled + expr: | + kube_deployment_status_condition{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_status_replicas_available:labeled + expr: | + kube_deployment_status_replicas_available{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_created:labeled + expr: | + kube_deployment_created{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_metadata_generation:labeled + expr: | + kube_deployment_metadata_generation{env="gstg"} + * + 
on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_status_observed_generation:labeled + expr: | + kube_deployment_status_observed_generation{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) + - record: kube_deployment_status_replicas:labeled + expr: | + kube_deployment_status_replicas{env="gstg"} + * + on(environment,cluster,deployment) group_left(tier,type,stage,shard) + topk by (environment,cluster,deployment) (1, kube_deployment_labels:labeled{env="gstg",type=~"zoekt"}) diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml index 221970a5e4..958e4afe31 100644 --- a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml +++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-alerts.yml @@ -17,147 +17,40 @@ groups: - name: GitLab Saturation Alerts interval: 1m rules: - - alert: component_saturation_slo_out_of_bounds:cpu - for: 5m - annotations: - title: The Average Service CPU Utilization resource of the {{ $labels.type }} - service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is - close to its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Average Service CPU Utilization resource: - - This resource measures average CPU utilization across an all cores in a service fleet. If it is becoming saturated, it may indicate that the fleet needs horizontal or vertical scaling. 
- grafana_dashboard_id: alerts-sat_cpu - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gstg - grafana_min_zoom_hours: "6" - grafana_panel_id: "1465724101" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - rules_domain: general - severity: s3 - expr: | - gitlab_component_saturation:ratio{component="cpu",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="cpu"} - - alert: component_saturation_slo_out_of_bounds:disk_inodes + - alert: component_saturation_slo_out_of_bounds:kube_container_cpu_limit for: 15m annotations: - title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type + title: The Kube Container CPU over-utilization resource of the {{ $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to its capacity limit. description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the Disk inode Utilization per Device per Node resource: + Details of the Kube Container CPU over-utilization resource: - Disk inode utilization per device per node. + Kubernetes containers can have a limit configured on how much CPU they can consume in a burst. If we are at this limit, exceeding the allocated requested resources, we should consider revisiting the container's HPA configuration. - If this is too high, its possible that a directory is filling up with files. Consider logging in an checking temp directories for large numbers of files + When a container is utilizing CPU resources up to its configured limit for extended periods of time, this could cause it and other running containers to be throttled. 
+ grafana_dashboard_id: alerts-sat_kube_container_cpu_limit + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_cpu_limit?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gstg grafana_min_zoom_hours: "6" - grafana_panel_id: "39965907" + grafana_panel_id: "1262336683" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( + max by(environment, tier, type, stage, shard, pod, container) ( clamp_min( clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + sum by (environment, tier, type, stage, shard, pod, container) ( + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - pager: pagerduty - rules_domain: general - severity: s2 - expr: | - gitlab_component_saturation:ratio{component="disk_inodes",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} - - alert: ComponentResourceRunningOut_disk_inodes - for: 15m - annotations: - title: The Disk inode Utilization per Device per Node resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) is on track to hit capacity within - 6h - description: | - This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. - - Details of the Disk inode Utilization per Device per Node resource: - - Disk inode utilization per device per node. - - If this is too high, its possible that a directory is filling up with files. 
Consider logging in an checking temp directories for large numbers of files - grafana_dashboard_id: alerts-sat_disk_inodes - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_inodes?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gstg - grafana_min_zoom_hours: "6" - grafana_panel_id: "39965907" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + sum by(environment, tier, type, stage, shard, pod, container) ( + container_spec_cpu_quota:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_spec_cpu_period:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} ) , 1) @@ -165,13 +58,17 @@ groups: 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( + max by(environment, tier, type, stage, shard, pod, container) ( clamp_min( clamp_max( - 1 - ( - node_filesystem_files_free{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + sum by (environment, tier, type, stage, shard, pod, container) ( + rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) + / + sum by(environment, tier, type, stage, shard, pod, container) ( + container_spec_cpu_quota:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / - node_filesystem_files{fstype=~"(ext.|xfs)", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_spec_cpu_period:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} ) , 1) @@ -181,157 +78,50 @@ groups: runbook: docs/{{ $labels.type }}/README.md labels: alert_type: cause - linear_prediction_saturation_alert: 6h - pager: pagerduty rules_domain: general - severity: s2 + severity: s4 expr: | - predict_linear(gitlab_component_saturation:ratio{component="disk_inodes",env="gstg",type="zoekt"}[6h], 21600) - > on (component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_inodes"} - - alert: component_saturation_slo_out_of_bounds:disk_space + gitlab_component_saturation:ratio{component="kube_container_cpu_limit",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_cpu_limit"} + - alert: component_saturation_slo_out_of_bounds:kube_container_memory for: 15m annotations: - title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type + title: The Kube Container Memory Utilization resource of the {{ $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to its capacity limit. 
description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the Disk Space Utilization per Device per Node resource: + Details of the Kube Container Memory Utilization resource: - Disk space utilization per device per node. - grafana_dashboard_id: alerts-sat_disk_space - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ + This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed. + grafana_dashboard_id: alerts-sat_kube_container_memory + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_memory?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gstg grafana_min_zoom_hours: "6" - grafana_panel_id: "2661375984" + grafana_panel_id: "172578411" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - pager: pagerduty - rules_domain: general - severity: s2 - expr: | - gitlab_component_saturation:ratio{component="disk_space",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} - - alert: ComponentResourceRunningOut_disk_space - for: 15m - annotations: - title: The Disk Space Utilization per Device per Node resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) is on track to hit capacity within - 6h - description: | - This means that this resource is growing rapidly and is predicted to exceed saturation threshold within 6h. - - Details of the Disk Space Utilization per Device per Node resource: - - Disk space utilization per device per node. 
- grafana_dashboard_id: alerts-sat_disk_space - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_disk_space?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gstg - grafana_min_zoom_hours: "6" - grafana_panel_id: "2661375984" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, device) ( - clamp_min( - clamp_max( - ( - 1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - linear_prediction_saturation_alert: 6h - pager: pagerduty - rules_domain: general - severity: s2 - expr: | - predict_linear(gitlab_component_saturation:ratio{component="disk_space",env="gstg",type="zoekt"}[6h], 21600) - > on (component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="disk_space"} - - alert: component_saturation_slo_out_of_bounds:memory - for: 5m - annotations: - title: The Memory Utilization per Node resource of the {{ $labels.type }} service - ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to - its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Memory Utilization per Node resource: - - Memory utilization per device per node. 
- grafana_dashboard_id: alerts-sat_memory - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_memory?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gstg - grafana_min_zoom_hours: "6" - grafana_panel_id: "1955556769" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, fqdn) ( + max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0) , 1) , 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn) ( + max by(environment, tier, type, stage, shard) ( clamp_min( clamp_max( - instance:node_memory_utilization:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} or instance:node_memory_utilisation:ratio{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + / + (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0) , 1) , @@ -343,49 +133,57 @@ groups: rules_domain: general severity: s4 expr: | - gitlab_component_saturation:ratio{component="memory",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="memory"} - - alert: component_saturation_slo_out_of_bounds:nf_conntrack_entries - for: 5m + gitlab_component_saturation:ratio{component="kube_container_memory",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_memory"} + - alert: component_saturation_slo_out_of_bounds:kube_container_throttling + for: 10m annotations: - title: The conntrack Entries per Node resource of the {{ $labels.type }} service + title: The Kube container throttling resource of the {{ $labels.type }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to its capacity limit. description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the conntrack Entries per Node resource: + Details of the Kube container throttling resource: - Netfilter connection tracking table utilization per node. + Kube container throttling - When saturated, new connection attempts (incoming SYN packets) are dropped with no reply, leaving clients to slowly retry (and typically fail again) over the next several seconds. When packets are being dropped due to this condition, kernel will log the event as: "nf_conntrack: table full, dropping packet". 
- grafana_dashboard_id: alerts-sat_conntrack - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_conntrack?from=now-6h/m&to=now-1m/m&var-environment={{ + A container will be throttled if it reaches the configured cpu limit for the horizontal pod autoscaler, or when other containers on the node are overutilizing the CPU. + + To get around this, consider increasing the limit for this workload, taking into consideration the requested resources. + grafana_dashboard_id: alerts-kube_container_throttling + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-kube_container_throttling?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gstg grafana_min_zoom_hours: "6" - grafana_panel_id: "503581002" + grafana_panel_id: "54512634" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, instance) ( + quantile by(environment, tier, type, stage, shard, pod, container) ( + 0.99, clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) - / - node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + avg by (environment, tier, type, stage, shard, pod, container)( + rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + / + rate(container_cpu_cfs_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) , 1) , 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, instance) ( + quantile by(environment, tier, type, stage, shard, pod, container) ( + 0.99, clamp_min( clamp_max( - max_over_time(node_nf_conntrack_entries{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1m]) - / - node_nf_conntrack_entries_limit{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} + avg by (environment, tier, type, stage, shard, pod, container)( + rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + / + rate(container_cpu_cfs_periods_total:labeled{container!="", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) + ) , 1) , @@ -397,60 +195,62 @@ groups: rules_domain: general severity: s3 expr: | - gitlab_component_saturation:ratio{component="nf_conntrack_entries",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="nf_conntrack_entries"} - - alert: component_saturation_slo_out_of_bounds:node_schedstat_waiting - for: 90m + gitlab_component_saturation:ratio{component="kube_container_throttling",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_throttling"} + - alert: component_saturation_slo_out_of_bounds:kube_horizontalpodautoscaler_desired_replicas + for: 25m annotations: - title: The Node Scheduler Waiting Time resource of the {{ $labels.type }} service - ({{ $labels.stage }} stage) has a saturation exceeding SLO and is close to - its capacity limit. 
+ title: The Horizontal Pod Autoscaler Desired Replicas resource of the {{ $labels.type + }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and + is close to its capacity limit. description: | This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - Details of the Node Scheduler Waiting Time resource: - - Measures the amount of scheduler waiting time that processes are waiting to be scheduled, according to [`CPU Scheduling Metrics`](https://www.robustperception.io/cpu-scheduling-metrics-from-the-node-exporter). + Details of the Horizontal Pod Autoscaler Desired Replicas resource: - A high value indicates that a node has more processes to be run than CPU time available to handle them, and may lead to degraded responsiveness and performance from the application. + The [Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) automatically scales the number of Pods in a deployment based on metrics. - Additionally, it may indicate that the fleet is under-provisioned. - grafana_dashboard_id: alerts-sat_node_schedstat_waiting - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_node_schedstat_waiting?from=now-6h/m&to=now-1m/m&var-environment={{ + The Horizontal Pod Autoscaler has a configured upper maximum. When this limit is reached, the HPA will not increase the number of pods and other resource saturation (eg, CPU, memory) may occur. + grafana_dashboard_id: alerts-sat_kube_horizontalpodautoscaler + grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_horizontalpodautoscaler?from=now-6h/m&to=now-1m/m&var-environment={{ $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage }} grafana_datasource_id: mimir-gitlab-gstg grafana_min_zoom_hours: "6" - grafana_panel_id: "1415313189" + grafana_panel_id: "351198712" grafana_variables: environment,type,stage promql_query: | - max by(environment, tier, type, stage, shard, fqdn, shard) ( + max by(environment, tier, type, stage, shard, horizontalpodautoscaler, shard) ( clamp_min( clamp_max( - avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + kube_horizontalpodautoscaler_status_desired_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} + / + kube_horizontalpodautoscaler_spec_max_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} , 1) , 0) ) promql_template_1: | - max by(environment, tier, type, stage, shard, fqdn, shard) ( + max by(environment, tier, type, stage, shard, horizontalpodautoscaler, shard) ( clamp_min( clamp_max( - avg without (cpu) (rate(node_schedstat_waiting_seconds_total{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[1h])) + kube_horizontalpodautoscaler_status_desired_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} + / + 
kube_horizontalpodautoscaler_spec_max_replicas:labeled{environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"} , 1) , 0) ) - runbook: docs/{{ $labels.type }}/README.md + runbook: docs/kube/kubernetes.md#hpascalecapability labels: alert_type: cause rules_domain: general - severity: s4 + severity: s3 expr: | - gitlab_component_saturation:ratio{component="node_schedstat_waiting",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="node_schedstat_waiting"} + gitlab_component_saturation:ratio{component="kube_horizontalpodautoscaler_desired_replicas",env="gstg",type="zoekt"} > on(component) group_left + slo:max:hard:gitlab_component_saturation:ratio{component="kube_horizontalpodautoscaler_desired_replicas"} - alert: component_saturation_slo_out_of_bounds:open_fds for: 5m annotations: @@ -524,105 +324,3 @@ groups: expr: | gitlab_component_saturation:ratio{component="open_fds",env="gstg",type="zoekt"} > on(component) group_left slo:max:hard:gitlab_component_saturation:ratio{component="open_fds"} - - alert: component_saturation_slo_out_of_bounds:shard_cpu - for: 5m - annotations: - title: The Average CPU Utilization per Shard resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and - is close to its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Average CPU Utilization per Shard resource: - - This resource measures average CPU utilization across an all cores in a shard of a service fleet. If it is becoming saturated, it may indicate that the shard needs horizontal or vertical scaling. 
- grafana_dashboard_id: alerts-sat_shard_cpu - grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_shard_cpu?from=now-6h/m&to=now-1m/m&var-environment={{ - $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage - }} - grafana_datasource_id: mimir-gitlab-gstg - grafana_min_zoom_hours: "6" - grafana_panel_id: "1472933476" - grafana_variables: environment,type,stage - promql_query: | - max by(environment, tier, type, stage, shard, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - promql_template_1: | - max by(environment, tier, type, stage, shard, shard) ( - clamp_min( - clamp_max( - 1 - avg by (environment, tier, type, stage, shard, shard) ( - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]) - ) - , - 1) - , - 0) - ) - runbook: docs/{{ $labels.type }}/README.md - labels: - alert_type: cause - rules_domain: general - severity: s3 - expr: | - gitlab_component_saturation:ratio{component="shard_cpu",env="gstg",type="zoekt"} > on(component) group_left - slo:max:hard:gitlab_component_saturation:ratio{component="shard_cpu"} - - alert: component_saturation_slo_out_of_bounds:single_node_cpu - for: 10m - annotations: - title: The Average CPU Utilization per Node resource of the {{ $labels.type - }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and - is close to its capacity limit. - description: | - This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit. - - Details of the Average CPU Utilization per Node resource: - - Average CPU utilization per Node. - - If average CPU is saturated, it may indicate that a fleet is in need to horizontal or vertical scaling. It may also indicate imbalances in load in a fleet. 
-      grafana_dashboard_id: alerts-sat_single_node_cpu
-      grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_single_node_cpu?from=now-6h/m&to=now-1m/m&var-environment={{
-        $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
-        }}
-      grafana_datasource_id: mimir-gitlab-gstg
-      grafana_min_zoom_hours: "6"
-      grafana_panel_id: "3372411356"
-      grafana_variables: environment,type,stage
-      promql_query: |
-        max by(environment, tier, type, stage, shard, fqdn) (
-          clamp_min(
-            clamp_max(
-              avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]))
-              ,
-              1)
-            ,
-            0)
-        )
-      promql_template_1: |
-        max by(environment, tier, type, stage, shard, fqdn) (
-          clamp_min(
-            clamp_max(
-              avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}[5m]))
-              ,
-              1)
-            ,
-            0)
-        )
-      runbook: docs/{{ $labels.type }}/README.md
-    labels:
-      alert_type: cause
-      rules_domain: general
-      severity: s4
-    expr: |
-      gitlab_component_saturation:ratio{component="single_node_cpu",env="gstg",type="zoekt"} > on(component) group_left
-      slo:max:hard:gitlab_component_saturation:ratio{component="single_node_cpu"}
diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml
index fd74401a84..c4b506c3e4 100644
--- a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml
+++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation-metadata.yml
@@ -6,52 +6,44 @@ groups:
   rules:
   - record: slo:max:soft:gitlab_component_saturation:ratio
     labels:
-      component: cpu
-    expr: "0.8"
+      component: kube_container_cpu
+    expr: "0.95"
   - record: slo:max:hard:gitlab_component_saturation:ratio
     labels:
-      component: cpu
-    expr: "0.9"
+      component: kube_container_cpu
+    expr: "0.99"
   - record: slo:max:soft:gitlab_component_saturation:ratio
     labels:
-      component: disk_inodes
-    expr: "0.75"
+      component: kube_container_cpu_limit
+    expr: "0.9"
   - record: slo:max:hard:gitlab_component_saturation:ratio
     labels:
-      component: disk_inodes
-    expr: "0.8"
+      component: kube_container_cpu_limit
+    expr: "0.99"
   - record: slo:max:soft:gitlab_component_saturation:ratio
     labels:
-      component: disk_space
-    expr: "0.85"
+      component: kube_container_memory
+    expr: "0.8"
   - record: slo:max:hard:gitlab_component_saturation:ratio
     labels:
-      component: disk_space
+      component: kube_container_memory
     expr: "0.9"
   - record: slo:max:soft:gitlab_component_saturation:ratio
     labels:
-      component: memory
-    expr: "0.9"
+      component: kube_container_throttling
+    expr: "0.4"
   - record: slo:max:hard:gitlab_component_saturation:ratio
     labels:
-      component: memory
-    expr: "0.98"
+      component: kube_container_throttling
+    expr: "0.5"
   - record: slo:max:soft:gitlab_component_saturation:ratio
     labels:
-      component: nf_conntrack_entries
-    expr: "0.95"
-  - record: slo:max:hard:gitlab_component_saturation:ratio
-    labels:
-      component: nf_conntrack_entries
-    expr: "0.98"
-  - record: slo:max:soft:gitlab_component_saturation:ratio
-    labels:
-      component: node_schedstat_waiting
-    expr: "0.1"
+      component: kube_horizontalpodautoscaler_desired_replicas
+    expr: "0.9"
   - record: slo:max:hard:gitlab_component_saturation:ratio
     labels:
-      component: node_schedstat_waiting
-    expr: "0.15"
+      component: kube_horizontalpodautoscaler_desired_replicas
+    expr: "0.95"
   - record: slo:max:soft:gitlab_component_saturation:ratio
     labels:
       component: open_fds
@@ -60,19 +52,3 @@ groups:
     labels:
       component: open_fds
     expr: "0.9"
-  - record: slo:max:soft:gitlab_component_saturation:ratio
-    labels:
-      component: shard_cpu
-    expr: "0.85"
-  - record: slo:max:hard:gitlab_component_saturation:ratio
-    labels:
-      component: shard_cpu
-    expr: "0.95"
-  - record: slo:max:soft:gitlab_component_saturation:ratio
-    labels:
-      component: single_node_cpu
-    expr: "0.9"
-  - record: slo:max:hard:gitlab_component_saturation:ratio
-    labels:
-      component: single_node_cpu
-    expr: "0.95"
diff --git a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml
index 14e37fdc5d..50aa2a03e8 100644
--- a/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml
+++ b/mimir-rules/gitlab-gstg/gstg/zoekt/autogenerated-gitlab-gstg-gstg-zoekt-saturation.yml
@@ -6,13 +6,23 @@ groups:
   rules:
   - record: gitlab_component_saturation:ratio
     labels:
-      component: cpu
+      component: kube_container_cpu
     expr: |
-      max by(env, environment, tier, type, stage, shard) (
+      quantile by(env, environment, tier, type, stage, shard) (
+        0.99,
         clamp_min(
           clamp_max(
-            1 - avg by (env, environment, tier, type, stage, shard) (
-              rate(node_cpu_seconds_total{mode="idle", env="gstg",type="zoekt"}[5m])
+            (
+              sum by (env, environment, tier, type, stage, shard, pod, container) (
+                rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", env="gstg",type="zoekt"}[1h])
+              )
+              unless on(env, environment, tier, type, stage, shard, pod, container) (
+                container_spec_cpu_quota:labeled{container!="", container!="POD", env="gstg",type="zoekt"}
+              )
+            )
+            /
+            sum by(env, environment, tier, type, stage, shard, pod, container) (
+              kube_pod_container_resource_requests:labeled{container!="", container!="POD", resource="cpu", env="gstg",type="zoekt"}
             )
             ,
             1)
@@ -21,15 +31,19 @@ groups:
       )
   - record: gitlab_component_saturation:ratio
     labels:
-      component: disk_inodes
+      component: kube_container_cpu_limit
     expr: |
       max by(env, environment, tier, type, stage, shard) (
         clamp_min(
           clamp_max(
-            1 - (
-              node_filesystem_files_free{fstype=~"(ext.|xfs)", env="gstg",type="zoekt"}
+            sum by (env, environment, tier, type, stage, shard, pod, container) (
+              rate(container_cpu_usage_seconds_total:labeled{container!="", container!="POD", env="gstg",type="zoekt"}[5m])
+            )
+            /
+            sum by(env, environment, tier, type, stage, shard, pod, container) (
+              container_spec_cpu_quota:labeled{container!="", container!="POD", env="gstg",type="zoekt"}
               /
-              node_filesystem_files{fstype=~"(ext.|xfs)", env="gstg",type="zoekt"}
+              container_spec_cpu_period:labeled{container!="", container!="POD", env="gstg",type="zoekt"}
             )
             ,
             1)
@@ -38,14 +52,14 @@ groups:
       )
   - record: gitlab_component_saturation:ratio
     labels:
-      component: disk_space
+      component: kube_container_memory
     expr: |
       max by(env, environment, tier, type, stage, shard) (
         clamp_min(
           clamp_max(
-            (
-              1 - node_filesystem_avail_bytes{fstype=~"ext.|xfs", env="gstg",type="zoekt"} / node_filesystem_size_bytes{fstype=~"ext.|xfs", env="gstg",type="zoekt"}
-            )
+            container_memory_working_set_bytes:labeled{container!="", container!="POD", env="gstg",type="zoekt"}
+            /
+            (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", env="gstg",type="zoekt"} > 0)
             ,
             1)
           ,
@@ -53,12 +67,17 @@ groups:
       )
   - record: gitlab_component_saturation:ratio
     labels:
-      component: memory
+      component: kube_container_throttling
     expr: |
-      max by(env, environment, tier, type, stage, shard) (
+      quantile by(env, environment, tier, type, stage, shard) (
+        0.99,
         clamp_min(
           clamp_max(
-            instance:node_memory_utilization:ratio{env="gstg",type="zoekt"} or instance:node_memory_utilisation:ratio{env="gstg",type="zoekt"}
+            avg by (env, environment, tier, type, stage, shard, pod, container)(
+              rate(container_cpu_cfs_throttled_periods_total:labeled{container!="", env="gstg",type="zoekt"}[5m])
+              /
+              rate(container_cpu_cfs_periods_total:labeled{container!="", env="gstg",type="zoekt"}[5m])
+            )
             ,
             1)
           ,
@@ -66,27 +85,14 @@ groups:
       )
   - record: gitlab_component_saturation:ratio
     labels:
-      component: nf_conntrack_entries
+      component: kube_horizontalpodautoscaler_desired_replicas
     expr: |
       max by(env, environment, tier, type, stage, shard) (
         clamp_min(
           clamp_max(
-            max_over_time(node_nf_conntrack_entries{env="gstg",type="zoekt"}[1m])
+            kube_horizontalpodautoscaler_status_desired_replicas:labeled{env="gstg",type="zoekt", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"}
             /
-            node_nf_conntrack_entries_limit{env="gstg",type="zoekt"}
-            ,
-            1)
-          ,
-          0)
-      )
-  - record: gitlab_component_saturation:ratio
-    labels:
-      component: node_schedstat_waiting
-    expr: |
-      max by(env, environment, tier, type, stage, shard) (
-        clamp_min(
-          clamp_max(
-            avg without (cpu) (rate(node_schedstat_waiting_seconds_total{env="gstg",type="zoekt"}[1h]))
+            kube_horizontalpodautoscaler_spec_max_replicas:labeled{env="gstg",type="zoekt", shard!~"database-throttled|elasticsearch|gitaly-throttled|urgent-authorized-projects|urgent-other", namespace!~"pubsubbeat"}
             ,
             1)
           ,
@@ -115,31 +121,3 @@ groups:
           ,
           0)
       )
-  - record: gitlab_component_saturation:ratio
-    labels:
-      component: shard_cpu
-    expr: |
-      max by(env, environment, tier, type, stage, shard) (
-        clamp_min(
-          clamp_max(
-            1 - avg by (env, environment, tier, type, stage, shard, shard) (
-              rate(node_cpu_seconds_total{mode="idle", env="gstg",type="zoekt"}[5m])
-            )
-            ,
-            1)
-          ,
-          0)
-      )
-  - record: gitlab_component_saturation:ratio
-    labels:
-      component: single_node_cpu
-    expr: |
-      max by(env, environment, tier, type, stage, shard) (
-        clamp_min(
-          clamp_max(
-            avg without(cpu, mode) (1 - rate(node_cpu_seconds_total{mode="idle", env="gstg",type="zoekt"}[5m]))
-            ,
-            1)
-          ,
-          0)
-      )
-- 
GitLab