Commit f9cd633f authored by Igor

Switch plantuml from kube_container_memory to kube_container_rss saturation point
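In practice this records plantuml's memory saturation from the container's resident set size instead of the cAdvisor working set. A rough sketch of the resulting ratio, abbreviated from the generated recording rules in the diff below (the env/type matchers and full label selectors are trimmed here for readability):

    # Sketch only: the generated rules below carry the complete label selectors and env/type matchers.
    max by (environment, tier, type, stage, shard) (
      clamp_min(
        clamp_max(
          container_memory_rss:labeled{container!="", container!="POD"}
          /
          (container_spec_memory_limit_bytes:labeled{container!="", container!="POD"} > 0),
          1
        ),
        0
      )
    )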

parent f4b80b21
Showing 86 additions and 75 deletions
@@ -6,7 +6,7 @@ local resourceSaturationPoint = (import 'servicemetrics/resource_saturation_poin
 title: 'Kube Container Memory Utilization (RSS)',
 severity: 's4',
 horizontallyScalable: true,
-appliesTo: metricsCatalog.findServicesWithTag(tag='rails'),
+appliesTo: metricsCatalog.findServicesWithTag(tag='kube_container_rss'),
 description: |||
 Records the total anonymous (unevictable) memory utilization for containers for this
 service, as a percentage of the memory limit as configured through Kubernetes.
...
@@ -13,7 +13,7 @@ metricsCatalog.serviceDefinition({
 type: 'ai-assisted',
 tier: 'sv',
-tags: ['golang', 'rails', 'puma'],
+tags: ['golang', 'rails', 'puma', 'kube_container_rss'],
 contractualThresholds: {
 apdexRatio: 0.9,
...
@@ -15,7 +15,7 @@ metricsCatalog.serviceDefinition({
 tier: 'sv',
 tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'],
-tags: ['golang', 'rails', 'puma'],
+tags: ['golang', 'rails', 'puma', 'kube_container_rss'],
 contractualThresholds: {
 apdexRatio: 0.9,
...
@@ -18,7 +18,7 @@ metricsCatalog.serviceDefinition({
 tier: 'sv',
 tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'],
-tags: ['golang', 'rails', 'puma'],
+tags: ['golang', 'rails', 'puma', 'kube_container_rss'],
 contractualThresholds: {
 apdexRatio: 0.9,
...
@@ -15,7 +15,7 @@ metricsCatalog.serviceDefinition({
 tier: 'sv',
 tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'],
-tags: ['golang', 'rails', 'puma'],
+tags: ['golang', 'rails', 'puma', 'kube_container_rss'],
 contractualThresholds: {
 apdexRatio: 0.9,
...
@@ -5,6 +5,9 @@ local toolingLinks = import 'toolinglinks/toolinglinks.libsonnet';
 metricsCatalog.serviceDefinition({
 type: 'plantuml',
 tier: 'sv',
+tags: ['java', 'kube_container_rss'],
 // plantuml doesn't have a `cny` stage
 serviceIsStageless: true,
 monitoringThresholds: {
...
@@ -23,7 +23,7 @@ metricsCatalog.serviceDefinition({
 type: 'sidekiq',
 tier: 'sv',
 tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'],
-tags: ['rails'],
+tags: ['rails', 'kube_container_rss'],
 // overrides monitoringThresholds for specific shards and SLIs
 monitoring: {
...
@@ -16,7 +16,7 @@ metricsCatalog.serviceDefinition({
 tier: 'sv',
 tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'],
-tags: ['golang', 'rails', 'puma'],
+tags: ['golang', 'rails', 'puma', 'kube_container_rss'],
 contractualThresholds: {
 apdexRatio: 0.9,
...
@@ -14,7 +14,7 @@ metricsCatalog.serviceDefinition({
 tier: 'sv',
 tenants: ['gitlab-gprd', 'gitlab-gstg', 'gitlab-pre'],
-tags: ['golang', 'rails', 'puma'],
+tags: ['golang', 'rails', 'puma', 'kube_container_rss'],
 monitoringThresholds: {
 apdexScore: 0.95,
...
@@ -83,31 +83,39 @@ groups:
 expr: |
 gitlab_component_saturation:ratio{component="kube_container_cpu_limit",env="gprd",type="plantuml"} > on(component) group_left
 slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_cpu_limit"}
-- alert: component_saturation_slo_out_of_bounds:kube_container_memory
+- alert: component_saturation_slo_out_of_bounds:kube_container_rss
 for: 15m
 annotations:
-title: The Kube Container Memory Utilization resource of the {{ $labels.type
+title: The Kube Container Memory Utilization (RSS) resource of the {{ $labels.type
 }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and
 is close to its capacity limit.
 description: |
 This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit.
-Details of the Kube Container Memory Utilization resource:
-This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed.
-grafana_dashboard_id: alerts-sat_kube_container_memory
-grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_memory?from=now-6h/m&to=now-1m/m&var-environment={{
+Details of the Kube Container Memory Utilization (RSS) resource:
+Records the total anonymous (unevictable) memory utilization for containers for this service, as a percentage of the memory limit as configured through Kubernetes.
+
+This is computed using the container's resident set size (RSS), as opposed to kube_container_memory which uses the working set size. For our purposes, RSS is the better metric as cAdvisor's working set calculation includes pages from the filesystem cache that can (and will) be evicted before the OOM killer kills the cgroup.
+
+A container's RSS (anonymous memory usage) is still not precisely what the OOM killer will use, but it's a better approximation of what the container's workload is actually using. RSS metrics can, however, be dramatically inflated if a process in the container uses MADV_FREE (lazy-free) memory. RSS will include the memory that is available to be reclaimed without a page fault, but not currently in use.
+
+The most common case of OOM kills is for anonymous memory demand to overwhelm the container's memory limit. On swapless hosts, anonymous memory cannot be evicted from the page cache, so when a container's memory usage is mostly anonymous pages, the only remaining option to relieve memory pressure may be the OOM killer.
+
+As container RSS approaches container memory limit, OOM kills become much more likely. Consequently, this ratio is a good leading indicator of memory saturation and OOM risk.
+grafana_dashboard_id: alerts-sat_kube_container_rss
+grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_rss?from=now-6h/m&to=now-1m/m&var-environment={{
 $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
 }}
 grafana_datasource_id: mimir-gitlab-gprd
 grafana_min_zoom_hours: "6"
-grafana_panel_id: "172578411"
+grafana_panel_id: "2875690100"
 grafana_variables: environment,type,stage
 promql_query: |
 max by(environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
+container_memory_rss:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0)
 ,
@@ -119,7 +127,7 @@ groups:
 max by(environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
+container_memory_rss:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0)
 ,
@@ -133,8 +141,8 @@ groups:
 rules_domain: general
 severity: s4
 expr: |
-gitlab_component_saturation:ratio{component="kube_container_memory",env="gprd",type="plantuml"} > on(component) group_left
-slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_memory"}
+gitlab_component_saturation:ratio{component="kube_container_rss",env="gprd",type="plantuml"} > on(component) group_left
+slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_rss"}
 - alert: component_saturation_slo_out_of_bounds:kube_container_throttling
 for: 10m
 annotations:
...
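The shape of the alert condition itself is unchanged: the recorded saturation ratio is compared against the per-component hard SLO threshold (0.9 for this component, per the recording rules below), joined on the component label. A trimmed sketch with the env/type matchers omitted:

    # Sketch only: the generated alert rules above carry the env and type matchers.
    gitlab_component_saturation:ratio{component="kube_container_rss"}
      > on(component) group_left
    slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_rss"}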
@@ -22,11 +22,11 @@ groups:
 expr: "0.99"
 - record: slo:max:soft:gitlab_component_saturation:ratio
 labels:
-component: kube_container_memory
+component: kube_container_rss
 expr: "0.8"
 - record: slo:max:hard:gitlab_component_saturation:ratio
 labels:
-component: kube_container_memory
+component: kube_container_rss
 expr: "0.9"
 - record: slo:max:soft:gitlab_component_saturation:ratio
 labels:
...
@@ -52,12 +52,12 @@ groups:
 )
 - record: gitlab_component_saturation:ratio
 labels:
-component: kube_container_memory
+component: kube_container_rss
 expr: |
 max by(env, environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_working_set_bytes:labeled{container!="", container!="POD", env="gprd",type="plantuml"}
+container_memory_rss:labeled{container!="", container!="POD", env="gprd",type="plantuml"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", env="gprd",type="plantuml"} > 0)
 ,
...
@@ -83,31 +83,39 @@ groups:
 expr: |
 gitlab_component_saturation:ratio{component="kube_container_cpu_limit",env="gstg",type="plantuml"} > on(component) group_left
 slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_cpu_limit"}
-- alert: component_saturation_slo_out_of_bounds:kube_container_memory
+- alert: component_saturation_slo_out_of_bounds:kube_container_rss
 for: 15m
 annotations:
-title: The Kube Container Memory Utilization resource of the {{ $labels.type
+title: The Kube Container Memory Utilization (RSS) resource of the {{ $labels.type
 }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and
 is close to its capacity limit.
 description: |
 This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit.
-Details of the Kube Container Memory Utilization resource:
-This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed.
-grafana_dashboard_id: alerts-sat_kube_container_memory
-grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_memory?from=now-6h/m&to=now-1m/m&var-environment={{
+Details of the Kube Container Memory Utilization (RSS) resource:
+Records the total anonymous (unevictable) memory utilization for containers for this service, as a percentage of the memory limit as configured through Kubernetes.
+
+This is computed using the container's resident set size (RSS), as opposed to kube_container_memory which uses the working set size. For our purposes, RSS is the better metric as cAdvisor's working set calculation includes pages from the filesystem cache that can (and will) be evicted before the OOM killer kills the cgroup.
+
+A container's RSS (anonymous memory usage) is still not precisely what the OOM killer will use, but it's a better approximation of what the container's workload is actually using. RSS metrics can, however, be dramatically inflated if a process in the container uses MADV_FREE (lazy-free) memory. RSS will include the memory that is available to be reclaimed without a page fault, but not currently in use.
+
+The most common case of OOM kills is for anonymous memory demand to overwhelm the container's memory limit. On swapless hosts, anonymous memory cannot be evicted from the page cache, so when a container's memory usage is mostly anonymous pages, the only remaining option to relieve memory pressure may be the OOM killer.
+
+As container RSS approaches container memory limit, OOM kills become much more likely. Consequently, this ratio is a good leading indicator of memory saturation and OOM risk.
+grafana_dashboard_id: alerts-sat_kube_container_rss
+grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_rss?from=now-6h/m&to=now-1m/m&var-environment={{
 $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
 }}
 grafana_datasource_id: mimir-gitlab-gstg
 grafana_min_zoom_hours: "6"
-grafana_panel_id: "172578411"
+grafana_panel_id: "2875690100"
 grafana_variables: environment,type,stage
 promql_query: |
 max by(environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
+container_memory_rss:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0)
 ,
@@ -119,7 +127,7 @@ groups:
 max by(environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
+container_memory_rss:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0)
 ,
@@ -133,8 +141,8 @@ groups:
 rules_domain: general
 severity: s4
 expr: |
-gitlab_component_saturation:ratio{component="kube_container_memory",env="gstg",type="plantuml"} > on(component) group_left
-slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_memory"}
+gitlab_component_saturation:ratio{component="kube_container_rss",env="gstg",type="plantuml"} > on(component) group_left
+slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_rss"}
 - alert: component_saturation_slo_out_of_bounds:kube_container_throttling
 for: 10m
 annotations:
...
@@ -22,11 +22,11 @@ groups:
 expr: "0.99"
 - record: slo:max:soft:gitlab_component_saturation:ratio
 labels:
-component: kube_container_memory
+component: kube_container_rss
 expr: "0.8"
 - record: slo:max:hard:gitlab_component_saturation:ratio
 labels:
-component: kube_container_memory
+component: kube_container_rss
 expr: "0.9"
 - record: slo:max:soft:gitlab_component_saturation:ratio
 labels:
...
@@ -52,12 +52,12 @@ groups:
 )
 - record: gitlab_component_saturation:ratio
 labels:
-component: kube_container_memory
+component: kube_container_rss
 expr: |
 max by(env, environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_working_set_bytes:labeled{container!="", container!="POD", env="gstg",type="plantuml"}
+container_memory_rss:labeled{container!="", container!="POD", env="gstg",type="plantuml"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", env="gstg",type="plantuml"} > 0)
 ,
...
@@ -233,39 +233,31 @@ groups:
 expr: |
 gitlab_component_saturation:ratio{component="kube_container_cpu_limit",env="ops",type="ops-gitlab-net"} > on(component) group_left
 slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_cpu_limit"}
-- alert: component_saturation_slo_out_of_bounds:kube_container_rss
+- alert: component_saturation_slo_out_of_bounds:kube_container_memory
 for: 15m
 annotations:
-title: The Kube Container Memory Utilization (RSS) resource of the {{ $labels.type
+title: The Kube Container Memory Utilization resource of the {{ $labels.type
 }} service ({{ $labels.stage }} stage) has a saturation exceeding SLO and
 is close to its capacity limit.
 description: |
 This means that this resource is running close to capacity and is at risk of exceeding its current capacity limit.
-Details of the Kube Container Memory Utilization (RSS) resource:
-Records the total anonymous (unevictable) memory utilization for containers for this service, as a percentage of the memory limit as configured through Kubernetes.
-
-This is computed using the container's resident set size (RSS), as opposed to kube_container_memory which uses the working set size. For our purposes, RSS is the better metric as cAdvisor's working set calculation includes pages from the filesystem cache that can (and will) be evicted before the OOM killer kills the cgroup.
-
-A container's RSS (anonymous memory usage) is still not precisely what the OOM killer will use, but it's a better approximation of what the container's workload is actually using. RSS metrics can, however, be dramatically inflated if a process in the container uses MADV_FREE (lazy-free) memory. RSS will include the memory that is available to be reclaimed without a page fault, but not currently in use.
-
-The most common case of OOM kills is for anonymous memory demand to overwhelm the container's memory limit. On swapless hosts, anonymous memory cannot be evicted from the page cache, so when a container's memory usage is mostly anonymous pages, the only remaining option to relieve memory pressure may be the OOM killer.
-
-As container RSS approaches container memory limit, OOM kills become much more likely. Consequently, this ratio is a good leading indicator of memory saturation and OOM risk.
-grafana_dashboard_id: alerts-sat_kube_container_rss
-grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_rss?from=now-6h/m&to=now-1m/m&var-environment={{
+Details of the Kube Container Memory Utilization resource:
+This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed.
+grafana_dashboard_id: alerts-sat_kube_container_memory
+grafana_dashboard_link: https://dashboards.gitlab.net/d/alerts-sat_kube_container_memory?from=now-6h/m&to=now-1m/m&var-environment={{
 $labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
 }}
 grafana_datasource_id: mimir-gitlab-ops
 grafana_min_zoom_hours: "6"
-grafana_panel_id: "2875690100"
+grafana_panel_id: "172578411"
 grafana_variables: environment,type,stage
 promql_query: |
 max by(environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_rss:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
+container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0)
 ,
@@ -277,7 +269,7 @@ groups:
 max by(environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_rss:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
+container_memory_working_set_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", environment="{{ $labels.environment }}",stage="{{ $labels.stage }}",type="{{ $labels.type }}"} > 0)
 ,
@@ -291,8 +283,8 @@ groups:
 rules_domain: general
 severity: s4
 expr: |
-gitlab_component_saturation:ratio{component="kube_container_rss",env="ops",type="ops-gitlab-net"} > on(component) group_left
-slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_rss"}
+gitlab_component_saturation:ratio{component="kube_container_memory",env="ops",type="ops-gitlab-net"} > on(component) group_left
+slo:max:hard:gitlab_component_saturation:ratio{component="kube_container_memory"}
 - alert: component_saturation_slo_out_of_bounds:kube_container_throttling
 for: 10m
 annotations:
...
@@ -46,11 +46,11 @@ groups:
 expr: "0.99"
 - record: slo:max:soft:gitlab_component_saturation:ratio
 labels:
-component: kube_container_rss
+component: kube_container_memory
 expr: "0.8"
 - record: slo:max:hard:gitlab_component_saturation:ratio
 labels:
-component: kube_container_rss
+component: kube_container_memory
 expr: "0.9"
 - record: slo:max:soft:gitlab_component_saturation:ratio
 labels:
...
@@ -100,12 +100,12 @@ groups:
 )
 - record: gitlab_component_saturation:ratio
 labels:
-component: kube_container_rss
+component: kube_container_memory
 expr: |
 max by(env, environment, tier, type, stage, shard) (
 clamp_min(
 clamp_max(
-container_memory_rss:labeled{container!="", container!="POD", env="ops",type="ops-gitlab-net"}
+container_memory_working_set_bytes:labeled{container!="", container!="POD", env="ops",type="ops-gitlab-net"}
 /
 (container_spec_memory_limit_bytes:labeled{container!="", container!="POD", env="ops",type="ops-gitlab-net"} > 0)
 ,
...
@@ -73,8 +73,8 @@ Note that these metrics may have other requirements, please see the metric defin
 | `go_goroutines` | `gitaly`, `praefect` | Go goroutines utilization per node. Goroutine leaks can cause memory saturation which can cause service degradation. A limit of 250k goroutines is very generous, so if a service exceeds this limit, it's a sign of a leak and it should be dealt with. | ✅ | 98% |
 | `go_memory` | `gitaly`, `praefect` | Go's memory allocation strategy can make it look like a Go process is saturating memory when measured using RSS, when in fact the process is not at risk of memory saturation. For this reason, we measure Go processes using the `go_memstat_alloc_bytes` | ✅ | 98% |
 | `kube_container_cpu_limit` | `consul`, `gitlab-shell`, `registry`, `sidekiq`, `webservice` | Kubernetes containers can have a limit configured on how much CPU they can consume in a burst. If we are at this limit, exceeding the allocated requested resources, we should consider revisiting the container's HPA configuration. When a container is utilizing CPU resources up to its configured limit for extended periods of time, this could cause it and other running containers to be throttled. | ✅ | 99% |
-| `kube_container_memory` | `consul`, `gitlab-shell`, `registry`, `sidekiq` | This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed. | ✅ | 90% |
-| `kube_container_rss` | `webservice` | Records the total anonymous (unevictable) memory utilization for containers for this service, as a percentage of the memory limit as configured through Kubernetes. This is computed using the container's resident set size (RSS), as opposed to kube_container_memory which uses the working set size. For our purposes, RSS is the better metric as cAdvisor's working set calculation includes pages from the filesystem cache that can (and will) be evicted before the OOM killer kills the cgroup. A container's RSS (anonymous memory usage) is still not precisely what the OOM killer will use, but it's a better approximation of what the container's workload is actually using. RSS metrics can, however, be dramatically inflated if a process in the container uses MADV_FREE (lazy-free) memory. RSS will include the memory that is available to be reclaimed without a page fault, but not currently in use. The most common case of OOM kills is for anonymous memory demand to overwhelm the container's memory limit. On swapless hosts, anonymous memory cannot be evicted from the page cache, so when a container's memory usage is mostly anonymous pages, the only remaining option to relieve memory pressure may be the OOM killer. As container RSS approaches container memory limit, OOM kills become much more likely. Consequently, this ratio is a good leading indicator of memory saturation and OOM risk. | ✅ | 90% |
+| `kube_container_memory` | `consul`, `gitlab-shell`, `registry` | This uses the working set size from cAdvisor for the cgroup's memory usage. That may not be a good measure as it includes filesystem cache pages that are not necessarily attributable to the application inside the cgroup, and are permitted to be evicted instead of being OOM killed. | ✅ | 90% |
+| `kube_container_rss` | `sidekiq`, `webservice` | Records the total anonymous (unevictable) memory utilization for containers for this service, as a percentage of the memory limit as configured through Kubernetes. This is computed using the container's resident set size (RSS), as opposed to kube_container_memory which uses the working set size. For our purposes, RSS is the better metric as cAdvisor's working set calculation includes pages from the filesystem cache that can (and will) be evicted before the OOM killer kills the cgroup. A container's RSS (anonymous memory usage) is still not precisely what the OOM killer will use, but it's a better approximation of what the container's workload is actually using. RSS metrics can, however, be dramatically inflated if a process in the container uses MADV_FREE (lazy-free) memory. RSS will include the memory that is available to be reclaimed without a page fault, but not currently in use. The most common case of OOM kills is for anonymous memory demand to overwhelm the container's memory limit. On swapless hosts, anonymous memory cannot be evicted from the page cache, so when a container's memory usage is mostly anonymous pages, the only remaining option to relieve memory pressure may be the OOM killer. As container RSS approaches container memory limit, OOM kills become much more likely. Consequently, this ratio is a good leading indicator of memory saturation and OOM risk. | ✅ | 90% |
 | `kube_persistent_volume_claim_disk_space` | `kube` | disk space utilization on persistent volume claims. | ✅ | 90% |
 | `kube_persistent_volume_claim_inodes` | `kube` | inode utilization on persistent volume claims. | ✅ | 90% |
 | `memory` | `consul`, `gitaly`, `praefect` | Memory utilization per device per node. | ✅ | 98% |
...
@@ -1070,8 +1070,8 @@
 "url": "https://dashboards.gitlab.net/d/alerts-sat_kube_container_cpu_limit/?var-environment=gprd\u0026var-type=sidekiq\u0026var-stage=main\u0026var-component=kube_container_cpu_limit"
 },
 {
-"title": "sidekiq Service | kube_container_memory resource Dashboard",
-"url": "https://dashboards.gitlab.net/d/alerts-sat_kube_container_memory/?var-environment=gprd\u0026var-type=sidekiq\u0026var-stage=main\u0026var-component=kube_container_memory"
+"title": "sidekiq Service | kube_container_rss resource Dashboard",
+"url": "https://dashboards.gitlab.net/d/alerts-sat_kube_container_rss/?var-environment=gprd\u0026var-type=sidekiq\u0026var-stage=main\u0026var-component=kube_container_rss"
 }
 ],
 "nullPointMode": "null",
@@ -4344,7 +4344,7 @@
 "dashes": false,
 "datasource": "$PROMETHEUS_DS",
 "decimals": 2,
-"description": "This uses the working set size from cAdvisor for the cgroup's memory usage. That may\nnot be a good measure as it includes filesystem cache pages that are not necessarily\nattributable to the application inside the cgroup, and are permitted to be evicted\ninstead of being OOM killed.\n Lower is better.",
+"description": "Records the total anonymous (unevictable) memory utilization for containers for this\nservice, as a percentage of the memory limit as configured through Kubernetes.\n\nThis is computed using the container's resident set size (RSS), as opposed to\nkube_container_memory which uses the working set size. For our purposes, RSS is the\nbetter metric as cAdvisor's working set calculation includes pages from the\nfilesystem cache that can (and will) be evicted before the OOM killer kills the\ncgroup.\n\nA container's RSS (anonymous memory usage) is still not precisely what the OOM\nkiller will use, but it's a better approximation of what the container's workload is\nactually using. RSS metrics can, however, be dramatically inflated if a process in\nthe container uses MADV_FREE (lazy-free) memory. RSS will include the memory that is\navailable to be reclaimed without a page fault, but not currently in use.\n\nThe most common case of OOM kills is for anonymous memory demand to overwhelm the\ncontainer's memory limit. On swapless hosts, anonymous memory cannot be evicted from\nthe page cache, so when a container's memory usage is mostly anonymous pages, the\nonly remaining option to relieve memory pressure may be the OOM killer.\n\nAs container RSS approaches container memory limit, OOM kills become much more\nlikely. Consequently, this ratio is a good leading indicator of memory saturation\nand OOM risk.\n Lower is better.",
 "fill": 0,
 "fillGradient": 0,
 "gridPos": {
@@ -4353,7 +4353,7 @@
 "x": 12,
 "y": 0
 },
-"id": 172578411,
+"id": 2875690100,
 "legend": {
 "alignAsTable": true,
 "avg": true,
@@ -4435,12 +4435,12 @@
 }
 ],
 "spaceLength": 10,
-"stableId": "saturation-kube_container_memory",
+"stableId": "saturation-kube_container_rss",
 "stack": false,
 "steppedLine": false,
 "targets": [
 {
-"expr": "clamp_min(\n clamp_max(\n max by(type) (\n clamp_min(\n clamp_max(\n container_memory_working_set_bytes:labeled{container!=\"\", container!=\"POD\", type=\"sidekiq\"}\n /\n (container_spec_memory_limit_bytes:labeled{container!=\"\", container!=\"POD\", type=\"sidekiq\"} \u003e 0)\n ,\n 1)\n ,\n 0)\n)\n\n ,1)\n,0)\n",
+"expr": "clamp_min(\n clamp_max(\n max by(type) (\n clamp_min(\n clamp_max(\n container_memory_rss:labeled{container!=\"\", container!=\"POD\", type=\"sidekiq\"}\n /\n (container_spec_memory_limit_bytes:labeled{container!=\"\", container!=\"POD\", type=\"sidekiq\"} \u003e 0)\n ,\n 1)\n ,\n 0)\n)\n\n ,1)\n,0)\n",
 "format": "time_series",
 "interval": "1m",
 "intervalFactor": 1,
@@ -4448,7 +4448,7 @@
 "refId": "A"
 },
 {
-"expr": "clamp_min(\n clamp_max(\n max(\n max_over_time(gitlab_component_saturation:ratio{type=\"sidekiq\", component=\"kube_container_memory\"}[$__interval])\n ) by (component)\n ,1)\n,0)\n",
+"expr": "clamp_min(\n clamp_max(\n max(\n max_over_time(gitlab_component_saturation:ratio{type=\"sidekiq\", component=\"kube_container_rss\"}[$__interval])\n ) by (component)\n ,1)\n,0)\n",
 "format": "time_series",
 "interval": "1m",
 "intervalFactor": 1,
@@ -4456,7 +4456,7 @@
 "refId": "B"
 },
 {
-"expr": "max(\n gitlab_component_saturation:ratio_quantile95_1w{type=\"sidekiq\", component=\"kube_container_memory\"}\n)\n",
+"expr": "max(\n gitlab_component_saturation:ratio_quantile95_1w{type=\"sidekiq\", component=\"kube_container_rss\"}\n)\n",
 "format": "time_series",
 "interval": "1m",
 "intervalFactor": 1,
@@ -4464,7 +4464,7 @@
 "refId": "C"
 },
 {
-"expr": "max(\n gitlab_component_saturation:ratio_quantile99_1w{type=\"sidekiq\", component=\"kube_container_memory\"}\n)\n",
+"expr": "max(\n gitlab_component_saturation:ratio_quantile99_1w{type=\"sidekiq\", component=\"kube_container_rss\"}\n)\n",
 "format": "time_series",
 "interval": "1m",
 "intervalFactor": 1,
@@ -4472,7 +4472,7 @@
 "refId": "D"
 },
 {
-"expr": "avg(slo:max:soft:gitlab_component_saturation:ratio{component=\"kube_container_memory\"}) by (component)\n",
+"expr": "avg(slo:max:soft:gitlab_component_saturation:ratio{component=\"kube_container_rss\"}) by (component)\n",
 "format": "time_series",
 "interval": "1m",
 "intervalFactor": 1,
@@ -4480,7 +4480,7 @@
 "refId": "E"
 },
 {
-"expr": "avg(slo:max:hard:gitlab_component_saturation:ratio{component=\"kube_container_memory\"}) by (component)\n",
+"expr": "avg(slo:max:hard:gitlab_component_saturation:ratio{component=\"kube_container_rss\"}) by (component)\n",
 "format": "time_series",
 "interval": "1m",
 "intervalFactor": 1,
@@ -4491,7 +4491,7 @@
 "thresholds": [],
 "timeFrom": null,
 "timeShift": null,
-"title": "kube_container_memory component saturation: Kube Container Memory Utilization",
+"title": "kube_container_rss component saturation: Kube Container Memory Utilization (RSS)",
 "tooltip": {
 "shared": true,
 "sort": 2,
...