From 016de10c15c80530332ba92823b8e551e96904ea Mon Sep 17 00:00:00 2001
From: Tomasz Maczukin <tomasz@maczukin.pl>
Date: Thu, 23 Nov 2023 15:07:46 +0100
Subject: [PATCH] Add 'Idle Efficiency' graph to ci-runners autoscaling
 dashboard

---
 .../incident-autoscaling.dashboard.jsonnet    |  1 +
 .../autoscaling_graphs.libsonnet              | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet b/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet
index 3abfc3f3f3..0b976bb65b 100644
--- a/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet
+++ b/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet
@@ -44,6 +44,7 @@ dashboardIncident.incidentDashboard(
 .addGrid(
   panels=[
     autoscalingGraphs.vmStates(),
+    autoscalingGraphs.idleEfficiency(),
     autoscalingGraphs.vmOperationsRate(),
     autoscalingGraphs.vmCreationTiming(),
   ],
diff --git a/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet b/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet
index 622cec6b3c..faea6d1910 100644
--- a/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet
+++ b/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet
@@ -2,6 +2,7 @@ local panels = import './panels.libsonnet';
 local basic = import 'grafana/basic.libsonnet';
 local promQuery = import 'grafana/prom_query.libsonnet';
 local seriesOverrides = import 'grafana/series_overrides.libsonnet';
+local thresholds = import 'gitlab-dashboards/thresholds.libsonnet';
 
 local runnersManagerMatching = import './runner_managers_matching.libsonnet';
 
@@ -45,6 +46,35 @@ local vmCreationTiming(partition=runnersManagerMatching.defaultPartition) =
     intervalFactor=2,
   );
 
+// Per-shard 'Idle efficiency' panel: 1 - (sum of machine states in idle|acquired / sum of all machine states).
+// `partition` is forwarded to runnersManagerMatching.formatQuery (presumably fills %(runnerManagersMatcher)s).
+// Values below 0.5 get the warning threshold level, values above 0.5 the optimal one.
+local idleEfficiency(partition=runnersManagerMatching.defaultPartition) =
+  basic.timeseries(
+    'Idle efficiency',
+    legendFormat='{{shard}}',
+    format='percentunit',
+    query=runnersManagerMatching.formatQuery(|||
+      1 - (
+        sum by(shard) (
+          gitlab_runner_autoscaling_machine_states{environment=~"$environment", stage=~"$stage", executor="docker+machine", %(runnerManagersMatcher)s, state=~"idle|acquired"}
+        )
+        /
+        sum by(shard) (
+          gitlab_runner_autoscaling_machine_states{environment=~"$environment", stage=~"$stage", executor="docker+machine", %(runnerManagersMatcher)s}
+        )
+      )
+    |||, partition),
+    description=|||
+      Shows what percentage of instances is not in the idle or acquired state. There is no golden rule here and the
+      metric should be analyzed together with raw numbers showing the different instance states, but in a very
+      generalized view: the higher the number the better; more than 50% is what we aim for if there is a constant
+      number of jobs in the incoming queue for a shard. For shards that have times with no jobs in the queue, having
+      the efficiency drop below 50% is normal, but in that case we aim to have a small raw number of idle instances.
+    |||,
+    thresholds=[thresholds.warningLevel('lt', 0.5), thresholds.optimalLevel('gt', 0.5)],
+  );
+
 local gcpRegionQuotas =
   basic.timeseries(
     'GCP region quotas',
@@ -105,6 +135,7 @@ local gcpInstances =
   vmStates:: vmStates,
   vmOperationsRate:: vmOperationsRate,
   vmCreationTiming:: vmCreationTiming,
+  idleEfficiency:: idleEfficiency,
   gcpRegionQuotas:: gcpRegionQuotas,
   gcpInstances:: gcpInstances,
 }
-- 
GitLab