From 016de10c15c80530332ba92823b8e551e96904ea Mon Sep 17 00:00:00 2001 From: Tomasz Maczukin <tomasz@maczukin.pl> Date: Thu, 23 Nov 2023 15:07:46 +0100 Subject: [PATCH] Add 'Idle Efficiency' graph to ci-runners autoscaling dashboard --- .../incident-autoscaling.dashboard.jsonnet | 1 + .../autoscaling_graphs.libsonnet | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet b/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet index 3abfc3f3f3..0b976bb65b 100644 --- a/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet +++ b/dashboards/ci-runners/incident-autoscaling.dashboard.jsonnet @@ -44,6 +44,7 @@ dashboardIncident.incidentDashboard( .addGrid( panels=[ autoscalingGraphs.vmStates(), + autoscalingGraphs.idleEfficiency(), autoscalingGraphs.vmOperationsRate(), autoscalingGraphs.vmCreationTiming(), ], diff --git a/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet b/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet index 622cec6b3c..faea6d1910 100644 --- a/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet +++ b/libsonnet/stage-groups/verify-runner/autoscaling_graphs.libsonnet @@ -2,6 +2,7 @@ local panels = import './panels.libsonnet'; local basic = import 'grafana/basic.libsonnet'; local promQuery = import 'grafana/prom_query.libsonnet'; local seriesOverrides = import 'grafana/series_overrides.libsonnet'; +local thresholds = import 'gitlab-dashboards/thresholds.libsonnet'; local runnersManagerMatching = import './runner_managers_matching.libsonnet'; @@ -45,6 +46,35 @@ local vmCreationTiming(partition=runnersManagerMatching.defaultPartition) = intervalFactor=2, ); +local idleEfficiency(partition=runnersManagerMatching.defaultPartition) = + basic.timeseries( + 'Idle efficiency', + legendFormat='{{shard}}', + format='percentunit', + query=runnersManagerMatching.formatQuery(||| + 1 - ( + sum by(shard) ( + 
gitlab_runner_autoscaling_machine_states{environment=~"$environment", stage=~"$stage", executor="docker+machine", %(runnerManagersMatcher)s, state=~"idle|acquired"} + ) + / + sum by(shard) ( + gitlab_runner_autoscaling_machine_states{environment=~"$environment", stage=~"$stage", executor="docker+machine", %(runnerManagersMatcher)s} + ) + ) + |||, partition), + description=||| + Shows what percentage of instances is not in the idle or acquired state. There is no golden rule here and the metric + should be analyzed together with raw numbers showing the different instance states, but in a very generalized view: + the higher the number the better; more than 50% is what we aim for if there is a constant number of jobs in the + incoming queue for a shard. For shards that have times with no jobs in the queue, having the efficiency drop + below 50% is normal, but in that case we aim to have a small raw number of idle instances. + |||, + thresholds=[ + thresholds.warningLevel('lt', 0.5), + thresholds.optimalLevel('gt', 0.5), + ], + ); + local gcpRegionQuotas = basic.timeseries( 'GCP region quotas', @@ -105,6 +135,7 @@ local gcpInstances = vmStates:: vmStates, vmOperationsRate:: vmOperationsRate, vmCreationTiming:: vmCreationTiming, + idleEfficiency:: idleEfficiency, gcpRegionQuotas:: gcpRegionQuotas, gcpInstances:: gcpInstances, } -- GitLab