Skip to content
Snippets Groups Projects
Verified Commit 016de10c authored by Tomasz Maczukin's avatar Tomasz Maczukin 💬
Browse files

Add 'Idle Efficiency' graph to ci-runners autoscaling dashboard

parent 0631b266
No related branches found
No related tags found
Loading
......@@ -44,6 +44,7 @@ dashboardIncident.incidentDashboard(
.addGrid(
panels=[
autoscalingGraphs.vmStates(),
autoscalingGraphs.idleEfficiency(),
autoscalingGraphs.vmOperationsRate(),
autoscalingGraphs.vmCreationTiming(),
],
......
......@@ -2,6 +2,7 @@ local panels = import './panels.libsonnet';
local basic = import 'grafana/basic.libsonnet';
local promQuery = import 'grafana/prom_query.libsonnet';
local seriesOverrides = import 'grafana/series_overrides.libsonnet';
local thresholds = import 'gitlab-dashboards/thresholds.libsonnet';
local runnersManagerMatching = import './runner_managers_matching.libsonnet';
......@@ -45,6 +46,35 @@ local vmCreationTiming(partition=runnersManagerMatching.defaultPartition) =
intervalFactor=2,
);
local idleEfficiency(partition=runnersManagerMatching.defaultPartition) =
basic.timeseries(
'Idle efficiency',
legendFormat='{{shard}}',
format='percentunit',
query=runnersManagerMatching.formatQuery(|||
1 - (
sum by(shard) (
gitlab_runner_autoscaling_machine_states{environment=~"$environment", stage=~"$stage", executor="docker+machine", %(runnerManagersMatcher)s, state=~"idle|acquired"}
)
/
sum by(shard) (
gitlab_runner_autoscaling_machine_states{environment=~"$environment", stage=~"$stage", executor="docker+machine", %(runnerManagersMatcher)s}
)
)
|||, partition),
description=|||
Shows what percentages of instances are in the idle or acquired state. There is no golden rule here and the metric
should be analyzed together with raw numbers showing the different instance states, but in a very generlized view:
the higher number the better, more than 50% is what we aim to if there is a constant number of jobs in the
incoming queue for a shard. For shards that have times with no jobs in the queue, having the efficiency dropped
below 50% is something normal, but in that case we aim to have a small raw number of idle instances.
|||,
thresholds=[
thresholds.warningLevel('lt', 0.5),
thresholds.optimalLevel('gt', 0.5),
],
);
local gcpRegionQuotas =
basic.timeseries(
'GCP region quotas',
......@@ -105,6 +135,7 @@ local gcpInstances =
vmStates:: vmStates,
vmOperationsRate:: vmOperationsRate,
vmCreationTiming:: vmCreationTiming,
idleEfficiency:: idleEfficiency,
gcpRegionQuotas:: gcpRegionQuotas,
gcpInstances:: gcpInstances,
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment