Commit af0c8f0d authored by Andrew Newdigate's avatar Andrew Newdigate

Merge branch 'update-triage-with-anomalies-and-stage' into 'master'

Updates triage with anomalies and stage template

See merge request !1095
parents 8abdc49c 1e2a0d68
local grafana = import 'grafonnet/grafana.libsonnet';
local promQuery = import 'prom_query.libsonnet';
local templates = import 'templates.libsonnet';
local commonAnnotations = import 'common_annotations.libsonnet';
local seriesOverrides = import 'series_overrides.libsonnet';
local colors = import 'colors.libsonnet';
local dashboard = grafana.dashboard;
......@@ -8,6 +9,8 @@ local row = grafana.row;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local rowHeight = 10;
local generalGraphPanel(
title,
description=null
......@@ -25,11 +28,42 @@ local generalGraphPanel(
legend_total=false,
legend_avg=true,
legend_alignAsTable=true,
legend_rightSide=true,
legend_hideEmpty=true,
)
.addSeriesOverride(seriesOverrides.sloViolation);
local generateAnomalyPanel(title, query) =
graphPanel.new(
title,
linewidth=1,
fill=0,
decimals=2,
legend_show=true,
legend_values=true,
legend_min=true,
legend_max=true,
legend_current=true,
legend_total=false,
legend_avg=true,
legend_alignAsTable=true,
legend_hideEmpty=true,
)
.addTarget(
promQuery.target(
query,
legendFormat='{{ type }} service',
)
)
.resetYaxes()
.addYaxis(
format='none',
label="Sigma σ",
)
.addYaxis(
format='none',
label="Sigma σ",
);
local activeAlertsPanel = grafana.tablePanel.new(
'Active Alerts',
styles=[{
......@@ -92,14 +126,16 @@ dashboard.new(
timezone='UTC',
graphTooltip='shared_crosshair',
)
.addAnnotation(commonAnnotations.deploymentsForEnvironment)
.addAnnotation(commonAnnotations.deploymentsForEnvironmentCanary)
.addTemplate(templates.ds)
.addTemplate(templates.environment)
.addTemplate(templates.sigma)
.addTemplate(templates.stage)
.addPanel(activeAlertsPanel, gridPos={
x: 0,
y: 0,
y: 0 * rowHeight,
w: 24,
h: 10,
h: rowHeight,
})
.addPanel(
generalGraphPanel(
......@@ -110,7 +146,7 @@ dashboard.new(
promQuery.target('
avg(
avg_over_time(
gitlab_service_apdex:ratio{environment="$environment"}[$__interval]
gitlab_service_apdex:ratio{environment="$environment", stage="$stage"}[$__interval]
)
) by (type)
',
......@@ -121,7 +157,7 @@ dashboard.new(
promQuery.target('
avg(
avg_over_time(
gitlab_service_apdex:ratio{environment="$environment"}[$__interval]
gitlab_service_apdex:ratio{environment="$environment", stage="$stage"}[$__interval]
)
) by (type)
<= on(type) group_left
......@@ -144,9 +180,29 @@ dashboard.new(
)
, gridPos={
x: 0,
y: 0,
w: 24,
h: 10,
y: 1 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
generateAnomalyPanel(
'Anomaly detection: Latency: Apdex Score',
'
avg(
(
gitlab_service_apdex:ratio{environment="$environment", stage="$stage"}
- gitlab_service_apdex:ratio:avg_over_time_1w{environment="$environment", stage="$stage"}
)
/
gitlab_service_apdex:ratio:stddev_over_time_1w{environment="$environment", stage="$stage"}
) by (type, stage)
')
, gridPos={
x: 12,
y: 1 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
......@@ -158,7 +214,7 @@ dashboard.new(
promQuery.target('
avg(
max_over_time(
gitlab_service_errors:ratio{environment="$environment"}[$__interval]
gitlab_service_errors:ratio{environment="$environment", stage="$stage"}[$__interval]
)
) by (type)
',
......@@ -169,7 +225,7 @@ dashboard.new(
promQuery.target('
avg(
avg_over_time(
gitlab_service_errors:ratio{environment="$environment"}[$__interval]
gitlab_service_errors:ratio{environment="$environment", stage="$stage"}[$__interval]
)
) by (type)
>= on(type) group_left
......@@ -192,32 +248,53 @@ dashboard.new(
)
, gridPos={
x: 0,
y: 0,
w: 24,
h: 10,
y: 2 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
generateAnomalyPanel(
'Anomaly detection: Error Ratio',
'
avg(
(
gitlab_service_errors:ratio{environment="$environment", stage="$stage"}
- gitlab_service_errors:ratio:avg_over_time_1w{environment="$environment", stage="$stage"}
)
/
gitlab_service_errors:ratio:stddev_over_time_1w{environment="$environment", stage="$stage"}
) by (type, stage)
')
, gridPos={
x: 12,
y: 2 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
generalGraphPanel(
"Service Availability",
description="Availability measures the ratio of component processes in the service that are currently healthy and able to handle requests. The closer to 100% the better."
"Service Operation Rates - per Second",
description="The operation rate is the sum total of all requests being handle for all components within this service. Note that a single user request can lead to requests to multiple components. Higher is busier."
)
.addTarget( // Primary metric
promQuery.target('
min(
min_over_time(
gitlab_service_availability:ratio{environment="$environment"}[$__interval]
sum(
avg_over_time(
gitlab_service_ops:rate{environment="$environment", stage="$stage"}[$__interval]
)
) by (tier, type)
) by (type)
',
legendFormat='{{ type }} service',
)
)
.resetYaxes()
.addYaxis(
format='percentunit',
max=1,
label="Availability %",
format='short',
min=1,
label="Operations per Second",
logBase=10,
)
.addYaxis(
format='short',
......@@ -227,21 +304,42 @@ dashboard.new(
)
, gridPos={
x: 0,
y: 0,
w: 24,
h: 10,
y: 3 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
generateAnomalyPanel(
'Anomaly detection: Requests per second',
'
avg(
(
gitlab_service_ops:rate{environment="$environment", stage="$stage"}
-
gitlab_service_ops:rate:prediction{environment="$environment", stage="$stage"}
)
/
gitlab_service_ops:rate:stddev_over_time_1w{environment="$environment", stage="$stage"}
) by (type)
')
, gridPos={
x: 12,
y: 3 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
generalGraphPanel(
"Service Operation Rates - per Second",
description="The operation rate is the sum total of all requests being handle for all components within this service. Note that a single user request can lead to requests to multiple components. Higher is busier."
"Service Availability",
description="Availability measures the ratio of component processes in the service that are currently healthy and able to handle requests. The closer to 100% the better."
)
.addTarget( // Primary metric
promQuery.target('
sum(
avg_over_time(
gitlab_service_ops:rate{environment="$environment"}[$__interval]
min(
min_over_time(
gitlab_service_availability:ratio{environment="$environment", stage="$stage"}[$__interval]
)
) by (type)
',
......@@ -250,10 +348,9 @@ dashboard.new(
)
.resetYaxes()
.addYaxis(
format='short',
min=1,
label="Operations per Second",
logBase=10,
format='percentunit',
max=1,
label="Availability %",
)
.addYaxis(
format='short',
......@@ -263,8 +360,29 @@ dashboard.new(
)
, gridPos={
x: 0,
y: 0,
w: 24,
h: 10,
y: 4 * rowHeight,
w: 12,
h: rowHeight,
}
)
.addPanel(
generateAnomalyPanel(
'Anomaly detection: Availability',
'
avg(
(
gitlab_service_availability:ratio{environment="$environment", stage="$stage"}
-
gitlab_service_availability:ratio:avg_over_time_1w{environment="$environment", stage="$stage"}
)
/
gitlab_service_availability:ratio:stddev_over_time_1w{environment="$environment", stage="$stage"}
) by (tier, type)
')
, gridPos={
x: 12,
y: 4 * rowHeight,
w: 12,
h: rowHeight,
}
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment