Skip to content
Snippets Groups Projects
Verified Commit 96bc50f5 authored by Bob Van Landuyt's avatar Bob Van Landuyt
Browse files

chore: remove apdex anomaly detection rules

These rules were an experiment in using anomaly detection as an indicator
of deployment health status, see
gitlab-com/gl-infra&1333 (comment 2198480614),
but this was not the signal we were looking for, so we can remove them.
parent 49e00e75
No related branches found
No related tags found
Loading
Showing
with 0 additions and 735 deletions
...@@ -22,23 +22,7 @@ ...@@ -22,23 +22,7 @@
] ]
else else
[] []
) +
(
if serviceDefinition.disableApdexSuccessRatePrediction then
[
{
record: 'gitlab_service:mapping:disable_apdex_success_rate_prediction',
labels: {
type: serviceDefinition.type,
tier: serviceDefinition.tier,
},
expr: '1',
},
]
else
[]
), ),
}, },
} }
...@@ -22,7 +22,6 @@ local serviceDefaults = { ...@@ -22,7 +22,6 @@ local serviceDefaults = {
serviceIsStageless: false, // Set to true for services that don't use stage labels serviceIsStageless: false, // Set to true for services that don't use stage labels
autogenerateRecordingRules: true, autogenerateRecordingRules: true,
disableOpsRatePrediction: false, disableOpsRatePrediction: false,
disableApdexSuccessRatePrediction: false,
nodeLevelMonitoring: false, // By default we do not use node-level monitoring nodeLevelMonitoring: false, // By default we do not use node-level monitoring
monitoring: { monitoring: {
shard: { shard: {
......
...@@ -20,7 +20,6 @@ metricsCatalog.serviceDefinition({ ...@@ -20,7 +20,6 @@ metricsCatalog.serviceDefinition({
* disable ops-rate anomaly detection on this service. * disable ops-rate anomaly detection on this service.
*/ */
disableOpsRatePrediction: true, disableOpsRatePrediction: true,
disableApdexSuccessRatePrediction: true,
serviceDependencies: { serviceDependencies: {
}, },
provisioning: { provisioning: {
......
...@@ -36,8 +36,6 @@ metricsCatalog.serviceDefinition({ ...@@ -36,8 +36,6 @@ metricsCatalog.serviceDefinition({
*/ */
disableOpsRatePrediction: true, disableOpsRatePrediction: true,
disableApdexSuccessRatePrediction: true,
// Thanos needs to self-monitor in Thanos // Thanos needs to self-monitor in Thanos
// this should not be required for other services. // this should not be required for other services.
dangerouslyThanosEvaluated: false, dangerouslyThanosEvaluated: false,
......
...@@ -24,7 +24,6 @@ metricsCatalog.serviceDefinition({ ...@@ -24,7 +24,6 @@ metricsCatalog.serviceDefinition({
* disable ops-rate anomaly detection on this service. * disable ops-rate anomaly detection on this service.
*/ */
disableOpsRatePrediction: true, disableOpsRatePrediction: true,
disableApdexSuccessRatePrediction: true,
provisioning: { provisioning: {
kubernetes: true, kubernetes: true,
vms: true, vms: true,
......
...@@ -11,7 +11,6 @@ metricsCatalog.serviceDefinition({ ...@@ -11,7 +11,6 @@ metricsCatalog.serviceDefinition({
* disable anomaly detection for RPS * disable anomaly detection for RPS
*/ */
disableOpsRatePrediction: true, disableOpsRatePrediction: true,
disableApdexSuccessRatePrediction: true,
provisioning: { provisioning: {
/* Provisioned with Elastic Cloud, no VMs, no Kube */ /* Provisioned with Elastic Cloud, no VMs, no Kube */
vms: false, vms: false,
......
...@@ -44,31 +44,6 @@ local fileForService(service, extraSelector, _extraArgs, tenant) = ...@@ -44,31 +44,6 @@ local fileForService(service, extraSelector, _extraArgs, tenant) =
) )
), ),
}, },
{
name: '%s - service_apdex_anomaly_detection' % service.type,
rules: alerts.processAlertRules(
serviceAnomalyDetectionAlerts(
selector,
'service_apdex',
'gitlab_service_apdex:success',
'disable_apdex_success_rate_prediction',
'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal',
|||
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal.
This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
|||,
'https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md',
'gitlab_component_apdex:success',
'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal',
|||
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal.
This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
|||,
'service-$type-apdex',
tenant
)
),
},
], ],
), ),
}; };
......
...@@ -18,15 +18,6 @@ local fileForService(service, selector, _extraArgs, _) = { ...@@ -18,15 +18,6 @@ local fileForService(service, selector, _extraArgs, _) = {
'gitlab_service_ops', 'gitlab_service_ops',
'disable_ops_rate_prediction', 'disable_ops_rate_prediction',
selector { type: service.type }, selector { type: service.type },
) +
serviceAnomalyDetection.recordingRuleGroupsFor(
service.type,
serviceAggregation,
serviceAggregation.getApdexSuccessRateMetricForBurnRate,
'apdex success rate',
'gitlab_service_apdex:success',
'disable_apdex_success_rate_prediction',
selector { type: service.type },
) )
), ),
}; };
......
...@@ -78,80 +78,3 @@ groups: ...@@ -78,80 +78,3 @@ groups:
) )
unless on(tier, type) unless on(tier, type)
gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"}
- name: customersdot - service_apdex_anomaly_detection
rules:
- alert: service_apdex_out_of_bounds_upper_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has higher apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal. This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-fulfillment-platform
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_trigger: service_apdex_anomaly
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{monitor="global",type="customersdot"} - gitlab_service_apdex:success:rate:prediction{monitor="global",type="customersdot"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{monitor="global",type="customersdot"}
)
>
3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
- alert: service_apdex_out_of_bounds_lower_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has lower apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal. This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-fulfillment-platform
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{monitor="global",type="customersdot"} - gitlab_service_apdex:success:rate:prediction{monitor="global",type="customersdot"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{monitor="global",type="customersdot"}
)
<
-3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
...@@ -36,38 +36,3 @@ groups: ...@@ -36,38 +36,3 @@ groups:
, "p", "3w", "", "") , "p", "3w", "", "")
) )
without (p) without (p)
- name: 'customersdot apdex success rate weekly statistics: {"type": "customersdot"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:avg_over_time_1w
expr: |
avg_over_time(gitlab_service_apdex:success:rate_5m{monitor="global",type="customersdot"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="customersdot"}
- record: gitlab_service_apdex:success:rate:stddev_over_time_1w
expr: |
stddev_over_time(gitlab_service_apdex:success:rate_5m{monitor="global",type="customersdot"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="customersdot"}
- name: 'customersdot apdex success rate weekly prediction values: {"type": "customersdot"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:prediction
expr: |
quantile(0.5,
label_replace(
gitlab_service_apdex:success:rate_1h{monitor="global",type="customersdot"} offset 10050m # 1 week - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{monitor="global",type="customersdot"}[1w])
, "p", "1w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{monitor="global",type="customersdot"} offset 20130m # 2 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{monitor="global",type="customersdot"}[2w])
, "p", "2w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{monitor="global",type="customersdot"} offset 30210m # 3 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{monitor="global",type="customersdot"}[3w])
, "p", "3w", "", "")
)
without (p)
...@@ -78,80 +78,3 @@ groups: ...@@ -78,80 +78,3 @@ groups:
) )
unless on(tier, type) unless on(tier, type)
gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"}
- name: ai-assisted - service_apdex_anomaly_detection
rules:
- alert: service_apdex_out_of_bounds_upper_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has higher apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal. This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_trigger: service_apdex_anomaly
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ai-assisted"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="ai-assisted"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="ai-assisted"}
)
>
3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
- alert: service_apdex_out_of_bounds_lower_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has lower apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal. This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ai-assisted"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="ai-assisted"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="ai-assisted"}
)
<
-3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
...@@ -36,40 +36,3 @@ groups: ...@@ -36,40 +36,3 @@ groups:
, "p", "3w", "", "") , "p", "3w", "", "")
) )
without (p) without (p)
- name: 'ai-assisted apdex success rate weekly statistics: {"env": "gprd", "type":
"ai-assisted"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:avg_over_time_1w
expr: |
avg_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ai-assisted"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="ai-assisted"}
- record: gitlab_service_apdex:success:rate:stddev_over_time_1w
expr: |
stddev_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ai-assisted"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="ai-assisted"}
- name: 'ai-assisted apdex success rate weekly prediction values: {"env": "gprd",
"type": "ai-assisted"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:prediction
expr: |
quantile(0.5,
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="ai-assisted"} offset 10050m # 1 week - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="ai-assisted"}[1w])
, "p", "1w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="ai-assisted"} offset 20130m # 2 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="ai-assisted"}[2w])
, "p", "2w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="ai-assisted"} offset 30210m # 3 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="ai-assisted"}[3w])
, "p", "3w", "", "")
)
without (p)
...@@ -78,80 +78,3 @@ groups: ...@@ -78,80 +78,3 @@ groups:
) )
unless on(tier, type) unless on(tier, type)
gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"}
- name: api - service_apdex_anomaly_detection
rules:
- alert: service_apdex_out_of_bounds_upper_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has higher apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal. This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_trigger: service_apdex_anomaly
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="api"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="api"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="api"}
)
>
3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
- alert: service_apdex_out_of_bounds_lower_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has lower apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal. This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="api"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="api"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="api"}
)
<
-3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
...@@ -36,39 +36,3 @@ groups: ...@@ -36,39 +36,3 @@ groups:
, "p", "3w", "", "") , "p", "3w", "", "")
) )
without (p) without (p)
- name: 'api apdex success rate weekly statistics: {"env": "gprd", "type": "api"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:avg_over_time_1w
expr: |
avg_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="api"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="api"}
- record: gitlab_service_apdex:success:rate:stddev_over_time_1w
expr: |
stddev_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="api"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="api"}
- name: 'api apdex success rate weekly prediction values: {"env": "gprd", "type":
"api"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:prediction
expr: |
quantile(0.5,
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="api"} offset 10050m # 1 week - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="api"}[1w])
, "p", "1w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="api"} offset 20130m # 2 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="api"}[2w])
, "p", "2w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="api"} offset 30210m # 3 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="api"}[3w])
, "p", "3w", "", "")
)
without (p)
...@@ -78,80 +78,3 @@ groups: ...@@ -78,80 +78,3 @@ groups:
) )
unless on(tier, type) unless on(tier, type)
gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"}
- name: camoproxy - service_apdex_anomaly_detection
rules:
- alert: service_apdex_out_of_bounds_upper_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has higher apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal. This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_trigger: service_apdex_anomaly
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="camoproxy"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="camoproxy"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="camoproxy"}
)
>
3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
- alert: service_apdex_out_of_bounds_lower_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has lower apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal. This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="camoproxy"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="camoproxy"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="camoproxy"}
)
<
-3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
...@@ -36,39 +36,3 @@ groups: ...@@ -36,39 +36,3 @@ groups:
, "p", "3w", "", "") , "p", "3w", "", "")
) )
without (p) without (p)
- name: 'camoproxy apdex success rate weekly statistics: {"env": "gprd", "type": "camoproxy"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:avg_over_time_1w
expr: |
avg_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="camoproxy"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="camoproxy"}
- record: gitlab_service_apdex:success:rate:stddev_over_time_1w
expr: |
stddev_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="camoproxy"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="camoproxy"}
- name: 'camoproxy apdex success rate weekly prediction values: {"env": "gprd", "type":
"camoproxy"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:prediction
expr: |
quantile(0.5,
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="camoproxy"} offset 10050m # 1 week - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="camoproxy"}[1w])
, "p", "1w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="camoproxy"} offset 20130m # 2 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="camoproxy"}[2w])
, "p", "2w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="camoproxy"} offset 30210m # 3 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="camoproxy"}[3w])
, "p", "3w", "", "")
)
without (p)
...@@ -78,80 +78,3 @@ groups: ...@@ -78,80 +78,3 @@ groups:
) )
unless on(tier, type) unless on(tier, type)
gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"}
- name: ci-runners - service_apdex_anomaly_detection
rules:
- alert: service_apdex_out_of_bounds_upper_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has higher apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal. This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
labels:
alert_trigger: service_apdex_anomaly
alert_type: cause
rules_domain: general
severity: s4
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ci-runners"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="ci-runners"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="ci-runners"}
)
>
3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
# Alert: lower-bound apdex anomaly for the ci-runners service.
# Fires once the condition in `expr` has held for 5 minutes: the 5m apdex
# success rate is more than 3 weekly standard deviations below the
# recorded prediction.
- alert: service_apdex_out_of_bounds_lower_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has lower apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal. This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
# NOTE(review): the upper-bound sibling rule also sets
# `alert_trigger: service_apdex_anomaly`; this rule does not. Confirm that
# the difference is intentional.
labels:
alert_type: cause
rules_domain: general
severity: s4
# Condition: (rate_5m - prediction) / stddev_over_time_1w < -3, restricted
# to env="gprd", monitor="global", type="ci-runners".
# The trailing `unless on(tier, type)` drops any result for which a
# gitlab_service:mapping:disable_apdex_success_rate_prediction series with
# the same tier and type exists (that selector is not limited to one type).
# NOTE(review): a zero weekly stddev makes the division yield +/-Inf or NaN;
# confirm whether that case needs guarding.
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ci-runners"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="ci-runners"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="ci-runners"}
)
<
-3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
...@@ -36,40 +36,3 @@ groups: ...@@ -36,40 +36,3 @@ groups:
, "p", "3w", "", "") , "p", "3w", "", "")
) )
without (p) without (p)
# Recording group: one-week rolling statistics of the ci-runners 5m apdex
# success rate, evaluated every 5 minutes.
# Records two series from gitlab_service_apdex:success:rate_5m over a [1w] window:
#   gitlab_service_apdex:success:rate:avg_over_time_1w    - avg_over_time
#   gitlab_service_apdex:success:rate:stddev_over_time_1w - stddev_over_time
# Both expressions end with `unless on(tier, type)` against
# gitlab_service:mapping:disable_apdex_success_rate_prediction{type="ci-runners"},
# so nothing is recorded while a matching mapping series exists.
- name: 'ci-runners apdex success rate weekly statistics: {"env": "gprd", "type":
"ci-runners"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:avg_over_time_1w
expr: |
avg_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ci-runners"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="ci-runners"}
- record: gitlab_service_apdex:success:rate:stddev_over_time_1w
expr: |
stddev_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="ci-runners"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="ci-runners"}
# Recording group: predicted apdex success rate for ci-runners, evaluated
# every 5 minutes.
# The prediction is the median (quantile 0.5) of three candidates. Each
# candidate is the 1h apdex success rate from roughly 1, 2 or 3 weeks ago
# (offsets 10050m / 20130m / 30210m, i.e. N weeks minus 30 minutes) plus the
# delta of the weekly average over the same 1w / 2w / 3w span.
# label_replace tags each candidate with a distinct `p` label ("1w", "2w",
# "3w") so that `or` keeps all three; `without (p)` then aggregates them
# back into one series per remaining label set.
- name: 'ci-runners apdex success rate weekly prediction values: {"env": "gprd", "type":
"ci-runners"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:prediction
expr: |
quantile(0.5,
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="ci-runners"} offset 10050m # 1 week - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="ci-runners"}[1w])
, "p", "1w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="ci-runners"} offset 20130m # 2 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="ci-runners"}[2w])
, "p", "2w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="ci-runners"} offset 30210m # 3 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="ci-runners"}[3w])
, "p", "3w", "", "")
)
without (p)
...@@ -78,80 +78,3 @@ groups: ...@@ -78,80 +78,3 @@ groups:
) )
unless on(tier, type) unless on(tier, type)
gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"} gitlab_service:mapping:disable_ops_rate_prediction{monitor="global"}
# Alert group: apdex anomaly detection for the cloud-sql service.
# Contains two severity-s4 alerts that compare the 5m apdex success rate
# with the recorded prediction, measured in weekly standard deviations:
#   service_apdex_out_of_bounds_upper_5m - deviation >  3 for 5 minutes
#   service_apdex_out_of_bounds_lower_5m - deviation < -3 for 5 minutes
# Both expressions end with `unless on(tier, type)` against
# gitlab_service:mapping:disable_apdex_success_rate_prediction, so a mapping
# series with the same tier and type suppresses the alert.
# NOTE(review): the upper-bound description mentions user-generated traffic
# and abuse, which reads like text carried over from an ops-rate alert;
# confirm that the wording is intended for apdex.
- name: cloud-sql - service_apdex_anomaly_detection
rules:
- alert: service_apdex_out_of_bounds_upper_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has higher apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has higher apdex than normal. This is often caused by user generated traffic, sometimes abuse. It can also be caused by application changes that lead to higher apdex success rates. Check the abuse reporting watches in Elastic, ELK for possible abuse, error rates (possibly on upstream services) for root cause.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
# Only the upper-bound rule carries the `alert_trigger` label.
labels:
alert_trigger: service_apdex_anomaly
alert_type: cause
rules_domain: general
severity: s4
# Condition: (rate_5m - prediction) / stddev_over_time_1w > 3.
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="cloud-sql"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="cloud-sql"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="cloud-sql"}
)
>
3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
- alert: service_apdex_out_of_bounds_lower_5m
for: 5m
annotations:
title: 'Anomaly detection: The `{{ $labels.type }}` service (`{{ $labels.stage
}}` stage) has lower apdex than normal'
description: |
The `{{ $labels.type }}` service (`{{ $labels.stage }}` stage) has lower apdex than normal. This can be caused by a failure or latency increases in an upstream service. In many cases, this is as serious or more serious than a traffic spike. Check upstream services for errors or latency increases that may be leading to traffic flow issues in downstream services.
grafana_dashboard_id: general-service/service-platform-metrics
grafana_dashboard_link: https://dashboards.gitlab.net/d/general-service/service-platform-metrics?from=now-12h/m&to=now-1m/m&var-environment={{
$labels.environment }}&var-type={{ $labels.type }}&var-stage={{ $labels.stage
}}
grafana_datasource_id: mimir-gitlab-gprd
grafana_min_zoom_hours: "12"
grafana_panel_id: "3770536810"
grafana_variables: environment,type,stage
link1_title: Definition
link1_url: https://gitlab.com/gitlab-com/runbooks/-/blob/master/docs/monitoring/definition-service-apdex.md
promql_template_1: gitlab_service_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
promql_template_2: gitlab_component_apdex:success:rate{environment="$environment",
type="$type", stage="$stage"}
runbook: docs/{{ $labels.type }}/README.md
# NOTE(review): no `alert_trigger` label here, unlike the upper-bound rule
# in this group. Confirm that the difference is intentional.
labels:
alert_type: cause
rules_domain: general
severity: s4
# Condition: (rate_5m - prediction) / stddev_over_time_1w < -3.
expr: |
(
(
(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="cloud-sql"} - gitlab_service_apdex:success:rate:prediction{env="gprd",monitor="global",type="cloud-sql"}) /
gitlab_service_apdex:success:rate:stddev_over_time_1w{env="gprd",monitor="global",type="cloud-sql"}
)
<
-3
)
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global"}
...@@ -36,39 +36,3 @@ groups: ...@@ -36,39 +36,3 @@ groups:
, "p", "3w", "", "") , "p", "3w", "", "")
) )
without (p) without (p)
# Recording group: one-week rolling statistics of the cloud-sql 5m apdex
# success rate, evaluated every 5 minutes.
# Records two series from gitlab_service_apdex:success:rate_5m over a [1w] window:
#   gitlab_service_apdex:success:rate:avg_over_time_1w    - avg_over_time
#   gitlab_service_apdex:success:rate:stddev_over_time_1w - stddev_over_time
# Both expressions end with `unless on(tier, type)` against
# gitlab_service:mapping:disable_apdex_success_rate_prediction{type="cloud-sql"},
# so nothing is recorded while a matching mapping series exists.
- name: 'cloud-sql apdex success rate weekly statistics: {"env": "gprd", "type": "cloud-sql"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:avg_over_time_1w
expr: |
avg_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="cloud-sql"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="cloud-sql"}
- record: gitlab_service_apdex:success:rate:stddev_over_time_1w
expr: |
stddev_over_time(gitlab_service_apdex:success:rate_5m{env="gprd",monitor="global",type="cloud-sql"}[1w])
unless on(tier, type)
gitlab_service:mapping:disable_apdex_success_rate_prediction{monitor="global",type="cloud-sql"}
# Recording group: predicted apdex success rate for cloud-sql, evaluated
# every 5 minutes.
# The prediction is the median (quantile 0.5) of three candidates. Each
# candidate is the 1h apdex success rate from roughly 1, 2 or 3 weeks ago
# (offsets 10050m / 20130m / 30210m, i.e. N weeks minus 30 minutes) plus the
# delta of the weekly average over the same 1w / 2w / 3w span.
# label_replace tags each candidate with a distinct `p` label ("1w", "2w",
# "3w") so that `or` keeps all three; `without (p)` then aggregates them
# back into one series per remaining label set.
- name: 'cloud-sql apdex success rate weekly prediction values: {"env": "gprd", "type":
"cloud-sql"}'
interval: 5m
rules:
- record: gitlab_service_apdex:success:rate:prediction
expr: |
quantile(0.5,
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="cloud-sql"} offset 10050m # 1 week - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="cloud-sql"}[1w])
, "p", "1w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="cloud-sql"} offset 20130m # 2 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="cloud-sql"}[2w])
, "p", "2w", "", "")
or
label_replace(
gitlab_service_apdex:success:rate_1h{env="gprd",monitor="global",type="cloud-sql"} offset 30210m # 3 weeks - 30mins
+ delta(gitlab_service_apdex:success:rate:avg_over_time_1w{env="gprd",monitor="global",type="cloud-sql"}[3w])
, "p", "3w", "", "")
)
without (p)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment