Skip to content
Snippets Groups Projects
Commit 521b77d1 authored by Nick Ilieskou
Browse files

Merge branch 'nick/KustomizeAlerts' into 'main'

Alerts on staging should not send notifications on PagerDuty

See merge request !1869



Merged-by: Nick Ilieskou <nilieskou@gitlab.com>
Approved-by: Pawel Rozlach <prozlach@gitlab.com>
Approved-by: Ankit Bhatnagar <abhatnagar@gitlab.com>
Reviewed-by: Pawel Rozlach <prozlach@gitlab.com>
parents 2f26175c ced4b0d7
No related branches found
No related tags found
1 merge request: !1869 Alerts on staging should not send notifications on PagerDuty
Showing
with 141 additions and 46 deletions
......@@ -20,7 +20,6 @@ spec:
type: blackbox-probe-tests
severity: s1
alert_type: smoke_test
pager: observability_pagerduty
- alert: GouiProbeDown3min
annotations:
title: GOUI is down
......@@ -46,5 +45,4 @@ spec:
type: blackbox-probe-tests
severity: s1
alert_type: smoke_test
pager: observability_pagerduty
---
# PrometheusRule with the error-tracking smoke-test alerts.
# Each alert fires when its smoke-test success metric has been 0 for 15 minutes.
# No `pager` label is set in this manifest.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: smoketests-errortracking-rules
spec:
  groups:
    - name: smoketests-errortracking.rules
      rules:
        # Read path: success metric stuck at 0.
        - alert: ErrorTrackingReadPathIsDown
          annotations:
            title: Error-tracking read path is down
            description: >-
              Error tracking read path is down for 15 minutes. Bear in mind
              that the sampling period is defined in the smoke test env var
              configurations
          expr: |
            smoketest_errortracking_success_read_path == 0
          for: 15m
          labels:
            alertname: ErrorTrackingReadPathIsDown
            type: smoketests-errortracking
            severity: s1
            alert_type: smoke_test
        # Write path: success metric stuck at 0.
        - alert: ErrorTrackingWritePathIsDown
          annotations:
            title: Error-tracking write path is down
            description: >-
              Error tracking write path is down for 15 minutes. Bear in mind
              that the sampling period is defined in the smoke test env var
              configurations
          expr: |
            smoketest_errortracking_success_write_path == 0
          for: 15m
          labels:
            alertname: ErrorTrackingWritePathIsDown
            type: smoketests-errortracking
            severity: s1
            alert_type: smoke_test
......@@ -26,7 +26,6 @@ spec:
type: errortracking
severity: s1
alert_type: cause
pager: observability_pagerduty
- alert: ErrortrackingLatencyHigh
annotations:
......@@ -60,6 +59,5 @@ spec:
type: errortracking
severity: s2
alert_type: cause
pager: observability_pagerduty
# NOTE(prozlach) No 404s alert for now. ATM there are too many 404s in
# production compared to normal 200s to have a meaningful signal.
......@@ -23,7 +23,6 @@ spec:
type: gatekeeper
severity: s1
alert_type: cause
pager: observability_pagerduty
- alert: GatekeeperRedisMasterFailover
annotations:
......@@ -53,7 +52,6 @@ spec:
alertname: GatekeeperRedisMasterFlaky
type: gatekeeper
severity: s2
pager: observability_pagerduty
alert_type: cause
- alert: GatekeeperRedisSETLatencyHigh
......@@ -118,6 +116,5 @@ spec:
type: gatekeeper
severity: s2
alert_type: cause
pager: observability_pagerduty
# NOTE(prozlach) No 404s alert for now. ATM there are too many 404s in
# production compared to normal 200s to have a meaningful signal.
......@@ -8,3 +8,5 @@ resources:
- nginx-ingress.yaml
- gatekeeper.yaml
- dms.yaml
- blackbox-exporter.yaml
- errortracking-smoketests.yaml
\ No newline at end of file
---
# Partial PrometheusRule (presumably used as a kustomize overlay patch) that
# sets `pager: observability_pagerduty` on the blackbox probe alerts below.
#
# NOTE(review): PrometheusRule is a custom resource. Unless kustomize has an
# OpenAPI schema with merge keys for it, list fields in a strategic-merge
# patch replace the base list instead of merging per element. Confirm that
# the rendered rules still carry `expr`, `for` and `annotations`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: prometheus-blackbox-exporter-rules
spec:
  groups:
    - name: blackbox-exporter.rules
      rules:
        - alert: GouiProbeDown15min
          labels:
            pager: observability_pagerduty
        - alert: GatekeeperProbeDown
          labels:
            pager: observability_pagerduty
---
# Partial PrometheusRule (presumably used as a kustomize overlay patch) that
# sets `pager: observability_pagerduty` on both error-tracking smoke-test
# alerts.
#
# NOTE(review): PrometheusRule is a custom resource. Unless kustomize has an
# OpenAPI schema with merge keys for it, list fields in a strategic-merge
# patch replace the base list instead of merging per element. Confirm that
# the rendered rules still carry `expr`, `for` and `annotations`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: smoketests-errortracking-rules
spec:
  groups:
    - name: smoketests-errortracking.rules
      rules:
        - alert: ErrorTrackingReadPathIsDown
          labels:
            pager: observability_pagerduty
        - alert: ErrorTrackingWritePathIsDown
          labels:
            pager: observability_pagerduty
---
# Partial PrometheusRule (presumably used as a kustomize overlay patch) that
# sets `pager: observability_pagerduty` on the error-tracking alerts below.
#
# NOTE(review): PrometheusRule is a custom resource. Unless kustomize has an
# OpenAPI schema with merge keys for it, list fields in a strategic-merge
# patch replace the base list instead of merging per element. Confirm that
# the rendered rules still carry `expr`, `for` and `annotations`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: prometheus-errortracking-rules
spec:
  groups:
    - name: errortracking
      rules:
        - alert: ErrortrackingAbsent
          labels:
            pager: observability_pagerduty
        - alert: ErrortrackingTooMany500s
          labels:
            pager: observability_pagerduty
---
# Partial PrometheusRule (presumably used as a kustomize overlay patch) that
# sets `pager: observability_pagerduty` on the gatekeeper alerts below.
#
# NOTE(review): PrometheusRule is a custom resource. Unless kustomize has an
# OpenAPI schema with merge keys for it, list fields in a strategic-merge
# patch replace the base list instead of merging per element. Confirm that
# the rendered rules still carry `expr`, `for` and `annotations`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: prometheus-gatekeeper-rules
spec:
  groups:
    - name: gatekeeper
      rules:
        - alert: GatekeeperAbsent
          labels:
            pager: observability_pagerduty
        - alert: GatekeeperRedisMasterFlaky
          labels:
            pager: observability_pagerduty
        - alert: GatekeeperTooMany500s
          labels:
            pager: observability_pagerduty
---
# Overlay kustomization: builds on the shared base and applies the patch files
# listed below on top of it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base
# NOTE(review): `patchesStrategicMerge` is deprecated in newer kustomize
# releases in favour of `patches`. Left unchanged because the kustomize
# version in use is not visible here.
patchesStrategicMerge:
  - errortracking.yaml
  - blackbox-exporter.yaml
  - errortracking-smoketests.yaml
  - gatekeeper.yaml
  - nginx-ingress.yaml
---
# Partial PrometheusRule (presumably used as a kustomize overlay patch) that
# sets `pager: observability_pagerduty` on the NGINX ingress alerts below.
#
# NOTE(review): PrometheusRule is a custom resource. Unless kustomize has an
# OpenAPI schema with merge keys for it, list fields in a strategic-merge
# patch replace the base list instead of merging per element. Confirm that
# the rendered rules still carry `expr`, `for` and `annotations`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: prometheus-nginx-ingress-rules
spec:
  groups:
    - name: nginx
      rules:
        - alert: NGINXAbsent
          labels:
            pager: observability_pagerduty
        - alert: NGINXTooMany500s
          labels:
            pager: observability_pagerduty
---
# Overlay kustomization that uses the shared base as-is; it lists no patches.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base
......@@ -3,6 +3,5 @@ namespace: default
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- alert.yaml
- deployment.yaml
- servicemonitor.yaml
......@@ -67,7 +67,7 @@ resource "kustomization_resource" "scheduler-crds" {
}
data "kustomization_build" "alerting" {
path = "${path.module}/alerting/"
path = "${path.module}/alerting/overlays/${var.environment}"
}
resource "kustomization_resource" "alerting" {
......
......@@ -97,43 +97,6 @@ spec:
selector:
matchLabels:
app: smoketests-errortracking
---
# PrometheusRule with the error-tracking smoke-test alerts.
# Each alert fires when its smoke-test success metric has been 0 for 15 minutes
# and carries the `pager: observability_pagerduty` label.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    tenant: system
  name: smoketests-errortracking-rules
spec:
  groups:
    - name: smoketests-errortracking.rules
      rules:
        # Read path: success metric stuck at 0.
        - alert: ErrorTrackingReadPathIsDown
          annotations:
            title: Error-tracking read path is down
            description: >-
              Error tracking read path is down for 15 minutes. Bear in mind
              that the sampling period is defined in the smoke test env var
              configurations
          expr: |
            smoketest_errortracking_success_read_path == 0
          for: 15m
          labels:
            alertname: ErrorTrackingReadPathIsDown
            type: smoketests-errortracking
            severity: s1
            alert_type: smoke_test
            pager: observability_pagerduty
        # Write path: success metric stuck at 0.
        - alert: ErrorTrackingWritePathIsDown
          annotations:
            title: Error-tracking write path is down
            description: >-
              Error tracking write path is down for 15 minutes. Bear in mind
              that the sampling period is defined in the smoke test env var
              configurations
          expr: |
            smoketest_errortracking_success_write_path == 0
          for: 15m
          labels:
            alertname: ErrorTrackingWritePathIsDown
            type: smoketests-errortracking
            severity: s1
            alert_type: smoke_test
            pager: observability_pagerduty
......@@ -85,4 +85,10 @@ variable "goui_probe_url" {
description = "GOUI probe url"
type = string
default = ""
}
# Name of the target environment. It is interpolated into the alerting
# kustomization path (`alerting/overlays/<environment>`), so it selects which
# overlay directory is built. Defaults to "prod" when not set.
variable "environment" {
  description = "Environment. Example staging, prod etc"
  type        = string
  default     = "prod"
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment