Commit 654465ed authored by Ben Kochie

Merge branch 'bjk/issue_alerts' into 'master'

Add new issue alerts

See merge request !1440
parents 2396b816 4870cdd6
......@@ -20,6 +20,17 @@ groups:
grafana_dashboard_id: "000000231/chef-client"
runbook: troubleshooting/chef.md
title: Chef client hasn't run for longer than expected
# Issue-level staleness alert: fires when the last recorded Chef run is more
# than 3 days (3 * 86400 seconds) old and has stayed that way for 1 hour.
- alert: ChefClientStale
  expr: time() - chef_client_last_run_timestamp_seconds > 3 * 86400
  for: 1h
  labels:
    severity: s2
    # NOTE(review): presumably routes to an issue tracker instead of paging;
    # confirm against the Alertmanager routing configuration.
    pager: issue
  annotations:
    # $value is the left-hand side of the comparison above, i.e. the number
    # of seconds since the last Chef run.
    description: Last Chef run for {{ $labels.fqdn }} was over {{ $value | humanizeDuration }} ago
    grafana_dashboard_id: "000000231/chef-client"
    runbook: troubleshooting/chef.md
    title: Chef client hasn't run for longer than expected
- alert: ChefClientErrorCritical
expr: avg(chef_client_error) by (type) * 100 > 10
for: 1h
......
......@@ -2,8 +2,7 @@ groups:
- name: ssl-certificate-expiration.rules
rules:
- alert: SSLCertExpiresSoon
expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 7 * 60 * 60
* 24
expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 1 * 86400
for: 30m
labels:
severity: s1
......@@ -13,3 +12,14 @@ groups:
runbook: troubleshooting/ssl_cert.md
title: SSL certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration
}}
# Issue-level expiry warning: fires when the earliest certificate expiry seen
# by the blackbox probe is less than 7 days (7 * 86400 seconds) away and has
# stayed that way for 30 minutes. It shares its alert name with the s1 rule
# in this group, which uses a 1-day threshold.
- alert: SSLCertExpiresSoon
  expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 7 * 86400
  for: 30m
  labels:
    severity: s2
    # NOTE(review): presumably routes to an issue tracker instead of paging;
    # confirm against the Alertmanager routing configuration.
    pager: issue
  annotations:
    description: Check SSL for specified nodes and consider reissuing certificate.
    runbook: troubleshooting/ssl_cert.md
    # Folded scalar keeps the template on short lines; it renders as a single
    # line. $value is the number of seconds remaining until expiry.
    title: >-
      SSL certificate for {{ $labels.instance }} expires in
      {{ $value | humanizeDuration }}
......@@ -23,7 +23,7 @@ def validate_rule(alert_file_path, rule)
raise StandardError, " #{alert}: rules contains an invalid `severity` label: #{labels["severity"]}" unless ["s1", "s2", "s3", "s4"].include?(labels["severity"])
if labels["pager"]
raise StandardError, " #{alert}: rules contains an invalid `pager` label: #{labels["pager"]}" unless labels["pager"] == "pagerduty"
raise StandardError, " #{alert}: rules contains an invalid `pager` label: #{labels["pager"]}" unless ["pagerduty", "issue"].include?(labels["pager"])
raise StandardError, " #{alert}: only severity s1 and s2 errors should page" unless labels["severity"] == "s1" or labels["severity"] == "s2"
else
raise StandardError, " #{alert}: s1 and s2 alerts must be configured to send to pagerduty" if labels["severity"] == "s1" or labels["severity"] == "s2"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment