When 1 server in a fleet of many goes down, multiple alerts fire
The names of these alerts:
- IncreasedServerConnectionErrors
- IncreasedBackendConnectionErrors
Problem description goes here
- One server in a fleet of 8 barfed. Do we care why that ONE server went down? Sure. We totally should. Should I get woken up at midnight because one server out of 8 barfed? No. We have over-provisioned this service and have 7 other servers that can take up the slack while this server is down.
This server was rebooted by Google. The underlying host on which it was running had failed.
[
{
"insertId": "teusfpfndyiui",
"jsonPayload": {
"event_subtype": "compute.instances.automaticRestart",
"info": [
{
"detail_message": "Instance automatically restarted by Compute Engine.",
"code": "STATUS_MESSAGE"
}
],
"version": "1.2",
"event_timestamp_us": "1548218996130755",
"actor": {
"user": "system"
},
"resource": {
"zone": "us-east1-d",
"id": "8456396115316406636",
"name": "web-pages-02-sv-gprd",
"type": "instance"
},
"event_type": "GCE_OPERATION_DONE",
"trace_id": "systemevent-1548218990085-58018d2f0450d-d19e7c56-5ebab86d",
"operation": {
"name": "systemevent-1548218990085-58018d2f0450d-d19e7c56-5ebab86d",
"type": "operation",
"zone": "us-east1-d",
"id": "6370736769153518747"
}
},
"resource": {
"type": "gce_instance",
"labels": {
"zone": "us-east1-d",
"project_id": "gitlab-production",
"instance_id": "8456396115316406636"
}
},
"timestamp": "2019-01-23T04:49:56.130755Z",
"severity": "INFO",
"labels": {
"compute.googleapis.com/resource_zone": "us-east1-d",
"compute.googleapis.com/resource_name": "web-pages-02-sv-gprd",
"compute.googleapis.com/resource_id": "8456396115316406636",
"compute.googleapis.com/resource_type": "instance"
},
"logName": "projects/gitlab-production/logs/compute.googleapis.com%2Factivity_log",
"receiveTimestamp": "2019-01-23T04:49:56.197639846Z"
},
{
"protoPayload": {
"@type": "type.googleapis.com/google.cloud.audit.AuditLog",
"serviceName": "compute.googleapis.com",
"methodName": "compute.instances.automaticRestart"
},
"insertId": "xkoj2rdqd0q",
"resource": {
"type": "gce_instance",
"labels": {
"zone": "us-east1-d",
"project_id": "gitlab-production",
"instance_id": "8456396115316406636"
}
},
"timestamp": "2019-01-23T04:49:56.026Z",
"severity": "INFO",
"logName": "projects/gitlab-production/logs/cloudaudit.googleapis.com%2Fsystem_event",
"operation": {
"id": "systemevent-1548218990085-58018d2f0450d-d19e7c56-5ebab86d",
"producer": "compute.instances.automaticRestart",
"first": true,
"last": true
},
"receiveTimestamp": "2019-01-23T04:49:56.726006935Z"
},
{
"insertId": "1jhx4mlf6fqw8x",
"jsonPayload": {
"version": "1.2",
"event_timestamp_us": "1548218979841940",
"actor": {
"user": "system"
},
"resource": {
"name": "web-pages-02-sv-gprd",
"type": "instance",
"zone": "us-east1-d",
"id": "8456396115316406636"
},
"event_type": "GCE_OPERATION_DONE",
"trace_id": "systemevent-1548218979548-58018d24f7bad-a4bf7a75-8c20aaef",
"operation": {
"type": "operation",
"zone": "us-east1-d",
"id": "2051909572614573196",
"name": "systemevent-1548218979548-58018d24f7bad-a4bf7a75-8c20aaef"
},
"event_subtype": "compute.instances.hostError",
"info": [
{
"code": "STATUS_MESSAGE",
"detail_message": "Instance terminated by Compute Engine."
}
]
},
"resource": {
"type": "gce_instance",
"labels": {
"zone": "us-east1-d",
"project_id": "gitlab-production",
"instance_id": "8456396115316406636"
}
},
"timestamp": "2019-01-23T04:49:39.841940Z",
"severity": "INFO",
"labels": {
"compute.googleapis.com/resource_zone": "us-east1-d",
"compute.googleapis.com/resource_name": "web-pages-02-sv-gprd",
"compute.googleapis.com/resource_id": "8456396115316406636",
"compute.googleapis.com/resource_type": "instance"
},
"logName": "projects/gitlab-production/logs/compute.googleapis.com%2Factivity_log",
"receiveTimestamp": "2019-01-23T04:49:39.929195716Z"
},
{
"protoPayload": {
"@type": "type.googleapis.com/google.cloud.audit.AuditLog",
"serviceName": "compute.googleapis.com",
"methodName": "compute.instances.hostError"
},
"insertId": "-u8lpvcdrl98",
"resource": {
"type": "gce_instance",
"labels": {
"instance_id": "8456396115316406636",
"zone": "us-east1-d",
"project_id": "gitlab-production"
}
},
"timestamp": "2019-01-23T04:49:39.743Z",
"severity": "INFO",
"logName": "projects/gitlab-production/logs/cloudaudit.googleapis.com%2Fsystem_event",
"operation": {
"id": "systemevent-1548218979548-58018d24f7bad-a4bf7a75-8c20aaef",
"producer": "compute.instances.hostError",
"first": true,
"last": true
},
"receiveTimestamp": "2019-01-23T04:49:40.879207891Z"
}
]
- Use this issue as a means of discussing what we can do to stop paging and waking engineers when it's not needed. The server came back online just fine. It's healthy, Chef did its job, and HAProxy knows it's back.