Review App Gitaly service sometimes unresponsive after restart and endpoint update
Summary
QA tests have been failing on review apps intermittently but frequently with error 500 due to requests sent to Gitaly returning `failed to connect to all addresses`.
The errors occur just after Gitaly restarts, although it's not clear why Gitaly restarts at that time.
We also see failures where the Gitaly request returns `DNS resolution failed`, which seems to happen under the same circumstances.
Details
Here is a breakdown of one error 500 failure (similar to others): https://gitlab.com/gitlab-org/gitlab/-/jobs/1388825975
The logs show `failed to connect to all addresses`.
Expand for the full log entry JSON content
{
"insertId": "1mu1vedg3d88p6k",
"jsonPayload": {
"username": "root",
"db_duration_s": 0.04486,
"redis_duration_s": 0.006518,
"params": [
{
"value": "[FILTERED]",
"key": "private_token"
}
],
"redis_cache_duration_s": 0.006518,
"gitaly_calls": 1,
"mem_bytes": 2765528,
"path": "/api/v4/groups/gitlab-qa-sandbox-group%2Fqa-test-2021-06-30-12-02-23-25784b8dc53e85e6",
"status": 500,
"duration_s": 0.13731,
"redis_read_bytes": 1860,
"exception.backtrace": [
"lib/gitlab/gitaly_client.rb:177:in `execute'",
"lib/gitlab/gitaly_client/call.rb:18:in `block in call'",
"lib/gitlab/gitaly_client/call.rb:55:in `recording_request'",
"lib/gitlab/gitaly_client/call.rb:17:in `call'",
"lib/gitlab/gitaly_client.rb:167:in `call'",
"lib/gitlab/gitaly_client/ref_service.rb:39:in `default_branch_name'",
"lib/gitlab/git/repository.rb:90:in `root_ref'",
"app/models/repository.rb:520:in `root_ref'",
"lib/gitlab/metrics/instrumentation.rb:160:in `block in _uncached_root_ref'",
"lib/gitlab/metrics/method_call.rb:27:in `measure'",
"lib/gitlab/metrics/instrumentation.rb:160:in `_uncached_root_ref'",
"lib/gitlab/repository_cache_adapter.rb:95:in `block (2 levels) in cache_method_asymmetrically'",
"lib/gitlab/repository_cache.rb:44:in `fetch_without_caching_false'",
"lib/gitlab/repository_cache_adapter.rb:190:in `block (2 levels) in cache_method_output_asymmetrically'",
"lib/gitlab/safe_request_store.rb:12:in `fetch'",
"lib/gitlab/repository_cache.rb:25:in `fetch'",
"lib/gitlab/repository_cache_adapter.rb:189:in `block in cache_method_output_asymmetrically'",
"lib/gitlab/utils/strong_memoize.rb:30:in `strong_memoize'",
"lib/gitlab/repository_cache_adapter.rb:203:in `block in memoize_method_output'",
"lib/gitlab/repository_cache_adapter.rb:212:in `no_repository_fallback'",
"lib/gitlab/repository_cache_adapter.rb:202:in `memoize_method_output'",
"lib/gitlab/repository_cache_adapter.rb:188:in `cache_method_output_asymmetrically'",
"lib/gitlab/repository_cache_adapter.rb:94:in `block in cache_method_asymmetrically'",
"lib/gitlab/metrics/instrumentation.rb:160:in `block in root_ref'",
"lib/gitlab/metrics/method_call.rb:27:in `measure'",
"lib/gitlab/metrics/instrumentation.rb:160:in `root_ref'",
"app/models/concerns/has_repository.rb:74:in `default_branch'",
"lib/gitlab/json.rb:110:in `dump'",
"lib/gitlab/json.rb:110:in `adapter_dump'",
"lib/gitlab/json.rb:42:in `dump'",
"lib/gitlab/json.rb:198:in `call'",
"ee/lib/gitlab/middleware/ip_restrictor.rb:14:in `block in call'",
"ee/lib/gitlab/ip_address_state.rb:10:in `with'",
"ee/lib/gitlab/middleware/ip_restrictor.rb:13:in `call'",
"lib/api/api_guard.rb:213:in `call'",
"config/initializers/gitlab_experiment.rb:17:in `call'",
"lib/gitlab/metrics/elasticsearch_rack_middleware.rb:16:in `call'",
"lib/gitlab/middleware/rails_queue_duration.rb:33:in `call'",
"lib/gitlab/metrics/rack_middleware.rb:16:in `block in call'",
"lib/gitlab/metrics/web_transaction.rb:21:in `run'",
"lib/gitlab/metrics/rack_middleware.rb:16:in `call'",
"lib/gitlab/middleware/speedscope.rb:13:in `call'",
"lib/gitlab/request_profiler/middleware.rb:17:in `call'",
"lib/gitlab/jira/middleware.rb:19:in `call'",
"lib/gitlab/middleware/go.rb:20:in `call'",
"lib/gitlab/etag_caching/middleware.rb:21:in `call'",
"lib/gitlab/middleware/multipart.rb:172:in `call'",
"lib/gitlab/middleware/read_only/controller.rb:50:in `call'",
"lib/gitlab/middleware/read_only.rb:18:in `call'",
"lib/gitlab/middleware/same_site_cookies.rb:27:in `call'",
"lib/gitlab/middleware/handle_malformed_strings.rb:21:in `call'",
"lib/gitlab/middleware/basic_health_check.rb:25:in `call'",
"lib/gitlab/middleware/handle_ip_spoof_attack_error.rb:25:in `call'",
"lib/gitlab/middleware/request_context.rb:21:in `call'",
"config/initializers/fix_local_cache_middleware.rb:11:in `call'",
"lib/gitlab/middleware/rack_multipart_tempfile_factory.rb:19:in `call'",
"lib/gitlab/metrics/requests_rack_middleware.rb:74:in `call'",
"lib/gitlab/middleware/release_env.rb:12:in `call'"
],
"remote_ip": "34.73.108.143, 34.73.108.143",
"route": "/api/:version/groups/:id",
"meta.remote_ip": "34.73.108.143",
"method": "GET",
"cpu_s": 0.100414,
"exception.class": "GRPC::Unavailable",
"redis_cache_write_bytes": 1023,
"host": "gitlab-review-332321-ref-gezybs.gitlab-review.app",
"mem_total_bytes": 4312288,
"db_cached_count": 5,
"meta.caller_id": "GET /api/:version/groups/:id",
"redis_calls": 14,
"ua": "rest-client/2.1.0 (linux x86_64) ruby/2.7.3p183",
"meta.feature_category": "subgroups",
"view_duration_s": 0.09245,
"meta.client_id": "user/1",
"mem_objects": 38669,
"db_count": 33,
"redis_write_bytes": 1023,
"db_write_count": 0,
"meta.user": "root",
"correlation_id": "01F9ED5CF1C22C0YRD8G7C4DR2",
"user_id": 1,
"mem_mallocs": 9940,
"queue_duration_s": 0.007417,
"redis_cache_calls": 14,
"redis_cache_read_bytes": 1860,
"gitaly_duration_s": 0.001674,
"exception.message": "14:failed to connect to all addresses. debug_error_string:{\"created\":\"@1625054949.995369728\",\"description\":\"Failed to pick subchannel\",\"file\":\"src/core/ext/filters/client_channel/client_channel.cc\",\"file_line\":3952,\"referenced_errors\":[{\"created\":\"@1625054927.915030211\",\"description\":\"failed to connect to all addresses\",\"file\":\"src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc\",\"file_line\":394,\"grpc_status\":14}]}"
},
"resource": {
"type": "k8s_container",
"labels": {
"project_id": "gitlab-review-apps",
"cluster_name": "review-apps",
"namespace_name": "review-332321-ref-gezybs",
"pod_name": "review-332321-ref-gezybs-webservice-default-c5f4f9456-cfmsw",
"location": "us-central1-b",
"container_name": "webservice"
}
},
"timestamp": "2021-06-30T12:09:10.003Z",
"severity": "INFO",
"labels": {
"k8s-pod/gitlab_com/webservice-name": "default",
"k8s-pod/heritage": "Helm",
"k8s-pod/app": "webservice",
"k8s-pod/pod-template-hash": "c5f4f9456",
"k8s-pod/release": "review-332321-ref-gezybs",
"k8s-pod/chart": "webservice-4.12.0",
"compute.googleapis.com/resource_name": "gke-review-apps-e2-16cpu-64gb-preemp-35813f67-7vd5"
},
"logName": "projects/gitlab-review-apps/logs/stdout",
"receiveTimestamp": "2021-06-30T12:09:15.027696839Z"
}
The error happened at 2021-06-30T12:09:10.003Z and there are no Gitaly logs at that time. The previous Gitaly log was at 2021-06-30T12:08:18.01Z and then the next wasn't until 2021-06-30T12:11:48.780758814Z.
The 12:08 log entry was a warning about certificates that always appears when a Gitaly container starts. And the 12:11 log entry usually immediately follows. So it looks like the container started and then was unavailable for 3 minutes.
There were actually a few gaps in Gitaly availability for that review app around that time. These CPU/memory usage graphs show the gaps:
The bottom line is usage and the lines above are requests and limits. It looks like there were no usage metrics recorded during those periods.
The audit logs show a few endpoint updates at that time: https://cloudlogging.app.goo.gl/PPdgPZikQHTpVmRX8
The updates show the IP changing, so that might explain why the service is unavailable at those times.
Questions
- Is the cause of the `failed to connect to all addresses` errors the lack of access to the Gitaly service following an endpoint update?
- If so, what causes the endpoint update, and can anything be done to prevent it or mitigate the consequences?
- Why does Gitaly restart? GCP audit logs do not seem to show Gitaly pods being destroyed and recreated.
