Geo: Personal snippets not syncing
Summary
Personal snippets are not syncing in 16.5+.
Geo::EventWorker
jobs are finishing with job_status: "fail"
and exception.message: "Cannot log a Geo updated event for PersonalSnippet"
.
Steps to reproduce
Initially reported in https://gitlab.com/gitlab-com/geo-customers/-/issues/186.
- Create a personal snippet
- Visit Admin > Geo > Sites and look at Snippet repositories progress bar. The number of synced will not increase by 1. The new snippet repo will perpetually be either "Queued" or "Failed".
Example Project
An example of the error in staging-ref https://console.cloud.google.com/logs/query;cursorTimestamp=2024-01-31T21:22:14.987Z;duration=PT1H;query=SEARCH%2528%22%60log_geo_updated_event%60%22%2529%0Atimestamp%3D%222024-01-31T21:22:14.987Z%22%0AinsertId%3D%2290qydjxgsqg7qumq%22?project=gitlab-staging-ref:
Click here to expand
{
"insertId": "90qydjxgsqg7qumq",
"jsonPayload": {
"db_geo_wal_cached_count": 0,
"exception.message": "Cannot log a Geo updated event for PersonalSnippet",
"redis_cache_duration_s": 0.047918,
"correlation_id": "52623e38c3f148aa226f5e63ae458b1b",
"db_ci_count": 2,
"db_main_wal_count": 2,
"worker_data_consistency": "always",
"db_main_replica_cached_count": 0,
"retried_at": "2024-01-31T21:21:38.801Z",
"gitaly_calls": 11,
"db_geo_wal_count": 0,
"exception.class": "RuntimeError",
"created_at": "2024-01-31T21:21:07.041Z",
"db_geo_duration_s": 0.002,
"mem_bytes": 6931702,
"dead": false,
"db_main_count": 10,
"redis_queues_duration_s": 0.003054,
"meta.client_id": "ip/",
"redis_read_bytes": 1990,
"db_replica_cached_count": 0,
"version": 0,
"db_ci_replica_wal_count": 0,
"redis_queues_read_bytes": 9,
"redis_cache_allowed_cross_slot_calls": 10,
"db_count": 17,
"db_ci_wal_count": 2,
"redis_shared_state_duration_s": 0.004124,
"db_write_count": 2,
"class": "Geo::EventWorker",
"cpu_s": 0.120456,
"redis_duration_s": 0.055096,
"db_replica_wal_cached_count": 0,
"db_geo_cached_count": 1,
"duration_s": 1.679356,
"db_ci_cached_count": 1,
"target_scheduling_latency_s": 60,
"redis_queues_write_bytes": 2192,
"meta.feature_category": "geo_replication",
"redis_shared_state_write_bytes": 391,
"db_replica_duration_s": 0,
"db_primary_wal_cached_count": 2,
"db_main_replica_count": 0,
"worker_id": "sidekiq_0",
"redis_shared_state_read_bytes": 3,
"db_main_replica_duration_s": 0,
"meta.root_caller_id": "Cronjob",
"redis_calls": 124,
"db_main_cached_count": 2,
"db_ci_replica_duration_s": 0,
"gitaly_duration_s": 1.511743,
"db_primary_count": 17,
"db_ci_replica_wal_cached_count": 0,
"retry_count": 1,
"db_ci_duration_s": 0.001,
"message": "Geo::EventWorker JID-9205bec8d761fb8b35f6616f: fail: 1.679356 sec",
"args": [
"snippet_repository",
"created",
"{\"model_record_id\"=>36665}"
],
"db_replica_count": 0,
"size_limiter": "validated",
"enqueued_at": "2024-01-31T21:22:13.305Z",
"mem_mallocs": 8802,
"db_primary_duration_s": 0.019,
"target_duration_s": 300,
"db_primary_wal_count": 4,
"db_main_wal_cached_count": 1,
"redis_queues_calls": 7,
"completed_at": "2024-01-31T21:22:14.986Z",
"scheduling_latency_s": 0.001577,
"db_ci_wal_cached_count": 1,
"queue": "default",
"db_cached_count": 4,
"db_main_replica_wal_count": 0,
"queue_namespace": "geo",
"load_balancing_strategy": "primary",
"db_duration_s": 0.026644,
"exception.backtrace": [
"ee/app/models/ee/repository.rb:97:in `log_geo_updated_event'",
"ee/app/models/ee/repository.rb:83:in `after_change_head'",
"app/models/repository.rb:1228:in `change_head'",
"ee/app/models/ee/repository.rb:123:in `update_root_ref'",
"ee/app/services/geo/framework_repository_sync_service.rb:200:in `update_root_ref'",
"ee/app/services/geo/framework_repository_sync_service.rb:92:in `fetch_repository'",
"ee/app/services/geo/framework_repository_sync_service.rb:41:in `sync_repository'",
"ee/app/services/geo/framework_repository_sync_service.rb:29:in `block in execute'",
"app/services/concerns/exclusive_lease_guard.rb:29:in `try_obtain_lease'",
"ee/app/services/geo/framework_repository_sync_service.rb:26:in `execute'",
"ee/app/models/concerns/geo/repository_replicator_strategy.rb:87:in `sync_repository'",
"ee/app/models/concerns/geo/repository_replicator_strategy.rb:66:in `resync'",
"ee/app/models/concerns/geo/repository_replicator_strategy.rb:45:in `consume_event_updated'",
"ee/app/models/concerns/geo/repository_replicator_strategy.rb:71:in `consume_event_created'",
"ee/lib/gitlab/geo/replicator.rb:269:in `consume'",
"ee/app/services/geo/event_service.rb:18:in `execute'",
"ee/app/workers/geo/event_worker.rb:15:in `perform'",
"lib/gitlab/sidekiq_middleware/skip_jobs.rb:49:in `call'",
"lib/gitlab/database/load_balancing/sidekiq_server_middleware.rb:29:in `call'",
"lib/gitlab/sidekiq_middleware/duplicate_jobs/strategies/until_executing.rb:16:in `perform'",
"lib/gitlab/sidekiq_middleware/duplicate_jobs/duplicate_job.rb:44:in `perform'",
"lib/gitlab/sidekiq_middleware/duplicate_jobs/server.rb:8:in `call'",
"lib/gitlab/sidekiq_middleware/concurrency_limit/middleware.rb:32:in `perform'",
"lib/gitlab/sidekiq_middleware/concurrency_limit/server.rb:8:in `call'",
"lib/click_house/migration_support/sidekiq_middleware.rb:7:in `call'",
"lib/gitlab/sidekiq_middleware/pause_control/strategies/base.rb:31:in `perform'",
"lib/gitlab/sidekiq_middleware/pause_control/strategy_handler.rb:22:in `perform'",
"lib/gitlab/sidekiq_middleware/pause_control/server.rb:8:in `call'",
"lib/gitlab/sidekiq_middleware/worker_context.rb:9:in `wrap_in_optional_context'",
"lib/gitlab/sidekiq_middleware/worker_context/server.rb:19:in `block in call'",
"lib/gitlab/application_context.rb:130:in `block in use'",
"lib/gitlab/application_context.rb:130:in `use'",
"lib/gitlab/application_context.rb:64:in `with_context'",
"lib/gitlab/sidekiq_middleware/worker_context/server.rb:17:in `call'",
"lib/gitlab/sidekiq_status/server_middleware.rb:7:in `call'",
"lib/gitlab/sidekiq_versioning/middleware.rb:9:in `call'",
"lib/gitlab/sidekiq_middleware/query_analyzer.rb:7:in `block in call'",
"lib/gitlab/database/query_analyzer.rb:40:in `within'",
"lib/gitlab/sidekiq_middleware/query_analyzer.rb:7:in `call'",
"lib/gitlab/sidekiq_middleware/admin_mode/server.rb:14:in `call'",
"lib/gitlab/sidekiq_middleware/instrumentation_logger.rb:9:in `call'",
"lib/gitlab/sidekiq_middleware/batch_loader.rb:7:in `call'",
"lib/gitlab/sidekiq_middleware/extra_done_log_metadata.rb:7:in `call'",
"lib/gitlab/sidekiq_middleware/request_store_middleware.rb:8:in `block in call'",
"lib/gitlab/sidekiq_middleware/request_store_middleware.rb:7:in `call'",
"lib/gitlab/sidekiq_middleware/server_metrics.rb:105:in `block in call'",
"lib/gitlab/sidekiq_middleware/server_metrics.rb:133:in `block in instrument'",
"lib/gitlab/metrics/background_transaction.rb:33:in `run'",
"lib/gitlab/sidekiq_middleware/server_metrics.rb:133:in `instrument'",
"lib/gitlab/sidekiq_middleware/server_metrics.rb:104:in `call'",
"lib/gitlab/sidekiq_middleware/monitor.rb:10:in `block in call'",
"lib/gitlab/sidekiq_daemon/monitor.rb:46:in `within_job'",
"lib/gitlab/sidekiq_middleware/monitor.rb:9:in `call'",
"lib/gitlab/sidekiq_middleware/size_limiter/server.rb:13:in `call'",
"lib/gitlab/sidekiq_logging/structured_logger.rb:21:in `call'"
],
"mem_objects": 38067,
"db_ci_replica_cached_count": 0,
"mem_total_bytes": 8454382,
"redis_cache_read_bytes": 1978,
"rate_limiting_gates": [],
"jid": "9205bec8d761fb8b35f6616f",
"redis_shared_state_calls": 3,
"redis_write_bytes": 12462,
"retry": 3,
"db_geo_count": 5,
"db_main_replica_wal_cached_count": 0,
"job_size_bytes": 58,
"job_status": "fail",
"pid": 25,
"db_primary_cached_count": 4,
"db_ci_replica_count": 0,
"urgency": "low",
"idempotency_key": "resque:gitlab:duplicate:default:928121ff4d7f26f430ac22b6c8d9008e7f85796c4b660374e43c8ffbb99c5645",
"redis_cache_calls": 114,
"db_replica_wal_count": 0,
"redis_allowed_cross_slot_calls": 10,
"redis_cache_write_bytes": 9879,
"status_expiration": 1800,
"meta.caller_id": "Geo::RepositoryRegistrySyncWorker",
"db_main_duration_s": 0.016
},
"resource": {
"type": "k8s_container",
"labels": {
"project_id": "gitlab-staging-ref",
"namespace_name": "default",
"location": "europe-west6-c",
"cluster_name": "staging-ref-3k-hybrid-eu",
"container_name": "sidekiq",
"pod_name": "gitlab-sidekiq-all-in-1-v2-5f4dd98d8-kwczn"
}
},
"timestamp": "2024-01-31T21:22:14.987Z",
"severity": "WARNING",
"labels": {
"k8s-pod/pod-template-hash": "5f4dd98d8",
"compute.googleapis.com/resource_name": "gke-staging-ref-3k-h-gl-sidekiq-20230-dbafd31d-1tbr",
"k8s-pod/release": "gitlab",
"k8s-pod/queue-pod-name": "all-in-1",
"k8s-pod/chart": "sidekiq-7.8.1",
"k8s-pod/app": "sidekiq",
"k8s-pod/heritage": "Helm"
},
"logName": "projects/gitlab-staging-ref/logs/stdout",
"receiveTimestamp": "2024-01-31T21:22:17.125664365Z"
}
Relevant logs and/or screenshots
Possible fixes
-
Add PersonalSnippet
to this case statement -
Skip the case statement like: return unless Gitlab::Geo.primary?
-
Backport as far as we can
Workaround
To sync the stuck snippet repositories now:
-
SSH into a machine running GitLab Puma/Sidekiq/Geo Log Cursor in the secondary Geo site.
-
Run
gitlab-rails console
-
Run
module EE; module Repository; def log_geo_updated_event; end; end; end def resync_stuck_personal_snippets registry_records = Geo::SnippetRepositoryRegistry.with_state(:started, :failed) puts "Found #{registry_records.count} registry records" registry_records.find_each { |r| print "Resyncing snippet ID: #{r.snippet_repository.snippet_id}..."; r.replicator.resync; puts "#{r.reload.state_name}" } end resync_stuck_personal_snippets
Or to run this in the background:
$ cat <<EOF >> resync.rb
module EE; module Repository; def log_geo_updated_event; end; end; end
def resync_stuck_personal_snippets
registry_records = Geo::SnippetRepositoryRegistry.with_state(:started, :failed)
puts "Found #{registry_records.count} registry records"
registry_records.find_each { |r| print "Resyncing snippet ID: #{r.snippet_repository.snippet_id}..."; r.replicator.resync; puts "#{r.reload.state_name}" }
end
resync_stuck_personal_snippets
EOF
$ nohup gitlab-rails runner "$(pwd)/resync.rb" &> resync.log &
[1] 272
$ tail -f resync.log