Skip to content

Geo: Personal snippets not syncing

Summary

Personal snippets are not syncing in 16.5+.

Geo::EventWorker jobs are finishing with job_status: "fail" and exception.message: "Cannot log a Geo updated event for PersonalSnippet".

Steps to reproduce

Initially reported in https://gitlab.com/gitlab-com/geo-customers/-/issues/186.

  • Create a personal snippet
  • Visit Admin > Geo > Sites and look at Snippet repositories progress bar. The number of synced will not increase by 1. The new snippet repo will perpetually be either "Queued" or "Failed".

Example Project

An example of the error in staging-ref https://console.cloud.google.com/logs/query;cursorTimestamp=2024-01-31T21:22:14.987Z;duration=PT1H;query=SEARCH%2528%22%60log_geo_updated_event%60%22%2529%0Atimestamp%3D%222024-01-31T21:22:14.987Z%22%0AinsertId%3D%2290qydjxgsqg7qumq%22?project=gitlab-staging-ref:

Click here to expand
{
  "insertId": "90qydjxgsqg7qumq",
  "jsonPayload": {
    "db_geo_wal_cached_count": 0,
    "exception.message": "Cannot log a Geo updated event for PersonalSnippet",
    "redis_cache_duration_s": 0.047918,
    "correlation_id": "52623e38c3f148aa226f5e63ae458b1b",
    "db_ci_count": 2,
    "db_main_wal_count": 2,
    "worker_data_consistency": "always",
    "db_main_replica_cached_count": 0,
    "retried_at": "2024-01-31T21:21:38.801Z",
    "gitaly_calls": 11,
    "db_geo_wal_count": 0,
    "exception.class": "RuntimeError",
    "created_at": "2024-01-31T21:21:07.041Z",
    "db_geo_duration_s": 0.002,
    "mem_bytes": 6931702,
    "dead": false,
    "db_main_count": 10,
    "redis_queues_duration_s": 0.003054,
    "meta.client_id": "ip/",
    "redis_read_bytes": 1990,
    "db_replica_cached_count": 0,
    "version": 0,
    "db_ci_replica_wal_count": 0,
    "redis_queues_read_bytes": 9,
    "redis_cache_allowed_cross_slot_calls": 10,
    "db_count": 17,
    "db_ci_wal_count": 2,
    "redis_shared_state_duration_s": 0.004124,
    "db_write_count": 2,
    "class": "Geo::EventWorker",
    "cpu_s": 0.120456,
    "redis_duration_s": 0.055096,
    "db_replica_wal_cached_count": 0,
    "db_geo_cached_count": 1,
    "duration_s": 1.679356,
    "db_ci_cached_count": 1,
    "target_scheduling_latency_s": 60,
    "redis_queues_write_bytes": 2192,
    "meta.feature_category": "geo_replication",
    "redis_shared_state_write_bytes": 391,
    "db_replica_duration_s": 0,
    "db_primary_wal_cached_count": 2,
    "db_main_replica_count": 0,
    "worker_id": "sidekiq_0",
    "redis_shared_state_read_bytes": 3,
    "db_main_replica_duration_s": 0,
    "meta.root_caller_id": "Cronjob",
    "redis_calls": 124,
    "db_main_cached_count": 2,
    "db_ci_replica_duration_s": 0,
    "gitaly_duration_s": 1.511743,
    "db_primary_count": 17,
    "db_ci_replica_wal_cached_count": 0,
    "retry_count": 1,
    "db_ci_duration_s": 0.001,
    "message": "Geo::EventWorker JID-9205bec8d761fb8b35f6616f: fail: 1.679356 sec",
    "args": [
      "snippet_repository",
      "created",
      "{\"model_record_id\"=>36665}"
    ],
    "db_replica_count": 0,
    "size_limiter": "validated",
    "enqueued_at": "2024-01-31T21:22:13.305Z",
    "mem_mallocs": 8802,
    "db_primary_duration_s": 0.019,
    "target_duration_s": 300,
    "db_primary_wal_count": 4,
    "db_main_wal_cached_count": 1,
    "redis_queues_calls": 7,
    "completed_at": "2024-01-31T21:22:14.986Z",
    "scheduling_latency_s": 0.001577,
    "db_ci_wal_cached_count": 1,
    "queue": "default",
    "db_cached_count": 4,
    "db_main_replica_wal_count": 0,
    "queue_namespace": "geo",
    "load_balancing_strategy": "primary",
    "db_duration_s": 0.026644,
    "exception.backtrace": [
      "ee/app/models/ee/repository.rb:97:in `log_geo_updated_event'",
      "ee/app/models/ee/repository.rb:83:in `after_change_head'",
      "app/models/repository.rb:1228:in `change_head'",
      "ee/app/models/ee/repository.rb:123:in `update_root_ref'",
      "ee/app/services/geo/framework_repository_sync_service.rb:200:in `update_root_ref'",
      "ee/app/services/geo/framework_repository_sync_service.rb:92:in `fetch_repository'",
      "ee/app/services/geo/framework_repository_sync_service.rb:41:in `sync_repository'",
      "ee/app/services/geo/framework_repository_sync_service.rb:29:in `block in execute'",
      "app/services/concerns/exclusive_lease_guard.rb:29:in `try_obtain_lease'",
      "ee/app/services/geo/framework_repository_sync_service.rb:26:in `execute'",
      "ee/app/models/concerns/geo/repository_replicator_strategy.rb:87:in `sync_repository'",
      "ee/app/models/concerns/geo/repository_replicator_strategy.rb:66:in `resync'",
      "ee/app/models/concerns/geo/repository_replicator_strategy.rb:45:in `consume_event_updated'",
      "ee/app/models/concerns/geo/repository_replicator_strategy.rb:71:in `consume_event_created'",
      "ee/lib/gitlab/geo/replicator.rb:269:in `consume'",
      "ee/app/services/geo/event_service.rb:18:in `execute'",
      "ee/app/workers/geo/event_worker.rb:15:in `perform'",
      "lib/gitlab/sidekiq_middleware/skip_jobs.rb:49:in `call'",
      "lib/gitlab/database/load_balancing/sidekiq_server_middleware.rb:29:in `call'",
      "lib/gitlab/sidekiq_middleware/duplicate_jobs/strategies/until_executing.rb:16:in `perform'",
      "lib/gitlab/sidekiq_middleware/duplicate_jobs/duplicate_job.rb:44:in `perform'",
      "lib/gitlab/sidekiq_middleware/duplicate_jobs/server.rb:8:in `call'",
      "lib/gitlab/sidekiq_middleware/concurrency_limit/middleware.rb:32:in `perform'",
      "lib/gitlab/sidekiq_middleware/concurrency_limit/server.rb:8:in `call'",
      "lib/click_house/migration_support/sidekiq_middleware.rb:7:in `call'",
      "lib/gitlab/sidekiq_middleware/pause_control/strategies/base.rb:31:in `perform'",
      "lib/gitlab/sidekiq_middleware/pause_control/strategy_handler.rb:22:in `perform'",
      "lib/gitlab/sidekiq_middleware/pause_control/server.rb:8:in `call'",
      "lib/gitlab/sidekiq_middleware/worker_context.rb:9:in `wrap_in_optional_context'",
      "lib/gitlab/sidekiq_middleware/worker_context/server.rb:19:in `block in call'",
      "lib/gitlab/application_context.rb:130:in `block in use'",
      "lib/gitlab/application_context.rb:130:in `use'",
      "lib/gitlab/application_context.rb:64:in `with_context'",
      "lib/gitlab/sidekiq_middleware/worker_context/server.rb:17:in `call'",
      "lib/gitlab/sidekiq_status/server_middleware.rb:7:in `call'",
      "lib/gitlab/sidekiq_versioning/middleware.rb:9:in `call'",
      "lib/gitlab/sidekiq_middleware/query_analyzer.rb:7:in `block in call'",
      "lib/gitlab/database/query_analyzer.rb:40:in `within'",
      "lib/gitlab/sidekiq_middleware/query_analyzer.rb:7:in `call'",
      "lib/gitlab/sidekiq_middleware/admin_mode/server.rb:14:in `call'",
      "lib/gitlab/sidekiq_middleware/instrumentation_logger.rb:9:in `call'",
      "lib/gitlab/sidekiq_middleware/batch_loader.rb:7:in `call'",
      "lib/gitlab/sidekiq_middleware/extra_done_log_metadata.rb:7:in `call'",
      "lib/gitlab/sidekiq_middleware/request_store_middleware.rb:8:in `block in call'",
      "lib/gitlab/sidekiq_middleware/request_store_middleware.rb:7:in `call'",
      "lib/gitlab/sidekiq_middleware/server_metrics.rb:105:in `block in call'",
      "lib/gitlab/sidekiq_middleware/server_metrics.rb:133:in `block in instrument'",
      "lib/gitlab/metrics/background_transaction.rb:33:in `run'",
      "lib/gitlab/sidekiq_middleware/server_metrics.rb:133:in `instrument'",
      "lib/gitlab/sidekiq_middleware/server_metrics.rb:104:in `call'",
      "lib/gitlab/sidekiq_middleware/monitor.rb:10:in `block in call'",
      "lib/gitlab/sidekiq_daemon/monitor.rb:46:in `within_job'",
      "lib/gitlab/sidekiq_middleware/monitor.rb:9:in `call'",
      "lib/gitlab/sidekiq_middleware/size_limiter/server.rb:13:in `call'",
      "lib/gitlab/sidekiq_logging/structured_logger.rb:21:in `call'"
    ],
    "mem_objects": 38067,
    "db_ci_replica_cached_count": 0,
    "mem_total_bytes": 8454382,
    "redis_cache_read_bytes": 1978,
    "rate_limiting_gates": [],
    "jid": "9205bec8d761fb8b35f6616f",
    "redis_shared_state_calls": 3,
    "redis_write_bytes": 12462,
    "retry": 3,
    "db_geo_count": 5,
    "db_main_replica_wal_cached_count": 0,
    "job_size_bytes": 58,
    "job_status": "fail",
    "pid": 25,
    "db_primary_cached_count": 4,
    "db_ci_replica_count": 0,
    "urgency": "low",
    "idempotency_key": "resque:gitlab:duplicate:default:928121ff4d7f26f430ac22b6c8d9008e7f85796c4b660374e43c8ffbb99c5645",
    "redis_cache_calls": 114,
    "db_replica_wal_count": 0,
    "redis_allowed_cross_slot_calls": 10,
    "redis_cache_write_bytes": 9879,
    "status_expiration": 1800,
    "meta.caller_id": "Geo::RepositoryRegistrySyncWorker",
    "db_main_duration_s": 0.016
  },
  "resource": {
    "type": "k8s_container",
    "labels": {
      "project_id": "gitlab-staging-ref",
      "namespace_name": "default",
      "location": "europe-west6-c",
      "cluster_name": "staging-ref-3k-hybrid-eu",
      "container_name": "sidekiq",
      "pod_name": "gitlab-sidekiq-all-in-1-v2-5f4dd98d8-kwczn"
    }
  },
  "timestamp": "2024-01-31T21:22:14.987Z",
  "severity": "WARNING",
  "labels": {
    "k8s-pod/pod-template-hash": "5f4dd98d8",
    "compute.googleapis.com/resource_name": "gke-staging-ref-3k-h-gl-sidekiq-20230-dbafd31d-1tbr",
    "k8s-pod/release": "gitlab",
    "k8s-pod/queue-pod-name": "all-in-1",
    "k8s-pod/chart": "sidekiq-7.8.1",
    "k8s-pod/app": "sidekiq",
    "k8s-pod/heritage": "Helm"
  },
  "logName": "projects/gitlab-staging-ref/logs/stdout",
  "receiveTimestamp": "2024-01-31T21:22:17.125664365Z"
}

Relevant logs and/or screenshots

Possible fixes

  • Add PersonalSnippet to this case statement
  • Skip the case statement like: return unless Gitlab::Geo.primary?
  • Backport as far as we can

Workaround

To sync the stuck snippet repositories now:

  1. SSH into a machine running GitLab Puma/Sidekiq/Geo Log Cursor in the secondary Geo site.

  2. Run gitlab-rails console

  3. Run

    module EE; module Repository; def log_geo_updated_event; end; end; end
    def resync_stuck_personal_snippets
      registry_records = Geo::SnippetRepositoryRegistry.with_state(:started, :failed)
      puts "Found #{registry_records.count} registry records"
      registry_records.find_each { |r| print "Resyncing snippet ID: #{r.snippet_repository.snippet_id}..."; r.replicator.resync; puts "#{r.reload.state_name}" }
    end
    resync_stuck_personal_snippets

Or to run this in the background:

$ cat <<EOF >> resync.rb
module EE; module Repository; def log_geo_updated_event; end; end; end
def resync_stuck_personal_snippets
  registry_records = Geo::SnippetRepositoryRegistry.with_state(:started, :failed)
  puts "Found #{registry_records.count} registry records"
  registry_records.find_each { |r| print "Resyncing snippet ID: #{r.snippet_repository.snippet_id}..."; r.replicator.resync; puts "#{r.reload.state_name}" }
end
resync_stuck_personal_snippets
EOF
$ nohup gitlab-rails runner "$(pwd)/resync.rb" &> resync.log &
[1] 272
$ tail -f resync.log
Edited by Michael Kozono