
Sidekiq high-urgency-cpu-bound job PostReceive not meeting performance targets

Summary

On GitLab Dedicated, running 18.0.x at the time of writing, we've had a number of incidents caused by slow execution of PostReceive, a job marked as high-urgency and CPU-bound.

Even without any further saturation on the infrastructure side, these jobs do not meet the requirements for high job urgency (completion within the 10-second target). The urgency and resource boundary that route a job to this target are declared on the worker class, as sketched below.
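A minimal sketch, assuming the standard ApplicationWorker attributes; the real PostReceive worker contains far more logic, and only the routing attributes are shown here:

# Simplified sketch of how a GitLab Sidekiq worker declares its routing
# attributes; not a verbatim copy of the real PostReceive worker.
class PostReceive
  include ApplicationWorker

  data_consistency :sticky                   # "worker_data_consistency": "sticky" in the log
  feature_category :source_code_management   # "meta.feature_category" in the log
  urgency :high                              # high urgency: ~10s execution and scheduling targets
  worker_resource_boundary :cpu              # routed to the urgent-cpu-bound shard
end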

For this example tenant (outer_tomato_gull, inc-2103), we found a number of PostReceive jobs taking > 10s over a 7-day span. Execution time appears to be spent heavily in CPU and Redis:

[Screenshot: Screenshot_2025-07-03_at_16.29.36]

Extract from one sample log message with durations (sensitive fields removed):

{
  "time": "2025-07-03T19:29:13.884Z",
  "kubernetes": {
    "container_name": "sidekiq",
    "namespace_name": "default",
    "labels": {
      "app.kubernetes.io/name": "gitlab",
      "app.kubernetes.io/version": "v18.0.2",
      "chart": "sidekiq-9.0.2",
      "queue-pod-name": "urgent-cpu-bound",
      "release": "gitlab",
      "shard": "urgent-cpu-bound"
    }
  },
  "severity": "INFO",
  "retry": 3,
  "queue": "urgent_cpu_bound",
  "class": "PostReceive",
  "created_at": "2025-07-03T19:28:54.596Z",
  "meta.caller_id": "POST /api/:version/internal/post_receive",
  "meta.remote_ip": "44.238.132.161",
  "meta.feature_category": "source_code_management",
  "meta.root_caller_id": "POST /api/:version/internal/post_receive",
  "worker_data_consistency": "sticky",
  "worker_data_consistency_per_db": {
    "main": "sticky",
    "ci": "sticky"
  },
  "wal_locations": {},
  "wal_location_sources": {
    "main": "primary",
    "ci": "primary"
  },
  "size_limiter": "validated",
  "enqueued_at": "2025-07-03T19:28:54.597Z",
  "job_size_bytes": 133,
  "pid": 26,
  "sidekiq_tid": "1o6hjy",
  "sidekiq_thread_name": "sidekiq.default/processor",
  "message": "PostReceive JID-5eeef3b7eb00d91bdd075e93: done: 19.28643 sec",
  "job_status": "done",
  "queue_duration_s": 0.001222,
  "scheduling_latency_s": 0.001244,
  "gitaly_calls": 39,
  "gitaly_duration_s": 1.418332,
  "redis_calls": 35710,
  "redis_allowed_cross_slot_calls": 4,
  "redis_duration_s": 9.827100999999999,
  "redis_read_bytes": 42341,
  "redis_write_bytes": 4399305,
  "redis_cache_calls": 28,
  "redis_cache_duration_s": 0.02686,
  "redis_cache_read_bytes": 4672,
  "redis_cache_write_bytes": 2218,
  "redis_feature_flag_calls": 13,
  "redis_feature_flag_duration_s": 0.013384,
  "redis_feature_flag_read_bytes": 1517,
  "redis_feature_flag_write_bytes": 850,
  "redis_queues_calls": 32,
  "redis_queues_duration_s": 0.025917,
  "redis_queues_read_bytes": 32,
  "redis_queues_write_bytes": 71585,
  "redis_queues_metadata_calls": 17,
  "redis_queues_metadata_duration_s": 0.021508,
  "redis_queues_metadata_read_bytes": 32,
  "redis_queues_metadata_write_bytes": 3231,
  "redis_repository_cache_calls": 35566,
  "redis_repository_cache_allowed_cross_slot_calls": 4,
  "redis_repository_cache_duration_s": 9.697947,
  "redis_repository_cache_read_bytes": 35831,
  "redis_repository_cache_write_bytes": 4318619,
  "redis_shared_state_calls": 53,
  "redis_shared_state_duration_s": 0.04116,
  "redis_shared_state_read_bytes": 256,
  "redis_shared_state_write_bytes": 2594,
  "redis_action_cable_calls": 1,
  "redis_action_cable_duration_s": 0.000325,
  "redis_action_cable_read_bytes": 1,
  "redis_action_cable_write_bytes": 208,
  "db_count": 114,
  "db_write_count": 14,
  "db_cached_count": 11,
  "db_txn_count": 4,
  "db_replica_txn_count": 0,
  "db_primary_txn_count": 0,
  "db_replica_count": 0,
  "db_primary_count": 114,
  "db_replica_write_count": 0,
  "db_primary_write_count": 14,
  "db_replica_cached_count": 0,
  "db_primary_cached_count": 11,
  "db_replica_wal_count": 0,
  "db_primary_wal_count": 0,
  "db_replica_wal_cached_count": 0,
  "db_primary_wal_cached_count": 0,
  "db_replica_txn_max_duration_s": 0,
  "db_primary_txn_max_duration_s": 0,
  "db_replica_txn_duration_s": 0,
  "db_primary_txn_duration_s": 0,
  "db_replica_duration_s": 0,
  "db_primary_duration_s": 0.245,
  "db_main_txn_count": 3,
  "db_ci_txn_count": 1,
  "db_main_replica_txn_count": 0,
  "db_ci_replica_txn_count": 0,
  "db_main_count": 92,
  "db_ci_count": 22,
  "db_main_replica_count": 0,
  "db_ci_replica_count": 0,
  "db_main_write_count": 6,
  "db_ci_write_count": 8,
  "db_main_replica_write_count": 0,
  "db_ci_replica_write_count": 0,
  "db_main_cached_count": 10,
  "db_ci_cached_count": 1,
  "db_main_replica_cached_count": 0,
  "db_ci_replica_cached_count": 0,
  "db_main_wal_count": 0,
  "db_ci_wal_count": 0,
  "db_main_replica_wal_count": 0,
  "db_ci_replica_wal_count": 0,
  "db_main_wal_cached_count": 0,
  "db_ci_wal_cached_count": 0,
  "db_main_replica_wal_cached_count": 0,
  "db_ci_replica_wal_cached_count": 0,
  "db_main_txn_max_duration_s": 0.009,
  "db_ci_txn_max_duration_s": 0.078,
  "db_main_replica_txn_max_duration_s": 0,
  "db_ci_replica_txn_max_duration_s": 0,
  "db_main_txn_duration_s": 0.015,
  "db_ci_txn_duration_s": 0.078,
  "db_main_replica_txn_duration_s": 0,
  "db_ci_replica_txn_duration_s": 0,
  "db_main_duration_s": 0.146,
  "db_ci_duration_s": 0.099,
  "db_main_replica_duration_s": 0,
  "db_ci_replica_duration_s": 0,
  "cpu_s": 11.623523,
  "mem_objects": 8659455,
  "mem_bytes": 342552154,
  "mem_mallocs": 2106311,
  "mem_total_bytes": 688930354,
  "worker_id": "sidekiq_0",
  "rate_limiting_gates": [
    "pipelines_create"
  ],
  "exclusive_lock_requested_count": 1,
  "exclusive_lock_wait_duration_s": 0.000764007,
  "exclusive_lock_hold_duration_s": 0.260136944,
  "duration_s": 19.28643,
  "completed_at": "2025-07-03T19:29:13.884Z",
  "load_balancing_strategy": "primary_no_wal",
  "db_duration_s": 0.266344,
  "urgency": "high",
  "target_duration_s": 10,
  "target_scheduling_latency_s": 10,
}
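
Pulling the offenders for the 7-day scan mentioned above out of the structured Sidekiq logs is straightforward. A minimal sketch, assuming newline-delimited JSON log files; paths and field availability may differ per environment:

# Minimal sketch: scan newline-delimited JSON Sidekiq logs and print
# completed PostReceive jobs that exceeded their duration target.
# File locations and field names are assumptions based on the sample above.
require 'json'

ARGF.each_line do |line|
  job = JSON.parse(line) rescue next

  next unless job['class'] == 'PostReceive' && job['job_status'] == 'done'

  target = (job['target_duration_s'] || 10).to_f
  next unless job['duration_s'].to_f > target

  puts format('%s %s duration=%.1fs cpu=%.1fs redis=%.1fs gitaly=%.1fs db=%.1fs',
              job['time'],
              job['message'].to_s[/JID-\h+/],
              job['duration_s'].to_f,
              job['cpu_s'].to_f,
              job['redis_duration_s'].to_f,
              job['gitaly_duration_s'].to_f,
              job['db_duration_s'].to_f)
end

Run as, for example, ruby slow_post_receive.rb sidekiq/current (the path is an assumption). In the sample above, redis_repository_cache_duration_s (9.7 s over 35,566 calls) plus cpu_s (11.6 s) account for nearly all of the 19.3 s wall time.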

Impact

These slow executions delay other high-urgency CPU-bound jobs, which queue behind them on the urgent-cpu-bound shard.

Recommendation

Expectation: these high-urgency jobs should complete within 10 seconds, or the worker should be downgraded to low urgency.
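If downgrading is the preferred route, the change itself is a single attribute on the worker class. A sketch of the mechanics only; whether low urgency is acceptable for PostReceive semantics is a separate question:

class PostReceive
  include ApplicationWorker

  # Relaxes the execution and scheduling targets from the high-urgency
  # 10-second budget to the looser low-urgency thresholds.
  urgency :low

  # ... remainder of the worker unchanged ...
end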

Verification
