Sidekiq high-urgency-cpu-bound job PostReceive not meeting performance targets
Summary
On GitLab Dedicated, running 18.0.x at the time of writing, we've had a number of incidents caused by slow execution of PostReceive, which is marked as high urgency and CPU bound.
Even without any additional saturation on the infrastructure side, these jobs do not meet the requirements for the high job urgency.
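For context, urgency, resource boundary, and data consistency are declared on the worker class through GitLab's ApplicationWorker attributes. A simplified sketch of what this looks like for PostReceive, reconstructed from the log fields in the sample below rather than copied from the current source:

# app/workers/post_receive.rb -- simplified sketch, not the exact current source
class PostReceive
  include ApplicationWorker

  sidekiq_options retry: 3                  # matches "retry": 3 in the sample log
  data_consistency :sticky                  # "worker_data_consistency": "sticky"
  feature_category :source_code_management  # "meta.feature_category"
  urgency :high                             # high urgency carries the 10 s execution/queueing targets
  worker_resource_boundary :cpu             # together with high urgency, places it on the urgent-cpu-bound shard

  def perform(gl_repository, identifier, changes, push_options = {})
    # processes the pushed refs: hooks, cache refreshes, pipeline creation, ...
  end
end

The 10 s targets surface in the log line as target_duration_s and target_scheduling_latency_s.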
For this example tenant (outer_tomato_gull, inc-2103), we found a number of PostReceive jobs taking > 10 s over a 7-day span. Execution time appears to be spent heavily in CPU and Redis.
Extract from one sample log message with durations (sensitive fields removed):
{
"time": "2025-07-03T19:29:13.884Z",
"kubernetes": {
"container_name": "sidekiq",
"namespace_name": "default",
"labels": {
"app.kubernetes.io/name": "gitlab",
"app.kubernetes.io/version": "v18.0.2",
"chart": "sidekiq-9.0.2",
"queue-pod-name": "urgent-cpu-bound",
"release": "gitlab",
"shard": "urgent-cpu-bound"
}
},
"severity": "INFO",
"retry": 3,
"queue": "urgent_cpu_bound",
"class": "PostReceive",
"created_at": "2025-07-03T19:28:54.596Z",
"meta.caller_id": "POST /api/:version/internal/post_receive",
"meta.remote_ip": "44.238.132.161",
"meta.feature_category": "source_code_management",
"meta.root_caller_id": "POST /api/:version/internal/post_receive",
"worker_data_consistency": "sticky",
"worker_data_consistency_per_db": {
"main": "sticky",
"ci": "sticky"
},
"wal_locations": {},
"wal_location_sources": {
"main": "primary",
"ci": "primary"
},
"size_limiter": "validated",
"enqueued_at": "2025-07-03T19:28:54.597Z",
"job_size_bytes": 133,
"pid": 26,
"sidekiq_tid": "1o6hjy",
"sidekiq_thread_name": "sidekiq.default/processor",
"message": "PostReceive JID-5eeef3b7eb00d91bdd075e93: done: 19.28643 sec",
"job_status": "done",
"queue_duration_s": 0.001222,
"scheduling_latency_s": 0.001244,
"gitaly_calls": 39,
"gitaly_duration_s": 1.418332,
"redis_calls": 35710,
"redis_allowed_cross_slot_calls": 4,
"redis_duration_s": 9.827100999999999,
"redis_read_bytes": 42341,
"redis_write_bytes": 4399305,
"redis_cache_calls": 28,
"redis_cache_duration_s": 0.02686,
"redis_cache_read_bytes": 4672,
"redis_cache_write_bytes": 2218,
"redis_feature_flag_calls": 13,
"redis_feature_flag_duration_s": 0.013384,
"redis_feature_flag_read_bytes": 1517,
"redis_feature_flag_write_bytes": 850,
"redis_queues_calls": 32,
"redis_queues_duration_s": 0.025917,
"redis_queues_read_bytes": 32,
"redis_queues_write_bytes": 71585,
"redis_queues_metadata_calls": 17,
"redis_queues_metadata_duration_s": 0.021508,
"redis_queues_metadata_read_bytes": 32,
"redis_queues_metadata_write_bytes": 3231,
"redis_repository_cache_calls": 35566,
"redis_repository_cache_allowed_cross_slot_calls": 4,
"redis_repository_cache_duration_s": 9.697947,
"redis_repository_cache_read_bytes": 35831,
"redis_repository_cache_write_bytes": 4318619,
"redis_shared_state_calls": 53,
"redis_shared_state_duration_s": 0.04116,
"redis_shared_state_read_bytes": 256,
"redis_shared_state_write_bytes": 2594,
"redis_action_cable_calls": 1,
"redis_action_cable_duration_s": 0.000325,
"redis_action_cable_read_bytes": 1,
"redis_action_cable_write_bytes": 208,
"db_count": 114,
"db_write_count": 14,
"db_cached_count": 11,
"db_txn_count": 4,
"db_replica_txn_count": 0,
"db_primary_txn_count": 0,
"db_replica_count": 0,
"db_primary_count": 114,
"db_replica_write_count": 0,
"db_primary_write_count": 14,
"db_replica_cached_count": 0,
"db_primary_cached_count": 11,
"db_replica_wal_count": 0,
"db_primary_wal_count": 0,
"db_replica_wal_cached_count": 0,
"db_primary_wal_cached_count": 0,
"db_replica_txn_max_duration_s": 0,
"db_primary_txn_max_duration_s": 0,
"db_replica_txn_duration_s": 0,
"db_primary_txn_duration_s": 0,
"db_replica_duration_s": 0,
"db_primary_duration_s": 0.245,
"db_main_txn_count": 3,
"db_ci_txn_count": 1,
"db_main_replica_txn_count": 0,
"db_ci_replica_txn_count": 0,
"db_main_count": 92,
"db_ci_count": 22,
"db_main_replica_count": 0,
"db_ci_replica_count": 0,
"db_main_write_count": 6,
"db_ci_write_count": 8,
"db_main_replica_write_count": 0,
"db_ci_replica_write_count": 0,
"db_main_cached_count": 10,
"db_ci_cached_count": 1,
"db_main_replica_cached_count": 0,
"db_ci_replica_cached_count": 0,
"db_main_wal_count": 0,
"db_ci_wal_count": 0,
"db_main_replica_wal_count": 0,
"db_ci_replica_wal_count": 0,
"db_main_wal_cached_count": 0,
"db_ci_wal_cached_count": 0,
"db_main_replica_wal_cached_count": 0,
"db_ci_replica_wal_cached_count": 0,
"db_main_txn_max_duration_s": 0.009,
"db_ci_txn_max_duration_s": 0.078,
"db_main_replica_txn_max_duration_s": 0,
"db_ci_replica_txn_max_duration_s": 0,
"db_main_txn_duration_s": 0.015,
"db_ci_txn_duration_s": 0.078,
"db_main_replica_txn_duration_s": 0,
"db_ci_replica_txn_duration_s": 0,
"db_main_duration_s": 0.146,
"db_ci_duration_s": 0.099,
"db_main_replica_duration_s": 0,
"db_ci_replica_duration_s": 0,
"cpu_s": 11.623523,
"mem_objects": 8659455,
"mem_bytes": 342552154,
"mem_mallocs": 2106311,
"mem_total_bytes": 688930354,
"worker_id": "sidekiq_0",
"rate_limiting_gates": [
"pipelines_create"
],
"exclusive_lock_requested_count": 1,
"exclusive_lock_wait_duration_s": 0.000764007,
"exclusive_lock_hold_duration_s": 0.260136944,
"duration_s": 19.28643,
"completed_at": "2025-07-03T19:29:13.884Z",
"load_balancing_strategy": "primary_no_wal",
"db_duration_s": 0.266344,
"urgency": "high",
"target_duration_s": 10,
"target_scheduling_latency_s": 10,
}
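For triage it helps to pull the duration fields out of such a line and compare them against the total. A small hypothetical helper (illustrative only, not part of GitLab), using the field names from the sample above:

require 'json'

# Hypothetical helper: summarise where a slow Sidekiq job spent its time,
# based on the duration fields of a structured "done" log line.
def breakdown(line)
  log = JSON.parse(line)
  {
    total_s:      log['duration_s'],
    cpu_s:        log['cpu_s'],
    redis_s:      log['redis_duration_s'],
    redis_calls:  log['redis_calls'],
    repo_cache_s: log['redis_repository_cache_duration_s'],
    gitaly_s:     log['gitaly_duration_s'],
    db_s:         log['db_duration_s']
  }
end

# sample_post_receive.json is a hypothetical file holding the log line above
pp breakdown(File.read('sample_post_receive.json'))

For the sample this gives roughly 19.3 s total, 11.6 s of CPU and 9.8 s in Redis (9.7 s of it against the repository-cache instance, across ~35.5k of the ~35.7k Redis calls), versus only 1.4 s in Gitaly and 0.27 s in the database. The buckets are not mutually exclusive, so they do not sum to the total, but the repository-cache traffic clearly dominates the non-CPU time.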
Impact
These slow executions occupy Sidekiq threads on the urgent-cpu-bound shard, slowing down and queueing other high-urgency CPU-bound jobs.
Recommendation
Expectation: these high-urgency jobs should either be brought within the 10 s execution target (target_duration_s in the log above) or be downgraded to low urgency.
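If the work itself cannot be reduced (for example, by cutting down the ~35k repository-cache Redis calls), the fallback is the downgrade sketched below, which removes the 10 s targets at the cost of relaxed latency expectations. Illustrative only, not an agreed change:

# Sketch of the downgrade option; whether :low is acceptable for PostReceive
# needs review, since it also relaxes queueing-latency expectations for pushes.
class PostReceive
  include ApplicationWorker

  feature_category :source_code_management
  worker_resource_boundary :cpu
  urgency :low # previously :high; no longer held to the 10 s execution target
end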
Verification