Sidekiq high-urgency-cpu-bound job Ci::CreateDownstreamPipelineWorker not meeting performance targets
Summary
On GitLab Dedicated, running 18.0.x at the time of writing, we've had a number of incidents caused by slow execution of Ci::CreateDownstreamPipelineWorker, which is marked as high urgency and CPU bound.
Even without any saturation on the infrastructure side, these jobs do not meet the requirements for the high job urgency.
For this example tenant (outer_tomato_gull, inc-2103), over a span of 7 days, 8% of Ci::CreateDownstreamPipelineWorker jobs took longer than 10 s to complete, missing the performance target. A sketch of how this share can be derived from the structured Sidekiq logs is shown below.
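A minimal sketch of the measurement, assuming the structured Sidekiq logs are available locally as newline-delimited JSON (the file name is hypothetical; the field names match the sample entry further down):

```ruby
# Sketch: estimate the share of Ci::CreateDownstreamPipelineWorker jobs
# that exceed their execution target, from newline-delimited Sidekiq JSON logs.
require "json"

LOG_FILE = "sidekiq_current.ndjson" # hypothetical local export of the structured logs

total = 0
slow = 0

File.foreach(LOG_FILE) do |line|
  entry = JSON.parse(line) rescue nil
  next unless entry.is_a?(Hash)
  next unless entry["class"] == "Ci::CreateDownstreamPipelineWorker"
  next unless entry["job_status"] == "done"

  total += 1
  # high-urgency workers carry a 10 s execution target (target_duration_s)
  slow += 1 if entry["duration_s"].to_f > entry.fetch("target_duration_s", 10).to_f
end

if total.positive?
  puts format("%.1f%% of %d completed jobs exceeded the execution target",
              100.0 * slow / total, total)
end
```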
Extract from one sample log message with durations (sensitive fields removed). Note that gitaly_duration_s (39.1 s across 8 Gitaly calls) accounts for nearly all of the 39.96 s duration_s, while cpu_s is only 0.62 s:
{
"time": "2025-07-03T19:31:19.779Z",
"kubernetes": {
"container_name": "sidekiq",
"labels": {
"app.kubernetes.io/name": "gitlab",
"app.kubernetes.io/version": "v18.0.2",
"chart": "sidekiq-9.0.2",
"queue-pod-name": "urgent-cpu-bound",
"release": "gitlab",
"shard": "urgent-cpu-bound"
},
"namespace_labels": {
"kubernetes.io/metadata.name": "default"
}
},
"severity": "INFO",
"retry": 3,
"queue": "urgent_cpu_bound",
"queue_namespace": "pipeline_default",
"class": "Ci::CreateDownstreamPipelineWorker",
"created_at": "2025-07-03T19:30:39.810Z",
"meta.caller_id": "PipelineProcessWorker",
"meta.feature_category": "continuous_integration",
"meta.root_caller_id": "PUT /api/:version/jobs/:id",
"worker_data_consistency": "always",
"worker_data_consistency_per_db": {
"main": "always",
"ci": "always"
},
"size_limiter": "validated",
"enqueued_at": "2025-07-03T19:30:39.814Z",
"job_size_bytes": 11,
"pid": 26,
"sidekiq_thread_name": "sidekiq.default/processor",
"message": "Ci::CreateDownstreamPipelineWorker JID-175e1b1a511c91295fecc5d9: done: 39.962475 sec",
"job_status": "done",
"queue_duration_s": 0.002044,
"scheduling_latency_s": 0.002062,
"gitaly_calls": 8,
"gitaly_duration_s": 39.126492,
"redis_calls": 97,
"redis_duration_s": 0.054063,
"redis_read_bytes": 1390,
"redis_write_bytes": 13246,
"redis_cache_calls": 12,
"redis_cache_duration_s": 0.011452,
"redis_cache_read_bytes": 162,
"redis_cache_write_bytes": 1143,
"redis_feature_flag_calls": 7,
"redis_feature_flag_duration_s": 0.009463,
"redis_feature_flag_read_bytes": 805,
"redis_feature_flag_write_bytes": 466,
"redis_queues_calls": 10,
"redis_queues_duration_s": 0.003411,
"redis_queues_read_bytes": 10,
"redis_queues_write_bytes": 7013,
"redis_queues_metadata_calls": 10,
"redis_queues_metadata_duration_s": 0.007274,
"redis_queues_metadata_read_bytes": 92,
"redis_queues_metadata_write_bytes": 1703,
"redis_repository_cache_calls": 38,
"redis_repository_cache_duration_s": 0.015581,
"redis_repository_cache_read_bytes": 238,
"redis_repository_cache_write_bytes": 1761,
"redis_shared_state_calls": 19,
"redis_shared_state_duration_s": 0.006219,
"redis_shared_state_read_bytes": 82,
"redis_shared_state_write_bytes": 952,
"redis_action_cable_calls": 1,
"redis_action_cable_duration_s": 0.000663,
"redis_action_cable_read_bytes": 1,
"redis_action_cable_write_bytes": 208,
"db_count": 176,
"db_write_count": 22,
"db_cached_count": 23,
"db_txn_count": 2,
"db_replica_txn_count": 0,
"db_primary_txn_count": 0,
"db_replica_count": 0,
"db_primary_count": 176,
"db_replica_write_count": 0,
"db_primary_write_count": 22,
"db_replica_cached_count": 0,
"db_primary_cached_count": 23,
"db_replica_wal_count": 0,
"db_primary_wal_count": 0,
"db_replica_wal_cached_count": 0,
"db_primary_wal_cached_count": 0,
"db_replica_txn_max_duration_s": 0,
"db_primary_txn_max_duration_s": 0,
"db_replica_txn_duration_s": 0,
"db_primary_txn_duration_s": 0,
"db_replica_duration_s": 0,
"db_primary_duration_s": 0.213,
"db_main_txn_count": 0,
"db_ci_txn_count": 2,
"db_main_replica_txn_count": 0,
"db_ci_replica_txn_count": 0,
"db_main_count": 123,
"db_ci_count": 53,
"db_main_replica_count": 0,
"db_ci_replica_count": 0,
"db_main_write_count": 1,
"db_ci_write_count": 21,
"db_main_replica_write_count": 0,
"db_ci_replica_write_count": 0,
"db_main_cached_count": 19,
"db_ci_cached_count": 4,
"db_main_replica_cached_count": 0,
"db_ci_replica_cached_count": 0,
"db_main_wal_count": 0,
"db_ci_wal_count": 0,
"db_main_replica_wal_count": 0,
"db_ci_replica_wal_count": 0,
"db_main_wal_cached_count": 0,
"db_ci_wal_cached_count": 0,
"db_main_replica_wal_cached_count": 0,
"db_ci_replica_wal_cached_count": 0,
"db_main_txn_max_duration_s": 0,
"db_ci_txn_max_duration_s": 0.2,
"db_main_replica_txn_max_duration_s": 0,
"db_ci_replica_txn_max_duration_s": 0,
"db_main_txn_duration_s": 0,
"db_ci_txn_duration_s": 0.205,
"db_main_replica_txn_duration_s": 0,
"db_ci_replica_txn_duration_s": 0,
"db_main_duration_s": 0.112,
"db_ci_duration_s": 0.102,
"db_main_replica_duration_s": 0,
"db_ci_replica_duration_s": 0,
"cpu_s": 0.621752,
"mem_objects": 389919,
"mem_bytes": 35556591,
"mem_mallocs": 68651,
"mem_total_bytes": 51153351,
"worker_id": "sidekiq_0",
"rate_limiting_gates": [
"pipelines_create"
],
"duration_s": 39.962475,
"completed_at": "2025-07-03T19:31:19.779Z",
"load_balancing_strategy": "primary",
"db_duration_s": 0.220311,
"urgency": "high",
"target_duration_s": 10,
"target_scheduling_latency_s": 10,
"@timestamp": "2025-07-03T19:31:19.779535298Z"
}
Impact
These slow jobs occupy the urgent-cpu-bound shard, slowing down and queuing other high-urgency CPU-bound jobs.
Recommendation
Expectation: these high-urgency jobs should complete within the 10 s execution target, or the worker should be downgraded to low urgency. A sketch of what that downgrade could look like follows.
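If the downgrade route is taken, a minimal sketch using GitLab's worker attributes DSL is shown below; the surrounding declarations and the perform signature are assumptions for illustration, not a copy of the current worker class.

```ruby
module Ci
  class CreateDownstreamPipelineWorker
    include ApplicationWorker

    # Illustrative declarations only; not a copy of the current class.
    data_consistency :always
    feature_category :continuous_integration
    worker_resource_boundary :cpu

    # Either the job must fit the 10 s high-urgency execution target,
    # or the classification should drop to low urgency, for example:
    urgency :low

    def perform(bridge_id)
      # existing downstream pipeline creation logic would remain unchanged
    end
  end
end
```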
