Skip to content

Sidekiq high-urgency-cpu-bound job Ci::CreateDownstreamPipelineWorker not meeting performance targets

Summary

On GitLab Dedicated, running 18.0.x at the time of writing, we've had a number of incidents caused by the slow execution of Ci::CreateDownstreamPipelineWorker, marked as high urgency CPU bound.

Even without any additional saturation on the infrastructure side, these jobs do not meet the performance requirements for the high job urgency class.

For this example tenant (outer_tomato_gull, inc-2103), over the span of 7 days, 8% of Ci::CreateDownstreamPipelineWorker jobs took > 10s to complete, not meeting performance targets:

[Screenshot: Screenshot_2025-07-03_at_15.59.44 — duration distribution for Ci::CreateDownstreamPipelineWorker over 7 days]

Extract from one sample log message with durations (sensitive fields removed):

{
  "time": "2025-07-03T19:31:19.779Z",
  "kubernetes": {
    "container_name": "sidekiq",
    "labels": {
      "app.kubernetes.io/name": "gitlab",
      "app.kubernetes.io/version": "v18.0.2",
      "chart": "sidekiq-9.0.2",
      "queue-pod-name": "urgent-cpu-bound",
      "release": "gitlab",
      "shard": "urgent-cpu-bound"
    },
    "namespace_labels": {
      "kubernetes.io/metadata.name": "default"
    }
  },
  "severity": "INFO",
  "retry": 3,
  "queue": "urgent_cpu_bound",
  "queue_namespace": "pipeline_default",
  "class": "Ci::CreateDownstreamPipelineWorker",
  "created_at": "2025-07-03T19:30:39.810Z",
  "meta.caller_id": "PipelineProcessWorker",
  "meta.feature_category": "continuous_integration",
  "meta.root_caller_id": "PUT /api/:version/jobs/:id",
  "worker_data_consistency": "always",
  "worker_data_consistency_per_db": {
    "main": "always",
    "ci": "always"
  },
  "size_limiter": "validated",
  "enqueued_at": "2025-07-03T19:30:39.814Z",
  "job_size_bytes": 11,
  "pid": 26,
  "sidekiq_thread_name": "sidekiq.default/processor",
  "message": "Ci::CreateDownstreamPipelineWorker JID-175e1b1a511c91295fecc5d9: done: 39.962475 sec",
  "job_status": "done",
  "queue_duration_s": 0.002044,
  "scheduling_latency_s": 0.002062,
  "gitaly_calls": 8,
  "gitaly_duration_s": 39.126492,
  "redis_calls": 97,
  "redis_duration_s": 0.054063,
  "redis_read_bytes": 1390,
  "redis_write_bytes": 13246,
  "redis_cache_calls": 12,
  "redis_cache_duration_s": 0.011452,
  "redis_cache_read_bytes": 162,
  "redis_cache_write_bytes": 1143,
  "redis_feature_flag_calls": 7,
  "redis_feature_flag_duration_s": 0.009463,
  "redis_feature_flag_read_bytes": 805,
  "redis_feature_flag_write_bytes": 466,
  "redis_queues_calls": 10,
  "redis_queues_duration_s": 0.003411,
  "redis_queues_read_bytes": 10,
  "redis_queues_write_bytes": 7013,
  "redis_queues_metadata_calls": 10,
  "redis_queues_metadata_duration_s": 0.007274,
  "redis_queues_metadata_read_bytes": 92,
  "redis_queues_metadata_write_bytes": 1703,
  "redis_repository_cache_calls": 38,
  "redis_repository_cache_duration_s": 0.015581,
  "redis_repository_cache_read_bytes": 238,
  "redis_repository_cache_write_bytes": 1761,
  "redis_shared_state_calls": 19,
  "redis_shared_state_duration_s": 0.006219,
  "redis_shared_state_read_bytes": 82,
  "redis_shared_state_write_bytes": 952,
  "redis_action_cable_calls": 1,
  "redis_action_cable_duration_s": 0.000663,
  "redis_action_cable_read_bytes": 1,
  "redis_action_cable_write_bytes": 208,
  "db_count": 176,
  "db_write_count": 22,
  "db_cached_count": 23,
  "db_txn_count": 2,
  "db_replica_txn_count": 0,
  "db_primary_txn_count": 0,
  "db_replica_count": 0,
  "db_primary_count": 176,
  "db_replica_write_count": 0,
  "db_primary_write_count": 22,
  "db_replica_cached_count": 0,
  "db_primary_cached_count": 23,
  "db_replica_wal_count": 0,
  "db_primary_wal_count": 0,
  "db_replica_wal_cached_count": 0,
  "db_primary_wal_cached_count": 0,
  "db_replica_txn_max_duration_s": 0,
  "db_primary_txn_max_duration_s": 0,
  "db_replica_txn_duration_s": 0,
  "db_primary_txn_duration_s": 0,
  "db_replica_duration_s": 0,
  "db_primary_duration_s": 0.213,
  "db_main_txn_count": 0,
  "db_ci_txn_count": 2,
  "db_main_replica_txn_count": 0,
  "db_ci_replica_txn_count": 0,
  "db_main_count": 123,
  "db_ci_count": 53,
  "db_main_replica_count": 0,
  "db_ci_replica_count": 0,
  "db_main_write_count": 1,
  "db_ci_write_count": 21,
  "db_main_replica_write_count": 0,
  "db_ci_replica_write_count": 0,
  "db_main_cached_count": 19,
  "db_ci_cached_count": 4,
  "db_main_replica_cached_count": 0,
  "db_ci_replica_cached_count": 0,
  "db_main_wal_count": 0,
  "db_ci_wal_count": 0,
  "db_main_replica_wal_count": 0,
  "db_ci_replica_wal_count": 0,
  "db_main_wal_cached_count": 0,
  "db_ci_wal_cached_count": 0,
  "db_main_replica_wal_cached_count": 0,
  "db_ci_replica_wal_cached_count": 0,
  "db_main_txn_max_duration_s": 0,
  "db_ci_txn_max_duration_s": 0.2,
  "db_main_replica_txn_max_duration_s": 0,
  "db_ci_replica_txn_max_duration_s": 0,
  "db_main_txn_duration_s": 0,
  "db_ci_txn_duration_s": 0.205,
  "db_main_replica_txn_duration_s": 0,
  "db_ci_replica_txn_duration_s": 0,
  "db_main_duration_s": 0.112,
  "db_ci_duration_s": 0.102,
  "db_main_replica_duration_s": 0,
  "db_ci_replica_duration_s": 0,
  "cpu_s": 0.621752,
  "mem_objects": 389919,
  "mem_bytes": 35556591,
  "mem_mallocs": 68651,
  "mem_total_bytes": 51153351,
  "worker_id": "sidekiq_0",
  "rate_limiting_gates": [
    "pipelines_create"
  ],
  "duration_s": 39.962475,
  "completed_at": "2025-07-03T19:31:19.779Z",
  "load_balancing_strategy": "primary",
  "db_duration_s": 0.220311,
  "urgency": "high",
  "target_duration_s": 10,
  "target_scheduling_latency_s": 10,
  "@timestamp": "2025-07-03T19:31:19.779535298Z"
}

Impact

These long-running jobs occupy worker threads on the urgent-cpu-bound shard, slowing down and queuing other high-urgency CPU-bound jobs behind them. Note that in the sample log above, nearly the entire 39.96 s duration is spent in Gitaly (gitaly_duration_s: 39.13 s across 8 calls), so the job appears to be I/O-bound on Gitaly rather than genuinely CPU-bound.

Recommendation

Expectation: These high-urgency jobs should complete within 10 s, or the worker should be downgraded to low urgency.

Verification

Edited by 🤖 GitLab Bot 🤖