Siphon fails during merge with deadline exceeded

I snapshotted a larger table (100GB) and the snapshot process failed with deadline error during the merge phase. The merge take a very long time which is expected for large tables.

After the built-in retries were finished, Siphon exited as expected. After restarting Siphon, the merge was finalized successfully: the merge code noticed that the replication of messages to the main stream is finished.

I think the problem is around the merge code where we probably up a context with timeout (around 10s).

Logs {"level":"debug","ts":"2025-04-05T16:03:46.130+0200","caller":"siphon/queue_controller.go:424","msg":"destination stream has event","applicationId":"gdk","name":"snapshot-complete"} {"level":"debug","ts":"2025-04-05T16:03:46.130+0200","caller":"siphon/queue_controller.go:429","msg":"destination stream has caught up","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:03:46.135+0200","caller":"siphon/queue_controller.go:332","msg":"original stream config","applicationId":"gdk","config":{"name":"replication_events","subjects":["replication_events.>"],"retention":"limits","max_consumers":-1,"max_msgs":-1,"max_bytes":-1,"discard":"old","max_age":0,"max_msgs_per_subject":-1,"max_msg_size":-1,"storage":"file","num_replicas":1,"duplicate_window":120000000000,"sources":[{"name":"snapshot_replication_events","subject_transforms":[{"src":"snapshot_replication_events.issues","dest":"replication_events.issues"}]}],"compression":"none","allow_direct":false,"mirror_direct":false,"consumer_limits":{}}} {"level":"debug","ts":"2025-04-05T16:03:46.139+0200","caller":"siphon/queue_controller.go:338","msg":"updating origin stream","applicationId":"gdk","origin":"replication_events"} {"level":"debug","ts":"2025-04-05T16:03:46.144+0200","caller":"siphon/queue_controller.go:343","msg":"updated origin stream","applicationId":"gdk","config":{"name":"replication_events","subjects":["replication_events.>"],"retention":"limits","max_consumers":-1,"max_msgs":-1,"max_bytes":-1,"discard":"old","max_age":0,"max_msgs_per_subject":-1,"max_msg_size":-1,"storage":"file","num_replicas":1,"duplicate_window":120000000000,"compression":"none","allow_direct":false,"mirror_direct":false,"consumer_limits":{}}} {"level":"debug","ts":"2025-04-05T16:03:46.149+0200","caller":"siphon/nats_queue.go:156","msg":"sent message with seq number","applicationId":"gdk","seq":1,"stream":"temp_replication_events","subject":"temp_replication_events.issues"} {"level":"info","ts":"2025-04-05T16:03:46.150+0200","caller":"siphon/queue_controller.go:301","msg":"original stream config","applicationId":"gdk","config":{"name":"replication_events","subjects":["replication_events.>"],"retention":"limits","max_consumers":-1,"max_msgs":-1,"max_bytes":-1,"discard":"old","max_age":0,"max_msgs_per_subject":-1,"max_msg_size":-1,"storage":"file","num_replicas":1,"duplicate_window":120000000000,"compression":"none","allow_direct":false,"mirror_direct":false,"consumer_limits":{}}} {"level":"debug","ts":"2025-04-05T16:03:46.150+0200","caller":"siphon/queue_controller.go:312","msg":"updating origin stream","applicationId":"gdk","origin":"replication_events"} {"level":"info","ts":"2025-04-05T16:03:47.028+0200","caller":"siphon/replication_event_manager.go:263","msg":"Send periodic status update to PostgreSQL with the last persisted LSN value","applicationId":"gdk","component":"replication-manager","LSN":"16/A0EE69F8"}

{"level":"debug","ts":"2025-04-05T16:03:52.032+0200","caller":"siphon/siphon.go:474","msg":"sending InProgress status to snapshot mesage","applicationId":"gdk","table":"issues","schema":"public","lsn":"16/A02FAE38"} {"level":"info","ts":"2025-04-05T16:03:53.028+0200","caller":"siphon/replication_event_manager.go:263","msg":"Send periodic status update to PostgreSQL with the last persisted LSN value","applicationId":"gdk","component":"replication-manager","LSN":"16/A0EE7148"} {"level":"debug","ts":"2025-04-05T16:03:53.028+0200","caller":"siphon/siphon.go:178","msg":"run heartbeat query success, duration: 159.352µs","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:03:54.028+0200","caller":"siphon/siphon.go:178","msg":"run heartbeat query success, duration: 228.63µs","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:03:55.031+0200","caller":"siphon/siphon.go:178","msg":"run heartbeat query success, duration: 2.779636ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:03:56.028+0200","caller":"siphon/siphon.go:178","msg":"run heartbeat query success, duration: 217.679µs","applicationId":"gdk"} {"level":"info","ts":"2025-04-05T16:03:56.028+0200","caller":"siphon/replication_event_manager.go:263","msg":"Send periodic status update to PostgreSQL with the last persisted LSN value","applicationId":"gdk","component":"replication-manager","LSN":"16/A0EE74F0"} {"level":"error","ts":"2025-04-05T16:03:56.151+0200","caller":"siphon/replication_event_manager.go:517","msg":"snapshot error","applicationId":"gdk","component":"replication-manager","error":"updating stream context deadline exceeded","stacktrace":"gitlab.com/gitlab-org/analytics-section/siphon/pkg/siphon.(*ReplicationEventHandler).handleMerge.func1\n\t/home/dev/siphon/pkg/siphon/replication_event_manager.go:517"} {"level":"debug","ts":"2025-04-05T16:03:57.028+0200","caller":"siphon/siphon.go:178","msg":"run heartbeat query success, duration: 142.773µs","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:03:57.033+0200","caller":"siphon/siphon.go:474","msg":"sending InProgress status to snapshot mesage","applicationId":"gdk","table":"issues","schema":"public","lsn":"16/A02FAE38"} {"level":"info","ts":"2025-04-05T16:03:57.068+0200","caller":"siphon/siphon.go:440","msg":"shutting down prometheus HTTP server","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:03:57.068+0200","caller":"siphon/siphon.go:161","msg":"stopping heartbeat: ","applicationId":"gdk","error":"context canceled"} {"level":"warn","ts":"2025-04-05T16:03:57.068+0200","caller":"siphon/replication_manager.go:383","msg":"returning from runMessageReceiver","applicationId":"gdk","component":"replication-manager"} {"level":"info","ts":"2025-04-05T16:03:57.068+0200","caller":"siphon/siphon.go:470","msg":"stopping message notification loop: ","applicationId":"gdk","error":"context canceled"} {"level":"info","ts":"2025-04-05T16:03:57.069+0200","caller":"siphon/replication_manager.go:433","msg":"context done, shutting down snapshot handler","applicationId":"gdk","component":"replication-manager"} {"level":"warn","ts":"2025-04-05T16:03:57.068+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"ReplicationLoop: updating stream context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:04:00.072+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:00.091+0200","caller":"siphon/siphon.go:122","msg":"checking that logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:00.093+0200","caller":"siphon/siphon.go:134","msg":"logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:00.093+0200","caller":"siphon/siphon.go:137","msg":"setting PostgreSQL lock_timeout to 73ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:00.094+0200","caller":"siphon/siphon.go:144","msg":"obtaining advisory lock: 1","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:04:10.114+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"setting up NATS streams failed: creating stream: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:04:13.125+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:13.166+0200","caller":"siphon/siphon.go:122","msg":"checking that logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:13.170+0200","caller":"siphon/siphon.go:134","msg":"logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:13.171+0200","caller":"siphon/siphon.go:137","msg":"setting PostgreSQL lock_timeout to 140ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:13.171+0200","caller":"siphon/siphon.go:144","msg":"obtaining advisory lock: 1","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:04:23.197+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"setting up NATS streams failed: creating stream: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:04:26.207+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:26.232+0200","caller":"siphon/siphon.go:122","msg":"checking that logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:26.238+0200","caller":"siphon/siphon.go:134","msg":"logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:26.238+0200","caller":"siphon/siphon.go:137","msg":"setting PostgreSQL lock_timeout to 136ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:26.238+0200","caller":"siphon/siphon.go:144","msg":"obtaining advisory lock: 1","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:04:36.402+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"setting up NATS streams failed: creating stream: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:04:39.536+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:40.925+0200","caller":"siphon/siphon.go:122","msg":"checking that logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:41.640+0200","caller":"siphon/siphon.go:134","msg":"logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:41.678+0200","caller":"siphon/siphon.go:137","msg":"setting PostgreSQL lock_timeout to 148ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:04:41.703+0200","caller":"siphon/siphon.go:144","msg":"obtaining advisory lock: 1","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:04:56.035+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"setting up NATS streams failed: creating stream: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:05:02.328+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:15.791+0200","caller":"siphon/siphon.go:122","msg":"checking that logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:15.800+0200","caller":"siphon/siphon.go:134","msg":"logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:15.801+0200","caller":"siphon/siphon.go:137","msg":"setting PostgreSQL lock_timeout to 132ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:15.801+0200","caller":"siphon/siphon.go:144","msg":"obtaining advisory lock: 1","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:05:25.907+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"setting up NATS streams failed: creating stream: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:05:28.924+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:29.146+0200","caller":"siphon/siphon.go:122","msg":"checking that logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:29.176+0200","caller":"siphon/siphon.go:134","msg":"logical replication is enabled","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:29.176+0200","caller":"siphon/siphon.go:137","msg":"setting PostgreSQL lock_timeout to 57ms","applicationId":"gdk"} {"level":"debug","ts":"2025-04-05T16:05:29.185+0200","caller":"siphon/siphon.go:144","msg":"obtaining advisory lock: 1","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:05:40.423+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"setting up NATS streams failed: creating stream: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:05:44.637+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:06:11.362+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"version check failed: could not determine PG major version: could not find PostgreSQL version: timeout: context already done: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:06:21.820+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"} {"level":"warn","ts":"2025-04-05T16:06:37.391+0200","caller":"common/retry_utils.go:74","msg":"siphon failed, restarting...","applicationId":"gdk","error":"version check failed: could not determine PG major version: could not find PostgreSQL version: timeout: context deadline exceeded"} {"level":"debug","ts":"2025-04-05T16:06:48.282+0200","caller":"cmd/main.go:84","msg":"starting Siphon with application ID: gdk","applicationId":"gdk"}