Geo: Gap handling should account for replication lag
Currently with https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/6640, if we don't see a gap in the event log for an hour, we drop the hole.
However, if the database is lagging, then we may want to retain these holes a little longer.
I attempted to add a replication lag checking in Gitlab::Database in https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/6640 (also see diff below), but when EE load balancing is in use this is a little trickier: we really want to know if the database that we are querying is lagging or not. Alternatively, we could just look at all replicas and ensure that all of them are up-to-date.
As mentioned by Yorick in that merge request, we can take the work in https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/20720 and add the maximum replication lag among all the secondaries.
diff --git a/ee/lib/gitlab/geo/event_gap_tracking.rb b/ee/lib/gitlab/geo/event_gap_tracking.rb
index 172595d4bb7..f46389f8489 100644
--- a/ee/lib/gitlab/geo/event_gap_tracking.rb
+++ b/ee/lib/gitlab/geo/event_gap_tracking.rb
@@ -47,6 +47,8 @@ module Gitlab
# accepts a block that should return whether the event was handled
def fill_gaps
+ return unless replication_lag_below_threshold?
+
with_redis do |redis|
redis.zremrangebyscore(GEO_EVENT_LOG_GAPS, '-inf', outdated_timestamp)
@@ -87,6 +89,15 @@ module Gitlab
def outdated_timestamp
(Time.now - GAP_OUTDATED_PERIOD).to_i
end
+
+ def replication_lag_below_threshold?
+ byebug
+ if (lag_time = Gitlab::Database.replication_lag_seconds)
+ lag_time <= GAP_OUTDATED_PERIOD
+ else
+ false
+ end
+ end
end
end
end
diff --git a/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb b/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb
index c98b4e48922..eae0fe9092c 100644
--- a/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb
+++ b/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb
@@ -100,6 +100,18 @@ describe Gitlab::Geo::EventGapTracking, :clean_gitlab_redis_cache do
expect(read_gaps).to be_empty
end
+
+ it 'keeps gaps if replication lag is too high' do
+ gap_tracking.check!(event_id_with_gap)
+
+ expect(Gitlab::Database).to receive(:replication_lag_time).and_return(70.minutes)
+
+ Timecop.travel(62.minutes) do
+ expect { |blk| gap_tracking.fill_gaps(&blk) }.not_to yield_with_args(anything)
+ end
+
+ expect(read_gaps.count).to eq(1)
+ end
end
describe '#track_gaps' do
diff --git a/lib/gitlab/database.rb b/lib/gitlab/database.rb
index af8798cba8b..2b181291032 100644
--- a/lib/gitlab/database.rb
+++ b/lib/gitlab/database.rb
@@ -57,6 +57,32 @@ module Gitlab
!self.db_read_only?
end
+ # Returns the replication lag time of this secondary in seconds as a
+ # float.
+ #
+ # This method will return nil if no lag time could be calculated.
+ def self.replication_lag_seconds
+ return unless postgresql?
+
+ byebug
+ # Obtain the replication lag in seconds
+ lag = ActiveRecord::Base.connection.execute(self.pg_replication_lag_query).first.fetch('lag')
+
+ lag.present? ? lag.to_f : lag
+ end
+
+ def self.pg_replication_lag_query
+ <<~SQL
+ SELECT CASE
+ WHEN #{Gitlab::Database.pg_last_wal_receive_lsn}() = #{Gitlab::Database.pg_last_wal_receive_lsn}()
+ THEN 0
+ ELSE
+ EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::float
+ END
+ AS lag
+ SQL
+ end
+
def self.version
@version ||= database_version.match(/\A(?:PostgreSQL |)([^\s]+).*\z/)[1]
end
diff --git a/spec/lib/gitlab/database_spec.rb b/spec/lib/gitlab/database_spec.rb
index 81a2a08b17b..8037fe0dd1d 100644
--- a/spec/lib/gitlab/database_spec.rb
+++ b/spec/lib/gitlab/database_spec.rb
@@ -520,6 +520,37 @@ describe Gitlab::Database do
end
end
+ describe '.replication_lag_seconds' do
+ context 'when using PostgreSQL' do
+ let(:sql) { Gitlab::Database.pg_replication_lag_query }
+
+ before do
+ allow(ActiveRecord::Base.connection).to receive(:execute).and_call_original
+ allow(described_class).to receive(:postgresql?).and_return(true)
+ end
+
+ it 'reports unknown lag' do
+ # allow(ActiveRecord::Base.connection).to receive(:execute).with(sql).and_return([{ "lag" => nil }])
+
+ expect(described_class.replication_lag_seconds).to be_nil
+ end
+
+ it 'reports replication lag' do
+ allow(ActiveRecord::Base.connection).to receive(:execute).with(sql).and_return([{ "lag" => "10.0" }])
+
+ expect(described_class.replication_lag_seconds).to eq(10.0)
+ end
+ end
+
+ context 'when using MySQL' do
+ before do
+ expect(described_class).to receive(:postgresql?).and_return(false)
+ end
+
+ it { expect(described_class.replication_lag_seconds).to be_falsey }
+ end
+ end
+
describe '#sanitize_timestamp' do
let(:max_timestamp) { Time.at((1 << 31) - 1) }