Geo: Gap handling should account for replication lag

Currently with https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/6640, if we don't see a gap in the event log for an hour, we drop the hole.

However, if the database is lagging, then we may want to retain these holes a little longer.

I attempted to add a replication lag checking in Gitlab::Database in https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/6640 (also see diff below), but when EE load balancing is in use this is a little trickier: we really want to know if the database that we are querying is lagging or not. Alternatively, we could just look at all replicas and ensure that all of them are up-to-date.

As mentioned by Yorick in that merge request, we can take the work in https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/20720 and add the maximum replication lag among all the secondaries.

diff --git a/ee/lib/gitlab/geo/event_gap_tracking.rb b/ee/lib/gitlab/geo/event_gap_tracking.rb
index 172595d4bb7..f46389f8489 100644
--- a/ee/lib/gitlab/geo/event_gap_tracking.rb
+++ b/ee/lib/gitlab/geo/event_gap_tracking.rb
@@ -47,6 +47,8 @@ module Gitlab
 
       # accepts a block that should return whether the event was handled
       def fill_gaps
+        return unless replication_lag_below_threshold?
+
         with_redis do |redis|
           redis.zremrangebyscore(GEO_EVENT_LOG_GAPS, '-inf', outdated_timestamp)
 
@@ -87,6 +89,15 @@ module Gitlab
       def outdated_timestamp
         (Time.now - GAP_OUTDATED_PERIOD).to_i
       end
+
+      def replication_lag_below_threshold?
+        byebug
+        if (lag_time = Gitlab::Database.replication_lag_seconds)
+          lag_time <= GAP_OUTDATED_PERIOD
+        else
+          false
+        end
+      end
     end
   end
 end
diff --git a/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb b/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb
index c98b4e48922..eae0fe9092c 100644
--- a/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb
+++ b/ee/spec/lib/gitlab/geo/event_gap_tracking_spec.rb
@@ -100,6 +100,18 @@ describe Gitlab::Geo::EventGapTracking, :clean_gitlab_redis_cache do
 
       expect(read_gaps).to be_empty
     end
+
+    it 'keeps gaps if replication lag is too high' do
+      gap_tracking.check!(event_id_with_gap)
+
+      expect(Gitlab::Database).to receive(:replication_lag_time).and_return(70.minutes)
+
+      Timecop.travel(62.minutes) do
+        expect { |blk| gap_tracking.fill_gaps(&blk) }.not_to yield_with_args(anything)
+      end
+
+      expect(read_gaps.count).to eq(1)
+    end
   end
 
   describe '#track_gaps' do
diff --git a/lib/gitlab/database.rb b/lib/gitlab/database.rb
index af8798cba8b..2b181291032 100644
--- a/lib/gitlab/database.rb
+++ b/lib/gitlab/database.rb
@@ -57,6 +57,32 @@ module Gitlab
       !self.db_read_only?
     end
 
+    # Returns the replication lag time of this secondary in seconds as a
+    # float.
+    #
+    # This method will return nil if no lag time could be calculated.
+    def self.replication_lag_seconds
+      return unless postgresql?
+
+      byebug
+      # Obtain the replication lag in seconds
+      lag = ActiveRecord::Base.connection.execute(self.pg_replication_lag_query).first.fetch('lag')
+
+      lag.present? ? lag.to_f : lag
+    end
+
+    def self.pg_replication_lag_query
+      <<~SQL
+          SELECT CASE
+                 WHEN #{Gitlab::Database.pg_last_wal_receive_lsn}() = #{Gitlab::Database.pg_last_wal_receive_lsn}()
+                  THEN 0
+                 ELSE
+                  EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::float
+                 END
+                 AS lag
+        SQL
+    end
+
     def self.version
       @version ||= database_version.match(/\A(?:PostgreSQL |)([^\s]+).*\z/)[1]
     end
diff --git a/spec/lib/gitlab/database_spec.rb b/spec/lib/gitlab/database_spec.rb
index 81a2a08b17b..8037fe0dd1d 100644
--- a/spec/lib/gitlab/database_spec.rb
+++ b/spec/lib/gitlab/database_spec.rb
@@ -520,6 +520,37 @@ describe Gitlab::Database do
     end
   end
 
+  describe '.replication_lag_seconds' do
+    context 'when using PostgreSQL' do
+      let(:sql) { Gitlab::Database.pg_replication_lag_query }
+
+      before do
+        allow(ActiveRecord::Base.connection).to receive(:execute).and_call_original
+        allow(described_class).to receive(:postgresql?).and_return(true)
+      end
+
+      it 'reports unknown lag' do
+      #  allow(ActiveRecord::Base.connection).to receive(:execute).with(sql).and_return([{ "lag" => nil }])
+
+        expect(described_class.replication_lag_seconds).to be_nil
+      end
+
+      it 'reports replication lag' do
+        allow(ActiveRecord::Base.connection).to receive(:execute).with(sql).and_return([{ "lag" => "10.0" }])
+
+        expect(described_class.replication_lag_seconds).to eq(10.0)
+      end
+    end
+
+    context 'when using MySQL' do
+      before do
+        expect(described_class).to receive(:postgresql?).and_return(false)
+      end
+
+      it { expect(described_class.replication_lag_seconds).to be_falsey }
+    end
+  end
+
   describe '#sanitize_timestamp' do
     let(:max_timestamp) { Time.at((1 << 31) - 1) }
Assignee Loading
Time tracking Loading