Geo: LFS files not actually replicated

From #36628 (comment 246286539):

I've found that Geo sometimes thinks that a file is replicated even though it doesn't exist on disk. I think this is a hangover from the cutover from the old Geo replication methodology to the current one, but I'm finding a few LFS files that Geo::FileRegistry thinks are replicated but which are not on disk, e.g.:

irb(main):002:0> LfsObject.find_by_oid('7923fdfe304391b6722c8108c75b2cba9b5ee96ddb256455338e60b94fb345d0').id
=> 819581

irb(main):003:0> LfsObject.find_by_oid('7923fdfe304391b6722c8108c75b2cba9b5ee96ddb256455338e60b94fb345d0').file.path
=> "/gitlab/gitlab-lfs/79/23/fdfe304391b6722c8108c75b2cba9b5ee96ddb256455338e60b94fb345d0"

root@git-mirror-hkdc-04:~ # ls -l /gitlab/gitlab-lfs/79/23/fdfe304391b6722c8108c75b2cba9b5ee96ddb256455338e60b94fb345d0
ls: cannot access /gitlab/gitlab-lfs/79/23/fdfe304391b6722c8108c75b2cba9b5ee96ddb256455338e60b94fb345d0: No such file or directory

irb(main):003:0> Geo::FileRegistry.where(file_type: "lfs", file_id: 819581)
=> #<ActiveRecord::Relation [#<Geo::LfsObjectRegistry id: 718216, file_type: "lfs", file_id: 819581, bytes: 206180, sha256: nil, created_at: "2019-08-22 13:59:52", success: true, retry_count: 0, retry_at: nil, missing_on_primary: false>]>

Just deleting the Geo::FileRegistry entry is enough to trigger Geo to resync it.

I wrote this rake task to clean up this condition:

 # Scans Geo::FileRegistry entries of type "lfs" and reports every entry whose
 # LfsObject file is not present on the local disk. Must run on a Geo secondary.
 #
 # Environment variables:
 #   DELETE      - when set to anything other than an empty string or
 #                 0/false/no/n/off, registry entries with a missing file are
 #                 deleted (per the issue, this makes Geo resync them).
 #   FROM_LFS_ID - first Geo::FileRegistry id to check (inclusive);
 #                 defaults to the smallest LFS registry id.
 #   TO_LFS_ID   - last Geo::FileRegistry id to check (inclusive);
 #                 defaults to the largest LFS registry id.
 #
 # Raises ArgumentError when FROM_LFS_ID / TO_LFS_ID are not decimal integers.
 task run_lfs_object_validator: :environment do
   abort 'This is not a secondary node' unless Gitlab::Geo.secondary?

   lfs_registries = Geo::FileRegistry.where(file_type: 'lfs')

   # ENV values are always strings: "false"/"0"/"" must not enable deletion.
   delete_flag = ENV['DELETE'].to_s.strip.downcase
   delete_missing = !['', '0', 'false', 'no', 'n', 'off'].include?(delete_flag)

   # Convert explicitly (base 10) so the id arithmetic below works on Integers.
   from_lfs_id = ENV['FROM_LFS_ID'] ? Integer(ENV['FROM_LFS_ID'], 10) : lfs_registries.minimum(:id)
   to_lfs_id = ENV['TO_LFS_ID'] ? Integer(ENV['TO_LFS_ID'], 10) : lfs_registries.maximum(:id)

   puts "  Geo Primary LFS objects: #{LfsObject.count}"
   puts "Geo Secondary LFS objects: #{lfs_registries.count}"

   # minimum/maximum return nil when the registry holds no LFS entries.
   if from_lfs_id.nil? || to_lfs_id.nil?
     puts 'Found 0 missing LFS objects'
     next
   end

   puts "Checking from Geo File Registry id #{from_lfs_id} to #{to_lfs_id}"

   batch_size = 1000
   total_count = 0
   batch_start = from_lfs_id

   # to_lfs_id is inclusive, so keep going while the batch start has not passed it.
   while batch_start <= to_lfs_id
     # Exclusive upper bound of the current batch.
     batch_end = [batch_start + batch_size, to_lfs_id + 1].min

     puts "Checking Geo:FileRegistry ids from #{batch_start} to #{batch_end}"

     # One query for the registry rows and one for the LFS objects per batch,
     # instead of several lookups per registry entry.
     registry_rows = lfs_registries
       .where('id >= ? AND id < ?', batch_start, batch_end)
       .pluck(:id, :file_id)
     lfs_objects = LfsObject.where(id: registry_rows.map(&:last)).index_by(&:id)

     missing_registry_ids = []
     registry_rows.each do |registry_id, lfs_object_id|
       lfs_file_path = lfs_objects[lfs_object_id]&.file&.path

       # Without a record or a path we cannot tell whether the file is missing,
       # so report it and leave the registry entry alone.
       unless lfs_file_path
         puts " LFS id #{registry_id} has no LfsObject record or file path, skipping"
         next
       end

       next if File.file?(lfs_file_path)

       total_count += 1
       missing_registry_ids << registry_id
       puts " LFS id #{registry_id} not found on disk"
     end

     if delete_missing && missing_registry_ids.any?
       Geo::FileRegistry.where(id: missing_registry_ids).delete_all
     end

     batch_start = batch_end
   end

   puts "Found #{total_count} missing LFS objects"
 end
Assignee Loading
Time tracking Loading