Commit 86ec52e0 authored by Douwe Maan's avatar Douwe Maan

Merge branch 'fj-40401-support-import-lfs-objects' into 'master'

Support LFS objects when creating a project by import

Closes #40401

See merge request gitlab-org/gitlab-ce!18871
parents 0dd7563b e8f49b4b
......@@ -956,6 +956,10 @@ class Repository
blob_data_at(sha, path)
end
def lfsconfig_for(sha)
blob_data_at(sha, '.lfsconfig')
end
def fetch_ref(source_repository, source_ref:, target_ref:)
raw_repository.fetch_ref(source_repository.raw_repository, source_ref: source_ref, target_ref: target_ref)
end
......
......@@ -3,7 +3,7 @@ class BaseService
attr_accessor :project, :current_user, :params
def initialize(project, user, params = {})
def initialize(project, user = nil, params = {})
@project, @current_user, @params = project, user, params.dup
end
......
......@@ -17,6 +17,8 @@ module Projects
def execute
add_repository_to_project
download_lfs_objects
import_data
success
......@@ -37,7 +39,7 @@ module Projects
# We should skip the repository for a GitHub import or GitLab project import,
# because these importers fetch the project repositories for us.
return if has_importer? && importer_class.try(:imports_repository?)
return if importer_imports_repository?
if unknown_url?
# In this case, we only want to import issues, not a repository.
......@@ -73,6 +75,27 @@ module Projects
end
end
def download_lfs_objects
# In this case, we only want to import issues
return if unknown_url?
# If it has its own repository importer, it has to implements its own lfs import download
return if importer_imports_repository?
return unless project.lfs_enabled?
oids_to_download = Projects::LfsPointers::LfsImportService.new(project).execute
download_service = Projects::LfsPointers::LfsDownloadService.new(project)
oids_to_download.each do |oid, link|
download_service.execute(oid, link)
end
rescue => e
# Right now, to avoid aborting the importing process, we silently fail
# if any exception raises.
Rails.logger.error("The Lfs import process failed. #{e.message}")
end
def import_data
return unless has_importer?
......@@ -98,5 +121,9 @@ module Projects
def unknown_url?
project.import_url == Project::UNKNOWN_IMPORT_URL
end
def importer_imports_repository?
has_importer? && importer_class.try(:imports_repository?)
end
end
end
# This service lists the download link from a remote source based on the
# oids provided
module Projects
module LfsPointers
class LfsDownloadLinkListService < BaseService
DOWNLOAD_ACTION = 'download'.freeze
DownloadLinksError = Class.new(StandardError)
DownloadLinkNotFound = Class.new(StandardError)
attr_reader :remote_uri
def initialize(project, remote_uri: nil)
super(project)
@remote_uri = remote_uri
end
# This method accepts two parameters:
# - oids: hash of oids to query. The structure is { lfs_file_oid => lfs_file_size }
#
# Returns a hash with the structure { lfs_file_oids => download_link }
def execute(oids)
return {} unless project&.lfs_enabled? && remote_uri && oids.present?
get_download_links(oids)
end
private
def get_download_links(oids)
response = Gitlab::HTTP.post(remote_uri,
body: request_body(oids),
headers: headers)
raise DownloadLinksError, response.message unless response.success?
parse_response_links(response['objects'])
end
def parse_response_links(objects_response)
objects_response.each_with_object({}) do |entry, link_list|
begin
oid = entry['oid']
link = entry.dig('actions', DOWNLOAD_ACTION, 'href')
raise DownloadLinkNotFound unless link
link_list[oid] = add_credentials(link)
rescue DownloadLinkNotFound, URI::InvalidURIError
Rails.logger.error("Link for Lfs Object with oid #{oid} not found or invalid.")
end
end
end
def request_body(oids)
{
operation: DOWNLOAD_ACTION,
objects: oids.map { |oid, size| { oid: oid, size: size } }
}.to_json
end
def headers
{
'Accept' => LfsRequest::CONTENT_TYPE,
'Content-Type' => LfsRequest::CONTENT_TYPE
}.freeze
end
def add_credentials(link)
uri = URI.parse(link)
if should_add_credentials?(uri)
uri.user = remote_uri.user
uri.password = remote_uri.password
end
uri.to_s
end
# The download link can be a local url or an object storage url
# If the download link has the some host as the import url then
# we add the same credentials because we may need them
def should_add_credentials?(link_uri)
url_credentials? && link_uri.host == remote_uri.host
end
def url_credentials?
remote_uri.user.present? || remote_uri.password.present?
end
end
end
end
# This service downloads and links lfs objects from a remote URL
module Projects
module LfsPointers
class LfsDownloadService < BaseService
def execute(oid, url)
return unless project&.lfs_enabled? && oid.present? && url.present?
return if LfsObject.exists?(oid: oid)
sanitized_uri = Gitlab::UrlSanitizer.new(url)
with_tmp_file(oid) do |file|
size = download_and_save_file(file, sanitized_uri)
lfs_object = LfsObject.new(oid: oid, size: size, file: file)
project.all_lfs_objects << lfs_object
end
rescue StandardError => e
Rails.logger.error("LFS file with oid #{oid} could't be downloaded from #{sanitized_uri.sanitized_url}: #{e.message}")
end
private
def download_and_save_file(file, sanitized_uri)
IO.copy_stream(open(sanitized_uri.sanitized_url, headers(sanitized_uri)), file)
end
def headers(sanitized_uri)
{}.tap do |headers|
credentials = sanitized_uri.credentials
if credentials[:user].present? || credentials[:password].present?
# Using authentication headers in the request
headers[:http_basic_authentication] = [credentials[:user], credentials[:password]]
end
end
end
def with_tmp_file(oid)
create_tmp_storage_dir
File.open(File.join(tmp_storage_dir, oid), 'w') { |file| yield file }
end
def create_tmp_storage_dir
FileUtils.makedirs(tmp_storage_dir) unless Dir.exist?(tmp_storage_dir)
end
def tmp_storage_dir
@tmp_storage_dir ||= File.join(storage_dir, 'tmp', 'download')
end
def storage_dir
@storage_dir ||= Gitlab.config.lfs.storage_path
end
end
end
end
# This service manages the whole worflow of discovering the Lfs files in a
# repository, linking them to the project and downloading (and linking) the non
# existent ones.
module Projects
module LfsPointers
class LfsImportService < BaseService
include Gitlab::Utils::StrongMemoize
HEAD_REV = 'HEAD'.freeze
LFS_ENDPOINT_PATTERN = /^\t?url\s*=\s*(.+)$/.freeze
LFS_BATCH_API_ENDPOINT = '/info/lfs/objects/batch'.freeze
LfsImportError = Class.new(StandardError)
def execute
return {} unless project&.lfs_enabled?
if external_lfs_endpoint?
# If the endpoint host is different from the import_url it means
# that the repo is using a third party service for storing the LFS files.
# In this case, we have to disable lfs in the project
disable_lfs!
return {}
end
get_download_links
rescue LfsDownloadLinkListService::DownloadLinksError => e
raise LfsImportError, "The LFS objects download list couldn't be imported. Error: #{e.message}"
end
private
def external_lfs_endpoint?
lfsconfig_endpoint_uri && lfsconfig_endpoint_uri.host != import_uri.host
end
def disable_lfs!
project.update(lfs_enabled: false)
end
def get_download_links
existent_lfs = LfsListService.new(project).execute
linked_oids = LfsLinkService.new(project).execute(existent_lfs.keys)
# Retrieving those oids not linked and which we need to download
not_linked_lfs = existent_lfs.except(*linked_oids)
LfsDownloadLinkListService.new(project, remote_uri: current_endpoint_uri).execute(not_linked_lfs)
end
def lfsconfig_endpoint_uri
strong_memoize(:lfsconfig_endpoint_uri) do
# Retrieveing the blob data from the .lfsconfig file
data = project.repository.lfsconfig_for(HEAD_REV)
# Parsing the data to retrieve the url
parsed_data = data&.match(LFS_ENDPOINT_PATTERN)
if parsed_data
URI.parse(parsed_data[1]).tap do |endpoint|
endpoint.user ||= import_uri.user
endpoint.password ||= import_uri.password
end
end
end
rescue URI::InvalidURIError
raise LfsImportError, 'Invalid URL in .lfsconfig file'
end
def import_uri
@import_uri ||= URI.parse(project.import_url)
rescue URI::InvalidURIError
raise LfsImportError, 'Invalid project import URL'
end
def current_endpoint_uri
(lfsconfig_endpoint_uri || default_endpoint_uri)
end
# The import url must end with '.git' here we ensure it is
def default_endpoint_uri
@default_endpoint_uri ||= begin
import_uri.dup.tap do |uri|
path = uri.path.gsub(%r(/$), '')
path += '.git' unless path.ends_with?('.git')
uri.path = path + LFS_BATCH_API_ENDPOINT
end
end
end
end
end
end
# Given a list of oids, this services links the existent Lfs Objects to the project
module Projects
module LfsPointers
class LfsLinkService < BaseService
# Accept an array of oids to link
#
# Returns a hash with the same structure with oids linked
def execute(oids)
return {} unless project&.lfs_enabled?
# Search and link existing LFS Object
link_existing_lfs_objects(oids)
end
private
def link_existing_lfs_objects(oids)
existent_lfs_objects = LfsObject.where(oid: oids)
return [] unless existent_lfs_objects.any?
not_linked_lfs_objects = existent_lfs_objects.where.not(id: project.all_lfs_objects)
project.all_lfs_objects << not_linked_lfs_objects
existent_lfs_objects.pluck(:oid)
end
end
end
end
# This service list all existent Lfs objects in a repository
module Projects
module LfsPointers
class LfsListService < BaseService
REV = 'HEAD'.freeze
# Retrieve all lfs blob pointers and returns a hash
# with the structure { lfs_file_oid => lfs_file_size }
def execute
return {} unless project&.lfs_enabled?
Gitlab::Git::LfsChanges.new(project.repository, REV)
.all_pointers
.map! { |blob| [blob.lfs_oid, blob.lfs_size] }
.to_h
end
end
end
end
......@@ -31,12 +31,14 @@
- github_importer:github_import_import_diff_note
- github_importer:github_import_import_issue
- github_importer:github_import_import_note
- github_importer:github_import_import_lfs_object
- github_importer:github_import_import_pull_request
- github_importer:github_import_refresh_import_jid
- github_importer:github_import_stage_finish_import
- github_importer:github_import_stage_import_base_data
- github_importer:github_import_stage_import_issues_and_diff_notes
- github_importer:github_import_stage_import_notes
- github_importer:github_import_stage_import_lfs_objects
- github_importer:github_import_stage_import_pull_requests
- github_importer:github_import_stage_import_repository
......
......@@ -21,6 +21,7 @@ module Gitlab
STAGES = {
issues_and_diff_notes: Stage::ImportIssuesAndDiffNotesWorker,
notes: Stage::ImportNotesWorker,
lfs_objects: Stage::ImportLfsObjectsWorker,
finish: Stage::FinishImportWorker
}.freeze
......
# frozen_string_literal: true
module Gitlab
module GithubImport
class ImportLfsObjectWorker
include ObjectImporter
def representation_class
Representation::LfsObject
end
def importer_class
Importer::LfsObjectImporter
end
def counter_name
:github_importer_imported_lfs_objects
end
def counter_description
'The number of imported GitHub Lfs Objects'
end
end
end
end
# frozen_string_literal: true
module Gitlab
module GithubImport
module Stage
class ImportLfsObjectsWorker
include ApplicationWorker
include GithubImport::Queue
include StageMethods
def perform(project_id)
return unless (project = find_project(project_id))
import(project)
end
# project - An instance of Project.
def import(project)
waiter = Importer::LfsObjectsImporter
.new(project, nil)
.execute
AdvanceStageWorker.perform_async(
project.id,
{ waiter.key => waiter.jobs_remaining },
:finish
)
end
end
end
end
end
......@@ -18,7 +18,7 @@ module Gitlab
AdvanceStageWorker.perform_async(
project.id,
{ waiter.key => waiter.jobs_remaining },
:finish
:lfs_objects
)
end
end
......
---
title: Added support for LFS Download in the importing process
merge_request: 18871
author:
type: fixed
......@@ -1543,7 +1543,7 @@ module Gitlab
end
end
def rev_list(including: [], excluding: [], objects: false, &block)
def rev_list(including: [], excluding: [], options: [], objects: false, &block)
args = ['rev-list']
args.push(*rev_list_param(including))
......@@ -1556,6 +1556,10 @@ module Gitlab
args.push('--objects') if objects
if options.any?
args.push(*options)
end
run_git!(args, lazy_block: block)
end
......
......@@ -38,7 +38,10 @@ module Gitlab
end
def all_objects(require_path: nil, &lazy_block)
get_objects(including: :all, require_path: require_path, &lazy_block)
get_objects(including: :all,
options: ["--filter=blob:limit=#{Gitlab::Git::Blob::LFS_POINTER_MAX_SIZE}"],
require_path: require_path,
&lazy_block)
end
# This methods returns an array of missed references
......@@ -54,8 +57,8 @@ module Gitlab
repository.rev_list(args).split("\n")
end
def get_objects(including: [], excluding: [], require_path: nil)
opts = { including: including, excluding: excluding, objects: true }
def get_objects(including: [], excluding: [], options: [], require_path: nil)
opts = { including: including, excluding: excluding, options: options, objects: true }
repository.rev_list(opts) do |lazy_output|
objects = objects_from_output(lazy_output, require_path: require_path)
......
# frozen_string_literal: true
module Gitlab
module GithubImport
module Importer
class LfsObjectImporter
attr_reader :lfs_object, :project
# lfs_object - An instance of `Gitlab::GithubImport::Representation::LfsObject`.
# project - An instance of `Project`.
def initialize(lfs_object, project, _)
@lfs_object = lfs_object
@project = project
end
def execute
Projects::LfsPointers::LfsDownloadService
.new(project)
.execute(lfs_object.oid, lfs_object.download_link)
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module GithubImport
module Importer
class LfsObjectsImporter
include ParallelScheduling
def importer_class
LfsObjectImporter
end
def representation_class
Representation::LfsObject
end
def sidekiq_worker_class
ImportLfsObjectWorker
end
def collection_method
:lfs_objects
end
def each_object_to_import
lfs_objects = Projects::LfsPointers::LfsImportService.new(project).execute
lfs_objects.each do |object|
yield object
end
rescue StandardError => e
Rails.logger.error("The Lfs import process failed. #{e.message}")
end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module GithubImport
module Representation
class LfsObject
include ToHash
include ExposeAttribute
attr_reader :attributes
expose_attribute :oid, :download_link
# Builds a lfs_object
def self.from_api_response(lfs_object)
new({ oid: lfs_object[0], download_link: lfs_object[1] })
end
# Builds a new lfs_object using a Hash that was built from a JSON payload.
def self.from_json_hash(raw_hash)
new(Representation.symbolize_hash(raw_hash))
end
# attributes - A Hash containing the raw lfs_object details. The keys of this
# Hash must be Symbols.
def initialize(attributes)
@attributes = attributes
end
end
end
end
end
......@@ -19,7 +19,8 @@ module Gitlab
Importer::PullRequestsImporter,
Importer::IssuesImporter,
Importer::DiffNotesImporter,
Importer::NotesImporter
Importer::NotesImporter,
Importer::LfsObjectsImporter
].freeze
# project - The project to import the data into.
......
......@@ -88,7 +88,7 @@ describe Gitlab::Git::RevList do
context '#all_objects' do
it 'fetches list of all pushed objects using rev-list' do
stub_popen_rev_list('--all', '--objects', output: "sha1\nsha2")
stub_popen_rev_list('--all', '--objects', '--filter=blob:limit=200', output: "sha1\nsha2")
expect { |b| rev_list.all_objects(&b) }.to yield_with_args(%w[sha1 sha2])
end
......
require 'spec_helper'
describe Gitlab::GithubImport::Importer::LfsObjectImporter do
let(:project) { create(:project) }
let(:download_link) { "http://www.gitlab.com/lfs_objects/oid" }
let(:github_lfs_object) do
Gitlab::GithubImport::Representation::LfsObject.new(
oid: 'oid', download_link: download_link
)
end
let(:importer) { described_class.new(github_lfs_object, project, nil) }
describe '#execute' do
it 'calls the LfsDownloadService with the lfs object attributes' do
expect_any_instance_of(Projects::LfsPointers::LfsDownloadService)
.to receive(:execute).with('oid', download_link)
importer.execute
end
end
end
require 'spec_helper'
describe Gitlab::GithubImport::Importer::LfsObjectsImporter do
let(:project) { double(:project, id: 4, import_source: 'foo/bar') }
let(:client) { double(:client) }
let(:download_link) { "http://www.gitlab.com/lfs_objects/oid" }
let(:github_lfs_object) { ['oid', download_link] }
describe '#parallel?' do
it 'returns true when running in parallel mode' do
importer = described_class.new(project, client)
expect(importer).to be_parallel
end
it 'returns false when running in sequential mode' do
importer = described_class.new(project, client, parallel: false)
expect(importer).not_to be_parallel
end
end
describe '#execute' do
context 'when running in parallel mode' do
it 'imports lfs objects in parallel' do
importer = described_class.new(project, client)