Skip to content
Snippets Groups Projects
Commit 1daabc1f authored by Valentin David's avatar Valentin David
Browse files

Fetch git shallow clone when possible

When the requested ref is advertised by the remote and no tag is
required, we shallow clone that requested ref. Otherwise we fall
back on a full clone.

Workspace opening and tracking operations still get a full clone.

Fixes #261
parent 13eb7ed2
No related branches found
No related tags found
Loading
Pipeline #40156427 failed
......@@ -183,7 +183,7 @@ WARN_INVALID_SUBMODULE = "invalid-submodule"
#
class GitMirror(SourceFetcher):
def __init__(self, source, path, url, ref, *, primary=False, tags=[]):
def __init__(self, source, path, url, ref, *, primary=False, tags=[], tracking=None):
super().__init__()
self.source = source
......@@ -192,11 +192,101 @@ class GitMirror(SourceFetcher):
self.ref = ref
self.tags = tags
self.primary = primary
dirname = utils.url_directory_name(url)
self.mirror = os.path.join(source.get_mirror_directory(), utils.url_directory_name(url))
self.fetch_mirror = os.path.join(source.get_mirror_directory(), '{}-{}'.format(dirname, ref))
self.mark_download_url(url)
self.tracking = tracking
def mirror_path(self):
    """Return the directory of the local repository to operate on.

    The full mirror (``self.mirror``) is preferred whenever it exists on
    disk. Otherwise the shallow fetch mirror (``self.fetch_mirror``) is
    returned, and it is asserted to exist.
    """
    if not os.path.exists(self.mirror):
        # No full clone available: the shallow one must be there.
        assert os.path.exists(self.fetch_mirror)
        return self.fetch_mirror
    return self.mirror
def ensure_fetchable(self, alias_override=None):
    """Make sure a local repository able to provide ``self.ref`` exists.

    Nothing is done when the full mirror (``self.mirror``) is already
    present. Otherwise a depth-1 clone is created at ``self.fetch_mirror``,
    provided that:

    * every entry of ``self.tags`` sits on ``self.ref``, and
    * the remote advertises a ref pointing at ``self.ref`` (restricted to
      the tracked branch/tag when ``self.tracking`` is set).

    When either condition fails, this delegates to ``ensure_trackable()``
    after reporting that a full clone is required.

    Args:
        alias_override: Optional alias forwarded to ``translate_url()`` and
            ``ensure_trackable()``.

    Raises:
        SourceError: If the freshly fetched repository cannot be moved to
            ``self.fetch_mirror``.
    """
    # A full mirror can serve any ref, nothing more to do.
    if os.path.exists(self.mirror):
        return

    if self.tags:
        for tag, commit, _ in self.tags:
            if commit != self.ref:
                # Report the commit we want (self.ref), not the one the
                # tag actually sits on.
                self.source.status("{}: tag '{}' is not on commit '{}', so a full clone is required"
                                   .format(self.source, tag, self.ref))
                self.ensure_trackable(alias_override=alias_override)
                return

    if os.path.exists(self.fetch_mirror):
        return

    with self.source.tempdir() as tmpdir:
        self.source.call([self.source.host_git, 'init', '--bare', tmpdir],
                         fail="Failed to init git repository",
                         fail_temporarily=True)

        url = self.source.translate_url(self.url, alias_override=alias_override,
                                        primary=self.primary)

        self.source.call([self.source.host_git, 'remote', 'add', '--mirror=fetch', 'origin', url],
                         cwd=tmpdir,
                         fail="Failed to init git repository",
                         fail_temporarily=True)

        _, refs = self.source.check_output([self.source.host_git, 'ls-remote', 'origin'],
                                           cwd=tmpdir,
                                           fail="Failed to clone git repository {}".format(url),
                                           fail_temporarily=True)

        # Ref names accepted when tracking is configured; computed once
        # since they do not depend on the advertised line.
        tracked_refs = None
        if self.tracking:
            tracked_refs = ['refs/heads/{}'.format(self.tracking),
                            'refs/tags/{}'.format(self.tracking),
                            'refs/tags/{}{}'.format(self.tracking, '^{}')]

        advertised = None
        for ref_line in refs.splitlines():
            commit, separator, ref = ref_line.partition('\t')
            if not separator:
                # Not a "<commit>\t<ref>" line (e.g. blank); skip it
                # instead of failing on the unpacking.
                continue
            if ref == 'HEAD':
                continue
            # For validate_cache to work
            if tracked_refs is not None and ref not in tracked_refs:
                continue
            if self.ref == commit:
                # Peeled tag entries end with '^{}'; fetch the tag itself.
                if ref.endswith('^{}'):
                    ref = ref[:-3]
                advertised = ref
                break

        if advertised is None:
            self.source.status("{}: {} is not advertised on {}, so a full clone is required"
                               .format(self.source, self.ref, url))

            self.ensure_trackable(alias_override=alias_override)
            return

        self.source.call([self.source.host_git, 'fetch', '--depth=1', 'origin', advertised],
                         cwd=tmpdir,
                         fail="Failed to fetch repository",
                         fail_temporarily=True)

        # We need to have a ref to make it clonable
        self.source.call([self.source.host_git, 'update-ref', 'HEAD', self.ref],
                         cwd=tmpdir,
                         fail="Failed to tag HEAD",
                         fail_temporarily=True)

        try:
            move_atomic(tmpdir, self.fetch_mirror)
        except DirectoryExistsError:
            # Another process was quicker to download this repository.
            # Let's discard our own
            self.source.status("{}: Discarding duplicate clone of {}"
                               .format(self.source, url))
        except OSError as e:
            raise SourceError("{}: Failed to move cloned git repository {} from '{}' to '{}': {}"
                              .format(self.source, url, tmpdir, self.fetch_mirror, e)) from e
# Ensures that the mirror exists
def ensure(self, alias_override=None):
def ensure_trackable(self, alias_override=None):
# Unfortunately, git does not know how to only clone just a specific ref,
# so we have to download all of those gigs even if we only need a couple
......@@ -231,18 +321,20 @@ class GitMirror(SourceFetcher):
alias_override=alias_override,
primary=self.primary)
mirror = self.mirror_path()
if alias_override:
remote_name = utils.url_directory_name(alias_override)
_, remotes = self.source.check_output(
[self.source.host_git, 'remote'],
fail="Failed to retrieve list of remotes in {}".format(self.mirror),
cwd=self.mirror
fail="Failed to retrieve list of remotes in {}".format(mirror),
cwd=mirror
)
if remote_name not in remotes:
self.source.call(
[self.source.host_git, 'remote', 'add', remote_name, url],
fail="Failed to add remote {} with url {}".format(remote_name, url),
cwd=self.mirror
cwd=mirror
)
else:
remote_name = "origin"
......@@ -250,7 +342,7 @@ class GitMirror(SourceFetcher):
self.source.call([self.source.host_git, 'fetch', remote_name, '--prune', '--force', '--tags'],
fail="Failed to fetch from remote git repository: {}".format(url),
fail_temporarily=True,
cwd=self.mirror)
cwd=mirror)
def fetch(self, alias_override=None):
# Resolve the URL for the message
......@@ -261,7 +353,7 @@ class GitMirror(SourceFetcher):
with self.source.timed_activity("Fetching from {}"
.format(resolved_url),
silent_nested=True):
self.ensure(alias_override)
self.ensure_fetchable(alias_override)
if not self.has_ref():
self._fetch(alias_override)
self.assert_ref()
......@@ -270,12 +362,14 @@ class GitMirror(SourceFetcher):
if not self.ref:
return False
# If the mirror doesnt exist, we also dont have the ref
if not os.path.exists(self.mirror):
if not os.path.exists(self.mirror) and not os.path.exists(self.fetch_mirror):
# If the mirror doesnt exist, we also dont have the ref
return False
mirror = self.mirror_path()
# Check if the ref is really there
rc = self.source.call([self.source.host_git, 'cat-file', '-t', self.ref], cwd=self.mirror)
rc = self.source.call([self.source.host_git, 'cat-file', '-t', self.ref], cwd=mirror)
return rc == 0
def assert_ref(self):
......@@ -325,11 +419,13 @@ class GitMirror(SourceFetcher):
def stage(self, directory):
fullpath = os.path.join(directory, self.path)
mirror = self.mirror_path()
# Using --shared here avoids copying the objects into the checkout, in any
# case we're just checking out a specific commit and then removing the .git/
# directory.
self.source.call([self.source.host_git, 'clone', '--no-checkout', '--shared', self.mirror, fullpath],
fail="Failed to create git mirror {} in directory: {}".format(self.mirror, fullpath),
self.source.call([self.source.host_git, 'clone', '--no-checkout', '--shared', mirror, fullpath],
fail="Failed to create git mirror {} in directory: {}".format(mirror, fullpath),
fail_temporarily=True)
self.source.call([self.source.host_git, 'checkout', '--force', self.ref],
......@@ -359,9 +455,11 @@ class GitMirror(SourceFetcher):
# List the submodules (path/url tuples) present at the given ref of this repo
def submodule_list(self):
mirror = self.mirror_path()
modules = "{}:{}".format(self.ref, GIT_MODULES)
exit_code, output = self.source.check_output(
[self.source.host_git, 'show', modules], cwd=self.mirror)
[self.source.host_git, 'show', modules], cwd=mirror)
# If git show reports error code 128 here, we take it to mean there is
# no .gitmodules file to display for the given revision.
......@@ -389,6 +487,8 @@ class GitMirror(SourceFetcher):
# Fetch the ref which this mirror requires its submodule to have,
# at the given ref of this mirror.
def submodule_ref(self, submodule, ref=None):
mirror = self.mirror_path()
if not ref:
ref = self.ref
......@@ -397,7 +497,7 @@ class GitMirror(SourceFetcher):
_, output = self.source.check_output([self.source.host_git, 'ls-tree', ref, submodule],
fail="ls-tree failed for commit {} and submodule: {}".format(
ref, submodule),
cwd=self.mirror)
cwd=mirror)
# read the commit hash from the output
fields = output.split()
......@@ -514,8 +614,8 @@ class GitSource(Source):
self.track_tags = self.node_get_member(node, bool, 'track-tags', False)
self.original_url = self.node_get_member(node, str, 'url')
self.mirror = GitMirror(self, '', self.original_url, ref, tags=tags, primary=True)
self.tracking = self.node_get_member(node, str, 'track', None)
self.mirror = GitMirror(self, '', self.original_url, ref, tags=tags, primary=True, tracking=self.tracking)
self.ref_format = self.node_get_member(node, str, 'ref-format', 'sha1')
if self.ref_format not in ['sha1', 'git-describe']:
......@@ -633,7 +733,7 @@ class GitSource(Source):
with self.timed_activity("Tracking {} from {}"
.format(self.tracking, resolved_url),
silent_nested=True):
self.mirror.ensure()
self.mirror.ensure_trackable()
self.mirror._fetch()
# Update self.mirror.ref and node.ref from the self.tracking branch
......@@ -643,6 +743,7 @@ class GitSource(Source):
def init_workspace(self, directory):
# XXX: may wish to refactor this as some code dupe with stage()
self.mirror.ensure_trackable()
self.refresh_submodules()
with self.timed_activity('Setting up workspace "{}"'.format(directory), silent_nested=True):
......@@ -717,15 +818,16 @@ class GitSource(Source):
# Assert that the ref exists in the track tag/branch, if track has been specified.
ref_in_track = False
if self.tracking:
mirror = self.mirror.mirror_path()
_, branch = self.check_output([self.host_git, 'branch', '--list', self.tracking,
'--contains', self.mirror.ref],
cwd=self.mirror.mirror)
cwd=mirror)
if branch:
ref_in_track = True
else:
_, tag = self.check_output([self.host_git, 'tag', '--list', self.tracking,
'--contains', self.mirror.ref],
cwd=self.mirror.mirror)
cwd=mirror)
if tag:
ref_in_track = True
......@@ -749,7 +851,7 @@ class GitSource(Source):
self.refresh_submodules()
for mirror in self.submodules:
if not os.path.exists(mirror.mirror):
if not os.path.exists(mirror.mirror) and not os.path.exists(mirror.fetch_mirror):
return False
if not mirror.has_ref():
return False
......@@ -761,7 +863,7 @@ class GitSource(Source):
# Assumes that we have our mirror and we have the ref which we point to
#
def refresh_submodules(self):
self.mirror.ensure()
self.mirror.ensure_fetchable()
submodules = []
for path, url in self.mirror.submodule_list():
......
......@@ -28,6 +28,7 @@ import shutil
from buildstream._exceptions import ErrorDomain
from buildstream import _yaml
from buildstream.plugin import CoreWarnings
from buildstream.utils import url_directory_name
from tests.testutils import cli, create_repo
from tests.testutils.site import HAVE_GIT
......@@ -1018,3 +1019,249 @@ def test_overwrite_rogue_tag_multiple_remotes(cli, tmpdir, datafiles):
result = cli.run(project=project, args=['build', 'target.bst'])
result.assert_success()
@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
def test_fetch_shallow(cli, tmpdir, datafiles):
    """Fetch and build use only a shallow mirror; track creates the full one.

    The element references the last commit added to the repository. After
    `fetch` and `build` only the per-ref shallow cache directory exists and
    its log holds that single commit. After `track` the full cache
    directory exists and its log holds both commits.
    """

    def logged_commits(git_dir):
        # Commit ids printed by `git log` when run inside git_dir.
        completed = subprocess.run(['git', 'log', '--format=format:%H'],
                                   cwd=git_dir,
                                   stdout=subprocess.PIPE)
        return completed.stdout.decode('ascii').splitlines()

    project = str(datafiles)
    top = str(tmpdir)

    repo = create_repo('git', top)
    previous_ref = repo.create(os.path.join(project, 'repofiles'))

    added_file = os.path.join(top, 'file1')
    with open(added_file, 'w') as f:
        f.write('test\n')
    ref = repo.add_file(added_file)

    source_config = repo.source_config(ref=ref)

    # Write out an import element whose only source is the git repository
    _yaml.dump({'kind': 'import', 'sources': [source_config]},
               os.path.join(project, 'target.bst'))

    sources_dir = os.path.join(top, 'sources')
    os.makedirs(sources_dir, exist_ok=True)
    cli.configure({'sourcedir': sources_dir})

    result = cli.run(project=project, args=['fetch', 'target.bst'])
    result.assert_success()

    cache_dir_name = url_directory_name(source_config['url'])
    git_sources = os.path.join(sources_dir, 'git')
    full_cache_path = os.path.join(git_sources, cache_dir_name)
    shallow_cache_path = os.path.join(git_sources, '{}-{}'.format(cache_dir_name, ref))

    assert os.path.exists(shallow_cache_path)
    assert not os.path.exists(full_cache_path)
    assert logged_commits(shallow_cache_path) == [ref]

    result = cli.run(project=project, args=['build', 'target.bst'])
    result.assert_success()

    # Building must neither deepen the shallow mirror nor create a full one
    assert logged_commits(shallow_cache_path) == [ref]
    assert os.path.exists(shallow_cache_path)
    assert not os.path.exists(full_cache_path)

    result = cli.run(project=project, args=['track', 'target.bst'])
    result.assert_success()

    assert os.path.exists(full_cache_path)
    assert logged_commits(full_cache_path) == [ref, previous_ref]
@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
def test_fetch_shallow_not_tagged(cli, tmpdir, datafiles):
    """A ref that is neither tagged nor a branch head falls back to a full clone.

    The element references the first commit while a second commit has been
    added on top of it. After `fetch` only the full cache directory exists
    and its log holds both commits.
    """

    def logged_commits(git_dir):
        # Commit ids printed by `git log` when run inside git_dir.
        completed = subprocess.run(['git', 'log', '--format=format:%H'],
                                   cwd=git_dir,
                                   stdout=subprocess.PIPE)
        return completed.stdout.decode('ascii').splitlines()

    project = str(datafiles)
    top = str(tmpdir)

    repo = create_repo('git', top)
    previous_ref = repo.create(os.path.join(project, 'repofiles'))

    added_file = os.path.join(top, 'file1')
    with open(added_file, 'w') as f:
        f.write('test\n')
    ref = repo.add_file(added_file)

    source_config = repo.source_config(ref=previous_ref)

    # Write out an import element pinned to the older commit
    _yaml.dump({'kind': 'import', 'sources': [source_config]},
               os.path.join(project, 'target.bst'))

    sources_dir = os.path.join(top, 'sources')
    os.makedirs(sources_dir, exist_ok=True)
    cli.configure({'sourcedir': sources_dir})

    result = cli.run(project=project, args=['fetch', 'target.bst'])
    result.assert_success()

    cache_dir_name = url_directory_name(source_config['url'])
    git_sources = os.path.join(sources_dir, 'git')
    full_cache_path = os.path.join(git_sources, cache_dir_name)
    shallow_cache_path = os.path.join(git_sources, '{}-{}'.format(cache_dir_name, previous_ref))

    assert not os.path.exists(shallow_cache_path)
    assert os.path.exists(full_cache_path)
    assert logged_commits(full_cache_path) == [ref, previous_ref]
@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
def test_fetch_shallow_annotated_tag(cli, tmpdir, datafiles):
    """A commit carrying an annotated tag can be shallow cloned.

    The first commit receives an annotated tag before a second commit is
    added. The element references the first commit and has its 'track'
    setting removed. After `fetch` only the per-ref shallow cache directory
    exists and its log holds that single commit.
    """

    def logged_commits(git_dir):
        # Commit ids printed by `git log` when run inside git_dir.
        completed = subprocess.run(['git', 'log', '--format=format:%H'],
                                   cwd=git_dir,
                                   stdout=subprocess.PIPE)
        return completed.stdout.decode('ascii').splitlines()

    project = str(datafiles)
    top = str(tmpdir)

    repo = create_repo('git', top)
    previous_ref = repo.create(os.path.join(project, 'repofiles'))

    repo.add_annotated_tag('tag', 'tag')

    added_file = os.path.join(top, 'file1')
    with open(added_file, 'w') as f:
        f.write('test\n')
    # Only the extra commit matters here, its id is not needed.
    repo.add_file(added_file)

    source_config = repo.source_config(ref=previous_ref)
    del source_config['track']

    # Write out an import element pinned to the tagged commit
    _yaml.dump({'kind': 'import', 'sources': [source_config]},
               os.path.join(project, 'target.bst'))

    sources_dir = os.path.join(top, 'sources')
    os.makedirs(sources_dir, exist_ok=True)
    cli.configure({'sourcedir': sources_dir})

    result = cli.run(project=project, args=['fetch', 'target.bst'])
    result.assert_success()

    cache_dir_name = url_directory_name(source_config['url'])
    git_sources = os.path.join(sources_dir, 'git')
    full_cache_path = os.path.join(git_sources, cache_dir_name)
    shallow_cache_path = os.path.join(git_sources, '{}-{}'.format(cache_dir_name, previous_ref))

    assert os.path.exists(shallow_cache_path)
    assert not os.path.exists(full_cache_path)
    assert logged_commits(shallow_cache_path) == [previous_ref]
@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
def test_fetch_shallow_workspace_open(cli, tmpdir, datafiles):
    """Opening a workspace yields the full history after a shallow fetch.

    After `fetch` only the per-ref shallow cache directory exists, holding
    a single commit. After `workspace open`, the log of the workspace
    directory holds both commits.
    """

    def logged_commits(git_dir):
        # Commit ids printed by `git log` when run inside git_dir.
        completed = subprocess.run(['git', 'log', '--format=format:%H'],
                                   cwd=git_dir,
                                   stdout=subprocess.PIPE)
        return completed.stdout.decode('ascii').splitlines()

    project = str(datafiles)
    top = str(tmpdir)

    repo = create_repo('git', top)
    previous_ref = repo.create(os.path.join(project, 'repofiles'))

    added_file = os.path.join(top, 'file1')
    with open(added_file, 'w') as f:
        f.write('test\n')
    ref = repo.add_file(added_file)

    source_config = repo.source_config(ref=ref)

    # Write out an import element whose only source is the git repository
    _yaml.dump({'kind': 'import', 'sources': [source_config]},
               os.path.join(project, 'target.bst'))

    sources_dir = os.path.join(top, 'sources')
    os.makedirs(sources_dir, exist_ok=True)
    cli.configure({'sourcedir': sources_dir})

    result = cli.run(project=project, args=['fetch', 'target.bst'])
    result.assert_success()

    cache_dir_name = url_directory_name(source_config['url'])
    git_sources = os.path.join(sources_dir, 'git')
    full_cache_path = os.path.join(git_sources, cache_dir_name)
    shallow_cache_path = os.path.join(git_sources, '{}-{}'.format(cache_dir_name, ref))

    assert os.path.exists(shallow_cache_path)
    assert not os.path.exists(full_cache_path)
    assert logged_commits(shallow_cache_path) == [ref]

    workspace = os.path.join(top, 'workspace')

    result = cli.run(project=project,
                     args=['workspace', 'open', 'target.bst', '--directory', workspace])
    result.assert_success()

    assert logged_commits(workspace) == [ref, previous_ref]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment