From 3c8973af1ae90a3b10e261c0edd0bf0d5655b2d5 Mon Sep 17 00:00:00 2001 From: Tom Pollard <tom.pollard@codethink.co.uk> Date: Fri, 7 Sep 2018 13:48:50 +0100 Subject: [PATCH] WIP: Don't pull artifact buildtrees by default - Set default pull to not include buildtree artifact dir - Add a configurable subdir attribute to PullQueue - Add 'pullbuildtrees' global option for user conf - Add --pull-buildtrees flag to bst build cli - Add --pull-buildtrees flag to bst pull cli - Add helper function _fetch_subdir to cascache, to fetch the buildtree or any other subdir digest - Make element._pull_pending not assume there is nothing to pull when the artifact is cached locally but buildtrees are to be pulled - Ensure cascache.py doesn't try to checkout/extract a dangling ref - Add integration test and required testutils changes --- buildstream/_artifactcache/artifactcache.py | 19 +++- buildstream/_artifactcache/cascache.py | 39 +++++-- buildstream/_context.py | 8 +- buildstream/_frontend/cli.py | 13 ++- buildstream/_scheduler/queues/pullqueue.py | 15 ++- buildstream/_stream.py | 18 +++- buildstream/element.py | 46 +++++--- tests/completions/completions.py | 2 +- tests/integration/pullbuildtrees.py | 112 ++++++++++++++++++++ tests/testutils/artifactshare.py | 2 +- 10 files changed, 239 insertions(+), 35 deletions(-) create mode 100644 tests/integration/pullbuildtrees.py diff --git a/buildstream/_artifactcache/artifactcache.py b/buildstream/_artifactcache/artifactcache.py index 6a9b57f2c1..da39972c44 100644 --- a/buildstream/_artifactcache/artifactcache.py +++ b/buildstream/_artifactcache/artifactcache.py @@ -426,6 +426,22 @@ class ArtifactCache(): raise ImplError("Cache '{kind}' does not implement contains()" .format(kind=type(self).__name__)) + # contains_subdir_artifact(): + # + # Check whether an artifact element contains a digest for a subdir + # which is populated in the cache, i.e. non-dangling. + # + # Args: + # element (Element): The Element to check + # key (str): The cache key to use + # subdir (str): The subdir to check + # + # Returns: True if the subdir exists & is populated in the cache, False otherwise + # + def contains_subdir_artifact(self, element, key, subdir): + raise ImplError("Cache '{kind}' does not implement contains_subdir_artifact()" + .format(kind=type(self).__name__)) + # list_artifacts(): + # + # List artifacts in this cache in LRU order. 
@@ -551,11 +567,12 @@ class ArtifactCache(): # element (Element): The Element whose artifact is to be fetched # key (str): The cache key to use # progress (callable): The progress callback, if any + # subdir (str): The optional specific subdir to pull # # Returns: # (bool): True if pull was successful, False if artifact was not available # - def pull(self, element, key, *, progress=None): + def pull(self, element, key, *, progress=None, subdir=None, excluded_subdirs=None): raise ImplError("Cache '{kind}' does not implement pull()" .format(kind=type(self).__name__)) diff --git a/buildstream/_artifactcache/cascache.py b/buildstream/_artifactcache/cascache.py index 3e63608bee..cc21905a6f 100644 --- a/buildstream/_artifactcache/cascache.py +++ b/buildstream/_artifactcache/cascache.py @@ -92,6 +92,16 @@ class CASCache(ArtifactCache): # This assumes that the repository doesn't have any dangling pointers return os.path.exists(refpath) + def contains_subdir_artifact(self, element, key, subdir): + tree = self.resolve_ref(self.get_artifact_fullname(element, key)) + + # This assumes that the subdir digest is present in the element tree + subdirdigest = self._get_subdir(tree, subdir) + objpath = self.objpath(subdirdigest) + + # True if subdir content is cached or if empty as expected + return os.path.exists(objpath) + def extract(self, element, key): ref = self.get_artifact_fullname(element, key) @@ -228,7 +238,7 @@ class CASCache(ArtifactCache): remotes_for_project = self._remotes[element._get_project()] return any(remote.spec.push for remote in remotes_for_project) - def pull(self, element, key, *, progress=None): + def pull(self, element, key, *, progress=None, subdir=None, excluded_subdirs=None): ref = self.get_artifact_fullname(element, key) project = element._get_project() @@ -247,8 +257,14 @@ class CASCache(ArtifactCache): tree.hash = response.digest.hash tree.size_bytes = response.digest.size_bytes - self._fetch_directory(remote, tree) + # Check if the element artifact is already present; if so, just fetch the subdir + if subdir and os.path.exists(self.objpath(tree)): + self._fetch_subdir(remote, tree, subdir) + else: + # Fetch artifact, excluded_subdirs determined in pullqueue + self._fetch_directory(remote, tree, excluded_subdirs=excluded_subdirs) + # tree is the remote value, so it is the same with or without a dangling ref locally self.set_ref(ref, tree) element.info("Pulled artifact {} <- {}".format(display_key, remote.spec.url)) @@ -668,8 +684,10 @@ class CASCache(ArtifactCache): stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH) for dirnode in directory.directories: - fullpath = os.path.join(dest, dirnode.name) - self._checkout(fullpath, dirnode.digest) + # Don't try to checkout a dangling ref + if os.path.exists(self.objpath(dirnode.digest)): + fullpath = os.path.join(dest, dirnode.name) + self._checkout(fullpath, dirnode.digest) for symlinknode in directory.symlinks: # symlink @@ -948,10 +966,12 @@ class CASCache(ArtifactCache): # remote (Remote): The remote to use. # dir_digest (Digest): Digest object for the directory to fetch. 
# - def _fetch_directory(self, remote, dir_digest): + def _fetch_directory(self, remote, dir_digest, *, excluded_subdirs=None): fetch_queue = [dir_digest] fetch_next_queue = [] batch = _CASBatchRead(remote) + if not excluded_subdirs: + excluded_subdirs = [] while len(fetch_queue) + len(fetch_next_queue) > 0: if len(fetch_queue) == 0: @@ -966,8 +986,9 @@ class CASCache(ArtifactCache): directory.ParseFromString(f.read()) for dirnode in directory.directories: - batch = self._fetch_directory_node(remote, dirnode.digest, batch, - fetch_queue, fetch_next_queue, recursive=True) + if dirnode.name not in excluded_subdirs: + batch = self._fetch_directory_node(remote, dirnode.digest, batch, + fetch_queue, fetch_next_queue, recursive=True) for filenode in directory.files: batch = self._fetch_directory_node(remote, filenode.digest, batch, @@ -976,6 +997,10 @@ class CASCache(ArtifactCache): # Fetch final batch self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue) + def _fetch_subdir(self, remote, tree, subdir): + subdirdigest = self._get_subdir(tree, subdir) + self._fetch_directory(remote, subdirdigest) + def _fetch_tree(self, remote, digest): # download but do not store the Tree object with tempfile.NamedTemporaryFile(dir=self.tmpdir) as out: diff --git a/buildstream/_context.py b/buildstream/_context.py index cb29968a47..321753eb39 100644 --- a/buildstream/_context.py +++ b/buildstream/_context.py @@ -110,6 +110,9 @@ class Context(): # Make sure the XDG vars are set in the environment before loading anything self._init_xdg() + # Whether or not to attempt to pull buildtrees globally + self.pullbuildtrees = False + # Private variables self._cache_key = None self._message_handler = None @@ -160,7 +163,7 @@ class Context(): _yaml.node_validate(defaults, [ 'sourcedir', 'builddir', 'artifactdir', 'logdir', 'scheduler', 'artifacts', 'logging', 'projects', - 'cache' + 'cache', 'pullbuildtrees' ]) for directory in ['sourcedir', 'builddir', 'artifactdir', 'logdir']: @@ -185,6 +188,9 @@ class Context(): # Load artifact share configuration self.artifact_cache_specs = ArtifactCache.specs_from_config_node(defaults) + # Load pull buildtrees configuration + self.pullbuildtrees = _yaml.node_get(defaults, bool, 'pullbuildtrees', default_value=False) + # Load logging config logging = _yaml.node_get(defaults, Mapping, 'logging') _yaml.node_validate(logging, [ diff --git a/buildstream/_frontend/cli.py b/buildstream/_frontend/cli.py index 20624e2acb..a338201a06 100644 --- a/buildstream/_frontend/cli.py +++ b/buildstream/_frontend/cli.py @@ -305,10 +305,12 @@ def init(app, project_name, format_version, element_path, force): help="Allow tracking to cross junction boundaries") @click.option('--track-save', default=False, is_flag=True, help="Deprecated: This is ignored") +@click.option('--pull-buildtrees', default=False, is_flag=True, + help="Pull buildtrees from a remote cache server") @click.argument('elements', nargs=-1, type=click.Path(readable=False)) @click.pass_obj -def build(app, elements, all_, track_, track_save, track_all, track_except, track_cross_junctions): +def build(app, elements, all_, track_, track_save, track_all, track_except, track_cross_junctions, pull_buildtrees): """Build elements in a pipeline""" if (track_except or track_cross_junctions) and not (track_ or track_all): @@ -327,7 +329,8 @@ def build(app, elements, all_, track_, track_save, track_all, track_except, trac track_targets=track_, track_except=track_except, track_cross_junctions=track_cross_junctions, - build_all=all_) + 
build_all=all_, + pull_buildtrees=pull_buildtrees) ################################################################## @@ -429,10 +432,12 @@ def track(app, elements, deps, except_, cross_junctions): help='The dependency artifacts to pull (default: none)') @click.option('--remote', '-r', help="The URL of the remote cache (defaults to the first configured cache)") +@click.option('--pull-buildtrees', default=False, is_flag=True, + help="Pull buildtrees from a remote cache server") @click.argument('elements', nargs=-1, type=click.Path(readable=False)) @click.pass_obj -def pull(app, elements, deps, remote): +def pull(app, elements, deps, remote, pull_buildtrees): """Pull a built artifact from the configured remote artifact cache. By default the artifact will be pulled one of the configured caches @@ -446,7 +451,7 @@ def pull(app, elements, deps, remote): all: All dependencies """ with app.initialized(session_name="Pull"): - app.stream.pull(elements, selection=deps, remote=remote) + app.stream.pull(elements, selection=deps, remote=remote, pull_buildtrees=pull_buildtrees) ################################################################## diff --git a/buildstream/_scheduler/queues/pullqueue.py b/buildstream/_scheduler/queues/pullqueue.py index 2842c5e21f..001897b94a 100644 --- a/buildstream/_scheduler/queues/pullqueue.py +++ b/buildstream/_scheduler/queues/pullqueue.py @@ -32,9 +32,20 @@ class PullQueue(Queue): complete_name = "Pulled" resources = [ResourceType.DOWNLOAD, ResourceType.CACHE] + def __init__(self, scheduler, buildtrees=False): + super().__init__(scheduler) + + # Current default exclusions on pull + self._excluded_subdirs = ["buildtree"] + self._subdir = None + # If buildtrees are to be pulled, remove the value from exclusion list + if buildtrees: + self._subdir = "buildtree" + self._excluded_subdirs.remove(self._subdir) + def process(self, element): # returns whether an artifact was downloaded or not - if not element._pull(): + if not element._pull(subdir=self._subdir, excluded_subdirs=self._excluded_subdirs): raise SkipJob(self.action_name) def status(self, element): @@ -49,7 +60,7 @@ class PullQueue(Queue): if not element._can_query_cache(): return QueueStatus.WAIT - if element._pull_pending(): + if element._pull_pending(subdir=self._subdir): return QueueStatus.READY else: return QueueStatus.SKIP diff --git a/buildstream/_stream.py b/buildstream/_stream.py index e7a71978bb..de1fc7c485 100644 --- a/buildstream/_stream.py +++ b/buildstream/_stream.py @@ -160,12 +160,14 @@ class Stream(): # track_cross_junctions (bool): Whether tracking should cross junction boundaries # build_all (bool): Whether to build all elements, or only those # which are required to build the target. 
+ # pull_buildtrees (bool): Whether to pull buildtrees from a remote cache server # def build(self, targets, *, track_targets=None, track_except=None, track_cross_junctions=False, - build_all=False): + build_all=False, + pull_buildtrees=False): if build_all: selection = PipelineSelection.ALL @@ -195,7 +197,10 @@ class Stream(): self._add_queue(track_queue, track=True) if self._artifacts.has_fetch_remotes(): - self._add_queue(PullQueue(self._scheduler)) + # Query if pullbuildtrees has been set globally in user config + if self._context.pullbuildtrees: + pull_buildtrees = True + self._add_queue(PullQueue(self._scheduler, buildtrees=pull_buildtrees)) self._add_queue(FetchQueue(self._scheduler, skip_cached=True)) self._add_queue(BuildQueue(self._scheduler)) @@ -295,7 +300,8 @@ class Stream(): # def pull(self, targets, *, selection=PipelineSelection.NONE, - remote=None): + remote=None, + pull_buildtrees=False): use_config = True if remote: @@ -310,8 +316,12 @@ class Stream(): if not self._artifacts.has_fetch_remotes(): raise StreamError("No artifact caches available for pulling artifacts") + # Query if pullbuildtrees has been set globally in user config + if self._context.pullbuildtrees: + pull_buildtrees = True + self._pipeline.assert_consistent(elements) - self._add_queue(PullQueue(self._scheduler)) + self._add_queue(PullQueue(self._scheduler, buildtrees=pull_buildtrees)) self._enqueue_plan(elements) self._run() diff --git a/buildstream/element.py b/buildstream/element.py index 25ef22ed28..2aede0e567 100644 --- a/buildstream/element.py +++ b/buildstream/element.py @@ -1691,18 +1691,26 @@ class Element(Plugin): # _pull_pending() # - # Check whether the artifact will be pulled. + # Check whether the artifact will be pulled. If the pull operation is to + # include a specific subdir of the element artifact (from cli or user conf) + # then the local cache is queried for the subdir's existence. 
+ # + # Args: + # subdir (str): The optional specific subdir the pull has been invoked with # # Returns: # (bool): Whether a pull operation is pending # - def _pull_pending(self): + def _pull_pending(self, subdir=None): if self._get_workspace(): # Workspace builds are never pushed to artifact servers return False - if self.__strong_cached: - # Artifact already in local cache + if self.__strong_cached and subdir: + # If we've specified a subdir, check if the subdir is cached locally + if self.__artifacts.contains_subdir_artifact(self, self.__strict_cache_key, subdir): + return False + elif self.__strong_cached: return False # Pull is pending if artifact remote server available @@ -1724,11 +1732,10 @@ class Element(Plugin): self._update_state() - def _pull_strong(self, *, progress=None): + def _pull_strong(self, *, progress=None, subdir=None, excluded_subdirs=None): weak_key = self._get_cache_key(strength=_KeyStrength.WEAK) - key = self.__strict_cache_key - if not self.__artifacts.pull(self, key, progress=progress): + if not self.__artifacts.pull(self, key, progress=progress, subdir=subdir, excluded_subdirs=excluded_subdirs): return False # update weak ref by pointing it to this newly fetched artifact @@ -1736,10 +1743,10 @@ class Element(Plugin): return True - def _pull_weak(self, *, progress=None): + def _pull_weak(self, *, progress=None, subdir=None, excluded_subdirs=None): weak_key = self._get_cache_key(strength=_KeyStrength.WEAK) - - if not self.__artifacts.pull(self, weak_key, progress=progress): + if not self.__artifacts.pull(self, weak_key, progress=progress, subdir=subdir, + excluded_subdirs=excluded_subdirs): return False # extract strong cache key from this newly fetched artifact @@ -1757,17 +1764,17 @@ class Element(Plugin): # # Returns: True if the artifact has been downloaded, False otherwise # - def _pull(self): + def _pull(self, subdir=None, excluded_subdirs=None): context = self._get_context() def progress(percent, message): self.status(message) # Attempt to pull artifact without knowing whether it's available - pulled = self._pull_strong(progress=progress) + pulled = self._pull_strong(progress=progress, subdir=subdir, excluded_subdirs=excluded_subdirs) if not pulled and not self._cached() and not context.get_strict(): - pulled = self._pull_weak(progress=progress) + pulled = self._pull_weak(progress=progress, subdir=subdir, excluded_subdirs=excluded_subdirs) if not pulled: return False @@ -1790,10 +1797,21 @@ class Element(Plugin): if not self._cached(): return True - # Do not push tained artifact + # Do not push tainted artifact if self.__get_tainted(): return True + # Do not push elements that have a dangling buildtree artifact unless the element type is + # expected to have an empty buildtree directory + if not self.__artifacts.contains_subdir_artifact(self, self.__strict_cache_key, 'buildtree'): + return True + + # strict_cache_key can't be relied on to be available when running in non-strict mode + context = self._get_context() + if not context.get_strict(): + if not self.__artifacts.contains_subdir_artifact(self, self.__weak_cache_key, 'buildtree'): + return True + return False # _push(): diff --git a/tests/completions/completions.py b/tests/completions/completions.py index 50b41f7b30..e4412b5b97 100644 --- a/tests/completions/completions.py +++ b/tests/completions/completions.py @@ -103,7 +103,7 @@ def test_commands(cli, cmd, word_idx, expected): ('bst --no-colors build -', 3, ['--all ', '--track ', '--track-all ', '--track-except ', '--track-cross-junctions ', '-J ', - 
'--track-save ']), + '--track-save ', '--pull-buildtrees ']), # Test the behavior of completing after an option that has a # parameter that cannot be completed, vs an option that has diff --git a/tests/integration/pullbuildtrees.py b/tests/integration/pullbuildtrees.py new file mode 100644 index 0000000000..b07b3823b1 --- /dev/null +++ b/tests/integration/pullbuildtrees.py @@ -0,0 +1,112 @@ +import os +import shutil +import pytest + +from tests.testutils import cli_integration as cli, create_artifact_share +from tests.testutils.integration import assert_contains + + +DATA_DIR = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "project" ) + + +# Remove artifact cache & set cli.config value of pullbuildtrees +# to false, which is the default user context +def default_state(cli, integration_cache, share): + # cache_dir = os.path.join(integration_cache, 'artifacts') + # Is this leaving behind the buildtree dir object? + # Might need to nuke the whole /artifacts dir instead + # cli.remove_artifact_from_cache(project, element_name, cache_dir=cache_dir) + shutil.rmtree(os.path.join(integration_cache, 'artifacts')) + cli.configure({'pullbuildtrees': False, 'artifacts': {'url': share.repo, 'push': False}}) + + +# A test to capture the integration of the pullbuildtrees +# behaviour, which by default is to not include the buildtree +# directory of an element +@pytest.mark.integration +@pytest.mark.datafiles(DATA_DIR) +def test_pullbuildtrees(cli, tmpdir, datafiles, integration_cache): + project = os.path.join(datafiles.dirname, datafiles.basename) + element_name = 'autotools/amhello.bst' + + # Create artifact shares for pull & push testing + with create_artifact_share(os.path.join(str(tmpdir), 'remote1')) as share1,\ + create_artifact_share(os.path.join(str(tmpdir), 'remote2')) as share2: + cli.configure({ + 'artifacts': {'url': share1.repo, 'push': True}, + }) + + # Build autotools element, check it is pushed, delete local + result = cli.run(project=project, args=['build', element_name]) + assert result.exit_code == 0 + assert cli.get_element_state(project, element_name) == 'cached' + assert share1.has_artifact('test', element_name, cli.get_element_key(project, element_name)) + default_state(cli, integration_cache, share1) + + # Pull artifact with default config, assert that pulling again + # doesn't create a pull job, then assert that pulling with buildtrees + # set in the user config creates a pull job. + result = cli.run(project=project, args=['pull', element_name]) + assert element_name in result.get_pulled_elements() + result = cli.run(project=project, args=['pull', element_name]) + assert element_name not in result.get_pulled_elements() + cli.configure({'pullbuildtrees': True}) + result = cli.run(project=project, args=['pull', element_name]) + assert element_name in result.get_pulled_elements() + default_state(cli, integration_cache, share1) + + # Pull artifact with default config, then assert that pulling + # with buildtrees cli flag set creates a pull job. + result = cli.run(project=project, args=['pull', element_name]) + assert element_name in result.get_pulled_elements() + result = cli.run(project=project, args=['pull', '--pull-buildtrees', element_name]) + assert element_name in result.get_pulled_elements() + default_state(cli, integration_cache, share1) + + # Pull artifact with pullbuildtrees set in user config, then assert + # that pulling with the same user config doesn't create a pull job, + # nor when the buildtrees cli flag is set. 
+ cli.configure({'pullbuildtrees': True}) + result = cli.run(project=project, args=['pull', element_name]) + assert element_name in result.get_pulled_elements() + result = cli.run(project=project, args=['pull', element_name]) + assert element_name not in result.get_pulled_elements() + result = cli.run(project=project, args=['pull', '--pull-buildtrees', element_name]) + assert element_name not in result.get_pulled_elements() + default_state(cli, integration_cache, share1) + + # Pull artifact with default config and buildtrees cli flag set, then assert + # that pulling with pullbuildtrees set in user config doesn't create a pull + # job. + result = cli.run(project=project, args=['pull', '--pull-buildtrees', element_name]) + assert element_name in result.get_pulled_elements() + cli.configure({'pullbuildtrees': True}) + result = cli.run(project=project, args=['pull', element_name]) + assert element_name not in result.get_pulled_elements() + default_state(cli, integration_cache, share1) + + # Assert that a partial build element (not containing a populated buildtree dir) + # can't be pushed to an artifact share, then assert that a complete build element + # can be. This will attempt a partial pull from share1 and then a partial push + # to share2 + result = cli.run(project=project, args=['pull', element_name]) + assert element_name in result.get_pulled_elements() + cli.configure({'artifacts': {'url': share2.repo, 'push': True}}) + result = cli.run(project=project, args=['push', element_name]) + assert element_name not in result.get_pushed_elements() + assert not share2.has_artifact('test', element_name, cli.get_element_key(project, element_name)) + + # Assert that after pulling the missing buildtree the element artifact can be + # successfully pushed to the remote. This will attempt to pull the buildtree + # from share1 and then a 'complete' push to share2 + cli.configure({'artifacts': {'url': share1.repo, 'push': False}}) + result = cli.run(project=project, args=['pull', '--pull-buildtrees', element_name]) + assert element_name in result.get_pulled_elements() + cli.configure({'artifacts': {'url': share2.repo, 'push': True}}) + result = cli.run(project=project, args=['push', element_name]) + assert element_name in result.get_pushed_elements() + assert share2.has_artifact('test', element_name, cli.get_element_key(project, element_name)) + default_state(cli, integration_cache, share1) diff --git a/tests/testutils/artifactshare.py b/tests/testutils/artifactshare.py index d956672272..e5abd3908e 100644 --- a/tests/testutils/artifactshare.py +++ b/tests/testutils/artifactshare.py @@ -128,7 +128,7 @@ class ArtifactShare(): valid_chars = string.digits + string.ascii_letters + '-._' element_name = ''.join([ - x if x in valid_chars else '_' + x if x in valid_chars else '-' for x in element_name ]) artifact_key = '{0}/{1}/{2}'.format(project_name, element_name, cache_key) -- GitLab
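For context, a minimal standalone sketch (not part of the patch) of how the new option is resolved into the subdir/exclusion arguments that PullQueue passes down through element._pull() to CASCache.pull(). The helper name resolve_pull_options is hypothetical and exists only for illustration; in the patch itself the equivalent logic is split across Stream.build(), Stream.pull() and PullQueue.__init__(), and it assumes "buildtree" remains the only subdir excluded by default.

# Illustrative sketch only, mirroring the behaviour introduced above.
def resolve_pull_options(cli_pull_buildtrees, context_pullbuildtrees):
    # Either the --pull-buildtrees cli flag or the 'pullbuildtrees' user
    # config option enables pulling of buildtree subdirs.
    pull_buildtrees = cli_pull_buildtrees or context_pullbuildtrees

    # Default exclusions on pull, as set up in PullQueue.__init__()
    excluded_subdirs = ["buildtree"]
    subdir = None
    if pull_buildtrees:
        subdir = "buildtree"
        excluded_subdirs.remove(subdir)

    # These are the values handed to element._pull() and on to CASCache.pull()
    return subdir, excluded_subdirs


if __name__ == "__main__":
    # Default behaviour: the buildtree subdir is excluded from the fetch
    assert resolve_pull_options(False, False) == (None, ["buildtree"])
    # Enabled via cli flag or user config: the buildtree subdir is requested
    assert resolve_pull_options(True, False) == ("buildtree", [])
    assert resolve_pull_options(False, True) == ("buildtree", [])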