From 160bb0c6962efba83918a053ee0a78d4eac349e6 Mon Sep 17 00:00:00 2001
From: Tristan Van Berkom
Date: Fri, 14 Sep 2018 20:03:28 +0900
Subject: _artifactcache/artifactcache.py: Changes to mark_required_elements()

This was previously append_required_artifacts(), which presumed that we
knew at startup time what the cache keys of all elements in the loaded
pipeline would be.

This fixes unexpected deletions of required artifacts when dynamic
tracking is enabled with `bst build --track-all target.bst`.
---
 buildstream/_artifactcache/artifactcache.py | 57 +++++++++++++++++++----------
 buildstream/_stream.py                      |  9 ++---
 buildstream/element.py                      | 19 ++++++----
 3 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/buildstream/_artifactcache/artifactcache.py b/buildstream/_artifactcache/artifactcache.py
index 9ac68b102..45293e345 100644
--- a/buildstream/_artifactcache/artifactcache.py
+++ b/buildstream/_artifactcache/artifactcache.py
@@ -87,7 +87,7 @@ class ArtifactCache():
         self.global_remote_specs = []
         self.project_remote_specs = {}
 
-        self._required_artifacts = set()      # The artifacts required for this session
+        self._required_elements = set()       # The elements required for this session
         self._cache_size = None               # The current cache size, sometimes it's an estimate
         self._cache_quota = None              # The cache quota
         self._cache_lower_threshold = None    # The target cache size for a cleanup
@@ -189,33 +189,40 @@ class ArtifactCache():
                                     (str(provenance)))
         return cache_specs
 
-    # append_required_artifacts():
+    # mark_required_elements():
     #
-    # Append to the list of elements whose artifacts are required for
-    # the current run. Artifacts whose elements are in this list will
-    # be locked by the artifact cache and not touched for the duration
-    # of the current pipeline.
+    # Mark elements whose artifacts are required for the current run.
+    #
+    # Artifacts whose elements are in this list will be locked by the artifact
+    # cache and not touched for the duration of the current pipeline.
     #
     # Args:
     #     elements (iterable): A set of elements to mark as required
     #
-    def append_required_artifacts(self, elements):
-        # We lock both strong and weak keys - deleting one but not the
-        # other won't save space in most cases anyway, but would be a
-        # user inconvenience.
+    def mark_required_elements(self, elements):
+
+        # We risk calling this function with a generator, so we
+        # had better consume it first.
+        #
+        elements = list(elements)
+
+        # Mark the elements as required. We may not know all of the
+        # cache keys yet, so we only check the keys later, when deleting.
+        #
+        self._required_elements.update(elements)
 
+        # For the cache keys which have been resolved so far, we bump
+        # their atimes.
+        #
+        # This is done in case concurrent instances of BuildStream are
+        # running with the same artifact cache; it reduces the likelihood
+        # of one instance deleting artifacts which are required by the
+        # other.
         for element in elements:
             strong_key = element._get_cache_key(strength=_KeyStrength.STRONG)
             weak_key = element._get_cache_key(strength=_KeyStrength.WEAK)
             for key in (strong_key, weak_key):
-                if key and key not in self._required_artifacts:
-                    self._required_artifacts.add(key)
-
-                    # We also update the usage times of any artifacts
-                    # we will be using, which helps preventing a
-                    # buildstream process that runs in parallel with
-                    # this one from removing artifacts in-use.
+                if key:
                     try:
                         self.update_atime(key)
                     except ArtifactError:
                         pass
@@ -231,6 +238,18 @@ class ArtifactCache():
     def clean(self):
         artifacts = self.list_artifacts()
 
+        # Build a set of the cache keys which are required,
+        # based on the required elements at cleanup time.
+        #
+        # We lock both strong and weak keys - deleting one but not the
+        # other won't save space, but would be a user inconvenience.
+        required_artifacts = set()
+        for element in self._required_elements:
+            required_artifacts.update([
+                element._get_cache_key(strength=_KeyStrength.STRONG),
+                element._get_cache_key(strength=_KeyStrength.WEAK)
+            ])
+
         # Do a real computation of the cache size once, just in case
         self.compute_cache_size()
 
@@ -256,7 +275,7 @@
                     break
 
             key = to_remove.rpartition('/')[2]
-            if key not in self._required_artifacts:
+            if key not in required_artifacts:
 
                 # Remove the actual artifact, if it's not required.
                 size = self.remove(to_remove)

diff --git a/buildstream/_stream.py b/buildstream/_stream.py
index cceb3d3a5..2e78a1c45 100644
--- a/buildstream/_stream.py
+++ b/buildstream/_stream.py
@@ -938,13 +938,10 @@ class Stream():
         # Set the "required" artifacts that should not be removed
         # while this pipeline is active
         #
-        # FIXME: The set of required artifacts is only really needed
-        #        for build and pull tasks.
+        # It must include all the artifacts which are required by the
+        # final product. Note that this is a superset of the build plan.
         #
-        # It must include all the artifacts which are required by the
-        # final product. Note that this is a superset of the build plan.
-        #
-        self._artifacts.append_required_artifacts((e for e in self._pipeline.dependencies(elements, Scope.ALL)))
+        self._artifacts.mark_required_elements(self._pipeline.dependencies(elements, Scope.ALL))
 
         if selection == PipelineSelection.PLAN and dynamic_plan:
             # We use a dynamic build plan, only request artifacts of top-level targets,

diff --git a/buildstream/element.py b/buildstream/element.py
index f484c88e5..13d76dbad 100644
--- a/buildstream/element.py
+++ b/buildstream/element.py
@@ -1486,15 +1486,20 @@ class Element(Plugin):
                     workspace.clear_running_files()
                     self._get_context().get_workspaces().save_config()
 
-            # We also need to update the required artifacts, since
-            # workspaced dependencies do not have a fixed cache key
-            # when the build starts.
+            # This element will have already been marked as
+            # required, but we bump the atime again, in case
+            # we did not know the cache key until now.
             #
-            # This does *not* cause a race condition, because
-            # _assemble_done is called before a cleanup job may be
-            # launched.
+            # FIXME: This is not exactly correct; we should be
+            #        doing this at the time when we discover a
+            #        new cache key. This just happens to be the
+            #        last place where that can happen.
             #
-            self.__artifacts.append_required_artifacts([self])
+            # Ultimately, we should be refactoring
+            # Element._update_state() such that we know
+            # when a cache key is actually discovered.
+            #
+            self.__artifacts.mark_required_elements([self])
 
     # _assemble():
     #
--
cgit v1.2.1

From 39125d24668cd717cfb3b22c2d164f2630518a4c Mon Sep 17 00:00:00 2001
From: Tristan Van Berkom
Date: Fri, 14 Sep 2018 17:31:16 +0900
Subject: tests/artifactcache/expiry.py: Cleanup of test for required artifacts

This commit renames test_never_delete_dependencies() to
test_never_delete_required(), renders the test more readable by
renaming some elements and reordering some statements, and makes
the comments more straightforward and accurate.
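For context, the invariant these tests exercise is the one introduced in
the previous commit: the cleanup pass resolves required cache *keys* from
required *elements* at deletion time, so keys discovered mid-session are
still protected. Below is a minimal, self-contained sketch of that policy;
the names used here (Element, keys(), clean()) are illustrative stand-ins
and not the real BuildStream API:

    # Illustrative model only, not BuildStream code.
    class Element():
        def __init__(self, name):
            self.name = name
            self.strong_key = None   # May remain unresolved until tracking completes
            self.weak_key = None

        def keys(self):
            return [key for key in (self.strong_key, self.weak_key) if key is not None]

    def clean(artifacts, required_elements):
        # Resolve the required keys at cleanup time, not at startup,
        # so that keys discovered during the session are respected.
        required_keys = set()
        for element in required_elements:
            required_keys.update(element.keys())
        return [artifact for artifact in artifacts if artifact not in required_keys]

    # A key discovered only after startup (e.g. via `bst build --track-all`)
    # is still protected from deletion:
    dep = Element('dep.bst')
    dep.strong_key = 'abc123'
    assert clean(['abc123', 'stale0'], [dep]) == ['stale0']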
---
 tests/artifactcache/expiry.py | 62 ++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/tests/artifactcache/expiry.py b/tests/artifactcache/expiry.py
index 980710957..162635a05 100644
--- a/tests/artifactcache/expiry.py
+++ b/tests/artifactcache/expiry.py
@@ -196,24 +196,8 @@ def test_keep_dependencies(cli, datafiles, tmpdir):
 
 
 # Assert that we never delete a dependency required for a build tree
-#
-# NOTE: This test expects that a build will fail if it attempts to
-#       put more artifacts in the cache than the quota can hold,
-#       and expects that the last two elements which don't fit into
-#       the quota wont even be built.
-#
-#       In real life, this will not be the case, since once we reach
-#       the estimated quota we launch a cache size calculation job and
-#       only launch a cleanup job when the size is calculated; and
-#       other build tasks will be scheduled while the cache size job
-#       is running.
-#
-#       This test only passes because we configure `builders` to 1,
-#       ensuring that the cache size job runs exclusively since it
-#       also requires a compute resource (a "builder").
-#
 @pytest.mark.datafiles(DATA_DIR)
-def test_never_delete_dependencies(cli, datafiles, tmpdir):
+def test_never_delete_required(cli, datafiles, tmpdir):
     project = os.path.join(datafiles.dirname, datafiles.basename)
     element_path = 'elements'
 
@@ -226,37 +210,37 @@ def test_never_delete_dependencies(cli, datafiles, tmpdir):
         }
     })
 
-    # Create a build tree
-    create_element_size('dependency.bst', project,
-                        element_path, [], 8000000)
-    create_element_size('related.bst', project,
-                        element_path, ['dependency.bst'], 8000000)
-    create_element_size('target.bst', project,
-                        element_path, ['related.bst'], 8000000)
-    create_element_size('target2.bst', project,
-                        element_path, ['target.bst'], 8000000)
+    # Create a linear build tree
+    create_element_size('dep1.bst', project, element_path, [], 8000000)
+    create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 8000000)
+    create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 8000000)
+    create_element_size('target.bst', project, element_path, ['dep3.bst'], 8000000)
 
     # We try to build this pipeline, but it's too big for the
     # cache. Since all elements are required, the build should fail.
-    res = cli.run(project=project, args=['build', 'target2.bst'])
+    res = cli.run(project=project, args=['build', 'target.bst'])
     res.assert_main_error(ErrorDomain.STREAM, None)
 
-    assert cli.get_element_state(project, 'dependency.bst') == 'cached'
-
-    # This is *technically* above the cache limit. BuildStream accepts
-    # some fuzziness, since it's hard to assert that we don't create
-    # an artifact larger than the cache quota. We would have to remove
-    # the artifact after-the-fact, but since it is required for the
-    # current build and nothing broke yet, it's nicer to keep it
-    # around.
+    # Only the first artifact fits in the cache, but we expect
+    # that the first *two* artifacts will be cached.
+    #
+    # This is because after caching the first artifact we must
+    # proceed to build the next artifact, and we cannot really
+    # know how large an artifact will be until we try to cache it.
+    #
+    # In this case, we deem it more acceptable to not delete an
+    # artifact which caused the cache to outgrow the quota.
     #
-    # This scenario is quite unlikely, and the cache overflow will be
-    # resolved if the user does something about it anyway.
+    # Note that this test only works because we have forced
+    # the configuration to build one element at a time; in real
+    # life there may potentially be N builders' worth of cached
+    # artifacts which exceed the quota.
     #
-    assert cli.get_element_state(project, 'related.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep3.bst') != 'cached'
     assert cli.get_element_state(project, 'target.bst') != 'cached'
-    assert cli.get_element_state(project, 'target2.bst') != 'cached'
 
 
 # Ensure that only valid cache quotas make it through the loading
--
cgit v1.2.1

From f60558a328e1803d385756f2f6e24bf385e79041 Mon Sep 17 00:00:00 2001
From: Tristan Van Berkom
Date: Fri, 14 Sep 2018 17:31:35 +0900
Subject: tests/artifactcache/expiry.py: Assert the expected errors

These tests were not checking that we fail for the expected reasons.

Added `res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')`
where we expect to fail because the cache is too full.
---
 tests/artifactcache/expiry.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/artifactcache/expiry.py b/tests/artifactcache/expiry.py
index 162635a05..d998ffdd8 100644
--- a/tests/artifactcache/expiry.py
+++ b/tests/artifactcache/expiry.py
@@ -93,6 +93,7 @@ def test_artifact_too_large(cli, datafiles, tmpdir, size):
     create_element_size('target.bst', project, element_path, [], size)
     res = cli.run(project=project, args=['build', 'target.bst'])
     res.assert_main_error(ErrorDomain.STREAM, None)
+    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
 
 
 @pytest.mark.datafiles(DATA_DIR)
@@ -220,6 +221,7 @@ def test_never_delete_required(cli, datafiles, tmpdir):
     # cache. Since all elements are required, the build should fail.
     res = cli.run(project=project, args=['build', 'target.bst'])
     res.assert_main_error(ErrorDomain.STREAM, None)
+    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
 
     # Only the first artifact fits in the cache, but we expect
     # that the first *two* artifacts will be cached.
--
cgit v1.2.1

From ce68fd2755db2cad98a9498efb289f9a7d8899d3 Mon Sep 17 00:00:00 2001
From: Tristan Van Berkom
Date: Fri, 14 Sep 2018 18:46:27 +0900
Subject: testutils/repo/git.py: Added modify_file() method

This allows one to modify a file in an existing git repo, as opposed
to adding a new one.
---
 tests/testutils/repo/git.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/testutils/repo/git.py b/tests/testutils/repo/git.py
index 8c2a8c177..c598eb4d0 100644
--- a/tests/testutils/repo/git.py
+++ b/tests/testutils/repo/git.py
@@ -52,6 +52,13 @@ class Git(Repo):
         self._run_git('commit', '-m', 'Added {}'.format(os.path.basename(filename)))
         return self.latest_commit()
 
+    def modify_file(self, new_file, path):
+        shutil.copy(new_file, os.path.join(self.repo, path))
+        subprocess.call([
+            'git', 'commit', path, '-m', 'Modified {}'.format(os.path.basename(path))
+        ], env=GIT_ENV, cwd=self.repo)
+        return self.latest_commit()
+
     def add_submodule(self, subdir, url=None, checkout=None):
         submodule = {}
         if checkout is not None:
--
cgit v1.2.1

From 532ec1eb2a8d43626d8864e49e87b75454f4485e Mon Sep 17 00:00:00 2001
From: Tristan Van Berkom
Date: Fri, 14 Sep 2018 18:47:35 +0900
Subject: testutils/element_generators.py: Changing sized element functions

  * create_element_size()

    Now uses a git Repo object instead of a local source, and
    returns the repo.
  * update_element_size()

    Added this function, which can resize the expected output of an
    element generated with create_element_size(); useful to allow
    testing sized elements with the tracking feature.
---
 tests/testutils/__init__.py           |  2 +-
 tests/testutils/element_generators.py | 93 ++++++++++++++++++++++++++++-------
 2 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/tests/testutils/__init__.py b/tests/testutils/__init__.py
index c2fae1cc4..4a79c3be2 100644
--- a/tests/testutils/__init__.py
+++ b/tests/testutils/__init__.py
@@ -26,6 +26,6 @@ from .runcli import cli, cli_integration
 from .repo import create_repo, ALL_REPO_KINDS
 from .artifactshare import create_artifact_share
-from .element_generators import create_element_size
+from .element_generators import create_element_size, update_element_size
 from .junction import generate_junction
 from .runner_integration import wait_for_cache_granularity

diff --git a/tests/testutils/element_generators.py b/tests/testutils/element_generators.py
index 49f235c61..448c8571a 100644
--- a/tests/testutils/element_generators.py
+++ b/tests/testutils/element_generators.py
@@ -1,41 +1,100 @@
 import os
 
 from buildstream import _yaml
+from buildstream import utils
+
+from . import create_repo
 
 
 # create_element_size()
 #
-# This will open a "<name>_data" file for writing and write
-# <size> MB of urandom (/dev/urandom) "stuff" into the file.
-# A bst import element file is then created: <name>.bst
+# Creates an import element with a git repo, using random
+# data to create a file in that repo of the specified size,
+# such that building it will add an artifact of the specified
+# size to the artifact cache.
 #
 # Args:
-#    name: (str) of the element name (e.g. target.bst)
-#    path: (str) pathway to the project/elements directory
-#    dependencies: A list of strings (can also be an empty list)
-#    size: (int) size of the element in bytes
+#    name: (str) of the element name (e.g. target.bst)
+#    project_dir (str): The path to the project
+#    element_path (str): The element path within the project
+#    dependencies: A list of strings (can also be an empty list)
+#    size: (int) size of the element in bytes
 #
 # Returns:
-#    Nothing (creates a <name>.bst file of specified size)
+#    (Repo): A git repo which can be used to introduce trackable changes
+#            by using the update_element_size() function below.
 #
 def create_element_size(name, project_dir, elements_path, dependencies, size):
     full_elements_path = os.path.join(project_dir, elements_path)
     os.makedirs(full_elements_path, exist_ok=True)
 
-    # Create a file to be included in this element's artifact
-    with open(os.path.join(project_dir, name + '_data'), 'wb+') as f:
-        f.write(os.urandom(size))
+    # Create a git repo
+    repodir = os.path.join(project_dir, 'repos')
+    repo = create_repo('git', repodir, subdir=name)
+
+    with utils._tempdir(dir=project_dir) as tmp:
+
+        # We use a data/ subdir in the git repo we create,
+        # and we set the import element to only extract that
+        # part; this ensures we never include a .git/ directory
+        # in the cached artifacts for these sized elements.
+        #
+        datadir = os.path.join(tmp, 'data')
+        os.makedirs(datadir)
+
+        # Use /dev/urandom to create the sized file in the datadir
+        with open(os.path.join(datadir, name), 'wb+') as f:
+            f.write(os.urandom(size))
+
+        # Create the git repo from the temp directory
+        ref = repo.create(tmp)
 
-    # Simplest case: We want this file (of specified size) to just
-    # be an import element.
     element = {
         'kind': 'import',
         'sources': [
-            {
-                'kind': 'local',
-                'path': name + '_data'
-            }
+            repo.source_config(ref=ref)
         ],
+        'config': {
+            # Extract only the data directory
+            'source': 'data'
+        },
         'depends': dependencies
     }
     _yaml.dump(element, os.path.join(project_dir, elements_path, name))
+
+    # Return the repo, so that it can later be used to add commits
+    return repo
+
+
+# update_element_size()
+#
+# Updates a repo returned by create_element_size(), adding a new
+# commit which completely changes the file content, so that the
+# element now generates output of the newly specified size.
+#
+# The name and project_dir arguments must match the arguments
+# previously given to create_element_size().
+#
+# Args:
+#    name: (str) of the element name (e.g. target.bst)
+#    project_dir (str): The path to the project
+#    repo: (Repo) The Repo returned by create_element_size()
+#    size: (int) The new size which the element generates, in bytes
+#
+def update_element_size(name, project_dir, repo, size):
+
+    with utils._tempdir(dir=project_dir) as tmp:
+
+        new_file = os.path.join(tmp, name)
+
+        # Use /dev/urandom to create the sized file in the datadir
+        with open(new_file, 'wb+') as f:
+            f.write(os.urandom(size))
+
+        # Modify the git repo with a new commit to the same path,
+        # replacing the original file with a new one.
+        repo.modify_file(new_file, os.path.join('data', name))
--
cgit v1.2.1

From 20b797cbf015be982b67e36709173941fbc16871 Mon Sep 17 00:00:00 2001
From: Tristan Van Berkom
Date: Fri, 14 Sep 2018 20:08:11 +0900
Subject: tests/artifactcache/expiry.py: Added test_never_delete_required_track()

Same test as test_never_delete_required(), except that this test
ensures that we never delete required artifacts when their cache keys
are discovered dynamically during the build.
---
 tests/artifactcache/expiry.py | 58 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/tests/artifactcache/expiry.py b/tests/artifactcache/expiry.py
index d998ffdd8..0ac86b91f 100644
--- a/tests/artifactcache/expiry.py
+++ b/tests/artifactcache/expiry.py
@@ -24,7 +24,7 @@
 import pytest
 
 from buildstream import _yaml
 from buildstream._exceptions import ErrorDomain, LoadErrorReason
-from tests.testutils import cli, create_element_size, wait_for_cache_granularity
+from tests.testutils import cli, create_element_size, update_element_size, wait_for_cache_granularity
 
 
 DATA_DIR = os.path.join(
@@ -245,6 +245,62 @@ def test_never_delete_required(cli, datafiles, tmpdir):
     assert cli.get_element_state(project, 'target.bst') != 'cached'
 
 
+# Assert that we never delete a dependency required for a build tree,
+# even when the artifact cache was previously populated with
+# artifacts we do not require, and the new build is run with dynamic tracking.
+#
+@pytest.mark.datafiles(DATA_DIR)
+def test_never_delete_required_track(cli, datafiles, tmpdir):
+    project = os.path.join(datafiles.dirname, datafiles.basename)
+    element_path = 'elements'
+
+    cli.configure({
+        'cache': {
+            'quota': 10000000
+        },
+        'scheduler': {
+            'builders': 1
+        }
+    })
+
+    # Create a linear build tree
+    repo_dep1 = create_element_size('dep1.bst', project, element_path, [], 2000000)
+    repo_dep2 = create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 2000000)
+    repo_dep3 = create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 2000000)
+    repo_target = create_element_size('target.bst', project, element_path, ['dep3.bst'], 2000000)
+
+    # This should all fit into the artifact cache
+    res = cli.run(project=project, args=['build', 'target.bst'])
+    res.assert_success()
+
+    # They should all be cached
+    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep3.bst') == 'cached'
+    assert cli.get_element_state(project, 'target.bst') == 'cached'
+
+    # Now increase the size of all the elements
+    #
+    update_element_size('dep1.bst', project, repo_dep1, 8000000)
+    update_element_size('dep2.bst', project, repo_dep2, 8000000)
+    update_element_size('dep3.bst', project, repo_dep3, 8000000)
+    update_element_size('target.bst', project, repo_target, 8000000)
+
+    # Now repeat the same test we did in test_never_delete_required(),
+    # except this time with dynamic tracking enabled.
+    #
+    res = cli.run(project=project, args=['build', '--track-all', 'target.bst'])
+    res.assert_main_error(ErrorDomain.STREAM, None)
+    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
+
+    # Expect the same result as in test_never_delete_required().
+    #
+    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep3.bst') != 'cached'
+    assert cli.get_element_state(project, 'target.bst') != 'cached'
+
+
 # Ensure that only valid cache quotas make it through the loading
 # process.
 @pytest.mark.parametrize("quota,success", [
--
cgit v1.2.1
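A closing note on the atime-bumping strategy this series relies on:
mark_required_elements() calls update_atime() for each resolved key so
that a concurrent BuildStream instance, expiring artifacts in least
recently used order, is less likely to delete artifacts in use here.
As a rough sketch, such a helper could look like the following; the
path layout and helper shape are assumptions for illustration and may
not match BuildStream's actual update_atime() implementation:

    import os
    import time

    # Sketch: bump only the access time of a cached artifact, preserving
    # its modification time, so that LRU-ordered expiry considers the
    # artifact recently used.
    def update_atime(artifact_path):
        st = os.stat(artifact_path)
        os.utime(artifact_path, (time.time(), st.st_mtime))

This is advisory only; two racing processes can still disagree about
what is required, which is why the required set is also consulted
before each removal during clean().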