author    | Tristan Van Berkom <tristan.van.berkom@gmail.com> | 2018-09-14 12:44:42 +0000
committer | Tristan Van Berkom <tristan.van.berkom@gmail.com> | 2018-09-14 12:44:42 +0000
commit    | c2af0d5108c333cafd877670e86f61c84238c8f9 (patch)
tree      | ad6c76e94109ade6529f875b055dadbcaefb5d89
parent    | d7152ef47c73c8f56812d132d317f14a6665523a (diff)
parent    | 20b797cbf015be982b67e36709173941fbc16871 (diff)
download  | buildstream-c2af0d5108c333cafd877670e86f61c84238c8f9.tar.gz
Merge branch 'tristan/fix-required-artifacts' into 'master'
Don't delete required artifacts when tracking is enabled
See merge request BuildStream/buildstream!793
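
In short, the change stops recording resolved cache *keys* up front (`append_required_artifacts()`) and instead records required *elements* (`mark_required_elements()`), resolving their keys only at cleanup time, so an element whose key is only discovered later (for example after tracking) still protects its artifact. Below is a minimal sketch of that idea, not BuildStream's actual code; the `ToyCache` class and the `cache_keys()` accessor are hypothetical stand-ins:

```python
# Toy model of the fix: record required elements, resolve their cache
# keys only when cleaning. cache_keys() is a hypothetical stand-in for
# the strong/weak keys an element resolves as the session progresses.
class ToyCache:
    def __init__(self):
        self.artifacts = {}              # cache key -> artifact
        self._required_elements = set()

    def mark_required_elements(self, elements):
        # Consume a possible generator before storing it
        self._required_elements.update(list(elements))

    def clean(self):
        # Keys are computed here, at cleanup time, so an element whose
        # key changed after tracking still protects its new artifact
        required_keys = {
            key
            for element in self._required_elements
            for key in element.cache_keys()
            if key is not None
        }
        for key in list(self.artifacts):
            if key not in required_keys:
                del self.artifacts[key]
```

Keying the "required" set by element rather than by resolved key is what makes the tracking case safe: tracking changes the element's ref, and therefore its cache key, after the pipeline has already marked its requirements.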
-rw-r--r-- | buildstream/_artifactcache/artifactcache.py |  57
-rw-r--r-- | buildstream/_stream.py                      |   9
-rw-r--r-- | buildstream/element.py                      |  19
-rw-r--r-- | tests/artifactcache/expiry.py               | 120
-rw-r--r-- | tests/testutils/__init__.py                 |   2
-rw-r--r-- | tests/testutils/element_generators.py       |  93
-rw-r--r-- | tests/testutils/repo/git.py                 |   7
7 files changed, 218 insertions, 89 deletions
diff --git a/buildstream/_artifactcache/artifactcache.py b/buildstream/_artifactcache/artifactcache.py
index 9ac68b102..45293e345 100644
--- a/buildstream/_artifactcache/artifactcache.py
+++ b/buildstream/_artifactcache/artifactcache.py
@@ -87,7 +87,7 @@ class ArtifactCache():
         self.global_remote_specs = []
         self.project_remote_specs = {}
 
-        self._required_artifacts = set()      # The artifacts required for this session
+        self._required_elements = set()       # The elements required for this session
         self._cache_size = None               # The current cache size, sometimes it's an estimate
         self._cache_quota = None              # The cache quota
         self._cache_lower_threshold = None    # The target cache size for a cleanup
@@ -189,33 +189,40 @@ class ArtifactCache():
                                     (str(provenance)))
         return cache_specs
 
-    # append_required_artifacts():
+    # mark_required_elements():
     #
-    # Append to the list of elements whose artifacts are required for
-    # the current run. Artifacts whose elements are in this list will
-    # be locked by the artifact cache and not touched for the duration
-    # of the current pipeline.
+    # Mark elements whose artifacts are required for the current run.
+    #
+    # Artifacts whose elements are in this list will be locked by the artifact
+    # cache and not touched for the duration of the current pipeline.
     #
     # Args:
     #     elements (iterable): A set of elements to mark as required
     #
-    def append_required_artifacts(self, elements):
-        # We lock both strong and weak keys - deleting one but not the
-        # other won't save space in most cases anyway, but would be a
-        # user inconvenience.
+    def mark_required_elements(self, elements):
+
+        # We risk calling this function with a generator, so we
+        # better consume it first.
+        #
+        elements = list(elements)
+
+        # Mark the elements as required. We cannot know that we know the
+        # cache keys yet, so we only check that later when deleting.
+        #
+        self._required_elements.update(elements)
+
+        # For the cache keys which were resolved so far, we bump
+        # the atime of them.
+        #
+        # This is just in case we have concurrent instances of
+        # BuildStream running with the same artifact cache, it will
+        # reduce the likelyhood of one instance deleting artifacts
+        # which are required by the other.
         for element in elements:
             strong_key = element._get_cache_key(strength=_KeyStrength.STRONG)
             weak_key = element._get_cache_key(strength=_KeyStrength.WEAK)
             for key in (strong_key, weak_key):
-                if key and key not in self._required_artifacts:
-                    self._required_artifacts.add(key)
-
-                    # We also update the usage times of any artifacts
-                    # we will be using, which helps preventing a
-                    # buildstream process that runs in parallel with
-                    # this one from removing artifacts in-use.
+                if key:
                     try:
                         self.update_atime(key)
                     except ArtifactError:
@@ -231,6 +238,18 @@ class ArtifactCache():
     def clean(self):
         artifacts = self.list_artifacts()
 
+        # Build a set of the cache keys which are required
+        # based on the required elements at cleanup time
+        #
+        # We lock both strong and weak keys - deleting one but not the
+        # other won't save space, but would be a user inconvenience.
+        required_artifacts = set()
+        for element in self._required_elements:
+            required_artifacts.update([
+                element._get_cache_key(strength=_KeyStrength.STRONG),
+                element._get_cache_key(strength=_KeyStrength.WEAK)
+            ])
+
         # Do a real computation of the cache size once, just in case
         self.compute_cache_size()
@@ -256,7 +275,7 @@ class ArtifactCache():
                 break
 
             key = to_remove.rpartition('/')[2]
-            if key not in self._required_artifacts:
+            if key not in required_artifacts:
 
                 # Remove the actual artifact, if it's not required.
                 size = self.remove(to_remove)
diff --git a/buildstream/_stream.py b/buildstream/_stream.py
index cceb3d3a5..2e78a1c45 100644
--- a/buildstream/_stream.py
+++ b/buildstream/_stream.py
@@ -938,13 +938,10 @@ class Stream():
         # Set the "required" artifacts that should not be removed
         # while this pipeline is active
         #
-        # FIXME: The set of required artifacts is only really needed
-        #        for build and pull tasks.
+        # It must include all the artifacts which are required by the
+        # final product. Note that this is a superset of the build plan.
         #
-        #        It must include all the artifacts which are required by the
-        #        final product. Note that this is a superset of the build plan.
-        #
-        self._artifacts.append_required_artifacts((e for e in self._pipeline.dependencies(elements, Scope.ALL)))
+        self._artifacts.mark_required_elements(self._pipeline.dependencies(elements, Scope.ALL))
 
         if selection == PipelineSelection.PLAN and dynamic_plan:
             # We use a dynamic build plan, only request artifacts of top-level targets,
diff --git a/buildstream/element.py b/buildstream/element.py
index f484c88e5..13d76dbad 100644
--- a/buildstream/element.py
+++ b/buildstream/element.py
@@ -1486,15 +1486,20 @@ class Element(Plugin):
                 workspace.clear_running_files()
                 self._get_context().get_workspaces().save_config()
 
-            # We also need to update the required artifacts, since
-            # workspaced dependencies do not have a fixed cache key
-            # when the build starts.
+            # This element will have already been marked as
+            # required, but we bump the atime again, in case
+            # we did not know the cache key until now.
             #
-            # This does *not* cause a race condition, because
-            # _assemble_done is called before a cleanup job may be
-            # launched.
+            # FIXME: This is not exactly correct, we should be
+            #        doing this at the time which we have discovered
+            #        a new cache key, this just happens to be the
+            #        last place where that can happen.
             #
-            self.__artifacts.append_required_artifacts([self])
+            #        Ultimately, we should be refactoring
+            #        Element._update_state() such that we know
+            #        when a cache key is actually discovered.
+            #
+            self.__artifacts.mark_required_elements([self])
 
     # _assemble():
     #
diff --git a/tests/artifactcache/expiry.py b/tests/artifactcache/expiry.py
index 980710957..0ac86b91f 100644
--- a/tests/artifactcache/expiry.py
+++ b/tests/artifactcache/expiry.py
@@ -24,7 +24,7 @@ import pytest
 from buildstream import _yaml
 from buildstream._exceptions import ErrorDomain, LoadErrorReason
 
-from tests.testutils import cli, create_element_size, wait_for_cache_granularity
+from tests.testutils import cli, create_element_size, update_element_size, wait_for_cache_granularity
 
 
 DATA_DIR = os.path.join(
@@ -93,6 +93,7 @@ def test_artifact_too_large(cli, datafiles, tmpdir, size):
     create_element_size('target.bst', project, element_path, [], size)
     res = cli.run(project=project, args=['build', 'target.bst'])
     res.assert_main_error(ErrorDomain.STREAM, None)
+    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
 
 
 @pytest.mark.datafiles(DATA_DIR)
@@ -196,24 +197,8 @@ def test_keep_dependencies(cli, datafiles, tmpdir):
 
 
 # Assert that we never delete a dependency required for a build tree
-#
-# NOTE: This test expects that a build will fail if it attempts to
-#       put more artifacts in the cache than the quota can hold,
-#       and expects that the last two elements which don't fit into
-#       the quota wont even be built.
-#
-#       In real life, this will not be the case, since once we reach
-#       the estimated quota we launch a cache size calculation job and
-#       only launch a cleanup job when the size is calculated; and
-#       other build tasks will be scheduled while the cache size job
-#       is running.
-#
-#       This test only passes because we configure `builders` to 1,
-#       ensuring that the cache size job runs exclusively since it
-#       also requires a compute resource (a "builder").
-#
 @pytest.mark.datafiles(DATA_DIR)
-def test_never_delete_dependencies(cli, datafiles, tmpdir):
+def test_never_delete_required(cli, datafiles, tmpdir):
     project = os.path.join(datafiles.dirname, datafiles.basename)
     element_path = 'elements'
 
@@ -226,37 +211,94 @@ def test_never_delete_dependencies(cli, datafiles, tmpdir):
         }
     })
 
-    # Create a build tree
-    create_element_size('dependency.bst', project,
-                        element_path, [], 8000000)
-    create_element_size('related.bst', project,
-                        element_path, ['dependency.bst'], 8000000)
-    create_element_size('target.bst', project,
-                        element_path, ['related.bst'], 8000000)
-    create_element_size('target2.bst', project,
-                        element_path, ['target.bst'], 8000000)
+    # Create a linear build tree
+    create_element_size('dep1.bst', project, element_path, [], 8000000)
+    create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 8000000)
+    create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 8000000)
+    create_element_size('target.bst', project, element_path, ['dep3.bst'], 8000000)
 
     # We try to build this pipeline, but it's too big for the
     # cache. Since all elements are required, the build should fail.
-    res = cli.run(project=project, args=['build', 'target2.bst'])
+    res = cli.run(project=project, args=['build', 'target.bst'])
     res.assert_main_error(ErrorDomain.STREAM, None)
+    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
 
-    assert cli.get_element_state(project, 'dependency.bst') == 'cached'
+    # Only the first artifact fits in the cache, but we expect
+    # that the first *two* artifacts will be cached.
+    #
+    # This is because after caching the first artifact we must
+    # proceed to build the next artifact, and we cannot really
+    # know how large an artifact will be until we try to cache it.
+    #
+    # In this case, we deem it more acceptable to not delete an
+    # artifact which caused the cache to outgrow the quota.
+    #
+    # Note that this test only works because we have forced
+    # the configuration to build one element at a time, in real
+    # life there may potentially be N-builders cached artifacts
+    # which exceed the quota
+    #
+    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+
+    assert cli.get_element_state(project, 'dep3.bst') != 'cached'
+    assert cli.get_element_state(project, 'target.bst') != 'cached'
+
+
+# Assert that we never delete a dependency required for a build tree,
+# even when the artifact cache was previously populated with
+# artifacts we do not require, and the new build is run with dynamic tracking.
+#
+@pytest.mark.datafiles(DATA_DIR)
+def test_never_delete_required_track(cli, datafiles, tmpdir):
+    project = os.path.join(datafiles.dirname, datafiles.basename)
+    element_path = 'elements'
+
+    cli.configure({
+        'cache': {
+            'quota': 10000000
+        },
+        'scheduler': {
+            'builders': 1
+        }
+    })
 
-    # This is *technically* above the cache limit. BuildStream accepts
-    # some fuzziness, since it's hard to assert that we don't create
-    # an artifact larger than the cache quota. We would have to remove
-    # the artifact after-the-fact, but since it is required for the
-    # current build and nothing broke yet, it's nicer to keep it
-    # around.
+    # Create a linear build tree
+    repo_dep1 = create_element_size('dep1.bst', project, element_path, [], 2000000)
+    repo_dep2 = create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 2000000)
+    repo_dep3 = create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 2000000)
+    repo_target = create_element_size('target.bst', project, element_path, ['dep3.bst'], 2000000)
+
+    # This should all fit into the artifact cache
+    res = cli.run(project=project, args=['build', 'target.bst'])
+    res.assert_success()
+
+    # They should all be cached
+    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep3.bst') == 'cached'
+    assert cli.get_element_state(project, 'target.bst') == 'cached'
+
+    # Now increase the size of all the elements
     #
-    # This scenario is quite unlikely, and the cache overflow will be
-    # resolved if the user does something about it anyway.
+    update_element_size('dep1.bst', project, repo_dep1, 8000000)
+    update_element_size('dep2.bst', project, repo_dep2, 8000000)
+    update_element_size('dep3.bst', project, repo_dep3, 8000000)
+    update_element_size('target.bst', project, repo_target, 8000000)
+
+    # Now repeat the same test we did in test_never_delete_required(),
+    # except this time let's add dynamic tracking
     #
-    assert cli.get_element_state(project, 'related.bst') == 'cached'
+    res = cli.run(project=project, args=['build', '--track-all', 'target.bst'])
+    res.assert_main_error(ErrorDomain.STREAM, None)
+    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
 
+    # Expect the same result that we did in test_never_delete_required()
+    #
+    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+    assert cli.get_element_state(project, 'dep3.bst') != 'cached'
     assert cli.get_element_state(project, 'target.bst') != 'cached'
-    assert cli.get_element_state(project, 'target2.bst') != 'cached'
 
 
 # Ensure that only valid cache quotas make it through the loading
diff --git a/tests/testutils/__init__.py b/tests/testutils/__init__.py
index c2fae1cc4..4a79c3be2 100644
--- a/tests/testutils/__init__.py
+++ b/tests/testutils/__init__.py
@@ -26,6 +26,6 @@
 from .runcli import cli, cli_integration
 from .repo import create_repo, ALL_REPO_KINDS
 from .artifactshare import create_artifact_share
-from .element_generators import create_element_size
+from .element_generators import create_element_size, update_element_size
 from .junction import generate_junction
 from .runner_integration import wait_for_cache_granularity
diff --git a/tests/testutils/element_generators.py b/tests/testutils/element_generators.py
index 49f235c61..448c8571a 100644
--- a/tests/testutils/element_generators.py
+++ b/tests/testutils/element_generators.py
@@ -1,41 +1,100 @@
 import os
 
 from buildstream import _yaml
+from buildstream import utils
+
+from . import create_repo
 
 
 # create_element_size()
 #
-# This will open a "<name>_data" file for writing and write
-# <size> MB of urandom (/dev/urandom) "stuff" into the file.
-# A bst import element file is then created: <name>.bst
+# Creates an import element with a git repo, using random
+# data to create a file in that repo of the specified size,
+# such that building it will add an artifact of the specified
+# size to the artifact cache.
 #
 # Args:
-#  name: (str) of the element name (e.g. target.bst)
-#  path: (str) pathway to the project/elements directory
-#  dependencies: A list of strings (can also be an empty list)
-#  size: (int) size of the element in bytes
+#    name: (str) of the element name (e.g. target.bst)
+#    project_dir (str): The path to the project
+#    element_path (str): The element path within the project
+#    dependencies: A list of strings (can also be an empty list)
+#    size: (int) size of the element in bytes
 #
 # Returns:
-#  Nothing (creates a .bst file of specified size)
+#    (Repo): A git repo which can be used to introduce trackable changes
+#            by using the update_element_size() function below.
 #
 def create_element_size(name, project_dir, elements_path, dependencies, size):
     full_elements_path = os.path.join(project_dir, elements_path)
     os.makedirs(full_elements_path, exist_ok=True)
 
-    # Create a file to be included in this element's artifact
-    with open(os.path.join(project_dir, name + '_data'), 'wb+') as f:
-        f.write(os.urandom(size))
+    # Create a git repo
+    repodir = os.path.join(project_dir, 'repos')
+    repo = create_repo('git', repodir, subdir=name)
+
+    with utils._tempdir(dir=project_dir) as tmp:
+
+        # We use a data/ subdir in the git repo we create,
+        # and we set the import element to only extract that
+        # part; this ensures we never include a .git/ directory
+        # in the cached artifacts for these sized elements.
+        #
+        datadir = os.path.join(tmp, 'data')
+        os.makedirs(datadir)
+
+        # Use /dev/urandom to create the sized file in the datadir
+        with open(os.path.join(datadir, name), 'wb+') as f:
+            f.write(os.urandom(size))
+
+        # Create the git repo from the temp directory
+        ref = repo.create(tmp)
 
-    # Simplest case: We want this file (of specified size) to just
-    # be an import element.
     element = {
         'kind': 'import',
         'sources': [
-            {
-                'kind': 'local',
-                'path': name + '_data'
-            }
+            repo.source_config(ref=ref)
         ],
+        'config': {
+            # Extract only the data directory
+            'source': 'data'
+        },
         'depends': dependencies
     }
     _yaml.dump(element, os.path.join(project_dir, elements_path, name))
+
+    # Return the repo, so that it can later be used to add commits
+    return repo
+
+
+# update_element_size()
+#
+# Updates a repo returned by create_element_size() such that
+# the newly added commit is completely changed, and has the newly
+# specified size.
+#
+# The name and project_dir arguments must match the arguments
+# previously given to create_element_size()
+#
+# Args:
+#    name: (str) of the element name (e.g. target.bst)
+#    project_dir (str): The path to the project
+#    repo: (Repo) The Repo returned by create_element_size()
+#    size: (int) The new size which the element generates, in bytes
+#
+# Returns:
+#    (Repo): A git repo which can be used to introduce trackable changes
+#            by using the update_element_size() function below.
+#
+def update_element_size(name, project_dir, repo, size):
+
+    with utils._tempdir(dir=project_dir) as tmp:
+
+        new_file = os.path.join(tmp, name)
+
+        # Use /dev/urandom to create the sized file in the datadir
+        with open(new_file, 'wb+') as f:
+            f.write(os.urandom(size))
+
+        # Modify the git repo with a new commit to the same path,
+        # replacing the original file with a new one.
+        repo.modify_file(new_file, os.path.join('data', name))
diff --git a/tests/testutils/repo/git.py b/tests/testutils/repo/git.py
index 8c2a8c177..c598eb4d0 100644
--- a/tests/testutils/repo/git.py
+++ b/tests/testutils/repo/git.py
@@ -52,6 +52,13 @@ class Git(Repo):
         self._run_git('commit', '-m', 'Added {}'.format(os.path.basename(filename)))
         return self.latest_commit()
 
+    def modify_file(self, new_file, path):
+        shutil.copy(new_file, os.path.join(self.repo, path))
+        subprocess.call([
+            'git', 'commit', path, '-m', 'Modified {}'.format(os.path.basename(path))
+        ], env=GIT_ENV, cwd=self.repo)
+        return self.latest_commit()
+
     def add_submodule(self, subdir, url=None, checkout=None):
         submodule = {}
         if checkout is not None:
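
For orientation, here is how the new test helpers are meant to compose, distilled from test_never_delete_required_track() above. This is a sketch rather than a test from the merge: the function name is illustrative, while `cli`, `create_element_size()` and `update_element_size()` come from `tests.testutils`, and the sizes are arbitrary.

```python
# Sketch: create_element_size() now returns the backing git Repo, so a
# later update_element_size() can land a new commit which a tracking
# build picks up as a new ref (and thus a new cache key).
from tests.testutils import create_element_size, update_element_size

def example_track_rebuild(cli, project, element_path):
    repo = create_element_size('dep.bst', project, element_path, [], 2000000)
    res = cli.run(project=project, args=['build', 'dep.bst'])
    res.assert_success()

    # Change the element's content and size with a new commit
    update_element_size('dep.bst', project, repo, 8000000)

    # A tracking build follows the new ref and rebuilds the element;
    # with this merge, the rebuilt artifact is protected from expiry
    # even though its cache key was unknown when the session started.
    res = cli.run(project=project, args=['build', '--track-all', 'dep.bst'])
    res.assert_success()
```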