author    Tristan Van Berkom <tristan.van.berkom@gmail.com>  2018-09-14 12:44:45 +0000
committer Tristan Van Berkom <tristan.van.berkom@gmail.com>  2018-09-14 12:44:45 +0000
commit    ef846ced8135249a20237c9ec1a2eae5460f8282 (patch)
tree      e52c5dbfb947ddb5e4be1e5225b63e4189d9ca41
parent    32d0ce664093a3b5256b7af449a047cc4ffe0a87 (diff)
parent    673f2372b38b017ed581b5100a9feffc29d66c41 (diff)
download  buildstream-ef846ced8135249a20237c9ec1a2eae5460f8282.tar.gz
Merge branch 'tristan/fix-required-artifacts-1.2' into 'bst-1.2'

Don't delete required artifacts when tracking is enabled

See merge request BuildStream/buildstream!794
-rw-r--r--  buildstream/_artifactcache/artifactcache.py |  57
-rw-r--r--  buildstream/_stream.py                       |   9
-rw-r--r--  buildstream/element.py                       |  19
-rw-r--r--  tests/artifactcache/expiry.py                | 120
-rw-r--r--  tests/testutils/__init__.py                  |   2
-rw-r--r--  tests/testutils/element_generators.py        |  93
-rw-r--r--  tests/testutils/repo/git.py                  |   7
7 files changed, 218 insertions, 89 deletions
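
At the heart of this merge is a change of bookkeeping in the artifact cache: instead of pinning a set of cache *keys* captured when the pipeline starts, the cache now pins the *elements* themselves and resolves their keys only when a cleanup runs, so keys which get re-resolved by tracking are still protected. A minimal illustrative sketch of that idea follows; ToyElement and ToyCache are hypothetical stand-ins, not BuildStream classes.

# Illustrative sketch only, not part of the patch: required *elements* are
# remembered up front, while their cache keys are resolved at cleanup time.
class ToyElement:
    def __init__(self, strong_key=None, weak_key=None):
        self.strong_key = strong_key
        self.weak_key = weak_key

class ToyCache:
    def __init__(self):
        self._required_elements = set()
        self._artifacts = {}  # cache key -> artifact size

    def mark_required_elements(self, elements):
        # Keys may still be unresolved (e.g. before tracking completes),
        # so remember the elements themselves.
        self._required_elements.update(elements)

    def clean(self):
        # Resolve the protected keys only now, when they are known.
        required_keys = {
            key
            for element in self._required_elements
            for key in (element.strong_key, element.weak_key)
            if key is not None
        }
        for key in list(self._artifacts):
            if key not in required_keys:
                del self._artifacts[key]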
diff --git a/buildstream/_artifactcache/artifactcache.py b/buildstream/_artifactcache/artifactcache.py
index ead44c4ff..aa11d92a9 100644
--- a/buildstream/_artifactcache/artifactcache.py
+++ b/buildstream/_artifactcache/artifactcache.py
@@ -87,7 +87,7 @@ class ArtifactCache():
self.global_remote_specs = []
self.project_remote_specs = {}
- self._required_artifacts = set() # The artifacts required for this session
+ self._required_elements = set() # The elements required for this session
self._cache_size = None # The current cache size, sometimes it's an estimate
self._cache_quota = None # The cache quota
self._cache_lower_threshold = None # The target cache size for a cleanup
@@ -189,33 +189,40 @@ class ArtifactCache():
(str(provenance)))
return cache_specs
- # append_required_artifacts():
+ # mark_required_elements():
#
- # Append to the list of elements whose artifacts are required for
- # the current run. Artifacts whose elements are in this list will
- # be locked by the artifact cache and not touched for the duration
- # of the current pipeline.
+ # Mark elements whose artifacts are required for the current run.
+ #
+ # Artifacts whose elements are in this list will be locked by the artifact
+ # cache and not touched for the duration of the current pipeline.
#
# Args:
# elements (iterable): A set of elements to mark as required
#
- def append_required_artifacts(self, elements):
- # We lock both strong and weak keys - deleting one but not the
- # other won't save space in most cases anyway, but would be a
- # user inconvenience.
+ def mark_required_elements(self, elements):
+
+ # We risk calling this function with a generator, so we'd
+ # better consume it into a list first.
+ #
+ elements = list(elements)
+
+ # Mark the elements as required. We do not necessarily know the
+ # cache keys yet, so we only check them later, when deleting.
+ #
+ self._required_elements.update(elements)
+ # For the cache keys which have been resolved so far, we bump
+ # their atime.
+ #
+ # This is done in case concurrent instances of BuildStream are
+ # running against the same artifact cache: it reduces the
+ # likelihood of one instance deleting artifacts which are
+ # required by the other.
for element in elements:
strong_key = element._get_cache_key(strength=_KeyStrength.STRONG)
weak_key = element._get_cache_key(strength=_KeyStrength.WEAK)
-
for key in (strong_key, weak_key):
- if key and key not in self._required_artifacts:
- self._required_artifacts.add(key)
-
- # We also update the usage times of any artifacts
- # we will be using, which helps preventing a
- # buildstream process that runs in parallel with
- # this one from removing artifacts in-use.
+ if key:
try:
self.update_atime(key)
except ArtifactError:
@@ -231,6 +238,18 @@ class ArtifactCache():
def clean(self):
artifacts = self.list_artifacts()
+ # Build a set of the cache keys which are required
+ # based on the required elements at cleanup time
+ #
+ # We lock both strong and weak keys - deleting one but not the
+ # other won't save space, but would be a user inconvenience.
+ required_artifacts = set()
+ for element in self._required_elements:
+ required_artifacts.update([
+ element._get_cache_key(strength=_KeyStrength.STRONG),
+ element._get_cache_key(strength=_KeyStrength.WEAK)
+ ])
+
# Do a real computation of the cache size once, just in case
self.compute_cache_size()
@@ -256,7 +275,7 @@ class ArtifactCache():
break
key = to_remove.rpartition('/')[2]
- if key not in self._required_artifacts:
+ if key not in required_artifacts:
# Remove the actual artifact, if it's not required.
size = self.remove(to_remove)
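
The update_atime() calls above are what protect a session's artifacts from a concurrent BuildStream instance sharing the same cache: the backend is expected to touch the stored artifact so that an LRU-ordered cleanup in the other process treats it as recently used. The cache backend's ref layout is not shown in this diff, so the sketch below assumes a purely hypothetical refdir/<key> layout.

import os
import time

# Hypothetical sketch of an atime bump; 'refdir/<key>' is an assumed
# layout, not BuildStream's actual artifact storage.
def update_atime_sketch(refdir, key):
    ref_path = os.path.join(refdir, key)
    st = os.stat(ref_path)
    # Refresh the access time and keep the modification time, so that a
    # least-recently-used cleanup ranks this artifact as recently used.
    os.utime(ref_path, times=(time.time(), st.st_mtime))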
diff --git a/buildstream/_stream.py b/buildstream/_stream.py
index 252bed26e..5d862de91 100644
--- a/buildstream/_stream.py
+++ b/buildstream/_stream.py
@@ -937,13 +937,10 @@ class Stream():
# Set the "required" artifacts that should not be removed
# while this pipeline is active
#
- # FIXME: The set of required artifacts is only really needed
- # for build and pull tasks.
+ # It must include all the artifacts which are required by the
+ # final product. Note that this is a superset of the build plan.
#
- # It must include all the artifacts which are required by the
- # final product. Note that this is a superset of the build plan.
- #
- self._artifacts.append_required_artifacts((e for e in self._pipeline.dependencies(elements, Scope.ALL)))
+ self._artifacts.mark_required_elements(self._pipeline.dependencies(elements, Scope.ALL))
if selection == PipelineSelection.PLAN and dynamic_plan:
# We use a dynamic build plan, only request artifacts of top-level targets,
diff --git a/buildstream/element.py b/buildstream/element.py
index e6b330492..4f7fc0564 100644
--- a/buildstream/element.py
+++ b/buildstream/element.py
@@ -1434,15 +1434,20 @@ class Element(Plugin):
workspace.clear_running_files()
self._get_context().get_workspaces().save_config()
- # We also need to update the required artifacts, since
- # workspaced dependencies do not have a fixed cache key
- # when the build starts.
+ # This element will have already been marked as
+ # required, but we bump the atime again, in case
+ # we did not know the cache key until now.
#
- # This does *not* cause a race condition, because
- # _assemble_done is called before a cleanup job may be
- # launched.
+ # FIXME: This is not exactly correct; we should be
+ # doing this at the time when a new cache key is
+ # discovered, but this just happens to be the last
+ # place where that can happen.
#
- self.__artifacts.append_required_artifacts([self])
+ # Ultimately, we should be refactoring
+ # Element._update_state() such that we know
+ # when a cache key is actually discovered.
+ #
+ self.__artifacts.mark_required_elements([self])
# _assemble():
#
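
The FIXME above suggests that the re-marking really belongs at the moment a new cache key is discovered, rather than in _assemble_done(). A rough sketch of that eventual shape, assuming a hypothetical key-resolution hook which does not exist in this codebase:

# Hypothetical refactoring sketch, not part of this merge: re-mark the
# element as soon as its cache key is resolved, so the atime bump happens
# exactly when a new key is discovered.
class ElementSketch:
    def __init__(self, artifacts):
        self._artifacts = artifacts
        self._cache_key = None

    def _resolve_cache_key(self, key):
        newly_discovered = self._cache_key is None and key is not None
        self._cache_key = key
        if newly_discovered:
            # Bump the atime of the newly keyed artifact right away,
            # instead of waiting for _assemble_done().
            self._artifacts.mark_required_elements([self])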
diff --git a/tests/artifactcache/expiry.py b/tests/artifactcache/expiry.py
index 9ea889815..f8b928cbf 100644
--- a/tests/artifactcache/expiry.py
+++ b/tests/artifactcache/expiry.py
@@ -5,7 +5,7 @@ import pytest
from buildstream import _yaml
from buildstream._exceptions import ErrorDomain, LoadErrorReason
-from tests.testutils import cli, create_element_size
+from tests.testutils import cli, create_element_size, update_element_size
DATA_DIR = os.path.join(
@@ -74,6 +74,7 @@ def test_artifact_too_large(cli, datafiles, tmpdir, size):
create_element_size('target.bst', project, element_path, [], size)
res = cli.run(project=project, args=['build', 'target.bst'])
res.assert_main_error(ErrorDomain.STREAM, None)
+ res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
@pytest.mark.datafiles(DATA_DIR)
@@ -175,24 +176,8 @@ def test_keep_dependencies(cli, datafiles, tmpdir):
# Assert that we never delete a dependency required for a build tree
-#
-# NOTE: This test expects that a build will fail if it attempts to
-# put more artifacts in the cache than the quota can hold,
-# and expects that the last two elements which don't fit into
-# the quota wont even be built.
-#
-# In real life, this will not be the case, since once we reach
-# the estimated quota we launch a cache size calculation job and
-# only launch a cleanup job when the size is calculated; and
-# other build tasks will be scheduled while the cache size job
-# is running.
-#
-# This test only passes because we configure `builders` to 1,
-# ensuring that the cache size job runs exclusively since it
-# also requires a compute resource (a "builder").
-#
@pytest.mark.datafiles(DATA_DIR)
-def test_never_delete_dependencies(cli, datafiles, tmpdir):
+def test_never_delete_required(cli, datafiles, tmpdir):
project = os.path.join(datafiles.dirname, datafiles.basename)
element_path = 'elements'
@@ -205,37 +190,94 @@ def test_never_delete_dependencies(cli, datafiles, tmpdir):
}
})
- # Create a build tree
- create_element_size('dependency.bst', project,
- element_path, [], 8000000)
- create_element_size('related.bst', project,
- element_path, ['dependency.bst'], 8000000)
- create_element_size('target.bst', project,
- element_path, ['related.bst'], 8000000)
- create_element_size('target2.bst', project,
- element_path, ['target.bst'], 8000000)
+ # Create a linear build tree
+ create_element_size('dep1.bst', project, element_path, [], 8000000)
+ create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 8000000)
+ create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 8000000)
+ create_element_size('target.bst', project, element_path, ['dep3.bst'], 8000000)
# We try to build this pipeline, but it's too big for the
# cache. Since all elements are required, the build should fail.
- res = cli.run(project=project, args=['build', 'target2.bst'])
+ res = cli.run(project=project, args=['build', 'target.bst'])
res.assert_main_error(ErrorDomain.STREAM, None)
+ res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
- assert cli.get_element_state(project, 'dependency.bst') == 'cached'
+ # Only the first artifact fits in the cache, but we expect
+ # that the first *two* artifacts will be cached.
+ #
+ # This is because after caching the first artifact we must
+ # proceed to build the next artifact, and we cannot really
+ # know how large an artifact will be until we try to cache it.
+ #
+ # In this case, we deem it more acceptable to not delete an
+ # artifact which caused the cache to outgrow the quota.
+ #
+ # Note that this test only works because we have forced
+ # the configuration to build one element at a time; in real
+ # life there may be as many cached artifacts exceeding the
+ # quota as there are builders.
+ #
+ assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+ assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+
+ assert cli.get_element_state(project, 'dep3.bst') != 'cached'
+ assert cli.get_element_state(project, 'target.bst') != 'cached'
+
+
+# Assert that we never delete a dependency required for a build tree,
+# even when the artifact cache was previously populated with
+# artifacts we do not require, and the new build is run with dynamic tracking.
+#
+@pytest.mark.datafiles(DATA_DIR)
+def test_never_delete_required_track(cli, datafiles, tmpdir):
+ project = os.path.join(datafiles.dirname, datafiles.basename)
+ element_path = 'elements'
+
+ cli.configure({
+ 'cache': {
+ 'quota': 10000000
+ },
+ 'scheduler': {
+ 'builders': 1
+ }
+ })
- # This is *technically* above the cache limit. BuildStream accepts
- # some fuzziness, since it's hard to assert that we don't create
- # an artifact larger than the cache quota. We would have to remove
- # the artifact after-the-fact, but since it is required for the
- # current build and nothing broke yet, it's nicer to keep it
- # around.
+ # Create a linear build tree
+ repo_dep1 = create_element_size('dep1.bst', project, element_path, [], 2000000)
+ repo_dep2 = create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 2000000)
+ repo_dep3 = create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 2000000)
+ repo_target = create_element_size('target.bst', project, element_path, ['dep3.bst'], 2000000)
+
+ # This should all fit into the artifact cache
+ res = cli.run(project=project, args=['build', 'target.bst'])
+ res.assert_success()
+
+ # They should all be cached
+ assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+ assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+ assert cli.get_element_state(project, 'dep3.bst') == 'cached'
+ assert cli.get_element_state(project, 'target.bst') == 'cached'
+
+ # Now increase the size of all the elements
#
- # This scenario is quite unlikely, and the cache overflow will be
- # resolved if the user does something about it anyway.
+ update_element_size('dep1.bst', project, repo_dep1, 8000000)
+ update_element_size('dep2.bst', project, repo_dep2, 8000000)
+ update_element_size('dep3.bst', project, repo_dep3, 8000000)
+ update_element_size('target.bst', project, repo_target, 8000000)
+
+ # Now repeat the same test we did in test_never_delete_required(),
+ # except this time let's add dynamic tracking
#
- assert cli.get_element_state(project, 'related.bst') == 'cached'
+ res = cli.run(project=project, args=['build', '--track-all', 'target.bst'])
+ res.assert_main_error(ErrorDomain.STREAM, None)
+ res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')
+ # Expect the same result that we did in test_never_delete_required()
+ #
+ assert cli.get_element_state(project, 'dep1.bst') == 'cached'
+ assert cli.get_element_state(project, 'dep2.bst') == 'cached'
+ assert cli.get_element_state(project, 'dep3.bst') != 'cached'
assert cli.get_element_state(project, 'target.bst') != 'cached'
- assert cli.get_element_state(project, 'target2.bst') != 'cached'
# Ensure that only valid cache quotas make it through the loading
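
The expectations asserted in the two tests above follow from simple arithmetic with a single builder: the quota is 10000000 bytes, each artifact is 8000000 bytes, and an artifact's size is only known after it has been committed to the cache. A toy model of that reasoning, illustrative only and not how the scheduler is actually implemented:

# Toy model of the expiry expectations in test_never_delete_required():
# with one builder, an element only fails once the cache is already over
# quota and every cached artifact is required (so nothing can be expired).
quota = 10000000
artifact_size = 8000000

cached_size = 0
states = {}
for element in ['dep1.bst', 'dep2.bst', 'dep3.bst', 'target.bst']:
    if cached_size > quota:
        states[element] = 'failed'  # the build hits 'cache-too-full'
    else:
        # The size is only discovered after caching, which is why
        # dep2.bst still ends up cached even though 8M + 8M > 10M.
        cached_size += artifact_size
        states[element] = 'cached'

assert states == {'dep1.bst': 'cached', 'dep2.bst': 'cached',
                  'dep3.bst': 'failed', 'target.bst': 'failed'}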
diff --git a/tests/testutils/__init__.py b/tests/testutils/__init__.py
index e9db94989..b7f6ecca2 100644
--- a/tests/testutils/__init__.py
+++ b/tests/testutils/__init__.py
@@ -1,5 +1,5 @@
from .runcli import cli, cli_integration
from .repo import create_repo, ALL_REPO_KINDS
from .artifactshare import create_artifact_share
-from .element_generators import create_element_size
+from .element_generators import create_element_size, update_element_size
from .junction import generate_junction
diff --git a/tests/testutils/element_generators.py b/tests/testutils/element_generators.py
index 49f235c61..448c8571a 100644
--- a/tests/testutils/element_generators.py
+++ b/tests/testutils/element_generators.py
@@ -1,41 +1,100 @@
import os
from buildstream import _yaml
+from buildstream import utils
+
+from . import create_repo
# create_element_size()
#
-# This will open a "<name>_data" file for writing and write
-# <size> MB of urandom (/dev/urandom) "stuff" into the file.
-# A bst import element file is then created: <name>.bst
+# Creates an import element with a git repo, using random
+# data to create a file in that repo of the specified size,
+# such that building it will add an artifact of the specified
+# size to the artifact cache.
#
# Args:
-# name: (str) of the element name (e.g. target.bst)
-# path: (str) pathway to the project/elements directory
-# dependencies: A list of strings (can also be an empty list)
-# size: (int) size of the element in bytes
+# name (str): The element name (e.g. target.bst)
+# project_dir (str): The path to the project
+# elements_path (str): The element path within the project
+# dependencies (list): A list of element names (can be empty)
+# size (int): The size of the element, in bytes
#
# Returns:
-# Nothing (creates a .bst file of specified size)
+# (Repo): A git repo which can be used to introduce trackable changes
+# by using the update_element_size() function below.
#
def create_element_size(name, project_dir, elements_path, dependencies, size):
full_elements_path = os.path.join(project_dir, elements_path)
os.makedirs(full_elements_path, exist_ok=True)
- # Create a file to be included in this element's artifact
- with open(os.path.join(project_dir, name + '_data'), 'wb+') as f:
- f.write(os.urandom(size))
+ # Create a git repo
+ repodir = os.path.join(project_dir, 'repos')
+ repo = create_repo('git', repodir, subdir=name)
+
+ with utils._tempdir(dir=project_dir) as tmp:
+
+ # We use a data/ subdir in the git repo we create,
+ # and we set the import element to only extract that
+ # part; this ensures we never include a .git/ directory
+ # in the cached artifacts for these sized elements.
+ #
+ datadir = os.path.join(tmp, 'data')
+ os.makedirs(datadir)
+
+ # Use /dev/urandom to create the sized file in the datadir
+ with open(os.path.join(datadir, name), 'wb+') as f:
+ f.write(os.urandom(size))
+
+ # Create the git repo from the temp directory
+ ref = repo.create(tmp)
- # Simplest case: We want this file (of specified size) to just
- # be an import element.
element = {
'kind': 'import',
'sources': [
- {
- 'kind': 'local',
- 'path': name + '_data'
- }
+ repo.source_config(ref=ref)
],
+ 'config': {
+ # Extract only the data directory
+ 'source': 'data'
+ },
'depends': dependencies
}
_yaml.dump(element, os.path.join(project_dir, elements_path, name))
+
+ # Return the repo, so that it can later be used to add commits
+ return repo
+
+
+# update_element_size()
+#
+# Updates a repo returned by create_element_size() by adding a new
+# commit which completely replaces the element's content with data
+# of the newly specified size.
+#
+# The name and project_dir arguments must match the arguments
+# previously given to create_element_size()
+#
+# Args:
+# name (str): The element name (e.g. target.bst)
+# project_dir (str): The path to the project
+# repo (Repo): The Repo returned by create_element_size()
+# size (int): The new size which the element generates, in bytes
+#
+# Returns:
+# Nothing; the repo passed in gains a new commit which replaces the
+# element's data file with one of the new size.
+#
+def update_element_size(name, project_dir, repo, size):
+
+ with utils._tempdir(dir=project_dir) as tmp:
+
+ new_file = os.path.join(tmp, name)
+
+ # Use /dev/urandom to create the sized file in the temporary directory
+ with open(new_file, 'wb+') as f:
+ f.write(os.urandom(size))
+
+ # Modify the git repo with a new commit to the same path,
+ # replacing the original file with a new one.
+ repo.modify_file(new_file, os.path.join('data', name))
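
Taken together, the two helpers are used as in test_never_delete_required_track() above; a condensed usage sketch, assuming a valid BuildStream test project at project with elements under element_path:

from tests.testutils import create_element_size, update_element_size

# Usage sketch for the helpers above; project and element_path are
# assumed to point at a test project as in the expiry tests.
def make_growing_element(project, element_path):
    # Create a 2MB import element backed by a throwaway git repo ...
    repo = create_element_size('dep1.bst', project, element_path, [], 2000000)
    # ... and later rewrite that repo so a tracked rebuild resolves a new
    # ref and produces an 8MB artifact.
    update_element_size('dep1.bst', project, repo, 8000000)
    return repo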
diff --git a/tests/testutils/repo/git.py b/tests/testutils/repo/git.py
index eea43d608..75624d597 100644
--- a/tests/testutils/repo/git.py
+++ b/tests/testutils/repo/git.py
@@ -46,6 +46,13 @@ class Git(Repo):
], env=GIT_ENV, cwd=self.repo)
return self.latest_commit()
+ def modify_file(self, new_file, path):
+ shutil.copy(new_file, os.path.join(self.repo, path))
+ subprocess.call([
+ 'git', 'commit', path, '-m', 'Modified {}'.format(os.path.basename(path))
+ ], env=GIT_ENV, cwd=self.repo)
+ return self.latest_commit()
+
def add_submodule(self, subdir, url=None, checkout=None):
submodule = {}
if checkout is not None:
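
For completeness, a short usage sketch of the new Repo.modify_file() helper; the repo object and file paths here are assumed, as in the element generators above:

import os

# Usage sketch: replace the data file of an element inside a test repo
# created by create_element_size(), and capture the resulting commit.
def replace_element_data(repo, new_file, name):
    new_commit = repo.modify_file(new_file, os.path.join('data', name))
    return new_commit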