summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJonathan Maw <jonathan.maw@codethink.co.uk>2018-10-15 12:43:42 +0100
committerJonathan Maw <jonathan.maw@codethink.co.uk>2018-10-15 15:54:59 +0000
commitd15ed8fcafca3bfc7109adfc99a0bc4da6322b93 (patch)
tree1af0ffa1a46c69db7a7c10f19286cf3ade141a49
parent4e3ec89e95955eb3605b7e50b8a35183bbd42be6 (diff)
downloadbuildstream-d15ed8fcafca3bfc7109adfc99a0bc4da6322b93.tar.gz
yaml: Add a cache of parsed and provenanced yaml
Note that the ProvenanceFile names will be incorrect after loading from the cache; this is acceptable for now because the names are currently only used for writeback, which isn't used in junctions.
-rw-r--r--buildstream/_loader/loader.py31
-rw-r--r--buildstream/_yaml.py16
-rw-r--r--buildstream/_yamlcache.py348
3 files changed, 382 insertions, 13 deletions
diff --git a/buildstream/_loader/loader.py b/buildstream/_loader/loader.py
index 1b27d9d55..1bdbca90f 100644
--- a/buildstream/_loader/loader.py
+++ b/buildstream/_loader/loader.py
@@ -29,6 +29,7 @@ from .. import _yaml
from ..element import Element
from .._profile import Topics, profile_start, profile_end
from .._includes import Includes
+from .._yamlcache import YamlCache
from .types import Symbol, Dependency
from .loadelement import LoadElement
@@ -108,13 +109,19 @@ class Loader():
#
deps = []
- for target in targets:
- profile_start(Topics.LOAD_PROJECT, target)
- junction, name, loader = self._parse_name(target, rewritable, ticker,
- fetch_subprojects=fetch_subprojects)
- loader._load_file(name, rewritable, ticker, fetch_subprojects)
- deps.append(Dependency(name, junction=junction))
- profile_end(Topics.LOAD_PROJECT, target)
+ # XXX This will need to be changed to the context's top-level project if this method
+ # is ever used for subprojects
+ top_dir = self.project.directory
+
+ cache_file = YamlCache.get_cache_file(top_dir)
+ with YamlCache.open(self._context, cache_file) as yaml_cache:
+ for target in targets:
+ profile_start(Topics.LOAD_PROJECT, target)
+ junction, name, loader = self._parse_name(target, rewritable, ticker,
+ fetch_subprojects=fetch_subprojects)
+ loader._load_file(name, rewritable, ticker, fetch_subprojects, yaml_cache)
+ deps.append(Dependency(name, junction=junction))
+ profile_end(Topics.LOAD_PROJECT, target)
#
# Now that we've resolve the dependencies, scan them for circular dependencies
@@ -201,11 +208,12 @@ class Loader():
# rewritable (bool): Whether we should load in round trippable mode
# ticker (callable): A callback to report loaded filenames to the frontend
# fetch_subprojects (bool): Whether to fetch subprojects while loading
+ # yaml_cache (YamlCache): A yaml cache
#
# Returns:
# (LoadElement): A loaded LoadElement
#
- def _load_file(self, filename, rewritable, ticker, fetch_subprojects):
+ def _load_file(self, filename, rewritable, ticker, fetch_subprojects, yaml_cache=None):
# Silently ignore already loaded files
if filename in self._elements:
@@ -218,7 +226,8 @@ class Loader():
# Load the data and process any conditional statements therein
fullpath = os.path.join(self._basedir, filename)
try:
- node = _yaml.load(fullpath, shortname=filename, copy_tree=rewritable, project=self.project)
+ node = _yaml.load(fullpath, shortname=filename, copy_tree=rewritable,
+ project=self.project, yaml_cache=yaml_cache)
except LoadError as e:
if e.reason == LoadErrorReason.MISSING_FILE:
# If we can't find the file, try to suggest plausible
@@ -261,13 +270,13 @@ class Loader():
# Load all dependency files for the new LoadElement
for dep in element.deps:
if dep.junction:
- self._load_file(dep.junction, rewritable, ticker, fetch_subprojects)
+ self._load_file(dep.junction, rewritable, ticker, fetch_subprojects, yaml_cache)
loader = self._get_loader(dep.junction, rewritable=rewritable, ticker=ticker,
fetch_subprojects=fetch_subprojects)
else:
loader = self
- dep_element = loader._load_file(dep.name, rewritable, ticker, fetch_subprojects)
+ dep_element = loader._load_file(dep.name, rewritable, ticker, fetch_subprojects, yaml_cache)
if _yaml.node_get(dep_element.node, str, Symbol.KIND) == 'junction':
raise LoadError(LoadErrorReason.INVALID_DATA,
diff --git a/buildstream/_yaml.py b/buildstream/_yaml.py
index 4ee12a18c..e24d482f0 100644
--- a/buildstream/_yaml.py
+++ b/buildstream/_yaml.py
@@ -183,20 +183,32 @@ class CompositeTypeError(CompositeError):
# shortname (str): The filename in shorthand for error reporting (or None)
# copy_tree (bool): Whether to make a copy, preserving the original toplevels
# for later serialization
+# yaml_cache (YamlCache): A yaml cache to consult rather than parsing
#
# Returns (dict): A loaded copy of the YAML file with provenance information
#
# Raises: LoadError
#
-def load(filename, shortname=None, copy_tree=False, *, project=None):
+def load(filename, shortname=None, copy_tree=False, *, project=None, yaml_cache=None):
if not shortname:
shortname = filename
file = ProvenanceFile(filename, shortname, project)
try:
+ data = None
with open(filename) as f:
- return load_data(f, file, copy_tree=copy_tree)
+ contents = f.read()
+ if yaml_cache:
+ data, key = yaml_cache.get(project, filename, contents, copy_tree)
+
+        if not data:
+            data = load_data(contents, file, copy_tree=copy_tree)
+            if yaml_cache:
+                yaml_cache.put_from_key(project, filename, key, data)
+
+        return data
except FileNotFoundError as e:
raise LoadError(LoadErrorReason.MISSING_FILE,
"Could not find file at {}".format(filename)) from e
diff --git a/buildstream/_yamlcache.py b/buildstream/_yamlcache.py
new file mode 100644
index 000000000..39b24cccc
--- /dev/null
+++ b/buildstream/_yamlcache.py
@@ -0,0 +1,348 @@
+#
+# Copyright 2018 Bloomberg Finance LP
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Authors:
+# Jonathan Maw <jonathan.maw@codethink.co.uk>
+
+import os
+import pickle
+import hashlib
+import io
+
+import sys
+
+from contextlib import contextmanager
+from collections import namedtuple
+
+from ._cachekey import generate_key
+from ._context import Context
+from . import utils, _yaml
+
+
+YAML_CACHE_FILENAME = "yaml_cache.pickle"
+
+
+# YamlCache()
+#
+# A cache that wraps around the loading of yaml in projects.
+#
+# The recommended way to use a YamlCache is:
+# with YamlCache.open(context) as yamlcache:
+# # Load all the yaml
+# ...
+#
+# Args:
+# context (Context): The invocation Context
+#
+class YamlCache():
+
+ def __init__(self, context):
+ self._project_caches = {}
+ self._context = context
+
+ ##################
+ # Public Methods #
+ ##################
+
+ # is_cached():
+ #
+ # Checks whether a file is cached.
+ #
+ # Args:
+ # project (Project): The project this file is in.
+ # filepath (str): The path to the file, *relative to the project's directory*.
+ #
+ # Returns:
+ # (bool): Whether the file is cached.
+ def is_cached(self, project, filepath):
+ cache_path = self._get_filepath(project, filepath)
+ project_name = project.name if project else ""
+ try:
+ project_cache = self._project_caches[project_name]
+ if cache_path in project_cache.elements:
+ return True
+ except KeyError:
+ pass
+ return False
+
+ # open():
+ #
+ # Return an instance of the YamlCache which writes to disk when it leaves scope.
+ #
+ # Args:
+ # context (Context): The context.
+ # cachefile (str): The path to the cache file.
+ #
+ # Returns:
+ # (YamlCache): A YamlCache.
+ @staticmethod
+ @contextmanager
+ def open(context, cachefile):
+ # Try to load from disk first
+ cache = None
+ if os.path.exists(cachefile):
+ try:
+ with open(cachefile, "rb") as f:
+ cache = BstUnpickler(f, context).load()
+ except EOFError:
+ # The file was empty
+ pass
+ except pickle.UnpicklingError as e:
+ sys.stderr.write("Failed to load YamlCache, {}\n".format(e))
+
+ # Failed to load from disk, create a new one
+ if not cache:
+ cache = YamlCache(context)
+
+ yield cache
+
+ cache._write(cachefile)
+
+ # get_cache_file():
+ #
+ # Retrieves a path to the yaml cache file.
+ #
+ # Returns:
+ # (str): The path to the cache file
+ @staticmethod
+ def get_cache_file(top_dir):
+ return os.path.join(top_dir, ".bst", YAML_CACHE_FILENAME)
+
+ # get():
+ #
+ # Gets a parsed file from the cache.
+ #
+ # Args:
+ # project (Project) or None: The project this file is in, if it exists.
+ # filepath (str): The absolute path to the file.
+ # contents (str): The contents of the file to be cached
+ # copy_tree (bool): Whether the data should make a copy when it's being generated
+ # (i.e. exactly as when called in yaml)
+ #
+ # Returns:
+ # (decorated dict): The parsed yaml from the cache, or None if the file isn't in the cache.
+ # (str): The key used to look up the parsed yaml in the cache
+ def get(self, project, filepath, contents, copy_tree):
+ key = self._calculate_key(contents, copy_tree)
+ data = self._get(project, filepath, key)
+ return data, key
+
+ # put():
+ #
+ # Puts a parsed file into the cache.
+ #
+ # Args:
+ # project (Project): The project this file is in.
+ # filepath (str): The path to the file.
+ # contents (str): The contents of the file that has been cached
+ # copy_tree (bool): Whether the data should make a copy when it's being generated
+ # (i.e. exactly as when called in yaml)
+ # value (decorated dict): The data to put into the cache.
+ def put(self, project, filepath, contents, copy_tree, value):
+ key = self._calculate_key(contents, copy_tree)
+ self.put_from_key(project, filepath, key, value)
+
+ # put_from_key():
+ #
+ # Put a parsed file into the cache when given a key.
+ #
+ # Args:
+ # project (Project): The project this file is in.
+ # filepath (str): The path to the file.
+ # key (str): The key to the file within the cache. Typically, this is the
+ # value of `calculate_key()` with the file's unparsed contents
+ # and any relevant metadata passed in.
+ # value (decorated dict): The data to put into the cache.
+ def put_from_key(self, project, filepath, key, value):
+ cache_path = self._get_filepath(project, filepath)
+ project_name = project.name if project else ""
+ try:
+ project_cache = self._project_caches[project_name]
+ except KeyError:
+ project_cache = self._project_caches[project_name] = CachedProject({})
+
+ project_cache.elements[cache_path] = CachedYaml(key, value)
+
+ ###################
+ # Private Methods #
+ ###################
+
+ # Writes the yaml cache to the specified path.
+ #
+ # Args:
+ # path (str): The path to the cache file.
+ def _write(self, path):
+ parent_dir = os.path.dirname(path)
+ os.makedirs(parent_dir, exist_ok=True)
+ with open(path, "wb") as f:
+ BstPickler(f).dump(self)
+
+ # _get_filepath():
+ #
+ # Returns a file path relative to a project if passed, or the original path if
+ # the project is None
+ #
+ # Args:
+ # project (Project) or None: The project the filepath exists within
+ # full_path (str): The path that the returned path is based on
+ #
+ # Returns:
+ # (str): The path to the file, relative to a project if it exists
+ def _get_filepath(self, project, full_path):
+ if project:
+ assert full_path.startswith(project.directory)
+ filepath = os.path.relpath(full_path, project.directory)
+ else:
+ filepath = full_path
+        return filepath
+
+ # _calculate_key():
+ #
+ # Calculates a key for putting into the cache.
+ #
+ # Args:
+ # (basic object)... : Any number of strictly-ordered basic objects
+ #
+ # Returns:
+ # (str): A key made out of every arg passed in
+ @staticmethod
+ def _calculate_key(*args):
+ string = pickle.dumps(args)
+ return hashlib.sha1(string).hexdigest()
+
+ # _get():
+ #
+ # Gets a parsed file from the cache when given a key.
+ #
+ # Args:
+ # project (Project): The project this file is in.
+ # filepath (str): The path to the file.
+ # key (str): The key to the file within the cache. Typically, this is the
+ # value of `calculate_key()` with the file's unparsed contents
+ # and any relevant metadata passed in.
+ #
+ # Returns:
+ # (decorated dict): The parsed yaml from the cache, or None if the file isn't in the cache.
+ def _get(self, project, filepath, key):
+ cache_path = self._get_filepath(project, filepath)
+ project_name = project.name if project else ""
+ try:
+ project_cache = self._project_caches[project_name]
+ try:
+ cachedyaml = project_cache.elements[cache_path]
+ if cachedyaml._key == key:
+ # We've unpickled the YamlCache, but not the specific file
+ if cachedyaml._contents is None:
+ cachedyaml._contents = BstUnpickler.loads(cachedyaml._pickled_contents, self._context)
+ return cachedyaml._contents
+ except KeyError:
+ pass
+ except KeyError:
+ pass
+ return None
+
+
+CachedProject = namedtuple('CachedProject', ['elements'])
+
+
+class CachedYaml():
+ def __init__(self, key, contents):
+ self._key = key
+ self.set_contents(contents)
+
+ # Sets the contents of the CachedYaml.
+ #
+ # Args:
+ # contents (provenanced dict): The contents to put in the cache.
+ #
+ def set_contents(self, contents):
+ self._contents = contents
+ self._pickled_contents = BstPickler.dumps(contents)
+
+ # Pickling helper method, prevents 'contents' from being serialised
+ def __getstate__(self):
+ data = self.__dict__.copy()
+ data['_contents'] = None
+ return data
+
+
+# In _yaml.load, we have a ProvenanceFile that stores the project the file
+# came from. Projects can't be pickled, but it's always going to be the same
+# project between invocations (unless the entire project is moved but the
+# file stayed in the same place)
+class BstPickler(pickle.Pickler):
+ def persistent_id(self, obj):
+ if isinstance(obj, _yaml.ProvenanceFile):
+ if obj.project:
+ # ProvenanceFile's project object cannot be stored as it is.
+ project_tag = obj.project.name
+ # ProvenanceFile's filename must be stored relative to the
+ # project, as the project dir may move.
+ name = os.path.relpath(obj.name, obj.project.directory)
+ else:
+ project_tag = None
+ name = obj.name
+ return ("ProvenanceFile", name, obj.shortname, project_tag)
+ elif isinstance(obj, Context):
+ return ("Context",)
+ else:
+ return None
+
+ @staticmethod
+ def dumps(obj):
+ stream = io.BytesIO()
+ BstPickler(stream).dump(obj)
+ stream.seek(0)
+ return stream.read()
+
+
+class BstUnpickler(pickle.Unpickler):
+ def __init__(self, file, context):
+ super().__init__(file)
+ self._context = context
+
+ def persistent_load(self, pid):
+ if pid[0] == "ProvenanceFile":
+ _, tagged_name, shortname, project_tag = pid
+
+            if project_tag is not None:
+                project = None
+                for p in self._context.get_projects():
+                    if project_tag == p.name:
+                        project = p
+                        break
+
+                if project is None:
+                    projects = [p.name for p in self._context.get_projects()]
+                    raise pickle.UnpicklingError("No project with name {} found in {}"
+                                                 .format(project_tag, projects))
+
+                name = os.path.join(project.directory, tagged_name)
+            else:
+                project = None
+                name = tagged_name
+
+ return _yaml.ProvenanceFile(name, shortname, project)
+ elif pid[0] == "Context":
+ return self._context
+ else:
+ raise pickle.UnpicklingError("Unsupported persistent object, {}".format(pid))
+
+ @staticmethod
+ def loads(text, context):
+ stream = io.BytesIO()
+ stream.write(bytes(text))
+ stream.seek(0)
+ return BstUnpickler(stream, context).load()