From 7c9004b0d2836f8c2349f13f9e3d4bdc1eb6b8ed Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 09:19:17 +0100
Subject: _variables.py: Optimize storing and usage of the variables resolution

This removes the precomputation of the size, which makes it slightly
faster.
---
 src/buildstream/_variables.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/buildstream/_variables.py b/src/buildstream/_variables.py
index 74314cf1f..dc20a60c3 100644
--- a/src/buildstream/_variables.py
+++ b/src/buildstream/_variables.py
@@ -34,9 +34,8 @@ PARSE_EXPANSION = re.compile(r"\%\{([a-zA-Z][a-zA-Z0-9_-]*)\}")
 # These hold data structures called "expansion strings" and are the parsed
 # form of the strings which are the input to this subsystem.  Strings
 # such as "Hello %{name}, how are you?" are parsed into the form:
-# (3, ["Hello ", "name", ", how are you?"])
-# i.e. a tuple of an integer and a list, where the integer is the cached
-# length of the list, and the list consists of one or more strings.
+# ["Hello ", "name", ", how are you?"]
+# i.e. a list which consists of one or more strings.
 # Strings in even indices of the list (0, 2, 4, etc) are constants which
 # are copied into the output of the expansion algorithm.  Strings in the
 # odd indices (1, 3, 5, etc) are the names of further expansions to make.
@@ -93,7 +92,7 @@ class Variables():
             unmatched = []
 
             # Look for any unmatched variable names in the expansion string
-            for var in expstr[1][1::2]:
+            for var in expstr[1::2]:
                 if var not in self._expstr_map:
                     unmatched.append(var)
 
@@ -130,7 +129,7 @@ class Variables():
         # First the check for anything unresolvable
         summary = []
         for key, expstr in self._expstr_map.items():
-            for var in expstr[1][1::2]:
+            for var in expstr[1::2]:
                 if var not in self._expstr_map:
                     line = "  unresolved variable '{unmatched}' in declaration of '{variable}' at: {provenance}"
                     provenance = _yaml.node_get_provenance(self.original, key)
@@ -142,7 +141,7 @@ class Variables():
     def _check_for_cycles(self):
         # And now the cycle checks
         def cycle_check(expstr, visited, cleared):
-            for var in expstr[1][1::2]:
+            for var in expstr[1::2]:
                 if var in cleared:
                     continue
                 if var in visited:
@@ -173,10 +172,10 @@ class Variables():
         flat = {}
         try:
             for key, expstr in self._expstr_map.items():
-                if expstr[0] > 1:
-                    expstr = (1, [sys.intern(_expand_expstr(self._expstr_map, expstr))])
+                if len(expstr) > 1:
+                    expstr = [sys.intern(_expand_expstr(self._expstr_map, expstr))]
                     self._expstr_map[key] = expstr
-                flat[key] = expstr[1][0]
+                flat[key] = expstr[0]
         except KeyError:
             self._check_for_missing()
             raise
@@ -193,7 +192,7 @@ class Variables():
 PARSE_CACHE = {
     # Prime the cache with the empty string since otherwise that can
     # cause issues with the parser, complications to which cause slowdown
-    "": (1, [""]),
+    "": [""],
 }
 
 
@@ -216,7 +215,7 @@ def _parse_expstr(instr):
         # memory impact of the cache.  It seems odd to cache the list length
         # but this is measurably cheaper than calculating it each time during
         # string expansion.
-        PARSE_CACHE[instr] = (len(splits), [sys.intern(s) for s in splits])
+        PARSE_CACHE[instr] = [sys.intern(s) for s in splits]
         return PARSE_CACHE[instr]
 
 
@@ -226,26 +225,27 @@ def _parse_expstr(instr):
 # Note: Will raise KeyError if any expansion is missing
 def _expand_expstr(content, topvalue):
     # Short-circuit constant strings
-    if topvalue[0] == 1:
-        return topvalue[1][0]
+    if len(topvalue) == 1:
+        return topvalue[0]
 
     # Short-circuit strings which are entirely an expansion of another variable
     # e.g. "%{another}"
-    if topvalue[0] == 2 and topvalue[1][0] == "":
-        return _expand_expstr(content, content[topvalue[1][1]])
+    if len(topvalue) == 2 and topvalue[0] == "":
+        return _expand_expstr(content, content[topvalue[1]])
 
     # Otherwise process fully...
     def internal_expand(value):
-        (expansion_len, expansion_bits) = value
         idx = 0
-        while idx < expansion_len:
+        value_len = len(value)
+
+        while idx < value_len:
             # First yield any constant string content
-            yield expansion_bits[idx]
+            yield value[idx]
             idx += 1
             # Now, if there is an expansion variable left to expand, yield
             # the expansion of that variable too
-            if idx < expansion_len:
-                yield from internal_expand(content[expansion_bits[idx]])
+            if idx < value_len:
+                yield from internal_expand(content[value[idx]])
             idx += 1
 
     return "".join(internal_expand(topvalue))
-- 
cgit v1.2.1


From d220c4c3bcf31b9d4660a6e915e70269c891bd9f Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Sat, 18 May 2019 14:01:08 +0100
Subject: Introduce pyproject.toml

Using pyproject.toml, defined in PEP518, allows us to have
an isolated build environment, ensuring that Cython can be installed
before calling setup.py in tox. This allows us to use cython helpers
in the setup.py script.

This is a prerequisite for introducing Cython in the codebase
---
 .gitignore     | 3 +++
 MANIFEST.in    | 3 +++
 pyproject.toml | 8 ++++++++
 setup.py       | 6 ++++--
 tox.ini        | 6 ++++++
 5 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index 3bc4e29af..340d7ebe4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@ build
 # Setuptools distribution folder.
 /dist/
 
+# Pip build metadata
+pip-wheel-metadata/
+
 # Python egg metadata, regenerated from source files by setuptools.
 /src/*.egg-info
 .eggs
diff --git a/MANIFEST.in b/MANIFEST.in
index d0de0e593..7be35c0be 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -43,3 +43,6 @@ include requirements/plugin-requirements.txt
 
 # Versioneer
 include versioneer.py
+
+# setuptools.build_meta don't include setup.py by default. Add it
+include setup.py
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..38bb870e3
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,8 @@
+[build-system]
+requires = [
+    # We need at least version 36.6.0 that introduced "build_meta"
+    "setuptools>=36.6.0",
+    # In order to build wheels, and as required by PEP 517
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index 267ca6fb4..0caf7bae9 100755
--- a/setup.py
+++ b/setup.py
@@ -19,13 +19,15 @@
 #        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
 
 import os
+from pathlib import Path
 import re
 import shutil
 import subprocess
 import sys
-import versioneer
 
-from pathlib import Path
+# Add local directory to the path, in order to be able to import versioneer
+sys.path.append(os.path.dirname(__file__))
+import versioneer  # noqa
 
 
 ##################################################################
diff --git a/tox.ini b/tox.ini
index a7a4874c7..0a269c17d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,6 +4,7 @@
 [tox]
 envlist = py{35,36,37}
 skip_missing_interpreters = true
+isolated_build = true
 
 #
 # Defaults for all environments
@@ -147,3 +148,8 @@ deps =
     -rrequirements/requirements.txt
     -rrequirements/dev-requirements.txt
     -rrequirements/plugin-requirements.txt
+
+# When building using PEP518 and 517, we don't want default dependencies
+# installed by the base environment.
+[testenv:.package]
+deps =
-- 
cgit v1.2.1


From 4e9b5803e7241cc87c14126d320dc744ac4798cf Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Sun, 26 May 2019 09:37:26 +0100
Subject: Introduce Cython to the project and document

Cython requires a plugin to allow coverage of cython files, which
was updated in coveragerc. It also means we need to build the
dependencies and install cython for coverage.

Cython requires access to both source and compiled files when
running coverage. We therefore need to install project in develop
mode.

Updated documentation to explain how to run tests without tox but
with coverage
---
 .coveragerc                       |   3 ++
 .gitignore                        |   7 +++
 CONTRIBUTING.rst                  |  41 +++++++++++++++
 MANIFEST.in                       |   5 ++
 pyproject.toml                    |   3 +-
 requirements/cov-requirements.in  |   1 +
 requirements/cov-requirements.txt |   1 +
 setup.py                          | 102 +++++++++++++++++++++++++++++++++++++-
 tox.ini                           |  14 +++++-
 9 files changed, 174 insertions(+), 3 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 457c439a6..5b06d817a 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,6 @@
 [run]
 concurrency = multiprocessing
+plugins = Cython.Coverage
 
 omit =
   # Omit some internals
@@ -11,6 +12,8 @@ omit =
   */.eggs/*
   # Omit .tox directory
   */.tox/*
+  # Omit a dynamically generated Cython file
+  */stringsource
 
 [report]
 show_missing = True
diff --git a/.gitignore b/.gitignore
index 340d7ebe4..a61a975ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,13 @@
 src/buildstream/**/*.pyc
 tests/**/*.pyc
 
+# Generated C files
+src/buildstream/**/*.c
+src/buildstream/**/*.so
+
+# Cython report files when using annotations
+src/buildstream/**/*.html
+
 # Build output directory
 build
 
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 921cee909..b128c7f9b 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -1598,6 +1598,15 @@ can run ``tox`` with ``-r`` or  ``--recreate`` option.
 
      ./setup.py test --addopts 'tests/frontend/buildtrack.py::test_build_track'
 
+   If you want to run coverage, you will need need to add ``BST_CYTHON_TRACE=1``
+   to your environment if you also want coverage on cython files. You could then
+   get coverage by running::
+
+     BST_CYTHON_TRACE=1 coverage run ./setup.py test
+
+   Note that to be able to run ``./setup.py test``, you will need to have ``Cython``
+   installed.
+
 .. tip::
 
    We also have an environment called 'venv' which takes any arguments
@@ -1737,6 +1746,12 @@ You can install it with `pip install snakeviz`. Here is an example invocation:
 
 It will then start a webserver and launch a browser to the relevant page.
 
+.. note::
+
+    If you want to also profile cython files, you will need to add
+    BST_CYTHON_PROFILE=1 and recompile the cython files.
+    ``pip install`` would take care of that.
+
 Profiling specific parts of BuildStream with BST_PROFILE
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 BuildStream can also turn on cProfile for specific parts of execution
@@ -1753,6 +1768,32 @@ call most of `initialized`, for each element. These profile files
 are in the same cProfile format as those mentioned in the previous
 section, and can be analysed in the same way.
 
+Fixing performance issues
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+BuildStream uses `Cython <https://cython.org/>`_ in order to speed up specific parts of the
+code base.
+
+.. note::
+
+    When optimizing for performance, please ensure that you optimize the algorithms before
+    jumping into Cython code. Cython will make the code harder to maintain and less accessible
+    to all developers.
+
+When adding a new cython file to the codebase, you will need to register it in ``setup.py``.
+
+For example, for a module ``buildstream._my_module``, to be written in ``src/buildstream/_my_module.pyx``, you would do::
+
+   register_cython_module("buildstream._my_module")
+
+In ``setup.py`` and the build tool will automatically use your module.
+
+.. note::
+
+   Please register cython modules at the same place as the others.
+
+When adding a definition class to share cython symbols between modules, please document the implementation file
+and only expose the required definitions.
 
 Managing data files
 -------------------
diff --git a/MANIFEST.in b/MANIFEST.in
index 7be35c0be..07369c481 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,6 +6,11 @@ include MAINTAINERS
 include NEWS
 include README.rst
 
+# Cython files
+recursive-include src/buildstream *.pyx
+recursive-include src/buildstream *.pxd
+recursive-include src/buildstream *.c
+
 # Data files required by BuildStream's generic source tests
 recursive-include src/buildstream/testing/_sourcetests/project *
 
diff --git a/pyproject.toml b/pyproject.toml
index 38bb870e3..4dd02d1e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,6 +3,7 @@ requires = [
     # We need at least version 36.6.0 that introduced "build_meta"
     "setuptools>=36.6.0",
     # In order to build wheels, and as required by PEP 517
-    "wheel"
+    "wheel",
+    "Cython"
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements/cov-requirements.in b/requirements/cov-requirements.in
index 455b91ba6..fb582f816 100644
--- a/requirements/cov-requirements.in
+++ b/requirements/cov-requirements.in
@@ -1,2 +1,3 @@
 coverage == 4.4.0
 pytest-cov >= 2.5.0
+Cython
diff --git a/requirements/cov-requirements.txt b/requirements/cov-requirements.txt
index 843df85f3..a8ba7843b 100644
--- a/requirements/cov-requirements.txt
+++ b/requirements/cov-requirements.txt
@@ -1,4 +1,5 @@
 coverage==4.4
+cython==0.29.7
 pytest-cov==2.6.1
 ## The following requirements were added by pip freeze:
 atomicwrites==1.3.0
diff --git a/setup.py b/setup.py
index 0caf7bae9..fe977c123 100755
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@
 #
 #  Authors:
 #        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
+#        Benjamin Schubert <bschubert15@bloomberg.net>
 
 import os
 from pathlib import Path
@@ -41,7 +42,7 @@ if sys.version_info[0] != REQUIRED_PYTHON_MAJOR or sys.version_info[1] < REQUIRE
     sys.exit(1)
 
 try:
-    from setuptools import setup, find_packages, Command
+    from setuptools import setup, find_packages, Command, Extension
     from setuptools.command.easy_install import ScriptWriter
     from setuptools.command.test import test as TestCommand
 except ImportError:
@@ -304,6 +305,93 @@ with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
     long_description = readme.read()
 
 
+#####################################################
+#            Setup Cython and extensions            #
+#####################################################
+# We want to ensure that source distributions always
+# include the .c files, in order to allow users to
+# not need cython when building.
+def assert_cython_required():
+    if "sdist" not in sys.argv:
+        return
+
+    print("Cython is required when building 'sdist' in order to "
+          "ensure source distributions can be built without Cython. "
+          "Please install it using your package manager (usually 'python3-cython') "
+          "or pip (pip install cython).",
+          file=sys.stderr)
+
+    raise SystemExit(1)
+
+
+extension_macros = [
+    ("CYTHON_TRACE", os.environ.get("BST_CYTHON_TRACE", 0))
+]
+
+
+def cythonize(extensions, **kwargs):
+    try:
+        from Cython.Build import cythonize as _cythonize
+    except ImportError:
+        assert_cython_required()
+
+        print("Cython not found. Using preprocessed c files instead")
+
+        missing_c_sources = []
+
+        for extension in extensions:
+            for source in extension.sources:
+                if source.endswith(".pyx"):
+                    c_file = source.replace(".pyx", ".c")
+
+                    if not os.path.exists(c_file):
+                        missing_c_sources.append((extension, c_file))
+
+        if missing_c_sources:
+            for extension, source in missing_c_sources:
+                print("Missing '{}' for building extension '{}'".format(source, extension.name))
+
+            raise SystemExit(1)
+        return extensions
+
+    return _cythonize(extensions, **kwargs)
+
+
+def register_cython_module(module_name, dependencies=None):
+    def files_from_module(modname):
+        basename = "src/{}".format(modname.replace(".", "/"))
+        return "{}.pyx".format(basename), "{}.pxd".format(basename)
+
+    if dependencies is None:
+        dependencies = []
+
+    implementation_file, definition_file = files_from_module(module_name)
+
+    assert os.path.exists(implementation_file)
+
+    depends = []
+    if os.path.exists(definition_file):
+        depends.append(definition_file)
+
+    for module in dependencies:
+        imp_file, def_file = files_from_module(module)
+        assert os.path.exists(imp_file), "Dependency file not found: {}".format(imp_file)
+        assert os.path.exists(def_file), "Dependency declaration file not found: {}".format(def_file)
+
+        depends.append(imp_file)
+        depends.append(def_file)
+
+    BUILD_EXTENSIONS.append(Extension(
+        name=module_name,
+        sources=[implementation_file],
+        depends=depends,
+        define_macros=extension_macros,
+    ))
+
+
+BUILD_EXTENSIONS = []
+
+
 #####################################################
 #             Main setup() Invocation               #
 #####################################################
@@ -362,4 +450,16 @@ setup(name='BuildStream',
       install_requires=install_requires,
       entry_points=bst_install_entry_points,
       tests_require=dev_requires,
+      ext_modules=cythonize(
+          BUILD_EXTENSIONS,
+          compiler_directives={
+              # Version of python to use
+              # https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#arguments
+              "language_level": "3",
+              # Enable line tracing, this is needed in order to generate coverage.
+              # This is not enabled unless the CYTHON_TRACE macro for distutils is defined.
+              "linetrace": True,
+              "profile": os.environ.get("BST_CYTHON_PROFILE", False),
+          }
+      ),
       zip_safe=False)
diff --git a/tox.ini b/tox.ini
index 0a269c17d..94e96d9b0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -12,6 +12,10 @@ isolated_build = true
 # Anything specified here is inherited by the sections
 #
 [testenv]
+usedevelop =
+    # This is required by Cython in order to get coverage for cython files.
+    py{35,36,37}-!nocover: True
+
 commands =
     # Running with coverage reporting enabled
     py{35,36,37}-!external-!nocover: pytest --basetemp {envtmpdir} --cov=buildstream --cov-config .coveragerc {posargs}
@@ -52,6 +56,8 @@ setenv =
     py{35,36,37}: XDG_CACHE_HOME = {envtmpdir}/cache
     py{35,36,37}: XDG_CONFIG_HOME = {envtmpdir}/config
     py{35,36,37}: XDG_DATA_HOME = {envtmpdir}/share
+    # This is required to get coverage for Cython
+    py{35,36,37}-!nocover: BST_CYTHON_TRACE = 1
 
 whitelist_externals =
     py{35,36,37}:
@@ -62,7 +68,9 @@ whitelist_externals =
 # Coverage reporting
 #
 [testenv:coverage]
-skip_install = true
+# This is required by Cython in order to get coverage for cython files.
+usedevelop = True
+
 commands =
     coverage combine --rcfile={toxinidir}/.coveragerc {toxinidir}/.coverage-reports/
     coverage html --rcfile={toxinidir}/.coveragerc --directory={toxinidir}/.coverage-reports/
@@ -76,6 +84,10 @@ setenv =
 # Running linters
 #
 [testenv:lint]
+commands_pre =
+    # Build C extensions to allow Pylint to analyse them
+    {envpython} setup.py build_ext --inplace
+
 commands =
     pycodestyle
     pylint src/buildstream tests
-- 
cgit v1.2.1


From 41293b1badeb12689ea72cd42c2d68df492c4274 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Sun, 26 May 2019 09:43:20 +0100
Subject: _variables: Cythonize _internal_expand

Move _variables.py to be a Cython module.

`_internal_expand` is a function that is called a lot in BuildStream.
It is also entirely isolated and easy to cythonize.
---
 .pylintrc                      |   2 +-
 setup.py                       |   1 +
 src/buildstream/_variables.py  | 251 --------------------------------------
 src/buildstream/_variables.pyx | 270 +++++++++++++++++++++++++++++++++++++++++
 src/buildstream/element.py     |   3 +-
 5 files changed, 274 insertions(+), 253 deletions(-)
 delete mode 100644 src/buildstream/_variables.py
 create mode 100644 src/buildstream/_variables.pyx

diff --git a/.pylintrc b/.pylintrc
index c47ef92cf..0dc625a46 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -3,7 +3,7 @@
 # A comma-separated list of package or module names from where C extensions may
 # be loaded. Extensions are loading into the active Python interpreter and may
 # run arbitrary code
-extension-pkg-whitelist=
+extension-pkg-whitelist=buildstream._variables
 
 # Add files or directories to the blacklist. They should be base names, not
 # paths.
diff --git a/setup.py b/setup.py
index fe977c123..90aa74d31 100755
--- a/setup.py
+++ b/setup.py
@@ -391,6 +391,7 @@ def register_cython_module(module_name, dependencies=None):
 
 BUILD_EXTENSIONS = []
 
+register_cython_module("buildstream._variables")
 
 #####################################################
 #             Main setup() Invocation               #
diff --git a/src/buildstream/_variables.py b/src/buildstream/_variables.py
deleted file mode 100644
index dc20a60c3..000000000
--- a/src/buildstream/_variables.py
+++ /dev/null
@@ -1,251 +0,0 @@
-#
-#  Copyright (C) 2016 Codethink Limited
-#  Copyright (C) 2019 Bloomberg L.P.
-#
-#  This program is free software; you can redistribute it and/or
-#  modify it under the terms of the GNU Lesser General Public
-#  License as published by the Free Software Foundation; either
-#  version 2 of the License, or (at your option) any later version.
-#
-#  This library is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
-#  Lesser General Public License for more details.
-#
-#  You should have received a copy of the GNU Lesser General Public
-#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
-#
-#  Authors:
-#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
-#        Daniel Silverstone <daniel.silverstone@codethink.co.uk>
-
-import re
-import sys
-
-from ._exceptions import LoadError, LoadErrorReason
-from . import _yaml
-
-# Variables are allowed to have dashes here
-#
-PARSE_EXPANSION = re.compile(r"\%\{([a-zA-Z][a-zA-Z0-9_-]*)\}")
-
-
-# Throughout this code you will see variables named things like `expstr`.
-# These hold data structures called "expansion strings" and are the parsed
-# form of the strings which are the input to this subsystem.  Strings
-# such as "Hello %{name}, how are you?" are parsed into the form:
-# ["Hello ", "name", ", how are you?"]
-# i.e. a list which consists of one or more strings.
-# Strings in even indices of the list (0, 2, 4, etc) are constants which
-# are copied into the output of the expansion algorithm.  Strings in the
-# odd indices (1, 3, 5, etc) are the names of further expansions to make.
-# In the example above, first "Hello " is copied, then "name" is expanded
-# and so must be another named expansion string passed in to the constructor
-# of the Variables class, and whatever is yielded from the expansion of "name"
-# is added to the concatenation for the result.  Finally ", how are you?" is
-# copied in and the whole lot concatenated for return.
-#
-# To see how strings are parsed, see `_parse_expstr()` after the class, and
-# to see how expansion strings are expanded, see `_expand_expstr()` after that.
-
-
-# The Variables helper object will resolve the variable references in
-# the given dictionary, expecting that any dictionary values which contain
-# variable references can be resolved from the same dictionary.
-#
-# Each Element creates its own Variables instance to track the configured
-# variable settings for the element.
-#
-# Args:
-#     node (dict): A node loaded and composited with yaml tools
-#
-# Raises:
-#     LoadError, if unresolved variables, or cycles in resolution, occur.
-#
-class Variables():
-
-    def __init__(self, node):
-
-        self.original = node
-        self._expstr_map = self._resolve(node)
-        self.flat = self._flatten()
-
-    # subst():
-    #
-    # Substitutes any variables in 'string' and returns the result.
-    #
-    # Args:
-    #    (string): The string to substitute
-    #
-    # Returns:
-    #    (string): The new string with any substitutions made
-    #
-    # Raises:
-    #    LoadError, if the string contains unresolved variable references.
-    #
-    def subst(self, string):
-        expstr = _parse_expstr(string)
-
-        try:
-            return _expand_expstr(self._expstr_map, expstr)
-        except KeyError:
-            unmatched = []
-
-            # Look for any unmatched variable names in the expansion string
-            for var in expstr[1::2]:
-                if var not in self._expstr_map:
-                    unmatched.append(var)
-
-            if unmatched:
-                message = "Unresolved variable{}: {}".format(
-                    "s" if len(unmatched) > 1 else "",
-                    ", ".join(unmatched)
-                )
-
-                raise LoadError(LoadErrorReason.UNRESOLVED_VARIABLE, message)
-            # Otherwise, re-raise the KeyError since it clearly came from some
-            # other unknowable cause.
-            raise
-
-    # Variable resolving code
-    #
-    # Here we resolve all of our inputs into a dictionary, ready for use
-    # in subst()
-    def _resolve(self, node):
-        # Special case, if notparallel is specified in the variables for this
-        # element, then override max-jobs to be 1.
-        # Initialize it as a string as all variables are processed as strings.
-        #
-        if _yaml.node_get(node, bool, 'notparallel', default_value=False):
-            _yaml.node_set(node, 'max-jobs', str(1))
-
-        ret = {}
-        for key, value in _yaml.node_items(node):
-            value = _yaml.node_get(node, str, key)
-            ret[sys.intern(key)] = _parse_expstr(value)
-        return ret
-
-    def _check_for_missing(self):
-        # First the check for anything unresolvable
-        summary = []
-        for key, expstr in self._expstr_map.items():
-            for var in expstr[1::2]:
-                if var not in self._expstr_map:
-                    line = "  unresolved variable '{unmatched}' in declaration of '{variable}' at: {provenance}"
-                    provenance = _yaml.node_get_provenance(self.original, key)
-                    summary.append(line.format(unmatched=var, variable=key, provenance=provenance))
-        if summary:
-            raise LoadError(LoadErrorReason.UNRESOLVED_VARIABLE,
-                            "Failed to resolve one or more variable:\n{}\n".format("\n".join(summary)))
-
-    def _check_for_cycles(self):
-        # And now the cycle checks
-        def cycle_check(expstr, visited, cleared):
-            for var in expstr[1::2]:
-                if var in cleared:
-                    continue
-                if var in visited:
-                    raise LoadError(LoadErrorReason.RECURSIVE_VARIABLE,
-                                    "{}: ".format(_yaml.node_get_provenance(self.original, var)) +
-                                    ("Variable '{}' expands to contain a reference to itself. " +
-                                     "Perhaps '{}' contains '%{{{}}}").format(var, visited[-1], var))
-                visited.append(var)
-                cycle_check(self._expstr_map[var], visited, cleared)
-                visited.pop()
-                cleared.add(var)
-
-        cleared = set()
-        for key, expstr in self._expstr_map.items():
-            if key not in cleared:
-                cycle_check(expstr, [key], cleared)
-
-    # _flatten():
-    #
-    # Turn our dictionary of expansion strings into a flattened dict
-    # so that we can run expansions faster in the future
-    #
-    # Raises:
-    #    LoadError, if the string contains unresolved variable references or
-    #               if cycles are detected in the variable references
-    #
-    def _flatten(self):
-        flat = {}
-        try:
-            for key, expstr in self._expstr_map.items():
-                if len(expstr) > 1:
-                    expstr = [sys.intern(_expand_expstr(self._expstr_map, expstr))]
-                    self._expstr_map[key] = expstr
-                flat[key] = expstr[0]
-        except KeyError:
-            self._check_for_missing()
-            raise
-        except RecursionError:
-            self._check_for_cycles()
-            raise
-        return flat
-
-
-# Cache for the parsed expansion strings.  While this is nominally
-# something which might "waste" memory, in reality each of these
-# will live as long as the element which uses it, which is the
-# vast majority of the memory usage across the execution of BuildStream.
-PARSE_CACHE = {
-    # Prime the cache with the empty string since otherwise that can
-    # cause issues with the parser, complications to which cause slowdown
-    "": [""],
-}
-
-
-# Helper to parse a string into an expansion string tuple, caching
-# the results so that future parse requests don't need to think about
-# the string
-def _parse_expstr(instr):
-    try:
-        return PARSE_CACHE[instr]
-    except KeyError:
-        # This use of the regex turns a string like "foo %{bar} baz" into
-        # a list ["foo ", "bar", " baz"]
-        splits = PARSE_EXPANSION.split(instr)
-        # If an expansion ends the string, we get an empty string on the end
-        # which we can optimise away, making the expansion routines not need
-        # a test for this.
-        if splits[-1] == '':
-            splits = splits[:-1]
-        # Cache an interned copy of this.  We intern it to try and reduce the
-        # memory impact of the cache.  It seems odd to cache the list length
-        # but this is measurably cheaper than calculating it each time during
-        # string expansion.
-        PARSE_CACHE[instr] = [sys.intern(s) for s in splits]
-        return PARSE_CACHE[instr]
-
-
-# Helper to expand a given top level expansion string tuple in the context
-# of the given dictionary of expansion strings.
-#
-# Note: Will raise KeyError if any expansion is missing
-def _expand_expstr(content, topvalue):
-    # Short-circuit constant strings
-    if len(topvalue) == 1:
-        return topvalue[0]
-
-    # Short-circuit strings which are entirely an expansion of another variable
-    # e.g. "%{another}"
-    if len(topvalue) == 2 and topvalue[0] == "":
-        return _expand_expstr(content, content[topvalue[1]])
-
-    # Otherwise process fully...
-    def internal_expand(value):
-        idx = 0
-        value_len = len(value)
-
-        while idx < value_len:
-            # First yield any constant string content
-            yield value[idx]
-            idx += 1
-            # Now, if there is an expansion variable left to expand, yield
-            # the expansion of that variable too
-            if idx < value_len:
-                yield from internal_expand(content[value[idx]])
-            idx += 1
-
-    return "".join(internal_expand(topvalue))
diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
new file mode 100644
index 000000000..405171b61
--- /dev/null
+++ b/src/buildstream/_variables.pyx
@@ -0,0 +1,270 @@
+#
+#  Copyright (C) 2016 Codethink Limited
+#  Copyright (C) 2019 Bloomberg L.P.
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2 of the License, or (at your option) any later version.
+#
+#  This library is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+#  Authors:
+#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
+#        Daniel Silverstone <daniel.silverstone@codethink.co.uk>
+#        Benjamin Schubert <bschubert@bloomberg.net>
+
+import re
+import sys
+
+from ._exceptions import LoadError, LoadErrorReason
+from . import _yaml
+
+# Variables are allowed to have dashes here
+#
+PARSE_EXPANSION = re.compile(r"\%\{([a-zA-Z][a-zA-Z0-9_-]*)\}")
+
+
+# Throughout this code you will see variables named things like `expstr`.
+# These hold data structures called "expansion strings" and are the parsed
+# form of the strings which are the input to this subsystem.  Strings
+# such as "Hello %{name}, how are you?" are parsed into the form:
+# ["Hello ", "name", ", how are you?"]
+# i.e. a list which consists of one or more strings.
+# Strings in even indices of the list (0, 2, 4, etc) are constants which
+# are copied into the output of the expansion algorithm.  Strings in the
+# odd indices (1, 3, 5, etc) are the names of further expansions to make.
+# In the example above, first "Hello " is copied, then "name" is expanded
+# and so must be another named expansion string passed in to the constructor
+# of the Variables class, and whatever is yielded from the expansion of "name"
+# is added to the concatenation for the result.  Finally ", how are you?" is
+# copied in and the whole lot concatenated for return.
+#
+# To see how strings are parsed, see `_parse_expstr()` after the class, and
+# to see how expansion strings are expanded, see `_expand_expstr()` after that.
+
+
+# The Variables helper object will resolve the variable references in
+# the given dictionary, expecting that any dictionary values which contain
+# variable references can be resolved from the same dictionary.
+#
+# Each Element creates its own Variables instance to track the configured
+# variable settings for the element.
+#
+# Args:
+#     node (dict): A node loaded and composited with yaml tools
+#
+# Raises:
+#     LoadError, if unresolved variables, or cycles in resolution, occur.
+#
+class Variables():
+
+    def __init__(self, node):
+
+        self.original = node
+        self._expstr_map = self._resolve(node)
+        self.flat = self._flatten()
+
+    # subst():
+    #
+    # Substitutes any variables in 'string' and returns the result.
+    #
+    # Args:
+    #    (string): The string to substitute
+    #
+    # Returns:
+    #    (string): The new string with any substitutions made
+    #
+    # Raises:
+    #    LoadError, if the string contains unresolved variable references.
+    #
+    def subst(self, string):
+        expstr = _parse_expstr(string)
+
+        try:
+            return _expand_expstr(self._expstr_map, expstr)
+        except KeyError:
+            unmatched = []
+
+            # Look for any unmatched variable names in the expansion string
+            for var in expstr[1::2]:
+                if var not in self._expstr_map:
+                    unmatched.append(var)
+
+            if unmatched:
+                message = "Unresolved variable{}: {}".format(
+                    "s" if len(unmatched) > 1 else "",
+                    ", ".join(unmatched)
+                )
+
+                raise LoadError(LoadErrorReason.UNRESOLVED_VARIABLE, message)
+            # Otherwise, re-raise the KeyError since it clearly came from some
+            # other unknowable cause.
+            raise
+
+    # Variable resolving code
+    #
+    # Here we resolve all of our inputs into a dictionary, ready for use
+    # in subst()
+    def _resolve(self, node):
+        # Special case, if notparallel is specified in the variables for this
+        # element, then override max-jobs to be 1.
+        # Initialize it as a string as all variables are processed as strings.
+        #
+        if _yaml.node_get(node, bool, 'notparallel', default_value=False):
+            _yaml.node_set(node, 'max-jobs', str(1))
+
+        ret = {}
+        for key, value in _yaml.node_items(node):
+            value = _yaml.node_get(node, str, key)
+            ret[sys.intern(key)] = _parse_expstr(value)
+        return ret
+
+    def _check_for_missing(self):
+        # First the check for anything unresolvable
+        summary = []
+        for key, expstr in self._expstr_map.items():
+            for var in expstr[1::2]:
+                if var not in self._expstr_map:
+                    line = "  unresolved variable '{unmatched}' in declaration of '{variable}' at: {provenance}"
+                    provenance = _yaml.node_get_provenance(self.original, key)
+                    summary.append(line.format(unmatched=var, variable=key, provenance=provenance))
+        if summary:
+            raise LoadError(LoadErrorReason.UNRESOLVED_VARIABLE,
+                            "Failed to resolve one or more variable:\n{}\n".format("\n".join(summary)))
+
+    def _check_for_cycles(self):
+        # And now the cycle checks
+        def cycle_check(expstr, visited, cleared):
+            for var in expstr[1::2]:
+                if var in cleared:
+                    continue
+                if var in visited:
+                    raise LoadError(LoadErrorReason.RECURSIVE_VARIABLE,
+                                    "{}: ".format(_yaml.node_get_provenance(self.original, var)) +
+                                    ("Variable '{}' expands to contain a reference to itself. " +
+                                     "Perhaps '{}' contains '%{{{}}}").format(var, visited[-1], var))
+                visited.append(var)
+                cycle_check(self._expstr_map[var], visited, cleared)
+                visited.pop()
+                cleared.add(var)
+
+        cleared = set()
+        for key, expstr in self._expstr_map.items():
+            if key not in cleared:
+                cycle_check(expstr, [key], cleared)
+
+    # _flatten():
+    #
+    # Turn our dictionary of expansion strings into a flattened dict
+    # so that we can run expansions faster in the future
+    #
+    # Raises:
+    #    LoadError, if the string contains unresolved variable references or
+    #               if cycles are detected in the variable references
+    #
+    def _flatten(self):
+        flat = {}
+        try:
+            for key, expstr in self._expstr_map.items():
+                if len(expstr) > 1:
+                    expstr = [sys.intern(_expand_expstr(self._expstr_map, expstr))]
+                    self._expstr_map[key] = expstr
+                flat[key] = expstr[0]
+        except KeyError:
+            self._check_for_missing()
+            raise
+        except RecursionError:
+            self._check_for_cycles()
+            raise
+        return flat
+
+
+# Cache for the parsed expansion strings.  While this is nominally
+# something which might "waste" memory, in reality each of these
+# will live as long as the element which uses it, which is the
+# vast majority of the memory usage across the execution of BuildStream.
+PARSE_CACHE = {
+    # Prime the cache with the empty string since otherwise that can
+    # cause issues with the parser, complications to which cause slowdown
+    "": [""],
+}
+
+
+# Helper to parse a string into an expansion string tuple, caching
+# the results so that future parse requests don't need to think about
+# the string
+def _parse_expstr(instr):
+    try:
+        return PARSE_CACHE[instr]
+    except KeyError:
+        # This use of the regex turns a string like "foo %{bar} baz" into
+        # a list ["foo ", "bar", " baz"]
+        splits = PARSE_EXPANSION.split(instr)
+        # If an expansion ends the string, we get an empty string on the end
+        # which we can optimise away, making the expansion routines not need
+        # a test for this.
+        if splits[-1] == '':
+            splits = splits[:-1]
+        # Cache an interned copy of this.  We intern it to try and reduce the
+        # memory impact of the cache.  It seems odd to cache the list length
+        # but this is measurably cheaper than calculating it each time during
+        # string expansion.
+        PARSE_CACHE[instr] = [sys.intern(s) for s in splits]
+        return PARSE_CACHE[instr]
+
+
+# Helper to expand recursively an expansion string in the context
+# of the given dictionary of expansion string
+#
+# Args:
+#     content (dict): dictionary context for resolving the variables
+#     value (list): expansion string to expand
+#     acc (list): list in which to add the result
+#     counter (int): counter to count the number of recursion in order to
+#                    detect cycles.
+#
+# Raises:
+#     KeyError: if any expansion is missing
+#     RecursionError: if a variable is defined recursively
+#
+cdef void _expand_expstr_helper(dict content, list value, list acc, int counter = 0) except *:
+    cdef Py_ssize_t idx = 0
+    cdef Py_ssize_t value_len = len(value)
+
+    if counter > 1000:
+        raise RecursionError()
+
+    while idx < value_len:
+        acc.append(value[idx])
+        idx += 1
+
+        if idx < value_len:
+            _expand_expstr_helper(content, <list> content[value[idx]], acc, counter + 1)
+
+        idx += 1
+
+
+# Helper to expand a given top level expansion string tuple in the context
+# of the given dictionary of expansion strings.
+#
+# Note: Will raise KeyError if any expansion is missing
+def _expand_expstr(content, topvalue):
+    # Short-circuit constant strings
+    if len(topvalue) == 1:
+        return topvalue[0]
+
+    # Short-circuit strings which are entirely an expansion of another variable
+    # e.g. "%{another}"
+    if len(topvalue) == 2 and topvalue[0] == "":
+        return _expand_expstr(content, content[topvalue[1]])
+
+    cdef list result = []
+    _expand_expstr_helper(content, topvalue, result)
+    return "".join(result)
diff --git a/src/buildstream/element.py b/src/buildstream/element.py
index 08326c6f3..8e006ea6b 100644
--- a/src/buildstream/element.py
+++ b/src/buildstream/element.py
@@ -931,7 +931,8 @@ class Element(Plugin):
            (str): The resolved value for *varname*, or None if no
            variable was declared with the given name.
         """
-        return self.__variables.flat.get(varname)
+        # Flat is not recognized correctly by Pylint as being a dictionary
+        return self.__variables.flat.get(varname)  # pylint: disable=no-member
 
     def batch_prepare_assemble(self, flags, *, collect=None):
         """ Configure command batching across prepare() and assemble()
-- 
cgit v1.2.1


From 738e7e68087f34421eea7478cc2743c962dd4ae5 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 14:47:35 +0100
Subject: _variables: Cythonize _expand_expstr

---
 src/buildstream/_variables.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
index 405171b61..ccf1a0b2c 100644
--- a/src/buildstream/_variables.pyx
+++ b/src/buildstream/_variables.pyx
@@ -255,15 +255,15 @@ cdef void _expand_expstr_helper(dict content, list value, list acc, int counter
 # of the given dictionary of expansion strings.
 #
 # Note: Will raise KeyError if any expansion is missing
-def _expand_expstr(content, topvalue):
+cdef str _expand_expstr(dict content, list topvalue):
     # Short-circuit constant strings
     if len(topvalue) == 1:
-        return topvalue[0]
+        return <str> topvalue[0]
 
     # Short-circuit strings which are entirely an expansion of another variable
     # e.g. "%{another}"
-    if len(topvalue) == 2 and topvalue[0] == "":
-        return _expand_expstr(content, content[topvalue[1]])
+    if len(topvalue) == 2 and len(<str> topvalue[0]) == 0:
+        return _expand_expstr(content, <list> content[topvalue[1]])
 
     cdef list result = []
     _expand_expstr_helper(content, topvalue, result)
-- 
cgit v1.2.1


From 4ce763851a07bd7aeb9593b60248467932d4bfbc Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 15:01:44 +0100
Subject: _variables: Cythonize _parse_expstr

Also type the `PARSE_CACHE` in order to speedup access to it.
---
 src/buildstream/_variables.pyx | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
index ccf1a0b2c..466187dde 100644
--- a/src/buildstream/_variables.pyx
+++ b/src/buildstream/_variables.pyx
@@ -190,7 +190,7 @@ class Variables():
 # something which might "waste" memory, in reality each of these
 # will live as long as the element which uses it, which is the
 # vast majority of the memory usage across the execution of BuildStream.
-PARSE_CACHE = {
+cdef dict PARSE_CACHE = {
     # Prime the cache with the empty string since otherwise that can
     # cause issues with the parser, complications to which cause slowdown
     "": [""],
@@ -200,9 +200,11 @@ PARSE_CACHE = {
 # Helper to parse a string into an expansion string tuple, caching
 # the results so that future parse requests don't need to think about
 # the string
-def _parse_expstr(instr):
+cdef list _parse_expstr(str instr):
+    cdef list ret
+
     try:
-        return PARSE_CACHE[instr]
+        return <list> PARSE_CACHE[instr]
     except KeyError:
         # This use of the regex turns a string like "foo %{bar} baz" into
         # a list ["foo ", "bar", " baz"]
@@ -211,13 +213,14 @@ def _parse_expstr(instr):
         # which we can optimise away, making the expansion routines not need
         # a test for this.
         if splits[-1] == '':
-            splits = splits[:-1]
+           del splits [-1]
         # Cache an interned copy of this.  We intern it to try and reduce the
         # memory impact of the cache.  It seems odd to cache the list length
         # but this is measurably cheaper than calculating it each time during
         # string expansion.
-        PARSE_CACHE[instr] = [sys.intern(s) for s in splits]
-        return PARSE_CACHE[instr]
+        ret = [sys.intern(<str> s) for s in <list> splits]
+        PARSE_CACHE[instr] = ret
+        return ret
 
 
 # Helper to expand recursively an expansion string in the context
-- 
cgit v1.2.1


From d1fa3bb34147ba2d14abd10e9343de61425e5c75 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 15:07:44 +0100
Subject: _variables: make Variables extension class

Extension class are faster for access and take less memory than
a normal Python class. Variables is self contained and easy to
cythonize.
---
 src/buildstream/_variables.pyx | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
index 466187dde..0e3bdb29c 100644
--- a/src/buildstream/_variables.pyx
+++ b/src/buildstream/_variables.pyx
@@ -63,10 +63,13 @@ PARSE_EXPANSION = re.compile(r"\%\{([a-zA-Z][a-zA-Z0-9_-]*)\}")
 # Raises:
 #     LoadError, if unresolved variables, or cycles in resolution, occur.
 #
-class Variables():
+cdef class Variables:
 
-    def __init__(self, node):
+    cdef object original
+    cdef dict _expstr_map
+    cdef public dict flat
 
+    def __init__(self, node):
         self.original = node
         self._expstr_map = self._resolve(node)
         self.flat = self._flatten()
-- 
cgit v1.2.1


From a521c0f0f65afdd3b4e76b403cd28fe627a94914 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 15:17:47 +0100
Subject: _variables: Cythonize Variables._flatten

---
 src/buildstream/_variables.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
index 0e3bdb29c..d12938dd9 100644
--- a/src/buildstream/_variables.pyx
+++ b/src/buildstream/_variables.pyx
@@ -172,11 +172,15 @@ cdef class Variables:
     #    LoadError, if the string contains unresolved variable references or
     #               if cycles are detected in the variable references
     #
-    def _flatten(self):
-        flat = {}
+    cdef dict _flatten(self):
+        cdef dict flat = {}
+        cdef str key
+        cdef list expstr
+
         try:
             for key, expstr in self._expstr_map.items():
                 if len(expstr) > 1:
+                    # FIXME: do we really gain anything by interning?
                     expstr = [sys.intern(_expand_expstr(self._expstr_map, expstr))]
                     self._expstr_map[key] = expstr
                 flat[key] = expstr[0]
-- 
cgit v1.2.1


From f5ee7679b0b9b469dacc25f71dc46fa788367a1b Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 15:25:18 +0100
Subject: _variables: Cythonize Variables.subst

---
 src/buildstream/_variables.pyx | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
index d12938dd9..d99aaeb78 100644
--- a/src/buildstream/_variables.pyx
+++ b/src/buildstream/_variables.pyx
@@ -87,7 +87,7 @@ cdef class Variables:
     # Raises:
     #    LoadError, if the string contains unresolved variable references.
     #
-    def subst(self, string):
+    def subst(self, str string):
         expstr = _parse_expstr(string)
 
         try:
@@ -115,7 +115,8 @@ cdef class Variables:
     #
     # Here we resolve all of our inputs into a dictionary, ready for use
     # in subst()
-    def _resolve(self, node):
+    # FIXME: node should be a yaml Node if moved
+    cdef dict _resolve(self, object node):
         # Special case, if notparallel is specified in the variables for this
         # element, then override max-jobs to be 1.
         # Initialize it as a string as all variables are processed as strings.
@@ -123,9 +124,12 @@ cdef class Variables:
         if _yaml.node_get(node, bool, 'notparallel', default_value=False):
             _yaml.node_set(node, 'max-jobs', str(1))
 
-        ret = {}
-        for key, value in _yaml.node_items(node):
-            value = _yaml.node_get(node, str, key)
+        cdef dict ret = {}
+        cdef str key
+        cdef str value
+
+        for key in _yaml.node_keys(node):
+            value = <str> _yaml.node_get(node, str, key)
             ret[sys.intern(key)] = _parse_expstr(value)
         return ret
 
-- 
cgit v1.2.1


From a5a16bee862f7e552a4a8dbd75ff10db7b9c1342 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Fri, 17 May 2019 18:47:27 +0100
Subject: _yaml: Cythonize and internalize Node

Node used to be a NamedTuple that we used to access by index
for speed reasons. Moving to an extension class allows us to
access attributes by name, making the code easier to read and
less error prone. Moreover, we do gain speed and memory by
doing this move.

Also fix a few places where we would not have an entire `Node`
object but were instead just returning a tuple, missing some entries.
---
 .pylintrc                 |    2 +-
 setup.py                  |    1 +
 src/buildstream/_yaml.py  | 1432 --------------------------------------------
 src/buildstream/_yaml.pyx | 1444 +++++++++++++++++++++++++++++++++++++++++++++
 tests/internals/yaml.py   |   14 +-
 5 files changed, 1453 insertions(+), 1440 deletions(-)
 delete mode 100644 src/buildstream/_yaml.py
 create mode 100644 src/buildstream/_yaml.pyx

diff --git a/.pylintrc b/.pylintrc
index 0dc625a46..e0941862f 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -3,7 +3,7 @@
 # A comma-separated list of package or module names from where C extensions may
 # be loaded. Extensions are loading into the active Python interpreter and may
 # run arbitrary code
-extension-pkg-whitelist=buildstream._variables
+extension-pkg-whitelist=buildstream._variables,buildstream._yaml
 
 # Add files or directories to the blacklist. They should be base names, not
 # paths.
diff --git a/setup.py b/setup.py
index 90aa74d31..b848e2458 100755
--- a/setup.py
+++ b/setup.py
@@ -391,6 +391,7 @@ def register_cython_module(module_name, dependencies=None):
 
 BUILD_EXTENSIONS = []
 
+register_cython_module("buildstream._yaml")
 register_cython_module("buildstream._variables")
 
 #####################################################
diff --git a/src/buildstream/_yaml.py b/src/buildstream/_yaml.py
deleted file mode 100644
index cdab4269e..000000000
--- a/src/buildstream/_yaml.py
+++ /dev/null
@@ -1,1432 +0,0 @@
-#
-#  Copyright (C) 2018 Codethink Limited
-#  Copyright (C) 2019 Bloomberg LLP
-#
-#  This program is free software; you can redistribute it and/or
-#  modify it under the terms of the GNU Lesser General Public
-#  License as published by the Free Software Foundation; either
-#  version 2 of the License, or (at your option) any later version.
-#
-#  This library is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
-#  Lesser General Public License for more details.
-#
-#  You should have received a copy of the GNU Lesser General Public
-#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
-#
-#  Authors:
-#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
-#        Daniel Silverstone <daniel.silverstone@codethink.co.uk>
-#        James Ennis <james.ennis@codethink.co.uk>
-
-import sys
-import string
-from contextlib import ExitStack
-from collections import OrderedDict, namedtuple
-from collections.abc import Mapping, Sequence
-from copy import deepcopy
-from itertools import count
-
-from ruamel import yaml
-from ._exceptions import LoadError, LoadErrorReason
-
-
-# Without this, pylint complains about all the `type(foo) is blah` checks
-# because it feels isinstance() is more idiomatic.  Sadly, it is much slower to
-# do `isinstance(foo, blah)` for reasons I am unable to fathom.  As such, we
-# blanket disable the check for this module.
-#
-# pylint: disable=unidiomatic-typecheck
-
-
-# Node()
-#
-# Container for YAML loaded data and its provenance
-#
-# All nodes returned (and all internal lists/strings) have this type (rather
-# than a plain tuple, to distinguish them in things like node_sanitize)
-#
-# Members:
-#   value (str/list/dict): The loaded value.
-#   file_index (int): Index within _FILE_LIST (a list of loaded file paths).
-#                     Negative indices indicate synthetic nodes so that
-#                     they can be referenced.
-#   line (int): The line number within the file where the value appears.
-#   col (int): The column number within the file where the value appears.
-#
-# For efficiency, each field should be accessed by its integer index:
-#   value = Node[0]
-#   file_index = Node[1]
-#   line = Node[2]
-#   column = Node[3]
-#
-class Node(namedtuple('Node', ['value', 'file_index', 'line', 'column'])):
-    def __contains__(self, what):
-        # Delegate to the inner value, though this will likely not work
-        # very well if the node is a list or string, it's unlikely that
-        # code which has access to such nodes would do this.
-        return what in self[0]
-
-
-# File name handling
-_FILE_LIST = []
-
-
-# Purely synthetic node will have None for the file number, have line number
-# zero, and a negative column number which comes from inverting the next value
-# out of this counter.  Synthetic nodes created with a reference node will
-# have a file number from the reference node, some unknown line number, and
-# a negative column number from this counter.
-_SYNTHETIC_COUNTER = count(start=-1, step=-1)
-
-
-# Returned from node_get_provenance
-class ProvenanceInformation:
-
-    __slots__ = (
-        "filename",
-        "shortname",
-        "displayname",
-        "line",
-        "col",
-        "toplevel",
-        "node",
-        "project",
-        "is_synthetic",
-    )
-
-    def __init__(self, nodeish):
-        self.node = nodeish
-        if (nodeish is None) or (nodeish[1] is None):
-            self.filename = ""
-            self.shortname = ""
-            self.displayname = ""
-            self.line = 1
-            self.col = 0
-            self.toplevel = None
-            self.project = None
-        else:
-            fileinfo = _FILE_LIST[nodeish[1]]
-            self.filename = fileinfo[0]
-            self.shortname = fileinfo[1]
-            self.displayname = fileinfo[2]
-            # We add 1 here to convert from computerish to humanish
-            self.line = nodeish[2] + 1
-            self.col = nodeish[3]
-            self.toplevel = fileinfo[3]
-            self.project = fileinfo[4]
-        self.is_synthetic = (self.filename == '') or (self.col < 0)
-
-    # Convert a Provenance to a string for error reporting
-    def __str__(self):
-        if self.is_synthetic:
-            return "{} [synthetic node]".format(self.displayname)
-        else:
-            return "{} [line {:d} column {:d}]".format(self.displayname, self.line, self.col)
-
-
-# These exceptions are intended to be caught entirely within
-# the BuildStream framework, hence they do not reside in the
-# public exceptions.py
-class CompositeError(Exception):
-    def __init__(self, path, message):
-        super(CompositeError, self).__init__(message)
-        self.path = path
-        self.message = message
-
-
-class YAMLLoadError(Exception):
-    pass
-
-
-# Representer for YAML events comprising input to the BuildStream format.
-#
-# All streams MUST represent a single document which must be a Mapping.
-# Anything else is considered an error.
-#
-# Mappings must only have string keys, values are always represented as
-# strings if they are scalar, or else as simple dictionaries and lists.
-#
-class Representer:
-    __slots__ = (
-        "_file_index",
-        "state",
-        "output",
-        "keys",
-    )
-
-    # Initialise a new representer
-    #
-    # The file index is used to store into the Node instances so that the
-    # provenance of the YAML can be tracked.
-    #
-    # Args:
-    #   file_index (int): The index of this YAML file
-    def __init__(self, file_index):
-        self._file_index = file_index
-        self.state = "init"
-        self.output = []
-        self.keys = []
-
-    # Handle a YAML parse event
-    #
-    # Args:
-    #   event (YAML Event): The event to be handled
-    #
-    # Raises:
-    #   YAMLLoadError: Something went wrong.
-    def handle_event(self, event):
-        if getattr(event, "anchor", None) is not None:
-            raise YAMLLoadError("Anchors are disallowed in BuildStream at line {} column {}"
-                                .format(event.start_mark.line, event.start_mark.column))
-
-        if event.__class__.__name__ == "ScalarEvent":
-            if event.tag is not None:
-                if not event.tag.startswith("tag:yaml.org,2002:"):
-                    raise YAMLLoadError(
-                        "Non-core tag expressed in input.  " +
-                        "This is disallowed in BuildStream. At line {} column {}"
-                        .format(event.start_mark.line, event.start_mark.column))
-
-        handler = "_handle_{}_{}".format(self.state, event.__class__.__name__)
-        handler = getattr(self, handler, None)
-        if handler is None:
-            raise YAMLLoadError(
-                "Invalid input detected. No handler for {} in state {} at line {} column {}"
-                .format(event, self.state, event.start_mark.line, event.start_mark.column))
-
-        self.state = handler(event)  # pylint: disable=not-callable
-
-    # Get the output of the YAML parse
-    #
-    # Returns:
-    #   (Node or None): Return the Node instance of the top level mapping or
-    #                   None if there wasn't one.
-    def get_output(self):
-        try:
-            return self.output[0]
-        except IndexError:
-            return None
-
-    def _handle_init_StreamStartEvent(self, ev):
-        return "stream"
-
-    def _handle_stream_DocumentStartEvent(self, ev):
-        return "doc"
-
-    def _handle_doc_MappingStartEvent(self, ev):
-        newmap = Node({}, self._file_index, ev.start_mark.line, ev.start_mark.column)
-        self.output.append(newmap)
-        return "wait_key"
-
-    def _handle_wait_key_ScalarEvent(self, ev):
-        self.keys.append(ev.value)
-        return "wait_value"
-
-    def _handle_wait_value_ScalarEvent(self, ev):
-        key = self.keys.pop()
-        self.output[-1][0][key] = \
-            Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column)
-        return "wait_key"
-
-    def _handle_wait_value_MappingStartEvent(self, ev):
-        new_state = self._handle_doc_MappingStartEvent(ev)
-        key = self.keys.pop()
-        self.output[-2][0][key] = self.output[-1]
-        return new_state
-
-    def _handle_wait_key_MappingEndEvent(self, ev):
-        # We've finished a mapping, so pop it off the output stack
-        # unless it's the last one in which case we leave it
-        if len(self.output) > 1:
-            self.output.pop()
-            if type(self.output[-1][0]) is list:
-                return "wait_list_item"
-            else:
-                return "wait_key"
-        else:
-            return "doc"
-
-    def _handle_wait_value_SequenceStartEvent(self, ev):
-        self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
-        self.output[-2][0][self.keys[-1]] = self.output[-1]
-        return "wait_list_item"
-
-    def _handle_wait_list_item_SequenceStartEvent(self, ev):
-        self.keys.append(len(self.output[-1][0]))
-        self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
-        self.output[-2][0].append(self.output[-1])
-        return "wait_list_item"
-
-    def _handle_wait_list_item_SequenceEndEvent(self, ev):
-        # When ending a sequence, we need to pop a key because we retain the
-        # key until the end so that if we need to mutate the underlying entry
-        # we can.
-        key = self.keys.pop()
-        self.output.pop()
-        if type(key) is int:
-            return "wait_list_item"
-        else:
-            return "wait_key"
-
-    def _handle_wait_list_item_ScalarEvent(self, ev):
-        self.output[-1][0].append(
-            Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column))
-        return "wait_list_item"
-
-    def _handle_wait_list_item_MappingStartEvent(self, ev):
-        new_state = self._handle_doc_MappingStartEvent(ev)
-        self.output[-2][0].append(self.output[-1])
-        return new_state
-
-    def _handle_doc_DocumentEndEvent(self, ev):
-        if len(self.output) != 1:
-            raise YAMLLoadError("Zero, or more than one document found in YAML stream")
-        return "stream"
-
-    def _handle_stream_StreamEndEvent(self, ev):
-        return "init"
-
-
-# Loads a dictionary from some YAML
-#
-# Args:
-#    filename (str): The YAML file to load
-#    shortname (str): The filename in shorthand for error reporting (or None)
-#    copy_tree (bool): Whether to make a copy, preserving the original toplevels
-#                      for later serialization
-#    project (Project): The (optional) project to associate the parsed YAML with
-#
-# Returns (dict): A loaded copy of the YAML file with provenance information
-#
-# Raises: LoadError
-#
-def load(filename, shortname=None, copy_tree=False, *, project=None):
-    if not shortname:
-        shortname = filename
-
-    if (project is not None) and (project.junction is not None):
-        displayname = "{}:{}".format(project.junction.name, shortname)
-    else:
-        displayname = shortname
-
-    file_number = len(_FILE_LIST)
-    _FILE_LIST.append((filename, shortname, displayname, None, project))
-
-    try:
-        with open(filename) as f:
-            contents = f.read()
-
-        data = load_data(contents,
-                         file_index=file_number,
-                         file_name=filename,
-                         copy_tree=copy_tree)
-
-        return data
-    except FileNotFoundError as e:
-        raise LoadError(LoadErrorReason.MISSING_FILE,
-                        "Could not find file at {}".format(filename)) from e
-    except IsADirectoryError as e:
-        raise LoadError(LoadErrorReason.LOADING_DIRECTORY,
-                        "{} is a directory. bst command expects a .bst file."
-                        .format(filename)) from e
-    except LoadError as e:
-        raise LoadError(e.reason, "{}: {}".format(displayname, e)) from e
-
-
-# Like load(), but doesnt require the data to be in a file
-#
-def load_data(data, file_index=None, file_name=None, copy_tree=False):
-
-    try:
-        rep = Representer(file_index)
-        for event in yaml.parse(data, Loader=yaml.CBaseLoader):
-            rep.handle_event(event)
-        contents = rep.get_output()
-    except YAMLLoadError as e:
-        raise LoadError(LoadErrorReason.INVALID_YAML,
-                        "Malformed YAML:\n\n{}\n\n".format(e)) from e
-    except Exception as e:
-        raise LoadError(LoadErrorReason.INVALID_YAML,
-                        "Severely malformed YAML:\n\n{}\n\n".format(e)) from e
-
-    if not isinstance(contents, tuple) or not isinstance(contents[0], dict):
-        # Special case allowance for None, when the loaded file has only comments in it.
-        if contents is None:
-            contents = Node({}, file_index, 0, 0)
-        else:
-            raise LoadError(LoadErrorReason.INVALID_YAML,
-                            "YAML file has content of type '{}' instead of expected type 'dict': {}"
-                            .format(type(contents[0]).__name__, file_name))
-
-    # Store this away because we'll use it later for "top level" provenance
-    if file_index is not None:
-        _FILE_LIST[file_index] = (
-            _FILE_LIST[file_index][0],  # Filename
-            _FILE_LIST[file_index][1],  # Shortname
-            _FILE_LIST[file_index][2],  # Displayname
-            contents,
-            _FILE_LIST[file_index][4],  # Project
-        )
-
-    if copy_tree:
-        contents = node_copy(contents)
-    return contents
-
-
-# dump()
-#
-# Write a YAML node structure out to disk.
-#
-# This will always call `node_sanitize` on its input, so if you wanted
-# to output something close to what you read in, consider using the
-# `roundtrip_load` and `roundtrip_dump` function pair instead.
-#
-# Args:
-#    contents (any): Content to write out
-#    filename (str): The (optional) file name to write out to
-def dump(contents, filename=None):
-    roundtrip_dump(node_sanitize(contents), file=filename)
-
-
-# node_get_provenance()
-#
-# Gets the provenance for a node
-#
-# Args:
-#   node (dict): a dictionary
-#   key (str): key in the dictionary
-#   indices (list of indexes): Index path, in the case of list values
-#
-# Returns: The Provenance of the dict, member or list element
-#
-def node_get_provenance(node, key=None, indices=None):
-    assert is_node(node)
-
-    if key is None:
-        # Retrieving the provenance for this node directly
-        return ProvenanceInformation(node)
-
-    if key and not indices:
-        return ProvenanceInformation(node[0].get(key))
-
-    nodeish = node[0].get(key)
-    for idx in indices:
-        nodeish = nodeish[0][idx]
-
-    return ProvenanceInformation(nodeish)
-
-
-# A sentinel to be used as a default argument for functions that need
-# to distinguish between a kwarg set to None and an unset kwarg.
-_sentinel = object()
-
-
-# node_get()
-#
-# Fetches a value from a dictionary node and checks it for
-# an expected value. Use default_value when parsing a value
-# which is only optionally supplied.
-#
-# Args:
-#    node (dict): The dictionary node
-#    expected_type (type): The expected type for the value being searched
-#    key (str): The key to get a value for in node
-#    indices (list of ints): Optionally decend into lists of lists
-#    default_value: Optionally return this value if the key is not found
-#    allow_none: (bool): Allow None to be a valid value
-#
-# Returns:
-#    The value if found in node, otherwise default_value is returned
-#
-# Raises:
-#    LoadError, when the value found is not of the expected type
-#
-# Note:
-#    Returned strings are stripped of leading and trailing whitespace
-#
-def node_get(node, expected_type, key, indices=None, *, default_value=_sentinel, allow_none=False):
-    assert type(node) is Node
-
-    if indices is None:
-        if default_value is _sentinel:
-            value = node[0].get(key, Node(default_value, None, 0, 0))
-        else:
-            value = node[0].get(key, Node(default_value, None, 0, next(_SYNTHETIC_COUNTER)))
-
-        if value[0] is _sentinel:
-            provenance = node_get_provenance(node)
-            raise LoadError(LoadErrorReason.INVALID_DATA,
-                            "{}: Dictionary did not contain expected key '{}'".format(provenance, key))
-    else:
-        # Implied type check of the element itself
-        # No need to synthesise useful node content as we destructure it immediately
-        value = Node(node_get(node, list, key), None, 0, 0)
-        for index in indices:
-            value = value[0][index]
-            if type(value) is not Node:
-                value = (value,)
-
-    # Optionally allow None as a valid value for any type
-    if value[0] is None and (allow_none or default_value is None):
-        return None
-
-    if (expected_type is not None) and (not isinstance(value[0], expected_type)):
-        # Attempt basic conversions if possible, typically we want to
-        # be able to specify numeric values and convert them to strings,
-        # but we dont want to try converting dicts/lists
-        try:
-            if (expected_type == bool and isinstance(value[0], str)):
-                # Dont coerce booleans to string, this makes "False" strings evaluate to True
-                # We don't structure into full nodes since there's no need.
-                if value[0] in ('True', 'true'):
-                    value = (True, None, 0, 0)
-                elif value[0] in ('False', 'false'):
-                    value = (False, None, 0, 0)
-                else:
-                    raise ValueError()
-            elif not (expected_type == list or
-                      expected_type == dict or
-                      isinstance(value[0], (list, dict))):
-                value = (expected_type(value[0]), None, 0, 0)
-            else:
-                raise ValueError()
-        except (ValueError, TypeError):
-            provenance = node_get_provenance(node, key=key, indices=indices)
-            if indices:
-                path = [key]
-                path.extend("[{:d}]".format(i) for i in indices)
-                path = "".join(path)
-            else:
-                path = key
-            raise LoadError(LoadErrorReason.INVALID_DATA,
-                            "{}: Value of '{}' is not of the expected type '{}'"
-                            .format(provenance, path, expected_type.__name__))
-
-    # Now collapse lists, and scalars, to their value, leaving nodes as-is
-    if type(value[0]) is not dict:
-        value = value[0]
-
-    # Trim it at the bud, let all loaded strings from yaml be stripped of whitespace
-    if type(value) is str:
-        value = value.strip()
-
-    elif type(value) is list:
-        # Now we create a fresh list which unwraps the str and list types
-        # semi-recursively.
-        value = __trim_list_provenance(value)
-
-    return value
-
-
-def __trim_list_provenance(value):
-    ret = []
-    for entry in value:
-        if type(entry) is not Node:
-            entry = (entry, None, 0, 0)
-        if type(entry[0]) is list:
-            ret.append(__trim_list_provenance(entry[0]))
-        elif type(entry[0]) is dict:
-            ret.append(entry)
-        else:
-            ret.append(entry[0])
-    return ret
-
-
-# node_set()
-#
-# Set an item within the node.  If using `indices` be aware that the entry must
-# already exist, or else a KeyError will be raised.  Use `node_extend_list` to
-# create entries before using `node_set`
-#
-# Args:
-#    node (tuple): The node
-#    key (str): The key name
-#    value: The value
-#    indices: Any indices to index into the list referenced by key, like in
-#             `node_get` (must be a list of integers)
-#
-def node_set(node, key, value, indices=None):
-    if indices:
-        node = node[0][key]
-        key = indices.pop()
-        for idx in indices:
-            node = node[0][idx]
-    if type(value) is Node:
-        node[0][key] = value
-    else:
-        try:
-            # Need to do this just in case we're modifying a list
-            old_value = node[0][key]
-        except KeyError:
-            old_value = None
-        if old_value is None:
-            node[0][key] = Node(value, node[1], node[2], next(_SYNTHETIC_COUNTER))
-        else:
-            node[0][key] = Node(value, old_value[1], old_value[2], old_value[3])
-
-
-# node_extend_list()
-#
-# Extend a list inside a node to a given length, using the passed
-# default value to fill it out.
-#
-# Valid default values are:
-#    Any string
-#    An empty dict
-#    An empty list
-#
-# Args:
-#    node (node): The node
-#    key (str): The list name in the node
-#    length (int): The length to extend the list to
-#    default (any): The default value to extend with.
-def node_extend_list(node, key, length, default):
-    assert type(default) is str or default in ([], {})
-
-    list_node = node[0].get(key)
-    if list_node is None:
-        list_node = node[0][key] = Node([], node[1], node[2], next(_SYNTHETIC_COUNTER))
-
-    assert type(list_node[0]) is list
-
-    the_list = list_node[0]
-    def_type = type(default)
-
-    file_index = node[1]
-    if the_list:
-        line_num = the_list[-1][2]
-    else:
-        line_num = list_node[2]
-
-    while length > len(the_list):
-        if def_type is str:
-            value = default
-        elif def_type is list:
-            value = []
-        else:
-            value = {}
-
-        line_num += 1
-
-        the_list.append(Node(value, file_index, line_num, next(_SYNTHETIC_COUNTER)))
-
-
-# node_items()
-#
-# A convenience generator for iterating over loaded key/value
-# tuples in a dictionary loaded from project YAML.
-#
-# Args:
-#    node (dict): The dictionary node
-#
-# Yields:
-#    (str): The key name
-#    (anything): The value for the key
-#
-def node_items(node):
-    if type(node) is not Node:
-        node = Node(node, None, 0, 0)
-    for key, value in node[0].items():
-        if type(value) is not Node:
-            value = Node(value, None, 0, 0)
-        if type(value[0]) is dict:
-            yield (key, value)
-        elif type(value[0]) is list:
-            yield (key, __trim_list_provenance(value[0]))
-        else:
-            yield (key, value[0])
-
-
-# node_keys()
-#
-# A convenience generator for iterating over loaded keys
-# in a dictionary loaded from project YAML.
-#
-# Args:
-#    node (dict): The dictionary node
-#
-# Yields:
-#    (str): The key name
-#
-def node_keys(node):
-    if type(node) is not Node:
-        node = Node(node, None, 0, 0)
-    yield from node[0].keys()
-
-
-# node_del()
-#
-# A convenience generator for iterating over loaded key/value
-# tuples in a dictionary loaded from project YAML.
-#
-# Args:
-#    node (dict): The dictionary node
-#    key (str): The key we want to remove
-#    safe (bool): Whether to raise a KeyError if unable
-#
-def node_del(node, key, safe=False):
-    try:
-        del node[0][key]
-    except KeyError:
-        if not safe:
-            raise
-
-
-# is_node()
-#
-# A test method which returns whether or not the passed in value
-# is a valid YAML node.  It is not valid to call this on a Node
-# object which is not a Mapping.
-#
-# Args:
-#    maybenode (any): The object to test for nodeness
-#
-# Returns:
-#    (bool): Whether or not maybenode was a Node
-#
-def is_node(maybenode):
-    # It's a programming error to give this a Node which isn't a mapping
-    # so assert that.
-    assert (type(maybenode) is not Node) or (type(maybenode[0]) is dict)
-    # Now return the type check
-    return type(maybenode) is Node
-
-
-# new_synthetic_file()
-#
-# Create a new synthetic mapping node, with an associated file entry
-# (in _FILE_LIST) such that later tracking can correctly determine which
-# file needs writing to in order to persist the changes.
-#
-# Args:
-#    filename (str): The name of the synthetic file to create
-#    project (Project): The optional project to associate this synthetic file with
-#
-# Returns:
-#    (Node): An empty YAML mapping node, whose provenance is to this new
-#            synthetic file
-#
-def new_synthetic_file(filename, project=None):
-    file_index = len(_FILE_LIST)
-    node = Node({}, file_index, 0, 0)
-    _FILE_LIST.append((filename,
-                       filename,
-                       "<synthetic {}>".format(filename),
-                       node,
-                       project))
-    return node
-
-
-# new_empty_node()
-#
-# Args:
-#    ref_node (Node): Optional node whose provenance should be referenced
-#
-# Returns
-#    (Node): A new empty YAML mapping node
-#
-def new_empty_node(ref_node=None):
-    if ref_node is not None:
-        return Node({}, ref_node[1], ref_node[2], next(_SYNTHETIC_COUNTER))
-    else:
-        return Node({}, None, 0, 0)
-
-
-# new_node_from_dict()
-#
-# Args:
-#   indict (dict): The input dictionary
-#
-# Returns:
-#   (Node): A new synthetic YAML tree which represents this dictionary
-#
-def new_node_from_dict(indict):
-    ret = {}
-    for k, v in indict.items():
-        vtype = type(v)
-        if vtype is dict:
-            ret[k] = new_node_from_dict(v)
-        elif vtype is list:
-            ret[k] = __new_node_from_list(v)
-        else:
-            ret[k] = Node(str(v), None, 0, next(_SYNTHETIC_COUNTER))
-    return Node(ret, None, 0, next(_SYNTHETIC_COUNTER))
-
-
-# Internal function to help new_node_from_dict() to handle lists
-def __new_node_from_list(inlist):
-    ret = []
-    for v in inlist:
-        vtype = type(v)
-        if vtype is dict:
-            ret.append(new_node_from_dict(v))
-        elif vtype is list:
-            ret.append(__new_node_from_list(v))
-        else:
-            ret.append(Node(str(v), None, 0, next(_SYNTHETIC_COUNTER)))
-    return Node(ret, None, 0, next(_SYNTHETIC_COUNTER))
-
-
-# _is_composite_list
-#
-# Checks if the given node is a Mapping with array composition
-# directives.
-#
-# Args:
-#    node (value): Any node
-#
-# Returns:
-#    (bool): True if node was a Mapping containing only
-#            list composition directives
-#
-# Raises:
-#    (LoadError): If node was a mapping and contained a mix of
-#                 list composition directives and other keys
-#
-def _is_composite_list(node):
-
-    if type(node[0]) is dict:
-        has_directives = False
-        has_keys = False
-
-        for key, _ in node_items(node):
-            if key in ['(>)', '(<)', '(=)']:  # pylint: disable=simplifiable-if-statement
-                has_directives = True
-            else:
-                has_keys = True
-
-            if has_keys and has_directives:
-                provenance = node_get_provenance(node)
-                raise LoadError(LoadErrorReason.INVALID_DATA,
-                                "{}: Dictionary contains array composition directives and arbitrary keys"
-                                .format(provenance))
-        return has_directives
-
-    return False
-
-
-# _compose_composite_list()
-#
-# Composes a composite list (i.e. a dict with list composition directives)
-# on top of a target list which is a composite list itself.
-#
-# Args:
-#    target (Node): A composite list
-#    source (Node): A composite list
-#
-def _compose_composite_list(target, source):
-    clobber = source[0].get("(=)")
-    prefix = source[0].get("(<)")
-    suffix = source[0].get("(>)")
-    if clobber is not None:
-        # We want to clobber the target list
-        # which basically means replacing the target list
-        # with ourselves
-        target[0]["(=)"] = clobber
-        if prefix is not None:
-            target[0]["(<)"] = prefix
-        elif "(<)" in target[0]:
-            target[0]["(<)"][0].clear()
-        if suffix is not None:
-            target[0]["(>)"] = suffix
-        elif "(>)" in target[0]:
-            target[0]["(>)"][0].clear()
-    else:
-        # Not clobbering, so prefix the prefix and suffix the suffix
-        if prefix is not None:
-            if "(<)" in target[0]:
-                for v in reversed(prefix[0]):
-                    target[0]["(<)"][0].insert(0, v)
-            else:
-                target[0]["(<)"] = prefix
-        if suffix is not None:
-            if "(>)" in target[0]:
-                target[0]["(>)"][0].extend(suffix[0])
-            else:
-                target[0]["(>)"] = suffix
-
-
-# _compose_list()
-#
-# Compose a composite list (a dict with composition directives) on top of a
-# simple list.
-#
-# Args:
-#    target (Node): The target list to be composed into
-#    source (Node): The composition list to be composed from
-#
-def _compose_list(target, source):
-    clobber = source[0].get("(=)")
-    prefix = source[0].get("(<)")
-    suffix = source[0].get("(>)")
-    if clobber is not None:
-        target[0].clear()
-        target[0].extend(clobber[0])
-    if prefix is not None:
-        for v in reversed(prefix[0]):
-            target[0].insert(0, v)
-    if suffix is not None:
-        target[0].extend(suffix[0])
-
-
-# composite_dict()
-#
-# Compose one mapping node onto another
-#
-# Args:
-#    target (Node): The target to compose into
-#    source (Node): The source to compose from
-#    path   (list): The path to the current composition node
-#
-# Raises: CompositeError
-#
-def composite_dict(target, source, path=None):
-    if path is None:
-        path = []
-    for k, v in source[0].items():
-        path.append(k)
-        if type(v[0]) is list:
-            # List clobbers anything list-like
-            target_value = target[0].get(k)
-            if not (target_value is None or
-                    type(target_value[0]) is list or
-                    _is_composite_list(target_value)):
-                raise CompositeError(path,
-                                     "{}: List cannot overwrite {} at: {}"
-                                     .format(node_get_provenance(source, k),
-                                             k,
-                                             node_get_provenance(target, k)))
-            # Looks good, clobber it
-            target[0][k] = v
-        elif _is_composite_list(v):
-            if k not in target[0]:
-                # Composite list clobbers empty space
-                target[0][k] = v
-            elif type(target[0][k][0]) is list:
-                # Composite list composes into a list
-                _compose_list(target[0][k], v)
-            elif _is_composite_list(target[0][k]):
-                # Composite list merges into composite list
-                _compose_composite_list(target[0][k], v)
-            else:
-                # Else composing on top of normal dict or a scalar, so raise...
-                raise CompositeError(path,
-                                     "{}: Cannot compose lists onto {}".format(
-                                         node_get_provenance(v),
-                                         node_get_provenance(target[0][k])))
-        elif type(v[0]) is dict:
-            # We're composing a dict into target now
-            if k not in target[0]:
-                # Target lacks a dict at that point, make a fresh one with
-                # the same provenance as the incoming dict
-                target[0][k] = Node({}, v[1], v[2], v[3])
-            if type(target[0]) is not dict:
-                raise CompositeError(path,
-                                     "{}: Cannot compose dictionary onto {}".format(
-                                         node_get_provenance(v),
-                                         node_get_provenance(target[0][k])))
-            composite_dict(target[0][k], v, path)
-        else:
-            target_value = target[0].get(k)
-            if target_value is not None and type(target_value[0]) is not str:
-                raise CompositeError(path,
-                                     "{}: Cannot compose scalar on non-scalar at {}".format(
-                                         node_get_provenance(v),
-                                         node_get_provenance(target[0][k])))
-            target[0][k] = v
-        path.pop()
-
-
-# Like composite_dict(), but raises an all purpose LoadError for convenience
-#
-def composite(target, source):
-    assert type(source[0]) is dict
-    assert type(target[0]) is dict
-
-    try:
-        composite_dict(target, source)
-    except CompositeError as e:
-        source_provenance = node_get_provenance(source)
-        error_prefix = ""
-        if source_provenance:
-            error_prefix = "{}: ".format(source_provenance)
-        raise LoadError(LoadErrorReason.ILLEGAL_COMPOSITE,
-                        "{}Failure composing {}: {}"
-                        .format(error_prefix,
-                                e.path,
-                                e.message)) from e
-
-
-# Like composite(target, source), but where target overrides source instead.
-#
-def composite_and_move(target, source):
-    composite(source, target)
-
-    to_delete = [key for key in target[0].keys() if key not in source[0]]
-    for key, value in source[0].items():
-        target[0][key] = value
-    for key in to_delete:
-        del target[0][key]
-
-
-# Types we can short-circuit in node_sanitize for speed.
-__SANITIZE_SHORT_CIRCUIT_TYPES = (int, float, str, bool)
-
-
-# node_sanitize()
-#
-# Returns an alphabetically ordered recursive copy
-# of the source node with internal provenance information stripped.
-#
-# Only dicts are ordered, list elements are left in order.
-#
-def node_sanitize(node, *, dict_type=OrderedDict):
-    node_type = type(node)
-
-    # If we have an unwrappable node, unwrap it
-    if node_type is Node:
-        node = node[0]
-        node_type = type(node)
-
-    # Short-circuit None which occurs ca. twice per element
-    if node is None:
-        return node
-
-    # Next short-circuit integers, floats, strings, booleans, and tuples
-    if node_type in __SANITIZE_SHORT_CIRCUIT_TYPES:
-        return node
-
-    # Now short-circuit lists.
-    elif node_type is list:
-        return [node_sanitize(elt, dict_type=dict_type) for elt in node]
-
-    # Finally dict, and other Mappings need special handling
-    elif node_type is dict:
-        result = dict_type()
-
-        key_list = [key for key, _ in node.items()]
-        for key in sorted(key_list):
-            result[key] = node_sanitize(node[key], dict_type=dict_type)
-
-        return result
-
-    # Sometimes we're handed tuples and we can't be sure what they contain
-    # so we have to sanitize into them
-    elif node_type is tuple:
-        return tuple((node_sanitize(v, dict_type=dict_type) for v in node))
-
-    # Everything else just gets returned as-is.
-    return node
-
-
-# node_validate()
-#
-# Validate the node so as to ensure the user has not specified
-# any keys which are unrecognized by buildstream (usually this
-# means a typo which would otherwise not trigger an error).
-#
-# Args:
-#    node (dict): A dictionary loaded from YAML
-#    valid_keys (list): A list of valid keys for the specified node
-#
-# Raises:
-#    LoadError: In the case that the specified node contained
-#               one or more invalid keys
-#
-def node_validate(node, valid_keys):
-
-    # Probably the fastest way to do this: https://stackoverflow.com/a/23062482
-    valid_keys = set(valid_keys)
-    invalid = next((key for key in node[0] if key not in valid_keys), None)
-
-    if invalid:
-        provenance = node_get_provenance(node, key=invalid)
-        raise LoadError(LoadErrorReason.INVALID_DATA,
-                        "{}: Unexpected key: {}".format(provenance, invalid))
-
-
-# Node copying
-#
-# Unfortunately we copy nodes a *lot* and `isinstance()` is super-slow when
-# things from collections.abc get involved.  The result is the following
-# intricate but substantially faster group of tuples and the use of `in`.
-#
-# If any of the {node,list}_copy routines raise a ValueError
-# then it's likely additional types need adding to these tuples.
-
-
-# These types just have their value copied
-__QUICK_TYPES = (str, bool)
-
-# These are the directives used to compose lists, we need this because it's
-# slightly faster during the node_final_assertions checks
-__NODE_ASSERT_COMPOSITION_DIRECTIVES = ('(>)', '(<)', '(=)')
-
-
-# node_copy()
-#
-# Make a deep copy of the given YAML node, preserving provenance.
-#
-# Args:
-#    source (Node): The YAML node to copy
-#
-# Returns:
-#    (Node): A deep copy of source with provenance preserved.
-#
-def node_copy(source):
-    copy = {}
-    for key, value in source[0].items():
-        value_type = type(value[0])
-        if value_type is dict:
-            copy[key] = node_copy(value)
-        elif value_type is list:
-            copy[key] = _list_copy(value)
-        elif value_type in __QUICK_TYPES:
-            copy[key] = value
-        else:
-            raise ValueError("Unable to be quick about node_copy of {}".format(value_type))
-
-    return Node(copy, source[1], source[2], source[3])
-
-
-# Internal function to help node_copy() but for lists.
-def _list_copy(source):
-    copy = []
-    for item in source[0]:
-        item_type = type(item[0])
-        if item_type is dict:
-            copy.append(node_copy(item))
-        elif item_type is list:
-            copy.append(_list_copy(item))
-        elif item_type in __QUICK_TYPES:
-            copy.append(item)
-        else:
-            raise ValueError("Unable to be quick about list_copy of {}".format(item_type))
-
-    return Node(copy, source[1], source[2], source[3])
-
-
-# node_final_assertions()
-#
-# This must be called on a fully loaded and composited node,
-# after all composition has completed.
-#
-# Args:
-#    node (Mapping): The final composited node
-#
-# Raises:
-#    (LoadError): If any assertions fail
-#
-def node_final_assertions(node):
-    assert type(node) is Node
-
-    for key, value in node[0].items():
-
-        # Assert that list composition directives dont remain, this
-        # indicates that the user intended to override a list which
-        # never existed in the underlying data
-        #
-        if key in __NODE_ASSERT_COMPOSITION_DIRECTIVES:
-            provenance = node_get_provenance(node, key)
-            raise LoadError(LoadErrorReason.TRAILING_LIST_DIRECTIVE,
-                            "{}: Attempt to override non-existing list".format(provenance))
-
-        value_type = type(value[0])
-
-        if value_type is dict:
-            node_final_assertions(value)
-        elif value_type is list:
-            _list_final_assertions(value)
-
-
-# Helper function for node_final_assertions(), but for lists.
-def _list_final_assertions(values):
-    for value in values[0]:
-        value_type = type(value[0])
-
-        if value_type is dict:
-            node_final_assertions(value)
-        elif value_type is list:
-            _list_final_assertions(value)
-
-
-# assert_symbol_name()
-#
-# A helper function to check if a loaded string is a valid symbol
-# name and to raise a consistent LoadError if not. For strings which
-# are required to be symbols.
-#
-# Args:
-#    provenance (Provenance): The provenance of the loaded symbol, or None
-#    symbol_name (str): The loaded symbol name
-#    purpose (str): The purpose of the string, for an error message
-#    allow_dashes (bool): Whether dashes are allowed for this symbol
-#
-# Raises:
-#    LoadError: If the symbol_name is invalid
-#
-# Note that dashes are generally preferred for variable names and
-# usage in YAML, but things such as option names which will be
-# evaluated with jinja2 cannot use dashes.
-def assert_symbol_name(provenance, symbol_name, purpose, *, allow_dashes=True):
-    valid_chars = string.digits + string.ascii_letters + '_'
-    if allow_dashes:
-        valid_chars += '-'
-
-    valid = True
-    if not symbol_name:
-        valid = False
-    elif any(x not in valid_chars for x in symbol_name):
-        valid = False
-    elif symbol_name[0] in string.digits:
-        valid = False
-
-    if not valid:
-        detail = "Symbol names must contain only alphanumeric characters, " + \
-                 "may not start with a digit, and may contain underscores"
-        if allow_dashes:
-            detail += " or dashes"
-
-        message = "Invalid symbol name for {}: '{}'".format(purpose, symbol_name)
-        if provenance is not None:
-            message = "{}: {}".format(provenance, message)
-
-        raise LoadError(LoadErrorReason.INVALID_SYMBOL_NAME,
-                        message, detail=detail)
-
-
-# node_find_target()
-#
-# Searches the given node tree for the given target node.
-#
-# This is typically used when trying to walk a path to a given node
-# for the purpose of then modifying a similar tree of objects elsewhere
-#
-# If the key is provided, then we actually hunt for the node represented by
-# target[key] and return its container, rather than hunting for target directly
-#
-# Args:
-#    node (Node): The node at the root of the tree to search
-#    target (Node): The node you are looking for in that tree
-#    key (str): Optional string key within target node
-#
-# Returns:
-#    (list): A path from `node` to `target` or None if `target` is not in the subtree
-def node_find_target(node, target, *, key=None):
-    assert type(node) is Node
-    assert type(target) is Node
-    if key is not None:
-        target = target[0][key]
-
-    path = []
-    if _walk_find_target(node, path, target):
-        if key:
-            # Remove key from end of path
-            path = path[:-1]
-        return path
-    return None
-
-
-# Helper for node_find_target() which walks a value
-def _walk_find_target(node, path, target):
-    if node[1:] == target[1:]:
-        return True
-    elif type(node[0]) is dict:
-        return _walk_dict_node(node, path, target)
-    elif type(node[0]) is list:
-        return _walk_list_node(node, path, target)
-    return False
-
-
-# Helper for node_find_target() which walks a list
-def _walk_list_node(node, path, target):
-    for i, v in enumerate(node[0]):
-        path.append(i)
-        if _walk_find_target(v, path, target):
-            return True
-        del path[-1]
-    return False
-
-
-# Helper for node_find_target() which walks a mapping
-def _walk_dict_node(node, path, target):
-    for k, v in node[0].items():
-        path.append(k)
-        if _walk_find_target(v, path, target):
-            return True
-        del path[-1]
-    return False
-
-
-###############################################################################
-
-# Roundtrip code
-
-# Always represent things consistently:
-
-yaml.RoundTripRepresenter.add_representer(OrderedDict,
-                                          yaml.SafeRepresenter.represent_dict)
-
-# Always parse things consistently
-
-yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:int',
-                                          yaml.RoundTripConstructor.construct_yaml_str)
-yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:float',
-                                          yaml.RoundTripConstructor.construct_yaml_str)
-yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:bool',
-                                          yaml.RoundTripConstructor.construct_yaml_str)
-yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:null',
-                                          yaml.RoundTripConstructor.construct_yaml_str)
-yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:timestamp',
-                                          yaml.RoundTripConstructor.construct_yaml_str)
-
-
-# HardlineDumper
-#
-# This is a dumper used during roundtrip_dump which forces every scalar to be
-# a plain string, in order to match the output format to the input format.
-#
-# If you discover something is broken, please add a test case to the roundtrip
-# test in tests/internals/yaml/roundtrip-test.yaml
-#
-class HardlineDumper(yaml.RoundTripDumper):
-    def __init__(self, *args, **kwargs):
-        yaml.RoundTripDumper.__init__(self, *args, **kwargs)
-        # For each of YAML 1.1 and 1.2, force everything to be a plain string
-        for version in [(1, 1), (1, 2), None]:
-            self.add_version_implicit_resolver(
-                version,
-                u'tag:yaml.org,2002:str',
-                yaml.util.RegExp(r'.*'),
-                None)
-
-
-# roundtrip_load()
-#
-# Load a YAML file into memory in a form which allows roundtripping as best
-# as ruamel permits.
-#
-# Note, the returned objects can be treated as Mappings and Lists and Strings
-# but replacing content wholesale with plain dicts and lists may result
-# in a loss of comments and formatting.
-#
-# Args:
-#    filename (str): The file to load in
-#    allow_missing (bool): Optionally set this to True to allow missing files
-#
-# Returns:
-#    (Mapping): The loaded YAML mapping.
-#
-# Raises:
-#    (LoadError): If the file is missing, or a directory, this is raised.
-#                 Also if the YAML is malformed.
-#
-def roundtrip_load(filename, *, allow_missing=False):
-    try:
-        with open(filename, "r") as fh:
-            data = fh.read()
-        contents = roundtrip_load_data(data, filename=filename)
-    except FileNotFoundError as e:
-        if allow_missing:
-            # Missing files are always empty dictionaries
-            return {}
-        else:
-            raise LoadError(LoadErrorReason.MISSING_FILE,
-                            "Could not find file at {}".format(filename)) from e
-    except IsADirectoryError as e:
-        raise LoadError(LoadErrorReason.LOADING_DIRECTORY,
-                        "{} is a directory."
-                        .format(filename)) from e
-    return contents
-
-
-# roundtrip_load_data()
-#
-# Parse the given contents as YAML, returning them as a roundtrippable data
-# structure.
-#
-# A lack of content will be returned as an empty mapping.
-#
-# Args:
-#    contents (str): The contents to be parsed as YAML
-#    filename (str): Optional filename to be used in error reports
-#
-# Returns:
-#    (Mapping): The loaded YAML mapping
-#
-# Raises:
-#    (LoadError): Raised on invalid YAML, or YAML which parses to something other
-#                 than a Mapping
-#
-def roundtrip_load_data(contents, *, filename=None):
-    try:
-        contents = yaml.load(contents, yaml.RoundTripLoader, preserve_quotes=True)
-    except (yaml.scanner.ScannerError, yaml.composer.ComposerError, yaml.parser.ParserError) as e:
-        raise LoadError(LoadErrorReason.INVALID_YAML,
-                        "Malformed YAML:\n\n{}\n\n{}\n".format(e.problem, e.problem_mark)) from e
-
-    # Special case empty files at this point
-    if contents is None:
-        # We'll make them empty mappings like the main Node loader
-        contents = {}
-
-    if not isinstance(contents, Mapping):
-        raise LoadError(LoadErrorReason.INVALID_YAML,
-                        "YAML file has content of type '{}' instead of expected type 'dict': {}"
-                        .format(type(contents).__name__, filename))
-
-    return contents
-
-
-# roundtrip_dump()
-#
-# Dumps the given contents as a YAML file.  Ideally the contents came from
-# parsing with `roundtrip_load` or `roundtrip_load_data` so that they will be
-# dumped in the same form as they came from.
-#
-# If `file` is a string, it is the filename to write to, if `file` has a
-# `write` method, it's treated as a stream, otherwise output is to stdout.
-#
-# Args:
-#    contents (Mapping or list): The content to write out as YAML.
-#    file (any): The file to write to
-#
-def roundtrip_dump(contents, file=None):
-    assert type(contents) is not Node
-
-    def stringify_dict(thing):
-        for k, v in thing.items():
-            if type(v) is str:
-                pass
-            elif isinstance(v, Mapping):
-                stringify_dict(v)
-            elif isinstance(v, Sequence):
-                stringify_list(v)
-            else:
-                thing[k] = str(v)
-
-    def stringify_list(thing):
-        for i, v in enumerate(thing):
-            if type(v) is str:
-                pass
-            elif isinstance(v, Mapping):
-                stringify_dict(v)
-            elif isinstance(v, Sequence):
-                stringify_list(v)
-            else:
-                thing[i] = str(v)
-
-    contents = deepcopy(contents)
-    stringify_dict(contents)
-
-    with ExitStack() as stack:
-        if type(file) is str:
-            from . import utils
-            f = stack.enter_context(utils.save_file_atomic(file, 'w'))
-        elif hasattr(file, 'write'):
-            f = file
-        else:
-            f = sys.stdout
-        yaml.round_trip_dump(contents, f, Dumper=HardlineDumper)
diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
new file mode 100644
index 000000000..012400d2b
--- /dev/null
+++ b/src/buildstream/_yaml.pyx
@@ -0,0 +1,1444 @@
+#
+#  Copyright (C) 2018 Codethink Limited
+#  Copyright (C) 2019 Bloomberg LLP
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2 of the License, or (at your option) any later version.
+#
+#  This library is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+#  Authors:
+#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
+#        Daniel Silverstone <daniel.silverstone@codethink.co.uk>
+#        James Ennis <james.ennis@codethink.co.uk>
+#        Benjamin Schubert <bschubert@bloomberg.net>
+
+import sys
+import string
+from contextlib import ExitStack
+from collections import OrderedDict
+from collections.abc import Mapping, Sequence
+from copy import deepcopy
+from itertools import count
+
+from ruamel import yaml
+from ._exceptions import LoadError, LoadErrorReason
+
+
+# Without this, pylint complains about all the `type(foo) is blah` checks
+# because it feels isinstance() is more idiomatic.  Sadly, it is much slower to
+# do `isinstance(foo, blah)` for reasons I am unable to fathom.  As such, we
+# blanket disable the check for this module.
+#
+# pylint: disable=unidiomatic-typecheck
+
+
+# Node()
+#
+# Container for YAML loaded data and its provenance
+#
+# All nodes returned (and all internal lists/strings) have this type (rather
+# than a plain tuple, to distinguish them in things like node_sanitize)
+#
+# Members:
+#   value (str/list/dict): The loaded value.
+#   file_index (int): Index within _FILE_LIST (a list of loaded file paths).
+#                     Negative indices indicate synthetic nodes so that
+#                     they can be referenced.
+#   line (int): The line number within the file where the value appears.
+#   col (int): The column number within the file where the value appears.
+#
+cdef class Node:
+
+    cdef public object value
+    cdef public int file_index
+    cdef public int line
+    cdef public int column
+
+    def __init__(self, object value, int file_index, int line, int column):
+        self.value = value
+        self.file_index = file_index
+        self.line = line
+        self.column = column
+
+    def __contains__(self, what):
+        # Delegate to the inner value, though this will likely not work
+        # very well if the node is a list or string, it's unlikely that
+        # code which has access to such nodes would do this.
+        return what in self.value
+
+# File name handling
+_FILE_LIST = []
+
+
+# Purely synthetic node will have _SYNCTHETIC_FILE_INDEX for the file number, have line number
+# zero, and a negative column number which comes from inverting the next value
+# out of this counter.  Synthetic nodes created with a reference node will
+# have a file number from the reference node, some unknown line number, and
+# a negative column number from this counter.
+cdef int _SYNTHETIC_FILE_INDEX = -1
+
+_SYNTHETIC_COUNTER = count(start=-1, step=-1)
+
+
+# Returned from node_get_provenance
+class ProvenanceInformation:
+
+    __slots__ = (
+        "filename",
+        "shortname",
+        "displayname",
+        "line",
+        "col",
+        "toplevel",
+        "node",
+        "project",
+        "is_synthetic",
+    )
+
+    def __init__(self, nodeish):
+        self.node = nodeish
+        if (nodeish is None) or (nodeish.file_index is None):
+            self.filename = ""
+            self.shortname = ""
+            self.displayname = ""
+            self.line = 1
+            self.col = 0
+            self.toplevel = None
+            self.project = None
+        else:
+            fileinfo = _FILE_LIST[nodeish.file_index]
+            self.filename = fileinfo[0]
+            self.shortname = fileinfo[1]
+            self.displayname = fileinfo[2]
+            # We add 1 here to convert from computerish to humanish
+            self.line = nodeish.line + 1
+            self.col = nodeish.column
+            self.toplevel = fileinfo[3]
+            self.project = fileinfo[4]
+        self.is_synthetic = (self.filename == '') or (self.col < 0)
+
+    # Convert a Provenance to a string for error reporting
+    def __str__(self):
+        if self.is_synthetic:
+            return "{} [synthetic node]".format(self.displayname)
+        else:
+            return "{} [line {:d} column {:d}]".format(self.displayname, self.line, self.col)
+
+
+# These exceptions are intended to be caught entirely within
+# the BuildStream framework, hence they do not reside in the
+# public exceptions.py
+class CompositeError(Exception):
+    def __init__(self, path, message):
+        super(CompositeError, self).__init__(message)
+        self.path = path
+        self.message = message
+
+
+class YAMLLoadError(Exception):
+    pass
+
+
+# Representer for YAML events comprising input to the BuildStream format.
+#
+# All streams MUST represent a single document which must be a Mapping.
+# Anything else is considered an error.
+#
+# Mappings must only have string keys, values are always represented as
+# strings if they are scalar, or else as simple dictionaries and lists.
+#
+class Representer:
+    __slots__ = (
+        "_file_index",
+        "state",
+        "output",
+        "keys",
+    )
+
+    # Initialise a new representer
+    #
+    # The file index is used to store into the Node instances so that the
+    # provenance of the YAML can be tracked.
+    #
+    # Args:
+    #   file_index (int): The index of this YAML file
+    def __init__(self, file_index):
+        self._file_index = file_index
+        self.state = "init"
+        self.output = []
+        self.keys = []
+
+    # Handle a YAML parse event
+    #
+    # Args:
+    #   event (YAML Event): The event to be handled
+    #
+    # Raises:
+    #   YAMLLoadError: Something went wrong.
+    def handle_event(self, event):
+        if getattr(event, "anchor", None) is not None:
+            raise YAMLLoadError("Anchors are disallowed in BuildStream at line {} column {}"
+                                .format(event.start_mark.line, event.start_mark.column))
+
+        if event.__class__.__name__ == "ScalarEvent":
+            if event.tag is not None:
+                if not event.tag.startswith("tag:yaml.org,2002:"):
+                    raise YAMLLoadError(
+                        "Non-core tag expressed in input.  " +
+                        "This is disallowed in BuildStream. At line {} column {}"
+                        .format(event.start_mark.line, event.start_mark.column))
+
+        handler = "_handle_{}_{}".format(self.state, event.__class__.__name__)
+        handler = getattr(self, handler, None)
+        if handler is None:
+            raise YAMLLoadError(
+                "Invalid input detected. No handler for {} in state {} at line {} column {}"
+                .format(event, self.state, event.start_mark.line, event.start_mark.column))
+
+        self.state = handler(event)  # pylint: disable=not-callable
+
+    # Get the output of the YAML parse
+    #
+    # Returns:
+    #   (Node or None): Return the Node instance of the top level mapping or
+    #                   None if there wasn't one.
+    def get_output(self):
+        try:
+            return self.output[0]
+        except IndexError:
+            return None
+
+    def _handle_init_StreamStartEvent(self, ev):
+        return "stream"
+
+    def _handle_stream_DocumentStartEvent(self, ev):
+        return "doc"
+
+    def _handle_doc_MappingStartEvent(self, ev):
+        newmap = Node({}, self._file_index, ev.start_mark.line, ev.start_mark.column)
+        self.output.append(newmap)
+        return "wait_key"
+
+    def _handle_wait_key_ScalarEvent(self, ev):
+        self.keys.append(ev.value)
+        return "wait_value"
+
+    def _handle_wait_value_ScalarEvent(self, ev):
+        key = self.keys.pop()
+        self.output[-1].value[key] = \
+            Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column)
+        return "wait_key"
+
+    def _handle_wait_value_MappingStartEvent(self, ev):
+        new_state = self._handle_doc_MappingStartEvent(ev)
+        key = self.keys.pop()
+        self.output[-2].value[key] = self.output[-1]
+        return new_state
+
+    def _handle_wait_key_MappingEndEvent(self, ev):
+        # We've finished a mapping, so pop it off the output stack
+        # unless it's the last one in which case we leave it
+        if len(self.output) > 1:
+            self.output.pop()
+            if type(self.output[-1].value) is list:
+                return "wait_list_item"
+            else:
+                return "wait_key"
+        else:
+            return "doc"
+
+    def _handle_wait_value_SequenceStartEvent(self, ev):
+        self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
+        self.output[-2].value[self.keys[-1]] = self.output[-1]
+        return "wait_list_item"
+
+    def _handle_wait_list_item_SequenceStartEvent(self, ev):
+        self.keys.append(len(self.output[-1].value))
+        self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
+        self.output[-2].value.append(self.output[-1])
+        return "wait_list_item"
+
+    def _handle_wait_list_item_SequenceEndEvent(self, ev):
+        # When ending a sequence, we need to pop a key because we retain the
+        # key until the end so that if we need to mutate the underlying entry
+        # we can.
+        key = self.keys.pop()
+        self.output.pop()
+        if type(key) is int:
+            return "wait_list_item"
+        else:
+            return "wait_key"
+
+    def _handle_wait_list_item_ScalarEvent(self, ev):
+        self.output[-1].value.append(
+            Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column))
+        return "wait_list_item"
+
+    def _handle_wait_list_item_MappingStartEvent(self, ev):
+        new_state = self._handle_doc_MappingStartEvent(ev)
+        self.output[-2].value.append(self.output[-1])
+        return new_state
+
+    def _handle_doc_DocumentEndEvent(self, ev):
+        if len(self.output) != 1:
+            raise YAMLLoadError("Zero, or more than one document found in YAML stream")
+        return "stream"
+
+    def _handle_stream_StreamEndEvent(self, ev):
+        return "init"
+
+
+# Loads a dictionary from some YAML
+#
+# Args:
+#    filename (str): The YAML file to load
+#    shortname (str): The filename in shorthand for error reporting (or None)
+#    copy_tree (bool): Whether to make a copy, preserving the original toplevels
+#                      for later serialization
+#    project (Project): The (optional) project to associate the parsed YAML with
+#
+# Returns (dict): A loaded copy of the YAML file with provenance information
+#
+# Raises: LoadError
+#
+def load(filename, shortname=None, copy_tree=False, *, project=None):
+    if not shortname:
+        shortname = filename
+
+    if (project is not None) and (project.junction is not None):
+        displayname = "{}:{}".format(project.junction.name, shortname)
+    else:
+        displayname = shortname
+
+    file_number = len(_FILE_LIST)
+    _FILE_LIST.append((filename, shortname, displayname, None, project))
+
+    try:
+        with open(filename) as f:
+            contents = f.read()
+
+        data = load_data(contents,
+                         file_index=file_number,
+                         file_name=filename,
+                         copy_tree=copy_tree)
+
+        return data
+    except FileNotFoundError as e:
+        raise LoadError(LoadErrorReason.MISSING_FILE,
+                        "Could not find file at {}".format(filename)) from e
+    except IsADirectoryError as e:
+        raise LoadError(LoadErrorReason.LOADING_DIRECTORY,
+                        "{} is a directory. bst command expects a .bst file."
+                        .format(filename)) from e
+    except LoadError as e:
+        raise LoadError(e.reason, "{}: {}".format(displayname, e)) from e
+
+
+# Like load(), but doesnt require the data to be in a file
+#
+def load_data(data, file_index=_SYNTHETIC_FILE_INDEX, file_name=None, copy_tree=False):
+
+    try:
+        rep = Representer(file_index)
+        for event in yaml.parse(data, Loader=yaml.CBaseLoader):
+            rep.handle_event(event)
+        contents = rep.get_output()
+    except YAMLLoadError as e:
+        raise LoadError(LoadErrorReason.INVALID_YAML,
+                        "Malformed YAML:\n\n{}\n\n".format(e)) from e
+    except Exception as e:
+        raise LoadError(LoadErrorReason.INVALID_YAML,
+                        "Severely malformed YAML:\n\n{}\n\n".format(e)) from e
+
+    if type(contents) != Node:
+        # Special case allowance for None, when the loaded file has only comments in it.
+        if contents is None:
+            contents = Node({}, file_index, 0, 0)
+        else:
+            raise LoadError(LoadErrorReason.INVALID_YAML,
+                            "YAML file has content of type '{}' instead of expected type 'dict': {}"
+                            .format(type(contents[0]).__name__, file_name))
+
+    # Store this away because we'll use it later for "top level" provenance
+    if file_index is not None:
+        _FILE_LIST[file_index] = (
+            _FILE_LIST[file_index][0],  # Filename
+            _FILE_LIST[file_index][1],  # Shortname
+            _FILE_LIST[file_index][2],  # Displayname
+            contents,
+            _FILE_LIST[file_index][4],  # Project
+        )
+
+    if copy_tree:
+        contents = node_copy(contents)
+    return contents
+
+
+# dump()
+#
+# Write a YAML node structure out to disk.
+#
+# This will always call `node_sanitize` on its input, so if you wanted
+# to output something close to what you read in, consider using the
+# `roundtrip_load` and `roundtrip_dump` function pair instead.
+#
+# Args:
+#    contents (any): Content to write out
+#    filename (str): The (optional) file name to write out to
+def dump(contents, filename=None):
+    roundtrip_dump(node_sanitize(contents), file=filename)
+
+
+# node_get_provenance()
+#
+# Gets the provenance for a node
+#
+# Args:
+#   node (dict): a dictionary
+#   key (str): key in the dictionary
+#   indices (list of indexes): Index path, in the case of list values
+#
+# Returns: The Provenance of the dict, member or list element
+#
+def node_get_provenance(node, key=None, indices=None):
+    assert is_node(node)
+
+    if key is None:
+        # Retrieving the provenance for this node directly
+        return ProvenanceInformation(node)
+
+    if key and not indices:
+        return ProvenanceInformation(node.value.get(key))
+
+    nodeish = node.value.get(key)
+    for idx in indices:
+        nodeish = nodeish.value[idx]
+
+    return ProvenanceInformation(nodeish)
+
+
+# A sentinel to be used as a default argument for functions that need
+# to distinguish between a kwarg set to None and an unset kwarg.
+_sentinel = object()
+
+
+# node_get()
+#
+# Fetches a value from a dictionary node and checks it for
+# an expected value. Use default_value when parsing a value
+# which is only optionally supplied.
+#
+# Args:
+#    node (dict): The dictionary node
+#    expected_type (type): The expected type for the value being searched
+#    key (str): The key to get a value for in node
+#    indices (list of ints): Optionally decend into lists of lists
+#    default_value: Optionally return this value if the key is not found
+#    allow_none: (bool): Allow None to be a valid value
+#
+# Returns:
+#    The value if found in node, otherwise default_value is returned
+#
+# Raises:
+#    LoadError, when the value found is not of the expected type
+#
+# Note:
+#    Returned strings are stripped of leading and trailing whitespace
+#
+def node_get(node, expected_type, key, indices=None, *, default_value=_sentinel, allow_none=False):
+    assert type(node) is Node
+
+    if indices is None:
+        if default_value is _sentinel:
+            value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, 0))
+        else:
+            value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER)))
+
+        if value.value is _sentinel:
+            provenance = node_get_provenance(node)
+            raise LoadError(LoadErrorReason.INVALID_DATA,
+                            "{}: Dictionary did not contain expected key '{}'".format(provenance, key))
+    else:
+        # Implied type check of the element itself
+        # No need to synthesise useful node content as we destructure it immediately
+        value = Node(node_get(node, list, key), _SYNTHETIC_FILE_INDEX, 0, 0)
+        for index in indices:
+            value = value.value[index]
+            if type(value) is not Node:
+                value = Node(value, _SYNTHETIC_FILE_INDEX, 0, 0)
+
+    # Optionally allow None as a valid value for any type
+    if value.value is None and (allow_none or default_value is None):
+        return None
+
+    if (expected_type is not None) and (not isinstance(value.value, expected_type)):
+        # Attempt basic conversions if possible, typically we want to
+        # be able to specify numeric values and convert them to strings,
+        # but we dont want to try converting dicts/lists
+        try:
+            if (expected_type == bool and isinstance(value.value, str)):
+                # Dont coerce booleans to string, this makes "False" strings evaluate to True
+                # We don't structure into full nodes since there's no need.
+                if value.value in ('True', 'true'):
+                    value = Node(True, _SYNTHETIC_FILE_INDEX, 0, 0)
+                elif value.value in ('False', 'false'):
+                    value = Node(False, _SYNTHETIC_FILE_INDEX, 0, 0)
+                else:
+                    raise ValueError()
+            elif not (expected_type == list or
+                      expected_type == dict or
+                      isinstance(value.value, (list, dict))):
+                value = Node(expected_type(value.value), _SYNTHETIC_FILE_INDEX, 0, 0)
+            else:
+                raise ValueError()
+        except (ValueError, TypeError):
+            provenance = node_get_provenance(node, key=key, indices=indices)
+            if indices:
+                path = [key]
+                path.extend("[{:d}]".format(i) for i in indices)
+                path = "".join(path)
+            else:
+                path = key
+            raise LoadError(LoadErrorReason.INVALID_DATA,
+                            "{}: Value of '{}' is not of the expected type '{}'"
+                            .format(provenance, path, expected_type.__name__))
+
+    # Now collapse lists, and scalars, to their value, leaving nodes as-is
+    if type(value.value) is not dict:
+        value = value.value
+
+    # Trim it at the bud, let all loaded strings from yaml be stripped of whitespace
+    if type(value) is str:
+        value = value.strip()
+
+    elif type(value) is list:
+        # Now we create a fresh list which unwraps the str and list types
+        # semi-recursively.
+        value = __trim_list_provenance(value)
+
+    return value
+
+
+def __trim_list_provenance(value):
+    ret = []
+    for entry in value:
+        if type(entry) is not Node:
+            entry = Node(entry, _SYNTHETIC_FILE_INDEX, 0, 0)
+        if type(entry.value) is list:
+            ret.append(__trim_list_provenance(entry.value))
+        elif type(entry.value) is dict:
+            ret.append(entry)
+        else:
+            ret.append(entry.value)
+    return ret
+
+
+# node_set()
+#
+# Set an item within the node.  If using `indices` be aware that the entry must
+# already exist, or else a KeyError will be raised.  Use `node_extend_list` to
+# create entries before using `node_set`
+#
+# Args:
+#    node (tuple): The node
+#    key (str): The key name
+#    value: The value
+#    indices: Any indices to index into the list referenced by key, like in
+#             `node_get` (must be a list of integers)
+#
+def node_set(node, key, value, indices=None):
+    if indices:
+        node = node.value[key]
+        key = indices.pop()
+        for idx in indices:
+            node = node.value[idx]
+    if type(value) is Node:
+        node.value[key] = value
+    else:
+        try:
+            # Need to do this just in case we're modifying a list
+            old_value = node.value[key]
+        except KeyError:
+            old_value = None
+        if old_value is None:
+            node.value[key] = Node(value, node.file_index, node.line, next(_SYNTHETIC_COUNTER))
+        else:
+            node.value[key] = Node(value, old_value.file_index, old_value.line, old_value.column)
+
+
+# node_extend_list()
+#
+# Extend a list inside a node to a given length, using the passed
+# default value to fill it out.
+#
+# Valid default values are:
+#    Any string
+#    An empty dict
+#    An empty list
+#
+# Args:
+#    node (node): The node
+#    key (str): The list name in the node
+#    length (int): The length to extend the list to
+#    default (any): The default value to extend with.
+def node_extend_list(node, key, length, default):
+    assert type(default) is str or default in ([], {})
+
+    list_node = node.value.get(key)
+    if list_node is None:
+        list_node = node.value[key] = Node([], node.file_index, node.line, next(_SYNTHETIC_COUNTER))
+
+    assert type(list_node.value) is list
+
+    the_list = list_node.value
+    def_type = type(default)
+
+    file_index = node.file_index
+    if the_list:
+        line_num = the_list[-1][2]
+    else:
+        line_num = list_node.line
+
+    while length > len(the_list):
+        if def_type is str:
+            value = default
+        elif def_type is list:
+            value = []
+        else:
+            value = {}
+
+        line_num += 1
+
+        the_list.append(Node(value, file_index, line_num, next(_SYNTHETIC_COUNTER)))
+
+
+# node_items()
+#
+# A convenience generator for iterating over loaded key/value
+# tuples in a dictionary loaded from project YAML.
+#
+# Args:
+#    node (dict): The dictionary node
+#
+# Yields:
+#    (str): The key name
+#    (anything): The value for the key
+#
+def node_items(node):
+    if type(node) is not Node:
+        node = Node(node, _SYNTHETIC_FILE_INDEX, 0, 0)
+    for key, value in node.value.items():
+        if type(value) is not Node:
+            value = Node(value, _SYNTHETIC_FILE_INDEX, 0, 0)
+        if type(value.value) is dict:
+            yield (key, value)
+        elif type(value.value) is list:
+            yield (key, __trim_list_provenance(value.value))
+        else:
+            yield (key, value.value)
+
+
+# node_keys()
+#
+# A convenience generator for iterating over loaded keys
+# in a dictionary loaded from project YAML.
+#
+# Args:
+#    node (dict): The dictionary node
+#
+# Yields:
+#    (str): The key name
+#
+def node_keys(node):
+    if type(node) is not Node:
+        node = Node(node, _SYNTHETIC_FILE_INDEX, 0, 0)
+    yield from node.value.keys()
+
+
+# node_del()
+#
+# A convenience generator for iterating over loaded key/value
+# tuples in a dictionary loaded from project YAML.
+#
+# Args:
+#    node (dict): The dictionary node
+#    key (str): The key we want to remove
+#    safe (bool): Whether to raise a KeyError if unable
+#
+def node_del(node, key, safe=False):
+    try:
+        del node.value[key]
+    except KeyError:
+        if not safe:
+            raise
+
+
+# is_node()
+#
+# A test method which returns whether or not the passed in value
+# is a valid YAML node.  It is not valid to call this on a Node
+# object which is not a Mapping.
+#
+# Args:
+#    maybenode (any): The object to test for nodeness
+#
+# Returns:
+#    (bool): Whether or not maybenode was a Node
+#
+def is_node(maybenode):
+    # It's a programming error to give this a Node which isn't a mapping
+    # so assert that.
+    assert (type(maybenode) is not Node) or (type(maybenode.value) is dict)
+    # Now return the type check
+    return type(maybenode) is Node
+
+
+# new_synthetic_file()
+#
+# Create a new synthetic mapping node, with an associated file entry
+# (in _FILE_LIST) such that later tracking can correctly determine which
+# file needs writing to in order to persist the changes.
+#
+# Args:
+#    filename (str): The name of the synthetic file to create
+#    project (Project): The optional project to associate this synthetic file with
+#
+# Returns:
+#    (Node): An empty YAML mapping node, whose provenance is to this new
+#            synthetic file
+#
+def new_synthetic_file(filename, project=None):
+    file_index = len(_FILE_LIST)
+    node = Node({}, file_index, 0, 0)
+    _FILE_LIST.append((filename,
+                       filename,
+                       "<synthetic {}>".format(filename),
+                       node,
+                       project))
+    return node
+
+
+# new_empty_node()
+#
+# Args:
+#    ref_node (Node): Optional node whose provenance should be referenced
+#
+# Returns
+#    (Node): A new empty YAML mapping node
+#
+def new_empty_node(ref_node=None):
+    if ref_node is not None:
+        return Node({}, ref_node.file_index, ref_node.line, next(_SYNTHETIC_COUNTER))
+    else:
+        return Node({}, _SYNTHETIC_FILE_INDEX, 0, 0)
+
+
+# new_node_from_dict()
+#
+# Args:
+#   indict (dict): The input dictionary
+#
+# Returns:
+#   (Node): A new synthetic YAML tree which represents this dictionary
+#
+def new_node_from_dict(indict):
+    ret = {}
+    for k, v in indict.items():
+        vtype = type(v)
+        if vtype is dict:
+            ret[k] = new_node_from_dict(v)
+        elif vtype is list:
+            ret[k] = __new_node_from_list(v)
+        else:
+            ret[k] = Node(str(v), _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER))
+    return Node(ret, _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER))
+
+
+# Internal function to help new_node_from_dict() to handle lists
+def __new_node_from_list(inlist):
+    ret = []
+    for v in inlist:
+        vtype = type(v)
+        if vtype is dict:
+            ret.append(new_node_from_dict(v))
+        elif vtype is list:
+            ret.append(__new_node_from_list(v))
+        else:
+            ret.append(Node(str(v), _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER)))
+    return Node(ret, _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER))
+
+
+# _is_composite_list
+#
+# Checks if the given node is a Mapping with array composition
+# directives.
+#
+# Args:
+#    node (value): Any node
+#
+# Returns:
+#    (bool): True if node was a Mapping containing only
+#            list composition directives
+#
+# Raises:
+#    (LoadError): If node was a mapping and contained a mix of
+#                 list composition directives and other keys
+#
+def _is_composite_list(node):
+
+    if type(node.value) is dict:
+        has_directives = False
+        has_keys = False
+
+        for key, _ in node_items(node):
+            if key in ['(>)', '(<)', '(=)']:  # pylint: disable=simplifiable-if-statement
+                has_directives = True
+            else:
+                has_keys = True
+
+            if has_keys and has_directives:
+                provenance = node_get_provenance(node)
+                raise LoadError(LoadErrorReason.INVALID_DATA,
+                                "{}: Dictionary contains array composition directives and arbitrary keys"
+                                .format(provenance))
+        return has_directives
+
+    return False
+
+
+# _compose_composite_list()
+#
+# Composes a composite list (i.e. a dict with list composition directives)
+# on top of a target list which is a composite list itself.
+#
+# Args:
+#    target (Node): A composite list
+#    source (Node): A composite list
+#
+def _compose_composite_list(target, source):
+    clobber = source.value.get("(=)")
+    prefix = source.value.get("(<)")
+    suffix = source.value.get("(>)")
+    if clobber is not None:
+        # We want to clobber the target list
+        # which basically means replacing the target list
+        # with ourselves
+        target.value["(=)"] = clobber
+        if prefix is not None:
+            target.value["(<)"] = prefix
+        elif "(<)" in target.value:
+            target.value["(<)"].value.clear()
+        if suffix is not None:
+            target.value["(>)"] = suffix
+        elif "(>)" in target.value:
+            target.value["(>)"].value.clear()
+    else:
+        # Not clobbering, so prefix the prefix and suffix the suffix
+        if prefix is not None:
+            if "(<)" in target.value:
+                for v in reversed(prefix.value):
+                    target.value["(<)"].value.insert(0, v)
+            else:
+                target.value["(<)"] = prefix
+        if suffix is not None:
+            if "(>)" in target.value:
+                target.value["(>)"].value.extend(suffix.value)
+            else:
+                target.value["(>)"] = suffix
+
+
+# _compose_list()
+#
+# Compose a composite list (a dict with composition directives) on top of a
+# simple list.
+#
+# Args:
+#    target (Node): The target list to be composed into
+#    source (Node): The composition list to be composed from
+#
+def _compose_list(target, source):
+    clobber = source.value.get("(=)")
+    prefix = source.value.get("(<)")
+    suffix = source.value.get("(>)")
+    if clobber is not None:
+        target.value.clear()
+        target.value.extend(clobber.value)
+    if prefix is not None:
+        for v in reversed(prefix.value):
+            target.value.insert(0, v)
+    if suffix is not None:
+        target.value.extend(suffix.value)
+
+
+# composite_dict()
+#
+# Compose one mapping node onto another
+#
+# Args:
+#    target (Node): The target to compose into
+#    source (Node): The source to compose from
+#    path   (list): The path to the current composition node
+#
+# Raises: CompositeError
+#
+def composite_dict(target, source, path=None):
+    if path is None:
+        path = []
+    for k, v in source.value.items():
+        path.append(k)
+        if type(v.value) is list:
+            # List clobbers anything list-like
+            target_value = target.value.get(k)
+            if not (target_value is None or
+                    type(target_value.value) is list or
+                    _is_composite_list(target_value)):
+                raise CompositeError(path,
+                                     "{}: List cannot overwrite {} at: {}"
+                                     .format(node_get_provenance(source, k),
+                                             k,
+                                             node_get_provenance(target, k)))
+            # Looks good, clobber it
+            target.value[k] = v
+        elif _is_composite_list(v):
+            if k not in target.value:
+                # Composite list clobbers empty space
+                target.value[k] = v
+            elif type(target.value[k].value) is list:
+                # Composite list composes into a list
+                _compose_list(target.value[k], v)
+            elif _is_composite_list(target.value[k]):
+                # Composite list merges into composite list
+                _compose_composite_list(target.value[k], v)
+            else:
+                # Else composing on top of normal dict or a scalar, so raise...
+                raise CompositeError(path,
+                                     "{}: Cannot compose lists onto {}".format(
+                                         node_get_provenance(v),
+                                         node_get_provenance(target.value[k])))
+        elif type(v.value) is dict:
+            # We're composing a dict into target now
+            if k not in target.value:
+                # Target lacks a dict at that point, make a fresh one with
+                # the same provenance as the incoming dict
+                target.value[k] = Node({}, v.file_index, v.line, v.column)
+            if type(target.value) is not dict:
+                raise CompositeError(path,
+                                     "{}: Cannot compose dictionary onto {}".format(
+                                         node_get_provenance(v),
+                                         node_get_provenance(target.value[k])))
+            composite_dict(target.value[k], v, path)
+        else:
+            target_value = target.value.get(k)
+            if target_value is not None and type(target_value.value) is not str:
+                raise CompositeError(path,
+                                     "{}: Cannot compose scalar on non-scalar at {}".format(
+                                         node_get_provenance(v),
+                                         node_get_provenance(target.value[k])))
+            target.value[k] = v
+        path.pop()
+
+
+# Like composite_dict(), but raises an all purpose LoadError for convenience
+#
+def composite(target, source):
+    assert type(source.value) is dict
+    assert type(target.value) is dict
+
+    try:
+        composite_dict(target, source)
+    except CompositeError as e:
+        source_provenance = node_get_provenance(source)
+        error_prefix = ""
+        if source_provenance:
+            error_prefix = "{}: ".format(source_provenance)
+        raise LoadError(LoadErrorReason.ILLEGAL_COMPOSITE,
+                        "{}Failure composing {}: {}"
+                        .format(error_prefix,
+                                e.path,
+                                e.message)) from e
+
+
+# Like composite(target, source), but where target overrides source instead.
+#
+def composite_and_move(target, source):
+    composite(source, target)
+
+    to_delete = [key for key in target.value.keys() if key not in source.value]
+    for key, value in source.value.items():
+        target.value[key] = value
+    for key in to_delete:
+        del target.value[key]
+
+
+# Types we can short-circuit in node_sanitize for speed.
+__SANITIZE_SHORT_CIRCUIT_TYPES = (int, float, str, bool)
+
+
+# node_sanitize()
+#
+# Returns an alphabetically ordered recursive copy
+# of the source node with internal provenance information stripped.
+#
+# Only dicts are ordered, list elements are left in order.
+#
+def node_sanitize(node, *, dict_type=OrderedDict):
+    node_type = type(node)
+
+    # If we have an unwrappable node, unwrap it
+    if node_type is Node:
+        node = node.value
+        node_type = type(node)
+
+    # Short-circuit None which occurs ca. twice per element
+    if node is None:
+        return node
+
+    # Next short-circuit integers, floats, strings, booleans, and tuples
+    if node_type in __SANITIZE_SHORT_CIRCUIT_TYPES:
+        return node
+
+    # Now short-circuit lists.
+    elif node_type is list:
+        return [node_sanitize(elt, dict_type=dict_type) for elt in node]
+
+    # Finally dict, and other Mappings need special handling
+    elif node_type is dict:
+        result = dict_type()
+
+        key_list = [key for key, _ in node.items()]
+        for key in sorted(key_list):
+            result[key] = node_sanitize(node[key], dict_type=dict_type)
+
+        return result
+
+    # Sometimes we're handed tuples and we can't be sure what they contain
+    # so we have to sanitize into them
+    elif node_type is tuple:
+        return tuple((node_sanitize(v, dict_type=dict_type) for v in node))
+
+    # Everything else just gets returned as-is.
+    return node
+
+
+# node_validate()
+#
+# Validate the node so as to ensure the user has not specified
+# any keys which are unrecognized by buildstream (usually this
+# means a typo which would otherwise not trigger an error).
+#
+# Args:
+#    node (dict): A dictionary loaded from YAML
+#    valid_keys (list): A list of valid keys for the specified node
+#
+# Raises:
+#    LoadError: In the case that the specified node contained
+#               one or more invalid keys
+#
+def node_validate(node, valid_keys):
+
+    # Probably the fastest way to do this: https://stackoverflow.com/a/23062482
+    valid_keys = set(valid_keys)
+    invalid = next((key for key in node.value if key not in valid_keys), None)
+
+    if invalid:
+        provenance = node_get_provenance(node, key=invalid)
+        raise LoadError(LoadErrorReason.INVALID_DATA,
+                        "{}: Unexpected key: {}".format(provenance, invalid))
+
+
+# Node copying
+#
+# Unfortunately we copy nodes a *lot* and `isinstance()` is super-slow when
+# things from collections.abc get involved.  The result is the following
+# intricate but substantially faster group of tuples and the use of `in`.
+#
+# If any of the {node,list}_copy routines raise a ValueError
+# then it's likely additional types need adding to these tuples.
+
+
+# These types just have their value copied
+__QUICK_TYPES = (str, bool)
+
+# These are the directives used to compose lists, we need this because it's
+# slightly faster during the node_final_assertions checks
+__NODE_ASSERT_COMPOSITION_DIRECTIVES = ('(>)', '(<)', '(=)')
+
+
+# node_copy()
+#
+# Make a deep copy of the given YAML node, preserving provenance.
+#
+# Args:
+#    source (Node): The YAML node to copy
+#
+# Returns:
+#    (Node): A deep copy of source with provenance preserved.
+#
+def node_copy(source):
+    copy = {}
+    for key, value in source.value.items():
+        value_type = type(value.value)
+        if value_type is dict:
+            copy[key] = node_copy(value)
+        elif value_type is list:
+            copy[key] = _list_copy(value)
+        elif value_type in __QUICK_TYPES:
+            copy[key] = value
+        else:
+            raise ValueError("Unable to be quick about node_copy of {}".format(value_type))
+
+    return Node(copy, source.file_index, source.line, source.column)
+
+
+# Internal function to help node_copy() but for lists.
+def _list_copy(source):
+    copy = []
+    for item in source.value:
+        if type(item) is Node:
+            item_type = type(item.value)
+        else:
+            item_type = type(item)
+
+        if item_type is dict:
+            copy.append(node_copy(item))
+        elif item_type is list:
+            copy.append(_list_copy(item))
+        elif item_type in __QUICK_TYPES:
+            copy.append(item)
+        else:
+            raise ValueError("Unable to be quick about list_copy of {}".format(item_type))
+
+    return Node(copy, source.file_index, source.line, source.column)
+
+
+# node_final_assertions()
+#
+# This must be called on a fully loaded and composited node,
+# after all composition has completed.
+#
+# Args:
+#    node (Mapping): The final composited node
+#
+# Raises:
+#    (LoadError): If any assertions fail
+#
+def node_final_assertions(node):
+    assert type(node) is Node
+
+    for key, value in node.value.items():
+
+        # Assert that list composition directives dont remain, this
+        # indicates that the user intended to override a list which
+        # never existed in the underlying data
+        #
+        if key in __NODE_ASSERT_COMPOSITION_DIRECTIVES:
+            provenance = node_get_provenance(node, key)
+            raise LoadError(LoadErrorReason.TRAILING_LIST_DIRECTIVE,
+                            "{}: Attempt to override non-existing list".format(provenance))
+
+        value_type = type(value.value)
+
+        if value_type is dict:
+            node_final_assertions(value)
+        elif value_type is list:
+            _list_final_assertions(value)
+
+
+# Helper function for node_final_assertions(), but for lists.
+def _list_final_assertions(values):
+    for value in values.value:
+        value_type = type(value.value)
+
+        if value_type is dict:
+            node_final_assertions(value)
+        elif value_type is list:
+            _list_final_assertions(value)
+
+
+# assert_symbol_name()
+#
+# A helper function to check if a loaded string is a valid symbol
+# name and to raise a consistent LoadError if not. For strings which
+# are required to be symbols.
+#
+# Args:
+#    provenance (Provenance): The provenance of the loaded symbol, or None
+#    symbol_name (str): The loaded symbol name
+#    purpose (str): The purpose of the string, for an error message
+#    allow_dashes (bool): Whether dashes are allowed for this symbol
+#
+# Raises:
+#    LoadError: If the symbol_name is invalid
+#
+# Note that dashes are generally preferred for variable names and
+# usage in YAML, but things such as option names which will be
+# evaluated with jinja2 cannot use dashes.
+def assert_symbol_name(provenance, symbol_name, purpose, *, allow_dashes=True):
+    valid_chars = string.digits + string.ascii_letters + '_'
+    if allow_dashes:
+        valid_chars += '-'
+
+    valid = True
+    if not symbol_name:
+        valid = False
+    elif any(x not in valid_chars for x in symbol_name):
+        valid = False
+    elif symbol_name[0] in string.digits:
+        valid = False
+
+    if not valid:
+        detail = "Symbol names must contain only alphanumeric characters, " + \
+                 "may not start with a digit, and may contain underscores"
+        if allow_dashes:
+            detail += " or dashes"
+
+        message = "Invalid symbol name for {}: '{}'".format(purpose, symbol_name)
+        if provenance is not None:
+            message = "{}: {}".format(provenance, message)
+
+        raise LoadError(LoadErrorReason.INVALID_SYMBOL_NAME,
+                        message, detail=detail)
+
+
+# node_find_target()
+#
+# Searches the given node tree for the given target node.
+#
+# This is typically used when trying to walk a path to a given node
+# for the purpose of then modifying a similar tree of objects elsewhere
+#
+# If the key is provided, then we actually hunt for the node represented by
+# target[key] and return its container, rather than hunting for target directly
+#
+# Args:
+#    node (Node): The node at the root of the tree to search
+#    target (Node): The node you are looking for in that tree
+#    key (str): Optional string key within target node
+#
+# Returns:
+#    (list): A path from `node` to `target` or None if `target` is not in the subtree
+def node_find_target(node, target, *, key=None):
+    assert type(node) is Node
+    assert type(target) is Node
+    if key is not None:
+        target = target.value[key]
+
+    path = []
+    if _walk_find_target(node, path, target):
+        if key:
+            # Remove key from end of path
+            path = path[:-1]
+        return path
+    return None
+
+
+# Helper for node_find_target() which walks a value
+def _walk_find_target(node, path, target):
+    if node.file_index == target.file_index and node.line == target.line and node.column == target.column:
+        return True
+    elif type(node.value) is dict:
+        return _walk_dict_node(node, path, target)
+    elif type(node.value) is list:
+        return _walk_list_node(node, path, target)
+    return False
+
+
+# Helper for node_find_target() which walks a list
+def _walk_list_node(node, path, target):
+    for i, v in enumerate(node.value):
+        path.append(i)
+        if _walk_find_target(v, path, target):
+            return True
+        del path[-1]
+    return False
+
+
+# Helper for node_find_target() which walks a mapping
+def _walk_dict_node(node, path, target):
+    for k, v in node.value.items():
+        path.append(k)
+        if _walk_find_target(v, path, target):
+            return True
+        del path[-1]
+    return False
+
+
+###############################################################################
+
+# Roundtrip code
+
+# Always represent things consistently:
+
+yaml.RoundTripRepresenter.add_representer(OrderedDict,
+                                          yaml.SafeRepresenter.represent_dict)
+
+# Always parse things consistently
+
+yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:int',
+                                          yaml.RoundTripConstructor.construct_yaml_str)
+yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:float',
+                                          yaml.RoundTripConstructor.construct_yaml_str)
+yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:bool',
+                                          yaml.RoundTripConstructor.construct_yaml_str)
+yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:null',
+                                          yaml.RoundTripConstructor.construct_yaml_str)
+yaml.RoundTripConstructor.add_constructor(u'tag:yaml.org,2002:timestamp',
+                                          yaml.RoundTripConstructor.construct_yaml_str)
+
+
+# HardlineDumper
+#
+# This is a dumper used during roundtrip_dump which forces every scalar to be
+# a plain string, in order to match the output format to the input format.
+#
+# If you discover something is broken, please add a test case to the roundtrip
+# test in tests/internals/yaml/roundtrip-test.yaml
+#
+class HardlineDumper(yaml.RoundTripDumper):
+    def __init__(self, *args, **kwargs):
+        yaml.RoundTripDumper.__init__(self, *args, **kwargs)
+        # For each of YAML 1.1 and 1.2, force everything to be a plain string
+        for version in [(1, 1), (1, 2), None]:
+            self.add_version_implicit_resolver(
+                version,
+                u'tag:yaml.org,2002:str',
+                yaml.util.RegExp(r'.*'),
+                None)
+
+
+# roundtrip_load()
+#
+# Load a YAML file into memory in a form which allows roundtripping as best
+# as ruamel permits.
+#
+# Note, the returned objects can be treated as Mappings and Lists and Strings
+# but replacing content wholesale with plain dicts and lists may result
+# in a loss of comments and formatting.
+#
+# Args:
+#    filename (str): The file to load in
+#    allow_missing (bool): Optionally set this to True to allow missing files
+#
+# Returns:
+#    (Mapping): The loaded YAML mapping.
+#
+# Raises:
+#    (LoadError): If the file is missing, or a directory, this is raised.
+#                 Also if the YAML is malformed.
+#
+def roundtrip_load(filename, *, allow_missing=False):
+    try:
+        with open(filename, "r") as fh:
+            data = fh.read()
+        contents = roundtrip_load_data(data, filename=filename)
+    except FileNotFoundError as e:
+        if allow_missing:
+            # Missing files are always empty dictionaries
+            return {}
+        else:
+            raise LoadError(LoadErrorReason.MISSING_FILE,
+                            "Could not find file at {}".format(filename)) from e
+    except IsADirectoryError as e:
+        raise LoadError(LoadErrorReason.LOADING_DIRECTORY,
+                        "{} is a directory."
+                        .format(filename)) from e
+    return contents
+
+
+# roundtrip_load_data()
+#
+# Parse the given contents as YAML, returning them as a roundtrippable data
+# structure.
+#
+# A lack of content will be returned as an empty mapping.
+#
+# Args:
+#    contents (str): The contents to be parsed as YAML
+#    filename (str): Optional filename to be used in error reports
+#
+# Returns:
+#    (Mapping): The loaded YAML mapping
+#
+# Raises:
+#    (LoadError): Raised on invalid YAML, or YAML which parses to something other
+#                 than a Mapping
+#
+def roundtrip_load_data(contents, *, filename=None):
+    try:
+        contents = yaml.load(contents, yaml.RoundTripLoader, preserve_quotes=True)
+    except (yaml.scanner.ScannerError, yaml.composer.ComposerError, yaml.parser.ParserError) as e:
+        raise LoadError(LoadErrorReason.INVALID_YAML,
+                        "Malformed YAML:\n\n{}\n\n{}\n".format(e.problem, e.problem_mark)) from e
+
+    # Special case empty files at this point
+    if contents is None:
+        # We'll make them empty mappings like the main Node loader
+        contents = {}
+
+    if not isinstance(contents, Mapping):
+        raise LoadError(LoadErrorReason.INVALID_YAML,
+                        "YAML file has content of type '{}' instead of expected type 'dict': {}"
+                        .format(type(contents).__name__, filename))
+
+    return contents
+
+
+# roundtrip_dump()
+#
+# Dumps the given contents as a YAML file.  Ideally the contents came from
+# parsing with `roundtrip_load` or `roundtrip_load_data` so that they will be
+# dumped in the same form as they came from.
+#
+# If `file` is a string, it is the filename to write to, if `file` has a
+# `write` method, it's treated as a stream, otherwise output is to stdout.
+#
+# Args:
+#    contents (Mapping or list): The content to write out as YAML.
+#    file (any): The file to write to
+#
+def roundtrip_dump(contents, file=None):
+    assert type(contents) is not Node
+
+    def stringify_dict(thing):
+        for k, v in thing.items():
+            if type(v) is str:
+                pass
+            elif isinstance(v, Mapping):
+                stringify_dict(v)
+            elif isinstance(v, Sequence):
+                stringify_list(v)
+            else:
+                thing[k] = str(v)
+
+    def stringify_list(thing):
+        for i, v in enumerate(thing):
+            if type(v) is str:
+                pass
+            elif isinstance(v, Mapping):
+                stringify_dict(v)
+            elif isinstance(v, Sequence):
+                stringify_list(v)
+            else:
+                thing[i] = str(v)
+
+    contents = deepcopy(contents)
+    stringify_dict(contents)
+
+    with ExitStack() as stack:
+        if type(file) is str:
+            from . import utils
+            f = stack.enter_context(utils.save_file_atomic(file, 'w'))
+        elif hasattr(file, 'write'):
+            f = file
+        else:
+            f = sys.stdout
+        yaml.round_trip_dump(contents, f, Dumper=HardlineDumper)
diff --git a/tests/internals/yaml.py b/tests/internals/yaml.py
index dae20b326..6b26fdfaf 100644
--- a/tests/internals/yaml.py
+++ b/tests/internals/yaml.py
@@ -22,7 +22,7 @@ def test_load_yaml(datafiles):
                             'basics.yaml')
 
     loaded = _yaml.load(filename)
-    assert loaded[0].get('kind')[0] == 'pony'
+    assert loaded.value.get('kind').value == 'pony'
 
 
 def assert_provenance(filename, line, col, node, key=None, indices=None):
@@ -43,7 +43,7 @@ def test_basic_provenance(datafiles):
                             'basics.yaml')
 
     loaded = _yaml.load(filename)
-    assert loaded[0].get('kind')[0] == 'pony'
+    assert loaded.value.get('kind').value == 'pony'
 
     assert_provenance(filename, 1, 0, loaded)
 
@@ -56,7 +56,7 @@ def test_member_provenance(datafiles):
                             'basics.yaml')
 
     loaded = _yaml.load(filename)
-    assert loaded[0].get('kind')[0] == 'pony'
+    assert loaded.value.get('kind').value == 'pony'
     assert_provenance(filename, 2, 13, loaded, 'description')
 
 
@@ -68,7 +68,7 @@ def test_element_provenance(datafiles):
                             'basics.yaml')
 
     loaded = _yaml.load(filename)
-    assert loaded[0].get('kind')[0] == 'pony'
+    assert loaded.value.get('kind').value == 'pony'
     assert_provenance(filename, 5, 2, loaded, 'moods', [1])
 
 
@@ -102,7 +102,7 @@ def test_node_get(datafiles):
                             'basics.yaml')
 
     base = _yaml.load(filename)
-    assert base[0].get('kind')[0] == 'pony'
+    assert base.value.get('kind').value == 'pony'
 
     children = _yaml.node_get(base, list, 'children')
     assert isinstance(children, list)
@@ -522,9 +522,9 @@ def test_node_find_target(datafiles, case):
     # laid out.  Client code should never do this.
     def _walk(node, entry, rest):
         if rest:
-            return _walk(node[0][entry], rest[0], rest[1:])
+            return _walk(node.value[entry], rest[0], rest[1:])
         else:
-            return node[0][entry]
+            return node.value[entry]
 
     want = _walk(loaded, case[0], case[1:])
     found_path = _yaml.node_find_target(toplevel, want)
-- 
cgit v1.2.1


From 4fa94f338ea653d85cafa9c9a9adfd3c533c0090 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Sun, 19 May 2019 22:21:49 +0100
Subject: _yaml: Cythonize all internal functions

Internal functions are not used outside the module, and are
therefore easy to refactor and only call from C, leading us to
significant performance gains.
---
 src/buildstream/_yaml.pyx | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index 012400d2b..e8a30bd88 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -528,8 +528,8 @@ def node_get(node, expected_type, key, indices=None, *, default_value=_sentinel,
     return value
 
 
-def __trim_list_provenance(value):
-    ret = []
+cdef list __trim_list_provenance(list value):
+    cdef list ret = []
     for entry in value:
         if type(entry) is not Node:
             entry = Node(entry, _SYNTHETIC_FILE_INDEX, 0, 0)
@@ -764,8 +764,8 @@ def new_node_from_dict(indict):
 
 
 # Internal function to help new_node_from_dict() to handle lists
-def __new_node_from_list(inlist):
-    ret = []
+cdef Node __new_node_from_list(list inlist):
+    cdef list ret = []
     for v in inlist:
         vtype = type(v)
         if vtype is dict:
@@ -793,13 +793,13 @@ def __new_node_from_list(inlist):
 #    (LoadError): If node was a mapping and contained a mix of
 #                 list composition directives and other keys
 #
-def _is_composite_list(node):
+cdef bint _is_composite_list(Node node):
+    cdef bint has_directives = False
+    cdef bint has_keys = False
+    cdef str key
 
     if type(node.value) is dict:
-        has_directives = False
-        has_keys = False
-
-        for key, _ in node_items(node):
+        for key in node_keys(node):
             if key in ['(>)', '(<)', '(=)']:  # pylint: disable=simplifiable-if-statement
                 has_directives = True
             else:
@@ -824,7 +824,7 @@ def _is_composite_list(node):
 #    target (Node): A composite list
 #    source (Node): A composite list
 #
-def _compose_composite_list(target, source):
+cdef void _compose_composite_list(Node target, Node source):
     clobber = source.value.get("(=)")
     prefix = source.value.get("(<)")
     suffix = source.value.get("(>)")
@@ -865,7 +865,7 @@ def _compose_composite_list(target, source):
 #    target (Node): The target list to be composed into
 #    source (Node): The composition list to be composed from
 #
-def _compose_list(target, source):
+cdef void _compose_list(Node target, Node source):
     clobber = source.value.get("(=)")
     prefix = source.value.get("(<)")
     suffix = source.value.get("(>)")
@@ -1100,8 +1100,8 @@ def node_copy(source):
 
 
 # Internal function to help node_copy() but for lists.
-def _list_copy(source):
-    copy = []
+cdef Node _list_copy(Node source):
+    cdef list copy = []
     for item in source.value:
         if type(item) is Node:
             item_type = type(item.value)
@@ -1154,7 +1154,7 @@ def node_final_assertions(node):
 
 
 # Helper function for node_final_assertions(), but for lists.
-def _list_final_assertions(values):
+def _list_final_assertions(Node values):
     for value in values.value:
         value_type = type(value.value)
 
@@ -1242,7 +1242,7 @@ def node_find_target(node, target, *, key=None):
 
 
 # Helper for node_find_target() which walks a value
-def _walk_find_target(node, path, target):
+cdef bint _walk_find_target(Node node, list path, Node target):
     if node.file_index == target.file_index and node.line == target.line and node.column == target.column:
         return True
     elif type(node.value) is dict:
@@ -1253,7 +1253,10 @@ def _walk_find_target(node, path, target):
 
 
 # Helper for node_find_target() which walks a list
-def _walk_list_node(node, path, target):
+cdef bint _walk_list_node(Node node, list path, Node target):
+    cdef int i
+    cdef Node v
+
     for i, v in enumerate(node.value):
         path.append(i)
         if _walk_find_target(v, path, target):
@@ -1263,7 +1266,10 @@ def _walk_list_node(node, path, target):
 
 
 # Helper for node_find_target() which walks a mapping
-def _walk_dict_node(node, path, target):
+cdef bint _walk_dict_node(Node node, list path, Node target):
+    cdef str k
+    cdef Node v
+
     for k, v in node.value.items():
         path.append(k)
         if _walk_find_target(v, path, target):
-- 
cgit v1.2.1


From 2de2675f03e16303af289b5551356af256c961b9 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Mon, 20 May 2019 12:07:59 +0100
Subject: _yaml: Cythonize public api of _yaml

_yaml is a bottleneck in a normal BuildStream run. Typing the
external API allows us to reduce this bottleneck.

Since we type the input variables, we can also remove some asserts
that are checking if the parameters are of the correct type as
Cython will throw TypeError if called incorrectly.
---
 src/buildstream/_yaml.pyx | 111 ++++++++++++++++++++++++++--------------------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index e8a30bd88..3989fe82b 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -76,7 +76,7 @@ cdef class Node:
         return what in self.value
 
 # File name handling
-_FILE_LIST = []
+cdef _FILE_LIST = []
 
 
 # Purely synthetic node will have _SYNCTHETIC_FILE_INDEX for the file number, have line number
@@ -310,18 +310,21 @@ class Representer:
 #
 # Raises: LoadError
 #
-def load(filename, shortname=None, copy_tree=False, *, project=None):
+cpdef Node load(str filename, str shortname=None, bint copy_tree=False, object project=None):
     if not shortname:
         shortname = filename
 
+    cdef str displayname
     if (project is not None) and (project.junction is not None):
         displayname = "{}:{}".format(project.junction.name, shortname)
     else:
         displayname = shortname
 
-    file_number = len(_FILE_LIST)
+    cdef Py_ssize_t file_number = len(_FILE_LIST)
     _FILE_LIST.append((filename, shortname, displayname, None, project))
 
+    cdef Node data
+
     try:
         with open(filename) as f:
             contents = f.read()
@@ -345,7 +348,7 @@ def load(filename, shortname=None, copy_tree=False, *, project=None):
 
 # Like load(), but doesnt require the data to be in a file
 #
-def load_data(data, file_index=_SYNTHETIC_FILE_INDEX, file_name=None, copy_tree=False):
+def load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False):
 
     try:
         rep = Representer(file_index)
@@ -394,7 +397,7 @@ def load_data(data, file_index=_SYNTHETIC_FILE_INDEX, file_name=None, copy_tree=
 # Args:
 #    contents (any): Content to write out
 #    filename (str): The (optional) file name to write out to
-def dump(contents, filename=None):
+def dump(object contents, str filename=None):
     roundtrip_dump(node_sanitize(contents), file=filename)
 
 
@@ -403,14 +406,14 @@ def dump(contents, filename=None):
 # Gets the provenance for a node
 #
 # Args:
-#   node (dict): a dictionary
+#   node (Node): a dictionary
 #   key (str): key in the dictionary
 #   indices (list of indexes): Index path, in the case of list values
 #
 # Returns: The Provenance of the dict, member or list element
 #
-def node_get_provenance(node, key=None, indices=None):
-    assert is_node(node)
+def node_get_provenance(Node node, str key=None, list indices=None):
+    assert type(node.value) is dict
 
     if key is None:
         # Retrieving the provenance for this node directly
@@ -419,9 +422,9 @@ def node_get_provenance(node, key=None, indices=None):
     if key and not indices:
         return ProvenanceInformation(node.value.get(key))
 
-    nodeish = node.value.get(key)
+    cdef Node nodeish = <Node> node.value.get(key)
     for idx in indices:
-        nodeish = nodeish.value[idx]
+        nodeish = <Node> nodeish.value[idx]
 
     return ProvenanceInformation(nodeish)
 
@@ -454,9 +457,7 @@ _sentinel = object()
 # Note:
 #    Returned strings are stripped of leading and trailing whitespace
 #
-def node_get(node, expected_type, key, indices=None, *, default_value=_sentinel, allow_none=False):
-    assert type(node) is Node
-
+def node_get(Node node, object expected_type, str key, list indices=None, *, object default_value=_sentinel, bint allow_none=False):
     if indices is None:
         if default_value is _sentinel:
             value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, 0))
@@ -549,24 +550,26 @@ cdef list __trim_list_provenance(list value):
 # create entries before using `node_set`
 #
 # Args:
-#    node (tuple): The node
+#    node (Node): The node
 #    key (str): The key name
 #    value: The value
 #    indices: Any indices to index into the list referenced by key, like in
 #             `node_get` (must be a list of integers)
 #
-def node_set(node, key, value, indices=None):
+def node_set(Node node, object key, object value, list indices=None):
+    cdef int idx
+
     if indices:
-        node = node.value[key]
+        node = <Node> (<dict> node.value)[key]
         key = indices.pop()
         for idx in indices:
-            node = node.value[idx]
+            node = <Node> (<list> node.value)[idx]
     if type(value) is Node:
         node.value[key] = value
     else:
         try:
             # Need to do this just in case we're modifying a list
-            old_value = node.value[key]
+            old_value = <Node> node.value[key]
         except KeyError:
             old_value = None
         if old_value is None:
@@ -590,16 +593,14 @@ def node_set(node, key, value, indices=None):
 #    key (str): The list name in the node
 #    length (int): The length to extend the list to
 #    default (any): The default value to extend with.
-def node_extend_list(node, key, length, default):
+def node_extend_list(Node node, str key, Py_ssize_t length, object default):
     assert type(default) is str or default in ([], {})
 
-    list_node = node.value.get(key)
+    cdef Node list_node = <Node> node.value.get(key)
     if list_node is None:
         list_node = node.value[key] = Node([], node.file_index, node.line, next(_SYNTHETIC_COUNTER))
 
-    assert type(list_node.value) is list
-
-    the_list = list_node.value
+    cdef list the_list = list_node.value
     def_type = type(default)
 
     file_index = node.file_index
@@ -636,6 +637,9 @@ def node_extend_list(node, key, length, default):
 def node_items(node):
     if type(node) is not Node:
         node = Node(node, _SYNTHETIC_FILE_INDEX, 0, 0)
+
+    cdef str key
+
     for key, value in node.value.items():
         if type(value) is not Node:
             value = Node(value, _SYNTHETIC_FILE_INDEX, 0, 0)
@@ -674,7 +678,7 @@ def node_keys(node):
 #    key (str): The key we want to remove
 #    safe (bool): Whether to raise a KeyError if unable
 #
-def node_del(node, key, safe=False):
+def node_del(Node node, str key, bint safe=False):
     try:
         del node.value[key]
     except KeyError:
@@ -716,9 +720,10 @@ def is_node(maybenode):
 #    (Node): An empty YAML mapping node, whose provenance is to this new
 #            synthetic file
 #
-def new_synthetic_file(filename, project=None):
-    file_index = len(_FILE_LIST)
-    node = Node({}, file_index, 0, 0)
+def new_synthetic_file(str filename, object project=None):
+    cdef Py_ssize_t file_index = len(_FILE_LIST)
+    cdef Node node = Node({}, file_index, 0, 0)
+
     _FILE_LIST.append((filename,
                        filename,
                        "<synthetic {}>".format(filename),
@@ -735,7 +740,7 @@ def new_synthetic_file(filename, project=None):
 # Returns
 #    (Node): A new empty YAML mapping node
 #
-def new_empty_node(ref_node=None):
+def new_empty_node(Node ref_node=None):
     if ref_node is not None:
         return Node({}, ref_node.file_index, ref_node.line, next(_SYNTHETIC_COUNTER))
     else:
@@ -750,8 +755,9 @@ def new_empty_node(ref_node=None):
 # Returns:
 #   (Node): A new synthetic YAML tree which represents this dictionary
 #
-def new_node_from_dict(indict):
-    ret = {}
+def new_node_from_dict(dict indict):
+    cdef dict ret = {}
+    cdef str k
     for k, v in indict.items():
         vtype = type(v)
         if vtype is dict:
@@ -890,7 +896,10 @@ cdef void _compose_list(Node target, Node source):
 #
 # Raises: CompositeError
 #
-def composite_dict(target, source, path=None):
+def composite_dict(Node target, Node source, list path=None):
+    cdef str k
+    cdef Node v, target_value
+
     if path is None:
         path = []
     for k, v in source.value.items():
@@ -949,7 +958,7 @@ def composite_dict(target, source, path=None):
 
 # Like composite_dict(), but raises an all purpose LoadError for convenience
 #
-def composite(target, source):
+def composite(Node target, Node source):
     assert type(source.value) is dict
     assert type(target.value) is dict
 
@@ -969,10 +978,12 @@ def composite(target, source):
 
 # Like composite(target, source), but where target overrides source instead.
 #
-def composite_and_move(target, source):
+def composite_and_move(Node target, Node source):
     composite(source, target)
 
-    to_delete = [key for key in target.value.keys() if key not in source.value]
+    cdef str key
+    cdef Node value
+    cdef list to_delete = [key for key in target.value.keys() if key not in source.value]
     for key, value in source.value.items():
         target.value[key] = value
     for key in to_delete:
@@ -1036,18 +1047,18 @@ def node_sanitize(node, *, dict_type=OrderedDict):
 # means a typo which would otherwise not trigger an error).
 #
 # Args:
-#    node (dict): A dictionary loaded from YAML
+#    node (Node): A dictionary loaded from YAML
 #    valid_keys (list): A list of valid keys for the specified node
 #
 # Raises:
 #    LoadError: In the case that the specified node contained
 #               one or more invalid keys
 #
-def node_validate(node, valid_keys):
+def node_validate(Node node, list valid_keys):
 
     # Probably the fastest way to do this: https://stackoverflow.com/a/23062482
-    valid_keys = set(valid_keys)
-    invalid = next((key for key in node.value if key not in valid_keys), None)
+    cdef set valid_keys_set = set(valid_keys)
+    invalid = next((key for key in node.value if key not in valid_keys_set), None)
 
     if invalid:
         provenance = node_get_provenance(node, key=invalid)
@@ -1083,8 +1094,11 @@ __NODE_ASSERT_COMPOSITION_DIRECTIVES = ('(>)', '(<)', '(=)')
 # Returns:
 #    (Node): A deep copy of source with provenance preserved.
 #
-def node_copy(source):
-    copy = {}
+def node_copy(Node source):
+    cdef dict copy = {}
+    cdef str key
+    cdef Node value
+
     for key, value in source.value.items():
         value_type = type(value.value)
         if value_type is dict:
@@ -1131,8 +1145,9 @@ cdef Node _list_copy(Node source):
 # Raises:
 #    (LoadError): If any assertions fail
 #
-def node_final_assertions(node):
-    assert type(node) is Node
+def node_final_assertions(Node node):
+    cdef str key
+    cdef Node value
 
     for key, value in node.value.items():
 
@@ -1182,12 +1197,12 @@ def _list_final_assertions(Node values):
 # Note that dashes are generally preferred for variable names and
 # usage in YAML, but things such as option names which will be
 # evaluated with jinja2 cannot use dashes.
-def assert_symbol_name(provenance, symbol_name, purpose, *, allow_dashes=True):
-    valid_chars = string.digits + string.ascii_letters + '_'
+def assert_symbol_name(object provenance, str symbol_name, str purpose, *, bint allow_dashes=True):
+    cdef str valid_chars = string.digits + string.ascii_letters + '_'
     if allow_dashes:
         valid_chars += '-'
 
-    valid = True
+    cdef bint valid = True
     if not symbol_name:
         valid = False
     elif any(x not in valid_chars for x in symbol_name):
@@ -1226,13 +1241,11 @@ def assert_symbol_name(provenance, symbol_name, purpose, *, allow_dashes=True):
 #
 # Returns:
 #    (list): A path from `node` to `target` or None if `target` is not in the subtree
-def node_find_target(node, target, *, key=None):
-    assert type(node) is Node
-    assert type(target) is Node
+def node_find_target(Node node, Node target, *, str key=None):
     if key is not None:
         target = target.value[key]
 
-    path = []
+    cdef list path = []
     if _walk_find_target(node, path, target):
         if key:
             # Remove key from end of path
-- 
cgit v1.2.1


From 865fbce3570fd0a07bb46e80db76d2dc9f3fa0e5 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Mon, 20 May 2019 18:37:52 +0100
Subject: _yaml: Cythonize `Representer`.

`Representer` is the main interface with the `ruamel` library to
parse the yaml files. Rewriting it with Cython introduces significant
performance gains.

Since `Representer` is not a python class anymore, we can't call
`getattr` on it, and therefore have to do a manual switch on the types
of events. While this is harder to read, it is also much more
performant.

Finally, sotp using `yaml.parse`, but call the parser manually, in order
to avoid going in and out of the python code. This part could be made
even better in the future when `ruamel` becomes stable and if they
expose cython definitions, as they are coded in Cython.
---
 src/buildstream/_yaml.pyx | 130 ++++++++++++++++++++++++++++++----------------
 1 file changed, 86 insertions(+), 44 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index 3989fe82b..9ad5b34ce 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -156,13 +156,11 @@ class YAMLLoadError(Exception):
 # Mappings must only have string keys, values are always represented as
 # strings if they are scalar, or else as simple dictionaries and lists.
 #
-class Representer:
-    __slots__ = (
-        "_file_index",
-        "state",
-        "output",
-        "keys",
-    )
+cdef class Representer:
+
+    cdef int _file_index
+    cdef str state
+    cdef list output, keys
 
     # Initialise a new representer
     #
@@ -171,7 +169,7 @@ class Representer:
     #
     # Args:
     #   file_index (int): The index of this YAML file
-    def __init__(self, file_index):
+    def __init__(self, int file_index):
         self._file_index = file_index
         self.state = "init"
         self.output = []
@@ -184,12 +182,14 @@ class Representer:
     #
     # Raises:
     #   YAMLLoadError: Something went wrong.
-    def handle_event(self, event):
+    cdef void handle_event(self, event) except *:
         if getattr(event, "anchor", None) is not None:
             raise YAMLLoadError("Anchors are disallowed in BuildStream at line {} column {}"
                                 .format(event.start_mark.line, event.start_mark.column))
 
-        if event.__class__.__name__ == "ScalarEvent":
+        cdef str event_name = event.__class__.__name__
+
+        if event_name == "ScalarEvent":
             if event.tag is not None:
                 if not event.tag.startswith("tag:yaml.org,2002:"):
                     raise YAMLLoadError(
@@ -197,77 +197,112 @@ class Representer:
                         "This is disallowed in BuildStream. At line {} column {}"
                         .format(event.start_mark.line, event.start_mark.column))
 
-        handler = "_handle_{}_{}".format(self.state, event.__class__.__name__)
-        handler = getattr(self, handler, None)
+        cdef object handler = self._get_handler_for_event(event_name)
         if handler is None:
             raise YAMLLoadError(
                 "Invalid input detected. No handler for {} in state {} at line {} column {}"
                 .format(event, self.state, event.start_mark.line, event.start_mark.column))
 
-        self.state = handler(event)  # pylint: disable=not-callable
+        # Cython weirdness here, we need to pass self to the function
+        self.state = handler(self, event)  # pylint: disable=not-callable
 
     # Get the output of the YAML parse
     #
     # Returns:
     #   (Node or None): Return the Node instance of the top level mapping or
     #                   None if there wasn't one.
-    def get_output(self):
-        try:
+    cdef Node get_output(self):
+        if len(self.output):
             return self.output[0]
-        except IndexError:
-            return None
+        return None
 
-    def _handle_init_StreamStartEvent(self, ev):
+    cdef object _get_handler_for_event(self, str event_name):
+        if self.state == "wait_list_item":
+            if event_name == "ScalarEvent":
+                return self._handle_wait_list_item_ScalarEvent
+            elif event_name == "MappingStartEvent":
+                return self._handle_wait_list_item_MappingStartEvent
+            elif event_name == "SequenceStartEvent":
+                return self._handle_wait_list_item_SequenceStartEvent
+            elif event_name == "SequenceEndEvent":
+                return self._handle_wait_list_item_SequenceEndEvent
+        elif self.state == "wait_value":
+            if event_name == "ScalarEvent":
+                return self._handle_wait_value_ScalarEvent
+            elif event_name == "MappingStartEvent":
+                return self._handle_wait_value_MappingStartEvent
+            elif event_name == "SequenceStartEvent":
+                return self._handle_wait_value_SequenceStartEvent
+        elif self.state == "wait_key":
+            if event_name == "ScalarEvent":
+                return self._handle_wait_key_ScalarEvent
+            elif event_name == "MappingEndEvent":
+                return self._handle_wait_key_MappingEndEvent
+        elif self.state == "stream":
+            if event_name == "DocumentStartEvent":
+                return self._handle_stream_DocumentStartEvent
+            elif event_name == "StreamEndEvent":
+                return self._handle_stream_StreamEndEvent
+        elif self.state == "doc":
+            if event_name == "MappingStartEvent":
+                return self._handle_doc_MappingStartEvent
+            elif event_name == "DocumentEndEvent":
+                return self._handle_doc_DocumentEndEvent
+        elif self.state == "init" and event_name == "StreamStartEvent":
+            return self._handle_init_StreamStartEvent
+        return None
+
+    cdef str _handle_init_StreamStartEvent(self, object ev):
         return "stream"
 
-    def _handle_stream_DocumentStartEvent(self, ev):
+    cdef str _handle_stream_DocumentStartEvent(self, object ev):
         return "doc"
 
-    def _handle_doc_MappingStartEvent(self, ev):
+    cdef str _handle_doc_MappingStartEvent(self, object ev):
         newmap = Node({}, self._file_index, ev.start_mark.line, ev.start_mark.column)
         self.output.append(newmap)
         return "wait_key"
 
-    def _handle_wait_key_ScalarEvent(self, ev):
+    cdef str _handle_wait_key_ScalarEvent(self, object ev):
         self.keys.append(ev.value)
         return "wait_value"
 
-    def _handle_wait_value_ScalarEvent(self, ev):
+    cdef str _handle_wait_value_ScalarEvent(self, object ev):
         key = self.keys.pop()
-        self.output[-1].value[key] = \
+        (<dict> (<Node> self.output[-1]).value)[key] = \
             Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column)
         return "wait_key"
 
-    def _handle_wait_value_MappingStartEvent(self, ev):
-        new_state = self._handle_doc_MappingStartEvent(ev)
+    cdef str _handle_wait_value_MappingStartEvent(self, object ev):
+        cdef str new_state = self._handle_doc_MappingStartEvent(ev)
         key = self.keys.pop()
-        self.output[-2].value[key] = self.output[-1]
+        (<dict> (<Node> self.output[-2]).value)[key] = self.output[-1]
         return new_state
 
-    def _handle_wait_key_MappingEndEvent(self, ev):
+    cdef str _handle_wait_key_MappingEndEvent(self, object ev):
         # We've finished a mapping, so pop it off the output stack
         # unless it's the last one in which case we leave it
         if len(self.output) > 1:
             self.output.pop()
-            if type(self.output[-1].value) is list:
+            if type((<Node> self.output[-1]).value) is list:
                 return "wait_list_item"
             else:
                 return "wait_key"
         else:
             return "doc"
 
-    def _handle_wait_value_SequenceStartEvent(self, ev):
+    cdef str _handle_wait_value_SequenceStartEvent(self, object ev):
         self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
-        self.output[-2].value[self.keys[-1]] = self.output[-1]
+        (<dict> (<Node> self.output[-2]).value)[self.keys[-1]] = self.output[-1]
         return "wait_list_item"
 
-    def _handle_wait_list_item_SequenceStartEvent(self, ev):
-        self.keys.append(len(self.output[-1].value))
+    cdef str _handle_wait_list_item_SequenceStartEvent(self, object ev):
+        self.keys.append(len((<Node> self.output[-1]).value))
         self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
-        self.output[-2].value.append(self.output[-1])
+        (<list> (<Node> self.output[-2]).value).append(self.output[-1])
         return "wait_list_item"
 
-    def _handle_wait_list_item_SequenceEndEvent(self, ev):
+    cdef str _handle_wait_list_item_SequenceEndEvent(self, object ev):
         # When ending a sequence, we need to pop a key because we retain the
         # key until the end so that if we need to mutate the underlying entry
         # we can.
@@ -278,22 +313,22 @@ class Representer:
         else:
             return "wait_key"
 
-    def _handle_wait_list_item_ScalarEvent(self, ev):
-        self.output[-1].value.append(
+    cdef str _handle_wait_list_item_ScalarEvent(self, object ev):
+        (<Node> self.output[-1]).value.append(
             Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column))
         return "wait_list_item"
 
-    def _handle_wait_list_item_MappingStartEvent(self, ev):
-        new_state = self._handle_doc_MappingStartEvent(ev)
-        self.output[-2].value.append(self.output[-1])
+    cdef str _handle_wait_list_item_MappingStartEvent(self, object ev):
+        cdef str new_state = self._handle_doc_MappingStartEvent(ev)
+        (<list> (<Node> self.output[-2]).value).append(self.output[-1])
         return new_state
 
-    def _handle_doc_DocumentEndEvent(self, ev):
+    cdef str _handle_doc_DocumentEndEvent(self, object ev):
         if len(self.output) != 1:
             raise YAMLLoadError("Zero, or more than one document found in YAML stream")
         return "stream"
 
-    def _handle_stream_StreamEndEvent(self, ev):
+    cdef str _handle_stream_StreamEndEvent(self, object ev):
         return "init"
 
 
@@ -348,12 +383,19 @@ cpdef Node load(str filename, str shortname=None, bint copy_tree=False, object p
 
 # Like load(), but doesnt require the data to be in a file
 #
-def load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False):
+cpdef Node load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False):
+    cdef Representer rep
 
     try:
         rep = Representer(file_index)
-        for event in yaml.parse(data, Loader=yaml.CBaseLoader):
-            rep.handle_event(event)
+        parser = yaml.CParser(data)
+
+        try:
+            while parser.check_event():
+                rep.handle_event(parser.get_event())
+        finally:
+            parser.dispose()
+
         contents = rep.get_output()
     except YAMLLoadError as e:
         raise LoadError(LoadErrorReason.INVALID_YAML,
-- 
cgit v1.2.1


From 8d1022bb610364e6bad563eef05f011b0d6a17c6 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Mon, 20 May 2019 22:01:10 +0100
Subject: _yaml: Internalize `ProvenanceInformation`

---
 src/buildstream/_yaml.pyx | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index 9ad5b34ce..3a81595bb 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -90,21 +90,16 @@ _SYNTHETIC_COUNTER = count(start=-1, step=-1)
 
 
 # Returned from node_get_provenance
-class ProvenanceInformation:
-
-    __slots__ = (
-        "filename",
-        "shortname",
-        "displayname",
-        "line",
-        "col",
-        "toplevel",
-        "node",
-        "project",
-        "is_synthetic",
-    )
-
-    def __init__(self, nodeish):
+cdef class ProvenanceInformation:
+
+    cdef public Node node
+    cdef str displayname
+    cdef public str filename, shortname
+    cdef public int col, line
+    cdef public object project, toplevel
+    cdef public bint is_synthetic
+
+    def __init__(self, Node nodeish):
         self.node = nodeish
         if (nodeish is None) or (nodeish.file_index is None):
             self.filename = ""
@@ -1239,7 +1234,7 @@ def _list_final_assertions(Node values):
 # Note that dashes are generally preferred for variable names and
 # usage in YAML, but things such as option names which will be
 # evaluated with jinja2 cannot use dashes.
-def assert_symbol_name(object provenance, str symbol_name, str purpose, *, bint allow_dashes=True):
+def assert_symbol_name(ProvenanceInformation provenance, str symbol_name, str purpose, *, bint allow_dashes=True):
     cdef str valid_chars = string.digits + string.ascii_letters + '_'
     if allow_dashes:
         valid_chars += '-'
-- 
cgit v1.2.1


From 4c92789fa42a6414ba20586daed63ccf19c991d0 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Mon, 20 May 2019 22:10:29 +0100
Subject: _yaml: introduce FileInfo extension class

We used to have a tuple to keep file information.

This makes it hard to read, accessing attributes by index.
With an extension class FileInfo, we get better readability,
without sacrificing performance
---
 src/buildstream/_yaml.pyx | 51 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index 3a81595bb..044f5d2a3 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -75,6 +75,26 @@ cdef class Node:
         # code which has access to such nodes would do this.
         return what in self.value
 
+
+# Metadata container for a yaml toplevel node.
+#
+# This class contains metadata around a yaml node in order to be able
+# to trace back the provenance of a node to the file.
+#
+cdef class FileInfo:
+
+    cdef str filename, shortname, displayname
+    cdef Node toplevel,
+    cdef object project
+
+    def __init__(self, str filename, str shortname, str displayname, Node toplevel, object project):
+        self.filename = filename
+        self.shortname = shortname
+        self.displayname = displayname
+        self.toplevel = toplevel
+        self.project = project
+
+
 # File name handling
 cdef _FILE_LIST = []
 
@@ -100,6 +120,8 @@ cdef class ProvenanceInformation:
     cdef public bint is_synthetic
 
     def __init__(self, Node nodeish):
+        cdef FileInfo fileinfo
+
         self.node = nodeish
         if (nodeish is None) or (nodeish.file_index is None):
             self.filename = ""
@@ -110,15 +132,15 @@ cdef class ProvenanceInformation:
             self.toplevel = None
             self.project = None
         else:
-            fileinfo = _FILE_LIST[nodeish.file_index]
-            self.filename = fileinfo[0]
-            self.shortname = fileinfo[1]
-            self.displayname = fileinfo[2]
+            fileinfo = <FileInfo> _FILE_LIST[nodeish.file_index]
+            self.filename = fileinfo.filename
+            self.shortname = fileinfo.shortname
+            self.displayname = fileinfo.displayname
             # We add 1 here to convert from computerish to humanish
             self.line = nodeish.line + 1
             self.col = nodeish.column
-            self.toplevel = fileinfo[3]
-            self.project = fileinfo[4]
+            self.toplevel = fileinfo.toplevel
+            self.project = fileinfo.project
         self.is_synthetic = (self.filename == '') or (self.col < 0)
 
     # Convert a Provenance to a string for error reporting
@@ -351,7 +373,7 @@ cpdef Node load(str filename, str shortname=None, bint copy_tree=False, object p
         displayname = shortname
 
     cdef Py_ssize_t file_number = len(_FILE_LIST)
-    _FILE_LIST.append((filename, shortname, displayname, None, project))
+    _FILE_LIST.append(FileInfo(filename, shortname, displayname, None, project))
 
     cdef Node data
 
@@ -380,6 +402,7 @@ cpdef Node load(str filename, str shortname=None, bint copy_tree=False, object p
 #
 cpdef Node load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_name=None, bint copy_tree=False):
     cdef Representer rep
+    cdef FileInfo f_info
 
     try:
         rep = Representer(file_index)
@@ -410,12 +433,14 @@ cpdef Node load_data(str data, int file_index=_SYNTHETIC_FILE_INDEX, str file_na
 
     # Store this away because we'll use it later for "top level" provenance
     if file_index is not None:
-        _FILE_LIST[file_index] = (
-            _FILE_LIST[file_index][0],  # Filename
-            _FILE_LIST[file_index][1],  # Shortname
-            _FILE_LIST[file_index][2],  # Displayname
+        f_info = <FileInfo> _FILE_LIST[file_index]
+
+        _FILE_LIST[file_index] = FileInfo(
+            f_info.filename,
+            f_info.shortname,
+            f_info.displayname,
             contents,
-            _FILE_LIST[file_index][4],  # Project
+            f_info.project,
         )
 
     if copy_tree:
@@ -761,7 +786,7 @@ def new_synthetic_file(str filename, object project=None):
     cdef Py_ssize_t file_index = len(_FILE_LIST)
     cdef Node node = Node({}, file_index, 0, 0)
 
-    _FILE_LIST.append((filename,
+    _FILE_LIST.append(FileInfo(filename,
                        filename,
                        "<synthetic {}>".format(filename),
                        node,
-- 
cgit v1.2.1


From 6901a829027a1fe0dd2002852a80a2eef0b322e4 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Mon, 20 May 2019 22:24:42 +0100
Subject: _yaml: provide c definitions of functions called internally

Providing c definitions for functions allows us to not have to
go back to the python interpreter when inside the module. We
therefore gain more performance.

One gotcha is that keyword only arguments are not valid in cpdef
functions.
---
 src/buildstream/_yaml.pyx | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index 044f5d2a3..b981e1f8a 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -474,7 +474,7 @@ def dump(object contents, str filename=None):
 #
 # Returns: The Provenance of the dict, member or list element
 #
-def node_get_provenance(Node node, str key=None, list indices=None):
+cpdef ProvenanceInformation node_get_provenance(Node node, str key=None, list indices=None):
     assert type(node.value) is dict
 
     if key is None:
@@ -519,7 +519,7 @@ _sentinel = object()
 # Note:
 #    Returned strings are stripped of leading and trailing whitespace
 #
-def node_get(Node node, object expected_type, str key, list indices=None, *, object default_value=_sentinel, bint allow_none=False):
+cpdef node_get(Node node, object expected_type, str key, list indices=None, object default_value=_sentinel, bint allow_none=False):
     if indices is None:
         if default_value is _sentinel:
             value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, 0))
@@ -566,8 +566,7 @@ def node_get(Node node, object expected_type, str key, list indices=None, *, obj
         except (ValueError, TypeError):
             provenance = node_get_provenance(node, key=key, indices=indices)
             if indices:
-                path = [key]
-                path.extend("[{:d}]".format(i) for i in indices)
+                path = [key, *["[{:d}]".format(i) for i in indices]]
                 path = "".join(path)
             else:
                 path = key
@@ -817,7 +816,7 @@ def new_empty_node(Node ref_node=None):
 # Returns:
 #   (Node): A new synthetic YAML tree which represents this dictionary
 #
-def new_node_from_dict(dict indict):
+cpdef Node new_node_from_dict(dict indict):
     cdef dict ret = {}
     cdef str k
     for k, v in indict.items():
@@ -958,7 +957,7 @@ cdef void _compose_list(Node target, Node source):
 #
 # Raises: CompositeError
 #
-def composite_dict(Node target, Node source, list path=None):
+cpdef void composite_dict(Node target, Node source, list path=None) except *:
     cdef str k
     cdef Node v, target_value
 
@@ -1020,7 +1019,7 @@ def composite_dict(Node target, Node source, list path=None):
 
 # Like composite_dict(), but raises an all purpose LoadError for convenience
 #
-def composite(Node target, Node source):
+cpdef void composite(Node target, Node source) except *:
     assert type(source.value) is dict
     assert type(target.value) is dict
 
@@ -1063,7 +1062,7 @@ __SANITIZE_SHORT_CIRCUIT_TYPES = (int, float, str, bool)
 #
 # Only dicts are ordered, list elements are left in order.
 #
-def node_sanitize(node, *, dict_type=OrderedDict):
+cpdef object node_sanitize(object node, object dict_type=OrderedDict):
     node_type = type(node)
 
     # If we have an unwrappable node, unwrap it
@@ -1096,7 +1095,7 @@ def node_sanitize(node, *, dict_type=OrderedDict):
     # Sometimes we're handed tuples and we can't be sure what they contain
     # so we have to sanitize into them
     elif node_type is tuple:
-        return tuple((node_sanitize(v, dict_type=dict_type) for v in node))
+        return tuple([node_sanitize(v, dict_type=dict_type) for v in node])
 
     # Everything else just gets returned as-is.
     return node
@@ -1156,7 +1155,7 @@ __NODE_ASSERT_COMPOSITION_DIRECTIVES = ('(>)', '(<)', '(=)')
 # Returns:
 #    (Node): A deep copy of source with provenance preserved.
 #
-def node_copy(Node source):
+cpdef Node node_copy(Node source):
     cdef dict copy = {}
     cdef str key
     cdef Node value
@@ -1207,7 +1206,7 @@ cdef Node _list_copy(Node source):
 # Raises:
 #    (LoadError): If any assertions fail
 #
-def node_final_assertions(Node node):
+cpdef void node_final_assertions(Node node) except *:
     cdef str key
     cdef Node value
 
@@ -1303,7 +1302,7 @@ def assert_symbol_name(ProvenanceInformation provenance, str symbol_name, str pu
 #
 # Returns:
 #    (list): A path from `node` to `target` or None if `target` is not in the subtree
-def node_find_target(Node node, Node target, *, str key=None):
+cpdef list node_find_target(Node node, Node target, str key=None):
     if key is not None:
         target = target.value[key]
 
-- 
cgit v1.2.1


From eadcca3eaa921341b6c29d48c841614d9da7472a Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <ben.c.schubert@gmail.com>
Date: Tue, 21 May 2019 11:52:51 +0100
Subject: _variable: Import _yaml from C.

This requires the addition of a definition file (.pxd), to list
symbols exported.
---
 setup.py                       |  2 +-
 src/buildstream/_variables.pyx | 13 ++++++-------
 src/buildstream/_yaml.pxd      | 44 ++++++++++++++++++++++++++++++++++++++++++
 src/buildstream/_yaml.pyx      | 24 ++++++-----------------
 4 files changed, 57 insertions(+), 26 deletions(-)
 create mode 100644 src/buildstream/_yaml.pxd

diff --git a/setup.py b/setup.py
index b848e2458..a57e65b56 100755
--- a/setup.py
+++ b/setup.py
@@ -392,7 +392,7 @@ def register_cython_module(module_name, dependencies=None):
 BUILD_EXTENSIONS = []
 
 register_cython_module("buildstream._yaml")
-register_cython_module("buildstream._variables")
+register_cython_module("buildstream._variables", dependencies=["buildstream._yaml"])
 
 #####################################################
 #             Main setup() Invocation               #
diff --git a/src/buildstream/_variables.pyx b/src/buildstream/_variables.pyx
index d99aaeb78..9b8b5a902 100644
--- a/src/buildstream/_variables.pyx
+++ b/src/buildstream/_variables.pyx
@@ -24,7 +24,7 @@ import re
 import sys
 
 from ._exceptions import LoadError, LoadErrorReason
-from . import _yaml
+from . cimport _yaml
 
 # Variables are allowed to have dashes here
 #
@@ -58,18 +58,18 @@ PARSE_EXPANSION = re.compile(r"\%\{([a-zA-Z][a-zA-Z0-9_-]*)\}")
 # variable settings for the element.
 #
 # Args:
-#     node (dict): A node loaded and composited with yaml tools
+#     node (Node): A node loaded and composited with yaml tools
 #
 # Raises:
 #     LoadError, if unresolved variables, or cycles in resolution, occur.
 #
 cdef class Variables:
 
-    cdef object original
+    cdef _yaml.Node original
     cdef dict _expstr_map
     cdef public dict flat
 
-    def __init__(self, node):
+    def __init__(self, _yaml.Node node):
         self.original = node
         self._expstr_map = self._resolve(node)
         self.flat = self._flatten()
@@ -115,13 +115,12 @@ cdef class Variables:
     #
     # Here we resolve all of our inputs into a dictionary, ready for use
     # in subst()
-    # FIXME: node should be a yaml Node if moved
-    cdef dict _resolve(self, object node):
+    cdef dict _resolve(self, _yaml.Node node):
         # Special case, if notparallel is specified in the variables for this
         # element, then override max-jobs to be 1.
         # Initialize it as a string as all variables are processed as strings.
         #
-        if _yaml.node_get(node, bool, 'notparallel', default_value=False):
+        if _yaml.node_get(node, bool, 'notparallel', None, False):
             _yaml.node_set(node, 'max-jobs', str(1))
 
         cdef dict ret = {}
diff --git a/src/buildstream/_yaml.pxd b/src/buildstream/_yaml.pxd
new file mode 100644
index 000000000..27a1a888e
--- /dev/null
+++ b/src/buildstream/_yaml.pxd
@@ -0,0 +1,44 @@
+#
+#  Copyright (C) 2019 Bloomberg L.P.
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2 of the License, or (at your option) any later version.
+#
+#  This library is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+#  Authors:
+#        Benjamin Schubert <bschubert@bloomberg.net>
+
+# Documentation for each class and method here can be found in the adjacent
+# implementation file (_yaml.pyx)
+
+cdef class Node:
+
+    cdef public object value
+    cdef public int file_index
+    cdef public int line
+    cdef public int column
+
+
+cdef class ProvenanceInformation:
+
+    cdef public Node node
+    cdef str displayname
+    cdef public str filename, shortname
+    cdef public int col, line
+    cdef public object project, toplevel
+    cdef public bint is_synthetic
+
+
+cpdef object node_get(Node node, object expected_type, str key, list indices=*, object default_value=*, bint allow_none=*)
+cpdef void node_set(Node node, object key, object value, list indices=*) except *
+cpdef list node_keys(object node)
+cpdef ProvenanceInformation node_get_provenance(Node node, str key=*, list indices=*)
diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index b981e1f8a..b7e0e26f8 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -58,11 +58,6 @@ from ._exceptions import LoadError, LoadErrorReason
 #
 cdef class Node:
 
-    cdef public object value
-    cdef public int file_index
-    cdef public int line
-    cdef public int column
-
     def __init__(self, object value, int file_index, int line, int column):
         self.value = value
         self.file_index = file_index
@@ -112,13 +107,6 @@ _SYNTHETIC_COUNTER = count(start=-1, step=-1)
 # Returned from node_get_provenance
 cdef class ProvenanceInformation:
 
-    cdef public Node node
-    cdef str displayname
-    cdef public str filename, shortname
-    cdef public int col, line
-    cdef public object project, toplevel
-    cdef public bint is_synthetic
-
     def __init__(self, Node nodeish):
         cdef FileInfo fileinfo
 
@@ -519,7 +507,7 @@ _sentinel = object()
 # Note:
 #    Returned strings are stripped of leading and trailing whitespace
 #
-cpdef node_get(Node node, object expected_type, str key, list indices=None, object default_value=_sentinel, bint allow_none=False):
+cpdef object node_get(Node node, object expected_type, str key, list indices=None, object default_value=_sentinel, bint allow_none=False):
     if indices is None:
         if default_value is _sentinel:
             value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, 0))
@@ -617,7 +605,7 @@ cdef list __trim_list_provenance(list value):
 #    indices: Any indices to index into the list referenced by key, like in
 #             `node_get` (must be a list of integers)
 #
-def node_set(Node node, object key, object value, list indices=None):
+cpdef void node_set(Node node, object key, object value, list indices=None) except *:
     cdef int idx
 
     if indices:
@@ -723,10 +711,10 @@ def node_items(node):
 # Yields:
 #    (str): The key name
 #
-def node_keys(node):
-    if type(node) is not Node:
-        node = Node(node, _SYNTHETIC_FILE_INDEX, 0, 0)
-    yield from node.value.keys()
+cpdef list node_keys(object node):
+    if type(node) is Node:
+        return list((<Node> node).value.keys())
+    return list(node.keys())
 
 
 # node_del()
-- 
cgit v1.2.1


From 39fd83c74040ba6e850f526303e4caf9899d2c96 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Tue, 21 May 2019 22:11:09 +0100
Subject: _yaml: rework SYNTHETIC_COUNTER to be a C function

The `SYNTHETIC_COUNTER` is an iterator that is called a lot in
_yaml, one for each synthetic node. Cython is not able to optimize
`itertools.counter` well enough. Providing a custom C function allows
to reduce the amount of python code called in this critical codepath
---
 src/buildstream/_yaml.pyx | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index b7e0e26f8..7f7b74169 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -27,7 +27,6 @@ from contextlib import ExitStack
 from collections import OrderedDict
 from collections.abc import Mapping, Sequence
 from copy import deepcopy
-from itertools import count
 
 from ruamel import yaml
 from ._exceptions import LoadError, LoadErrorReason
@@ -94,14 +93,18 @@ cdef class FileInfo:
 cdef _FILE_LIST = []
 
 
-# Purely synthetic node will have _SYNCTHETIC_FILE_INDEX for the file number, have line number
+# Purely synthetic node will have _SYNTHETIC_FILE_INDEX for the file number, have line number
 # zero, and a negative column number which comes from inverting the next value
 # out of this counter.  Synthetic nodes created with a reference node will
 # have a file number from the reference node, some unknown line number, and
 # a negative column number from this counter.
 cdef int _SYNTHETIC_FILE_INDEX = -1
+cdef int __counter = 0
 
-_SYNTHETIC_COUNTER = count(start=-1, step=-1)
+cdef int next_synthetic_counter():
+    global __counter
+    __counter -= 1
+    return __counter
 
 
 # Returned from node_get_provenance
@@ -512,7 +515,7 @@ cpdef object node_get(Node node, object expected_type, str key, list indices=Non
         if default_value is _sentinel:
             value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, 0))
         else:
-            value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER)))
+            value = node.value.get(key, Node(default_value, _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter()))
 
         if value.value is _sentinel:
             provenance = node_get_provenance(node)
@@ -622,7 +625,7 @@ cpdef void node_set(Node node, object key, object value, list indices=None) exce
         except KeyError:
             old_value = None
         if old_value is None:
-            node.value[key] = Node(value, node.file_index, node.line, next(_SYNTHETIC_COUNTER))
+            node.value[key] = Node(value, node.file_index, node.line, next_synthetic_counter())
         else:
             node.value[key] = Node(value, old_value.file_index, old_value.line, old_value.column)
 
@@ -647,7 +650,7 @@ def node_extend_list(Node node, str key, Py_ssize_t length, object default):
 
     cdef Node list_node = <Node> node.value.get(key)
     if list_node is None:
-        list_node = node.value[key] = Node([], node.file_index, node.line, next(_SYNTHETIC_COUNTER))
+        list_node = node.value[key] = Node([], node.file_index, node.line, next_synthetic_counter())
 
     cdef list the_list = list_node.value
     def_type = type(default)
@@ -668,7 +671,7 @@ def node_extend_list(Node node, str key, Py_ssize_t length, object default):
 
         line_num += 1
 
-        the_list.append(Node(value, file_index, line_num, next(_SYNTHETIC_COUNTER)))
+        the_list.append(Node(value, file_index, line_num, next_synthetic_counter()))
 
 
 # node_items()
@@ -791,7 +794,7 @@ def new_synthetic_file(str filename, object project=None):
 #
 def new_empty_node(Node ref_node=None):
     if ref_node is not None:
-        return Node({}, ref_node.file_index, ref_node.line, next(_SYNTHETIC_COUNTER))
+        return Node({}, ref_node.file_index, ref_node.line, next_synthetic_counter())
     else:
         return Node({}, _SYNTHETIC_FILE_INDEX, 0, 0)
 
@@ -814,8 +817,8 @@ cpdef Node new_node_from_dict(dict indict):
         elif vtype is list:
             ret[k] = __new_node_from_list(v)
         else:
-            ret[k] = Node(str(v), _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER))
-    return Node(ret, _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER))
+            ret[k] = Node(str(v), _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())
+    return Node(ret, _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())
 
 
 # Internal function to help new_node_from_dict() to handle lists
@@ -828,8 +831,8 @@ cdef Node __new_node_from_list(list inlist):
         elif vtype is list:
             ret.append(__new_node_from_list(v))
         else:
-            ret.append(Node(str(v), _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER)))
-    return Node(ret, _SYNTHETIC_FILE_INDEX, 0, next(_SYNTHETIC_COUNTER))
+            ret.append(Node(str(v), _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter()))
+    return Node(ret, _SYNTHETIC_FILE_INDEX, 0, next_synthetic_counter())
 
 
 # _is_composite_list
-- 
cgit v1.2.1


From 32ee4163bc6c628dc1d475a110022b37bf661fa5 Mon Sep 17 00:00:00 2001
From: Benjamin Schubert <contact@benschubert.me>
Date: Wed, 29 May 2019 22:16:52 +0100
Subject: _yaml: Replace strings by a C enum for Representer states.

This allows for a quicker comparison while keeping a good readability
of the code
---
 src/buildstream/_yaml.pyx | 101 ++++++++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 44 deletions(-)

diff --git a/src/buildstream/_yaml.pyx b/src/buildstream/_yaml.pyx
index 7f7b74169..50f0c64a9 100644
--- a/src/buildstream/_yaml.pyx
+++ b/src/buildstream/_yaml.pyx
@@ -156,6 +156,19 @@ class YAMLLoadError(Exception):
     pass
 
 
+# Represents the various states in which the Representer can be
+# while parsing yaml.
+cdef enum RepresenterState:
+    doc
+    init
+    stream
+    wait_key
+    wait_list_item
+    wait_value
+
+
+ctypedef RepresenterState (*representer_action)(Representer, object)
+
 # Representer for YAML events comprising input to the BuildStream format.
 #
 # All streams MUST represent a single document which must be a Mapping.
@@ -167,7 +180,7 @@ class YAMLLoadError(Exception):
 cdef class Representer:
 
     cdef int _file_index
-    cdef str state
+    cdef RepresenterState state
     cdef list output, keys
 
     # Initialise a new representer
@@ -179,7 +192,7 @@ cdef class Representer:
     #   file_index (int): The index of this YAML file
     def __init__(self, int file_index):
         self._file_index = file_index
-        self.state = "init"
+        self.state = RepresenterState.init
         self.output = []
         self.keys = []
 
@@ -205,14 +218,14 @@ cdef class Representer:
                         "This is disallowed in BuildStream. At line {} column {}"
                         .format(event.start_mark.line, event.start_mark.column))
 
-        cdef object handler = self._get_handler_for_event(event_name)
-        if handler is None:
+        cdef representer_action handler = self._get_handler_for_event(event_name)
+        if not handler:
             raise YAMLLoadError(
                 "Invalid input detected. No handler for {} in state {} at line {} column {}"
                 .format(event, self.state, event.start_mark.line, event.start_mark.column))
 
         # Cython weirdness here, we need to pass self to the function
-        self.state = handler(self, event)  # pylint: disable=not-callable
+        self.state = <RepresenterState> handler(self, event)  # pylint: disable=not-callable
 
     # Get the output of the YAML parse
     #
@@ -224,8 +237,8 @@ cdef class Representer:
             return self.output[0]
         return None
 
-    cdef object _get_handler_for_event(self, str event_name):
-        if self.state == "wait_list_item":
+    cdef representer_action _get_handler_for_event(self, str event_name):
+        if self.state == RepresenterState.wait_list_item:
             if event_name == "ScalarEvent":
                 return self._handle_wait_list_item_ScalarEvent
             elif event_name == "MappingStartEvent":
@@ -234,110 +247,110 @@ cdef class Representer:
                 return self._handle_wait_list_item_SequenceStartEvent
             elif event_name == "SequenceEndEvent":
                 return self._handle_wait_list_item_SequenceEndEvent
-        elif self.state == "wait_value":
+        elif self.state == RepresenterState.wait_value:
             if event_name == "ScalarEvent":
                 return self._handle_wait_value_ScalarEvent
             elif event_name == "MappingStartEvent":
                 return self._handle_wait_value_MappingStartEvent
             elif event_name == "SequenceStartEvent":
                 return self._handle_wait_value_SequenceStartEvent
-        elif self.state == "wait_key":
+        elif self.state == RepresenterState.wait_key:
             if event_name == "ScalarEvent":
                 return self._handle_wait_key_ScalarEvent
             elif event_name == "MappingEndEvent":
                 return self._handle_wait_key_MappingEndEvent
-        elif self.state == "stream":
+        elif self.state == RepresenterState.stream:
             if event_name == "DocumentStartEvent":
                 return self._handle_stream_DocumentStartEvent
             elif event_name == "StreamEndEvent":
                 return self._handle_stream_StreamEndEvent
-        elif self.state == "doc":
+        elif self.state == RepresenterState.doc:
             if event_name == "MappingStartEvent":
                 return self._handle_doc_MappingStartEvent
             elif event_name == "DocumentEndEvent":
                 return self._handle_doc_DocumentEndEvent
-        elif self.state == "init" and event_name == "StreamStartEvent":
+        elif self.state == RepresenterState.init and event_name == "StreamStartEvent":
             return self._handle_init_StreamStartEvent
-        return None
+        return NULL
 
-    cdef str _handle_init_StreamStartEvent(self, object ev):
-        return "stream"
+    cdef RepresenterState _handle_init_StreamStartEvent(self, object ev):
+        return RepresenterState.stream
 
-    cdef str _handle_stream_DocumentStartEvent(self, object ev):
-        return "doc"
+    cdef RepresenterState _handle_stream_DocumentStartEvent(self, object ev):
+        return RepresenterState.doc
 
-    cdef str _handle_doc_MappingStartEvent(self, object ev):
+    cdef RepresenterState _handle_doc_MappingStartEvent(self, object ev):
         newmap = Node({}, self._file_index, ev.start_mark.line, ev.start_mark.column)
         self.output.append(newmap)
-        return "wait_key"
+        return RepresenterState.wait_key
 
-    cdef str _handle_wait_key_ScalarEvent(self, object ev):
+    cdef RepresenterState _handle_wait_key_ScalarEvent(self, object ev):
         self.keys.append(ev.value)
-        return "wait_value"
+        return RepresenterState.wait_value
 
-    cdef str _handle_wait_value_ScalarEvent(self, object ev):
+    cdef RepresenterState _handle_wait_value_ScalarEvent(self, object ev):
         key = self.keys.pop()
         (<dict> (<Node> self.output[-1]).value)[key] = \
             Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column)
-        return "wait_key"
+        return RepresenterState.wait_key
 
-    cdef str _handle_wait_value_MappingStartEvent(self, object ev):
-        cdef str new_state = self._handle_doc_MappingStartEvent(ev)
+    cdef RepresenterState _handle_wait_value_MappingStartEvent(self, object ev):
+        cdef RepresenterState new_state = self._handle_doc_MappingStartEvent(ev)
         key = self.keys.pop()
         (<dict> (<Node> self.output[-2]).value)[key] = self.output[-1]
         return new_state
 
-    cdef str _handle_wait_key_MappingEndEvent(self, object ev):
+    cdef RepresenterState _handle_wait_key_MappingEndEvent(self, object ev):
         # We've finished a mapping, so pop it off the output stack
         # unless it's the last one in which case we leave it
         if len(self.output) > 1:
             self.output.pop()
             if type((<Node> self.output[-1]).value) is list:
-                return "wait_list_item"
+                return RepresenterState.wait_list_item
             else:
-                return "wait_key"
+                return RepresenterState.wait_key
         else:
-            return "doc"
+            return RepresenterState.doc
 
-    cdef str _handle_wait_value_SequenceStartEvent(self, object ev):
+    cdef RepresenterState _handle_wait_value_SequenceStartEvent(self, object ev):
         self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
         (<dict> (<Node> self.output[-2]).value)[self.keys[-1]] = self.output[-1]
-        return "wait_list_item"
+        return RepresenterState.wait_list_item
 
-    cdef str _handle_wait_list_item_SequenceStartEvent(self, object ev):
+    cdef RepresenterState _handle_wait_list_item_SequenceStartEvent(self, object ev):
         self.keys.append(len((<Node> self.output[-1]).value))
         self.output.append(Node([], self._file_index, ev.start_mark.line, ev.start_mark.column))
         (<list> (<Node> self.output[-2]).value).append(self.output[-1])
-        return "wait_list_item"
+        return RepresenterState.wait_list_item
 
-    cdef str _handle_wait_list_item_SequenceEndEvent(self, object ev):
+    cdef RepresenterState _handle_wait_list_item_SequenceEndEvent(self, object ev):
         # When ending a sequence, we need to pop a key because we retain the
         # key until the end so that if we need to mutate the underlying entry
         # we can.
         key = self.keys.pop()
         self.output.pop()
         if type(key) is int:
-            return "wait_list_item"
+            return RepresenterState.wait_list_item
         else:
-            return "wait_key"
+            return RepresenterState.wait_key
 
-    cdef str _handle_wait_list_item_ScalarEvent(self, object ev):
+    cdef RepresenterState _handle_wait_list_item_ScalarEvent(self, object ev):
         (<Node> self.output[-1]).value.append(
             Node(ev.value, self._file_index, ev.start_mark.line, ev.start_mark.column))
-        return "wait_list_item"
+        return RepresenterState.wait_list_item
 
-    cdef str _handle_wait_list_item_MappingStartEvent(self, object ev):
-        cdef str new_state = self._handle_doc_MappingStartEvent(ev)
+    cdef RepresenterState _handle_wait_list_item_MappingStartEvent(self, object ev):
+        cdef RepresenterState new_state = self._handle_doc_MappingStartEvent(ev)
         (<list> (<Node> self.output[-2]).value).append(self.output[-1])
         return new_state
 
-    cdef str _handle_doc_DocumentEndEvent(self, object ev):
+    cdef RepresenterState _handle_doc_DocumentEndEvent(self, object ev):
         if len(self.output) != 1:
             raise YAMLLoadError("Zero, or more than one document found in YAML stream")
-        return "stream"
+        return RepresenterState.stream
 
-    cdef str _handle_stream_StreamEndEvent(self, object ev):
-        return "init"
+    cdef RepresenterState _handle_stream_StreamEndEvent(self, object ev):
+        return RepresenterState.init
 
 
 # Loads a dictionary from some YAML
-- 
cgit v1.2.1