summary | refs | log | tree | commit | diff
path: root/src/buildstream/_scheduler
diff options
context:
space:
mode:
author: Angelos Evripiotis <jevripiotis@bloomberg.net>, 2019-07-15 10:38:49 +0100
committer: bst-marge-bot <marge-bot@buildstream.build>, 2019-07-24 12:27:10 +0000
commit: d3e7857a1eabd06d9d15ca6c201ed7b66064cc98 (patch)
tree: c092b1ef736918e803e5fbffc1e1218d0739febd /src/buildstream/_scheduler
parent: e02a2dcfe18fd3374c28624dcc61dbb3244630a9 (diff)
download: buildstream-d3e7857a1eabd06d9d15ca6c201ed7b66064cc98.tar.gz
Make ChildJobs and friends picklable
Pave the way toward supporting the 'spawn' method of creating jobs, by adding support for pickling ChildJobs. Introduce a new 'jobpickler' module that provides an entrypoint for this functionality. This also makes replays of jobs possible, which has made the debugging of plugins much easier for me.
Diffstat (limited to 'src/buildstream/_scheduler')
-rw-r--r-- src/buildstream/_scheduler/jobs/jobpickler.py 132
-rw-r--r-- src/buildstream/_scheduler/scheduler.py 15
2 files changed, 147 insertions, 0 deletions
diff --git a/src/buildstream/_scheduler/jobs/jobpickler.py b/src/buildstream/_scheduler/jobs/jobpickler.py
new file mode 100644
index 000000000..0edf88c10
--- /dev/null
+++ b/src/buildstream/_scheduler/jobs/jobpickler.py
@@ -0,0 +1,132 @@
+#
+# Copyright (C) 2019 Bloomberg Finance LP
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Authors:
+# Angelos Evripiotis <jevripiotis@bloomberg.net>
+
+
+import copyreg
+import io
+import pickle
+
+from ..._protos.buildstream.v2.artifact_pb2 import Artifact as ArtifactProto
+
+# BuildStream toplevel imports
+from ..._loader import Loader
+from ..._messenger import Messenger
+
+
+# pickle_child_job()
+#
+# Perform the special case pickling required to pickle a child job for
+# unpickling in a child process.
+#
+# Note that we don't need an `unpickle_child_job`, as regular `pickle.load()`
+# will do everything required.
+#
+# Args:
+# child_job (ChildJob): The job to be pickled.
+# projects (List[Project]): The list of loaded projects, so we can get the
+# relevant factories.
+#
+# Returns:
+# An `io.BytesIO`, with the pickled contents of the ChildJob and everything it
+# transitively refers to.
+#
+# Some types require special handling when pickling to send to another process.
+# We register overrides for those special cases:
+#
+# o Very stateful objects: Some things carry much more state than they need for
+# pickling over to the child job process. This extra state brings
+# complication of supporting pickling of more types, and the performance
+# penalty of the actual pickling. Use private knowledge of these objects to
+# safely reduce the pickled state.
+#
+# o gRPC objects: These don't pickle, but they do have their own serialization
+# mechanism, which we use instead. To avoid modifying generated code, we
+# instead register overrides here.
+#
+# o Plugins: These cannot be unpickled unless the factory which created them
+# has been unpickled first, with the same identifier as before. See note
+# below. Some state in plugins is not necessary for child jobs, and comes
+# with a heavy cost; we also need to remove this before pickling.
+#
+def pickle_child_job(child_job, projects):
+
+    # Collect the plugin classes loaded by each project's factories. Element
+    # classes are gathered for all of the projects first, followed by the
+    # source classes. Projects without the relevant factory are skipped.
+    plugin_classes = []
+    for factory_name in ("element_factory", "source_factory"):
+        for project in projects:
+            factory = getattr(project.config, factory_name)
+            if factory is None:
+                continue
+            for plugin_cls, _ in factory.all_loaded_plugins():
+                plugin_classes.append(plugin_cls)
+
+    # Begin with a private copy of the global reducers, so that none of our
+    # special cases leak into `copyreg.dispatch_table` itself.
+    reducers = copyreg.dispatch_table.copy()
+    reducers.update((plugin_cls, _reduce_plugin) for plugin_cls in plugin_classes)
+    reducers[ArtifactProto] = _reduce_artifact_proto
+    reducers[Loader] = _reduce_object
+    reducers[Messenger] = _reduce_object
+
+    buffer = io.BytesIO()
+    pickler = pickle.Pickler(buffer)
+    pickler.dispatch_table = reducers
+    pickler.dump(child_job)
+
+    # Rewind, so that the caller can read the pickle back from the start.
+    buffer.seek(0)
+
+    return buffer
+
+
+def _reduce_object(instance):
+    # Rebuild with a bare `__new__`; pickle then applies the reduced state.
+    klass = type(instance)
+    return (klass.__new__, (klass,), instance.get_state_for_child_job_pickling())
+
+
+def _reduce_artifact_proto(instance):
+    assert isinstance(instance, ArtifactProto)
+    # The proto's own serialization stands in for pickling its attributes.
+    return (_new_artifact_proto_from_reduction_args, (instance.SerializeToString(),))
+
+
+def _new_artifact_proto_from_reduction_args(data):
+    artifact = ArtifactProto()
+    artifact.ParseFromString(data)  # `data` comes from `_reduce_artifact_proto()`
+    return artifact
+
+
+def _reduce_plugin(plugin):
+    # The factory and kind are passed along so the class can be looked up again.
+    factory, meta_kind, state = plugin._get_args_for_child_job_pickling()
+    return (_new_plugin_from_reduction_args, (factory, meta_kind), state)
+
+
+def _new_plugin_from_reduction_args(factory, meta_kind):
+    # Resolve the plugin class through its factory, then create a bare
+    # instance of it without running `__init__`.
+    #
+    # Note: the Plugin's `__project` member is what keeps `factory` alive
+    # once this function has returned. If `factory` were to be GC'd then we
+    # would see undefined behaviour.
+    plugin_cls, _ = factory.lookup(meta_kind)
+    return plugin_cls.__new__(plugin_cls)
diff --git a/src/buildstream/_scheduler/scheduler.py b/src/buildstream/_scheduler/scheduler.py
index 00d61140e..2dea1d48b 100644
--- a/src/buildstream/_scheduler/scheduler.py
+++ b/src/buildstream/_scheduler/scheduler.py
@@ -601,3 +601,18 @@ class Scheduler():
 def _tick(self):
 # Invoke the ticker callback, then re-arm this method on the loop via
 # call_later() with a delay of 1 (presumably seconds -- confirm the
 # loop type), so that it keeps firing repeatedly.
 self._ticker_callback()
 self.loop.call_later(1, self._tick)
+
+ def __getstate__(self):
+     # Refuse to be pickled.
+     #
+     # At the time of writing, BuildStream pickles objects for two reasons
+     # only: starting child processes with the 'spawn' method, and saving
+     # jobs to disk so that they can be replayed.
+     #
+     # In either case it is easy for a pickled object to indirectly hold a
+     # reference to the Scheduler, which itself holds many things that do
+     # not pickle. Raising here, in the Scheduler itself, surfaces that
+     # mistake early and makes it easier to debug. Pickling a Scheduler is
+     # almost certainly an error, unless a new use-case arises.
+     #
+     raise TypeError("Scheduler objects should not be pickled.")