summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Ipsum <richard.ipsum@codethink.co.uk>2014-03-14 16:33:21 +0000
committerRichard Ipsum <richard.ipsum@codethink.co.uk>2014-03-26 21:03:51 +0000
commit2632a9ac870177f6ec743fdd96e40dc1d71314a8 (patch)
tree1b9939442c311b03585b146db8d9917835b3a16a
parent4e1153649e5d531b7017ac2a1b7791f9ad3c774b (diff)
downloadmorph-2632a9ac870177f6ec743fdd96e40dc1d71314a8.tar.gz
Make serialise work with artifact splitting
Serialisation was simple when we only had 1 artifact per source. However, to allow smaller systems, we need artifact splitting to produce multiple artifacts per chunk source. So now the new serialisation format has a separate list of artifacts and sources, rather than the Source being generated from the artifact's serialisation. Python's id() function is used to encode the references between the various Sources and Artifacts, these are replaced with a reference to the new object after deserialisation. Previously the cache-key was used, but this is no longer sufficient to uniquely identify an Artifact. The resultant build graph after deserialisation is a little different to what went in: Strata end up with a different Source per Artifact, so it _is_ a 1 to 1 mapping, as opposed to Chunks, where it's many to 1. We serialise strata and chunks differently because stratum artifacts from the same source can have different dependencies, for example core-devel can have different dependencies to core-runtime. Without intervention we would serialise core-devel and core-devel's dependencies without including core-runtime's dependencies. To solve this we've decided to encode stratum artifacts completely independently: each stratum artifact has its own source. This is safe because stratum artifacts can be constructed independently, as opposed to Chunks where all the Artifacts for a Source are produced together. This is a little hacky in its current form, but it simplifies matters later in distbuild with regards to how it handles expressing that every Artifact that shares a Source is built together. Arguably, this should be the output of producing the build graph anyway, since it more helpfully represents which Artifacts are built together than checking the morphology kind all the time, but more assumptions need checking in morph before it's safe to make this change across the whole of the morph codebase.
-rw-r--r--distbuild/serialise.py144
1 files changed, 118 insertions, 26 deletions
diff --git a/distbuild/serialise.py b/distbuild/serialise.py
index 060833b1..cd871042 100644
--- a/distbuild/serialise.py
+++ b/distbuild/serialise.py
@@ -19,6 +19,7 @@
import json
import morphlib
+import logging
morphology_attributes = [
@@ -36,6 +37,16 @@ def serialise_artifact(artifact):
for x in morphology_attributes:
result['__%s' % x] = getattr(morphology, x)
return result
+
+ def encode_artifact(artifact):
+ return {
+ 'name': artifact.name,
+ 'cache_id': artifact.cache_id,
+ 'cache_key': artifact.cache_key,
+ 'dependencies': artifact.dependencies,
+ 'dependents': artifact.dependents,
+ 'metadata_version': artifact.metadata_version,
+ }
def encode_source(source):
source_dic = {
@@ -46,25 +57,35 @@ def serialise_artifact(artifact):
'tree': source.tree,
'morphology': encode_morphology(source.morphology),
'filename': source.filename,
+
+ # dict keys are converted to strings by json
+ # so we encode the artifact ids as strings
+ 'artifact_ids': [str(id(artifact)) for (_, artifact)
+ in source.artifacts.iteritems()],
}
+
if source.morphology['kind'] == 'chunk':
source_dic['build_mode'] = source.build_mode
source_dic['prefix'] = source.prefix
return source_dic
- def encode_single_artifact(a, encoded):
+ def encode_single_artifact(a, artifacts, source_id):
if artifact.source.morphology['kind'] == 'system':
arch = artifact.source.morphology['arch']
else:
arch = artifact.arch
+
+ logging.debug('encode_single_artifact dependencies: %s'
+ % str([('id: %s' % str(id(d)), d.name) for d in a.dependencies]))
+
return {
- 'source': encode_source(a.source),
+ 'source_id': source_id,
'name': a.name,
'cache_id': a.cache_id,
'cache_key': a.cache_key,
- 'dependencies': [encoded[d.cache_key]['cache_key']
- for d in a.dependencies],
- 'arch': arch,
+ 'dependencies': [str(id(artifacts[id(d)]))
+ for d in a.dependencies],
+ 'arch': arch
}
visited = set()
@@ -77,13 +98,46 @@ def serialise_artifact(artifact):
yield ret
yield a
- encoded = {}
+
+ artifacts = {}
+ encoded_artifacts = {}
+ encoded_sources = {}
+
for a in traverse(artifact):
- if a.cache_key not in encoded:
- encoded[a.cache_key] = encode_single_artifact(a, encoded)
+ logging.debug('traversing artifacts at %s' % a.name)
+
+ if id(a.source) not in encoded_sources:
+ if a.source.morphology['kind'] == 'chunk':
+ for (_, sa) in a.source.artifacts.iteritems():
+ if id(sa) not in artifacts:
+ logging.debug('encoding source artifact %s' % sa.name)
+ artifacts[id(sa)] = sa
+ encoded_artifacts[id(sa)] = encode_single_artifact(sa,
+ artifacts, id(a.source))
+ else:
+ # We create separate sources for strata and systems,
+ # this is a bit of a hack, but needed to allow
+ # us to build strata and systems independently
+
+ s = a.source
+ t = morphlib.source.Source(s.repo_name, s.original_ref,
+ s.sha1, s.tree, s.morphology, s.filename)
+
+ t.artifacts = {a.name: a}
+ a.source = t
+
+ encoded_sources[id(a.source)] = encode_source(a.source)
+
+ if id(a) not in artifacts:
+ artifacts[id(a)] = a
+ logging.debug('encoding artifact %s' % a.name)
+ encoded_artifacts[id(a)] = encode_single_artifact(a, artifacts,
+ id(a.source))
- encoded['_root'] = artifact.cache_key
- return json.dumps(encoded)
+ encoded_artifacts['_root'] = str(id(artifact))
+
+ return json.dumps({'sources': encoded_sources,
+ 'artifacts': encoded_artifacts})
def deserialise_artifact(encoded):
@@ -121,7 +175,17 @@ def deserialise_artifact(encoded):
setattr(morphology, x, le_dict['__%s' % x])
del morphology['__%s' % x]
return morphology
-
+
+ def unserialise_source_artifacts(source, artifacts_dict):
+ '''Convert this dict into a list of artifacts'''
+ return {a['name']: Artifact(source,
+ a['name'],
+ a['cache_id'],
+ a['cache_key'],
+ a['dependencies'],
+ a['dependents'],
+ a['metadata_version']) for a in artifacts_dict}
+
def unserialise_source(le_dict):
'''Convert a dict into a Source object.'''
@@ -132,35 +196,63 @@ def deserialise_artifact(encoded):
le_dict['tree'],
morphology,
le_dict['filename'])
+
if morphology['kind'] == 'chunk':
source.build_mode = le_dict['build_mode']
source.prefix = le_dict['prefix']
return source
- def unserialise_single_artifact(le_dict):
+ def unserialise_single_artifact(artifact_dict, source):
'''Convert dict into an Artifact object.
Do not set dependencies, that will be dealt with later.
'''
- source = unserialise_source(le_dict['source'])
- artifact = morphlib.artifact.Artifact(source, le_dict['name'])
- artifact.cache_id = le_dict['cache_id']
- artifact.cache_key = le_dict['cache_key']
- artifact.arch = le_dict['arch']
+ artifact = morphlib.artifact.Artifact(source, artifact_dict['name'])
+ artifact.cache_id = artifact_dict['cache_id']
+ artifact.cache_key = artifact_dict['cache_key']
+ artifact.arch = artifact_dict['arch']
+ artifact.source = source
+
return artifact
le_dicts = json.loads(encoded)
- cache_keys = [k for k in le_dicts.keys() if k != '_root']
+ artifacts_dict = le_dicts['artifacts']
+ sources_dict = le_dicts['sources']
+
+ artifact_ids = ([artifacts_dict['_root']] +
+ filter(lambda k: k != '_root', artifacts_dict.keys()))
+
+ source_ids = [sid for sid in sources_dict.keys()]
+
artifacts = {}
- for cache_key in cache_keys:
- le_dict = le_dicts[cache_key]
- artifacts[cache_key] = unserialise_single_artifact(le_dict)
- for cache_key in cache_keys:
- le_dict = le_dicts[cache_key]
- artifact = artifacts[cache_key]
- artifact.dependencies = [artifacts[k] for k in le_dict['dependencies']]
+ sources = {}
+
+ for source_id in source_ids:
+ source_dict = sources_dict[source_id]
+ sources[source_id] = unserialise_source(source_dict)
+
+ # clear the source artifacts that get automatically generated
+ # we want to add the ones that were sent to us
+ sources[source_id].artifacts = {}
+ source_artifacts = source_dict['artifact_ids']
+
+ for artifact_id in source_artifacts:
+ if artifact_id not in artifacts:
+ artifact_dict = artifacts_dict[artifact_id]
+ artifact = unserialise_single_artifact(artifact_dict,
+ sources[source_id])
+
+ artifacts[artifact_id] = artifact
+
+ key = artifacts[artifact_id].name
+ sources[source_id].artifacts[key] = artifacts[artifact_id]
- return artifacts[le_dicts['_root']]
+ # now add the dependencies
+ for artifact_id in artifact_ids:
+ artifact = artifacts[artifact_id]
+ artifact.dependencies = [artifacts[aid] for aid in
+ artifacts_dict[artifact_id]['dependencies']]
+ return artifacts[artifacts_dict['_root']]