From 1a5e3e748a5ea4f48e3e88fa3859db4c186d6ba5 Mon Sep 17 00:00:00 2001 From: Richard Maw Date: Fri, 11 Jul 2014 15:09:14 +0000 Subject: Make our use of json binary path safe json only accepts unicode. Various APIs such as file paths and environment variables allow binary data, so we need to support this properly. This patch changes every[1] use of json.load or json.dump to escape non-unicode data strings. This appears exactly as it used to if the input was valid unicode; if it isn't, it will insert \xabcd escapes in the place of non-unicode data. When loading back in, if json.load is told to unescape it with `encoding='unicode-escape'` then it will convert it back correctly. This change was primarily to support file paths that weren't valid unicode, where this would choke and die. Now it works, but any tools that parse the metadata need to unescape the paths. [1]: The interface to the remote repo cache uses json data, but I haven't changed its json.load calls to unescape the data, since the repo caches haven't been made to escape the data. 
--- distbuild/build_controller.py | 4 ++-- distbuild/jm.py | 4 ++-- distbuild/serialise.py | 4 ++-- morphlib/builder2.py | 21 +++++++++++++-------- morphlib/morph2.py | 5 +++-- morphlib/plugins/deploy_plugin.py | 3 ++- morphlib/systemmetadatadir.py | 7 ++++--- scripts/list-overlaps | 4 ++-- 8 files changed, 30 insertions(+), 22 deletions(-) diff --git a/distbuild/build_controller.py b/distbuild/build_controller.py index 987f01f4..e0aec24e 100644 --- a/distbuild/build_controller.py +++ b/distbuild/build_controller.py @@ -340,7 +340,7 @@ class BuildController(distbuild.StateMachine): id=self._helper_id, url=url, headers={'Content-type': 'application/json'}, - body=json.dumps(artifact_names), + body=json.dumps(artifact_names, encoding='unicode-escape'), method='POST') request = distbuild.HelperRequest(msg) @@ -369,7 +369,7 @@ class BuildController(distbuild.StateMachine): _AnnotationFailed(http_status_code, error_msg)) return - cache_state = json.loads(event.msg['body']) + cache_state = json.loads(event.msg['body'], encoding='unicode-escape') map_build_graph(self._artifact, set_status) self.mainloop.queue_event(self, _Annotated()) diff --git a/distbuild/jm.py b/distbuild/jm.py index 69fa5bd1..97ee1a0f 100644 --- a/distbuild/jm.py +++ b/distbuild/jm.py @@ -67,7 +67,7 @@ class JsonMachine(StateMachine): def send(self, msg): '''Send a message to the other side.''' - self.sockbuf.write('%s\n' % json.dumps(msg)) + self.sockbuf.write('%s\n' % json.dumps(msg, encoding='unicode-escape')) def close(self): '''Tell state machine it should shut down. 
@@ -91,7 +91,7 @@ class JsonMachine(StateMachine): line = line.rstrip() if self.debug_json: logging.debug('JsonMachine: line: %s' % repr(line)) - msg = json.loads(line) + msg = json.loads(line, encoding='unicode-escape') self.mainloop.queue_event(self, JsonNewMessage(msg)) def _send_eof(self, event_source, event): diff --git a/distbuild/serialise.py b/distbuild/serialise.py index 44d96eee..914c3ae4 100644 --- a/distbuild/serialise.py +++ b/distbuild/serialise.py @@ -130,7 +130,7 @@ def serialise_artifact(artifact): encoded_artifacts['_root'] = str(id(artifact)) return json.dumps({'sources': encoded_sources, - 'artifacts': encoded_artifacts}) + 'artifacts': encoded_artifacts}, encoding='unicode-escape') def deserialise_artifact(encoded): @@ -210,7 +210,7 @@ def deserialise_artifact(encoded): return artifact - le_dicts = json.loads(encoded) + le_dicts = json.loads(encoded, encoding='unicode-escape') artifacts_dict = le_dicts['artifacts'] sources_dict = le_dicts['sources'] diff --git a/morphlib/builder2.py b/morphlib/builder2.py index 3c0d9e02..4bb435d9 100644 --- a/morphlib/builder2.py +++ b/morphlib/builder2.py @@ -154,7 +154,8 @@ def get_chunk_files(f): # pragma: no cover def get_stratum_files(f, lac): # pragma: no cover - for ca in (ArtifactCacheReference(a) for a in json.load(f)): + for ca in (ArtifactCacheReference(a) + for a in json.load(f, encoding='unicode-escape')): cf = lac.get(ca) for filename in get_chunk_files(cf): yield filename @@ -197,7 +198,7 @@ def write_overlap_metadata(artifact, overlaps, lac): # pragma: no cover [ [a.name for a in afs], list(files) ] for afs, files in overlaps.iteritems() - ], f, indent=4) + ], f, indent=4, encoding='unicode-escape') f.close() @@ -234,7 +235,8 @@ class BuilderBase(object): with self.local_artifact_cache.put_source_metadata( self.artifact.source, self.artifact.cache_key, 'meta') as f: - json.dump(meta, f, indent=4, sort_keys=True) + json.dump(meta, f, indent=4, sort_keys=True, + encoding='unicode-escape') 
f.write('\n') def create_metadata(self, artifact_name, contents=[]): @@ -294,7 +296,7 @@ class BuilderBase(object): # Unit tests use StringIO, which in Python 2.6 isn't usable with # the "with" statement. So we don't do it with "with". f = self._open(filename, 'w') - f.write(json.dumps(meta, indent=4, sort_keys=True)) + json.dump(meta, f, indent=4, sort_keys=True, encoding='unicode-escape') f.close() def new_artifact(self, artifact_name): @@ -580,9 +582,11 @@ class StratumBuilder(BuilderBase): meta = self.create_metadata(self.artifact.name, [x.name for x in constituents]) with lac.put_artifact_metadata(self.artifact, 'meta') as f: - json.dump(meta, f, indent=4, sort_keys=True) + json.dump(meta, f, indent=4, sort_keys=True, + encoding='unicode-escape') with self.local_artifact_cache.put(self.artifact) as f: - json.dump([c.basename() for c in constituents], f) + json.dump([c.basename() for c in constituents], f, + encoding='unicode-escape') self.save_build_times() return [self.artifact] @@ -643,7 +647,7 @@ class SystemBuilder(BuilderBase): # pragma: no cover cache = self.local_artifact_cache with cache.get(stratum_artifact) as stratum_file: - artifact_list = json.load(stratum_file) + artifact_list = json.load(stratum_file, encoding='unicode-escape') for chunk in (ArtifactCacheReference(a) for a in artifact_list): self.app.status(msg='Unpacking chunk %(basename)s', basename=chunk.basename(), chatty=True) @@ -671,7 +675,8 @@ class SystemBuilder(BuilderBase): # pragma: no cover # download the chunk artifacts if necessary for stratum_artifact in self.artifact.dependencies: f = self.local_artifact_cache.get(stratum_artifact) - chunks = [ArtifactCacheReference(a) for a in json.load(f)] + chunks = [ArtifactCacheReference(a) + for a in json.load(f, encoding='unicode-escape')] download_depends(chunks, self.local_artifact_cache, self.remote_artifact_cache) diff --git a/morphlib/morph2.py b/morphlib/morph2.py index cc6ce926..83971bb8 100644 --- a/morphlib/morph2.py +++ 
b/morphlib/morph2.py @@ -66,11 +66,12 @@ class Morphology(object): @staticmethod def _load_json(text): - return json.loads(text, object_pairs_hook=OrderedDict) + return json.loads(text, object_pairs_hook=OrderedDict, + encoding='unicode-escape') @staticmethod def _dump_json(obj, f): - text = json.dumps(obj, indent=4) + text = json.dumps(obj, indent=4, encoding='unicode-escape') text = re.sub(" \n", "\n", text) f.write(text) f.write('\n') diff --git a/morphlib/plugins/deploy_plugin.py b/morphlib/plugins/deploy_plugin.py index 9384c422..30e356e8 100644 --- a/morphlib/plugins/deploy_plugin.py +++ b/morphlib/plugins/deploy_plugin.py @@ -497,7 +497,8 @@ class DeployPlugin(cliapp.Plugin): metadata_path = os.path.join( system_tree, 'baserock', 'deployment.meta') with morphlib.savefile.SaveFile(metadata_path, 'w') as f: - f.write(json.dumps(metadata, indent=4, sort_keys=True)) + json.dump(metadata, f, indent=4, + sort_keys=True, encoding='unicode-escape') return system_tree except Exception: shutil.rmtree(system_tree) diff --git a/morphlib/systemmetadatadir.py b/morphlib/systemmetadatadir.py index eac5b446..7e89142c 100644 --- a/morphlib/systemmetadatadir.py +++ b/morphlib/systemmetadatadir.py @@ -1,4 +1,4 @@ -# Copyright (C) 2013 Codethink Limited +# Copyright (C) 2013-2014 Codethink Limited # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -66,14 +66,15 @@ class SystemMetadataDir(collections.MutableMapping): self._check_key(key) try: with open(self._join_path('%s.meta' % key), 'r') as f: - return json.load(f) + return json.load(f, encoding='unicode-escape') except IOError: raise KeyError(key) def __setitem__(self, key, value): self._check_key(key) with open(self._join_path('%s.meta' % key), 'w') as f: - json.dump(value, f, indent=4, sort_keys=True) + json.dump(value, f, indent=4, sort_keys=True, + encoding='unicode-escape') def __delitem__(self, key): self._check_key(key) 
diff --git a/scripts/list-overlaps b/scripts/list-overlaps index 5fe45a0e..d092ba75 100755 --- a/scripts/list-overlaps +++ b/scripts/list-overlaps @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright (C) 2011-2012 Codethink Limited +# Copyright (C) 2011-2014 Codethink Limited # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ class ListOverlaps(cliapp.Application): @staticmethod def _load_overlap(filename): - data = json.load(open(filename)) + data = json.load(open(filename), encoding='unicode-escape') overlaps = dict((frozenset(pair[0]), set(pair[1])) for pair in data) return overlaps -- cgit v1.2.1 From 40a5c752de62dd1b3ef3ec0850f1f24e7af67052 Mon Sep 17 00:00:00 2001 From: Richard Maw Date: Fri, 11 Jul 2014 17:37:54 +0000 Subject: Allow non-unicode paths to be hardlinked into staging areas Parts of the morphology go into the name of the staging area, so it helps to convert them into a str, so later attempts to join it with another string don't result in a unicode string. pyfilesystem insists that file paths must be unicode. It is incorrect, but we passed something unicode compatible in in the first place, so we can get away with converting it back to a bytestring. 
--- morphlib/artifact.py | 10 +++++----- morphlib/localartifactcache.py | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/morphlib/artifact.py b/morphlib/artifact.py index 20fdb185..da6d3763 100644 --- a/morphlib/artifact.py +++ b/morphlib/artifact.py @@ -1,4 +1,4 @@ -# Copyright (C) 2012, 2013 Codethink Limited +# Copyright (C) 2012, 2013, 2014 Codethink Limited # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -56,13 +56,13 @@ class Artifact(object): def basename(self): # pragma: no cover return '%s.%s.%s' % (self.cache_key, - self.source.morphology['kind'], - self.name) + str(self.source.morphology['kind']), + str(self.name)) def metadata_basename(self, metadata_name): # pragma: no cover return '%s.%s.%s.%s' % (self.cache_key, - self.source.morphology['kind'], - self.name, + str(self.source.morphology['kind']), + str(self.name), metadata_name) def get_dependency_prefix_set(self): diff --git a/morphlib/localartifactcache.py b/morphlib/localartifactcache.py index 4c7f7832..955ee97f 100644 --- a/morphlib/localartifactcache.py +++ b/morphlib/localartifactcache.py @@ -96,14 +96,23 @@ class LocalArtifactCache(object): os.utime(filename, None) return open(filename) + def _join(self, basename): + '''Wrapper for pyfilesystem's getsyspath. + + This is required because its API throws us a garbage unicode + string, when file paths are binary data. 
+ ''' + return str(self.cachefs.getsyspath(basename)) + def artifact_filename(self, artifact): - return self.cachefs.getsyspath(artifact.basename()) + basename = artifact.basename() + return self._join(basename) def _artifact_metadata_filename(self, artifact, name): - return self.cachefs.getsyspath(artifact.metadata_basename(name)) + return self._join(artifact.metadata_basename(name)) def _source_metadata_filename(self, source, cachekey, name): - return self.cachefs.getsyspath('%s.%s' % (cachekey, name)) + return self._join('%s.%s' % (cachekey, name)) def clear(self): '''Clear everything from the artifact cache directory. -- cgit v1.2.1