From 1a5e3e748a5ea4f48e3e88fa3859db4c186d6ba5 Mon Sep 17 00:00:00 2001
From: Richard Maw <richard.maw@codethink.co.uk>
Date: Fri, 11 Jul 2014 15:09:14 +0000
Subject: Make our use of json binary path safe

json only accepts unicode. Various APIs such as file paths and environment
variables allow binary data, so we need to support this properly.

This patch changes every[1] use of json.load or json.dump to escape
non-unicode data strings. The output appears exactly as it used to if
the input was valid unicode; if it isn't, \xabcd escapes are inserted
in place of the non-unicode data.

When loading back in, if json.load is told to unescape it with
`encoding='unicode-escape'` then it will convert it back correctly.

This change was primarily to support file paths that weren't valid
unicode, where this would choke and die. Now it works, but any tools
that parsed the metadata need to unescape the paths.

[1]: The interface to the remote repo cache uses json data, but I haven't
     changed its json.load calls to unescape the data, since the repo
     caches haven't been made to escape the data.
---
 morphlib/morph2.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'morphlib/morph2.py')

diff --git a/morphlib/morph2.py b/morphlib/morph2.py
index cc6ce926..83971bb8 100644
--- a/morphlib/morph2.py
+++ b/morphlib/morph2.py
@@ -66,11 +66,12 @@ class Morphology(object):
 
     @staticmethod
     def _load_json(text):
-        return json.loads(text, object_pairs_hook=OrderedDict)
+        return json.loads(text, object_pairs_hook=OrderedDict,
+                          encoding='unicode-escape')
 
     @staticmethod
     def _dump_json(obj, f):
-        text = json.dumps(obj, indent=4)
+        text = json.dumps(obj, indent=4, encoding='unicode-escape')
         text = re.sub(" \n", "\n", text)
         f.write(text)
         f.write('\n')
-- 
cgit v1.2.1