summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sam Thursfield <sam.thursfield@codethink.co.uk> 2014-12-04 18:33:19 +0000
committer Sam Thursfield <sam.thursfield@codethink.co.uk> 2014-12-04 18:33:19 +0000
commit dc4ad1abc9127da0ca556fdc6b5fabd6c82a062a (patch)
tree b4a7bda39f50c4a671eea8a8eea32f7b10fdc56b
parent fc5d4e58753d113d898744c1924b13ddc34e526e (diff)
download morph-sam/escape-non-utf8-strings.tar.gz
I'm a helpful commit message (ref: sam/escape-non-utf8-strings)
I link this change back to 4be1620dca24d539573027831b65ffc040bc1ebb and 1a5e3e748a5ea4f48e3e88fa3859db4c186d6ba5

FIXME: need to replace all calls to json.dump() to use SafeishJSONEncoder
-rw-r--r-- morphlib/plugins/deploy_plugin.py 5
-rw-r--r-- morphlib/util.py 40
-rw-r--r-- morphlib/util_tests.py 10
3 files changed, 53 insertions, 2 deletions
diff --git a/morphlib/plugins/deploy_plugin.py b/morphlib/plugins/deploy_plugin.py
index 87e129e5..1faad46b 100644
--- a/morphlib/plugins/deploy_plugin.py
+++ b/morphlib/plugins/deploy_plugin.py
@@ -524,9 +524,10 @@ class DeployPlugin(cliapp.Plugin):
artifact, root_repo_dir, deployment_type, location, env)
metadata_path = os.path.join(
system_tree, 'baserock', 'deployment.meta')
+
with morphlib.savefile.SaveFile(metadata_path, 'w') as f:
- json.dump(metadata, f, indent=4,
- sort_keys=True, encoding='unicode-escape')
+ json.dump(metadata, f, indent=4, sort_keys=True,
+ cls=morphlib.util.SafeishJSONEncoder)
return system_tree
except Exception:
shutil.rmtree(system_tree)
diff --git a/morphlib/util.py b/morphlib/util.py
index 6f735387..7c97e1cf 100644
--- a/morphlib/util.py
+++ b/morphlib/util.py
@@ -626,3 +626,43 @@ def containerised_cmdline(args, cwd='.', root='/', binds=(),
cmdargs.append(root)
cmdargs.extend(args)
return unshared_cmdline(cmdargs, root=root, **kwargs)
+
+
+class SafeishJSONEncoder(json.JSONEncoder):
+ '''JSON encoder that replaces invalid UTF-8 sequences with a placeholder.
+
+ Morph receives string inputs from the host system from a few places.
+ Filenames and environment variables are the main ones. These inputs are not
+ restricted on what character encoding they can use, and can contain random
+ binary data if the user wishes.
+
+ When we write strings to JSON files they must be valid Unicode.
+
+ The default Python JSON encoder takes an 'encoding' parameter, defaulting
+ to UTF-8. This parameter tells it "you can assume all of the strings you're
+ given are valid in this encoding." Since we pass it filenames and
+ environment variables, that may not be true, and the program aborts with
+ UnicodeDecodeError if so. You can pass encoding='unicode-escape', but that
+ will break if there are any invalid escape sequences in the string: so
+ '\u' cannot be encoded, for example!
+
+ This class filters the input strings so that any sequence of bytes that is
+ not valid UTF-8 will be replaced with 'REPLACEMENT CHARACTER' (U+FFFD).
+ This is a lossy transformation! Hence this is a 'safeish' (won't cause the
+ program to crash) rather than a 'safe' (won't lose any information)
+ encoder.
+
+ Note that for YAML we use the yaml.safe_dump() function, which causes any
+ strings that aren't representable as Unicode to be encoded as base-64 using
+ the YAML !!binary operator. This means no information will be lost.
+
+ '''
+
+ def _replace_non_utf8_sequences_with_placeholder(self, obj):
+ return codecs.decode(obj, 'utf-8', 'replace')
+
+ def encode(self, obj):
+ if isinstance(obj, str):
+ return self._replace_non_utf8_sequences_with_placeholder(obj)
+ else:
+ return json.JSONEncoder.encode(self, obj)
diff --git a/morphlib/util_tests.py b/morphlib/util_tests.py
index 715892b6..bfc7c324 100644
--- a/morphlib/util_tests.py
+++ b/morphlib/util_tests.py
@@ -14,6 +14,7 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import json
import os
import shutil
import tempfile
@@ -136,3 +137,12 @@ class IterTrickleTests(unittest.TestCase):
def test_truncated_final_sequence(self):
self.assertEqual(list(morphlib.util.iter_trickle("barquux", 3)),
[["b", "a", "r"], ["q", "u", "u"], ["x"]])
+
+
+class SafeishJSONEncoderTests(unittest.TestCase):
+
+ def _encode(self, obj):
+ return json.dumps(obj, cls=morphlib.util.SafeishJSONEncoder):w
+
+ def test_non_ascii_characters(self):
+ j