diff options
author | Sam Thursfield <sam.thursfield@codethink.co.uk> | 2014-12-04 18:33:19 +0000 |
---|---|---|
committer | Sam Thursfield <sam.thursfield@codethink.co.uk> | 2014-12-04 18:33:19 +0000 |
commit | dc4ad1abc9127da0ca556fdc6b5fabd6c82a062a (patch) | |
tree | b4a7bda39f50c4a671eea8a8eea32f7b10fdc56b | |
parent | fc5d4e58753d113d898744c1924b13ddc34e526e (diff) | |
download | morph-dc4ad1abc9127da0ca556fdc6b5fabd6c82a062a.tar.gz |
I'm a helpful commit message
sam/escape-non-utf8-strings
I link this change back to 4be1620dca24d539573027831b65ffc040bc1ebb
and 1a5e3e748a5ea4f48e3e88fa3859db4c186d6ba5
FIXME: all calls to json.dump() need to be changed to use
SafeishJSONEncoder
-rw-r--r-- | morphlib/plugins/deploy_plugin.py | 5 | ||||
-rw-r--r-- | morphlib/util.py | 40 | ||||
-rw-r--r-- | morphlib/util_tests.py | 10 |
3 files changed, 53 insertions, 2 deletions
diff --git a/morphlib/plugins/deploy_plugin.py b/morphlib/plugins/deploy_plugin.py index 87e129e5..1faad46b 100644 --- a/morphlib/plugins/deploy_plugin.py +++ b/morphlib/plugins/deploy_plugin.py @@ -524,9 +524,10 @@ class DeployPlugin(cliapp.Plugin): artifact, root_repo_dir, deployment_type, location, env) metadata_path = os.path.join( system_tree, 'baserock', 'deployment.meta') + with morphlib.savefile.SaveFile(metadata_path, 'w') as f: - json.dump(metadata, f, indent=4, - sort_keys=True, encoding='unicode-escape') + json.dump(metadata, f, indent=4, sort_keys=True, + cls=morphlib.util.SafeishJSONEncoder) return system_tree except Exception: shutil.rmtree(system_tree) diff --git a/morphlib/util.py b/morphlib/util.py index 6f735387..7c97e1cf 100644 --- a/morphlib/util.py +++ b/morphlib/util.py @@ -626,3 +626,43 @@ def containerised_cmdline(args, cwd='.', root='/', binds=(), cmdargs.append(root) cmdargs.extend(args) return unshared_cmdline(cmdargs, root=root, **kwargs) + + +class SafeishJSONEncoder(json.JSONEncoder): + '''JSON encoder that replaces invalid UTF-8 sequences with a placeholder. + + Morph receives string inputs from the host system from a few places. + Filenames and environment variables are the main ones. These inputs are not + restricted on what character encoding they can use, and can contain random + binary data if the user wishes. + + When we write strings to JSON files they must be valid Unicode. + + The default Python JSON encoder takes an 'encoding' parameter, defaulting + to UTF-8. This parameter tells it "you can assume all of the strings you're + given are valid in this encoding." Since we pass it filenames and + environment variables, that may not be true, and the program aborts with + UnicodeDecodeError if so. You can pass encoding='unicode-escape', but that + will break if there are any invalid escape sequences in the string: so + '\u' cannot be encoded, for example! 
+ + This class filters the input strings so that any sequence of bytes that is + not valid UTF-8 will be replaced with 'REPLACEMENT CHARACTER' (U+FFFD). + This is a lossy transformation! Hence this is a 'safeish' (won't cause the + program to crash) rather than a 'safe' (won't lose any information) + encoder. + + Note that for YAML we use the yaml.safe_dump() function, which causes any + strings that aren't representable as Unicode to be encoded as base-64 using + the YAML !!binary operator. This means no information will be lost. + + ''' + + def _replace_non_utf8_sequences_with_placeholder(self, obj): + return codecs.decode(obj, 'utf-8', 'replace') + + def encode(self, obj): + if isinstance(obj, str): + return self._replace_non_utf8_sequences_with_placeholder(obj) + else: + return json.JSONEncoder.encode(self, obj) diff --git a/morphlib/util_tests.py b/morphlib/util_tests.py index 715892b6..bfc7c324 100644 --- a/morphlib/util_tests.py +++ b/morphlib/util_tests.py @@ -14,6 +14,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +import json import os import shutil import tempfile @@ -136,3 +137,12 @@ class IterTrickleTests(unittest.TestCase): def test_truncated_final_sequence(self): self.assertEqual(list(morphlib.util.iter_trickle("barquux", 3)), [["b", "a", "r"], ["q", "u", "u"], ["x"]]) + + +class SafeishJSONEncoderTests(unittest.TestCase): + + def _encode(self, obj): + return json.dumps(obj, cls=morphlib.util.SafeishJSONEncoder):w + + def test_non_ascii_characters(self): + j |