From a74c994b704d3476e2054cc6332c0a4c49ea1c69 Mon Sep 17 00:00:00 2001 From: Adam Hupp Date: Fri, 6 Nov 2020 10:43:39 -0800 Subject: Handle undecodable characters in description We've historically expected that the return values from libmagic are ASCII, since they are constant strings or stuff like dates/numbers. In some cases, however, it will return information like the title of the document in the doc's native character set, which is unknown to us. This produces decode errors. I have not been able to get a document that triggers this behavior, but the safest change is to decode with 'backslashreplace', which escapes undecodable characters with a backslash. --- magic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/magic.py b/magic.py index aab7987..92005bd 100644 --- a/magic.py +++ b/magic.py @@ -239,7 +239,8 @@ def maybe_decode(s): if str == bytes: return s else: - return s.decode('utf-8') + # backslashreplace here because sometimes libmagic returns metadata (e.g. a document title) in the file's own, unknown charset + return s.decode('utf-8', 'backslashreplace') def coerce_filename(filename): -- cgit v1.2.1