1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
# Copyright 2014 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import sys
# NOTE(blk-u): This provides a symbol that can be overridden just for this
# module during testing. sys.getfilesystemencoding() is called by coverage so
# mocking it globally caused the coverage job to fail.
# exception_to_unicode() looks up the filesystem encoding through this alias
# instead of calling sys.getfilesystemencoding() directly.
_getfilesystemencoding = sys.getfilesystemencoding
def safe_decode(text, incoming=None, errors='strict'):
    """Return `text` as a unicode string, decoding it when it is bytes.

    :param text: The str or bytes value to convert
    :param incoming: Encoding the bytes are expected to be in. When it is
                     not given, sys.stdin.encoding is used if it is set,
                     otherwise sys.getdefaultencoding()
    :param errors: Errors handling policy. See here for valid
                   values
                   https://docs.python.org/3/library/codecs.html#error-handlers
    :returns: `text` itself when it is already a str, otherwise the
              decoded representation of it.
    :raises TypeError: If text is neither a str nor a bytes instance
    """
    # Unicode input needs no work at all.
    if isinstance(text, str):
        return text

    if not isinstance(text, bytes):
        raise TypeError("%s can't be decoded" % type(text))

    source_encoding = (incoming or
                       getattr(sys.stdin, 'encoding', None) or
                       sys.getdefaultencoding())

    try:
        decoded = text.decode(source_encoding, errors)
    except UnicodeDecodeError:
        # The chosen encoding could not decode the bytes. This mostly
        # happens when the environment (e.g. LANG) is not configured and
        # the stdin/default encoding ends up being ASCII, which cannot
        # represent non-ASCII data. UTF-8 is tried as a last resort since
        # it is a superset of ASCII.
        decoded = text.decode('utf-8', errors)
    return decoded
def safe_encode(text, incoming=None,
                encoding='utf-8', errors='strict'):
    """Return `text` as a bytes string encoded with `encoding`.

    A str is encoded directly. Bytes are decoded from `incoming` and
    re-encoded only when they are non-empty and the two encoding names
    differ; otherwise they are returned unchanged. The names are compared
    case-insensitively but literally, so aliases of the same codec are
    treated as different encodings.

    :param text: The str or bytes value to convert
    :param incoming: Text's current encoding. When it is not given,
                     sys.stdin.encoding is used if it is set, otherwise
                     sys.getdefaultencoding()
    :param encoding: Expected encoding for text (Default UTF-8)
    :param errors: Errors handling policy. See here for valid
                   values
                   https://docs.python.org/3/library/codecs.html#error-handlers
    :returns: text or a bytestring `encoding` encoded
              representation of it.
    :raises TypeError: If text is neither a str nor a bytes instance

    See also to_utf8() function which is simpler and doesn't depend on
    the locale encoding.
    """
    if not isinstance(text, (str, bytes)):
        raise TypeError("%s can't be encoded" % type(text))

    source = (incoming or
              getattr(sys.stdin, 'encoding', None) or
              sys.getdefaultencoding())

    # Lower-case both names so the equality test below ignores case.
    if hasattr(source, 'lower'):
        source = source.lower()
    target = encoding.lower() if hasattr(encoding, 'lower') else encoding

    if isinstance(text, str):
        return text.encode(target, errors)

    # Empty bytes, or bytes already in the requested encoding, pass through.
    if not text or target == source:
        return text

    # Decode text before encoding it with the requested encoding.
    return safe_decode(text, source, errors).encode(target, errors)
def to_utf8(text):
    """Return `text` as UTF-8 bytes.

    A Unicode string is encoded to UTF-8; a bytes string is handed back
    untouched, whatever encoding it happens to be in.

    :param text: The str or bytes value to convert
    :returns: a bytes string
    :raises TypeError: If text is neither a bytes string nor a Unicode
                       string

    .. versionadded:: 3.5
    """
    if isinstance(text, str):
        return text.encode('utf-8')
    if isinstance(text, bytes):
        return text
    raise TypeError("bytes or Unicode expected, got %s"
                    % type(text).__name__)
def exception_to_unicode(exc):
    """Get the message of an exception as a Unicode string.

    The message is obtained from ``exc.__str__()``. If that is already a
    Unicode string it is returned as is. If it is a bytes string, it is
    decoded from UTF-8 (superset of ASCII), then from the filesystem
    encoding, and finally from ISO-8859-1 (which never fails).

    :param exc: The exception whose message is wanted
    :returns: the exception message as a str

    .. versionadded:: 1.6
    """
    # __str__() is called directly, not through str(): str() would reject
    # a bytes result instead of letting it be decoded below.
    msg = exc.__str__()

    if isinstance(msg, str):
        # The normal case: the message is already text, so there is
        # nothing to decode.
        return msg

    try:
        # Try to decode from UTF-8 (superset of ASCII). The decoder fails
        # if the string is not a valid UTF-8 string: the UTF-8 codec includes
        # a validation algorithm to ensure the consistency of the codec.
        return msg.decode('utf-8')
    except UnicodeDecodeError:  # nosec
        pass

    # Try the locale encoding, most error messages are encoded to this
    # encoding (ex: os.strerror(errno)). The lookup goes through the
    # module-level alias so that tests can override it for this module only.
    encoding = _getfilesystemencoding()
    try:
        return msg.decode(encoding)
    except UnicodeDecodeError:  # nosec
        pass

    # The encoding is not ASCII, not UTF-8, nor the locale encoding. Fallback
    # to the ISO-8859-1 encoding which never fails. It will produce mojibake
    # if the message is not encoded to ISO-8859-1, but we don't want a super
    # complex heuristic to get the encoding of an exception message.
    return msg.decode('latin1')
|