oslo_utils/encodeutils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

# Copyright 2014 Red Hat, Inc.
# All Rights Reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import sys

import six


# NOTE(blk-u): This provides a symbol that can be overridden just for this
# module during testing. sys.getfilesystemencoding() is called by coverage so
# mocking it globally caused the coverage job to fail.
_getfilesystemencoding = sys.getfilesystemencoding


def safe_decode(text, incoming=None, errors='strict'):
    """Decodes incoming text/bytes string using `incoming` if they're not
       already unicode.

    :param incoming: Text's current encoding
    :param errors: Errors handling policy. See here for valid
        values http://docs.python.org/2/library/codecs.html
    :returns: text or a unicode `incoming` encoded
                representation of it.
    :raises TypeError: If text is not an instance of str
    """
    if not isinstance(text, (six.string_types, six.binary_type)):
        raise TypeError("%s can't be decoded" % type(text))

    if isinstance(text, six.text_type):
        return text

    if not incoming:
        incoming = (getattr(sys.stdin, 'encoding', None) or
                    sys.getdefaultencoding())

    try:
        return text.decode(incoming, errors)
    except UnicodeDecodeError:
        # Note(flaper87) If we get here, it means that
        # sys.stdin.encoding / sys.getdefaultencoding
        # didn't return a suitable encoding to decode
        # text. This happens mostly when global LANG
        # var is not set correctly and there's no
        # default encoding. In this case, most likely
        # python will use ASCII or ANSI encoders as
        # default encodings but they won't be capable
        # of decoding non-ASCII characters.
        #
        # Also, UTF-8 is being used since it's an ASCII
        # extension.
        return text.decode('utf-8', errors)


def safe_encode(text, incoming=None,
                encoding='utf-8', errors='strict'):
    """Encodes incoming text/bytes string using `encoding`.

    If incoming is not specified, text is expected to be encoded with
    current python's default encoding. (`sys.getdefaultencoding`)

    :param incoming: Text's current encoding
    :param encoding: Expected encoding for text (Default UTF-8)
    :param errors: Errors handling policy. See here for valid
        values http://docs.python.org/2/library/codecs.html
    :returns: text or a bytestring `encoding` encoded
                representation of it.
    :raises TypeError: If text is not an instance of str

    See also to_utf8() function which is simpler and don't depend on
    the locale encoding.
    """
    if not isinstance(text, (six.string_types, six.binary_type)):
        raise TypeError("%s can't be encoded" % type(text))

    if not incoming:
        incoming = (getattr(sys.stdin, 'encoding', None) or
                    sys.getdefaultencoding())

    # Avoid case issues in comparisons
    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if isinstance(text, six.text_type):
        return text.encode(encoding, errors)
    elif text and encoding != incoming:
        # Decode text before encoding it with `encoding`
        text = safe_decode(text, incoming, errors)
        return text.encode(encoding, errors)
    else:
        return text


def to_utf8(text):
    """Encode Unicode to UTF-8, return bytes unchanged.

    Raise TypeError if text is not a bytes string or a Unicode string.

    .. versionadded:: 3.5
    """
    if isinstance(text, six.binary_type):
        return text
    elif isinstance(text, six.text_type):
        return text.encode('utf-8')
    else:
        raise TypeError("bytes or Unicode expected, got %s"
                        % type(text).__name__)


def exception_to_unicode(exc):
    """Get the message of an exception as a Unicode string.

    On Python 3, the exception message is always a Unicode string. On
    Python 2, the exception message is a bytes string *most* of the time.

    If the exception message is a bytes strings, try to decode it from UTF-8
    (superset of ASCII), from the locale encoding, or fallback to decoding it
    from ISO-8859-1 (which never fails).

    .. versionadded:: 1.6
    """
    msg = None
    if six.PY2:
        # First try by calling the unicode type constructor. We should try
        # unicode() before exc.__unicode__() because subclasses of unicode can
        # be easily casted to unicode, whereas they have no __unicode__()
        # method.
        try:
            msg = unicode(exc)  # NOQA
        except UnicodeError:
            # unicode(exc) fail with UnicodeDecodeError on Python 2 if
            # exc.__unicode__() or exc.__str__() returns a bytes string not
            # decodable from the default encoding (ASCII)
            if hasattr(exc, '__unicode__'):
                # Call directly the __unicode__() method to avoid
                # the implicit decoding from the default encoding
                try:
                    msg = exc.__unicode__()
                except UnicodeError:  # nosec
                    pass

    if msg is None:
        # Don't call directly str(exc), because it fails with
        # UnicodeEncodeError on Python 2 if exc.__str__() returns a Unicode
        # string not encodable to the default encoding (ASCII)
        msg = exc.__str__()

    if isinstance(msg, six.text_type):
        # This should be the default path on Python 3 and an *optional* path
        # on Python 2 (if for some reason the exception message was already
        # in unicode instead of the more typical bytes string); so avoid
        # further converting to unicode in both of these cases.
        return msg

    try:
        # Try to decode from UTF-8 (superset of ASCII). The decoder fails
        # if the string is not a valid UTF-8 string: the UTF-8 codec includes
        # a validation algorithm to ensure the consistency of the codec.
        return msg.decode('utf-8')
    except UnicodeDecodeError:  # nosec
        pass

    # Try the locale encoding, most error messages are encoded to this encoding
    # (ex: os.strerror(errno))
    encoding = _getfilesystemencoding()
    try:
        return msg.decode(encoding)
    except UnicodeDecodeError:  # nosec
        pass

    # The encoding is not ASCII, not UTF-8, nor the locale encoding. Fallback
    # to the ISO-8859-1 encoding which never fails. It will produce mojibake
    # if the message is not encoded to ISO-8859-1, but we don't want a super
    # complex heuristic to get the encoding of an exception message.
    return msg.decode('latin1')