# -*- coding: utf-8 -*-
# config.py
# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
#
# This module is part of GitPython and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
"""utilities to help provide compatibility with python 3"""
# flake8: noqa

import locale
import os
import sys
import codecs


from gitdb.utils.compat import (
    xrange,
    MAXSIZE,    # @UnusedImport
    izip,       # @UnusedImport
)
from gitdb.utils.encoding import (
    string_types,    # @UnusedImport
    text_type,       # @UnusedImport
    force_bytes,     # @UnusedImport
    force_text       # @UnusedImport
)


PY3 = sys.version_info[0] >= 3
is_win = (os.name == 'nt')
is_posix = (os.name == 'posix')
# os.name is 'posix' on macOS and never 'darwin'; the platform identifier
# lives in sys.platform.
is_darwin = (sys.platform == 'darwin')

# Default encoding used by the helpers below: the filesystem encoding when
# available, otherwise the interpreter's default encoding.
defenc = None
if hasattr(sys, 'getfilesystemencoding'):
    defenc = sys.getfilesystemencoding()
if defenc is None:
    defenc = sys.getdefaultencoding()

if PY3:
    import io
    FileType = io.IOBase

    def byte_ord(b):
        """Return the integer value of a byte (already an int when indexing bytes on PY3)."""
        return b

    def bchr(n):
        """Return a bytes object of length one holding the integer `n`."""
        return bytes([n])

    def mviter(d):
        """Return an iterable over the values of mapping `d`."""
        return d.values()

    range = xrange  # @ReservedAssignment
    unicode = str
    binary_type = bytes
else:
    FileType = file  # @UndefinedVariable on PY3
    # usually, this is just ascii, which might not enough for our encoding needs
    # Unless it's set specifically, we override it to be utf-8
    if defenc == 'ascii':
        defenc = 'utf-8'
    byte_ord = ord
    bchr = chr
    unicode = unicode
    binary_type = str
    range = xrange  # @ReservedAssignment

    def mviter(d):
        """Return an iterator over the values of mapping `d`."""
        return d.itervalues()


def safe_decode(s):
    """Safely decodes a binary string to unicode

    Text is returned unchanged, bytes are decoded with `defenc` using the
    'surrogateescape' error handler, and None yields None.

    :raise TypeError: if `s` is neither text, bytes nor None"""
    if isinstance(s, unicode):
        return s
    elif isinstance(s, bytes):
        return s.decode(defenc, 'surrogateescape')
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))


def safe_encode(s):
    """Safely encodes a unicode string to a binary string

    Text is encoded with `defenc`, bytes are returned unchanged, and None
    yields None.

    :raise TypeError: if `s` is neither text, bytes nor None"""
    if isinstance(s, unicode):
        return s.encode(defenc)
    elif isinstance(s, bytes):
        return s
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))


def win_encode(s):
    """Encode unicodes for process arguments on Windows.

    Text is encoded with `locale.getpreferredencoding(False)`, bytes are
    returned unchanged, and None yields None.

    :raise TypeError: if `s` is neither text, bytes nor None"""
    if isinstance(s, unicode):
        return s.encode(locale.getpreferredencoding(False))
    elif isinstance(s, bytes):
        return s
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))


def with_metaclass(meta, *bases):
    """copied from https://github.com/Byron/bcore/blob/master/src/python/butility/future.py#L15

    Return a temporary helper class; when a class derives from it, the
    derived class is created by calling `meta(name, bases, d)`."""
    class metaclass(meta):
        __call__ = type.__call__
        __init__ = type.__init__

        def __new__(cls, name, nbases, d):
            # nbases is None only for the helper class created below
            if nbases is None:
                return type.__new__(cls, name, (), d)
            # There may be clients who rely on this attribute to be set to a reasonable value, which is why
            # we set the __metaclass__ attribute explicitly
            if not PY3 and '__metaclass__' not in d:
                d['__metaclass__'] = meta
            return meta(name, bases, d)
    return metaclass(meta.__name__ + 'Helper', None, {})


## From https://docs.python.org/3.3/howto/pyporting.html
class UnicodeMixin(object):

    """Mixin class to handle defining the proper __str__/__unicode__
    methods in Python 2 or 3."""

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:  # Python 2
        def __str__(self):
            return self.__unicode__().encode(defenc)


"""
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
handler of Python 3.
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
"""

# This code is released under the Python license and the BSD 2-clause license


FS_ERRORS = 'surrogateescape'

#     # -- Python 2/3 compatibility -------------------------------------
#     FS_ERRORS = 'my_surrogateescape'


def u(text):
    """Return `text` as a unicode string (decoding 'unicode_escape' on Python 2)."""
    if PY3:
        return text
    else:
        return text.decode('unicode_escape')


def b(data):
    """Return `data` as a byte string (encoding with 'latin1' on Python 3)."""
    if PY3:
        return data.encode('latin1')
    else:
        return data


if PY3:
    _unichr = chr

    def bytes_chr(code):
        """Return a bytes object of length one holding the integer `code`."""
        return bytes((code,))
else:
    _unichr = unichr
    bytes_chr = chr


def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383: the "surrogateescape" error
    handler of Python 3. Undecodable bytes will be replaced by a Unicode
    character U+DCxx on decoding, and these are translated into the
    original bytes on encoding.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError being handled
    :return: tuple of (replacement string, exc.end)
    :raise: `exc` itself if it is of another type or the offending data
        cannot be handled
    """
    mystring = exc.object[exc.start:exc.end]

    try:
        if isinstance(exc, UnicodeDecodeError):
            # mystring is a byte-string in this case
            decoded = replace_surrogate_decode(mystring)
        elif isinstance(exc, UnicodeEncodeError):
            # In the case of u'\udcc3'.encode('ascii',
            # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
            # exception anyway after this function is called, even though I think
            # it's doing what it should. It seems that the strict encoder is called
            # to encode the unicode string that this function returns ...
            decoded = replace_surrogate_encode(mystring, exc)
        else:
            raise exc
    except NotASurrogateError:
        raise exc
    return (decoded, exc.end)


class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for data they cannot map."""
    pass


def replace_surrogate_encode(mystring, exc):
    """
    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the unicode characters that failed to encode
    :param exc: the original exception, raised for non-surrogate characters
    :raise NotASurrogateError: for surrogates below U+DC00, which cannot be
        mapped back to a byte value
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # The following magic comes from Py3.3's Python/codecs.c file:
        if not 0xD800 <= code <= 0xDCFF:
            # Not a surrogate. Fail with the original exception.
            raise exc
        if code < 0xDC00:
            # code - 0xDC00 would be negative here, which _unichr rejects
            # with a ValueError; report it as unmappable instead.
            raise NotASurrogateError
        # U+DC00..U+DCFF map back onto the code points 0x00..0xFF
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)


def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string

    Byte values 0x80..0xFF become U+DC80..U+DCFF, values up to 0x7F are kept
    as the corresponding character.

    :raise NotASurrogateError: for any value above 0xFF
    """
    decoded = []
    for ch in mybytes:
        # We may be parsing newbytes (in which case ch is an int) or a native
        # str on Py2
        if isinstance(ch, int):
            code = ch
        else:
            code = ord(ch)
        if 0x80 <= code <= 0xFF:
            decoded.append(_unichr(0xDC00 + code))
        elif code <= 0x7F:
            decoded.append(_unichr(code))
        else:
            raise NotASurrogateError
    return str().join(decoded)


def encodefilename(fn):
    """Encode the unicode filename `fn` to bytes using FS_ENCODING.

    Characters U+DC80..U+DCFF are turned back into the bytes 0x80..0xFF.

    :raise UnicodeEncodeError: in the 'ascii' and 'utf-8' branches, for
        characters that can neither be encoded nor mapped back to a byte
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        encoded = []
        for index, ch in enumerate(fn):
            code = ord(ch)
            if code < 128:
                ch = bytes_chr(code)
            elif 0xDC80 <= code <= 0xDCFF:
                ch = bytes_chr(code - 0xDC00)
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, index, index+1,
                                         'ordinal not in range(128)')
            encoded.append(ch)
        return bytes().join(encoded)
    elif FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        encoded = []
        for index, ch in enumerate(fn):
            code = ord(ch)
            if 0xD800 <= code <= 0xDFFF:
                if 0xDC80 <= code <= 0xDCFF:
                    ch = bytes_chr(code - 0xDC00)
                    encoded.append(ch)
                else:
                    raise UnicodeEncodeError(
                        FS_ENCODING,
                        fn, index, index+1, 'surrogates not allowed')
            else:
                ch_utf8 = ch.encode('utf-8')
                encoded.append(ch_utf8)
        return bytes().join(encoded)
    else:
        return fn.encode(FS_ENCODING, FS_ERRORS)


def decodefilename(fn):
    """Decode the byte filename `fn` using FS_ENCODING and the FS_ERRORS handler."""
    return fn.decode(FS_ENCODING, FS_ERRORS)


FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name


def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)
    """
    if PY3:
        return
    try:
        codecs.lookup_error(FS_ERRORS)
    except LookupError:
        codecs.register_error(FS_ERRORS, surrogateescape_handler)


# Probe whether decoding with "surrogateescape" works out of the box; if it
# does not, fall back to registering the pure-Python handler.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except Exception:
    register_surrogateescape()