diff options
author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2011-10-20 23:04:46 +0000 |
---|---|---|
committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2011-10-20 23:04:46 +0000 |
commit | 76be9443c22bc1291ea01585c128e1951620f030 (patch) | |
tree | 2cffc0143845170e4c7136f3ee895c78c00b90d6 /docutils/io.py | |
parent | bb003e6e6aed8122d04871309fac475a3b78ecdb (diff) | |
download | docutils-76be9443c22bc1291ea01585c128e1951620f030.tar.gz |
Work around encoding problems in Py3k. Fixes [ 3395948 ]
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@7196 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils/io.py')
-rw-r--r-- | docutils/io.py | 78 |
1 files changed, 56 insertions, 22 deletions
diff --git a/docutils/io.py b/docutils/io.py index 4330b2eed..40630af55 100644 --- a/docutils/io.py +++ b/docutils/io.py @@ -10,6 +10,7 @@ will exist for a variety of input/output mechanisms. __docformat__ = 'reStructuredText' import sys +import os import re import codecs from docutils import TransformSpec @@ -84,10 +85,9 @@ class Input(TransformSpec): # Apply heuristics only if no encoding is explicitly given and # no BOM found. Start with UTF-8, because that only matches # data that *IS* UTF-8: - encodings = [enc for enc in ('utf-8', - locale_encoding, # can be None - 'latin-1') # fallback encoding - if enc] + encodings = ['utf-8', 'latin-1'] + if locale_encoding: + encodings.insert(1, locale_encoding) for enc in encodings: try: decoded = unicode(data, enc, self.error_handler) @@ -105,7 +105,7 @@ class Input(TransformSpec): coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)")) """Encoding declaration pattern.""" - byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # actually 'utf-8-sig' + byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5 (codecs.BOM_UTF16_BE, 'utf-16-be'), (codecs.BOM_UTF16_LE, 'utf-16-le'),) """Sequence of (start_bytes, encoding) tuples for encoding detection. @@ -224,6 +224,15 @@ class FileInput(Input): sys.exit(1) else: self.source = sys.stdin + elif (sys.version_info >= (3,0) and + self.encoding and hasattr(self.source, 'encoding') and + self.encoding != self.source.encoding and + codecs.lookup(self.encoding) != + codecs.lookup(self.source.encoding)): + # TODO: re-open, warn or raise error? + raise UnicodeError('Encoding clash: encoding given is "%s" ' + 'but source is opened with encoding "%s".' % + (self.encoding, self.source.encoding)) if not source_path: try: self.source_path = self.source.name @@ -234,8 +243,25 @@ class FileInput(Input): """ Read and decode a single file and return the data (Unicode string). """ - try: - data = self.source.read() + try: # In Python < 2.5, try...except has to be nested in try...finally. + try: + if self.source is sys.stdin and sys.version_info >= (3,0): + # read as binary data to circumvent auto-decoding + data = self.source.buffer.read() + # normalize newlines + data = b('\n').join(data.splitlines()) + b('\n') + else: + data = self.source.read() + except (UnicodeError, LookupError), err: # (in Py3k read() decodes) + if not self.encoding and self.source_path: + # re-read in binary mode and decode with heuristics + b_source = open(self.source_path, 'rb') + data = b_source.read() + b_source.close() + # normalize newlines + data = b('\n').join(data.splitlines()) + b('\n') + else: + raise finally: if self.autoclose: self.close() @@ -245,12 +271,7 @@ class FileInput(Input): """ Return lines of a single file as list of Unicode strings. """ - try: - lines = self.source.readlines() - finally: - if self.autoclose: - self.close() - return [self.decode(line) for line in lines] + return self.read().splitlines(True) def close(self): if self.source is not sys.stdin: @@ -302,7 +323,6 @@ class FileOutput(Output): 'errors': self.error_handler} else: kwargs = {} - try: self.destination = open(self.destination_path, 'w', **kwargs) except IOError, error: @@ -317,20 +337,34 @@ class FileOutput(Output): def write(self, data): """Encode `data`, write it to a single file, and return it. - In Python 3, a (unicode) string is returned. + In Python 3, `data` is returned unchanged. """ - if sys.version_info >= (3,0): - output = data # in py3k, write expects a (Unicode) string - else: - output = self.encode(data) + if sys.version_info < (3,0): + data = self.encode(data) if not self.opened: self.open() - try: - self.destination.write(output) + try: # In Python < 2.5, try...except has to be nested in try...finally. + try: + if (sys.version_info >= (3,0) and self.encoding and + hasattr(self.destination,'encoding') and + self.encoding != self.destination.encoding and + codecs.lookup(self.encoding) != + codecs.lookup(self.destination.encoding)): + # encode self, write bytes + bdata = self.encode(data) + if os.linesep != '\n': + bdata = bdata.replace('\n', os.linesep) + sys.stdout.buffer.write(bdata) + else: + self.destination.write(data) + except (UnicodeError, LookupError), err: # can only happen in py3k + raise UnicodeError( + 'Unable to encode output data. output-encoding is: ' + '%s.\n(%s)' % (self.encoding, ErrorString(err))) finally: if self.autoclose: self.close() - return output + return data def close(self): if self.destination not in (sys.stdout, sys.stderr): |