Work around encoding problems in Py3k. Fixes [ 3395948 ]

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@7196 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2011-10-20 23:04:46 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2011-10-20 23:04:46 +0000
commit: 76be9443c22bc1291ea01585c128e1951620f030 (patch)
tree: 2cffc0143845170e4c7136f3ee895c78c00b90d6 /docutils/io.py
parent: bb003e6e6aed8122d04871309fac475a3b78ecdb (diff)
download: docutils-76be9443c22bc1291ea01585c128e1951620f030.tar.gz
1 file changed, 56 insertions, 22 deletions
diff --git a/docutils/io.py b/docutils/io.py
index 4330b2eed..40630af55 100644
--- a/docutils/io.py
+++ b/docutils/io.py
@@ -10,6 +10,7 @@ will exist for a variety of input/output mechanisms.
 __docformat__ = 'reStructuredText'
 
 import sys
+import os
 import re
 import codecs
 from docutils import TransformSpec
@@ -84,10 +85,9 @@ class Input(TransformSpec):
                 # Apply heuristics only if no encoding is explicitly given and
                 # no BOM found.  Start with UTF-8, because that only matches
                 # data that *IS* UTF-8:
-                encodings = [enc for enc in ('utf-8',
-                                             locale_encoding, # can be None
-                                             'latin-1') # fallback encoding
-                             if enc]
+                encodings = ['utf-8', 'latin-1']
+                if locale_encoding:
+                    encodings.insert(1, locale_encoding)
         for enc in encodings:
             try:
                 decoded = unicode(data, enc, self.error_handler)
@@ -105,7 +105,7 @@ class Input(TransformSpec):
     coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
     """Encoding declaration pattern."""
 
-    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # actually 'utf-8-sig'
+    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
                         (codecs.BOM_UTF16_BE, 'utf-16-be'),
                         (codecs.BOM_UTF16_LE, 'utf-16-le'),)
     """Sequence of (start_bytes, encoding) tuples for encoding detection.
@@ -224,6 +224,15 @@ class FileInput(Input):
                     sys.exit(1)
             else:
                 self.source = sys.stdin
+        elif (sys.version_info >= (3,0) and
+              self.encoding and hasattr(self.source, 'encoding') and
+              self.encoding != self.source.encoding and
+              codecs.lookup(self.encoding) !=
+              codecs.lookup(self.source.encoding)):
+            # TODO: re-open, warn or raise error?
+            raise UnicodeError('Encoding clash: encoding given is "%s" '
+                               'but source is opened with encoding "%s".' %
+                               (self.encoding, self.source.encoding))
         if not source_path:
             try:
                 self.source_path = self.source.name
@@ -234,8 +243,25 @@ class FileInput(Input):
         """
         Read and decode a single file and return the data (Unicode string).
         """
-        try:
-            data = self.source.read()
+        try: # In Python < 2.5, try...except has to be nested in try...finally.
+            try:
+                if self.source is sys.stdin and sys.version_info >= (3,0):
+                    # read as binary data to circumvent auto-decoding
+                    data = self.source.buffer.read()
+                    # normalize newlines
+                    data = b('\n').join(data.splitlines()) + b('\n')
+                else:
+                    data = self.source.read()
+            except (UnicodeError, LookupError), err: # (in Py3k read() decodes)
+                if not self.encoding and self.source_path:
+                    # re-read in binary mode and decode with heuristics
+                    b_source = open(self.source_path, 'rb')
+                    data = b_source.read()
+                    b_source.close()
+                    # normalize newlines
+                    data = b('\n').join(data.splitlines()) + b('\n')
+                else:
+                    raise
         finally:
             if self.autoclose:
                 self.close()
@@ -245,12 +271,7 @@ class FileInput(Input):
         """
         Return lines of a single file as list of Unicode strings.
         """
-        try:
-            lines = self.source.readlines()
-        finally:
-            if self.autoclose:
-                self.close()
-        return [self.decode(line) for line in lines]
+        return self.read().splitlines(True)
 
     def close(self):
         if self.source is not sys.stdin:
@@ -302,7 +323,6 @@ class FileOutput(Output):
                       'errors': self.error_handler}
         else:
             kwargs = {}
-
         try:
             self.destination = open(self.destination_path, 'w', **kwargs)
         except IOError, error:
@@ -317,20 +337,34 @@ class FileOutput(Output):
     def write(self, data):
         """Encode `data`, write it to a single file, and return it.
 
-        In Python 3, a (unicode) string is returned.
+        In Python 3, `data` is returned unchanged.
         """
-        if sys.version_info >= (3,0):
-            output = data # in py3k, write expects a (Unicode) string
-        else:
-            output = self.encode(data)
+        if sys.version_info < (3,0):
+            data = self.encode(data)
         if not self.opened:
             self.open()
-        try:
-            self.destination.write(output)
+        try: # In Python < 2.5, try...except has to be nested in try...finally.
+            try:
+                if (sys.version_info >= (3,0) and self.encoding and
+                    hasattr(self.destination,'encoding') and
+                    self.encoding != self.destination.encoding and
+                    codecs.lookup(self.encoding) !=
+                    codecs.lookup(self.destination.encoding)):
+                    # encode self, write bytes
+                    bdata = self.encode(data)
+                    if os.linesep != '\n':
+                        bdata = bdata.replace('\n', os.linesep)
+                    sys.stdout.buffer.write(bdata)
+                else:
+                    self.destination.write(data)
+            except (UnicodeError, LookupError), err: # can only happen in py3k
+                raise UnicodeError(
+                    'Unable to encode output data. output-encoding is: '
+                    '%s.\n(%s)' % (self.encoding, ErrorString(err)))
         finally:
             if self.autoclose:
                 self.close()
-        return output
+        return data
 
     def close(self):
         if self.destination not in (sys.stdout, sys.stderr):
author	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2011-10-20 23:04:46 +0000
committer	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2011-10-20 23:04:46 +0000
commit	76be9443c22bc1291ea01585c128e1951620f030 (patch)
tree	2cffc0143845170e4c7136f3ee895c78c00b90d6 /docutils/io.py
parent	bb003e6e6aed8122d04871309fac475a3b78ecdb (diff)
download	docutils-76be9443c22bc1291ea01585c128e1951620f030.tar.gz