summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorGeorg Brandl <georg@python.org>2011-05-15 13:31:39 +0200
committerGeorg Brandl <georg@python.org>2011-05-15 13:31:39 +0200
commitba9d023acf4361093666eaf1aa6e78b0ab41a891 (patch)
tree38799b17ff70402dff5ae84ec38586c9fc6f0582
parent0dbb1d6d8311a9e829a58762a461b48d714ed4aa (diff)
downloadsphinx-ba9d023acf4361093666eaf1aa6e78b0ab41a891.tar.gz
Closes #657: viewcode now works correctly with source files that have non-ASCII encoding.
-rw-r--r--CHANGES3
-rw-r--r--sphinx/pycode/__init__.py16
-rw-r--r--sphinx/util/__init__.py54
3 files changed, 59 insertions, 14 deletions
diff --git a/CHANGES b/CHANGES
index de7a620c..6a013d75 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,9 @@
Release 1.0.8 (in development)
==============================
+* #657: viewcode now works correctly with source files that have
+ non-ASCII encoding.
+
* #669: Respect the ``noindex`` flag option in py:module directives.
* #675: Fix IndexErrors when including nonexisting lines with
diff --git a/sphinx/pycode/__init__.py b/sphinx/pycode/__init__.py
index 2d58ffd2..8271b299 100644
--- a/sphinx/pycode/__init__.py
+++ b/sphinx/pycode/__init__.py
@@ -17,7 +17,7 @@ from cStringIO import StringIO
from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
-from sphinx.util import get_module_source
+from sphinx.util import get_module_source, detect_encoding
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
@@ -37,10 +37,6 @@ for k, v in token.tok_name.iteritems():
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)
-
-# a regex to recognize coding cookies
-_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
-
_eq = nodes.Leaf(token.EQUAL, '=')
@@ -195,11 +191,10 @@ class ModuleAnalyzer(object):
self.srcname = srcname
# file-like object yielding source lines
self.source = source
- # will be changed when found by parse()
- self.encoding = sys.getdefaultencoding()
# cache the source code as well
pos = self.source.tell()
+ self.encoding = detect_encoding(self.source.readline)
self.code = self.source.read()
self.source.seek(pos)
@@ -229,13 +224,6 @@ class ModuleAnalyzer(object):
self.parsetree = pydriver.parse_tokens(self.tokens)
except parse.ParseError, err:
raise PycodeError('parsing failed', err)
- # find the source code encoding, if present
- comments = self.parsetree.get_prefix()
- for line in comments.splitlines()[:2]:
- match = _coding_re.search(line)
- if match is not None:
- self.encoding = match.group(1)
- break
def find_attr_docs(self, scope=''):
"""Find class and module-level attributes and their documentation."""
diff --git a/sphinx/util/__init__.py b/sphinx/util/__init__.py
index f4e08fbc..4749ca14 100644
--- a/sphinx/util/__init__.py
+++ b/sphinx/util/__init__.py
@@ -18,6 +18,7 @@ import tempfile
import posixpath
import traceback
from os import path
+from codecs import BOM_UTF8
import docutils
from docutils.utils import relative_path
@@ -211,6 +212,59 @@ def get_module_source(modname):
return 'file', filename
+# a regex to recognize coding cookies
+_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
+
+def detect_encoding(readline):
+ """Like tokenize.detect_encoding() from Py3k, but a bit simplified."""
+
+ def read_or_stop():
+ try:
+ return readline()
+ except StopIteration:
+ return None
+
+ def get_normal_name(orig_enc):
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace('_', '-')
+ if enc == 'utf-8' or enc.startswith('utf-8-'):
+ return 'utf-8'
+ if enc in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
+ enc.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
+ return 'iso-8859-1'
+ return orig_enc
+
+ def find_cookie(line):
+ try:
+ line_string = line.decode('ascii')
+ except UnicodeDecodeError:
+ return None
+
+ matches = _coding_re.findall(line_string)
+ if not matches:
+ return None
+ return get_normal_name(matches[0])
+
+ default = sys.getdefaultencoding()
+ first = read_or_stop()
+ if first and first.startswith(BOM_UTF8):
+ first = first[3:]
+ default = 'utf-8-sig'
+ if not first:
+ return default
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding
+ second = read_or_stop()
+ if not second:
+ return default
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding
+ return default
+
+
# Low-level utility functions and classes.
class Tee(object):