summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorGeorg Brandl <georg@python.org>2011-05-15 13:31:39 +0200
committerGeorg Brandl <georg@python.org>2011-05-15 13:31:39 +0200
commitba9d023acf4361093666eaf1aa6e78b0ab41a891 (patch)
tree38799b17ff70402dff5ae84ec38586c9fc6f0582
parent0dbb1d6d8311a9e829a58762a461b48d714ed4aa (diff)
downloadsphinx-ba9d023acf4361093666eaf1aa6e78b0ab41a891.tar.gz
Closes #657: viewcode now works correctly with source files that have non-ASCII encoding.
-rw-r--r--CHANGES3
-rw-r--r--sphinx/pycode/__init__.py16
-rw-r--r--sphinx/util/__init__.py54
3 files changed, 59 insertions, 14 deletions
diff --git a/CHANGES b/CHANGES
index de7a620c..6a013d75 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,9 @@
Release 1.0.8 (in development)
==============================
+* #657: viewcode now works correctly with source files that have
+ non-ASCII encoding.
+
* #669: Respect the ``noindex`` flag option in py:module directives.
* #675: Fix IndexErrors when including nonexisting lines with
diff --git a/sphinx/pycode/__init__.py b/sphinx/pycode/__init__.py
index 2d58ffd2..8271b299 100644
--- a/sphinx/pycode/__init__.py
+++ b/sphinx/pycode/__init__.py
@@ -17,7 +17,7 @@ from cStringIO import StringIO
from sphinx.errors import PycodeError
from sphinx.pycode import nodes
from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
-from sphinx.util import get_module_source
+from sphinx.util import get_module_source, detect_encoding
from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
@@ -37,10 +37,6 @@ for k, v in token.tok_name.iteritems():
number2name = pygrammar.number2symbol.copy()
number2name.update(token.tok_name)
-
-# a regex to recognize coding cookies
-_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
-
_eq = nodes.Leaf(token.EQUAL, '=')
@@ -195,11 +191,10 @@ class ModuleAnalyzer(object):
self.srcname = srcname
# file-like object yielding source lines
self.source = source
- # will be changed when found by parse()
- self.encoding = sys.getdefaultencoding()
# cache the source code as well
pos = self.source.tell()
+ self.encoding = detect_encoding(self.source.readline)
self.code = self.source.read()
self.source.seek(pos)
@@ -229,13 +224,6 @@ class ModuleAnalyzer(object):
self.parsetree = pydriver.parse_tokens(self.tokens)
except parse.ParseError, err:
raise PycodeError('parsing failed', err)
- # find the source code encoding, if present
- comments = self.parsetree.get_prefix()
- for line in comments.splitlines()[:2]:
- match = _coding_re.search(line)
- if match is not None:
- self.encoding = match.group(1)
- break
def find_attr_docs(self, scope=''):
"""Find class and module-level attributes and their documentation."""
diff --git a/sphinx/util/__init__.py b/sphinx/util/__init__.py
index f4e08fbc..4749ca14 100644
--- a/sphinx/util/__init__.py
+++ b/sphinx/util/__init__.py
@@ -18,6 +18,7 @@ import tempfile
import posixpath
import traceback
from os import path
+from codecs import BOM_UTF8
import docutils
from docutils.utils import relative_path
@@ -211,6 +212,59 @@ def get_module_source(modname):
return 'file', filename
+# a regex to recognize coding cookies
+_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
+
+def detect_encoding(readline):
+ """Like tokenize.detect_encoding() from Py3k, but a bit simplified."""
+
+ def read_or_stop():
+ try:
+ return readline()
+ except StopIteration:
+ return None
+
+ def get_normal_name(orig_enc):
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace('_', '-')
+ if enc == 'utf-8' or enc.startswith('utf-8-'):
+ return 'utf-8'
+ if enc in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
+ enc.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
+ return 'iso-8859-1'
+ return orig_enc
+
+ def find_cookie(line):
+ try:
+ line_string = line.decode('ascii')
+ except UnicodeDecodeError:
+ return None
+
+ matches = _coding_re.findall(line_string)
+ if not matches:
+ return None
+ return get_normal_name(matches[0])
+
+ default = sys.getdefaultencoding()
+ first = read_or_stop()
+ if first and first.startswith(BOM_UTF8):
+ first = first[3:]
+ default = 'utf-8-sig'
+ if not first:
+ return default
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding
+ second = read_or_stop()
+ if not second:
+ return default
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding
+ return default
+
+
# Low-level utility functions and classes.
class Tee(object):