diff options
author | Georg Brandl <georg@python.org> | 2011-05-15 13:31:39 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2011-05-15 13:31:39 +0200 |
commit | ba9d023acf4361093666eaf1aa6e78b0ab41a891 (patch) | |
tree | 38799b17ff70402dff5ae84ec38586c9fc6f0582 | |
parent | 0dbb1d6d8311a9e829a58762a461b48d714ed4aa (diff) | |
download | sphinx-ba9d023acf4361093666eaf1aa6e78b0ab41a891.tar.gz |
Closes #657: viewcode now works correctly with source files that have non-ASCII encoding.
-rw-r--r-- | CHANGES | 3 | ||||
-rw-r--r-- | sphinx/pycode/__init__.py | 16 | ||||
-rw-r--r-- | sphinx/util/__init__.py | 54 |
3 files changed, 59 insertions, 14 deletions
@@ -1,6 +1,9 @@
 Release 1.0.8 (in development)
 ==============================

+* #657: viewcode now works correctly with source files that have
+  non-ASCII encoding.
+
 * #669: Respect the ``noindex`` flag option in py:module directives.

 * #675: Fix IndexErrors when including nonexisting lines with
diff --git a/sphinx/pycode/__init__.py b/sphinx/pycode/__init__.py
index 2d58ffd2..8271b299 100644
--- a/sphinx/pycode/__init__.py
+++ b/sphinx/pycode/__init__.py
@@ -17,7 +17,7 @@ from cStringIO import StringIO
 from sphinx.errors import PycodeError
 from sphinx.pycode import nodes
 from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
-from sphinx.util import get_module_source
+from sphinx.util import get_module_source, detect_encoding
 from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
@@ -37,10 +37,6 @@ for k, v in token.tok_name.iteritems():
 number2name = pygrammar.number2symbol.copy()
 number2name.update(token.tok_name)

-
-# a regex to recognize coding cookies
-_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
-
 _eq = nodes.Leaf(token.EQUAL, '=')
@@ -195,11 +191,10 @@ class ModuleAnalyzer(object):
         self.srcname = srcname
         # file-like object yielding source lines
         self.source = source
-        # will be changed when found by parse()
-        self.encoding = sys.getdefaultencoding()

         # cache the source code as well
         pos = self.source.tell()
+        self.encoding = detect_encoding(self.source.readline)
         self.code = self.source.read()
         self.source.seek(pos)
@@ -229,13 +224,6 @@ class ModuleAnalyzer(object):
             self.parsetree = pydriver.parse_tokens(self.tokens)
         except parse.ParseError, err:
             raise PycodeError('parsing failed', err)
-        # find the source code encoding, if present
-        comments = self.parsetree.get_prefix()
-        for line in comments.splitlines()[:2]:
-            match = _coding_re.search(line)
-            if match is not None:
-                self.encoding = match.group(1)
-                break

     def find_attr_docs(self, scope=''):
         """Find class and module-level attributes and their documentation."""
diff --git a/sphinx/util/__init__.py b/sphinx/util/__init__.py
index f4e08fbc..4749ca14 100644
--- a/sphinx/util/__init__.py
+++ b/sphinx/util/__init__.py
@@ -18,6 +18,7 @@ import tempfile
 import posixpath
 import traceback
 from os import path
+from codecs import BOM_UTF8

 import docutils
 from docutils.utils import relative_path
@@ -211,6 +212,59 @@ def get_module_source(modname):
     return 'file', filename


+# a regex to recognize coding cookies
+_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
+
+def detect_encoding(readline):
+    """Like tokenize.detect_encoding() from Py3k, but a bit simplified."""
+
+    def read_or_stop():
+        try:
+            return readline()
+        except StopIteration:
+            return None
+
+    def get_normal_name(orig_enc):
+        """Imitates get_normal_name in tokenizer.c."""
+        # Only care about the first 12 characters.
+        enc = orig_enc[:12].lower().replace('_', '-')
+        if enc == 'utf-8' or enc.startswith('utf-8-'):
+            return 'utf-8'
+        if enc in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
+           enc.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
+            return 'iso-8859-1'
+        return orig_enc
+
+    def find_cookie(line):
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+
+        matches = _coding_re.findall(line_string)
+        if not matches:
+            return None
+        return get_normal_name(matches[0])
+
+    default = sys.getdefaultencoding()
+    first = read_or_stop()
+    if first and first.startswith(BOM_UTF8):
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding
+    second = read_or_stop()
+    if not second:
+        return default
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding
+    return default
+
+
 # Low-level utility functions and classes.

 class Tee(object):