diff options
| author | Benjamin Peterson <benjamin@python.org> | 2009-10-09 21:43:09 +0000 | 
|---|---|---|
| committer | Benjamin Peterson <benjamin@python.org> | 2009-10-09 21:43:09 +0000 | 
| commit | d3afadaa4908df544e0181c11199e59b1bfb5c37 (patch) | |
| tree | 5b214ec4a85f64411b50dd40499bf9a7691d4a5f /Lib/tokenize.py | |
| parent | ffc08fcad6d91a50224914e94eae6505b2e55548 (diff) | |
| download | cpython-git-d3afadaa4908df544e0181c11199e59b1bfb5c37.tar.gz | |
normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does
Diffstat (limited to 'Lib/tokenize.py')
| -rw-r--r-- | Lib/tokenize.py | 13 | 
1 files changed, 12 insertions, 1 deletions
```diff
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index f83bda522a..fb58c6b77a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out
 
 
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
```
