[format checker] check for anomalous backslash escape (new W1401, W1402). Closes #104571

author: Sylvain Thénault <sylvain.thenault@logilab.fr> 2012-09-19 16:11:08 +0200
committer: Sylvain Thénault <sylvain.thenault@logilab.fr> 2012-09-19 16:11:08 +0200
commit: 54dfc65486683aeff4d612ebc218795bd971de4b (patch)
tree: 6c6e35a4cfc12d5c806a80a9d8bb27e65f77eb62
parent: 4314608361014c4e376ac160f6097c0af56ee437 (diff)
download: pylint-54dfc65486683aeff4d612ebc218795bd971de4b.tar.gz
6 files changed, 159 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index cdaec5f..d8df40b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,9 @@ ChangeLog for PyLint
 ====================
 
 --
+    * #104571: check for anomalous backslash escape, introducing new
+      W1401 and W1402 messages (patch by Martin Pool)
+
     * #100707: check for boolop being used as exception class, introducing
       new W0711 message (patch by Tim Hatch)
 
diff --git a/README b/README
index 5f89aef..8123091 100644
--- a/README
+++ b/README
@@ -65,6 +65,7 @@ order doesn't matter...
 * Wolfgang Grafen, Axel Muller, Fabio Zadrozny, Pierre Rouleau,
   Maarten ter Huurne, Mirko Friedenhagen (among others):
   bug reports, feedback, feature requests...
+* Martin Pool (Google): warnings for anomalous backslashes
 * All the Logilab's team: daily use, bug reports, feature requests
 * Other people have contributed by their feedback, if I've forgotten
   you, send me a note !
diff --git a/checkers/__init__.py b/checkers/__init__.py
index f1271e9..519e43f 100644
--- a/checkers/__init__.py
+++ b/checkers/__init__.py
@@ -29,6 +29,7 @@ Base id of standard checkers (used in msg and report ids):
 11: typecheck
 12: logging
 13: string_format
+14: string_constant
 14-50: not yet used: reserved for future internal checkers.
 51-99: perhaps used: reserved for external checkers
 
diff --git a/checkers/format.py b/checkers/format.py
index 316aa6c..85eb667 100644
--- a/checkers/format.py
+++ b/checkers/format.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2003-2010 Sylvain Thenault (thenault@gmail.com).
 # Copyright (c) 2003-2012 LOGILAB S.A. (Paris, FRANCE).
+# Copyright 2012 Google Inc.
+#
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation; either version 2 of the License, or (at your option) any later
@@ -52,10 +54,10 @@ MSGS = {
               'Used when more than on statement are found on the same line.'),
     'C0322': ('Operator not preceded by a space\n%s',
               'Used when one of the following operator (!= | <= | == | >= | < '
-              '| > | = | \+= | -= | \*= | /= | %) is not preceded by a space.'),
+              '| > | = | \\+= | -= | \\*= | /= | %) is not preceded by a space.'),
     'C0323': ('Operator not followed by a space\n%s',
               'Used when one of the following operator (!= | <= | == | >= | < '
-              '| > | = | \+= | -= | \*= | /= | %) is not followed by a space.'),
+              '| > | = | \\+= | -= | \\*= | /= | %) is not followed by a space.'),
     'C0324': ('Comma not followed by a space\n%s',
               'Used when a comma (",") is not followed by a space.'),
     }
@@ -81,7 +83,7 @@ SQSTRING_RGX = r'"([^"\\]|\\.)*?"'
 SASTRING_RGX = r"'([^'\\]|\\.)*?'"
 # triple quoted string rgx
 TQSTRING_RGX = r'"""([^"]|("(?!"")))*?(""")'
-# triple apostrophed string rgx # FIXME english please
+# triple apostrophe'd string rgx
 TASTRING_RGX = r"'''([^']|('(?!'')))*?(''')"
 
 # finally, the string regular expression
@@ -109,11 +111,12 @@ BAD_CONSTRUCT_RGXS = (
      re.compile(OP_RGX_SEARCH_2, re.M),
      'C0323'),
 
-    (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M), 
+    (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M),
      re.compile(r',[^\s)]', re.M),
      'C0324'),
     )
 
+_PY3K = sys.version_info >= (3, 0)
 
 def get_string_coords(line):
     """return a list of string positions (tuple (start, end)) in the line
@@ -356,6 +359,101 @@ class FormatChecker(BaseRawChecker):
                                    expected * unit_size))
 
 
+class StringConstantChecker(BaseRawChecker):
+    """Check string literals"""
+
+    msgs = {
+        'W1401': ('Anomalous backslash in string: \'%s\'. '
+                  'String constant might be missing an r prefix.',
+                  'Used when a backslash is in a literal string but not as an '
+                  'escape.'),
+        'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. '
+                  'String constant might be missing an r or u prefix.',
+                  'Used when an escape like \\u is encountered in a byte '
+                  'string where it has no effect.'),
+        }
+    name = 'string_constant'
+    __implements__ = (IRawChecker, IASTNGChecker)
+
+    # Characters that have a special meaning after a backslash in either
+    # Unicode or byte strings.
+    ESCAPE_CHARACTERS = 'abfnrtvox\n\r\t\\\'\"'
+
+    # Characters that have a special meaning after a backslash but only in
+    # Unicode strings.
+    UNICODE_ESCAPE_CHARACTERS = 'uUN'
+
+    def process_tokens(self, tokens):
+        for (tok_type, token, (start_row, start_col), _, _) in tokens:
+            if tok_type == tokenize.STRING:
+                # 'token' is the whole un-parsed token; we can look at the start
+                # of it to see whether it's a raw or unicode string etc.
+                self.process_string_token(token, start_row, start_col)
+
+    def process_string_token(self, token, start_row, start_col):
+        for i, c in enumerate(token):
+            if c in '\'\"':
+                quote_char = c
+                break
+        prefix = token[:i].lower()  #  markers like u, b, r.
+        after_prefix = token[i:]
+        if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char:
+            string_body = after_prefix[3:-3]
+        else:
+            string_body = after_prefix[1:-1]  # Chop off quotes
+        # No special checks on raw strings at the moment.
+        if 'r' not in prefix:
+            self.process_non_raw_string_token(prefix, string_body,
+                start_row, start_col)
+
+    def process_non_raw_string_token(self, prefix, string_body, start_row,
+        start_col):
+        """check for bad escapes in a non-raw string.
+
+        prefix: lowercase string of eg 'ur' string prefix markers.
+        string_body: the un-parsed body of the string, not including the quote
+        marks.
+        start_row: integer line number in the source.
+        start_col: integer column number in the source.
+        """
+        # Walk through the string; if we see a backslash then escape the next
+        # character, and skip over it.  If we see a non-escaped character,
+        # alert, and continue.
+        #
+        # Accept a backslash when it escapes a backslash, or a quote, or
+        # end-of-line, or one of the letters that introduce a special escape
+        # sequence <http://docs.python.org/reference/lexical_analysis.html>
+        #
+        # TODO(mbp): Maybe give a separate warning about the rarely-used
+        # \a \b \v \f?
+        #
+        # TODO(mbp): We could give the column of the problem character, but
+        # add_message doesn't seem to have a way to pass it through at present.
+        i = 0
+        while True:
+            i = string_body.find('\\', i)
+            if i == -1:
+                break
+            # There must be a next character; having a backslash at the end
+            # of the string would be a SyntaxError.
+            next_char = string_body[i+1]
+            match = string_body[i:i+2]
+            if next_char in self.UNICODE_ESCAPE_CHARACTERS:
+                if 'u' in prefix:
+                    pass
+                elif _PY3K and 'b' not in prefix:
+                    pass  # unicode by default
+                else:
+                    self.add_message('W1402', line=start_row, args=(match, ))
+            elif next_char not in self.ESCAPE_CHARACTERS:
+                self.add_message('W1401', line=start_row, args=(match, ))
+            # Whether it was a valid escape or not, backslash followed by
+            # another character can always be consumed whole: the second
+            # character can never be the start of a new backslash escape.
+            i += 2
+
+
 def register(linter):
     """required method to auto register this checker """
     linter.register_checker(FormatChecker(linter))
+    linter.register_checker(StringConstantChecker(linter))
diff --git a/test/input/func_excess_escapes.py b/test/input/func_excess_escapes.py
new file mode 100644
index 0000000..fe3dc11
--- /dev/null
+++ b/test/input/func_excess_escapes.py
@@ -0,0 +1,44 @@
+# pylint:disable=W0105, W0511
+"""Stray backslash escapes may be missing a raw-string prefix."""
+
+__revision__ = '$Id$'
+
+# Bad escape sequences, which probably don't do what you expect.
+A = "\[\]\\"
+assert '\/' == '\\/'
+ESCAPE_BACKSLASH = '\`'
+
+# Valid escape sequences.
+NEWLINE = "\n"
+OLD_ESCAPES = '\a\b\f\n\t\r\v'
+HEX = '\xad\x0a\x0d'
+OCTAL = '\o123\o000'
+UNICODE = u'\u1234'
+HIGH_UNICODE = u'\U0000abcd'
+QUOTES = '\'\"'
+LITERAL_NEWLINE = '\
+'
+ESCAPE_UNICODE = "\\\\n"
+
+# Bad docstring
+"""Even in a docstring
+
+You shouldn't have ambiguous text like: C:\Program Files\alpha
+"""
+
+# Would be valid in Unicode, but probably not what you want otherwise
+BAD_UNICODE = '\u0042'
+BAD_LONG_UNICODE = '\U00000042'
+BAD_NAMED_UNICODE = '\N{GREEK SMALL LETTER ALPHA}'
+
+GOOD_UNICODE = u'\u0042'
+GOOD_LONG_UNICODE = u'\U00000042'
+GOOD_NAMED_UNICODE = u'\N{GREEK SMALL LETTER ALPHA}'
+
+
+# Valid raw strings
+RAW_BACKSLASHES = r'raw'
+RAW_UNICODE = ur"\u0062\n"
+
+# In a comment you can have whatever you want: \ \\ \n \m
+# even things that look like bad strings: "C:\Program Files"
diff --git a/test/messages/func_excess_escapes.txt b/test/messages/func_excess_escapes.txt
new file mode 100644
index 0000000..aad9ebc
--- /dev/null
+++ b/test/messages/func_excess_escapes.txt
@@ -0,0 +1,8 @@
+W:  7: Anomalous backslash in string: '\['. String constant might be missing an r prefix.
+W:  7: Anomalous backslash in string: '\]'. String constant might be missing an r prefix.
+W:  8: Anomalous backslash in string: '\/'. String constant might be missing an r prefix.
+W:  9: Anomalous backslash in string: '\`'. String constant might be missing an r prefix.
+W: 24: Anomalous backslash in string: '\P'. String constant might be missing an r prefix.
+W: 30: Anomalous Unicode escape in byte string: '\u'. String constant might be missing an r or u prefix.
+W: 31: Anomalous Unicode escape in byte string: '\U'. String constant might be missing an r or u prefix.
+W: 32: Anomalous Unicode escape in byte string: '\N'. String constant might be missing an r or u prefix.
author	Sylvain Thénault <sylvain.thenault@logilab.fr>	2012-09-19 16:11:08 +0200
committer	Sylvain Thénault <sylvain.thenault@logilab.fr>	2012-09-19 16:11:08 +0200
commit	54dfc65486683aeff4d612ebc218795bd971de4b (patch)
tree	6c6e35a4cfc12d5c806a80a9d8bb27e65f77eb62
parent	4314608361014c4e376ac160f6097c0af56ee437 (diff)
download	pylint-54dfc65486683aeff4d612ebc218795bd971de4b.tar.gz