diff options
-rw-r--r-- | ChangeLog | 3 | ||||
-rw-r--r-- | README | 1 | ||||
-rw-r--r-- | checkers/__init__.py | 1 | ||||
-rw-r--r-- | checkers/format.py | 106 | ||||
-rw-r--r-- | test/input/func_excess_escapes.py | 44 | ||||
-rw-r--r-- | test/messages/func_excess_escapes.txt | 8 |
6 files changed, 159 insertions, 4 deletions
@@ -2,6 +2,9 @@ ChangeLog for PyLint ==================== -- + * #104571: check for anomalous backslash escape, introducing new + W1401 and W1402 messages (patch by Martin Pool) + * #100707: check for boolop being used as exception class, introducing new W0711 message (patch by Tim Hatch) @@ -65,6 +65,7 @@ order doesn't matter... * Wolfgang Grafen, Axel Muller, Fabio Zadrozny, Pierre Rouleau, Maarten ter Huurne, Mirko Friedenhagen (among others): bug reports, feedback, feature requests... +* Martin Pool (Google): warnings for anomalous backslashes * All the Logilab's team: daily use, bug reports, feature requests * Other people have contributed by their feedback, if I've forgotten you, send me a note ! diff --git a/checkers/__init__.py b/checkers/__init__.py index f1271e9..519e43f 100644 --- a/checkers/__init__.py +++ b/checkers/__init__.py @@ -29,6 +29,7 @@ Base id of standard checkers (used in msg and report ids): 11: typecheck 12: logging 13: string_format +14: string_constant 14-50: not yet used: reserved for future internal checkers. 51-99: perhaps used: reserved for external checkers diff --git a/checkers/format.py b/checkers/format.py index 316aa6c..85eb667 100644 --- a/checkers/format.py +++ b/checkers/format.py @@ -1,5 +1,7 @@ # Copyright (c) 2003-2010 Sylvain Thenault (thenault@gmail.com). # Copyright (c) 2003-2012 LOGILAB S.A. (Paris, FRANCE). +# Copyright 2012 Google Inc. +# # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation; either version 2 of the License, or (at your option) any later @@ -52,10 +54,10 @@ MSGS = { 'Used when more than on statement are found on the same line.'), 'C0322': ('Operator not preceded by a space\n%s', 'Used when one of the following operator (!= | <= | == | >= | < ' - '| > | = | \+= | -= | \*= | /= | %) is not preceded by a space.'), + '| > | = | \\+= | -= | \\*= | /= | %) is not preceded by a space.'), 'C0323': ('Operator not followed by a space\n%s', 'Used when one of the following operator (!= | <= | == | >= | < ' - '| > | = | \+= | -= | \*= | /= | %) is not followed by a space.'), + '| > | = | \\+= | -= | \\*= | /= | %) is not followed by a space.'), 'C0324': ('Comma not followed by a space\n%s', 'Used when a comma (",") is not followed by a space.'), } @@ -81,7 +83,7 @@ SQSTRING_RGX = r'"([^"\\]|\\.)*?"' SASTRING_RGX = r"'([^'\\]|\\.)*?'" # triple quoted string rgx TQSTRING_RGX = r'"""([^"]|("(?!"")))*?(""")' -# triple apostrophed string rgx # FIXME english please +# triple apostrophe'd string rgx TASTRING_RGX = r"'''([^']|('(?!'')))*?(''')" # finally, the string regular expression @@ -109,11 +111,12 @@ BAD_CONSTRUCT_RGXS = ( re.compile(OP_RGX_SEARCH_2, re.M), 'C0323'), - (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M), + (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M), re.compile(r',[^\s)]', re.M), 'C0324'), ) +_PY3K = sys.version_info >= (3, 0) def get_string_coords(line): """return a list of string positions (tuple (start, end)) in the line @@ -356,6 +359,101 @@ class FormatChecker(BaseRawChecker): expected * unit_size)) +class StringConstantChecker(BaseRawChecker): + """Check string literals""" + + msgs = { + 'W1401': ('Anomalous backslash in string: \'%s\'. ' + 'String constant might be missing an r prefix.', + 'Used when a backslash is in a literal string but not as an ' + 'escape.'), + 'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. ' + 'String constant might be missing an r or u prefix.', + 'Used when an escape like \\u is encountered in a byte ' + 'string where it has no effect.'), + } + name = 'string_constant' + __implements__ = (IRawChecker, IASTNGChecker) + + # Characters that have a special meaning after a backslash in either + # Unicode or byte strings. + ESCAPE_CHARACTERS = 'abfnrtvox\n\r\t\\\'\"' + + # Characters that have a special meaning after a backslash but only in + # Unicode strings. + UNICODE_ESCAPE_CHARACTERS = 'uUN' + + def process_tokens(self, tokens): + for (tok_type, token, (start_row, start_col), _, _) in tokens: + if tok_type == tokenize.STRING: + # 'token' is the whole un-parsed token; we can look at the start + # of it to see whether it's a raw or unicode string etc. + self.process_string_token(token, start_row, start_col) + + def process_string_token(self, token, start_row, start_col): + for i, c in enumerate(token): + if c in '\'\"': + quote_char = c + break + prefix = token[:i].lower() # markers like u, b, r. + after_prefix = token[i:] + if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char: + string_body = after_prefix[3:-3] + else: + string_body = after_prefix[1:-1] # Chop off quotes + # No special checks on raw strings at the moment. + if 'r' not in prefix: + self.process_non_raw_string_token(prefix, string_body, + start_row, start_col) + + def process_non_raw_string_token(self, prefix, string_body, start_row, + start_col): + """check for bad escapes in a non-raw string. + + prefix: lowercase string of eg 'ur' string prefix markers. + string_body: the un-parsed body of the string, not including the quote + marks. + start_row: integer line number in the source. + start_col: integer column number in the source. + """ + # Walk through the string; if we see a backslash then escape the next + # character, and skip over it. If we see a non-escaped character, + # alert, and continue. + # + # Accept a backslash when it escapes a backslash, or a quote, or + # end-of-line, or one of the letters that introduce a special escape + # sequence <http://docs.python.org/reference/lexical_analysis.html> + # + # TODO(mbp): Maybe give a separate warning about the rarely-used + # \a \b \v \f? + # + # TODO(mbp): We could give the column of the problem character, but + # add_message doesn't seem to have a way to pass it through at present. + i = 0 + while True: + i = string_body.find('\\', i) + if i == -1: + break + # There must be a next character; having a backslash at the end + # of the string would be a SyntaxError. + next_char = string_body[i+1] + match = string_body[i:i+2] + if next_char in self.UNICODE_ESCAPE_CHARACTERS: + if 'u' in prefix: + pass + elif _PY3K and 'b' not in prefix: + pass # unicode by default + else: + self.add_message('W1402', line=start_row, args=(match, )) + elif next_char not in self.ESCAPE_CHARACTERS: + self.add_message('W1401', line=start_row, args=(match, )) + # Whether it was a valid escape or not, backslash followed by + # another character can always be consumed whole: the second + # character can never be the start of a new backslash escape. + i += 2 + + def register(linter): """required method to auto register this checker """ linter.register_checker(FormatChecker(linter)) + linter.register_checker(StringConstantChecker(linter)) diff --git a/test/input/func_excess_escapes.py b/test/input/func_excess_escapes.py new file mode 100644 index 0000000..fe3dc11 --- /dev/null +++ b/test/input/func_excess_escapes.py @@ -0,0 +1,44 @@ +# pylint:disable=W0105, W0511 +"""Stray backslash escapes may be missing a raw-string prefix.""" + +__revision__ = '$Id$' + +# Bad escape sequences, which probably don't do what you expect. +A = "\[\]\\" +assert '\/' == '\\/' +ESCAPE_BACKSLASH = '\`' + +# Valid escape sequences. +NEWLINE = "\n" +OLD_ESCAPES = '\a\b\f\n\t\r\v' +HEX = '\xad\x0a\x0d' +OCTAL = '\o123\o000' +UNICODE = u'\u1234' +HIGH_UNICODE = u'\U0000abcd' +QUOTES = '\'\"' +LITERAL_NEWLINE = '\ +' +ESCAPE_UNICODE = "\\\\n" + +# Bad docstring +"""Even in a docstring + +You shouldn't have ambiguous text like: C:\Program Files\alpha +""" + +# Would be valid in Unicode, but probably not what you want otherwise +BAD_UNICODE = '\u0042' +BAD_LONG_UNICODE = '\U00000042' +BAD_NAMED_UNICODE = '\N{GREEK SMALL LETTER ALPHA}' + +GOOD_UNICODE = u'\u0042' +GOOD_LONG_UNICODE = u'\U00000042' +GOOD_NAMED_UNICODE = u'\N{GREEK SMALL LETTER ALPHA}' + + +# Valid raw strings +RAW_BACKSLASHES = r'raw' +RAW_UNICODE = ur"\u0062\n" + +# In a comment you can have whatever you want: \ \\ \n \m +# even things that look like bad strings: "C:\Program Files" diff --git a/test/messages/func_excess_escapes.txt b/test/messages/func_excess_escapes.txt new file mode 100644 index 0000000..aad9ebc --- /dev/null +++ b/test/messages/func_excess_escapes.txt @@ -0,0 +1,8 @@ +W: 7: Anomalous backslash in string: '\['. String constant might be missing an r prefix. +W: 7: Anomalous backslash in string: '\]'. String constant might be missing an r prefix. +W: 8: Anomalous backslash in string: '\/'. String constant might be missing an r prefix. +W: 9: Anomalous backslash in string: '\`'. String constant might be missing an r prefix. +W: 24: Anomalous backslash in string: '\P'. String constant might be missing an r prefix. +W: 30: Anomalous Unicode escape in byte string: '\u'. String constant might be missing an r or u prefix. +W: 31: Anomalous Unicode escape in byte string: '\U'. String constant might be missing an r or u prefix. +W: 32: Anomalous Unicode escape in byte string: '\N'. String constant might be missing an r or u prefix. |