summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSylvain Thénault <sylvain.thenault@logilab.fr>2012-09-19 16:11:08 +0200
committerSylvain Thénault <sylvain.thenault@logilab.fr>2012-09-19 16:11:08 +0200
commit54dfc65486683aeff4d612ebc218795bd971de4b (patch)
tree6c6e35a4cfc12d5c806a80a9d8bb27e65f77eb62
parent4314608361014c4e376ac160f6097c0af56ee437 (diff)
downloadpylint-54dfc65486683aeff4d612ebc218795bd971de4b.tar.gz
[format checker] check for anomalous backslash escape (new W1401, W1402). Closes #104571
-rw-r--r--ChangeLog3
-rw-r--r--README1
-rw-r--r--checkers/__init__.py1
-rw-r--r--checkers/format.py106
-rw-r--r--test/input/func_excess_escapes.py44
-rw-r--r--test/messages/func_excess_escapes.txt8
6 files changed, 159 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index cdaec5f..d8df40b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,9 @@ ChangeLog for PyLint
====================
--
+ * #104571: check for anomalous backslash escape, introducing new
+ W1401 and W1402 messages (patch by Martin Pool)
+
* #100707: check for boolop being used as exception class, introducing
new W0711 message (patch by Tim Hatch)
diff --git a/README b/README
index 5f89aef..8123091 100644
--- a/README
+++ b/README
@@ -65,6 +65,7 @@ order doesn't matter...
* Wolfgang Grafen, Axel Muller, Fabio Zadrozny, Pierre Rouleau,
Maarten ter Huurne, Mirko Friedenhagen (among others):
bug reports, feedback, feature requests...
+* Martin Pool (Google): warnings for anomalous backslashes
* All the Logilab's team: daily use, bug reports, feature requests
* Other people have contributed by their feedback, if I've forgotten
you, send me a note !
diff --git a/checkers/__init__.py b/checkers/__init__.py
index f1271e9..519e43f 100644
--- a/checkers/__init__.py
+++ b/checkers/__init__.py
@@ -29,6 +29,7 @@ Base id of standard checkers (used in msg and report ids):
11: typecheck
12: logging
13: string_format
+14: string_constant
14-50: not yet used: reserved for future internal checkers.
51-99: perhaps used: reserved for external checkers
diff --git a/checkers/format.py b/checkers/format.py
index 316aa6c..85eb667 100644
--- a/checkers/format.py
+++ b/checkers/format.py
@@ -1,5 +1,7 @@
# Copyright (c) 2003-2010 Sylvain Thenault (thenault@gmail.com).
# Copyright (c) 2003-2012 LOGILAB S.A. (Paris, FRANCE).
+# Copyright 2012 Google Inc.
+#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
@@ -52,10 +54,10 @@ MSGS = {
'Used when more than on statement are found on the same line.'),
'C0322': ('Operator not preceded by a space\n%s',
'Used when one of the following operator (!= | <= | == | >= | < '
- '| > | = | \+= | -= | \*= | /= | %) is not preceded by a space.'),
+ '| > | = | \\+= | -= | \\*= | /= | %) is not preceded by a space.'),
'C0323': ('Operator not followed by a space\n%s',
'Used when one of the following operator (!= | <= | == | >= | < '
- '| > | = | \+= | -= | \*= | /= | %) is not followed by a space.'),
+ '| > | = | \\+= | -= | \\*= | /= | %) is not followed by a space.'),
'C0324': ('Comma not followed by a space\n%s',
'Used when a comma (",") is not followed by a space.'),
}
@@ -81,7 +83,7 @@ SQSTRING_RGX = r'"([^"\\]|\\.)*?"'
SASTRING_RGX = r"'([^'\\]|\\.)*?'"
# triple quoted string rgx
TQSTRING_RGX = r'"""([^"]|("(?!"")))*?(""")'
-# triple apostrophed string rgx # FIXME english please
+# triple apostrophe'd string rgx
TASTRING_RGX = r"'''([^']|('(?!'')))*?(''')"
# finally, the string regular expression
@@ -109,11 +111,12 @@ BAD_CONSTRUCT_RGXS = (
re.compile(OP_RGX_SEARCH_2, re.M),
'C0323'),
- (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M),
+ (re.compile(r'.*,[^(\s|\]|}|\))].*', re.M),
re.compile(r',[^\s)]', re.M),
'C0324'),
)
+_PY3K = sys.version_info >= (3, 0)
def get_string_coords(line):
"""return a list of string positions (tuple (start, end)) in the line
@@ -356,6 +359,101 @@ class FormatChecker(BaseRawChecker):
expected * unit_size))
+class StringConstantChecker(BaseRawChecker):
+ """Check string literals"""
+
+ msgs = {
+ 'W1401': ('Anomalous backslash in string: \'%s\'. '
+ 'String constant might be missing an r prefix.',
+ 'Used when a backslash is in a literal string but not as an '
+ 'escape.'),
+ 'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. '
+ 'String constant might be missing an r or u prefix.',
+ 'Used when an escape like \\u is encountered in a byte '
+ 'string where it has no effect.'),
+ }
+ name = 'string_constant'
+ __implements__ = (IRawChecker, IASTNGChecker)
+
+ # Characters that have a special meaning after a backslash in either
+ # Unicode or byte strings.
+ ESCAPE_CHARACTERS = 'abfnrtvox\n\r\t\\\'\"'
+
+ # Characters that have a special meaning after a backslash but only in
+ # Unicode strings.
+ UNICODE_ESCAPE_CHARACTERS = 'uUN'
+
+ def process_tokens(self, tokens):
+ for (tok_type, token, (start_row, start_col), _, _) in tokens:
+ if tok_type == tokenize.STRING:
+ # 'token' is the whole un-parsed token; we can look at the start
+ # of it to see whether it's a raw or unicode string etc.
+ self.process_string_token(token, start_row, start_col)
+
+ def process_string_token(self, token, start_row, start_col):
+ for i, c in enumerate(token):
+ if c in '\'\"':
+ quote_char = c
+ break
+ prefix = token[:i].lower() # markers like u, b, r.
+ after_prefix = token[i:]
+ if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char:
+ string_body = after_prefix[3:-3]
+ else:
+ string_body = after_prefix[1:-1] # Chop off quotes
+ # No special checks on raw strings at the moment.
+ if 'r' not in prefix:
+ self.process_non_raw_string_token(prefix, string_body,
+ start_row, start_col)
+
+ def process_non_raw_string_token(self, prefix, string_body, start_row,
+ start_col):
+ """check for bad escapes in a non-raw string.
+
+ prefix: lowercase string of eg 'ur' string prefix markers.
+ string_body: the un-parsed body of the string, not including the quote
+ marks.
+ start_row: integer line number in the source.
+ start_col: integer column number in the source.
+ """
+ # Walk through the string; if we see a backslash then escape the next
+ # character, and skip over it. If we see a non-escaped character,
+ # alert, and continue.
+ #
+ # Accept a backslash when it escapes a backslash, or a quote, or
+ # end-of-line, or one of the letters that introduce a special escape
+ # sequence <http://docs.python.org/reference/lexical_analysis.html>
+ #
+ # TODO(mbp): Maybe give a separate warning about the rarely-used
+ # \a \b \v \f?
+ #
+ # TODO(mbp): We could give the column of the problem character, but
+ # add_message doesn't seem to have a way to pass it through at present.
+ i = 0
+ while True:
+ i = string_body.find('\\', i)
+ if i == -1:
+ break
+ # There must be a next character; having a backslash at the end
+ # of the string would be a SyntaxError.
+ next_char = string_body[i+1]
+ match = string_body[i:i+2]
+ if next_char in self.UNICODE_ESCAPE_CHARACTERS:
+ if 'u' in prefix:
+ pass
+ elif _PY3K and 'b' not in prefix:
+ pass # unicode by default
+ else:
+ self.add_message('W1402', line=start_row, args=(match, ))
+ elif next_char not in self.ESCAPE_CHARACTERS:
+ self.add_message('W1401', line=start_row, args=(match, ))
+ # Whether it was a valid escape or not, backslash followed by
+ # another character can always be consumed whole: the second
+ # character can never be the start of a new backslash escape.
+ i += 2
+
+
def register(linter):
"""required method to auto register this checker """
linter.register_checker(FormatChecker(linter))
+ linter.register_checker(StringConstantChecker(linter))
diff --git a/test/input/func_excess_escapes.py b/test/input/func_excess_escapes.py
new file mode 100644
index 0000000..fe3dc11
--- /dev/null
+++ b/test/input/func_excess_escapes.py
@@ -0,0 +1,44 @@
+# pylint:disable=W0105, W0511
+"""Stray backslash escapes may be missing a raw-string prefix."""
+
+__revision__ = '$Id$'
+
+# Bad escape sequences, which probably don't do what you expect.
+A = "\[\]\\"
+assert '\/' == '\\/'
+ESCAPE_BACKSLASH = '\`'
+
+# Valid escape sequences.
+NEWLINE = "\n"
+OLD_ESCAPES = '\a\b\f\n\t\r\v'
+HEX = '\xad\x0a\x0d'
+OCTAL = '\o123\o000'
+UNICODE = u'\u1234'
+HIGH_UNICODE = u'\U0000abcd'
+QUOTES = '\'\"'
+LITERAL_NEWLINE = '\
+'
+ESCAPE_UNICODE = "\\\\n"
+
+# Bad docstring
+"""Even in a docstring
+
+You shouldn't have ambiguous text like: C:\Program Files\alpha
+"""
+
+# Would be valid in Unicode, but probably not what you want otherwise
+BAD_UNICODE = '\u0042'
+BAD_LONG_UNICODE = '\U00000042'
+BAD_NAMED_UNICODE = '\N{GREEK SMALL LETTER ALPHA}'
+
+GOOD_UNICODE = u'\u0042'
+GOOD_LONG_UNICODE = u'\U00000042'
+GOOD_NAMED_UNICODE = u'\N{GREEK SMALL LETTER ALPHA}'
+
+
+# Valid raw strings
+RAW_BACKSLASHES = r'raw'
+RAW_UNICODE = ur"\u0062\n"
+
+# In a comment you can have whatever you want: \ \\ \n \m
+# even things that look like bad strings: "C:\Program Files"
diff --git a/test/messages/func_excess_escapes.txt b/test/messages/func_excess_escapes.txt
new file mode 100644
index 0000000..aad9ebc
--- /dev/null
+++ b/test/messages/func_excess_escapes.txt
@@ -0,0 +1,8 @@
+W: 7: Anomalous backslash in string: '\['. String constant might be missing an r prefix.
+W: 7: Anomalous backslash in string: '\]'. String constant might be missing an r prefix.
+W: 8: Anomalous backslash in string: '\/'. String constant might be missing an r prefix.
+W: 9: Anomalous backslash in string: '\`'. String constant might be missing an r prefix.
+W: 24: Anomalous backslash in string: '\P'. String constant might be missing an r prefix.
+W: 30: Anomalous Unicode escape in byte string: '\u'. String constant might be missing an r or u prefix.
+W: 31: Anomalous Unicode escape in byte string: '\U'. String constant might be missing an r or u prefix.
+W: 32: Anomalous Unicode escape in byte string: '\N'. String constant might be missing an r or u prefix.