diff options
-rw-r--r-- | Doc/library/re.rst | 16 | ||||
-rw-r--r-- | Doc/tools/susp-ignored.csv | 2 | ||||
-rw-r--r-- | Doc/whatsnew/3.7.rst | 11 | ||||
-rw-r--r-- | Lib/email/_header_value_parser.py | 9 | ||||
-rw-r--r-- | Lib/re.py | 3 | ||||
-rw-r--r-- | Lib/sre_parse.py | 24 | ||||
-rw-r--r-- | Lib/test/test_re.py | 47 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst | 3 |
8 files changed, 106 insertions, 9 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst index cbb2f439d1..8c15462871 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -200,6 +200,20 @@ The special characters are: place it at the beginning of the set. For example, both ``[()[\]{}]`` and ``[]()[{}]`` will both match a parenthesis. + * Support of nested sets and set operations as in `Unicode Technical + Standard #18`_ might be added in the future. This would change the + syntax, so to facilitate this change a :exc:`FutureWarning` will be raised + in ambiguous cases for the time being. + That includes sets starting with a literal ``'['`` or containing literal + character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To + avoid a warning escape them with a backslash. + + .. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/ + + .. versionchanged:: 3.7 + :exc:`FutureWarning` is raised if a character set contains constructs + that will change semantically in the future. + ``|`` ``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that will match either *A* or *B*. An arbitrary number of REs can be separated by the @@ -829,7 +843,7 @@ form. 
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:" >>> print('[%s]+' % re.escape(legal_chars)) - [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+ + [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%\&'\*\+\-\.\^_`\|\~:]+ >>> operators = ['+', '-', '*', '/', '**'] >>> print('|'.join(map(re.escape, sorted(operators, reverse=True)))) diff --git a/Doc/tools/susp-ignored.csv b/Doc/tools/susp-ignored.csv index 2b3ccf3ac6..d52f81b76b 100644 --- a/Doc/tools/susp-ignored.csv +++ b/Doc/tools/susp-ignored.csv @@ -300,7 +300,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a whatsnew/3.2,,:location,zope9-location = ${zope9:location} whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf library/re,,`,!#$%&'*+-.^_`|~: -library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~: +library/re,,`,!\#\$%\&'\*\+\-\.\^_`\|\~: library/tarfile,,:xz,'x:xz' library/xml.etree.elementtree,,:sometag,prefix:sometag library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com""" diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index a2fea50d09..9d63540e63 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -700,6 +700,17 @@ Changes in the Python API argument ``os.scandir`` instead of ``os.listdir`` when listing the direcory is failed. +* Support of nested sets and set operations in regular expressions as in + `Unicode Technical Standard #18`_ might be added in the future. This would + change the syntax, so to facilitate this change a :exc:`FutureWarning` will + be raised in ambiguous cases for the time being. + That includes sets starting with a literal ``'['`` or containing literal + character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To + avoid a warning escape them with a backslash. + (Contributed by Serhiy Storchaka in :issue:`30349`.) + +.. 
_Unicode Technical Standard #18: https://unicode.org/reports/tr18/ + Changes in the C API -------------------- diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 9b9697f773..b4737c806e 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1354,15 +1354,14 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker') _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split _non_atom_end_matcher = re.compile(r"[^{}]+".format( - ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match + re.escape(''.join(ATOM_ENDS)))).match _non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall _non_token_end_matcher = re.compile(r"[^{}]+".format( - ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match + re.escape(''.join(TOKEN_ENDS)))).match _non_attribute_end_matcher = re.compile(r"[^{}]+".format( - ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match + re.escape(''.join(ATTRIBUTE_ENDS)))).match _non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format( - ''.join(EXTENDED_ATTRIBUTE_ENDS).replace( - '\\','\\\\').replace(']',r'\]'))).match + re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match def _validate_xtext(xtext): """If input token contains ASCII non-printables, register a defect.""" @@ -251,8 +251,9 @@ def template(pattern, flags=0): # SPECIAL_CHARS # closing ')', '}' and ']' # '-' (a range in character set) +# '&', '~', (extended character set operations) # '#' (comment) and WHITESPACE (ignored) in verbose mode -_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'} +_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'} def escape(pattern): """ diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 8527412293..a53735b07d 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -517,6 +517,12 @@ def _parse(source, state, verbose, nested, first=False): setappend = set.append 
## if sourcematch(":"): ## pass # handle character classes + if source.next == '[': + import warnings + warnings.warn( + 'Possible nested set at position %d' % source.tell(), + FutureWarning, stacklevel=nested + 6 + ) negate = sourcematch("^") # check remaining characters while True: @@ -529,6 +535,17 @@ def _parse(source, state, verbose, nested, first=False): elif this[0] == "\\": code1 = _class_escape(source, this) else: + if set and this in '-&~|' and source.next == this: + import warnings + warnings.warn( + 'Possible set %s at position %d' % ( + 'difference' if this == '-' else + 'intersection' if this == '&' else + 'symmetric difference' if this == '~' else + 'union', + source.tell() - 1), + FutureWarning, stacklevel=nested + 6 + ) code1 = LITERAL, _ord(this) if sourcematch("-"): # potential range @@ -545,6 +562,13 @@ def _parse(source, state, verbose, nested, first=False): if that[0] == "\\": code2 = _class_escape(source, that) else: + if that == '-': + import warnings + warnings.warn( + 'Possible set difference at position %d' % ( + source.tell() - 2), + FutureWarning, stacklevel=nested + 6 + ) code2 = LITERAL, _ord(that) if code1[0] != LITERAL or code2[0] != LITERAL: msg = "bad character range %s-%s" % (this, that) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index fc015e4ed9..ee87446b79 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -914,6 +914,51 @@ class ReTests(unittest.TestCase): self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") + def test_possible_set_operations(self): + s = bytes(range(128)).decode() + with self.assertWarns(FutureWarning): + p = re.compile(r'[0-9--1]') + self.assertEqual(p.findall(s), list('-./0123456789')) + self.assertEqual(re.findall(r'[--1]', s), list('-./01')) + with self.assertWarns(FutureWarning): + p = re.compile(r'[%--1]') + self.assertEqual(p.findall(s), list("%&'()*+,-1")) + with self.assertWarns(FutureWarning): + p = 
re.compile(r'[%--]') + self.assertEqual(p.findall(s), list("%&'()*+,-")) + + with self.assertWarns(FutureWarning): + p = re.compile(r'[0-9&&1]') + self.assertEqual(p.findall(s), list('&0123456789')) + with self.assertWarns(FutureWarning): + p = re.compile(r'[\d&&1]') + self.assertEqual(p.findall(s), list('&0123456789')) + self.assertEqual(re.findall(r'[&&1]', s), list('&1')) + + with self.assertWarns(FutureWarning): + p = re.compile(r'[0-9||a]') + self.assertEqual(p.findall(s), list('0123456789a|')) + with self.assertWarns(FutureWarning): + p = re.compile(r'[\d||a]') + self.assertEqual(p.findall(s), list('0123456789a|')) + self.assertEqual(re.findall(r'[||1]', s), list('1|')) + + with self.assertWarns(FutureWarning): + p = re.compile(r'[0-9~~1]') + self.assertEqual(p.findall(s), list('0123456789~')) + with self.assertWarns(FutureWarning): + p = re.compile(r'[\d~~1]') + self.assertEqual(p.findall(s), list('0123456789~')) + self.assertEqual(re.findall(r'[~~1]', s), list('1~')) + + with self.assertWarns(FutureWarning): + p = re.compile(r'[[0-9]|]') + self.assertEqual(p.findall(s), list('0123456789[]')) + + with self.assertWarns(FutureWarning): + p = re.compile(r'[[:digit:]|]') + self.assertEqual(p.findall(s), list(':[]dgit')) + def test_search_coverage(self): self.assertEqual(re.search(r"\s(b)", " b").group(1), "b") self.assertEqual(re.search(r"a\s", "a ").group(0), "a ") @@ -932,7 +977,7 @@ class ReTests(unittest.TestCase): self.assertEqual(m.group(), match) self.assertEqual(m.span(), span) - LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~' + LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`' def test_re_escape(self): p = ''.join(chr(i) for i in range(256)) diff --git a/Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst b/Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst new file mode 100644 index 0000000000..6862e02502 --- /dev/null +++ 
b/Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst @@ -0,0 +1,3 @@ +FutureWarning is now emitted if a regular expression contains character set +constructs that will change semantically in the future (nested sets and set +operations). |