diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-05-05 08:53:40 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-05-05 08:53:40 +0300 |
commit | 898ff03e1e7925ecde3da66327d3cdc7e07625ba (patch) | |
tree | 977fc4b98c0e85816348cebd3b12026407c368b6 /Lib | |
parent | 647c3d381e67490e82cdbbe6c96e46d5e1628ce2 (diff) | |
download | cpython-git-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.gz |
bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time. Only the locale at matching
time affects the result of matching.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/re.py | 12 | ||||
-rw-r--r-- | Lib/sre_compile.py | 24 | ||||
-rw-r--r-- | Lib/sre_constants.py | 10 | ||||
-rw-r--r-- | Lib/test/test_re.py | 32 |
4 files changed, 58 insertions, 20 deletions
diff --git a/Lib/re.py b/Lib/re.py
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -268,9 +268,7 @@ _MAXCACHE = 512
 def _compile(pattern, flags):
     # internal: compile pattern
     try:
-        p, loc = _cache[type(pattern), pattern, flags]
-        if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
-            return p
+        return _cache[type(pattern), pattern, flags]
     except KeyError:
         pass
     if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@ def _compile(pattern, flags):
     if not (flags & DEBUG):
         if len(_cache) >= _MAXCACHE:
             _cache.clear()
-        if p.flags & LOCALE:
-            if not _locale:
-                return p
-            loc = _locale.setlocale(_locale.LC_CTYPE)
-        else:
-            loc = None
-        _cache[type(pattern), pattern, flags] = p, loc
+        _cache[type(pattern), pattern, flags] = p
     return p
 
 @functools.lru_cache(_MAXCACHE)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2cc39007ac..d7ee4e8cb6 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
         fixes = None
     for op, av in pattern:
         if op in LITERAL_CODES:
-            if flags & SRE_FLAG_IGNORECASE:
+            if not flags & SRE_FLAG_IGNORECASE:
+                emit(op)
+                emit(av)
+            elif flags & SRE_FLAG_LOCALE:
+                emit(OP_LOC_IGNORE[op])
+                emit(av)
+            else:
                 lo = _sre.getlower(av, flags)
                 if fixes and lo in fixes:
                     emit(IN_IGNORE)
@@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
                 else:
                     emit(OP_IGNORE[op])
                     emit(lo)
-            else:
-                emit(op)
-                emit(av)
         elif op is IN:
-            if flags & SRE_FLAG_IGNORECASE:
-                emit(OP_IGNORE[op])
-                def fixup(literal, flags=flags):
-                    return _sre.getlower(literal, flags)
-            else:
+            if not flags & SRE_FLAG_IGNORECASE:
                 emit(op)
                 fixup = None
+            elif flags & SRE_FLAG_LOCALE:
+                emit(IN_LOC_IGNORE)
+                fixup = None
+            else:
+                emit(IN_IGNORE)
+                def fixup(literal, flags=flags):
+                    return _sre.getlower(literal, flags)
             skip = _len(code); emit(0)
             _compile_charset(av, flags, code, fixup, fixes)
             code[skip] = _len(code) - skip
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index fc684ae96f..b0164312d0 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20140917
+MAGIC = 20170530
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -87,6 +87,9 @@ OPCODES = _makecodes("""
     SUBPATTERN
     MIN_REPEAT_ONE
     RANGE_IGNORE
+    LITERAL_LOC_IGNORE
+    NOT_LITERAL_LOC_IGNORE
+    IN_LOC_IGNORE
 
     MIN_REPEAT MAX_REPEAT
 """)
@@ -124,6 +127,11 @@ OP_IGNORE = {
     RANGE: RANGE_IGNORE,
 }
 
+OP_LOC_IGNORE = {
+    LITERAL: LITERAL_LOC_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
 AT_MULTILINE = {
     AT_BEGINNING: AT_BEGINNING_LINE,
     AT_END: AT_END_LINE
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da5c953ced..7601dc88c7 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
         self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
         self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
 
+    def test_locale_compiled(self):
+        oldlocale = locale.setlocale(locale.LC_CTYPE)
+        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+        for loc in 'en_US.iso88591', 'en_US.utf8':
+            try:
+                locale.setlocale(locale.LC_CTYPE, loc)
+            except locale.Error:
+                # Unsupported locale on this system
+                self.skipTest('test needs %s locale' % loc)
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertTrue(p.match(b'\xe5\xe5'))
+            self.assertTrue(p.match(b'\xc5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertIsNone(p.match(b'\xe5\xe5'))
+            self.assertIsNone(p.match(b'\xc5\xc5'))
+        self.assertTrue(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
     def test_error(self):
         with self.assertRaises(re.error) as cm:
             re.compile('(\u20ac))')