summary | refs | log | tree | commit | diff
path: root/Lib
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2017-05-05 08:53:40 +0300
committer: GitHub <noreply@github.com> 2017-05-05 08:53:40 +0300
commit: 898ff03e1e7925ecde3da66327d3cdc7e07625ba (patch)
tree: 977fc4b98c0e85816348cebd3b12026407c368b6 /Lib
parent: 647c3d381e67490e82cdbbe6c96e46d5e1628ce2 (diff)
download: cpython-git-898ff03e1e7925ecde3da66327d3cdc7e07625ba.tar.gz
bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching.
Diffstat (limited to 'Lib')
-rw-r--r--  Lib/re.py  12
-rw-r--r--  Lib/sre_compile.py  24
-rw-r--r--  Lib/sre_constants.py  10
-rw-r--r--  Lib/test/test_re.py  32
4 files changed, 58 insertions, 20 deletions
diff --git a/Lib/re.py b/Lib/re.py
index 7053eddbe0..d0ee5db175 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -268,9 +268,7 @@ _MAXCACHE = 512
def _compile(pattern, flags):
# internal: compile pattern
try:
- p, loc = _cache[type(pattern), pattern, flags]
- if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
- return p
+ return _cache[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@ def _compile(pattern, flags):
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
_cache.clear()
- if p.flags & LOCALE:
- if not _locale:
- return p
- loc = _locale.setlocale(_locale.LC_CTYPE)
- else:
- loc = None
- _cache[type(pattern), pattern, flags] = p, loc
+ _cache[type(pattern), pattern, flags] = p
return p
@functools.lru_cache(_MAXCACHE)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2cc39007ac..d7ee4e8cb6 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@ def _compile(code, pattern, flags):
fixes = None
for op, av in pattern:
if op in LITERAL_CODES:
- if flags & SRE_FLAG_IGNORECASE:
+ if not flags & SRE_FLAG_IGNORECASE:
+ emit(op)
+ emit(av)
+ elif flags & SRE_FLAG_LOCALE:
+ emit(OP_LOC_IGNORE[op])
+ emit(av)
+ else:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
emit(IN_IGNORE)
@@ -93,17 +99,17 @@ def _compile(code, pattern, flags):
else:
emit(OP_IGNORE[op])
emit(lo)
- else:
- emit(op)
- emit(av)
elif op is IN:
- if flags & SRE_FLAG_IGNORECASE:
- emit(OP_IGNORE[op])
- def fixup(literal, flags=flags):
- return _sre.getlower(literal, flags)
- else:
+ if not flags & SRE_FLAG_IGNORECASE:
emit(op)
fixup = None
+ elif flags & SRE_FLAG_LOCALE:
+ emit(IN_LOC_IGNORE)
+ fixup = None
+ else:
+ emit(IN_IGNORE)
+ def fixup(literal, flags=flags):
+ return _sre.getlower(literal, flags)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index fc684ae96f..b0164312d0 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20140917
+MAGIC = 20170530
from _sre import MAXREPEAT, MAXGROUPS
@@ -87,6 +87,9 @@ OPCODES = _makecodes("""
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
+ LITERAL_LOC_IGNORE
+ NOT_LITERAL_LOC_IGNORE
+ IN_LOC_IGNORE
MIN_REPEAT MAX_REPEAT
""")
@@ -124,6 +127,11 @@ OP_IGNORE = {
RANGE: RANGE_IGNORE,
}
+OP_LOC_IGNORE = {
+ LITERAL: LITERAL_LOC_IGNORE,
+ NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da5c953ced..7601dc88c7 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@ SUBPATTERN None 0 0
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
+ def test_locale_compiled(self):
+ oldlocale = locale.setlocale(locale.LC_CTYPE)
+ self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+ for loc in 'en_US.iso88591', 'en_US.utf8':
+ try:
+ locale.setlocale(locale.LC_CTYPE, loc)
+ except locale.Error:
+ # Unsupported locale on this system
+ self.skipTest('test needs %s locale' % loc)
+
+ locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+ p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+ p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+ p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+ p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+ for p in p1, p2, p3:
+ self.assertTrue(p.match(b'\xc5\xe5'))
+ self.assertTrue(p.match(b'\xe5\xe5'))
+ self.assertTrue(p.match(b'\xc5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xe5'))
+ self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+ locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+ for p in p1, p2, p3:
+ self.assertTrue(p.match(b'\xc5\xe5'))
+ self.assertIsNone(p.match(b'\xe5\xe5'))
+ self.assertIsNone(p.match(b'\xc5\xc5'))
+ self.assertTrue(p4.match(b'\xe5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xe5'))
+ self.assertIsNone(p4.match(b'\xc5\xc5'))
+
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')