diff options
Diffstat (limited to 'Lib')
| -rw-r--r-- | Lib/sre_compile.py | 171 | ||||
| -rw-r--r-- | Lib/test/test_re.py | 9 | 
2 files changed, 111 insertions, 69 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index db8b8a2778..cebecb93c0 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -69,13 +69,16 @@ def _compile(code, pattern, flags):      REPEATING_CODES = _REPEATING_CODES      SUCCESS_CODES = _SUCCESS_CODES      ASSERT_CODES = _ASSERT_CODES +    iscased = None      tolower = None      fixes = None      if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:          if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: +            iscased = _sre.unicode_iscased              tolower = _sre.unicode_tolower              fixes = _ignorecase_fixes          else: +            iscased = _sre.ascii_iscased              tolower = _sre.ascii_tolower      for op, av in pattern:          if op in LITERAL_CODES: @@ -85,6 +88,9 @@ def _compile(code, pattern, flags):              elif flags & SRE_FLAG_LOCALE:                  emit(OP_LOC_IGNORE[op])                  emit(av) +            elif not iscased(av): +                emit(op) +                emit(av)              else:                  lo = tolower(av)                  if fixes and lo in fixes: @@ -101,14 +107,15 @@ def _compile(code, pattern, flags):                      emit(OP_IGNORE[op])                      emit(lo)          elif op is IN: -            if not flags & SRE_FLAG_IGNORECASE: -                emit(op) -            elif flags & SRE_FLAG_LOCALE: +            charset, hascased = _optimize_charset(av, iscased, tolower, fixes) +            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:                  emit(IN_LOC_IGNORE) -            else: +            elif hascased:                  emit(IN_IGNORE) +            else: +                emit(IN)              skip = _len(code); emit(0) -            _compile_charset(av, flags, code, tolower, fixes) +            _compile_charset(charset, flags, code)              code[skip] = _len(code) - skip          elif op is ANY:              if flags & SRE_FLAG_DOTALL: @@ -223,10 +230,10 @@ def _compile(code, pattern, flags):          else:              raise error("internal: unsupported operand type %r" % (op,)) -def _compile_charset(charset, flags, code, fixup=None, fixes=None): +def _compile_charset(charset, flags, code):      # compile charset subprogram      emit = code.append -    for op, av in _optimize_charset(charset, fixup, fixes): +    for op, av in charset:          emit(op)          if op is NEGATE:              pass @@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):              raise error("internal: unsupported set operator %r" % (op,))      emit(FAILURE) -def _optimize_charset(charset, fixup, fixes): +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):      # internal: optimize character set      out = []      tail = []      charmap = bytearray(256) +    hascased = False      for op, av in charset:          while True:              try: @@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):                          if fixes and lo in fixes:                              for k in fixes[lo]:                                  charmap[k] = 1 +                        if not hascased and iscased(av): +                            hascased = True                      else:                          charmap[av] = 1                  elif op is RANGE:                      r = range(av[0], av[1]+1)                      if fixup: -                        r = map(fixup, r) -                    if fixup and fixes: -                        for i in r: -                            charmap[i] = 1 -                            if i in fixes: -                                for k in fixes[i]: -                                    charmap[k] = 1 +                        if fixes: +                            for i in map(fixup, r): +                                charmap[i] = 1 +                                if i in fixes: +                                    for k in fixes[i]: +                                        charmap[k] = 1 +                        else: +                            for i in map(fixup, r): +                                charmap[i] = 1 +                        if not hascased: +                            hascased = any(map(iscased, r))                      else:                          for i in r:                              charmap[i] = 1 @@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):                      charmap += b'\0' * 0xff00                      continue                  # Character set contains non-BMP character codes. -                # There are only two ranges of cased non-BMP characters: -                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), -                # and for both ranges RANGE_IGNORE works. -                if fixup and op is RANGE: -                    op = RANGE_IGNORE +                if fixup: +                    hascased = True +                    # There are only two ranges of cased non-BMP characters: +                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), +                    # and for both ranges RANGE_IGNORE works. +                    if op is RANGE: +                        op = RANGE_IGNORE                  tail.append((op, av))              break @@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):                  out.append((RANGE, (p, q - 1)))          out += tail          # if the case was changed or new representation is more compact -        if fixup or len(out) < len(charset): -            return out +        if hascased or len(out) < len(charset): +            return out, hascased          # else original character set is good enough -        return charset +        return charset, hascased      # use bitmap      if len(charmap) == 256:          data = _mk_bitmap(charmap)          out.append((CHARSET, data))          out += tail -        return out +        return out, hascased      # To represent a big charset, first a bitmap of all characters in the      # set is constructed. Then, this bitmap is sliced into chunks of 256 @@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):      data[0:0] = [block] + _bytes_to_codes(mapping)      out.append((BIGCHARSET, data))      out += tail -    return out +    return out, hascased  _CODEBITS = _sre.CODESIZE * 8  MAXCODE = (1 << _CODEBITS) - 1 @@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):              table[i] = idx + 1      return table -def _get_literal_prefix(pattern): +def _get_iscased(flags): +    if not flags & SRE_FLAG_IGNORECASE: +        return None +    elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: +        return _sre.unicode_iscased +    else: +        return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags):      # look for literal prefix      prefix = []      prefixappend = prefix.append      prefix_skip = None +    iscased = _get_iscased(flags)      for op, av in pattern.data:          if op is LITERAL: +            if iscased and iscased(av): +                break              prefixappend(av)          elif op is SUBPATTERN:              group, add_flags, del_flags, p = av -            if add_flags & SRE_FLAG_IGNORECASE: +            flags1 = (flags | add_flags) & ~del_flags +            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:                  break -            prefix1, prefix_skip1, got_all = _get_literal_prefix(p) +            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)              if prefix_skip is None:                  if group is not None:                      prefix_skip = len(prefix) @@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):          return prefix, prefix_skip, True      return prefix, prefix_skip, False -def _get_charset_prefix(pattern): -    charset = [] # not used -    charsetappend = charset.append -    if pattern.data: +def _get_charset_prefix(pattern, flags): +    while True: +        if not pattern.data: +            return None          op, av = pattern.data[0] -        if op is SUBPATTERN: -            group, add_flags, del_flags, p = av -            if p and not (add_flags & SRE_FLAG_IGNORECASE): -                op, av = p[0] -                if op is LITERAL: -                    charsetappend((op, av)) -                elif op is BRANCH: -                    c = [] -                    cappend = c.append -                    for p in av[1]: -                        if not p: -                            break -                        op, av = p[0] -                        if op is LITERAL: -                            cappend((op, av)) -                        else: -                            break -                    else: -                        charset = c -        elif op is BRANCH: -            c = [] -            cappend = c.append -            for p in av[1]: -                if not p: -                    break -                op, av = p[0] -                if op is LITERAL: -                    cappend((op, av)) -                else: -                    break +        if op is not SUBPATTERN: +            break +        group, add_flags, del_flags, pattern = av +        flags = (flags | add_flags) & ~del_flags +        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: +            return None + +    iscased = _get_iscased(flags) +    if op is LITERAL: +        if iscased and iscased(av): +            return None +        return [(op, av)] +    elif op is BRANCH: +        charset = [] +        charsetappend = charset.append +        for p in av[1]: +            if not p: +                return None +            op, av = p[0] +            if op is LITERAL and not (iscased and iscased(av)): +                charsetappend((op, av))              else: -                charset = c -        elif op is IN: -            charset = av -    return charset +                return None +        return charset +    elif op is IN: +        charset = av +        if iscased: +            for op, av in charset: +                if op is LITERAL: +                    if iscased(av): +                        return None +                elif op is RANGE: +                    if av[1] > 0xffff: +                        return None +                    if any(map(iscased, range(av[0], av[1]+1))): +                        return None +        return charset +    return None  def _compile_info(code, pattern, flags):      # internal: compile an info block.  in the current version, @@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):      prefix = []      prefix_skip = 0      charset = [] # not used -    if not (flags & SRE_FLAG_IGNORECASE): +    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):          # look for literal prefix -        prefix, prefix_skip, got_all = _get_literal_prefix(pattern) +        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)          # if no prefix, look for charset prefix          if not prefix: -            charset = _get_charset_prefix(pattern) +            charset = _get_charset_prefix(pattern, flags)  ##     if prefix:  ##         print("*** PREFIX", prefix, prefix_skip)  ##     if charset: @@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):          # generate overlap table          code.extend(_generate_overlap_table(prefix))      elif charset: +        charset, hascased = _optimize_charset(charset) +        assert not hascased          _compile_charset(charset, flags, code)      code[skip] = len(code) - skip diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index b5b7cff9a2..3129f7e988 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -891,15 +891,24 @@ class ReTests(unittest.TestCase):              lo = ord(c.lower())              self.assertEqual(_sre.ascii_tolower(i), lo)              self.assertEqual(_sre.unicode_tolower(i), lo) +            iscased = c in string.ascii_letters +            self.assertEqual(_sre.ascii_iscased(i), iscased) +            self.assertEqual(_sre.unicode_iscased(i), iscased)          for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:              c = chr(i)              self.assertEqual(_sre.ascii_tolower(i), i)              if i != 0x0130:                  self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) +            iscased = c != c.lower() or c != c.upper() +            self.assertFalse(_sre.ascii_iscased(i)) +            self.assertEqual(_sre.unicode_iscased(i), +                             c != c.lower() or c != c.upper())          self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)          self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) +        self.assertFalse(_sre.ascii_iscased(0x0130)) +        self.assertTrue(_sre.unicode_iscased(0x0130))      def test_not_literal(self):          self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")  | 
