diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2014-10-10 11:14:49 +0300 |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2014-10-10 11:14:49 +0300 |
commit | e2ccf5608ca08f6b5adb8a3fd4d92bf9a450b0a8 (patch) | |
tree | c4d93f671b86fbe0f612997aa0941b07422a1f81 /Lib/sre_parse.py | |
parent | 5aa47443c60deb5cafbf6ad195bf49e364cbce4c (diff) | |
download | cpython-git-e2ccf5608ca08f6b5adb8a3fd4d92bf9a450b0a8.tar.gz |
Issue #19380: Optimized parsing of regular expressions.
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 268 |
1 files changed, 119 insertions, 149 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 063d1b7fda..68efece306 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -18,12 +18,15 @@ from _sre import MAXREPEAT SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -DIGITS = set("0123456789") +DIGITS = frozenset("0123456789") -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") -WHITESPACE = set(" \t\n\r\v\f") +WHITESPACE = frozenset(" \t\n\r\v\f") + +_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT)) +_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -153,11 +156,9 @@ class SubPattern: self.data.append(code) def getwidth(self): # determine the width (min, max) for this subpattern - if self.width: + if self.width is not None: return self.width lo = hi = 0 - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) for op, av in self.data: if op is BRANCH: i = MAXREPEAT - 1 @@ -176,11 +177,11 @@ class SubPattern: i, j = av[1].getwidth() lo = lo + i hi = hi + j - elif op in REPEATCODES: + elif op in _REPEATCODES: i, j = av[2].getwidth() lo = lo + i * av[0] hi = hi + j * av[1] - elif op in UNITCODES: + elif op in _UNITCODES: lo = lo + 1 hi = hi + 1 elif op == SUCCESS: @@ -191,34 +192,31 @@ class SubPattern: class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str) + if not self.istext: + string = str(string, 'latin1') self.string = string self.index = 0 self.__next() def __next(self): - if self.index >= len(self.string): + index = self.index + try: + char = self.string[index] + except IndexError: self.next = None return - char = self.string[self.index:self.index+1] - # Special case for the str8, since indexing returns a integer - # XXX This is only needed for test_bug_926075 in test_re.py - if char and not self.istext: - char = chr(char[0]) if char == "\\": + index += 1 try: - c = self.string[self.index + 1] + char += self.string[index] except IndexError: raise error("bogus escape (end of line)") - if not self.istext: - c = chr(c) - char = char + c - self.index = self.index + len(char) + self.index = index + 1 self.next = char - def match(self, char, skip=1): + def match(self, char): if char == self.next: - if skip: - self.__next() - return 1 - return 0 + self.__next() + return True + return False def get(self): this = self.next self.__next() @@ -232,6 +230,17 @@ class Tokenizer: result += c self.__next() return result + def getuntil(self, terminator): + result = '' + while True: + c = self.next + self.__next() + if c is None: + raise error("unterminated name") + if c == terminator: + break + result += c + return result def tell(self): return self.index, self.next def seek(self, index): @@ -270,7 +279,7 @@ def _class_escape(source, escape): if code: return code code = CATEGORIES.get(escape) - if code and code[0] == IN: + if code and code[0] is IN: return code try: c = escape[1:2] @@ -279,7 +288,7 @@ def _class_escape(source, escape): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -325,7 +334,7 @@ def _escape(source, escape, state): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -347,11 +356,11 @@ def _escape(source, escape, state): elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: - escape = escape + source.get() + escape += source.get() if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and source.next in OCTDIGITS): # got three octal digits; this is an octal escape - escape = escape + source.get() + escape += source.get() c = int(escape[1:], 8) if c > 0o377: raise error('octal escape value %r outside of ' @@ -370,22 +379,18 @@ def _escape(source, escape, state): pass raise error("bogus escape: %s" % repr(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match - while 1: + while True: itemsappend(_parse(source, state)) - if sourcematch("|"): - continue - if not nested: + if not sourcematch("|"): break - if not source.next or sourcematch(")", 0): - break - else: - raise error("pattern not properly closed") + if nested and source.next is not None and source.next != ")": + raise error("pattern not properly closed") if len(items) == 1: return items[0] @@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1): subpatternappend = subpattern.append # check if all items share a common prefix - while 1: + while True: prefix = None for item in items: if not item: @@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1): # check if the branch can be replaced by a character set for item in items: - if len(item) != 1 or item[0][0] != LITERAL: + if len(item) != 1 or item[0][0] is not LITERAL: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) + subpatternappend((IN, [item[0] for item in items])) return subpattern subpattern.append((BRANCH, (None, items))) @@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condgroup): item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state) - if source.match("|"): + if source.next == "|": raise error("conditional backref with more than two branches") else: item_no = None - if source.next and not source.match(")", 0): + if source.next is not None and source.next != ")": raise error("pattern not properly closed") subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) - def _parse(source, state): # parse a simple pattern subpattern = SubPattern(state) @@ -457,32 +453,35 @@ def _parse(source, state): sourceget = source.get sourcematch = source.match _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES + _ord = ord + verbose = state.flags & SRE_FLAG_VERBOSE - while 1: + while True: - if source.next in PATTERNENDERS: - break # end of subpattern - this = sourceget() + this = source.next if this is None: break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() - if state.flags & SRE_FLAG_VERBOSE: + if verbose: # skip whitespace and comments if this in WHITESPACE: continue if this == "#": - while 1: + while True: this = sourceget() - if this in (None, "\n"): + if this is None or this == "\n": break continue - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) elif this == "[": # character set @@ -494,39 +493,38 @@ def _parse(source, state): setappend((NEGATE, None)) # check remaining characters start = set[:] - while 1: + while True: this = sourceget() + if this is None: + raise error("unexpected end of regular expression") if this == "]" and set != start: break - elif this and this[0] == "\\": + elif this[0] == "\\": code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, ord(this) else: - raise error("unexpected end of regular expression") + code1 = LITERAL, _ord(this) if sourcematch("-"): # potential range this = sourceget() + if this is None: + raise error("unexpected end of regular expression") if this == "]": if code1[0] is IN: code1 = code1[1][0] setappend(code1) - setappend((LITERAL, ord("-"))) + setappend((LITERAL, _ord("-"))) break - elif this: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error("bad character range") - lo = code1[1] - hi = code2[1] - if hi < lo: - raise error("bad character range") - setappend((RANGE, (lo, hi))) + if this[0] == "\\": + code2 = _class_escape(source, this) else: - raise error("unexpected end of regular expression") + code2 = LITERAL, _ord(this) + if code1[0] != LITERAL or code2[0] != LITERAL: + raise error("bad character range") + lo = code1[1] + hi = code2[1] + if hi < lo: + raise error("bad character range") + setappend((RANGE, (lo, hi))) else: if code1[0] is IN: code1 = code1[1][0] @@ -541,7 +539,7 @@ def _parse(source, state): # XXX: <fl> should add charmap optimization here subpatternappend((IN, set)) - elif this and this[0] in REPEAT_CHARS: + elif this in REPEAT_CHARS: # repeat previous item if this == "?": min, max = 0, 1 @@ -552,20 +550,20 @@ def _parse(source, state): min, max = 1, MAXREPEAT elif this == "{": if source.next == "}": - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) continue here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: - lo = lo + source.get() + lo += sourceget() if sourcematch(","): while source.next in DIGITS: - hi = hi + sourceget() + hi += sourceget() else: hi = lo if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) source.seek(here) continue if lo: @@ -587,7 +585,7 @@ def _parse(source, state): item = None if not item or (_len(item) == 1 and item[0][0] == AT): raise error("nothing to repeat") - if item[0][0] in REPEATCODES: + if item[0][0] in _REPEATCODES: raise error("multiple repeat") if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) @@ -604,18 +602,14 @@ def _parse(source, state): if sourcematch("?"): group = 0 # options - if sourcematch("P"): + char = sourceget() + if char is None: + raise error("unexpected end of pattern") + if char == "P": # python extensions if sourcematch("<"): # named group: skip forward to end of name - name = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ">": - break - name = name + char + name = source.getuntil(">") group = 1 if not name: raise error("missing group name") @@ -623,14 +617,7 @@ def _parse(source, state): raise error("bad character in group name %r" % name) elif sourcematch("="): # named backreference - name = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ")": - break - name = name + char + name = source.getuntil(")") if not name: raise error("missing group name") if not name.isidentifier(): @@ -647,27 +634,25 @@ def _parse(source, state): if char is None: raise error("unexpected end of pattern") raise error("unknown specifier: ?P%s" % char) - elif sourcematch(":"): + elif char == ":": # non-capturing group group = 2 - elif sourcematch("#"): + elif char == "#": # comment - while 1: - if source.next is None or source.next == ")": + while True: + if source.next is None: + raise error("unbalanced parenthesis") + if sourceget() == ")": break - sourceget() - if not sourcematch(")"): - raise error("unbalanced parenthesis") continue - elif source.next in ASSERTCHARS: + elif char in "=!<": # lookahead assertions - char = sourceget() dir = 1 if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: + char = sourceget() + if char is None or char not in "=!": raise error("syntax error") dir = -1 # lookbehind - char = sourceget() p = _parse_sub(source, state) if not sourcematch(")"): raise error("unbalanced parenthesis") @@ -676,16 +661,9 @@ def _parse(source, state): else: subpatternappend((ASSERT_NOT, (dir, p))) continue - elif sourcematch("("): + elif char == "(": # conditional backreference group - condname = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ")": - break - condname = condname + char + condname = source.getuntil(")") group = 2 if not condname: raise error("missing group name") @@ -705,12 +683,14 @@ def _parse(source, state): raise error("bad group number") if condgroup >= MAXGROUPS: raise error("the group number is too large") - else: + elif char in FLAGS: # flags - if not source.next in FLAGS: - raise error("unexpected end of pattern") + state.flags |= FLAGS[char] while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] + state.flags |= FLAGS[sourceget()] + verbose = state.flags & SRE_FLAG_VERBOSE + else: + raise error("unexpected end of pattern " + char) if group: # parse group contents if group == 2: @@ -728,7 +708,7 @@ def _parse(source, state): state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) else: - while 1: + while True: char = sourceget() if char is None: raise error("unexpected end of pattern") @@ -742,10 +722,6 @@ def _parse(source, state): elif this == "$": subpattern.append((AT, AT_END)) - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - else: raise error("parser error") @@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None): p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags) - tail = source.get() - if tail == ")": - raise error("unbalanced parenthesis") - elif tail: - raise error("bogus characters at end of regular expression") + if source.next is not None: + if source.next == ")": + raise error("unbalanced parenthesis") + else: + raise error("bogus characters at end of regular expression") if flags & SRE_FLAG_DEBUG: p.dump() @@ -817,13 +793,7 @@ def parse_template(source, pattern): if c == "g": name = "" if s.match("<"): - while True: - char = sget() - if char is None: - raise error("unterminated group name") - if char == ">": - break - name += char + name = s.getuntil(">") if not name: raise error("missing group name") try: |