diff options
| author | Raymond Hettinger <python@rcn.com> | 2004-03-26 23:24:00 +0000 | 
|---|---|---|
| committer | Raymond Hettinger <python@rcn.com> | 2004-03-26 23:24:00 +0000 | 
| commit | 968c56a6264462c3db7b527cad561a929bde49b9 (patch) | |
| tree | e44c07926ca4003123970a2fb1e4d96d6b0783cc /Lib/sre_parse.py | |
| parent | 29e383754e4a96b46d1bd9f72a49694cdb993850 (diff) | |
| download | cpython-git-968c56a6264462c3db7b527cad561a929bde49b9.tar.gz | |
Simple Optimizations:
* Factor constant expressions out of loops.
* Presize a list being grown to a known length.
Diffstat (limited to 'Lib/sre_parse.py')
| -rw-r--r-- | Lib/sre_parse.py | 165 | 
1 files changed, 92 insertions, 73 deletions
| diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 9482bf81b0..94d526da80 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -103,6 +103,7 @@ class SubPattern:          self.width = None      def dump(self, level=0):          nl = 1 +        seqtypes = type(()), type([])          for op, av in self.data:              print level*"  " + op,; nl = 0              if op == "in": @@ -118,7 +119,7 @@ class SubPattern:                          print level*"  " + "or"                      a.dump(level+1); nl = 1                      i = i + 1 -            elif type(av) in (type(()), type([])): +            elif type(av) in seqtypes:                  for a in av:                      if isinstance(a, SubPattern):                          if not nl: print @@ -149,6 +150,8 @@ class SubPattern:          if self.width:              return self.width          lo = hi = 0L +        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) +        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)          for op, av in self.data:              if op is BRANCH:                  i = sys.maxint @@ -167,11 +170,11 @@ class SubPattern:                  i, j = av[1].getwidth()                  lo = lo + i                  hi = hi + j -            elif op in (MIN_REPEAT, MAX_REPEAT): +            elif op in REPEATCODES:                  i, j = av[2].getwidth()                  lo = lo + long(i) * av[0]                  hi = hi + long(j) * av[1] -            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): +            elif op in UNITCODES:                  lo = lo + 1                  hi = hi + 1              elif op == SUCCESS: @@ -313,13 +316,15 @@ def _parse_sub(source, state, nested=1):      # parse an alternation: a|b|c      items = [] +    itemsappend = items.append +    sourcematch = source.match      while 1: -        items.append(_parse(source, state)) -        if source.match("|"): +        itemsappend(_parse(source, state)) +        if sourcematch("|"):              continue          if not nested:              break -        if not source.next or source.match(")", 0): +        if not source.next or sourcematch(")", 0):              break          else:              raise error, "pattern not properly closed" @@ -328,6 +333,7 @@ def _parse_sub(source, state, nested=1):          return items[0]      subpattern = SubPattern(state) +    subpatternappend = subpattern.append      # check if all items share a common prefix      while 1: @@ -344,7 +350,7 @@ def _parse_sub(source, state, nested=1):              # move it out of the branch              for item in items:                  del item[0] -            subpattern.append(prefix) +            subpatternappend(prefix)              continue # check next one          break @@ -356,9 +362,10 @@ def _parse_sub(source, state, nested=1):          # we can store this as a character set instead of a          # branch (the compiler may optimize this even more)          set = [] +        setappend = set.append          for item in items: -            set.append(item[0]) -        subpattern.append((IN, set)) +            setappend(item[0]) +        subpatternappend((IN, set))          return subpattern      subpattern.append((BRANCH, (None, items))) @@ -380,14 +387,23 @@ def _parse_sub_cond(source, state, condgroup):  def _parse(source, state):      # parse a simple pattern -      subpattern = SubPattern(state) +    # precompute constants into local variables +    subpatternappend = subpattern.append +    sourceget = source.get +    sourcematch = source.match +    _len = len +    PATTERNENDERS = ("|", ")") +    ASSERTCHARS = ("=", "!", "<") +    LOOKBEHINDASSERTCHARS = ("=", "!") +    REPEATCODES = (MIN_REPEAT, MAX_REPEAT) +      while 1: -        if source.next in ("|", ")"): +        if source.next in PATTERNENDERS:              break # end of subpattern -        this = source.get() +        this = sourceget()          if this is None:              break # end of pattern @@ -397,25 +413,26 @@ def _parse(source, state):                  continue              if this == "#":                  while 1: -                    this = source.get() +                    this = sourceget()                      if this in (None, "\n"):                          break                  continue          if this and this[0] not in SPECIAL_CHARS: -            subpattern.append((LITERAL, ord(this))) +            subpatternappend((LITERAL, ord(this)))          elif this == "[":              # character set              set = [] -##          if source.match(":"): +            setappend = set.append +##          if sourcematch(":"):  ##              pass # handle character classes -            if source.match("^"): -                set.append((NEGATE, None)) +            if sourcematch("^"): +                setappend((NEGATE, None))              # check remaining characters              start = set[:]              while 1: -                this = source.get() +                this = sourceget()                  if this == "]" and set != start:                      break                  elif this and this[0] == "\\": @@ -424,14 +441,14 @@ def _parse(source, state):                      code1 = LITERAL, ord(this)                  else:                      raise error, "unexpected end of regular expression" -                if source.match("-"): +                if sourcematch("-"):                      # potential range -                    this = source.get() +                    this = sourceget()                      if this == "]":                          if code1[0] is IN:                              code1 = code1[1][0] -                        set.append(code1) -                        set.append((LITERAL, ord("-"))) +                        setappend(code1) +                        setappend((LITERAL, ord("-")))                          break                      elif this:                          if this[0] == "\\": @@ -444,22 +461,22 @@ def _parse(source, state):                          hi = code2[1]                          if hi < lo:                              raise error, "bad character range" -                        set.append((RANGE, (lo, hi))) +                        setappend((RANGE, (lo, hi)))                      else:                          raise error, "unexpected end of regular expression"                  else:                      if code1[0] is IN:                          code1 = code1[1][0] -                    set.append(code1) +                    setappend(code1)              # XXX: <fl> should move set optimization to compiler! -            if len(set)==1 and set[0][0] is LITERAL: -                subpattern.append(set[0]) # optimization -            elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: -                subpattern.append((NOT_LITERAL, set[1][1])) # optimization +            if _len(set)==1 and set[0][0] is LITERAL: +                subpatternappend(set[0]) # optimization +            elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: +                subpatternappend((NOT_LITERAL, set[1][1])) # optimization              else:                  # XXX: <fl> should add charmap optimization here -                subpattern.append((IN, set)) +                subpatternappend((IN, set))          elif this and this[0] in REPEAT_CHARS:              # repeat previous item @@ -476,13 +493,13 @@ def _parse(source, state):                  lo = hi = ""                  while source.next in DIGITS:                      lo = lo + source.get() -                if source.match(","): +                if sourcematch(","):                      while source.next in DIGITS: -                        hi = hi + source.get() +                        hi = hi + sourceget()                  else:                      hi = lo -                if not source.match("}"): -                    subpattern.append((LITERAL, ord(this))) +                if not sourcematch("}"): +                    subpatternappend((LITERAL, ord(this)))                      source.seek(here)                      continue                  if lo: @@ -498,32 +515,32 @@ def _parse(source, state):                  item = subpattern[-1:]              else:                  item = None -            if not item or (len(item) == 1 and item[0][0] == AT): +            if not item or (_len(item) == 1 and item[0][0] == AT):                  raise error, "nothing to repeat" -            if item[0][0] in (MIN_REPEAT, MAX_REPEAT): +            if item[0][0] in REPEATCODES:                  raise error, "multiple repeat" -            if source.match("?"): +            if sourcematch("?"):                  subpattern[-1] = (MIN_REPEAT, (min, max, item))              else:                  subpattern[-1] = (MAX_REPEAT, (min, max, item))          elif this == ".": -            subpattern.append((ANY, None)) +            subpatternappend((ANY, None))          elif this == "(":              group = 1              name = None              condgroup = None -            if source.match("?"): +            if sourcematch("?"):                  group = 0                  # options -                if source.match("P"): +                if sourcematch("P"):                      # python extensions -                    if source.match("<"): +                    if sourcematch("<"):                          # named group: skip forward to end of name                          name = ""                          while 1: -                            char = source.get() +                            char = sourceget()                              if char is None:                                  raise error, "unterminated name"                              if char == ">": @@ -532,11 +549,11 @@ def _parse(source, state):                          group = 1                          if not isname(name):                              raise error, "bad character in group name" -                    elif source.match("="): +                    elif sourcematch("="):                          # named backreference                          name = ""                          while 1: -                            char = source.get() +                            char = sourceget()                              if char is None:                                  raise error, "unterminated name"                              if char == ")": @@ -547,47 +564,47 @@ def _parse(source, state):                          gid = state.groupdict.get(name)                          if gid is None:                              raise error, "unknown group name" -                        subpattern.append((GROUPREF, gid)) +                        subpatternappend((GROUPREF, gid))                          continue                      else: -                        char = source.get() +                        char = sourceget()                          if char is None:                              raise error, "unexpected end of pattern"                          raise error, "unknown specifier: ?P%s" % char -                elif source.match(":"): +                elif sourcematch(":"):                      # non-capturing group                      group = 2 -                elif source.match("#"): +                elif sourcematch("#"):                      # comment                      while 1:                          if source.next is None or source.next == ")":                              break -                        source.get() -                    if not source.match(")"): +                        sourceget() +                    if not sourcematch(")"):                          raise error, "unbalanced parenthesis"                      continue -                elif source.next in ("=", "!", "<"): +                elif source.next in ASSERTCHARS:                      # lookahead assertions -                    char = source.get() +                    char = sourceget()                      dir = 1                      if char == "<": -                        if source.next not in ("=", "!"): +                        if source.next not in LOOKBEHINDASSERTCHARS:                              raise error, "syntax error"                          dir = -1 # lookbehind -                        char = source.get() +                        char = sourceget()                      p = _parse_sub(source, state) -                    if not source.match(")"): +                    if not sourcematch(")"):                          raise error, "unbalanced parenthesis"                      if char == "=": -                        subpattern.append((ASSERT, (dir, p))) +                        subpatternappend((ASSERT, (dir, p)))                      else: -                        subpattern.append((ASSERT_NOT, (dir, p))) +                        subpatternappend((ASSERT_NOT, (dir, p)))                      continue -                elif source.match("("): +                elif sourcematch("("):                      # conditional backreference group                      condname = ""                      while 1: -                        char = source.get() +                        char = sourceget()                          if char is None:                              raise error, "unterminated name"                          if char == ")": @@ -608,7 +625,7 @@ def _parse(source, state):                      if not source.next in FLAGS:                          raise error, "unexpected end of pattern"                      while source.next in FLAGS: -                        state.flags = state.flags | FLAGS[source.get()] +                        state.flags = state.flags | FLAGS[sourceget()]              if group:                  # parse group contents                  if group == 2: @@ -620,14 +637,14 @@ def _parse(source, state):                      p = _parse_sub_cond(source, state, condgroup)                  else:                      p = _parse_sub(source, state) -                if not source.match(")"): +                if not sourcematch(")"):                      raise error, "unbalanced parenthesis"                  if group is not None:                      state.closegroup(group) -                subpattern.append((SUBPATTERN, (group, p))) +                subpatternappend((SUBPATTERN, (group, p)))              else:                  while 1: -                    char = source.get() +                    char = sourceget()                      if char is None:                          raise error, "unexpected end of pattern"                      if char == ")": @@ -635,14 +652,14 @@ def _parse(source, state):                      raise error, "unknown extension"          elif this == "^": -            subpattern.append((AT, AT_BEGINNING)) +            subpatternappend((AT, AT_BEGINNING))          elif this == "$":              subpattern.append((AT, AT_END))          elif this and this[0] == "\\":              code = _escape(source, this, state) -            subpattern.append(code) +            subpatternappend(code)          else:              raise error, "parser error" @@ -681,20 +698,21 @@ def parse_template(source, pattern):      # parse 're' replacement string into list of literals and      # group references      s = Tokenizer(source) +    sget = s.get      p = []      a = p.append -    def literal(literal, p=p): +    def literal(literal, p=p, pappend=a):          if p and p[-1][0] is LITERAL:              p[-1] = LITERAL, p[-1][1] + literal          else: -            p.append((LITERAL, literal)) +            pappend((LITERAL, literal))      sep = source[:0]      if type(sep) is type(""):          makechar = chr      else:          makechar = unichr      while 1: -        this = s.get() +        this = sget()          if this is None:              break # end of replacement string          if this and this[0] == "\\": @@ -703,7 +721,7 @@ def parse_template(source, pattern):                  name = ""                  if s.match("<"):                      while 1: -                        char = s.get() +                        char = sget()                          if char is None:                              raise error, "unterminated group name"                          if char == ">": @@ -731,7 +749,7 @@ def parse_template(source, pattern):                              code = MARK, group                              break                      elif s.next in OCTDIGITS: -                        this = this + s.get() +                        this = this + sget()                      else:                          break                  if not code: @@ -752,13 +770,14 @@ def parse_template(source, pattern):      # convert template to groups and literals lists      i = 0      groups = [] -    literals = [] +    groupsappend = groups.append +    literals = [None] * len(p)      for c, s in p:          if c is MARK: -            groups.append((i, s)) -            literals.append(None) +            groupsappend((i, s)) +            # literal[i] is already None          else: -            literals.append(s) +            literals[i] = s          i = i + 1      return groups, literals | 
