summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeremy Hylton <jeremy@alum.mit.edu>2000-06-01 17:39:12 +0000
committerJeremy Hylton <jeremy@alum.mit.edu>2000-06-01 17:39:12 +0000
commit6ec5bf96e139a2e858ce02e4e9814e68ea1528e7 (patch)
treecaea18dd80d716e22340935190499055b89b5abb
parentf7183f6d5d2ea827a37769649b27615edccb555c (diff)
downloadcpython-6ec5bf96e139a2e858ce02e4e9814e68ea1528e7.tar.gz
Fredrik Lundh: here's the 96.6% version of SRE
-rw-r--r--Lib/sre.py123
-rw-r--r--Lib/sre_compile.py134
-rw-r--r--Lib/sre_constants.py95
-rw-r--r--Modules/_sre.c636
-rw-r--r--Modules/sre.h34
-rw-r--r--Modules/sre_constants.h24
6 files changed, 743 insertions, 303 deletions
diff --git a/Lib/sre.py b/Lib/sre.py
index 79878b35c4..455cd2785e 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py
@@ -1,4 +1,3 @@
-# -*- Mode: Python; tab-width: 4 -*-
#
# Secret Labs' Regular Expression Engine
# $Id$
@@ -7,39 +6,127 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
-# This code can only be used for 1.6 alpha testing. All other use
-# require explicit permission from Secret Labs AB.
-#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
-"""
-this is a long string
-"""
-
import sre_compile
+# flags
+I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
+L = LOCALE = sre_compile.SRE_FLAG_LOCALE
+M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
+S = DOTALL = sre_compile.SRE_FLAG_DOTALL
+X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
+
# --------------------------------------------------------------------
# public interface
-def compile(pattern, flags=0):
- return sre_compile.compile(pattern, _fixflags(flags))
+# FIXME: add docstrings
def match(pattern, string, flags=0):
- return compile(pattern, _fixflags(flags)).match(string)
+ return _compile(pattern, flags).match(string)
def search(pattern, string, flags=0):
- return compile(pattern, _fixflags(flags)).search(string)
+ return _compile(pattern, flags).search(string)
+
+def sub(pattern, repl, string, count=0):
+ return _compile(pattern).sub(repl, string, count)
+
+def subn(pattern, repl, string, count=0):
+ return _compile(pattern).subn(repl, string, count)
+
+def split(pattern, string, maxsplit=0):
+ return _compile(pattern).split(string, maxsplit)
-# FIXME: etc
+def findall(pattern, string, maxsplit=0):
+ return _compile(pattern).findall(string, maxsplit)
+
+def compile(pattern, flags=0):
+ return _compile(pattern, flags)
+
+def escape(pattern):
+ s = list(pattern)
+ for i in range(len(pattern)):
+ c = pattern[i]
+ if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"):
+ if c == "\000":
+ s[i] = "\\000"
+ else:
+ s[i] = "\\" + c
+ return pattern[:0].join(s)
# --------------------------------------------------------------------
-# helpers
+# internals
+
+_cache = {}
+_MAXCACHE = 100
+
+def _compile(pattern, flags=0):
+ # internal: compile pattern
+ tp = type(pattern)
+ if tp not in (type(""), type(u"")):
+ return pattern
+ key = (tp, pattern, flags)
+ try:
+ return _cache[key]
+ except KeyError:
+ pass
+ p = sre_compile.compile(pattern, flags)
+ if len(_cache) >= _MAXCACHE:
+ _cache.clear()
+ _cache[key] = p
+ return p
+
+def _sub(pattern, template, string, count=0):
+ # internal: pattern.sub implementation hook
+ return _subn(pattern, template, string, count)[0]
+
+def _expand(match, template):
+ # internal: expand template
+ return template # FIXME
-def _fixflags(flags):
- # convert flag bitmask to sequence
- assert not flags
- return ()
+def _subn(pattern, template, string, count=0):
+ # internal: pattern.subn implementation hook
+ if callable(template):
+ filter = callable
+ else:
+ # FIXME: prepare template
+ def filter(match, template=template):
+ return _expand(match, template)
+ n = i = 0
+ s = []
+ append = s.append
+ c = pattern.cursor(string)
+ while not count or n < count:
+ m = c.search()
+ if not m:
+ break
+ j = m.start()
+ if j > i:
+ append(string[i:j])
+ append(filter(m))
+ i = m.end()
+ n = n + 1
+ if i < len(string):
+ append(string[i:])
+ return string[:0].join(s), n
+def _split(pattern, string, maxsplit=0):
+ # internal: pattern.split implementation hook
+ n = i = 0
+ s = []
+ append = s.append
+ c = pattern.cursor(string)
+ while not maxsplit or n < maxsplit:
+ m = c.search()
+ if not m:
+ break
+ j = m.start()
+ append(string[i:j])
+ i = m.end()
+ n = n + 1
+ if i < len(string):
+ append(string[i:])
+ return s
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 87380614c8..53da005e76 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -14,9 +14,6 @@
# other compatibility work.
#
-# FIXME: <fl> formalize (objectify?) and document the compiler code
-# format, so that other frontends can use this compiler
-
import array, string, sys
import _sre
@@ -45,64 +42,70 @@ class Code:
self.data.append(code)
def todata(self):
# print self.data
- return array.array(WORDSIZE, self.data).tostring()
-
-def _lower(literal):
- # return _sre._lower(literal) # FIXME
- return string.lower(literal)
+ try:
+ return array.array(WORDSIZE, self.data).tostring()
+ except OverflowError:
+ print self.data
+ raise
-def _compile(code, pattern, flags):
+def _compile(code, pattern, flags, level=0):
append = code.append
for op, av in pattern:
if op is ANY:
- if "s" in flags:
- append(CODES[op]) # any character at all!
+ if flags & SRE_FLAG_DOTALL:
+ append(OPCODES[op]) # any character at all!
else:
- append(CODES[NOT_LITERAL])
- append(10)
+ append(OPCODES[CATEGORY])
+ append(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE):
- append(CODES[op])
+ append(OPCODES[op])
elif op is AT:
- append(CODES[op])
- append(POSITIONS[av])
+ append(OPCODES[op])
+ if flags & SRE_FLAG_MULTILINE:
+ append(ATCODES[AT_MULTILINE[av]])
+ else:
+ append(ATCODES[av])
elif op is BRANCH:
- append(CODES[op])
+ append(OPCODES[op])
tail = []
for av in av[1]:
skip = len(code); append(0)
- _compile(code, av, flags)
- append(CODES[JUMP])
+ _compile(code, av, flags, level)
+ append(OPCODES[JUMP])
tail.append(len(code)); append(0)
code[skip] = len(code) - skip
append(0) # end of branch
for tail in tail:
code[tail] = len(code) - tail
elif op is CALL:
- append(CODES[op])
+ append(OPCODES[op])
skip = len(code); append(0)
- _compile(code, av, flags)
- append(CODES[SUCCESS])
+ _compile(code, av, flags, level+1)
+ append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CATEGORY: # not used by current parser
- append(CODES[op])
- append(CATEGORIES[av])
+ append(OPCODES[op])
+ if flags & SRE_FLAG_LOCALE:
+ append(CH_LOCALE[CHCODES[av]])
+ else:
+ append(CHCODES[av])
elif op is GROUP:
- if "i" in flags:
- append(CODES[MAP_IGNORE[op]])
+ if flags & SRE_FLAG_IGNORECASE:
+ append(OPCODES[OP_IGNORE[op]])
else:
- append(CODES[op])
- append(av)
+ append(OPCODES[op])
+ append(av-1)
elif op is IN:
- if "i" in flags:
- append(CODES[MAP_IGNORE[op]])
+ if flags & SRE_FLAG_IGNORECASE:
+ append(OPCODES[OP_IGNORE[op]])
def fixup(literal):
- return ord(_lower(literal))
+ return ord(literal.lower())
else:
- append(CODES[op])
+ append(OPCODES[op])
fixup = ord
skip = len(code); append(0)
for op, av in av:
- append(CODES[op])
+ append(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
@@ -111,58 +114,60 @@ def _compile(code, pattern, flags):
append(fixup(av[0]))
append(fixup(av[1]))
elif op is CATEGORY:
- append(CATEGORIES[av])
+ if flags & SRE_FLAG_LOCALE:
+ append(CH_LOCALE[CHCODES[av]])
+ else:
+ append(CHCODES[av])
else:
raise ValueError, "unsupported set operator"
- append(CODES[FAILURE])
+ append(OPCODES[FAILURE])
code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL):
- if "i" in flags:
- append(CODES[MAP_IGNORE[op]])
- append(ord(_lower(av)))
+ if flags & SRE_FLAG_IGNORECASE:
+ append(OPCODES[OP_IGNORE[op]])
+ append(ord(av.lower()))
else:
- append(CODES[op])
+ append(OPCODES[op])
append(ord(av))
elif op is MARK:
- append(CODES[op])
+ append(OPCODES[op])
append(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
lo, hi = av[2].getwidth()
if lo == 0:
raise SyntaxError, "cannot repeat zero-width items"
if lo == hi == 1 and op is MAX_REPEAT:
- append(CODES[MAX_REPEAT_ONE])
+ append(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0)
append(av[0])
append(av[1])
- _compile(code, av[2], flags)
- append(CODES[SUCCESS])
+ _compile(code, av[2], flags, level+1)
+ append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
- append(CODES[op])
+ append(OPCODES[op])
skip = len(code); append(0)
append(av[0])
append(av[1])
- _compile(code, av[2], flags)
+ _compile(code, av[2], flags, level+1)
if op is MIN_REPEAT:
- append(CODES[MIN_UNTIL])
+ append(OPCODES[MIN_UNTIL])
else:
- # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
- append(CODES[MAX_UNTIL])
+ append(OPCODES[MAX_UNTIL])
code[skip] = len(code) - skip
elif op is SUBPATTERN:
-## group = av[0]
-## if group:
-## append(CODES[MARK])
-## append((group-1)*2)
- _compile(code, av[1], flags)
-## if group:
-## append(CODES[MARK])
-## append((group-1)*2+1)
+ group = av[0]
+ if group:
+ append(OPCODES[MARK])
+ append((group-1)*2)
+ _compile(code, av[1], flags, level+1)
+ if group:
+ append(OPCODES[MARK])
+ append((group-1)*2+1)
else:
raise ValueError, ("unsupported operand type", op)
-def compile(p, flags=()):
+def compile(p, flags=0):
# convert pattern list to internal format
if type(p) in (type(""), type(u"")):
import sre_parse
@@ -170,12 +175,10 @@ def compile(p, flags=()):
p = sre_parse.parse(p)
else:
pattern = None
- # print p.getwidth()
- # print p
+ flags = p.pattern.flags | flags
code = Code()
- _compile(code, p.data, p.pattern.flags)
- code.append(CODES[SUCCESS])
- # print list(code.data)
+ _compile(code, p.data, flags)
+ code.append(OPCODES[SUCCESS])
data = code.todata()
if 0: # debugging
print
@@ -183,5 +186,8 @@ def compile(p, flags=()):
import sre_disasm
sre_disasm.disasm(data)
print "-" * 68
- # print len(data), p.pattern.groups, len(p.pattern.groupdict)
- return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
+ return _sre.compile(
+ pattern, flags,
+ data,
+ p.pattern.groups-1, p.pattern.groupdict
+ )
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index af883094d4..531dc319cb 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -48,20 +48,31 @@ SUBPATTERN = "subpattern"
# positions
AT_BEGINNING = "at_beginning"
+AT_BEGINNING_LINE = "at_beginning_line"
AT_BOUNDARY = "at_boundary"
AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end"
+AT_END_LINE = "at_end_line"
# categories
-
CATEGORY_DIGIT = "category_digit"
CATEGORY_NOT_DIGIT = "category_not_digit"
CATEGORY_SPACE = "category_space"
CATEGORY_NOT_SPACE = "category_not_space"
CATEGORY_WORD = "category_word"
CATEGORY_NOT_WORD = "category_not_word"
+CATEGORY_LINEBREAK = "category_linebreak"
+CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
+CATEGORY_LOC_DIGIT = "category_loc_digit"
+CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit"
+CATEGORY_LOC_SPACE = "category_loc_space"
+CATEGORY_LOC_NOT_SPACE = "category_loc_not_space"
+CATEGORY_LOC_WORD = "category_loc_word"
+CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
+CATEGORY_LOC_LINEBREAK = "category_loc_linebreak"
+CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak"
-CODES = [
+OPCODES = [
# failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS,
@@ -87,45 +98,75 @@ CODES = [
]
-# convert to dictionary
-c = {}
-i = 0
-for code in CODES:
- c[code] = i
- i = i + 1
-CODES = c
+ATCODES = [
+ AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
+ AT_NON_BOUNDARY, AT_END, AT_END_LINE
+]
+
+CHCODES = [
+ CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
+ CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
+ CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT,
+ CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE,
+ CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD,
+ CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK
+]
+
+def makedict(list):
+ d = {}
+ i = 0
+ for item in list:
+ d[item] = i
+ i = i + 1
+ return d
+
+OPCODES = makedict(OPCODES)
+ATCODES = makedict(ATCODES)
+CHCODES = makedict(CHCODES)
# replacement operations for "ignore case" mode
-MAP_IGNORE = {
+OP_IGNORE = {
GROUP: GROUP_IGNORE,
IN: IN_IGNORE,
LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE
}
-POSITIONS = {
- AT_BEGINNING: ord("a"),
- AT_BOUNDARY: ord("b"),
- AT_NON_BOUNDARY: ord("B"),
- AT_END: ord("z"),
+AT_MULTILINE = {
+ AT_BEGINNING: AT_BEGINNING_LINE,
+ AT_END: AT_END_LINE
}
-CATEGORIES = {
- CATEGORY_DIGIT: ord("d"),
- CATEGORY_NOT_DIGIT: ord("D"),
- CATEGORY_SPACE: ord("s"),
- CATEGORY_NOT_SPACE: ord("S"),
- CATEGORY_WORD: ord("w"),
- CATEGORY_NOT_WORD: ord("W"),
+CH_LOCALE = {
+ CATEGORY_DIGIT: CATEGORY_LOC_DIGIT,
+ CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT,
+ CATEGORY_SPACE: CATEGORY_LOC_SPACE,
+ CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE,
+ CATEGORY_WORD: CATEGORY_LOC_WORD,
+ CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
+ CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK,
+ CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK
}
+# flags
+SRE_FLAG_TEMPLATE = 1 # NYI
+SRE_FLAG_IGNORECASE = 2
+SRE_FLAG_LOCALE = 4
+SRE_FLAG_MULTILINE = 8
+SRE_FLAG_DOTALL = 16
+SRE_FLAG_VERBOSE = 32
+
if __name__ == "__main__":
import string
- items = CODES.items()
- items.sort(lambda a, b: cmp(a[1], b[1]))
+ def dump(f, d, prefix):
+ items = d.items()
+ items.sort(lambda a, b: cmp(a[1], b[1]))
+ for k, v in items:
+ f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
f = open("sre_constants.h", "w")
- f.write("/* generated by sre_constants.py */\n")
- for k, v in items:
- f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
+ f.write("/* generated from sre_constants.py */\n")
+ dump(f, OPCODES, "SRE_OP")
+ dump(f, ATCODES, "SRE")
+ dump(f, CHCODES, "SRE")
f.close()
print "done"
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 47b80c5d7f..8d468444b5 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -6,13 +6,16 @@
* simple regular expression matching engine
*
* partial history:
- * 99-10-24 fl created (bits and pieces from the template matcher)
+ * 99-10-24 fl created (based on the template matcher)
* 99-11-13 fl added categories, branching, and more (0.2)
* 99-11-16 fl some tweaks to compile on non-Windows platforms
* 99-12-18 fl non-literals, generic maximizing repeat (0.3)
* 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
* 99-03-06 fl first alpha, sort of (0.5)
* 99-03-14 fl removed most compatibility stuff (0.6)
+ * 99-05-10 fl towards third alpha (0.8.2)
+ * 99-05-13 fl added experimental cursor stuff (0.8.3)
+ * 99-05-27 fl final bug hunt (0.8.4)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
@@ -26,7 +29,7 @@
#ifndef SRE_RECURSIVE
-char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
+char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
@@ -40,7 +43,7 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#define INT_MAX 2147483647
#endif
-#include <ctype.h> /* temporary hack */
+#include <ctype.h>
/* defining this one enables tracing */
#undef DEBUG
@@ -59,61 +62,69 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#ifdef DEBUG
#define TRACE(v) printf v
-#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#else
#define TRACE(v)
#endif
+#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#define SRE_CODE unsigned short /* unsigned short or larger */
-typedef struct {
- /* string pointers */
- void* ptr; /* current position (also end of current slice) */
- void* beginning; /* start of original string */
- void* start; /* start of current slice */
- void* end; /* end of original string */
- /* character size */
- int charsize;
- /* registers */
- int marks;
- void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
- /* FIXME */
- /* backtracking stack */
- void** stack;
- int stacksize;
- int stackbase;
-} SRE_STATE;
-
-#if 1 /* FIXME: <fl> fix this one! */
-#define SRE_TO_LOWER Py_UNICODE_TOLOWER
-#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
-#define SRE_IS_SPACE Py_UNICODE_ISSPACE
-#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
-#else
-#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
-#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
-#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
+/* -------------------------------------------------------------------- */
+/* search engine state */
+
+/* unicode character predicates */
+#define SRE_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
+#define SRE_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
+#define SRE_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
+#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
+/* #define SRE_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) */
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
-#endif
-
#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
+/* locale-specific character predicates */
+#define SRE_LOC_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
+#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
+#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
+#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
+#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
+#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
+
LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch)
{
switch (category) {
- case 'd':
+ case SRE_CATEGORY_DIGIT:
return SRE_IS_DIGIT(ch);
- case 'D':
+ case SRE_CATEGORY_NOT_DIGIT:
return !SRE_IS_DIGIT(ch);
- case 's':
+ case SRE_CATEGORY_SPACE:
return SRE_IS_SPACE(ch);
- case 'S':
+ case SRE_CATEGORY_NOT_SPACE:
return !SRE_IS_SPACE(ch);
- case 'w':
+ case SRE_CATEGORY_WORD:
return SRE_IS_WORD(ch);
- case 'W':
+ case SRE_CATEGORY_NOT_WORD:
return !SRE_IS_WORD(ch);
+ case SRE_CATEGORY_LINEBREAK:
+ return SRE_IS_LINEBREAK(ch);
+ case SRE_CATEGORY_NOT_LINEBREAK:
+ return !SRE_IS_LINEBREAK(ch);
+ case SRE_CATEGORY_LOC_DIGIT:
+ return SRE_LOC_IS_DIGIT(ch);
+ case SRE_CATEGORY_LOC_NOT_DIGIT:
+ return !SRE_LOC_IS_DIGIT(ch);
+ case SRE_CATEGORY_LOC_SPACE:
+ return SRE_LOC_IS_SPACE(ch);
+ case SRE_CATEGORY_LOC_NOT_SPACE:
+ return !SRE_LOC_IS_SPACE(ch);
+ case SRE_CATEGORY_LOC_WORD:
+ return SRE_LOC_IS_WORD(ch);
+ case SRE_CATEGORY_LOC_NOT_WORD:
+ return !SRE_LOC_IS_WORD(ch);
+ case SRE_CATEGORY_LOC_LINEBREAK:
+ return SRE_LOC_IS_LINEBREAK(ch);
+ case SRE_CATEGORY_LOC_NOT_LINEBREAK:
+ return !SRE_LOC_IS_LINEBREAK(ch);
}
return 0;
}
@@ -174,7 +185,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
return 0;
}
-/* set things up for the 8-bit version */
+/* generate 8-bit version */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
@@ -192,7 +203,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
#undef SRE_AT
#undef SRE_CHAR
-/* set things up for the 16-bit unicode version */
+/* generate 16-bit unicode version */
#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
@@ -211,20 +222,22 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
LOCAL(int)
SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
{
- /* check if pointer is at given position. return 1 if so, 0
- otherwise */
+ /* check if pointer is at given position */
int this, that;
switch (at) {
- case 'a':
- /* beginning */
+ case SRE_AT_BEGINNING:
return ((void*) ptr == state->beginning);
- case 'z':
- /* end */
+ case SRE_AT_BEGINNING_LINE:
+ return ((void*) ptr == state->beginning ||
+ SRE_IS_LINEBREAK((int) ptr[-1]));
+ case SRE_AT_END:
return ((void*) ptr == state->end);
- case 'b':
- /* word boundary */
+ case SRE_AT_END_LINE:
+ return ((void*) ptr == state->end ||
+ SRE_IS_LINEBREAK((int) ptr[0]));
+ case SRE_AT_BOUNDARY:
if (state->beginning == state->end)
return 0;
that = ((void*) ptr > state->beginning) ?
@@ -232,8 +245,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
this = ((void*) ptr < state->end) ?
SRE_IS_WORD((int) ptr[0]) : 0;
return this != that;
- case 'B':
- /* word non-boundary */
+ case SRE_AT_NON_BOUNDARY:
if (state->beginning == state->end)
return 0;
that = ((void*) ptr > state->beginning) ?
@@ -249,8 +261,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
LOCAL(int)
SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
{
- /* check if character is a member of the given set. return 1 if
- so, 0 otherwise */
+ /* check if character is a member of the given set */
int ok = 1;
@@ -301,29 +312,42 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
int stackbase;
int i, count;
- for (;;) {
+ /* FIXME: this is one ugly hack */
+ void* *mark = NULL;
+ void* mark_data[64];
- TRACE(("[%p]\n", pattern));
+ for (;;) {
switch (*pattern++) {
case SRE_OP_FAILURE:
/* immediate failure */
TRACE(("%8d: failure\n", PTR(ptr)));
- return 0;
+ goto failure;
case SRE_OP_SUCCESS:
/* end of pattern */
TRACE(("%8d: success\n", PTR(ptr)));
state->ptr = ptr;
- return 1;
+ goto success;
case SRE_OP_AT:
/* match at given position */
+ /* args: <at> */
TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
if (!SRE_AT(state, ptr, *pattern))
- return 0;
+ goto failure;
+ pattern++;
+ break;
+
+ case SRE_OP_CATEGORY:
+ /* match at given category */
+ /* args: <category> */
+ TRACE(("%8d: category match at \\%c\n", PTR(ptr), *pattern));
+ if (ptr >= end || !sre_category(pattern[0], ptr[0]))
+ goto failure;
pattern++;
+ ptr++;
break;
case SRE_OP_LITERAL:
@@ -331,7 +355,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <code> */
TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
- return 0;
+ goto failure;
pattern++;
ptr++;
break;
@@ -341,7 +365,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <code> */
TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
- return 0;
+ goto failure;
pattern++;
ptr++;
break;
@@ -350,7 +374,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* match anything */
TRACE(("%8d: any\n", PTR(ptr)));
if (ptr >= end)
- return 0;
+ goto failure;
ptr++;
break;
@@ -359,23 +383,47 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <skip> <set> */
TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
- return 0;
+ goto failure;
pattern += pattern[0];
ptr++;
break;
case SRE_OP_GROUP:
/* match backreference */
+ TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
i = pattern[0];
{
- /* FIXME: optimize size! */
+ /* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
+ TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p)
- return 0;
+ goto failure;
while (p < e) {
+ TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
if (ptr >= end || *ptr != *p)
- return 0;
+ goto failure;
+ p++; ptr++;
+ }
+ }
+ pattern++;
+ break;
+
+ case SRE_OP_GROUP_IGNORE:
+ /* match backreference */
+ TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
+ i = pattern[0];
+ {
+ /* FIXME: optimize! */
+ SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
+ SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
+ TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
+ if (!p || !e || e < p)
+ goto failure;
+ while (p < e) {
+ TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
+ if (ptr >= end || SRE_TO_LOWER(*ptr) != SRE_TO_LOWER(*p))
+ goto failure;
p++; ptr++;
}
}
@@ -385,7 +433,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_LITERAL_IGNORE:
TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
- return 0;
+ goto failure;
pattern++;
ptr++;
break;
@@ -394,7 +442,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
(SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
- return 0;
+ goto failure;
pattern++;
ptr++;
break;
@@ -403,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
if (ptr >= end
|| !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
- return 0;
+ goto failure;
pattern += pattern[0];
ptr++;
break;
@@ -412,7 +460,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* set mark */
/* args: <mark> */
TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
- state->mark[pattern[0]] = ptr;
+ if (!mark) {
+ mark = mark_data;
+ memcpy(mark, state->mark, sizeof(state->mark));
+ }
+ state->mark[pattern[0]] = ptr;
pattern++;
break;
@@ -429,21 +481,18 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: match subpattern\n", PTR(ptr)));
state->ptr = ptr;
if (!SRE_MATCH(state, pattern + 1))
- return 0;
+ goto failure;
pattern += pattern[0];
ptr = state->ptr;
break;
case SRE_OP_MAX_REPEAT_ONE:
-
- /* match repeated sequence (maximizing regexp). this
- variant only works if the repeated item is exactly one
- character wide, and we're not already collecting
- backtracking points. for other cases, use the
+ /* match repeated sequence (maximizing regexp) */
+ /* this variant only works if the repeated item is exactly
+ one character wide, and we're not already collecting
+ backtracking points. for other cases, use the
MAX_REPEAT operator instead */
-
/* args: <skip> <min> <max> <step> */
-
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2]));
@@ -454,7 +503,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string, and backtrack from there */
/* FIXME: must look for line endings */
if (ptr + pattern[1] > end)
- return 0; /* cannot match */
+ goto failure; /* cannot match */
count = pattern[2];
if (count > end - ptr)
count = end - ptr;
@@ -515,7 +564,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3);
if (i < 0)
- return i;
+ goto failure;
if (i == 0)
break;
count++;
@@ -529,23 +578,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string. check if the rest of the pattern matches, and
backtrack if not. */
- /* FIXME: <fl> this is a mess. fix it! */
-
TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
if (count < (int) pattern[1])
- return 0;
+ goto failure;
if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */
TRACE(("%8d: tail is empty\n", PTR(ptr)));
state->ptr = ptr;
- return 1;
+ goto success;
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
- /* tail starts with a literal. we can speed things up
- by skipping positions where the rest of the pattern
- cannot possibly match */
+ /* tail starts with a literal. skip positions where
+ the rest of the pattern cannot possibly match */
SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) {
@@ -562,7 +608,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
- return 1;
+ goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--;
@@ -570,23 +616,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
}
} else {
+ /* general case */
TRACE(("%8d: tail is pattern\n", PTR(ptr)));
while (count >= (int) pattern[1]) {
state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
- return 1;
+ goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--;
count--;
}
}
- return 0; /* failure! */
-
-/* ----------------------------------------------------------------------- */
-/* FIXME: the following section is just plain broken */
+ goto failure;
case SRE_OP_MAX_REPEAT:
/* match repeated sequence (maximizing regexp). repeated
@@ -611,7 +655,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = _stack_extend(state, stackbase + count + 1,
stackbase + pattern[2]);
if (i < 0)
- return i;
+ goto failure;
}
state->stack[stackbase + count] = ptr;
/* check if we can match another item */
@@ -642,7 +686,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
ptr points to the tail. */
if (count < (int) pattern[1])
- return 0;
+ goto failure;
/* make sure that rest of the expression matches. if it
doesn't, backtrack */
@@ -659,7 +703,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->stackbase = stackbase;
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
- return 1;
+ goto success;
}
/* backtrack! */
@@ -673,10 +717,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->stackbase = stackbase;
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
- return 1;
+ goto success;
}
}
- return 0; /* failure! */
+ goto failure;
case SRE_OP_MAX_UNTIL:
/* match repeated sequence (maximizing regexp). repeated
@@ -684,13 +728,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: max until\n", PTR(ptr)));
state->ptr = ptr;
- return 2; /* always succeeds, for now... */
-
-/* end of totally broken section */
-/* ----------------------------------------------------------------------- */
+ goto success; /* always succeeds, for now... */
case SRE_OP_MIN_REPEAT:
/* match repeated sequence (minimizing regexp) */
+ /* FIXME: HERE BE BUGS! */
TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
pattern[1], pattern[2]));
count = 0;
@@ -699,7 +741,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[1]) {
i = SRE_MATCH(state, pattern + 3);
if (i <= 0)
- return i;
+ goto failure;
count++;
}
/* move forward until the tail matches. */
@@ -708,22 +750,22 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
- return 1;
+ goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
state->ptr = ptr; /* backtrack */
i = SRE_MATCH(state, pattern + 3);
if (i <= 0)
- return i;
+ goto failure;
count++;
}
- return 0; /* failure! */
+ goto failure;
case SRE_OP_MIN_UNTIL:
/* end of repeat group */
TRACE(("%8d: min until\n", PTR(ptr)));
state->ptr = ptr;
- return 2; /* always succeeds, for now... */
+ goto success; /* always succeeds, for now... */
case SRE_OP_BRANCH:
/* match one of several subpatterns */
@@ -737,13 +779,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + 1);
if (i > 0) {
TRACE(("%8d: branch succeeded\n", PTR(ptr)));
- return 1;
+ goto success;
}
}
pattern += *pattern;
}
TRACE(("%8d: branch failed\n", PTR(ptr)));
- return 0; /* failure! */
+ goto failure;
case SRE_OP_REPEAT:
/* TEMPLATE: match repeated sequence (no backtracking) */
@@ -758,16 +800,24 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
count++;
}
if (count <= (int) pattern[1])
- return 0;
+ goto failure;
TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
pattern += pattern[0];
ptr = state->ptr;
break;
- default:
+ default:
return SRE_ERROR_ILLEGAL;
}
}
+
+ failure:
+ if (mark)
+ memcpy(state->mark, mark, sizeof(state->mark));
+ return 0;
+
+ success:
+ return 1;
}
LOCAL(int)
@@ -832,6 +882,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
staticforward PyTypeObject Pattern_Type;
staticforward PyTypeObject Match_Type;
+staticforward PyTypeObject Cursor_Type;
static PyObject *
_compile(PyObject* self_, PyObject* args)
@@ -841,20 +892,25 @@ _compile(PyObject* self_, PyObject* args)
PatternObject* self;
PyObject* pattern;
+ int flags = 0;
PyObject* code;
int groups = 0;
PyObject* groupindex = NULL;
- if (!PyArg_ParseTuple(args, "OO!|iO", &pattern,
- &PyString_Type, &code, &groups, &groupindex))
+ if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
+ &PyString_Type, &code,
+ &groups, &groupindex))
return NULL;
- self = PyObject_New(PatternObject, &Pattern_Type);
+ self = PyObject_NEW(PatternObject, &Pattern_Type);
if (self == NULL)
+
return NULL;
Py_INCREF(pattern);
self->pattern = pattern;
+ self->flags = flags;
+
Py_INCREF(code);
self->code = code;
@@ -872,6 +928,69 @@ _getcodesize(PyObject* self_, PyObject* args)
return Py_BuildValue("i", sizeof(SRE_CODE));
}
+LOCAL(PyObject*)
+_setup(SRE_STATE* state, PyObject* args)
+{
+ /* prepare state object */
+
+ PyBufferProcs *buffer;
+ int i, count;
+ void* ptr;
+
+ PyObject* string;
+ int start = 0;
+ int end = INT_MAX;
+ if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
+ return NULL;
+
+ /* get pointer to string buffer */
+ buffer = string->ob_type->tp_as_buffer;
+ if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
+ buffer->bf_getsegcount(string, NULL) != 1) {
+ PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
+ return NULL;
+ }
+
+ /* determine buffer size */
+ count = buffer->bf_getreadbuffer(string, 0, &ptr);
+ if (count < 0) {
+ /* sanity check */
+ PyErr_SetString(PyExc_TypeError, "buffer has negative size");
+ return NULL;
+ }
+
+ /* determine character size */
+ state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
+
+ count /= state->charsize;
+
+ /* adjust boundaries */
+ if (start < 0)
+ start = 0;
+ else if (start > count)
+ start = count;
+
+ if (end < 0)
+ end = 0;
+ else if (end > count)
+ end = count;
+
+ state->beginning = ptr;
+
+ state->start = (void*) ((char*) ptr + start * state->charsize);
+ state->end = (void*) ((char*) ptr + end * state->charsize);
+
+ /* FIXME: dynamic! */
+ for (i = 0; i < 64; i++)
+ state->mark[i] = NULL;
+
+ state->stack = NULL;
+ state->stackbase = 0;
+ state->stacksize = 0;
+
+ return string;
+}
+
static PyObject*
_pattern_new_match(PatternObject* pattern, SRE_STATE* state,
PyObject* string, int status)
@@ -886,7 +1005,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
if (status > 0) {
/* create match object (with room for extra group marks) */
- match = PyObject_NewVar(MatchObject, &Match_Type, 2*pattern->groups);
+ match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
if (match == NULL)
return NULL;
@@ -930,70 +1049,32 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
return Py_None;
}
-/* -------------------------------------------------------------------- */
-/* pattern methods */
-
-LOCAL(PyObject*)
-_setup(SRE_STATE* state, PyObject* args)
+static PyObject*
+_pattern_cursor(PyObject* pattern, PyObject* args)
{
- /* prepare state object */
-
- PyBufferProcs *buffer;
- int i, count;
- void* ptr;
-
- PyObject* string;
- int start = 0;
- int end = INT_MAX;
- if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
- return NULL;
+ /* create search state object */
- /* get pointer to string buffer */
- buffer = string->ob_type->tp_as_buffer;
- if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
- buffer->bf_getsegcount(string, NULL) != 1) {
- PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
- return NULL;
- }
-
- /* determine buffer size */
- count = buffer->bf_getreadbuffer(string, 0, &ptr);
- if (count < 0) {
- /* sanity check */
- PyErr_SetString(PyExc_TypeError, "buffer has negative size");
- return NULL;
- }
-
- /* determine character size */
- state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
+ CursorObject* self;
+ PyObject* string;
- count /= state->charsize;
+ /* create match object (with room for extra group marks) */
+ self = PyObject_NEW(CursorObject, &Cursor_Type);
+ if (self == NULL)
+ return NULL;
- /* adjust boundaries */
- if (start < 0)
- start = 0;
- else if (start > count)
- start = count;
-
- if (end < 0)
- end = 0;
- else if (end > count)
- end = count;
-
- state->beginning = ptr;
-
- state->start = (void*) ((char*) ptr + start * state->charsize);
- state->end = (void*) ((char*) ptr + end * state->charsize);
+ string = _setup(&self->state, args);
+ if (!string) {
+ /* FIXME: dealloc cursor object */
+ return NULL;
+ }
- /* FIXME: dynamic! */
- for (i = 0; i < 64; i++)
- state->mark[i] = NULL;
+ Py_INCREF(pattern);
+ self->pattern = pattern;
- state->stack = NULL;
- state->stackbase = 0;
- state->stacksize = 0;
+ Py_INCREF(string);
+ self->string = string;
- return string;
+ return (PyObject*) self;
}
static void
@@ -1002,7 +1083,7 @@ _pattern_dealloc(PatternObject* self)
Py_XDECREF(self->code);
Py_XDECREF(self->pattern);
Py_XDECREF(self->groupindex);
- PyObject_Del(self);
+ PyMem_DEL(self);
}
static PyObject*
@@ -1052,11 +1133,71 @@ _pattern_search(PatternObject* self, PyObject* args)
}
static PyObject*
-_pattern_findall(PatternObject* self, PyObject* args)
+call(char* function, PyObject* args)
+{
+ PyObject* name;
+ PyObject* module;
+ PyObject* func;
+ PyObject* result;
+
+ name = PyString_FromString("sre");
+ if (!name)
+ return NULL;
+ module = PyImport_Import(name);
+ Py_DECREF(name);
+ if (!module)
+ return NULL;
+ func = PyObject_GetAttrString(module, function);
+ Py_DECREF(module);
+ if (!func)
+ return NULL;
+ result = PyObject_CallObject(func, args);
+ Py_DECREF(func);
+ Py_DECREF(args);
+ return result;
+}
+
+static PyObject*
+_pattern_sub(PatternObject* self, PyObject* args)
{
- /* FIXME: not sure about the semantics here. this is good enough
- for SXP, though... */
+ PyObject* template;
+ PyObject* string;
+ PyObject* count;
+ if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
+ return NULL;
+ /* delegate to Python code */
+ return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
+}
+
+static PyObject*
+_pattern_subn(PatternObject* self, PyObject* args)
+{
+ PyObject* template;
+ PyObject* string;
+ PyObject* count;
+ if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
+ return NULL;
+
+ /* delegate to Python code */
+ return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
+}
+
+static PyObject*
+_pattern_split(PatternObject* self, PyObject* args)
+{
+ PyObject* string;
+ PyObject* maxsplit;
+ if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
+ return NULL;
+
+ /* delegate to Python code */
+ return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
+}
+
+static PyObject*
+_pattern_findall(PatternObject* self, PyObject* args)
+{
SRE_STATE state;
PyObject* string;
PyObject* list;
@@ -1077,7 +1218,7 @@ _pattern_findall(PatternObject* self, PyObject* args)
if (state.charsize == 1) {
status = sre_match(&state, PatternObject_GetCode(self));
} else {
- status = sre_umatch(&state, PatternObject_GetCode(self));
+ status = sre_umatch(&state, PatternObject_GetCode(self));
}
if (status >= 0) {
@@ -1085,6 +1226,10 @@ _pattern_findall(PatternObject* self, PyObject* args)
if (status == 0)
state.ptr = (void*) ((char*) state.start + 1);
+ /* FIXME: if one group is defined, slice that group
+ instead. if multiple groups are defined, add tuple
+ containing all slices */
+
item = PySequence_GetSlice(
string,
((char*) state.start - (char*) state.beginning),
@@ -1121,7 +1266,12 @@ error:
static PyMethodDef _pattern_methods[] = {
{"match", (PyCFunction) _pattern_match, 1},
{"search", (PyCFunction) _pattern_search, 1},
+ {"sub", (PyCFunction) _pattern_sub, 1},
+ {"subn", (PyCFunction) _pattern_subn, 1},
+ {"split", (PyCFunction) _pattern_split, 1},
{"findall", (PyCFunction) _pattern_findall, 1},
+ /* experimental */
+ {"cursor", (PyCFunction) _pattern_cursor, 1},
{NULL, NULL}
};
@@ -1142,7 +1292,15 @@ _pattern_getattr(PatternObject* self, char* name)
Py_INCREF(self->pattern);
return self->pattern;
}
-
+
+ if (!strcmp(name, "flags"))
+ return Py_BuildValue("i", self->flags);
+
+ if (!strcmp(name, "groupindex") && self->groupindex) {
+ Py_INCREF(self->groupindex);
+ return self->groupindex;
+ }
+
PyErr_SetString(PyExc_AttributeError, name);
return NULL;
}
@@ -1163,7 +1321,7 @@ _match_dealloc(MatchObject* self)
{
Py_XDECREF(self->string);
Py_DECREF(self->pattern);
- PyObject_Del(self);
+ PyMem_DEL(self);
}
static PyObject*
@@ -1244,6 +1402,8 @@ _match_groups(MatchObject* self, PyObject* args)
PyObject* result;
int index;
+ /* FIXME: <fl> handle default value! */
+
result = PyTuple_New(self->groups-1);
if (!result)
return NULL;
@@ -1269,6 +1429,8 @@ _match_groupdict(MatchObject* self, PyObject* args)
PyObject* keys;
int index;
+ /* FIXME: <fl> handle default value! */
+
result = PyDict_New();
if (!result)
return NULL;
@@ -1367,7 +1529,8 @@ _match_span(MatchObject* self, PyObject* args)
if (self->mark[index*2] < 0) {
Py_INCREF(Py_None);
- return Py_None;
+ Py_INCREF(Py_None);
+ return Py_BuildValue("OO", Py_None, Py_None);
}
return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
@@ -1394,24 +1557,20 @@ _match_getattr(MatchObject* self, char* name)
PyErr_Clear();
- /* attributes! */
+ /* attributes */
if (!strcmp(name, "string")) {
Py_INCREF(self->string);
return self->string;
}
- if (!strcmp(name, "regs"))
- /* FIXME: should return the whole list! */
- return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]);
+
if (!strcmp(name, "re")) {
Py_INCREF(self->pattern);
return (PyObject*) self->pattern;
}
- if (!strcmp(name, "groupindex") && self->pattern->groupindex) {
- Py_INCREF(self->pattern->groupindex);
- return self->pattern->groupindex;
- }
+
if (!strcmp(name, "pos"))
return Py_BuildValue("i", 0); /* FIXME */
+
if (!strcmp(name, "endpos"))
return Py_BuildValue("i", 0); /* FIXME */
@@ -1432,6 +1591,106 @@ statichere PyTypeObject Match_Type = {
(getattrfunc)_match_getattr, /*tp_getattr*/
};
+/* -------------------------------------------------------------------- */
+/* cursor methods (experimental) */
+
+static void
+_cursor_dealloc(CursorObject* self)
+{
+ _stack_free(&self->state);
+ Py_DECREF(self->string);
+ Py_DECREF(self->pattern);
+ PyMem_DEL(self);
+}
+
+static PyObject*
+_cursor_match(CursorObject* self, PyObject* args)
+{
+ SRE_STATE* state = &self->state;
+ PyObject* match;
+ int status;
+
+ state->ptr = state->start;
+
+ if (state->charsize == 1) {
+ status = sre_match(state, PatternObject_GetCode(self->pattern));
+ } else {
+ status = sre_umatch(state, PatternObject_GetCode(self->pattern));
+ }
+
+ match = _pattern_new_match((PatternObject*) self->pattern,
+ state, self->string, status);
+
+ if (status >= 0)
+ state->start = state->ptr;
+ else
+ state->start = (char*) state->ptr + state->charsize;
+
+ return match;
+}
+
+
+static PyObject*
+_cursor_search(CursorObject* self, PyObject* args)
+{
+ SRE_STATE* state = &self->state;
+ PyObject* match;
+ int status;
+
+ state->ptr = state->start;
+
+ if (state->charsize == 1) {
+ status = sre_search(state, PatternObject_GetCode(self->pattern));
+ } else {
+ status = sre_usearch(state, PatternObject_GetCode(self->pattern));
+ }
+
+ match = _pattern_new_match((PatternObject*) self->pattern,
+ state, self->string, status);
+
+ if (status >= 0)
+ state->start = state->ptr;
+
+ return match;
+}
+
+static PyMethodDef _cursor_methods[] = {
+ {"match", (PyCFunction) _cursor_match, 0},
+ {"search", (PyCFunction) _cursor_search, 0},
+ {NULL, NULL}
+};
+
+static PyObject*
+_cursor_getattr(CursorObject* self, char* name)
+{
+ PyObject* res;
+
+ res = Py_FindMethod(_cursor_methods, (PyObject*) self, name);
+ if (res)
+ return res;
+
+ PyErr_Clear();
+
+ /* attributes */
+ if (!strcmp(name, "pattern")) {
+ Py_INCREF(self->pattern);
+ return self->pattern;
+ }
+
+ PyErr_SetString(PyExc_AttributeError, name);
+ return NULL;
+}
+
+statichere PyTypeObject Cursor_Type = {
+ PyObject_HEAD_INIT(NULL)
+ 0, "Cursor",
+ sizeof(CursorObject), /* size of basic object */
+ 0,
+ (destructor)_cursor_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ (getattrfunc)_cursor_getattr, /*tp_getattr*/
+};
+
static PyMethodDef _functions[] = {
{"compile", _compile, 1},
{"getcodesize", _getcodesize, 1},
@@ -1445,7 +1704,8 @@ __declspec(dllexport)
init_sre()
{
/* Patch object types */
- Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type;
+ Pattern_Type.ob_type = Match_Type.ob_type =
+ Cursor_Type.ob_type = &PyType_Type;
Py_InitModule("_sre", _functions);
}
diff --git a/Modules/sre.h b/Modules/sre.h
index 2936b059ec..3664c9d295 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -14,17 +14,18 @@
#include "sre_constants.h"
-/* Python objects */
-
typedef struct {
PyObject_HEAD
PyObject* code; /* link to the code string object */
- PyObject* pattern; /* link to the pattern source (or None) */
int groups;
PyObject* groupindex;
+ /* compatibility */
+ PyObject* pattern; /* pattern source (or None) */
+ int flags; /* flags used when compiling pattern source */
} PatternObject;
-#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code))
+#define PatternObject_GetCode(o)\
+ ((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
typedef struct {
PyObject_HEAD
@@ -34,5 +35,28 @@ typedef struct {
int mark[2];
} MatchObject;
-#endif
+typedef struct {
+ /* string pointers */
+ void* ptr; /* current position (also end of current slice) */
+ void* beginning; /* start of original string */
+ void* start; /* start of current slice */
+ void* end; /* end of original string */
+ /* character size */
+ int charsize;
+ /* registers */
+ int marks;
+ void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
+ /* backtracking stack */
+ void** stack;
+ int stacksize;
+ int stackbase;
+} SRE_STATE;
+typedef struct {
+ PyObject_HEAD
+ PyObject* pattern;
+ PyObject* string;
+ SRE_STATE state;
+} CursorObject;
+
+#endif
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 6b940d303f..c6b123e3db 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -1,4 +1,4 @@
-/* generated by sre_constants.py */
+/* generated from sre_constants.py */
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
@@ -25,3 +25,25 @@
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
+#define SRE_AT_BEGINNING 0
+#define SRE_AT_BEGINNING_LINE 1
+#define SRE_AT_BOUNDARY 2
+#define SRE_AT_NON_BOUNDARY 3
+#define SRE_AT_END 4
+#define SRE_AT_END_LINE 5
+#define SRE_CATEGORY_DIGIT 0
+#define SRE_CATEGORY_NOT_DIGIT 1
+#define SRE_CATEGORY_SPACE 2
+#define SRE_CATEGORY_NOT_SPACE 3
+#define SRE_CATEGORY_WORD 4
+#define SRE_CATEGORY_NOT_WORD 5
+#define SRE_CATEGORY_LINEBREAK 6
+#define SRE_CATEGORY_NOT_LINEBREAK 7
+#define SRE_CATEGORY_LOC_DIGIT 8
+#define SRE_CATEGORY_LOC_NOT_DIGIT 9
+#define SRE_CATEGORY_LOC_SPACE 10
+#define SRE_CATEGORY_LOC_NOT_SPACE 11
+#define SRE_CATEGORY_LOC_WORD 12
+#define SRE_CATEGORY_LOC_NOT_WORD 13
+#define SRE_CATEGORY_LOC_LINEBREAK 14
+#define SRE_CATEGORY_LOC_NOT_LINEBREAK 15