diff options
author | Scott MacVicar <scottmac@php.net> | 2009-11-03 12:15:03 +0000 |
---|---|---|
committer | Scott MacVicar <scottmac@php.net> | 2009-11-03 12:15:03 +0000 |
commit | f03b175f7cb5b1ffc0b4f59d83836beb3794ebfd (patch) | |
tree | 30ce6bf2f30beb84bbbc70d257a2666d032e0398 /ext/pcre/pcrelib/pcre_study.c | |
parent | 26e3082abca953a2add5ae02b6fd3f6f301e00ec (diff) | |
download | php-git-f03b175f7cb5b1ffc0b4f59d83836beb3794ebfd.tar.gz |
Update PCRE to 8.00
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_study.c | 424 |
1 files changed, 402 insertions, 22 deletions
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c index 226cc65941..a2e9f44faa 100644 --- a/ext/pcre/pcrelib/pcre_study.c +++ b/ext/pcre/pcrelib/pcre_study.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -52,6 +52,364 @@ supporting functions. */ enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; + +/************************************************* +* Find the minimum subject length for a group * +*************************************************/ + +/* Scan a parenthesized group and compute the minimum length of subject that +is needed to match it. This is a lower bound; it does not mean there is a +string of that length that matches. In UTF8 mode, the result is in characters +rather than bytes. + +Arguments: + code pointer to start of group (the bracket) + startcode pointer to start of the whole pattern + options the compiling options + +Returns: the minimum length + -1 if \C was encountered + -2 internal error (missing capturing bracket) +*/ + +static int +find_minlength(const uschar *code, const uschar *startcode, int options) +{ +int length = -1; +BOOL utf8 = (options & PCRE_UTF8) != 0; +BOOL had_recurse = FALSE; +register int branchlength = 0; +register uschar *cc = (uschar *)code + 1 + LINK_SIZE; + +if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2; + +/* Scan along the opcodes for this branch. If we get to the end of the +branch, check the length against that of the other branches. */ + +for (;;) + { + int d, min; + uschar *cs, *ce; + register int op = *cc; + + switch (op) + { + case OP_CBRA: + case OP_SCBRA: + case OP_BRA: + case OP_SBRA: + case OP_ONCE: + case OP_COND: + case OP_SCOND: + d = find_minlength(cc, startcode, options); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* Reached end of a branch; if it's a ket it is the end of a nested + call. If it's ALT it is an alternation in a nested call. If it is + END it's the end of the outer call. All can be handled by the same code. */ + + case OP_ALT: + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_END: + if (length < 0 || (!had_recurse && branchlength < length)) + length = branchlength; + if (*cc != OP_ALT) return length; + cc += 1 + LINK_SIZE; + branchlength = 0; + had_recurse = FALSE; + break; + + /* Skip over assertive subpatterns */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do cc += GET(cc, 1); while (*cc == OP_ALT); + /* Fall through */ + + /* Skip over things that don't match chars */ + + case OP_REVERSE: + case OP_CREF: + case OP_NCREF: + case OP_RREF: + case OP_NRREF: + case OP_DEF: + case OP_OPT: + case OP_CALLOUT: + case OP_SOD: + case OP_SOM: + case OP_EOD: + case OP_EODN: + case OP_CIRC: + case OP_DOLL: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + cc += _pcre_OP_lengths[*cc]; + break; + + /* Skip over a subpattern that has a {0} or {0,x} quantifier */ + + case OP_BRAZERO: + case OP_BRAMINZERO: + case OP_SKIPZERO: + cc += _pcre_OP_lengths[*cc]; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* Handle literal characters and + repetitions */ + + case OP_CHAR: + case OP_CHARNC: + case OP_NOT: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSPLUS: + branchlength++; + cc += 2; +#ifdef SUPPORT_UTF8 + if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#endif + break; + + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + branchlength++; + cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; + break; + + /* Handle exact repetitions. The count is already in characters, but we + need to skip over a multibyte character in UTF8 mode. */ + + case OP_EXACT: + case OP_NOTEXACT: + branchlength += GET2(cc,1); + cc += 4; +#ifdef SUPPORT_UTF8 + if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#endif + break; + + case OP_TYPEEXACT: + branchlength += GET2(cc,1); + cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4; + break; + + /* Handle single-char non-literal matchers */ + + case OP_PROP: + case OP_NOTPROP: + cc += 2; + /* Fall through */ + + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + case OP_ALLANY: + case OP_EXTUNI: + case OP_HSPACE: + case OP_NOT_HSPACE: + case OP_VSPACE: + case OP_NOT_VSPACE: + branchlength++; + cc++; + break; + + /* "Any newline" might match two characters */ + + case OP_ANYNL: + branchlength += 2; + cc++; + break; + + /* The single-byte matcher means we can't proceed in UTF-8 mode */ + + case OP_ANYBYTE: +#ifdef SUPPORT_UTF8 + if (utf8) return -1; +#endif + branchlength++; + cc++; + break; + + /* For repeated character types, we have to test for \p and \P, which have + an extra two bytes of parameters. */ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSQUERY: + if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; + cc += _pcre_OP_lengths[op]; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; + cc += _pcre_OP_lengths[op]; + break; + + /* Check a class for variable quantification */ + +#ifdef SUPPORT_UTF8 + case OP_XCLASS: + cc += GET(cc, 1) - 33; + /* Fall through */ +#endif + + case OP_CLASS: + case OP_NCLASS: + cc += 33; + + switch (*cc) + { + case OP_CRPLUS: + case OP_CRMINPLUS: + branchlength++; + /* Fall through */ + + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + branchlength += GET2(cc,1); + cc += 5; + break; + + default: + branchlength++; + break; + } + break; + + /* Backreferences and subroutine calls are treated in the same way: we find + the minimum length for the subpattern. A recursion, however, causes an + a flag to be set that causes the length of this branch to be ignored. The + logic is that a recursion can only make sense if there is another + alternation that stops the recursing. That will provide the minimum length + (when no recursion happens). A backreference within the group that it is + referencing behaves in the same way. + + If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket + matches an empty string (by default it causes a matching failure), so in + that case we must set the minimum length to zero. */ + + case OP_REF: + if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) + { + ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1)); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) + { + d = 0; + had_recurse = TRUE; + } + else d = find_minlength(cs, startcode, options); + } + else d = 0; + cc += 3; + + /* Handle repeated back references */ + + switch (*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + min = 0; + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + min = GET2(cc, 1); + cc += 5; + break; + + default: + min = 1; + break; + } + + branchlength += min * d; + break; + + case OP_RECURSE: + cs = ce = (uschar *)startcode + GET(cc, 1); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) + had_recurse = TRUE; + else + branchlength += find_minlength(cs, startcode, options); + cc += 1 + LINK_SIZE; + break; + + /* Anything else does not or need not match a character. We can get the + item's length from the table, but for those that can match zero occurrences + of a character, we must take special action for UTF-8 characters. */ + + case OP_UPTO: + case OP_NOTUPTO: + case OP_MINUPTO: + case OP_NOTMINUPTO: + case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: + case OP_NOTMINSTAR: + case OP_POSSTAR: + case OP_NOTPOSSTAR: + case OP_QUERY: + case OP_MINQUERY: + case OP_NOTMINQUERY: + case OP_POSQUERY: + case OP_NOTPOSQUERY: + cc += _pcre_OP_lengths[op]; +#ifdef SUPPORT_UTF8 + if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#endif + break; + + /* For the record, these are the opcodes that are matched by "default": + OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP, + OP_THEN. */ + + default: + cc += _pcre_OP_lengths[op]; + break; + } + } +/* Control never gets here */ +} + + + /************************************************* * Set a bit and maybe its alternate case * *************************************************/ @@ -498,13 +856,15 @@ Arguments: set NULL unless error Returns: pointer to a pcre_extra block, with study_data filled in and the - appropriate flag set; + appropriate flags set; NULL on error or if no optimization possible */ PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION pcre_study(const pcre *external_re, int options, const char **errorptr) { +int min; +BOOL bits_set = FALSE; uschar start_bits[32]; pcre_extra *extra; pcre_study_data *study; @@ -531,30 +891,39 @@ code = (uschar *)re + re->name_table_offset + (re->name_count * re->name_entry_size); /* For an anchored pattern, or an unanchored pattern that has a first char, or -a multiline pattern that matches only at "line starts", no further processing -at present. */ +a multiline pattern that matches only at "line starts", there is no point in +seeking a list of starting bytes. */ -if ((re->options & PCRE_ANCHORED) != 0 || - (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) - return NULL; +if ((re->options & PCRE_ANCHORED) == 0 && + (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) + { + /* Set the character tables in the block that is passed around */ -/* Set the character tables in the block that is passed around */ + tables = re->tables; + if (tables == NULL) + (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); -tables = re->tables; -if (tables == NULL) - (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, - (void *)(&tables)); + compile_block.lcc = tables + lcc_offset; + compile_block.fcc = tables + fcc_offset; + compile_block.cbits = tables + cbits_offset; + compile_block.ctypes = tables + ctypes_offset; -compile_block.lcc = tables + lcc_offset; -compile_block.fcc = tables + fcc_offset; -compile_block.cbits = tables + cbits_offset; -compile_block.ctypes = tables + ctypes_offset; + /* See if we can find a fixed set of initial characters for the pattern. */ + + memset(start_bits, 0, 32 * sizeof(uschar)); + bits_set = set_start_bits(code, start_bits, + (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0, + &compile_block) == SSB_DONE; + } -/* See if we can find a fixed set of initial characters for the pattern. */ +/* Find the minimum length of subject string. */ -memset(start_bits, 0, 32 * sizeof(uschar)); -if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, - (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; +min = find_minlength(code, code, re->options); + +/* Return NULL if no optimization is possible. */ + +if (!bits_set && min < 0) return NULL; /* Get a pcre_extra block and a pcre_study_data block. The study data is put in the latter, which is pointed to by the former, which may also get additional @@ -577,8 +946,19 @@ extra->flags = PCRE_EXTRA_STUDY_DATA; extra->study_data = study; study->size = sizeof(pcre_study_data); -study->options = PCRE_STUDY_MAPPED; -memcpy(study->start_bits, start_bits, sizeof(start_bits)); +study->flags = 0; + +if (bits_set) + { + study->flags |= PCRE_STUDY_MAPPED; + memcpy(study->start_bits, start_bits, sizeof(start_bits)); + } + +if (min >= 0) + { + study->flags |= PCRE_STUDY_MINLEN; + study->minlength = min; + } return extra; } |