summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_study.c
diff options
context:
space:
mode:
authorNuno Lopes <nlopess@php.net>2007-02-09 19:48:47 +0000
committerNuno Lopes <nlopess@php.net>2007-02-09 19:48:47 +0000
commitb3e66c616dcc1f5d9988d3e485dcd00bbba6fabe (patch)
tree7d627e2f5988d55ae5dd3b76171b94fc9ab0bc7d /ext/pcre/pcrelib/pcre_study.c
parente6d69595afed237cdfe561c9f052efb41f41c622 (diff)
downloadphp-git-b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe.tar.gz
upgrade pcre to version 7.0
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
-rw-r--r--ext/pcre/pcrelib/pcre_study.c117
1 files changed, 81 insertions, 36 deletions
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c
index 051e484aad..87f8c6fb51 100644
--- a/ext/pcre/pcrelib/pcre_study.c
+++ b/ext/pcre/pcrelib/pcre_study.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2007 University of Cambridge
+ Copyright (c) 1997-2006 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -45,6 +45,11 @@ supporting functions. */
#include "pcre_internal.h"
+/* Returns from set_start_bits() */
+
+enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
+
+
/*************************************************
* Set a bit and maybe its alternate case *
*************************************************/
@@ -72,12 +77,16 @@ if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
/*************************************************
-* Create bitmap of starting chars *
+* Create bitmap of starting bytes *
*************************************************/
-/* This function scans a compiled unanchored expression and attempts to build a
-bitmap of the set of initial characters. If it can't, it returns FALSE. As time
-goes by, we may be able to get more clever at doing this.
+/* This function scans a compiled unanchored expression recursively and
+attempts to build a bitmap of the set of possible starting bytes. As time goes
+by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
+useful for parenthesized groups in patterns such as (a*)b where the group
+provides some optional starting bytes but scanning must continue at the outer
+level to find at least one mandatory byte. At the outermost level, this
+function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
@@ -86,14 +95,17 @@ Arguments:
utf8 TRUE if in UTF-8 mode
cd the block with char table pointers
-Returns: TRUE if table built, FALSE otherwise
+Returns: SSB_FAIL => Failed to find any starting bytes
+ SSB_DONE => Found mandatory starting bytes
+ SSB_CONTINUE => Found optional starting bytes
*/
-static BOOL
+static int
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
BOOL utf8, compile_data *cd)
{
register int c;
+int yield = SSB_DONE;
#if 0
/* ========================================================================= */
@@ -114,36 +126,60 @@ volatile int dummy;
do
{
- const uschar *tcode = code + 1 + LINK_SIZE;
+ const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
BOOL try_next = TRUE;
- while (try_next)
+ while (try_next) /* Loop for items in this branch */
{
- /* If a branch starts with a bracket or a positive lookahead assertion,
- recurse to set bits from within them. That's all for this branch. */
-
- if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
+ int rc;
+ switch(*tcode)
{
- if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
- return FALSE;
- try_next = FALSE;
- }
+ /* Fail if we reach something we don't understand */
- else switch(*tcode)
- {
default:
- return FALSE;
+ return SSB_FAIL;
+
+ /* If we hit a bracket or a positive lookahead assertion, recurse to set
+ bits from within the subpattern. If it can't find anything, we have to
+ give up. If it finds some mandatory character(s), we are done for this
+ branch. Otherwise, carry on scanning after the subpattern. */
+
+ case OP_BRA:
+ case OP_SBRA:
+ case OP_CBRA:
+ case OP_SCBRA:
+ case OP_ONCE:
+ case OP_ASSERT:
+ rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
+ if (rc == SSB_FAIL) return SSB_FAIL;
+ if (rc == SSB_DONE) try_next = FALSE; else
+ {
+ do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
+ tcode += 1 + LINK_SIZE;
+ }
+ break;
- /* Skip over callout */
+ /* If we hit ALT or KET, it means we haven't found anything mandatory in
+ this branch, though we might have found something optional. For ALT, we
+ continue with the next alternative, but we have to arrange that the final
+ result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
+ return SSB_CONTINUE: if this is the top level, that indicates failure,
+ but after a nested subpattern, it causes scanning to continue. */
- case OP_CALLOUT:
- tcode += 2 + 2*LINK_SIZE;
+ case OP_ALT:
+ yield = SSB_CONTINUE;
+ try_next = FALSE;
break;
- /* Skip over extended extraction bracket number */
+ case OP_KET:
+ case OP_KETRMAX:
+ case OP_KETRMIN:
+ return SSB_CONTINUE;
- case OP_BRANUMBER:
- tcode += 3;
+ /* Skip over callout */
+
+ case OP_CALLOUT:
+ tcode += 2 + 2*LINK_SIZE;
break;
/* Skip over lookbehind and negative lookahead assertions */
@@ -152,7 +188,7 @@ do
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
- tcode += 1+LINK_SIZE;
+ tcode += 1 + LINK_SIZE;
break;
/* Skip over an option setting, changing the caseless flag */
@@ -166,27 +202,30 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
- if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
- return FALSE;
+ if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
+ return SSB_FAIL;
/* =========================================================================
See the comment at the head of this function concerning the next line,
which was an old fudge for the benefit of OS/2.
dummy = 1;
========================================================================= */
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
- tcode += 1+LINK_SIZE;
+ tcode += 1 + LINK_SIZE;
break;
/* Single-char * or ? sets the bit and tries the next item */
case OP_STAR:
case OP_MINSTAR:
+ case OP_POSSTAR:
case OP_QUERY:
case OP_MINQUERY:
+ case OP_POSQUERY:
set_bit(start_bits, tcode[1], caseless, cd);
tcode += 2;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+ if (utf8 && tcode[-1] >= 0xc0)
+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
break;
@@ -194,10 +233,12 @@ do
case OP_UPTO:
case OP_MINUPTO:
+ case OP_POSUPTO:
set_bit(start_bits, tcode[3], caseless, cd);
tcode += 4;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+ if (utf8 && tcode[-1] >= 0xc0)
+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
break;
@@ -210,6 +251,7 @@ do
case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
+ case OP_POSPLUS:
set_bit(start_bits, tcode[1], caseless, cd);
try_next = FALSE;
break;
@@ -283,16 +325,19 @@ do
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
+ case OP_TYPEPOSUPTO:
tcode += 2; /* Fall through */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
+ case OP_TYPEPOSSTAR:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
+ case OP_TYPEPOSQUERY:
switch(tcode[1])
{
case OP_ANY:
- return FALSE;
+ return SSB_FAIL;
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
@@ -418,7 +463,7 @@ do
code += GET(code, 1); /* Advance to next branch */
}
while (*code == OP_ALT);
-return TRUE;
+return yield;
}
@@ -492,8 +537,8 @@ compile_block.ctypes = tables + ctypes_offset;
/* See if we can find a fixed set of initial characters for the pattern. */
memset(start_bits, 0, 32 * sizeof(uschar));
-if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
- (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
+if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
+ (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
the latter, which is pointed to by the former, which may also get additional