upgrade pcre to version 7.0

author: Nuno Lopes <nlopess@php.net> 2007-02-09 19:48:47 +0000
committer: Nuno Lopes <nlopess@php.net> 2007-02-09 19:48:47 +0000
commit: b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe (patch)
tree: 7d627e2f5988d55ae5dd3b76171b94fc9ab0bc7d /ext/pcre/pcrelib/pcre_exec.c
parent: e6d69595afed237cdfe561c9f052efb41f41c622 (diff)
download: php-git-b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe.tar.gz
1 files changed, 580 insertions, 314 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c
index a67855bfef..890e0f731a 100644
--- a/ext/pcre/pcrelib/pcre_exec.c
+++ b/ext/pcre/pcrelib/pcre_exec.c
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2007 University of Cambridge
+           Copyright (c) 1997-2006 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -42,25 +42,22 @@ POSSIBILITY OF SUCH DAMAGE.
 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
 possible. There are also some static supporting functions. */
 
-#define NLBLOCK md           /* The block containing newline information */
-#include "pcre_internal.h"
+#define NLBLOCK md             /* Block containing newline information */
+#define PSSTART start_subject  /* Field containing processed string start */
+#define PSEND   end_subject    /* Field containing processed string end */
 
+#include "pcre_internal.h"
 
-/* Structure for building a chain of data that actually lives on the
-stack, for holding the values of the subject pointer at the start of each
-subpattern, so as to detect when an empty string has been matched by a
-subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
-are on the heap, not on the stack. */
+/* The chain of eptrblocks for tail recursions uses memory in stack workspace,
+obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
 
-typedef struct eptrblock {
-  struct eptrblock *epb_prev;
-  USPTR epb_saved_eptr;
-} eptrblock;
+#define EPTR_WORK_SIZE (1000)
 
 /* Flag bits for the match() function */
 
-#define match_condassert   0x01    /* Called to check a condition assertion */
-#define match_isgroup      0x02    /* Set if start of bracketed group */
+#define match_condassert     0x01  /* Called to check a condition assertion */
+#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
+#define match_tail_recursed  0x04  /* Tail recursive call */
 
 /* Non-error returns from the match() function. Error returns are externally
 defined PCRE_ERROR_xxx codes, which are all negative. */
@@ -101,7 +98,7 @@ Returns:     nothing
 static void
 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 {
-int c;
+unsigned int c;
 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 while (length-- > 0)
   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
@@ -291,7 +288,6 @@ typedef struct heapframe {
 
   BOOL Xcur_is_word;
   BOOL Xcondition;
-  BOOL Xminimize;
   BOOL Xprev_is_word;
 
   unsigned long int Xoriginal_ims;
@@ -303,11 +299,10 @@ typedef struct heapframe {
   int Xprop_category;
   int Xprop_chartype;
   int Xprop_script;
-  int *Xprop_test_variable;
 #endif
 
   int Xctype;
-  int Xfc;
+  unsigned int Xfc;
   int Xfi;
   int Xlength;
   int Xmax;
@@ -340,10 +335,7 @@ typedef struct heapframe {
 *         Match from current position            *
 *************************************************/
 
-/* On entry ecode points to the first opcode, and eptr to the first character
-in the subject string, while eptrb holds the value of eptr at the start of the
-last bracketed group - used for breaking infinite loops matching zero-length
-strings. This function is called recursively in many circumstances. Whenever it
+/* This function is called recursively in many circumstances. Whenever it
 returns a negative (error) response, the outer incarnation must also return the
 same response.
 
@@ -353,8 +345,8 @@ performance. Tests using gcc on a SPARC disproved this; in the first case, it
 made performance worse.
 
 Arguments:
-   eptr        pointer in subject
-   ecode       position in code
+   eptr        pointer to current character in subject
+   ecode       pointer to current position in compiled code
    offset_top  current top pointer
    md          pointer to "static" info for the match
    ims         current /i, /m, and /s options
@@ -362,7 +354,9 @@ Arguments:
                  brackets - for testing for empty matches
    flags       can contain
                  match_condassert - this is an assertion condition
-                 match_isgroup - this is the start of a bracketed group
+                 match_cbegroup - this is the start of an unlimited repeat
+                   group that can match an empty string
+                 match_tail_recursed - this is a tail_recursed group
    rdepth      the recursion depth
 
 Returns:       MATCH_MATCH if matched            )  these values are >= 0
@@ -377,14 +371,16 @@ match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
   int flags, unsigned int rdepth)
 {
 /* These variables do not need to be preserved over recursion in this function,
-so they can be ordinary variables in all cases. Mark them with "register"
-because they are used a lot in loops. */
+so they can be ordinary variables in all cases. Mark some of them with
+"register" because they are used a lot in loops. */
 
 register int  rrc;         /* Returns from recursive calls */
 register int  i;           /* Used for loops not involving calls to RMATCH() */
-register unsigned int  c;  /* Character values not kept over RMATCH() calls */
+register unsigned int c;   /* Character values not kept over RMATCH() calls */
 register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
 
+BOOL minimize, possessive; /* Quantifier options */
+
 /* When recursion is not being used, all "local" variables that have to be
 preserved over calls to RMATCH() are part of a "frame" which is obtained from
 heap storage. Set up the top-level frame here; others are obtained from the
@@ -434,7 +430,6 @@ HEAP_RECURSE:
 
 #define cur_is_word        frame->Xcur_is_word
 #define condition          frame->Xcondition
-#define minimize           frame->Xminimize
 #define prev_is_word       frame->Xprev_is_word
 
 #define original_ims       frame->Xoriginal_ims
@@ -446,7 +441,6 @@ HEAP_RECURSE:
 #define prop_category      frame->Xprop_category
 #define prop_chartype      frame->Xprop_chartype
 #define prop_script        frame->Xprop_script
-#define prop_test_variable frame->Xprop_test_variable
 #endif
 
 #define ctype              frame->Xctype
@@ -470,7 +464,7 @@ HEAP_RECURSE:
 get preserved during recursion in the normal way. In this environment, fi and
 i, and fc and c, can be the same variables. */
 
-#else
+#else         /* NO_RECURSE not defined */
 #define fi i
 #define fc c
 
@@ -489,7 +483,6 @@ recursion_info new_recursive;      /* within blocks below are for variables  */
                                    /* that do not have to be preserved over  */
 BOOL cur_is_word;                  /* a recursive call to RMATCH().          */
 BOOL condition;
-BOOL minimize;
 BOOL prev_is_word;
 
 unsigned long int original_ims;
@@ -501,7 +494,6 @@ int prop_fail_result;
 int prop_category;
 int prop_chartype;
 int prop_script;
-int *prop_test_variable;
 #endif
 
 int ctype;
@@ -516,7 +508,7 @@ int save_offset1, save_offset2, save_offset3;
 int stacksave[REC_STACK_SAVE_MAX];
 
 eptrblock newptrb;
-#endif
+#endif     /* NO_RECURSE */
 
 /* These statements are here to stop the compiler complaining about unitialized
 variables. */
@@ -524,9 +516,9 @@ variables. */
 #ifdef SUPPORT_UCP
 prop_value = 0;
 prop_fail_result = 0;
-prop_test_variable = NULL;
 #endif
 
+
 /* This label is used for tail recursion, which is used in a few cases even
 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
 used. Thanks to Ian Taylor for noticing this possibility and sending the
@@ -556,24 +548,34 @@ utf8 = md->utf8;       /* Local copy of the flag */
 utf8 = FALSE;
 #endif
 
-/* At the start of a bracketed group, add the current subject pointer to the
-stack of such pointers, to be re-instated at the end of the group when we hit
-the closing ket. When match() is called in other circumstances, we don't add to
-this stack. */
+/* At the start of a group with an unlimited repeat that may match an empty
+string, the match_cbegroup flag is set. When this is the case, add the current
+subject pointer to the chain of such remembered pointers, to be checked when we
+hit the closing ket, in order to break infinite loops that match no characters.
+When match() is called in other circumstances, don't add to the chain. If this
+is a tail recursion, use a block from the workspace, as the one on the stack is
+already used. */
 
-if ((flags & match_isgroup) != 0)
+if ((flags & match_cbegroup) != 0)
   {
-  newptrb.epb_prev = eptrb;
-  newptrb.epb_saved_eptr = eptr;
-  eptrb = &newptrb;
+  eptrblock *p;
+  if ((flags & match_tail_recursed) != 0)
+    {
+    if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
+    p = md->eptrchain + md->eptrn++;
+    }
+  else p = &newptrb;
+  p->epb_saved_eptr = eptr;
+  p->epb_prev = eptrb;
+  eptrb = p;
   }
 
-/* Now start processing the operations. */
+/* Now start processing the opcodes. */
 
 for (;;)
   {
+  minimize = possessive = FALSE;
   op = *ecode;
-  minimize = FALSE;
 
   /* For partial matching, remember if we ever hit the end of the subject after
   matching at least one subject character. */
@@ -583,33 +585,30 @@ for (;;)
       eptr > md->start_match)
     md->hitend = TRUE;
 
-  /* Opening capturing bracket. If there is space in the offset vector, save
-  the current subject position in the working slot at the top of the vector. We
-  mustn't change the current values of the data slot, because they may be set
-  from a previous iteration of this group, and be referred to by a reference
-  inside the group.
-
-  If the bracket fails to match, we need to restore this value and also the
-  values of the final offsets, in case they were set by a previous iteration of
-  the same bracket.
-
-  If there isn't enough space in the offset vector, treat this as if it were a
-  non-capturing bracket. Don't worry about setting the flag for the error case
-  here; that is handled in the code for KET. */
-
-  if (op > OP_BRA)
+  switch(op)
     {
-    number = op - OP_BRA;
-
-    /* For extended extraction brackets (large number), we have to fish out the
-    number from a dummy opcode at the start. */
-
-    if (number > EXTRACT_BASIC_MAX)
-      number = GET2(ecode, 2+LINK_SIZE);
+    /* Handle a capturing bracket. If there is space in the offset vector, save
+    the current subject position in the working slot at the top of the vector.
+    We mustn't change the current values of the data slot, because they may be
+    set from a previous iteration of this group, and be referred to by a
+    reference inside the group.
+
+    If the bracket fails to match, we need to restore this value and also the
+    values of the final offsets, in case they were set by a previous iteration
+    of the same bracket.
+
+    If there isn't enough space in the offset vector, treat this as if it were
+    a non-capturing bracket. Don't worry about setting the flag for the error
+    case here; that is handled in the code for KET. */
+
+    case OP_CBRA:
+    case OP_SCBRA:
+    number = GET2(ecode, 1+LINK_SIZE);
     offset = number << 1;
 
 #ifdef DEBUG
-    printf("start bracket %d subject=", number);
+    printf("start bracket %d\n", number);
+    printf("subject=");
     pchars(eptr, 16, TRUE, md);
     printf("\n");
 #endif
@@ -624,10 +623,11 @@ for (;;)
       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
 
+      flags = (op == OP_SCBRA)? match_cbegroup : 0;
       do
         {
-        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
-          match_isgroup);
+        RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+          ims, eptrb, flags);
         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
         md->capture_last = save_capture_last;
         ecode += GET(ecode, 1);
@@ -643,39 +643,35 @@ for (;;)
       RRETURN(MATCH_NOMATCH);
       }
 
-    /* Insufficient room for saving captured contents */
+    /* Insufficient room for saving captured contents. Treat as a non-capturing
+    bracket. */
 
-    else op = OP_BRA;
-    }
+    DPRINTF(("insufficient capture room: treat as non-capturing\n"));
 
-  /* Other types of node can be handled by a switch */
-
-  switch(op)
-    {
-    case OP_BRA:     /* Non-capturing bracket: optimized */
-    DPRINTF(("start bracket 0\n"));
-
-    /* Loop for all the alternatives */
+    /* Non-capturing bracket. Loop for all the alternatives. When we get to the
+    final alternative within the brackets, we would return the result of a
+    recursive call to match() whatever happened. We can reduce stack usage by
+    turning this into a tail recursion. */
 
+    case OP_BRA:
+    case OP_SBRA:
+    DPRINTF(("start non-capturing bracket\n"));
+    flags = (op >= OP_SBRA)? match_cbegroup : 0;
     for (;;)
       {
-      /* When we get to the final alternative within the brackets, we would
-      return the result of a recursive call to match() whatever happened. We
-      can reduce stack usage by turning this into a tail recursion. */
-
       if (ecode[GET(ecode, 1)] != OP_ALT)
-       {
-       ecode += 1 + LINK_SIZE;
-       flags = match_isgroup;
-       DPRINTF(("bracket 0 tail recursion\n"));
-       goto TAIL_RECURSE;
-       }
+        {
+        ecode += _pcre_OP_lengths[*ecode];
+        flags |= match_tail_recursed;
+        DPRINTF(("bracket 0 tail recursion\n"));
+        goto TAIL_RECURSE;
+        }
 
       /* For non-final alternatives, continue the loop for a NOMATCH result;
       otherwise return. */
 
-      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
-        match_isgroup);
+      RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
+        eptrb, flags);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += GET(ecode, 1);
       }
@@ -688,54 +684,72 @@ for (;;)
     obeyed, we can use tail recursion to avoid using another stack frame. */
 
     case OP_COND:
-    if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
+    case OP_SCOND:
+    if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
+      {
+      offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
+      condition = md->recursive != NULL &&
+        (offset == RREF_ANY || offset == md->recursive->group_num);
+      ecode += condition? 3 : GET(ecode, 1);
+      }
+
+    else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */
       {
       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
-      condition = (offset == CREF_RECURSE * 2)?
-        (md->recursive != NULL) :
-        (offset < offset_top && md->offset_vector[offset] >= 0);
-      ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
-      flags = match_isgroup;
-      goto TAIL_RECURSE;
+      condition = offset < offset_top && md->offset_vector[offset] >= 0;
+      ecode += condition? 3 : GET(ecode, 1);
+      }
+
+    else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */
+      {
+      condition = FALSE;
+      ecode += GET(ecode, 1);
       }
 
     /* The condition is an assertion. Call match() to evaluate it - setting
-    the final argument TRUE causes it to stop at the end of an assertion. */
+    the final argument match_condassert causes it to stop at the end of an
+    assertion. */
 
     else
       {
       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
-          match_condassert | match_isgroup);
+          match_condassert);
       if (rrc == MATCH_MATCH)
         {
-        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
+        condition = TRUE;
+        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
         }
       else if (rrc != MATCH_NOMATCH)
         {
         RRETURN(rrc);         /* Need braces because of following else */
         }
-      else ecode += GET(ecode, 1);
+      else
+        {
+        condition = FALSE;
+        ecode += GET(ecode, 1);
+        }
+      }
 
-      /* We are now at the branch that is to be obeyed. As there is only one,
-      we can use tail recursion to avoid using another stack frame. */
+    /* We are now at the branch that is to be obeyed. As there is only one,
+    we can use tail recursion to avoid using another stack frame. If the second
+    alternative doesn't exist, we can just plough on. */
 
+    if (condition || *ecode == OP_ALT)
+      {
       ecode += 1 + LINK_SIZE;
-      flags = match_isgroup;
+      flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
       goto TAIL_RECURSE;
       }
-    /* Control never reaches here */
-
-    /* Skip over conditional reference or large extraction number data if
-    encountered. */
-
-    case OP_CREF:
-    case OP_BRANUMBER:
-    ecode += 3;
+    else
+      {
+      ecode += 1 + LINK_SIZE;
+      }
     break;
 
-    /* End of the pattern. If we are in a recursion, we should restore the
-    offsets appropriately and continue from after the call. */
+
+    /* End of the pattern. If we are in a top-level recursion, we should
+    restore the offsets appropriately and continue from after the call. */
 
     case OP_END:
     if (md->recursive != NULL && md->recursive->group_num == 0)
@@ -777,8 +791,7 @@ for (;;)
     case OP_ASSERTBACK:
     do
       {
-      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
-        match_isgroup);
+      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
       if (rrc == MATCH_MATCH) break;
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += GET(ecode, 1);
@@ -804,8 +817,7 @@ for (;;)
     case OP_ASSERTBACK_NOT:
     do
       {
-      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
-        match_isgroup);
+      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += GET(ecode,1);
@@ -826,8 +838,8 @@ for (;;)
 #ifdef SUPPORT_UTF8
     if (utf8)
       {
-      c = GET(ecode,1);
-      for (i = 0; i < c; i++)
+      i = GET(ecode, 1);
+      while (i-- > 0)
         {
         eptr--;
         if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
@@ -840,7 +852,7 @@ for (;;)
     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
 
       {
-      eptr -= GET(ecode,1);
+      eptr -= GET(ecode, 1);
       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
       }
 
@@ -897,13 +909,8 @@ for (;;)
     case OP_RECURSE:
       {
       callpat = md->start_code + GET(ecode, 1);
-      new_recursive.group_num = *callpat - OP_BRA;
-
-      /* For extended extraction brackets (large number), we have to fish out
-      the number from a dummy opcode at the start. */
-
-      if (new_recursive.group_num > EXTRACT_BASIC_MAX)
-        new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
+      new_recursive.group_num = (callpat == md->start_code)? 0 :
+        GET2(callpat, 1 + LINK_SIZE);
 
       /* Add to "recursing stack" */
 
@@ -936,10 +943,11 @@ for (;;)
       restore the offset and recursion data. */
 
       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
+      flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
       do
         {
-        RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
-            eptrb, match_isgroup);
+        RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
+          md, ims, eptrb, flags);
         if (rrc == MATCH_MATCH)
           {
           DPRINTF(("Recursion matched\n"));
@@ -983,7 +991,7 @@ for (;;)
     do
       {
       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
-        eptrb, match_isgroup);
+        eptrb, 0);
       if (rrc == MATCH_MATCH) break;
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += GET(ecode,1);
@@ -997,7 +1005,7 @@ for (;;)
     /* Continue as from after the assertion, updating the offsets high water
     mark, since extracts may have been taken. */
 
-    do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+    do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
 
     offset_top = md->end_offset_top;
     eptr = md->end_match_ptr;
@@ -1031,15 +1039,15 @@ for (;;)
       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode = prev;
-      flags = match_isgroup;
+      flags = match_tail_recursed;
       goto TAIL_RECURSE;
       }
     else  /* OP_KETRMAX */
       {
-      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += 1 + LINK_SIZE;
-      flags = 0;
+      flags = match_tail_recursed;
       goto TAIL_RECURSE;
       }
     /* Control never gets here */
@@ -1060,38 +1068,44 @@ for (;;)
     case OP_BRAZERO:
       {
       next = ecode+1;
-      RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
+      RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       do next += GET(next,1); while (*next == OP_ALT);
-      ecode = next + 1+LINK_SIZE;
+      ecode = next + 1 + LINK_SIZE;
       }
     break;
 
     case OP_BRAMINZERO:
       {
       next = ecode+1;
-      do next += GET(next,1); while (*next == OP_ALT);
-      RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
-        match_isgroup);
+      do next += GET(next, 1); while (*next == OP_ALT);
+      RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode++;
       }
     break;
 
-    /* End of a group, repeated or non-repeating. If we are at the end of
-    an assertion "group", stop matching and return MATCH_MATCH, but record the
-    current high water mark for use by positive assertions. Do this also
-    for the "once" (not-backup up) groups. */
+    /* End of a group, repeated or non-repeating. */
 
     case OP_KET:
     case OP_KETRMIN:
     case OP_KETRMAX:
     prev = ecode - GET(ecode, 1);
-    saved_eptr = eptrb->epb_saved_eptr;
 
-    /* Back up the stack of bracket start pointers. */
+    /* If this was a group that remembered the subject start, in order to break
+    infinite repeats of empty string matches, retrieve the subject start from
+    the chain. Otherwise, set it NULL. */
+
+    if (*prev >= OP_SBRA)
+      {
+      saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */
+      eptrb = eptrb->epb_prev;              /* Backup to previous group */
+      }
+    else saved_eptr = NULL;
 
-    eptrb = eptrb->epb_prev;
+    /* If we are at the end of an assertion group, stop matching and return
+    MATCH_MATCH, but record the current high water mark for use by positive
+    assertions. Do this also for the "once" (atomic) groups. */
 
     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
         *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
@@ -1102,18 +1116,15 @@ for (;;)
       RRETURN(MATCH_MATCH);
       }
 
-    /* In all other cases except a conditional group we have to check the
-    group number back at the start and if necessary complete handling an
-    extraction by setting the offsets and bumping the high water mark. */
+    /* For capturing groups we have to check the group number back at the start
+    and if necessary complete handling an extraction by setting the offsets and
+    bumping the high water mark. Note that whole-pattern recursion is coded as
+    a recurse into group 0, so it won't be picked up here. Instead, we catch it
+    when the OP_END is reached. Other recursion is handled here. */
 
-    if (*prev != OP_COND)
+    if (*prev == OP_CBRA || *prev == OP_SCBRA)
       {
-      number = *prev - OP_BRA;
-
-      /* For extended extraction brackets (large number), we have to fish out
-      the number from a dummy opcode at the start. */
-
-      if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
+      number = GET2(prev, 1+LINK_SIZE);
       offset = number << 1;
 
 #ifdef DEBUG
@@ -1121,42 +1132,34 @@ for (;;)
       printf("\n");
 #endif
 
-      /* Test for a numbered group. This includes groups called as a result
-      of recursion. Note that whole-pattern recursion is coded as a recurse
-      into group 0, so it won't be picked up here. Instead, we catch it when
-      the OP_END is reached. */
-
-      if (number > 0)
+      md->capture_last = number;
+      if (offset >= md->offset_max) md->offset_overflow = TRUE; else
         {
-        md->capture_last = number;
-        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
-          {
-          md->offset_vector[offset] =
-            md->offset_vector[md->offset_end - number];
-          md->offset_vector[offset+1] = eptr - md->start_subject;
-          if (offset_top <= offset) offset_top = offset + 2;
-          }
+        md->offset_vector[offset] =
+          md->offset_vector[md->offset_end - number];
+        md->offset_vector[offset+1] = eptr - md->start_subject;
+        if (offset_top <= offset) offset_top = offset + 2;
+        }
 
-        /* Handle a recursively called group. Restore the offsets
-        appropriately and continue from after the call. */
+      /* Handle a recursively called group. Restore the offsets
+      appropriately and continue from after the call. */
 
-        if (md->recursive != NULL && md->recursive->group_num == number)
-          {
-          recursion_info *rec = md->recursive;
-          DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
-          md->recursive = rec->prevrec;
-          md->start_match = rec->save_start;
-          memcpy(md->offset_vector, rec->offset_save,
-            rec->saved_max * sizeof(int));
-          ecode = rec->after_call;
-          ims = original_ims;
-          break;
-          }
+      if (md->recursive != NULL && md->recursive->group_num == number)
+        {
+        recursion_info *rec = md->recursive;
+        DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
+        md->recursive = rec->prevrec;
+        md->start_match = rec->save_start;
+        memcpy(md->offset_vector, rec->offset_save,
+          rec->saved_max * sizeof(int));
+        ecode = rec->after_call;
+        ims = original_ims;
+        break;
         }
       }
 
-    /* Reset the value of the ims flags, in case they got changed during
-    the group. */
+    /* For both capturing and non-capturing groups, reset the value of the ims
+    flags, in case they got changed during the group. */
 
     ims = original_ims;
     DPRINTF(("ims reset to %02lx\n", ims));
@@ -1177,20 +1180,22 @@ for (;;)
     preceding bracket, in the appropriate order. In the second case, we can use
     tail recursion to avoid using another stack frame. */
 
+    flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
+
     if (*ecode == OP_KETRMIN)
       {
       RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode = prev;
-      flags = match_isgroup;
+      flags |= match_tail_recursed;
       goto TAIL_RECURSE;
       }
     else  /* OP_KETRMAX */
       {
-      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += 1 + LINK_SIZE;
-      flags = 0;
+      flags = match_tail_recursed;
       goto TAIL_RECURSE;
       }
     /* Control never gets here */
@@ -1202,9 +1207,7 @@ for (;;)
     if ((ims & PCRE_MULTILINE) != 0)
       {
       if (eptr != md->start_subject &&
-          (eptr == md->end_subject ||
-           eptr < md->start_subject + md->nllen ||
-           !IS_NEWLINE(eptr - md->nllen)))
+          (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
         RRETURN(MATCH_NOMATCH);
       ecode++;
       break;
@@ -1244,7 +1247,7 @@ for (;;)
       if (!md->endonly)
         {
         if (eptr != md->end_subject &&
-            (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
+            (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
           RRETURN(MATCH_NOMATCH);
         ecode++;
         break;
@@ -1263,7 +1266,7 @@ for (;;)
 
     case OP_EODN:
     if (eptr != md->end_subject &&
-        (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
+        (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
       RRETURN(MATCH_NOMATCH);
     ecode++;
     break;
@@ -1319,8 +1322,7 @@ for (;;)
     case OP_ANY:
     if ((ims & PCRE_DOTALL) == 0)
       {
-      if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
-        RRETURN(MATCH_NOMATCH);
+      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
       }
     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
     if (utf8)
@@ -1414,6 +1416,26 @@ for (;;)
     ecode++;
     break;
 
+    case OP_ANYNL:
+    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+    GETCHARINCTEST(c, eptr);
+    switch(c)
+      {
+      default: RRETURN(MATCH_NOMATCH);
+      case 0x000d:
+      if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+      break;
+      case 0x000a:
+      case 0x000b:
+      case 0x000c:
+      case 0x0085:
+      case 0x2028:
+      case 0x2029:
+      break;
+      }
+    ecode++;
+    break;
+
 #ifdef SUPPORT_UCP
     /* Check the next character by Unicode property. We will get here only
     if the support is in the binary; otherwise a compile-time error occurs. */
@@ -1456,7 +1478,6 @@ for (;;)
 
         default:
         RRETURN(PCRE_ERROR_INTERNAL);
-        break;
         }
 
       ecode += 3;
@@ -1926,7 +1947,7 @@ for (;;)
 
       else
         {
-        int dc;
+        unsigned int dc;
         GETCHARINC(dc, eptr);
         ecode += length;
 
@@ -1953,13 +1974,17 @@ for (;;)
       }
     break;
 
-    /* Match a single character repeatedly; different opcodes share code. */
+    /* Match a single character repeatedly. */
 
     case OP_EXACT:
     min = max = GET2(ecode, 1);
     ecode += 3;
     goto REPEATCHAR;
 
+    case OP_POSUPTO:
+    possessive = TRUE;
+    /* Fall through */
+
     case OP_UPTO:
     case OP_MINUPTO:
     min = 0;
@@ -1968,6 +1993,27 @@ for (;;)
     ecode += 3;
     goto REPEATCHAR;
 
+    case OP_POSSTAR:
+    possessive = TRUE;
+    min = 0;
+    max = INT_MAX;
+    ecode++;
+    goto REPEATCHAR;
+
+    case OP_POSPLUS:
+    possessive = TRUE;
+    min = 1;
+    max = INT_MAX;
+    ecode++;
+    goto REPEATCHAR;
+
+    case OP_POSQUERY:
+    possessive = TRUE;
+    min = 0;
+    max = 1;
+    ecode++;
+    goto REPEATCHAR;
+
     case OP_STAR:
     case OP_MINSTAR:
     case OP_PLUS:
@@ -2003,10 +2049,9 @@ for (;;)
         uschar occhars[8];
 
 #ifdef SUPPORT_UCP
-        int othercase;
+        unsigned int othercase;
         if ((ims & PCRE_CASELESS) != 0 &&
-            (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
-             othercase >= 0)
+            (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
           oclength = _pcre_ord2utf8(othercase, occhars);
 #endif  /* SUPPORT_UCP */
 
@@ -2042,7 +2087,8 @@ for (;;)
             }
           /* Control never gets here */
           }
-        else
+
+        else  /* Maximize */
           {
           pp = eptr;
           for (i = min; i < max; i++)
@@ -2056,6 +2102,8 @@ for (;;)
               eptr += oclength;
               }
             }
+
+          if (possessive) continue;
           while (eptr >= pp)
            {
            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2110,7 +2158,7 @@ for (;;)
           }
         /* Control never gets here */
         }
-      else
+      else  /* Maximize */
         {
         pp = eptr;
         for (i = min; i < max; i++)
@@ -2118,6 +2166,7 @@ for (;;)
           if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
           eptr++;
           }
+        if (possessive) continue;
         while (eptr >= pp)
           {
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2146,7 +2195,7 @@ for (;;)
           }
         /* Control never gets here */
         }
-      else
+      else  /* Maximize */
         {
         pp = eptr;
         for (i = min; i < max; i++)
@@ -2154,6 +2203,7 @@ for (;;)
           if (eptr >= md->end_subject || fc != *eptr) break;
           eptr++;
           }
+        if (possessive) continue;
         while (eptr >= pp)
           {
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2206,6 +2256,34 @@ for (;;)
     ecode += 3;
     goto REPEATNOTCHAR;
 
+    case OP_NOTPOSSTAR:
+    possessive = TRUE;
+    min = 0;
+    max = INT_MAX;
+    ecode++;
+    goto REPEATNOTCHAR;
+
+    case OP_NOTPOSPLUS:
+    possessive = TRUE;
+    min = 1;
+    max = INT_MAX;
+    ecode++;
+    goto REPEATNOTCHAR;
+
+    case OP_NOTPOSQUERY:
+    possessive = TRUE;
+    min = 0;
+    max = 1;
+    ecode++;
+    goto REPEATNOTCHAR;
+
+    case OP_NOTPOSUPTO:
+    possessive = TRUE;
+    min = 0;
+    max = GET2(ecode, 1);
+    ecode += 3;
+    goto REPEATNOTCHAR;
+
     case OP_NOTSTAR:
     case OP_NOTMINSTAR:
     case OP_NOTPLUS:
@@ -2245,7 +2323,7 @@ for (;;)
       /* UTF-8 mode */
       if (utf8)
         {
-        register int d;
+        register unsigned int d;
         for (i = 1; i <= min; i++)
           {
           GETCHARINC(d, eptr);
@@ -2270,7 +2348,7 @@ for (;;)
         /* UTF-8 mode */
         if (utf8)
           {
-          register int d;
+          register unsigned int d;
           for (fi = min;; fi++)
             {
             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2306,7 +2384,7 @@ for (;;)
         /* UTF-8 mode */
         if (utf8)
           {
-          register int d;
+          register unsigned int d;
           for (i = min; i < max; i++)
             {
             int len = 1;
@@ -2316,7 +2394,8 @@ for (;;)
             if (fc == d) break;
             eptr += len;
             }
-          for(;;)
+        if (possessive) continue;
+        for(;;)
             {
             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -2333,6 +2412,7 @@ for (;;)
             if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
             eptr++;
             }
+          if (possessive) continue;
           while (eptr >= pp)
             {
             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2354,7 +2434,7 @@ for (;;)
       /* UTF-8 mode */
       if (utf8)
         {
-        register int d;
+        register unsigned int d;
         for (i = 1; i <= min; i++)
           {
           GETCHARINC(d, eptr);
@@ -2377,7 +2457,7 @@ for (;;)
         /* UTF-8 mode */
         if (utf8)
           {
-          register int d;
+          register unsigned int d;
           for (fi = min;; fi++)
             {
             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2412,7 +2492,7 @@ for (;;)
         /* UTF-8 mode */
         if (utf8)
           {
-          register int d;
+          register unsigned int d;
           for (i = min; i < max; i++)
             {
             int len = 1;
@@ -2421,6 +2501,7 @@ for (;;)
             if (fc == d) break;
             eptr += len;
             }
+          if (possessive) continue;
           for(;;)
             {
             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2438,6 +2519,7 @@ for (;;)
             if (eptr >= md->end_subject || fc == *eptr) break;
             eptr++;
             }
+          if (possessive) continue;
           while (eptr >= pp)
             {
             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -2469,6 +2551,34 @@ for (;;)
     ecode += 3;
     goto REPEATTYPE;
 
+    case OP_TYPEPOSSTAR:
+    possessive = TRUE;
+    min = 0;
+    max = INT_MAX;
+    ecode++;
+    goto REPEATTYPE;
+
+    case OP_TYPEPOSPLUS:
+    possessive = TRUE;
+    min = 1;
+    max = INT_MAX;
+    ecode++;
+    goto REPEATTYPE;
+
+    case OP_TYPEPOSQUERY:
+    possessive = TRUE;
+    min = 0;
+    max = 1;
+    ecode++;
+    goto REPEATTYPE;
+
+    case OP_TYPEPOSUPTO:
+    possessive = TRUE;
+    min = 0;
+    max = GET2(ecode, 1);
+    ecode += 3;
+    goto REPEATTYPE;
+
     case OP_TYPESTAR:
     case OP_TYPEMINSTAR:
     case OP_TYPEPLUS:
@@ -2571,7 +2681,6 @@ for (;;)
 
           default:
           RRETURN(PCRE_ERROR_INTERNAL);
-          break;
           }
         }
 
@@ -2611,9 +2720,7 @@ for (;;)
         for (i = 1; i <= min; i++)
           {
           if (eptr >= md->end_subject ||
-               ((ims & PCRE_DOTALL) == 0 &&
-                 eptr <= md->end_subject - md->nllen &&
-                 IS_NEWLINE(eptr)))
+               ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
             RRETURN(MATCH_NOMATCH);
           eptr++;
           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -2624,6 +2731,28 @@ for (;;)
         eptr += min;
         break;
 
+        case OP_ANYNL:
+        for (i = 1; i <= min; i++)
+          {
+          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+          GETCHARINC(c, eptr);
+          switch(c)
+            {
+            default: RRETURN(MATCH_NOMATCH);
+            case 0x000d:
+            if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+            break;
+            case 0x000a:
+            case 0x000b:
+            case 0x000c:
+            case 0x0085:
+            case 0x2028:
+            case 0x2029:
+            break;
+            }
+          }
+        break;
+
         case OP_NOT_DIGIT:
         for (i = 1; i <= min; i++)
           {
@@ -2692,7 +2821,8 @@ for (;;)
 #endif     /* SUPPORT_UTF8 */
 
       /* Code for the non-UTF-8 case for minimum matching of operators other
-      than OP_PROP and OP_NOTPROP. */
+      than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
+      number of bytes present, as this was tested above. */
 
       switch(ctype)
         {
@@ -2701,8 +2831,7 @@ for (;;)
           {
           for (i = 1; i <= min; i++)
             {
-            if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
-              RRETURN(MATCH_NOMATCH);
+            if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
             eptr++;
             }
           }
@@ -2713,6 +2842,28 @@ for (;;)
         eptr += min;
         break;
 
+        /* Because of the CRLF case, we can't assume the minimum number of
+        bytes are present in this case. */
+
+        case OP_ANYNL:
+        for (i = 1; i <= min; i++)
+          {
+          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+          switch(*eptr++)
+            {
+            default: RRETURN(MATCH_NOMATCH);
+            case 0x000d:
+            if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+            break;
+            case 0x000a:
+            case 0x000b:
+            case 0x000c:
+            case 0x0085:
+            break;
+            }
+          }
+        break;
+
         case OP_NOT_DIGIT:
         for (i = 1; i <= min; i++)
           if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
@@ -2774,7 +2925,7 @@ for (;;)
             GETCHARINC(c, eptr);
             if (prop_fail_result) RRETURN(MATCH_NOMATCH);
             }
-          break;
+          /* Control never gets here */
 
           case PT_LAMP:
           for (fi = min;; fi++)
@@ -2789,7 +2940,7 @@ for (;;)
                  prop_chartype == ucp_Lt) == prop_fail_result)
               RRETURN(MATCH_NOMATCH);
             }
-          break;
+          /* Control never gets here */
 
           case PT_GC:
           for (fi = min;; fi++)
@@ -2802,7 +2953,7 @@ for (;;)
             if ((prop_category == prop_value) == prop_fail_result)
               RRETURN(MATCH_NOMATCH);
             }
-          break;
+          /* Control never gets here */
 
           case PT_PC:
           for (fi = min;; fi++)
@@ -2815,7 +2966,7 @@ for (;;)
             if ((prop_chartype == prop_value) == prop_fail_result)
               RRETURN(MATCH_NOMATCH);
             }
-          break;
+          /* Control never gets here */
 
           case PT_SC:
           for (fi = min;; fi++)
@@ -2828,11 +2979,10 @@ for (;;)
             if ((prop_script == prop_value) == prop_fail_result)
               RRETURN(MATCH_NOMATCH);
             }
-          break;
+          /* Control never gets here */
 
           default:
           RRETURN(PCRE_ERROR_INTERNAL);
-          break;
           }
         }
 
@@ -2876,7 +3026,7 @@ for (;;)
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (fi >= max || eptr >= md->end_subject ||
                (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
-                eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
+                IS_NEWLINE(eptr)))
             RRETURN(MATCH_NOMATCH);
 
           GETCHARINC(c, eptr);
@@ -2888,6 +3038,23 @@ for (;;)
             case OP_ANYBYTE:
             break;
 
+            case OP_ANYNL:
+            switch(c)
+              {
+              default: RRETURN(MATCH_NOMATCH);
+              case 0x000d:
+              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+              break;
+              case 0x000a:
+              case 0x000b:
+              case 0x000c:
+              case 0x0085:
+              case 0x2028:
+              case 0x2029:
+              break;
+              }
+            break;
+
             case OP_NOT_DIGIT:
             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
               RRETURN(MATCH_NOMATCH);
@@ -2932,8 +3099,7 @@ for (;;)
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (fi >= max || eptr >= md->end_subject ||
-               ((ims & PCRE_DOTALL) == 0 &&
-                 eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
+               ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
             RRETURN(MATCH_NOMATCH);
 
           c = *eptr++;
@@ -2945,6 +3111,21 @@ for (;;)
             case OP_ANYBYTE:
             break;
 
+            case OP_ANYNL:
+            switch(c)
+              {
+              default: RRETURN(MATCH_NOMATCH);
+              case 0x000d:
+              if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+              break;
+              case 0x000a:
+              case 0x000b:
+              case 0x000c:
+              case 0x0085:
+              break;
+              }
+            break;
+
             case OP_NOT_DIGIT:
             if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
             break;
@@ -2977,7 +3158,7 @@ for (;;)
       /* Control never gets here */
       }
 
-    /* If maximizing it is worth using inline code for speed, doing the type
+    /* If maximizing, it is worth using inline code for speed, doing the type
     test once at the start (i.e. keep it out of the loop). Again, keep the
     UTF-8 and UCP stuff separate. */
 
@@ -3058,6 +3239,7 @@ for (;;)
 
         /* eptr is now past the end of the maximum run */
 
+        if (possessive) continue;
         for(;;)
           {
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -3093,6 +3275,7 @@ for (;;)
 
         /* eptr is now past the end of the maximum run */
 
+        if (possessive) continue;
         for(;;)
           {
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -3135,9 +3318,7 @@ for (;;)
               {
               for (i = min; i < max; i++)
                 {
-                if (eptr >= md->end_subject ||
-                    (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
-                  break;
+                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
                 eptr++;
                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
                 }
@@ -3161,9 +3342,7 @@ for (;;)
               {
               for (i = min; i < max; i++)
                 {
-                if (eptr >= md->end_subject ||
-                    (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
-                  break;
+                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
                 eptr++;
                 }
               break;
@@ -3171,7 +3350,8 @@ for (;;)
             else
               {
               c = max - min;
-              if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+              if (c > (unsigned int)(md->end_subject - eptr))
+                c = md->end_subject - eptr;
               eptr += c;
               }
             }
@@ -3181,10 +3361,32 @@ for (;;)
 
           case OP_ANYBYTE:
           c = max - min;
-          if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+          if (c > (unsigned int)(md->end_subject - eptr))
+            c = md->end_subject - eptr;
           eptr += c;
           break;
 
+          case OP_ANYNL:
+          for (i = min; i < max; i++)
+            {
+            int len = 1;
+            if (eptr >= md->end_subject) break;
+            GETCHARLEN(c, eptr, len);
+            if (c == 0x000d)
+              {
+              if (++eptr >= md->end_subject) break;
+              if (*eptr == 0x000a) eptr++;
+              }
+            else
+              {
+              if (c != 0x000a && c != 0x000b && c != 0x000c &&
+                  c != 0x0085 && c != 0x2028 && c != 0x2029)
+                break;
+              eptr += len;
+              }
+            }
+          break;
+
           case OP_NOT_DIGIT:
           for (i = min; i < max; i++)
             {
@@ -3257,6 +3459,7 @@ for (;;)
 
         /* eptr is now past the end of the maximum run */
 
+        if (possessive) continue;
         for(;;)
           {
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -3277,9 +3480,7 @@ for (;;)
             {
             for (i = min; i < max; i++)
               {
-              if (eptr >= md->end_subject ||
-                  (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
-                break;
+              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
               eptr++;
               }
             break;
@@ -3288,10 +3489,30 @@ for (;;)
 
           case OP_ANYBYTE:
           c = max - min;
-          if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+          if (c > (unsigned int)(md->end_subject - eptr))
+            c = md->end_subject - eptr;
           eptr += c;
           break;
 
+          case OP_ANYNL:
+          for (i = min; i < max; i++)
+            {
+            if (eptr >= md->end_subject) break;
+            c = *eptr;
+            if (c == 0x000d)
+              {
+              if (++eptr >= md->end_subject) break;
+              if (*eptr == 0x000a) eptr++;
+              }
+            else
+              {
+              if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
+                break;
+              eptr++;
+              }
+            }
+          break;
+
           case OP_NOT_DIGIT:
           for (i = min; i < max; i++)
             {
@@ -3352,6 +3573,7 @@ for (;;)
 
         /* eptr is now past the end of the maximum run */
 
+        if (possessive) continue;
         while (eptr >= pp)
           {
           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
@@ -3366,14 +3588,12 @@ for (;;)
       }
     /* Control never gets here */
 
-    /* There's been some horrible disaster. Since all codes > OP_BRA are
-    for capturing brackets, and there shouldn't be any gaps between 0 and
-    OP_BRA, arrival here can only mean there is something seriously wrong
-    in the code above or the OP_xxx definitions. */
+    /* There's been some horrible disaster. Arrival here can only mean there is
+    something seriously wrong in the code above or the OP_xxx definitions. */
 
     default:
     DPRINTF(("Unknown opcode %d\n", *ecode));
-    RRETURN(PCRE_ERROR_UNKNOWN_NODE);
+    RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
     }
 
   /* Do not stick any code in here without much thought; it is assumed
@@ -3411,7 +3631,6 @@ Undefine all the macros that were defined above to handle this. */
 
 #undef cur_is_word
 #undef condition
-#undef minimize
 #undef prev_is_word
 
 #undef original_ims
@@ -3484,6 +3703,7 @@ BOOL startline;
 BOOL firstline;
 BOOL first_byte_caseless = FALSE;
 BOOL req_byte_caseless = FALSE;
+BOOL utf8;
 match_data match_block;
 match_data *md = &match_block;
 const uschar *tables;
@@ -3491,6 +3711,7 @@ const uschar *start_bits = NULL;
 USPTR start_match = (USPTR)subject + start_offset;
 USPTR end_subject;
 USPTR req_byte_ptr = start_match - 1;
+eptrblock eptrchain[EPTR_WORK_SIZE];
 
 pcre_study_data internal_study;
 const pcre_study_data *study;
@@ -3567,7 +3788,7 @@ md->end_subject = md->start_subject + length;
 end_subject = md->end_subject;
 
 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
-md->utf8 = (re->options & PCRE_UTF8) != 0;
+utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
 
 md->notbol = (options & PCRE_NOTBOL) != 0;
 md->noteol = (options & PCRE_NOTEOL) != 0;
@@ -3576,6 +3797,7 @@ md->partial = (options & PCRE_PARTIAL) != 0;
 md->hitend = FALSE;
 
 md->recursive = NULL;                   /* No recursion at top level */
+md->eptrchain = eptrchain;              /* Make workspace generally available */
 
 md->lcc = tables + lcc_offset;
 md->ctypes = tables + ctypes_offset;
@@ -3583,26 +3805,36 @@ md->ctypes = tables + ctypes_offset;
 /* Handle different types of newline. The two bits give four cases. If nothing
 is set at run time, whatever was used at compile time applies. */
 
-switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
-         PCRE_NEWLINE_CRLF)
+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
+       PCRE_NEWLINE_BITS)
   {
-  default:              newline = NEWLINE; break;   /* Compile-time default */
+  case 0: newline = NEWLINE; break;   /* Compile-time default */
   case PCRE_NEWLINE_CR: newline = '\r'; break;
   case PCRE_NEWLINE_LF: newline = '\n'; break;
   case PCRE_NEWLINE_CR+
        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+  case PCRE_NEWLINE_ANY: newline = -1; break;
+  default: return PCRE_ERROR_BADNEWLINE;
   }
 
-if (newline > 255)
+if (newline < 0)
   {
-  md->nllen = 2;
-  md->nl[0] = (newline >> 8) & 255;
-  md->nl[1] = newline & 255;
+  md->nltype = NLTYPE_ANY;
   }
 else
   {
-  md->nllen = 1;
-  md->nl[0] = newline;
+  md->nltype = NLTYPE_FIXED;
+  if (newline > 255)
+    {
+    md->nllen = 2;
+    md->nl[0] = (newline >> 8) & 255;
+    md->nl[1] = newline & 255;
+    }
+  else
+    {
+    md->nllen = 1;
+    md->nl[0] = newline;
+    }
   }
 
 /* Partial matching is supported only for a restricted set of regexes at the
@@ -3615,7 +3847,7 @@ if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
 back the character offset. */
 
 #ifdef SUPPORT_UTF8
-if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
     return PCRE_ERROR_BADUTF8;
@@ -3707,10 +3939,13 @@ if ((re->options & PCRE_REQCHSET) != 0)
   req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
   }
 
+
+/* ==========================================================================*/
+
 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
 the loop runs just once. */
 
-do
+for(;;)
   {
   USPTR save_end_subject = end_subject;
 
@@ -3725,14 +3960,14 @@ do
 
   /* Advance to a unique first char if possible. If firstline is TRUE, the
   start of the match is constrained to the first line of a multiline string.
-  Implement this by temporarily adjusting end_subject so that we stop scanning
-  at a newline. If the match fails at the newline, later code breaks this loop.
-  */
+  That is, the match must be before or at the first newline. Implement this by
+  temporarily adjusting end_subject so that we stop scanning at a newline. If
+  the match fails at the newline, later code breaks this loop. */
 
   if (firstline)
     {
     USPTR t = start_match;
-    while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
+    while (t < md->end_subject && !IS_NEWLINE(t)) t++;
     end_subject = t;
     }
 
@@ -3753,11 +3988,9 @@ do
 
   else if (startline)
     {
-    if (start_match >= md->start_subject + md->nllen +
-          start_offset)
+    if (start_match > md->start_subject + start_offset)
       {
-      while (start_match <= end_subject &&
-             !IS_NEWLINE(start_match - md->nllen))
+      while (start_match <= end_subject && !WAS_NEWLINE(start_match))
         start_match++;
       }
     }
@@ -3793,8 +4026,8 @@ do
 
   HOWEVER: when the subject string is very, very long, searching to its end can
   take a long time, and give bad performance on quite ordinary patterns. This
-  showed up when somebody was matching /^C/ on a 32-megabyte string... so we
-  don't do this when the string is sufficiently long.
+  showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
+  string... so we don't do this when the string is sufficiently long.
 
   ALSO: this processing is disabled when partial matching is requested.
   */
@@ -3826,9 +4059,14 @@ do
           }
         }
 
-      /* If we can't find the required character, break the matching loop */
+      /* If we can't find the required character, break the matching loop,
+      forcing a match failure. */
 
-      if (p >= end_subject) break;
+      if (p >= end_subject)
+        {
+        rc = MATCH_NOMATCH;
+        break;
+        }
 
       /* If we have found the required character, save the point where we
       found it, so that we don't search again next time round the loop if
@@ -3838,49 +4076,70 @@ do
       }
     }
 
-  /* When a match occurs, substrings will be set for all internal extractions;
-  we just need to set up the whole thing as substring 0 before returning. If
-  there were too many extractions, set the return code to zero. In the case
-  where we had to get some local store to hold offsets for backreferences, copy
-  those back references that we can. In this case there need not be overflow
-  if certain parts of the pattern were not used. */
+  /* OK, we can now run the match. */
 
   md->start_match = start_match;
   md->match_call_count = 0;
+  md->eptrn = 0;                          /* Next free eptrchain slot */
+  rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
 
-  rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
+  /* Any return other than MATCH_NOMATCH breaks the loop. */
 
-  /* When the result is no match, if the subject's first character was a
-  newline and the PCRE_FIRSTLINE option is set, break (which will return
-  PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
-  newline in the subject. Otherwise, advance the pointer to the next character
-  and continue - but the continuation will actually happen only when the
-  pattern is not anchored. */
+  if (rc != MATCH_NOMATCH) break;
 
-  if (rc == MATCH_NOMATCH)
-    {
-    if (firstline &&
-        start_match <= md->end_subject - md->nllen &&
-        IS_NEWLINE(start_match))
-      break;
-    start_match++;
+  /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
+  newline in the subject (though it may continue over the newline). Therefore,
+  if we have just failed to match, starting at a newline, do not continue. */
+
+  if (firstline && IS_NEWLINE(start_match)) break;
+
+  /* Advance the match position by one character. */
+
+  start_match++;
 #ifdef SUPPORT_UTF8
-    if (md->utf8)
-      while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-        start_match++;
+  if (utf8)
+    while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+      start_match++;
 #endif
-    continue;
-    }
 
-  if (rc != MATCH_MATCH)
-    {
-    DPRINTF((">>>> error: returning %d\n", rc));
-    return rc;
-    }
+  /* Break the loop if the pattern is anchored or if we have passed the end of
+  the subject. */
+
+  if (anchored || start_match > end_subject) break;
+
+  /* If we have just passed a CR and the newline option is CRLF or ANY, and we
+  are now at a LF, advance the match position by one more character. */
+
+  if (start_match[-1] == '\r' &&
+       (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
+       start_match < end_subject &&
+       *start_match == '\n')
+    start_match++;
+
+  }   /* End of for(;;) "bumpalong" loop */
+
+/* ==========================================================================*/
+
+/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
+conditions is true:
 
-  /* We have a match! Copy the offset information from temporary store if
-  necessary */
+(1) The pattern is anchored;
 
+(2) We are past the end of the subject;
+
+(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
+    this option requests that a match occur at or before the first newline in
+    the subject.
+
+When we have a match and the offset vector is big enough to deal with any
+backreferences, captured substring offsets will already be set up. In the case
+where we had to get some local store to hold offsets for backreference
+processing, copy those that we can. In this case there need not be overflow if
+certain parts of the pattern were not used, even though there are more
+capturing parentheses than vector slots. */
+
+if (rc == MATCH_MATCH)
+  {
   if (using_temporary_offsets)
     {
     if (offsetcount >= 4)
@@ -3889,15 +4148,18 @@ do
         (offsetcount - 2) * sizeof(int));
       DPRINTF(("Copied offsets from temporary memory\n"));
       }
-    if (md->end_offset_top > offsetcount)
-      md->offset_overflow = TRUE;
-
+    if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
     DPRINTF(("Freeing temporary memory\n"));
     (pcre_free)(md->offset_vector);
     }
 
+  /* Set the return code to the number of captured strings, or 0 if there are
+  too many to fit into the vector. */
+
   rc = md->offset_overflow? 0 : md->end_offset_top/2;
 
+  /* If there is space, set up the whole thing as substring 0. */
+
   if (offsetcount < 2) rc = 0; else
     {
     offsets[0] = start_match - md->start_subject;
@@ -3908,9 +4170,8 @@ do
   return rc;
   }
 
-/* This "while" is the end of the "do" above */
-
-while (!anchored && start_match <= end_subject);
+/* Control gets here if there has been an error, or if the overall match
+attempt has failed at all permitted starting positions. */
 
 if (using_temporary_offsets)
   {
@@ -3918,7 +4179,12 @@ if (using_temporary_offsets)
   (pcre_free)(md->offset_vector);
   }
 
-if (md->partial && md->hitend)
+if (rc != MATCH_NOMATCH)
+  {
+  DPRINTF((">>>> error: returning %d\n", rc));
+  return rc;
+  }
+else if (md->partial && md->hitend)
   {
   DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
   return PCRE_ERROR_PARTIAL;
author	Nuno Lopes <nlopess@php.net>	2007-02-09 19:48:47 +0000
committer	Nuno Lopes <nlopess@php.net>	2007-02-09 19:48:47 +0000
commit	b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe (patch)
tree	7d627e2f5988d55ae5dd3b76171b94fc9ab0bc7d /ext/pcre/pcrelib/pcre_exec.c
parent	e6d69595afed237cdfe561c9f052efb41f41c622 (diff)
download	php-git-b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe.tar.gz