diff options
author | Nuno Lopes <nlopess@php.net> | 2007-02-09 19:48:47 +0000 |
---|---|---|
committer | Nuno Lopes <nlopess@php.net> | 2007-02-09 19:48:47 +0000 |
commit | b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe (patch) | |
tree | 7d627e2f5988d55ae5dd3b76171b94fc9ab0bc7d /ext/pcre/pcrelib/pcre_exec.c | |
parent | e6d69595afed237cdfe561c9f052efb41f41c622 (diff) | |
download | php-git-b3e66c616dcc1f5d9988d3e485dcd00bbba6fabe.tar.gz |
upgrade pcre to version 7.0
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_exec.c | 894 |
1 files changed, 580 insertions, 314 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c index a67855bfef..890e0f731a 100644 --- a/ext/pcre/pcrelib/pcre_exec.c +++ b/ext/pcre/pcrelib/pcre_exec.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,25 +42,22 @@ POSSIBILITY OF SUCH DAMAGE. pattern matching using an NFA algorithm, trying to mimic Perl as closely as possible. There are also some static supporting functions. */ -#define NLBLOCK md /* The block containing newline information */ -#include "pcre_internal.h" +#define NLBLOCK md /* Block containing newline information */ +#define PSSTART start_subject /* Field containing processed string start */ +#define PSEND end_subject /* Field containing processed string end */ +#include "pcre_internal.h" -/* Structure for building a chain of data that actually lives on the -stack, for holding the values of the subject pointer at the start of each -subpattern, so as to detect when an empty string has been matched by a -subpattern - to break infinite loops. When NO_RECURSE is set, these blocks -are on the heap, not on the stack. */ +/* The chain of eptrblocks for tail recursions uses memory in stack workspace, +obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */ -typedef struct eptrblock { - struct eptrblock *epb_prev; - USPTR epb_saved_eptr; -} eptrblock; +#define EPTR_WORK_SIZE (1000) /* Flag bits for the match() function */ -#define match_condassert 0x01 /* Called to check a condition assertion */ -#define match_isgroup 0x02 /* Set if start of bracketed group */ +#define match_condassert 0x01 /* Called to check a condition assertion */ +#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ +#define match_tail_recursed 0x04 /* Tail recursive call */ /* Non-error returns from the match() function. Error returns are externally defined PCRE_ERROR_xxx codes, which are all negative. */ @@ -101,7 +98,7 @@ Returns: nothing static void pchars(const uschar *p, int length, BOOL is_subject, match_data *md) { -int c; +unsigned int c; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; while (length-- > 0) if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); @@ -291,7 +288,6 @@ typedef struct heapframe { BOOL Xcur_is_word; BOOL Xcondition; - BOOL Xminimize; BOOL Xprev_is_word; unsigned long int Xoriginal_ims; @@ -303,11 +299,10 @@ typedef struct heapframe { int Xprop_category; int Xprop_chartype; int Xprop_script; - int *Xprop_test_variable; #endif int Xctype; - int Xfc; + unsigned int Xfc; int Xfi; int Xlength; int Xmax; @@ -340,10 +335,7 @@ typedef struct heapframe { * Match from current position * *************************************************/ -/* On entry ecode points to the first opcode, and eptr to the first character -in the subject string, while eptrb holds the value of eptr at the start of the -last bracketed group - used for breaking infinite loops matching zero-length -strings. This function is called recursively in many circumstances. Whenever it +/* This function is called recursively in many circumstances. Whenever it returns a negative (error) response, the outer incarnation must also return the same response. @@ -353,8 +345,8 @@ performance. Tests using gcc on a SPARC disproved this; in the first case, it made performance worse. Arguments: - eptr pointer in subject - ecode position in code + eptr pointer to current character in subject + ecode pointer to current position in compiled code offset_top current top pointer md pointer to "static" info for the match ims current /i, /m, and /s options @@ -362,7 +354,9 @@ Arguments: brackets - for testing for empty matches flags can contain match_condassert - this is an assertion condition - match_isgroup - this is the start of a bracketed group + match_cbegroup - this is the start of an unlimited repeat + group that can match an empty string + match_tail_recursed - this is a tail_recursed group rdepth the recursion depth Returns: MATCH_MATCH if matched ) these values are >= 0 @@ -377,14 +371,16 @@ match(REGISTER USPTR eptr, REGISTER const uschar *ecode, int flags, unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, -so they can be ordinary variables in all cases. Mark them with "register" -because they are used a lot in loops. */ +so they can be ordinary variables in all cases. Mark some of them with +"register" because they are used a lot in loops. */ register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ -register unsigned int c; /* Character values not kept over RMATCH() calls */ +register unsigned int c; /* Character values not kept over RMATCH() calls */ register BOOL utf8; /* Local copy of UTF-8 flag for speed */ +BOOL minimize, possessive; /* Quantifier options */ + /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from heap storage. Set up the top-level frame here; others are obtained from the @@ -434,7 +430,6 @@ HEAP_RECURSE: #define cur_is_word frame->Xcur_is_word #define condition frame->Xcondition -#define minimize frame->Xminimize #define prev_is_word frame->Xprev_is_word #define original_ims frame->Xoriginal_ims @@ -446,7 +441,6 @@ HEAP_RECURSE: #define prop_category frame->Xprop_category #define prop_chartype frame->Xprop_chartype #define prop_script frame->Xprop_script -#define prop_test_variable frame->Xprop_test_variable #endif #define ctype frame->Xctype @@ -470,7 +464,7 @@ HEAP_RECURSE: get preserved during recursion in the normal way. In this environment, fi and i, and fc and c, can be the same variables. */ -#else +#else /* NO_RECURSE not defined */ #define fi i #define fc c @@ -489,7 +483,6 @@ recursion_info new_recursive; /* within blocks below are for variables */ /* that do not have to be preserved over */ BOOL cur_is_word; /* a recursive call to RMATCH(). */ BOOL condition; -BOOL minimize; BOOL prev_is_word; unsigned long int original_ims; @@ -501,7 +494,6 @@ int prop_fail_result; int prop_category; int prop_chartype; int prop_script; -int *prop_test_variable; #endif int ctype; @@ -516,7 +508,7 @@ int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; eptrblock newptrb; -#endif +#endif /* NO_RECURSE */ /* These statements are here to stop the compiler complaining about unitialized variables. */ @@ -524,9 +516,9 @@ variables. */ #ifdef SUPPORT_UCP prop_value = 0; prop_fail_result = 0; -prop_test_variable = NULL; #endif + /* This label is used for tail recursion, which is used in a few cases even when NO_RECURSE is not defined, in order to reduce the amount of stack that is used. Thanks to Ian Taylor for noticing this possibility and sending the @@ -556,24 +548,34 @@ utf8 = md->utf8; /* Local copy of the flag */ utf8 = FALSE; #endif -/* At the start of a bracketed group, add the current subject pointer to the -stack of such pointers, to be re-instated at the end of the group when we hit -the closing ket. When match() is called in other circumstances, we don't add to -this stack. */ +/* At the start of a group with an unlimited repeat that may match an empty +string, the match_cbegroup flag is set. When this is the case, add the current +subject pointer to the chain of such remembered pointers, to be checked when we +hit the closing ket, in order to break infinite loops that match no characters. +When match() is called in other circumstances, don't add to the chain. If this +is a tail recursion, use a block from the workspace, as the one on the stack is +already used. */ -if ((flags & match_isgroup) != 0) +if ((flags & match_cbegroup) != 0) { - newptrb.epb_prev = eptrb; - newptrb.epb_saved_eptr = eptr; - eptrb = &newptrb; + eptrblock *p; + if ((flags & match_tail_recursed) != 0) + { + if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT); + p = md->eptrchain + md->eptrn++; + } + else p = &newptrb; + p->epb_saved_eptr = eptr; + p->epb_prev = eptrb; + eptrb = p; } -/* Now start processing the operations. */ +/* Now start processing the opcodes. */ for (;;) { + minimize = possessive = FALSE; op = *ecode; - minimize = FALSE; /* For partial matching, remember if we ever hit the end of the subject after matching at least one subject character. */ @@ -583,33 +585,30 @@ for (;;) eptr > md->start_match) md->hitend = TRUE; - /* Opening capturing bracket. If there is space in the offset vector, save - the current subject position in the working slot at the top of the vector. We - mustn't change the current values of the data slot, because they may be set - from a previous iteration of this group, and be referred to by a reference - inside the group. - - If the bracket fails to match, we need to restore this value and also the - values of the final offsets, in case they were set by a previous iteration of - the same bracket. - - If there isn't enough space in the offset vector, treat this as if it were a - non-capturing bracket. Don't worry about setting the flag for the error case - here; that is handled in the code for KET. */ - - if (op > OP_BRA) + switch(op) { - number = op - OP_BRA; - - /* For extended extraction brackets (large number), we have to fish out the - number from a dummy opcode at the start. */ - - if (number > EXTRACT_BASIC_MAX) - number = GET2(ecode, 2+LINK_SIZE); + /* Handle a capturing bracket. If there is space in the offset vector, save + the current subject position in the working slot at the top of the vector. + We mustn't change the current values of the data slot, because they may be + set from a previous iteration of this group, and be referred to by a + reference inside the group. + + If the bracket fails to match, we need to restore this value and also the + values of the final offsets, in case they were set by a previous iteration + of the same bracket. + + If there isn't enough space in the offset vector, treat this as if it were + a non-capturing bracket. Don't worry about setting the flag for the error + case here; that is handled in the code for KET. */ + + case OP_CBRA: + case OP_SCBRA: + number = GET2(ecode, 1+LINK_SIZE); offset = number << 1; #ifdef DEBUG - printf("start bracket %d subject=", number); + printf("start bracket %d\n", number); + printf("subject="); pchars(eptr, 16, TRUE, md); printf("\n"); #endif @@ -624,10 +623,11 @@ for (;;) DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); md->offset_vector[md->offset_end - number] = eptr - md->start_subject; + flags = (op == OP_SCBRA)? match_cbegroup : 0; do { - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, - match_isgroup); + RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags); if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->capture_last = save_capture_last; ecode += GET(ecode, 1); @@ -643,39 +643,35 @@ for (;;) RRETURN(MATCH_NOMATCH); } - /* Insufficient room for saving captured contents */ + /* Insufficient room for saving captured contents. Treat as a non-capturing + bracket. */ - else op = OP_BRA; - } + DPRINTF(("insufficient capture room: treat as non-capturing\n")); - /* Other types of node can be handled by a switch */ - - switch(op) - { - case OP_BRA: /* Non-capturing bracket: optimized */ - DPRINTF(("start bracket 0\n")); - - /* Loop for all the alternatives */ + /* Non-capturing bracket. Loop for all the alternatives. When we get to the + final alternative within the brackets, we would return the result of a + recursive call to match() whatever happened. We can reduce stack usage by + turning this into a tail recursion. */ + case OP_BRA: + case OP_SBRA: + DPRINTF(("start non-capturing bracket\n")); + flags = (op >= OP_SBRA)? match_cbegroup : 0; for (;;) { - /* When we get to the final alternative within the brackets, we would - return the result of a recursive call to match() whatever happened. We - can reduce stack usage by turning this into a tail recursion. */ - if (ecode[GET(ecode, 1)] != OP_ALT) - { - ecode += 1 + LINK_SIZE; - flags = match_isgroup; - DPRINTF(("bracket 0 tail recursion\n")); - goto TAIL_RECURSE; - } + { + ecode += _pcre_OP_lengths[*ecode]; + flags |= match_tail_recursed; + DPRINTF(("bracket 0 tail recursion\n")); + goto TAIL_RECURSE; + } /* For non-final alternatives, continue the loop for a NOMATCH result; otherwise return. */ - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, - match_isgroup); + RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, + eptrb, flags); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); } @@ -688,54 +684,72 @@ for (;;) obeyed, we can use tail recursion to avoid using another stack frame. */ case OP_COND: - if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ + case OP_SCOND: + if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ + { + offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + condition = md->recursive != NULL && + (offset == RREF_ANY || offset == md->recursive->group_num); + ecode += condition? 3 : GET(ecode, 1); + } + + else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ { offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ - condition = (offset == CREF_RECURSE * 2)? - (md->recursive != NULL) : - (offset < offset_top && md->offset_vector[offset] >= 0); - ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1)); - flags = match_isgroup; - goto TAIL_RECURSE; + condition = offset < offset_top && md->offset_vector[offset] >= 0; + ecode += condition? 3 : GET(ecode, 1); + } + + else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ + { + condition = FALSE; + ecode += GET(ecode, 1); } /* The condition is an assertion. Call match() to evaluate it - setting - the final argument TRUE causes it to stop at the end of an assertion. */ + the final argument match_condassert causes it to stop at the end of an + assertion. */ else { RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, - match_condassert | match_isgroup); + match_condassert); if (rrc == MATCH_MATCH) { - ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); + condition = TRUE; + ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); while (*ecode == OP_ALT) ecode += GET(ecode, 1); } else if (rrc != MATCH_NOMATCH) { RRETURN(rrc); /* Need braces because of following else */ } - else ecode += GET(ecode, 1); + else + { + condition = FALSE; + ecode += GET(ecode, 1); + } + } - /* We are now at the branch that is to be obeyed. As there is only one, - we can use tail recursion to avoid using another stack frame. */ + /* We are now at the branch that is to be obeyed. As there is only one, + we can use tail recursion to avoid using another stack frame. If the second + alternative doesn't exist, we can just plough on. */ + if (condition || *ecode == OP_ALT) + { ecode += 1 + LINK_SIZE; - flags = match_isgroup; + flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0); goto TAIL_RECURSE; } - /* Control never reaches here */ - - /* Skip over conditional reference or large extraction number data if - encountered. */ - - case OP_CREF: - case OP_BRANUMBER: - ecode += 3; + else + { + ecode += 1 + LINK_SIZE; + } break; - /* End of the pattern. If we are in a recursion, we should restore the - offsets appropriately and continue from after the call. */ + + /* End of the pattern. If we are in a top-level recursion, we should + restore the offsets appropriately and continue from after the call. */ case OP_END: if (md->recursive != NULL && md->recursive->group_num == 0) @@ -777,8 +791,7 @@ for (;;) case OP_ASSERTBACK: do { - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, - match_isgroup); + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); if (rrc == MATCH_MATCH) break; if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); @@ -804,8 +817,7 @@ for (;;) case OP_ASSERTBACK_NOT: do { - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, - match_isgroup); + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0); if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode,1); @@ -826,8 +838,8 @@ for (;;) #ifdef SUPPORT_UTF8 if (utf8) { - c = GET(ecode,1); - for (i = 0; i < c; i++) + i = GET(ecode, 1); + while (i-- > 0) { eptr--; if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); @@ -840,7 +852,7 @@ for (;;) /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ { - eptr -= GET(ecode,1); + eptr -= GET(ecode, 1); if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); } @@ -897,13 +909,8 @@ for (;;) case OP_RECURSE: { callpat = md->start_code + GET(ecode, 1); - new_recursive.group_num = *callpat - OP_BRA; - - /* For extended extraction brackets (large number), we have to fish out - the number from a dummy opcode at the start. */ - - if (new_recursive.group_num > EXTRACT_BASIC_MAX) - new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); + new_recursive.group_num = (callpat == md->start_code)? 0 : + GET2(callpat, 1 + LINK_SIZE); /* Add to "recursing stack" */ @@ -936,10 +943,11 @@ for (;;) restore the offset and recursion data. */ DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); + flags = (*callpat >= OP_SBRA)? match_cbegroup : 0; do { - RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, - eptrb, match_isgroup); + RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, + md, ims, eptrb, flags); if (rrc == MATCH_MATCH) { DPRINTF(("Recursion matched\n")); @@ -983,7 +991,7 @@ for (;;) do { RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, - eptrb, match_isgroup); + eptrb, 0); if (rrc == MATCH_MATCH) break; if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode,1); @@ -997,7 +1005,7 @@ for (;;) /* Continue as from after the assertion, updating the offsets high water mark, since extracts may have been taken. */ - do ecode += GET(ecode,1); while (*ecode == OP_ALT); + do ecode += GET(ecode, 1); while (*ecode == OP_ALT); offset_top = md->end_offset_top; eptr = md->end_match_ptr; @@ -1031,15 +1039,15 @@ for (;;) RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode = prev; - flags = match_isgroup; + flags = match_tail_recursed; goto TAIL_RECURSE; } else /* OP_KETRMAX */ { - RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); + RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; - flags = 0; + flags = match_tail_recursed; goto TAIL_RECURSE; } /* Control never gets here */ @@ -1060,38 +1068,44 @@ for (;;) case OP_BRAZERO: { next = ecode+1; - RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); + RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); do next += GET(next,1); while (*next == OP_ALT); - ecode = next + 1+LINK_SIZE; + ecode = next + 1 + LINK_SIZE; } break; case OP_BRAMINZERO: { next = ecode+1; - do next += GET(next,1); while (*next == OP_ALT); - RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, - match_isgroup); + do next += GET(next, 1); while (*next == OP_ALT); + RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode++; } break; - /* End of a group, repeated or non-repeating. If we are at the end of - an assertion "group", stop matching and return MATCH_MATCH, but record the - current high water mark for use by positive assertions. Do this also - for the "once" (not-backup up) groups. */ + /* End of a group, repeated or non-repeating. */ case OP_KET: case OP_KETRMIN: case OP_KETRMAX: prev = ecode - GET(ecode, 1); - saved_eptr = eptrb->epb_saved_eptr; - /* Back up the stack of bracket start pointers. */ + /* If this was a group that remembered the subject start, in order to break + infinite repeats of empty string matches, retrieve the subject start from + the chain. Otherwise, set it NULL. */ + + if (*prev >= OP_SBRA) + { + saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ + eptrb = eptrb->epb_prev; /* Backup to previous group */ + } + else saved_eptr = NULL; - eptrb = eptrb->epb_prev; + /* If we are at the end of an assertion group, stop matching and return + MATCH_MATCH, but record the current high water mark for use by positive + assertions. Do this also for the "once" (atomic) groups. */ if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || @@ -1102,18 +1116,15 @@ for (;;) RRETURN(MATCH_MATCH); } - /* In all other cases except a conditional group we have to check the - group number back at the start and if necessary complete handling an - extraction by setting the offsets and bumping the high water mark. */ + /* For capturing groups we have to check the group number back at the start + and if necessary complete handling an extraction by setting the offsets and + bumping the high water mark. Note that whole-pattern recursion is coded as + a recurse into group 0, so it won't be picked up here. Instead, we catch it + when the OP_END is reached. Other recursion is handled here. */ - if (*prev != OP_COND) + if (*prev == OP_CBRA || *prev == OP_SCBRA) { - number = *prev - OP_BRA; - - /* For extended extraction brackets (large number), we have to fish out - the number from a dummy opcode at the start. */ - - if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); + number = GET2(prev, 1+LINK_SIZE); offset = number << 1; #ifdef DEBUG @@ -1121,42 +1132,34 @@ for (;;) printf("\n"); #endif - /* Test for a numbered group. This includes groups called as a result - of recursion. Note that whole-pattern recursion is coded as a recurse - into group 0, so it won't be picked up here. Instead, we catch it when - the OP_END is reached. */ - - if (number > 0) + md->capture_last = number; + if (offset >= md->offset_max) md->offset_overflow = TRUE; else { - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else - { - md->offset_vector[offset] = - md->offset_vector[md->offset_end - number]; - md->offset_vector[offset+1] = eptr - md->start_subject; - if (offset_top <= offset) offset_top = offset + 2; - } + md->offset_vector[offset] = + md->offset_vector[md->offset_end - number]; + md->offset_vector[offset+1] = eptr - md->start_subject; + if (offset_top <= offset) offset_top = offset + 2; + } - /* Handle a recursively called group. Restore the offsets - appropriately and continue from after the call. */ + /* Handle a recursively called group. Restore the offsets + appropriately and continue from after the call. */ - if (md->recursive != NULL && md->recursive->group_num == number) - { - recursion_info *rec = md->recursive; - DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); - md->recursive = rec->prevrec; - md->start_match = rec->save_start; - memcpy(md->offset_vector, rec->offset_save, - rec->saved_max * sizeof(int)); - ecode = rec->after_call; - ims = original_ims; - break; - } + if (md->recursive != NULL && md->recursive->group_num == number) + { + recursion_info *rec = md->recursive; + DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); + md->recursive = rec->prevrec; + md->start_match = rec->save_start; + memcpy(md->offset_vector, rec->offset_save, + rec->saved_max * sizeof(int)); + ecode = rec->after_call; + ims = original_ims; + break; } } - /* Reset the value of the ims flags, in case they got changed during - the group. */ + /* For both capturing and non-capturing groups, reset the value of the ims + flags, in case they got changed during the group. */ ims = original_ims; DPRINTF(("ims reset to %02lx\n", ims)); @@ -1177,20 +1180,22 @@ for (;;) preceding bracket, in the appropriate order. In the second case, we can use tail recursion to avoid using another stack frame. */ + flags = (*prev >= OP_SBRA)? match_cbegroup : 0; + if (*ecode == OP_KETRMIN) { RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode = prev; - flags = match_isgroup; + flags |= match_tail_recursed; goto TAIL_RECURSE; } else /* OP_KETRMAX */ { - RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); + RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; - flags = 0; + flags = match_tail_recursed; goto TAIL_RECURSE; } /* Control never gets here */ @@ -1202,9 +1207,7 @@ for (;;) if ((ims & PCRE_MULTILINE) != 0) { if (eptr != md->start_subject && - (eptr == md->end_subject || - eptr < md->start_subject + md->nllen || - !IS_NEWLINE(eptr - md->nllen))) + (eptr == md->end_subject || !WAS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); ecode++; break; @@ -1244,7 +1247,7 @@ for (;;) if (!md->endonly) { if (eptr != md->end_subject && - (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) + (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) RRETURN(MATCH_NOMATCH); ecode++; break; @@ -1263,7 +1266,7 @@ for (;;) case OP_EODN: if (eptr != md->end_subject && - (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) + (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) RRETURN(MATCH_NOMATCH); ecode++; break; @@ -1319,8 +1322,7 @@ for (;;) case OP_ANY: if ((ims & PCRE_DOTALL) == 0) { - if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) - RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); if (utf8) @@ -1414,6 +1416,26 @@ for (;;) ecode++; break; + case OP_ANYNL: + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + break; + } + ecode++; + break; + #ifdef SUPPORT_UCP /* Check the next character by Unicode property. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ @@ -1456,7 +1478,6 @@ for (;;) default: RRETURN(PCRE_ERROR_INTERNAL); - break; } ecode += 3; @@ -1926,7 +1947,7 @@ for (;;) else { - int dc; + unsigned int dc; GETCHARINC(dc, eptr); ecode += length; @@ -1953,13 +1974,17 @@ for (;;) } break; - /* Match a single character repeatedly; different opcodes share code. */ + /* Match a single character repeatedly. */ case OP_EXACT: min = max = GET2(ecode, 1); ecode += 3; goto REPEATCHAR; + case OP_POSUPTO: + possessive = TRUE; + /* Fall through */ + case OP_UPTO: case OP_MINUPTO: min = 0; @@ -1968,6 +1993,27 @@ for (;;) ecode += 3; goto REPEATCHAR; + case OP_POSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATCHAR; + + case OP_POSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATCHAR; + + case OP_POSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATCHAR; + case OP_STAR: case OP_MINSTAR: case OP_PLUS: @@ -2003,10 +2049,9 @@ for (;;) uschar occhars[8]; #ifdef SUPPORT_UCP - int othercase; + unsigned int othercase; if ((ims & PCRE_CASELESS) != 0 && - (othercase = _pcre_ucp_othercase(fc)) >= 0 && - othercase >= 0) + (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) oclength = _pcre_ord2utf8(othercase, occhars); #endif /* SUPPORT_UCP */ @@ -2042,7 +2087,8 @@ for (;;) } /* Control never gets here */ } - else + + else /* Maximize */ { pp = eptr; for (i = min; i < max; i++) @@ -2056,6 +2102,8 @@ for (;;) eptr += oclength; } } + + if (possessive) continue; while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2110,7 +2158,7 @@ for (;;) } /* Control never gets here */ } - else + else /* Maximize */ { pp = eptr; for (i = min; i < max; i++) @@ -2118,6 +2166,7 @@ for (;;) if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; eptr++; } + if (possessive) continue; while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2146,7 +2195,7 @@ for (;;) } /* Control never gets here */ } - else + else /* Maximize */ { pp = eptr; for (i = min; i < max; i++) @@ -2154,6 +2203,7 @@ for (;;) if (eptr >= md->end_subject || fc != *eptr) break; eptr++; } + if (possessive) continue; while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2206,6 +2256,34 @@ for (;;) ecode += 3; goto REPEATNOTCHAR; + case OP_NOTPOSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSUPTO: + possessive = TRUE; + min = 0; + max = GET2(ecode, 1); + ecode += 3; + goto REPEATNOTCHAR; + case OP_NOTSTAR: case OP_NOTMINSTAR: case OP_NOTPLUS: @@ -2245,7 +2323,7 @@ for (;;) /* UTF-8 mode */ if (utf8) { - register int d; + register unsigned int d; for (i = 1; i <= min; i++) { GETCHARINC(d, eptr); @@ -2270,7 +2348,7 @@ for (;;) /* UTF-8 mode */ if (utf8) { - register int d; + register unsigned int d; for (fi = min;; fi++) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2306,7 +2384,7 @@ for (;;) /* UTF-8 mode */ if (utf8) { - register int d; + register unsigned int d; for (i = min; i < max; i++) { int len = 1; @@ -2316,7 +2394,8 @@ for (;;) if (fc == d) break; eptr += len; } - for(;;) + if (possessive) continue; + for(;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -2333,6 +2412,7 @@ for (;;) if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; eptr++; } + if (possessive) continue; while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2354,7 +2434,7 @@ for (;;) /* UTF-8 mode */ if (utf8) { - register int d; + register unsigned int d; for (i = 1; i <= min; i++) { GETCHARINC(d, eptr); @@ -2377,7 +2457,7 @@ for (;;) /* UTF-8 mode */ if (utf8) { - register int d; + register unsigned int d; for (fi = min;; fi++) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2412,7 +2492,7 @@ for (;;) /* UTF-8 mode */ if (utf8) { - register int d; + register unsigned int d; for (i = min; i < max; i++) { int len = 1; @@ -2421,6 +2501,7 @@ for (;;) if (fc == d) break; eptr += len; } + if (possessive) continue; for(;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2438,6 +2519,7 @@ for (;;) if (eptr >= md->end_subject || fc == *eptr) break; eptr++; } + if (possessive) continue; while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2469,6 +2551,34 @@ for (;;) ecode += 3; goto REPEATTYPE; + case OP_TYPEPOSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSUPTO: + possessive = TRUE; + min = 0; + max = GET2(ecode, 1); + ecode += 3; + goto REPEATTYPE; + case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: @@ -2571,7 +2681,6 @@ for (;;) default: RRETURN(PCRE_ERROR_INTERNAL); - break; } } @@ -2611,9 +2720,7 @@ for (;;) for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && - eptr <= md->end_subject - md->nllen && - IS_NEWLINE(eptr))) + ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; @@ -2624,6 +2731,28 @@ for (;;) eptr += min; break; + case OP_ANYNL: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + break; + } + } + break; + case OP_NOT_DIGIT: for (i = 1; i <= min; i++) { @@ -2692,7 +2821,8 @@ for (;;) #endif /* SUPPORT_UTF8 */ /* Code for the non-UTF-8 case for minimum matching of operators other - than OP_PROP and OP_NOTPROP. */ + than OP_PROP and OP_NOTPROP. We can assume that there are the minimum + number of bytes present, as this was tested above. */ switch(ctype) { @@ -2701,8 +2831,7 @@ for (;;) { for (i = 1; i <= min; i++) { - if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) - RRETURN(MATCH_NOMATCH); + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; } } @@ -2713,6 +2842,28 @@ for (;;) eptr += min; break; + /* Because of the CRLF case, we can't assume the minimum number of + bytes are present in this case. */ + + case OP_ANYNL: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + switch(*eptr++) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + break; + } + } + break; + case OP_NOT_DIGIT: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); @@ -2774,7 +2925,7 @@ for (;;) GETCHARINC(c, eptr); if (prop_fail_result) RRETURN(MATCH_NOMATCH); } - break; + /* Control never gets here */ case PT_LAMP: for (fi = min;; fi++) @@ -2789,7 +2940,7 @@ for (;;) prop_chartype == ucp_Lt) == prop_fail_result) RRETURN(MATCH_NOMATCH); } - break; + /* Control never gets here */ case PT_GC: for (fi = min;; fi++) @@ -2802,7 +2953,7 @@ for (;;) if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } - break; + /* Control never gets here */ case PT_PC: for (fi = min;; fi++) @@ -2815,7 +2966,7 @@ for (;;) if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } - break; + /* Control never gets here */ case PT_SC: for (fi = min;; fi++) @@ -2828,11 +2979,10 @@ for (;;) if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } - break; + /* Control never gets here */ default: RRETURN(PCRE_ERROR_INTERNAL); - break; } } @@ -2876,7 +3026,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && - eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); @@ -2888,6 +3038,23 @@ for (;;) case OP_ANYBYTE: break; + case OP_ANYNL: + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + break; + } + break; + case OP_NOT_DIGIT: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); @@ -2932,8 +3099,7 @@ for (;;) RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && - eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); c = *eptr++; @@ -2945,6 +3111,21 @@ for (;;) case OP_ANYBYTE: break; + case OP_ANYNL: + switch(c) + { + default: RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) eptr++; + break; + case 0x000a: + case 0x000b: + case 0x000c: + case 0x0085: + break; + } + break; + case OP_NOT_DIGIT: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; @@ -2977,7 +3158,7 @@ for (;;) /* Control never gets here */ } - /* If maximizing it is worth using inline code for speed, doing the type + /* If maximizing, it is worth using inline code for speed, doing the type test once at the start (i.e. keep it out of the loop). Again, keep the UTF-8 and UCP stuff separate. */ @@ -3058,6 +3239,7 @@ for (;;) /* eptr is now past the end of the maximum run */ + if (possessive) continue; for(;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3093,6 +3275,7 @@ for (;;) /* eptr is now past the end of the maximum run */ + if (possessive) continue; for(;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3135,9 +3318,7 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || - (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -3161,9 +3342,7 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || - (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; eptr++; } break; @@ -3171,7 +3350,8 @@ for (;;) else { c = max - min; - if (c > md->end_subject - eptr) c = md->end_subject - eptr; + if (c > (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; eptr += c; } } @@ -3181,10 +3361,32 @@ for (;;) case OP_ANYBYTE: c = max - min; - if (c > md->end_subject - eptr) c = md->end_subject - eptr; + if (c > (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; eptr += c; break; + case OP_ANYNL: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (c == 0x000d) + { + if (++eptr >= md->end_subject) break; + if (*eptr == 0x000a) eptr++; + } + else + { + if (c != 0x000a && c != 0x000b && c != 0x000c && + c != 0x0085 && c != 0x2028 && c != 0x2029) + break; + eptr += len; + } + } + break; + case OP_NOT_DIGIT: for (i = min; i < max; i++) { @@ -3257,6 +3459,7 @@ for (;;) /* eptr is now past the end of the maximum run */ + if (possessive) continue; for(;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3277,9 +3480,7 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || - (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; eptr++; } break; @@ -3288,10 +3489,30 @@ for (;;) case OP_ANYBYTE: c = max - min; - if (c > md->end_subject - eptr) c = md->end_subject - eptr; + if (c > (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; eptr += c; break; + case OP_ANYNL: + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject) break; + c = *eptr; + if (c == 0x000d) + { + if (++eptr >= md->end_subject) break; + if (*eptr == 0x000a) eptr++; + } + else + { + if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085) + break; + eptr++; + } + } + break; + case OP_NOT_DIGIT: for (i = min; i < max; i++) { @@ -3352,6 +3573,7 @@ for (;;) /* eptr is now past the end of the maximum run */ + if (possessive) continue; while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3366,14 +3588,12 @@ for (;;) } /* Control never gets here */ - /* There's been some horrible disaster. Since all codes > OP_BRA are - for capturing brackets, and there shouldn't be any gaps between 0 and - OP_BRA, arrival here can only mean there is something seriously wrong - in the code above or the OP_xxx definitions. */ + /* There's been some horrible disaster. Arrival here can only mean there is + something seriously wrong in the code above or the OP_xxx definitions. */ default: DPRINTF(("Unknown opcode %d\n", *ecode)); - RRETURN(PCRE_ERROR_UNKNOWN_NODE); + RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); } /* Do not stick any code in here without much thought; it is assumed @@ -3411,7 +3631,6 @@ Undefine all the macros that were defined above to handle this. */ #undef cur_is_word #undef condition -#undef minimize #undef prev_is_word #undef original_ims @@ -3484,6 +3703,7 @@ BOOL startline; BOOL firstline; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; +BOOL utf8; match_data match_block; match_data *md = &match_block; const uschar *tables; @@ -3491,6 +3711,7 @@ const uschar *start_bits = NULL; USPTR start_match = (USPTR)subject + start_offset; USPTR end_subject; USPTR req_byte_ptr = start_match - 1; +eptrblock eptrchain[EPTR_WORK_SIZE]; pcre_study_data internal_study; const pcre_study_data *study; @@ -3567,7 +3788,7 @@ md->end_subject = md->start_subject + length; end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; -md->utf8 = (re->options & PCRE_UTF8) != 0; +utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; @@ -3576,6 +3797,7 @@ md->partial = (options & PCRE_PARTIAL) != 0; md->hitend = FALSE; md->recursive = NULL; /* No recursion at top level */ +md->eptrchain = eptrchain; /* Make workspace generally available */ md->lcc = tables + lcc_offset; md->ctypes = tables + ctypes_offset; @@ -3583,26 +3805,36 @@ md->ctypes = tables + ctypes_offset; /* Handle different types of newline. The two bits give four cases. If nothing is set at run time, whatever was used at compile time applies. */ -switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & - PCRE_NEWLINE_CRLF) +switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) & + PCRE_NEWLINE_BITS) { - default: newline = NEWLINE; break; /* Compile-time default */ + case 0: newline = NEWLINE; break; /* Compile-time default */ case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + case PCRE_NEWLINE_ANY: newline = -1; break; + default: return PCRE_ERROR_BADNEWLINE; } -if (newline > 255) +if (newline < 0) { - md->nllen = 2; - md->nl[0] = (newline >> 8) & 255; - md->nl[1] = newline & 255; + md->nltype = NLTYPE_ANY; } else { - md->nllen = 1; - md->nl[0] = newline; + md->nltype = NLTYPE_FIXED; + if (newline > 255) + { + md->nllen = 2; + md->nl[0] = (newline >> 8) & 255; + md->nl[1] = newline & 255; + } + else + { + md->nllen = 1; + md->nl[0] = newline; + } } /* Partial matching is supported only for a restricted set of regexes at the @@ -3615,7 +3847,7 @@ if (md->partial && (re->options & PCRE_NOPARTIAL) != 0) back the character offset. */ #ifdef SUPPORT_UTF8 -if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { if (_pcre_valid_utf8((uschar *)subject, length) >= 0) return PCRE_ERROR_BADUTF8; @@ -3707,10 +3939,13 @@ if ((re->options & PCRE_REQCHSET) != 0) req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ } + +/* ==========================================================================*/ + /* Loop for handling unanchored repeated matching attempts; for anchored regexs the loop runs just once. */ -do +for(;;) { USPTR save_end_subject = end_subject; @@ -3725,14 +3960,14 @@ do /* Advance to a unique first char if possible. If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. - Implement this by temporarily adjusting end_subject so that we stop scanning - at a newline. If the match fails at the newline, later code breaks this loop. - */ + That is, the match must be before or at the first newline. Implement this by + temporarily adjusting end_subject so that we stop scanning at a newline. If + the match fails at the newline, later code breaks this loop. */ if (firstline) { USPTR t = start_match; - while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; + while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } @@ -3753,11 +3988,9 @@ do else if (startline) { - if (start_match >= md->start_subject + md->nllen + - start_offset) + if (start_match > md->start_subject + start_offset) { - while (start_match <= end_subject && - !IS_NEWLINE(start_match - md->nllen)) + while (start_match <= end_subject && !WAS_NEWLINE(start_match)) start_match++; } } @@ -3793,8 +4026,8 @@ do HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This - showed up when somebody was matching /^C/ on a 32-megabyte string... so we - don't do this when the string is sufficiently long. + showed up when somebody was matching something like /^\d+C/ on a 32-megabyte + string... so we don't do this when the string is sufficiently long. ALSO: this processing is disabled when partial matching is requested. */ @@ -3826,9 +4059,14 @@ do } } - /* If we can't find the required character, break the matching loop */ + /* If we can't find the required character, break the matching loop, + forcing a match failure. */ - if (p >= end_subject) break; + if (p >= end_subject) + { + rc = MATCH_NOMATCH; + break; + } /* If we have found the required character, save the point where we found it, so that we don't search again next time round the loop if @@ -3838,49 +4076,70 @@ do } } - /* When a match occurs, substrings will be set for all internal extractions; - we just need to set up the whole thing as substring 0 before returning. If - there were too many extractions, set the return code to zero. In the case - where we had to get some local store to hold offsets for backreferences, copy - those back references that we can. In this case there need not be overflow - if certain parts of the pattern were not used. */ + /* OK, we can now run the match. */ md->start_match = start_match; md->match_call_count = 0; + md->eptrn = 0; /* Next free eptrchain slot */ + rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0); - rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0); + /* Any return other than MATCH_NOMATCH breaks the loop. */ - /* When the result is no match, if the subject's first character was a - newline and the PCRE_FIRSTLINE option is set, break (which will return - PCRE_ERROR_NOMATCH). The option requests that a match occur before the first - newline in the subject. Otherwise, advance the pointer to the next character - and continue - but the continuation will actually happen only when the - pattern is not anchored. */ + if (rc != MATCH_NOMATCH) break; - if (rc == MATCH_NOMATCH) - { - if (firstline && - start_match <= md->end_subject - md->nllen && - IS_NEWLINE(start_match)) - break; - start_match++; + /* If PCRE_FIRSTLINE is set, the match must happen before or at the first + newline in the subject (though it may continue over the newline). Therefore, + if we have just failed to match, starting at a newline, do not continue. */ + + if (firstline && IS_NEWLINE(start_match)) break; + + /* Advance the match position by one character. */ + + start_match++; #ifdef SUPPORT_UTF8 - if (md->utf8) - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; + if (utf8) + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; #endif - continue; - } - if (rc != MATCH_MATCH) - { - DPRINTF((">>>> error: returning %d\n", rc)); - return rc; - } + /* Break the loop if the pattern is anchored or if we have passed the end of + the subject. */ + + if (anchored || start_match > end_subject) break; + + /* If we have just passed a CR and the newline option is CRLF or ANY, and we + are now at a LF, advance the match position by one more character. */ + + if (start_match[-1] == '\r' && + (md->nltype == NLTYPE_ANY || md->nllen == 2) && + start_match < end_subject && + *start_match == '\n') + start_match++; + + } /* End of for(;;) "bumpalong" loop */ + +/* ==========================================================================*/ + +/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping +conditions is true: - /* We have a match! Copy the offset information from temporary store if - necessary */ +(1) The pattern is anchored; +(2) We are past the end of the subject; + +(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because + this option requests that a match occur at or before the first newline in + the subject. + +When we have a match and the offset vector is big enough to deal with any +backreferences, captured substring offsets will already be set up. In the case +where we had to get some local store to hold offsets for backreference +processing, copy those that we can. In this case there need not be overflow if +certain parts of the pattern were not used, even though there are more +capturing parentheses than vector slots. */ + +if (rc == MATCH_MATCH) + { if (using_temporary_offsets) { if (offsetcount >= 4) @@ -3889,15 +4148,18 @@ do (offsetcount - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } - if (md->end_offset_top > offsetcount) - md->offset_overflow = TRUE; - + if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; DPRINTF(("Freeing temporary memory\n")); (pcre_free)(md->offset_vector); } + /* Set the return code to the number of captured strings, or 0 if there are + too many to fit into the vector. */ + rc = md->offset_overflow? 0 : md->end_offset_top/2; + /* If there is space, set up the whole thing as substring 0. */ + if (offsetcount < 2) rc = 0; else { offsets[0] = start_match - md->start_subject; @@ -3908,9 +4170,8 @@ do return rc; } -/* This "while" is the end of the "do" above */ - -while (!anchored && start_match <= end_subject); +/* Control gets here if there has been an error, or if the overall match +attempt has failed at all permitted starting positions. */ if (using_temporary_offsets) { @@ -3918,7 +4179,12 @@ if (using_temporary_offsets) (pcre_free)(md->offset_vector); } -if (md->partial && md->hitend) +if (rc != MATCH_NOMATCH) + { + DPRINTF((">>>> error: returning %d\n", rc)); + return rc; + } +else if (md->partial && md->hitend) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); return PCRE_ERROR_PARTIAL; |