summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_exec.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
-rw-r--r--ext/pcre/pcrelib/pcre_exec.c1139
1 files changed, 625 insertions, 514 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c
index 05d0e52d33..654eb9e276 100644
--- a/ext/pcre/pcrelib/pcre_exec.c
+++ b/ext/pcre/pcrelib/pcre_exec.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2012 University of Cambridge
+ Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -56,6 +56,20 @@ possible. There are also some static supporting functions. */
#undef min
#undef max
+/* The md->capture_last field uses the lower 16 bits for the last captured
+substring (which can never be greater than 65535) and a bit in the top half
+to mean "capture vector overflowed". This odd way of doing things was
+implemented when it was realized that preserving and restoring the overflow bit
+whenever the last capture number was saved/restored made for a neater
+interface, and doing it this way saved on (a) another variable, which would
+have increased the stack frame size (a big NO-NO in PCRE) and (b) another
+separate set of save/restore instructions. The following defines are used in
+implementing this. */
+
+#define CAPLMASK 0x0000ffff /* The bits used for last_capture */
+#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
+#define OVFLBIT 0x00010000 /* The bit that is set for overflow */
+
/* Values for setting in md->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. */
@@ -73,13 +87,17 @@ defined PCRE_ERROR_xxx codes, which are all negative. */
negative to avoid the external error codes. */
#define MATCH_ACCEPT (-999)
-#define MATCH_COMMIT (-998)
-#define MATCH_KETRPOS (-997)
-#define MATCH_ONCE (-996)
+#define MATCH_KETRPOS (-998)
+#define MATCH_ONCE (-997)
+/* The next 5 must be kept together and in sequence so that a test that checks
+for any one of them can use a range. */
+#define MATCH_COMMIT (-996)
#define MATCH_PRUNE (-995)
#define MATCH_SKIP (-994)
#define MATCH_SKIP_ARG (-993)
#define MATCH_THEN (-992)
+#define MATCH_BACKTRACK_MAX MATCH_THEN
+#define MATCH_BACKTRACK_MIN MATCH_COMMIT
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
@@ -89,8 +107,8 @@ because the offset vector is always a multiple of 3 long. */
/* Min and max values for the common repeats; for the maxima, 0 => infinity */
-static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
-static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
+static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
+static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
#ifdef PCRE_DEBUG
/*************************************************
@@ -116,7 +134,7 @@ pcre_uint32 c;
BOOL utf = md->utf;
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
while (length-- > 0)
- if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
+ if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
}
#endif
@@ -149,7 +167,7 @@ match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
{
PCRE_PUCHAR eptr_start = eptr;
register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined SUPPORT_UCP
BOOL utf = md->utf;
#endif
@@ -177,8 +195,7 @@ ASCII characters. */
if (caseless)
{
-#ifdef SUPPORT_UTF
-#ifdef SUPPORT_UCP
+#if defined SUPPORT_UTF && defined SUPPORT_UCP
if (utf)
{
/* Match characters up to the end of the reference. NOTE: the number of
@@ -212,17 +229,16 @@ if (caseless)
}
else
#endif
-#endif
/* The same code works when not in UTF-8 mode and in UTF-8 mode when there
is no UCP support. */
{
while (length-- > 0)
{
- pcre_uchar cc, cp;
+ pcre_uint32 cc, cp;
if (eptr >= md->end_subject) return -2; /* Partial match */
- cc = RAWUCHARTEST(eptr);
- cp = RAWUCHARTEST(p);
+ cc = UCHAR21TEST(eptr);
+ cp = UCHAR21TEST(p);
if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
p++;
eptr++;
@@ -238,7 +254,7 @@ else
while (length-- > 0)
{
if (eptr >= md->end_subject) return -2; /* Partial match */
- if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
+ if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
}
}
@@ -416,10 +432,10 @@ typedef struct heapframe {
int Xlength;
int Xmax;
int Xmin;
- int Xnumber;
+ unsigned int Xnumber;
int Xoffset;
- int Xop;
- int Xsave_capture_last;
+ unsigned int Xop;
+ pcre_int32 Xsave_capture_last;
int Xsave_offset1, Xsave_offset2, Xsave_offset3;
int Xstacksave[REC_STACK_SAVE_MAX];
@@ -634,8 +650,8 @@ int max;
int min;
unsigned int number;
int offset;
-pcre_uchar op;
-int save_capture_last;
+unsigned int op;
+pcre_int32 save_capture_last;
int save_offset1, save_offset2, save_offset3;
int stacksave[REC_STACK_SAVE_MAX];
@@ -763,23 +779,16 @@ for (;;)
case OP_FAIL:
RRETURN(MATCH_NOMATCH);
- /* COMMIT overrides PRUNE, SKIP, and THEN */
-
case OP_COMMIT:
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM52);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
- rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
- rrc != MATCH_THEN)
- RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_COMMIT);
- /* PRUNE overrides THEN */
-
case OP_PRUNE:
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM51);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_PRUNE);
case OP_PRUNE_ARG:
@@ -789,38 +798,39 @@ for (;;)
eptrb, RM56);
if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
md->mark == NULL) md->mark = ecode + 2;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_PRUNE);
- /* SKIP overrides PRUNE and THEN */
-
case OP_SKIP:
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM53);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
- RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
md->start_match_ptr = eptr; /* Pass back current position */
RRETURN(MATCH_SKIP);
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
- nomatch_mark. There is a flag that disables this opcode when re-matching a
- pattern that ended with a SKIP for which there was not a matching MARK. */
+ nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
+ not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
+ that failed and any that precede it (either they also failed, or were not
+ triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
+ SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
+ set to the count of the one that failed. */
case OP_SKIP_ARG:
- if (md->ignore_skip_arg)
+ md->skip_arg_count++;
+ if (md->skip_arg_count <= md->ignore_skip_arg)
{
ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
break;
}
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
eptrb, RM57);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
- RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* Pass back the current skip name by overloading md->start_match_ptr and
returning the special MATCH_SKIP_ARG return code. This will either be
caught by a matching MARK, or get to the top, where it causes a rematch
- with the md->ignore_skip_arg flag set. */
+ with md->ignore_skip_arg set to the value of md->skip_arg_count. */
md->start_match_ptr = ecode + 2;
RRETURN(MATCH_SKIP_ARG);
@@ -1066,6 +1076,7 @@ for (;;)
/* In all other cases, we have to make another call to match(). */
save_mark = md->mark;
+ save_capture_last = md->capture_last;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
RM2);
@@ -1097,6 +1108,7 @@ for (;;)
ecode += GET(ecode, 1);
md->mark = save_mark;
if (*ecode != OP_ALT) break;
+ md->capture_last = save_capture_last;
}
RRETURN(MATCH_NOMATCH);
@@ -1155,10 +1167,16 @@ for (;;)
if (rrc == MATCH_KETRPOS)
{
offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
save_capture_last = md->capture_last;
matched_once = TRUE;
+ mstart = md->start_match_ptr; /* In case \K changed it */
+ if (eptr == md->end_match_ptr) /* Matched an empty string */
+ {
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+ break;
+ }
+ eptr = md->end_match_ptr;
continue;
}
@@ -1218,6 +1236,7 @@ for (;;)
POSSESSIVE_NON_CAPTURE:
matched_once = FALSE;
code_offset = (int)(ecode - md->start_code);
+ save_capture_last = md->capture_last;
for (;;)
{
@@ -1227,9 +1246,15 @@ for (;;)
if (rrc == MATCH_KETRPOS)
{
offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
matched_once = TRUE;
+ mstart = md->start_match_ptr; /* In case \K reset it */
+ if (eptr == md->end_match_ptr) /* Matched an empty string */
+ {
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+ break;
+ }
+ eptr = md->end_match_ptr;
continue;
}
@@ -1247,6 +1272,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
+ md->capture_last = save_capture_last;
}
if (matched_once || allow_zero)
@@ -1258,25 +1284,32 @@ for (;;)
/* Control never reaches here. */
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. */
+ /* Conditional group: compilation checked that there are no more than two
+ branches. If the condition is false, skipping the first branch takes us
+ past the end of the item if there is only one branch, but that's exactly
+ what we want. */
case OP_COND:
case OP_SCOND:
- codelink = GET(ecode, 1);
+
+ /* The variable codelink will be added to ecode when the condition is
+ false, to get to the second branch. Setting it to the offset to the ALT
+ or KET, then incrementing ecode achieves this effect. We now have ecode
+ pointing to the condition or callout. */
+
+ codelink = GET(ecode, 1); /* Offset to the second branch */
+ ecode += 1 + LINK_SIZE; /* From this opcode */
/* Because of the way auto-callout works during compile, a callout item is
inserted between OP_COND and an assertion condition. */
- if (ecode[LINK_SIZE+1] == OP_CALLOUT)
+ if (*ecode == OP_CALLOUT)
{
if (PUBL(callout) != NULL)
{
PUBL(callout_block) cb;
cb.version = 2; /* Version 1 of the callout block */
- cb.callout_number = ecode[LINK_SIZE+2];
+ cb.callout_number = ecode[1];
cb.offset_vector = md->offset_vector;
#if defined COMPILE_PCRE8
cb.subject = (PCRE_SPTR)md->start_subject;
@@ -1288,215 +1321,130 @@ for (;;)
cb.subject_length = (int)(md->end_subject - md->start_subject);
cb.start_match = (int)(mstart - md->start_subject);
cb.current_position = (int)(eptr - md->start_subject);
- cb.pattern_position = GET(ecode, LINK_SIZE + 3);
- cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
+ cb.pattern_position = GET(ecode, 2);
+ cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
+ cb.capture_last = md->capture_last & CAPLMASK;
+ /* Internal change requires this for API compatibility. */
+ if (cb.capture_last == 0) cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = md->nomatch_mark;
if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
+
+ /* Advance ecode past the callout, so it now points to the condition. We
+ must adjust codelink so that the value of ecode+codelink is unchanged. */
+
ecode += PRIV(OP_lengths)[OP_CALLOUT];
+ codelink -= PRIV(OP_lengths)[OP_CALLOUT];
}
- condcode = ecode[LINK_SIZE+1];
-
- /* Now see what the actual condition is */
+ /* Test the various possible conditions */
- if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
+ condition = FALSE;
+ switch(condcode = *ecode)
{
- if (md->recursive == NULL) /* Not recursing => FALSE */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
- else
+ case OP_RREF: /* Numbered group recursion test */
+ if (md->recursive != NULL) /* Not recursing => FALSE */
{
- unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
+ unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
condition = (recno == RREF_ANY || recno == md->recursive->group_num);
+ }
+ break;
- /* If the test is for recursion into a specific subpattern, and it is
- false, but the test was set up by name, scan the table to see if the
- name refers to any other numbers, and test them. The condition is true
- if any one is set. */
-
- if (!condition && condcode == OP_NRREF)
+ case OP_DNRREF: /* Duplicate named group recursion test */
+ if (md->recursive != NULL)
+ {
+ int count = GET2(ecode, 1 + IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ while (count-- > 0)
{
- pcre_uchar *slotA = md->name_table;
- for (i = 0; i < md->name_count; i++)
- {
- if (GET2(slotA, 0) == recno) break;
- slotA += md->name_entry_size;
- }
-
- /* Found a name for the number - there can be only one; duplicate
- names for different numbers are allowed, but not vice versa. First
- scan down for duplicates. */
-
- if (i < md->name_count)
- {
- pcre_uchar *slotB = slotA;
- while (slotB > md->name_table)
- {
- slotB -= md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- condition = GET2(slotB, 0) == md->recursive->group_num;
- if (condition) break;
- }
- else break;
- }
-
- /* Scan up for duplicates */
-
- if (!condition)
- {
- slotB = slotA;
- for (i++; i < md->name_count; i++)
- {
- slotB += md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- condition = GET2(slotB, 0) == md->recursive->group_num;
- if (condition) break;
- }
- else break;
- }
- }
- }
+ unsigned int recno = GET2(slot, 0);
+ condition = recno == md->recursive->group_num;
+ if (condition) break;
+ slot += md->name_entry_size;
}
-
- /* Chose branch according to the condition */
-
- ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
}
- }
+ break;
- else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
- {
- offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
+ case OP_CREF: /* Numbered group used test */
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
condition = offset < offset_top && md->offset_vector[offset] >= 0;
+ break;
- /* If the numbered capture is unset, but the reference was by name,
- scan the table to see if the name refers to any other numbers, and test
- them. The condition is true if any one is set. This is tediously similar
- to the code above, but not close enough to try to amalgamate. */
-
- if (!condition && condcode == OP_NCREF)
+ case OP_DNCREF: /* Duplicate named group used test */
{
- unsigned int refno = offset >> 1;
- pcre_uchar *slotA = md->name_table;
-
- for (i = 0; i < md->name_count; i++)
+ int count = GET2(ecode, 1 + IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ while (count-- > 0)
{
- if (GET2(slotA, 0) == refno) break;
- slotA += md->name_entry_size;
- }
-
- /* Found a name for the number - there can be only one; duplicate names
- for different numbers are allowed, but not vice versa. First scan down
- for duplicates. */
-
- if (i < md->name_count)
- {
- pcre_uchar *slotB = slotA;
- while (slotB > md->name_table)
- {
- slotB -= md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- offset = GET2(slotB, 0) << 1;
- condition = offset < offset_top &&
- md->offset_vector[offset] >= 0;
- if (condition) break;
- }
- else break;
- }
-
- /* Scan up for duplicates */
-
- if (!condition)
- {
- slotB = slotA;
- for (i++; i < md->name_count; i++)
- {
- slotB += md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- offset = GET2(slotB, 0) << 1;
- condition = offset < offset_top &&
- md->offset_vector[offset] >= 0;
- if (condition) break;
- }
- else break;
- }
- }
+ offset = GET2(slot, 0) << 1;
+ condition = offset < offset_top && md->offset_vector[offset] >= 0;
+ if (condition) break;
+ slot += md->name_entry_size;
}
}
+ break;
- /* Chose branch according to the condition */
-
- ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
- }
-
- else if (condcode == OP_DEF) /* DEFINE - always false */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
+ case OP_DEF: /* DEFINE - always false */
+ break;
- /* The condition is an assertion. Call match() to evaluate it - setting
- md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
- an assertion. */
+ /* The condition is an assertion. Call match() to evaluate it - setting
+ md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
+ of an assertion. */
- else
- {
+ default:
md->match_function_type = MATCH_CONDASSERT;
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
+ RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
if (rrc == MATCH_MATCH)
{
if (md->end_offset_top > offset_top)
offset_top = md->end_offset_top; /* Captures may have happened */
condition = TRUE;
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
+
+ /* Advance ecode past the assertion to the start of the first branch,
+ but adjust it so that the general choosing code below works. */
+
+ ecode += GET(ecode, 1);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
+ ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
}
/* PCRE doesn't allow the effect of (*THEN) to escape beyond an
- assertion; it is therefore treated as NOMATCH. */
+ assertion; it is therefore treated as NOMATCH. Any other return is an
+ error. */
else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
{
RRETURN(rrc); /* Need braces because of following else */
}
- else
- {
- condition = FALSE;
- ecode += codelink;
- }
+ break;
}
- /* We are now at the branch that is to be obeyed. As there is only one, can
- use tail recursion to avoid using another stack frame, except when there is
- unlimited repeat of a possibly empty group. In the latter case, a recursive
- call to match() is always required, unless the second alternative doesn't
- exist, in which case we can just plough on. Note that, for compatibility
- with Perl, the | in a conditional group is NOT treated as creating two
- alternatives. If a THEN is encountered in the branch, it propagates out to
- the enclosing alternative (unless nested in a deeper set of alternatives,
- of course). */
-
- if (condition || *ecode == OP_ALT)
+ /* Choose branch according to the condition */
+
+ ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
+
+ /* We are now at the branch that is to be obeyed. As there is only one, we
+ can use tail recursion to avoid using another stack frame, except when
+ there is unlimited repeat of a possibly empty group. In the latter case, a
+ recursive call to match() is always required, unless the second alternative
+ doesn't exist, in which case we can just plough on. Note that, for
+ compatibility with Perl, the | in a conditional group is NOT treated as
+ creating two alternatives. If a THEN is encountered in the branch, it
+ propagates out to the enclosing alternative (unless nested in a deeper set
+ of alternatives, of course). */
+
+ if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
{
if (op != OP_SCOND)
{
- ecode += 1 + LINK_SIZE;
goto TAIL_RECURSE;
}
md->match_function_type = MATCH_CBEGROUP;
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
+ RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
RRETURN(rrc);
}
@@ -1504,7 +1452,6 @@ for (;;)
else
{
- ecode += 1 + LINK_SIZE;
}
break;
@@ -1513,7 +1460,7 @@ for (;;)
to close any currently open capturing brackets. */
case OP_CLOSE:
- number = GET2(ecode, 1);
+ number = GET2(ecode, 1); /* Must be less than 65536 */
offset = number << 1;
#ifdef PCRE_DEBUG
@@ -1521,8 +1468,8 @@ for (;;)
printf("\n");
#endif
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ md->capture_last = (md->capture_last & OVFLMASK) | number;
+ if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
@@ -1584,28 +1531,49 @@ for (;;)
}
else condassert = FALSE;
+ /* Loop for each branch */
+
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
+
+ /* A match means that the assertion is true; break out of the loop
+ that matches its alternatives. */
+
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
mstart = md->start_match_ptr; /* In case \K reset it */
break;
}
+
+ /* If not matched, restore the previous mark setting. */
+
md->mark = save_mark;
- /* A COMMIT failure must fail the entire assertion, without trying any
- subsequent branches. */
+ /* See comment in the code for capturing groups above about handling
+ THEN. */
- if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
+ if (rrc == MATCH_THEN)
+ {
+ next = ecode + GET(ecode,1);
+ if (md->start_match_ptr < next &&
+ (*ecode == OP_ALT || *next == OP_ALT))
+ rrc = MATCH_NOMATCH;
+ }
- /* PCRE does not allow THEN to escape beyond an assertion; it
- is treated as NOMATCH. */
+ /* Anything other than NOMATCH causes the entire assertion to fail,
+ passing back the return code. This includes COMMIT, SKIP, PRUNE and an
+ uncaptured THEN, which means they take their normal effect. This
+ consistent approach does not always have exactly the same effect as in
+ Perl. */
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
}
- while (*ecode == OP_ALT);
+ while (*ecode == OP_ALT); /* Continue for next alternative */
+
+ /* If we have tried all the alternative branches, the assertion has
+ failed. If not, we broke out after a match. */
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
@@ -1613,17 +1581,16 @@ for (;;)
if (condassert) RRETURN(MATCH_MATCH);
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
+ /* Continue from after a successful assertion, updating the offsets high
+ water mark, since extracts may have been taken during the assertion. */
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
ecode += 1 + LINK_SIZE;
offset_top = md->end_offset_top;
continue;
- /* Negative assertion: all branches must fail to match. Encountering SKIP,
- PRUNE, or COMMIT means we must assume failure without checking subsequent
- branches. */
+ /* Negative assertion: all branches must fail to match for the assertion to
+ succeed. */
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
@@ -1635,28 +1602,64 @@ for (;;)
}
else condassert = FALSE;
+ /* Loop for each alternative branch. */
+
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
- md->mark = save_mark;
- if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
- if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
+ md->mark = save_mark; /* Always restore the mark setting */
+
+ switch(rrc)
{
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ case MATCH_MATCH: /* A successful match means */
+ case MATCH_ACCEPT: /* the assertion has failed. */
+ RRETURN(MATCH_NOMATCH);
+
+ case MATCH_NOMATCH: /* Carry on with next branch */
break;
+
+ /* See comment in the code for capturing groups above about handling
+ THEN. */
+
+ case MATCH_THEN:
+ next = ecode + GET(ecode,1);
+ if (md->start_match_ptr < next &&
+ (*ecode == OP_ALT || *next == OP_ALT))
+ {
+ rrc = MATCH_NOMATCH;
+ break;
+ }
+ /* Otherwise fall through. */
+
+ /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
+ assertion to fail to match, without considering any more alternatives.
+ Failing to match means the assertion is true. This is a consistent
+ approach, but does not always have the same effect as in Perl. */
+
+ case MATCH_COMMIT:
+ case MATCH_SKIP:
+ case MATCH_SKIP_ARG:
+ case MATCH_PRUNE:
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
+
+ /* Anything else is an error */
+
+ default:
+ RRETURN(rrc);
}
- /* PCRE does not allow THEN to escape beyond an assertion; it is treated
- as NOMATCH. */
+ /* Continue with next branch */
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
- if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
+ /* All branches in the assertion failed to match. */
- ecode += 1 + LINK_SIZE;
+ NEG_ASSERT_TRUE:
+ if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
+ ecode += 1 + LINK_SIZE; /* Continue with current branch */
continue;
/* Move the subject pointer back. This occurs only at the start of
@@ -1716,7 +1719,9 @@ for (;;)
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
+ cb.capture_last = md->capture_last & CAPLMASK;
+ /* Internal change requires this for API compatibility. */
+ if (cb.capture_last == 0) cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = md->nomatch_mark;
if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
@@ -1762,6 +1767,7 @@ for (;;)
/* Add to "recursing stack" */
new_recursive.group_num = recno;
+ new_recursive.saved_capture_last = md->capture_last;
new_recursive.subject_position = eptr;
new_recursive.prevrec = md->recursive;
md->recursive = &new_recursive;
@@ -1785,8 +1791,9 @@ for (;;)
new_recursive.saved_max * sizeof(int));
/* OK, now we can do the recursion. After processing each alternative,
- restore the offset data. If there were nested recursions, md->recursive
- might be changed, so reset it before looping. */
+ restore the offset data and the last captured value. If there were nested
+ recursions, md->recursive might be changed, so reset it before looping.
+ */
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
cbegroup = (*callpat >= OP_SBRA);
@@ -1797,6 +1804,7 @@ for (;;)
md, eptrb, RM6);
memcpy(md->offset_vector, new_recursive.offset_save,
new_recursive.saved_max * sizeof(int));
+ md->capture_last = new_recursive.saved_capture_last;
md->recursive = new_recursive.prevrec;
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
@@ -1813,11 +1821,16 @@ for (;;)
goto RECURSION_MATCHED; /* Exit loop; end processing */
}
- /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
- is treated as NOMATCH. */
+ /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
+ recursion; they cause a NOMATCH for the entire recursion. These codes
+ are defined in a range that can be tested for. */
+
+ if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
+ RRETURN(MATCH_NOMATCH);
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
- rrc != MATCH_COMMIT)
+ /* Any return code other than NOMATCH is an error. */
+
+ if (rrc != MATCH_NOMATCH)
{
DPRINTF(("Recursion gave error %d\n", rrc));
if (new_recursive.offset_save != stacksave)
@@ -1947,8 +1960,8 @@ for (;;)
/* Deal with capturing */
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ md->capture_last = (md->capture_last & OVFLMASK) | number;
+ if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
{
/* If offset is greater than offset_top, it means that we are
"skipping" a capturing group, and that group's offsets must be marked
@@ -1976,6 +1989,19 @@ for (;;)
}
}
+ /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
+ and return the MATCH_KETRPOS. This makes it possible to do the repeats one
+ at a time from the outer level, thus saving stack. This must precede the
+ empty string test - in this case that test is done at the outer level. */
+
+ if (*ecode == OP_KETRPOS)
+ {
+ md->start_match_ptr = mstart; /* In case \K reset it */
+ md->end_match_ptr = eptr;
+ md->end_offset_top = offset_top;
+ RRETURN(MATCH_KETRPOS);
+ }
+
/* For an ordinary non-repeating ket, just continue at this level. This
also happens for a repeating ket if no characters were matched in the
group. This is the forcible breaking of infinite loops as implemented in
@@ -1998,17 +2024,6 @@ for (;;)
break;
}
- /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
- and return the MATCH_KETRPOS. This makes it possible to do the repeats one
- at a time from the outer level, thus saving stack. */
-
- if (*ecode == OP_KETRPOS)
- {
- md->end_match_ptr = eptr;
- md->end_offset_top = offset_top;
- RRETURN(MATCH_KETRPOS);
- }
-
/* The normal repeating kets try the rest of the pattern or restart from
the preceding bracket, in the appropriate order. In the second case, we can
use tail recursion to avoid using another stack frame, unless we have an
@@ -2099,7 +2114,7 @@ for (;;)
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
+ UCHAR21TEST(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2143,7 +2158,7 @@ for (;;)
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
+ UCHAR21TEST(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2286,7 +2301,7 @@ for (;;)
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
+ UCHAR21TEST(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2440,7 +2455,7 @@ for (;;)
{
SCHECK_PARTIAL();
}
- else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
+ else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
break;
case CHAR_LF:
@@ -2571,19 +2586,24 @@ for (;;)
RRETURN(MATCH_NOMATCH);
break;
- case PT_SPACE: /* Perl space */
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
- == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
- c == CHAR_FF || c == CHAR_CR)
- == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
+ (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
+ break;
+ }
break;
case PT_WORD:
@@ -2604,6 +2624,13 @@ for (;;)
}
break;
+ case PT_UCNC:
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == (op == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
+
/* This should never occur */
default:
@@ -2650,15 +2677,7 @@ for (;;)
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- case OP_REF:
- case OP_REFI:
- caseless = op == OP_REFI;
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 1 + IMM2_SIZE;
-
- /* If the reference is unset, there are two possibilities:
+ loops). If the reference is unset, there are two possibilities:
(a) In the default, Perl-compatible state, set the length negative;
this ensures that every attempt at a match fails. We can't just fail
@@ -2668,8 +2687,45 @@ for (;;)
so that the back reference matches an empty string.
Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
+ referenced subpattern.
+
+ The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
+ or to a non-duplicated named group. For a duplicated named group, OP_DNREF
+ and OP_DNREFI are used. In this case we must scan the list of groups to
+ which the name refers, and use the first one that is set. */
+
+ case OP_DNREF:
+ case OP_DNREFI:
+ caseless = op == OP_DNREFI;
+ {
+ int count = GET2(ecode, 1+IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ ecode += 1 + 2*IMM2_SIZE;
+
+ /* Setting the default length first and initializing 'offset' avoids
+ compiler warnings in the REF_REPEAT code. */
+
+ length = (md->jscript_compat)? 0 : -1;
+ offset = 0;
+ while (count-- > 0)
+ {
+ offset = GET2(slot, 0) << 1;
+ if (offset < offset_top && md->offset_vector[offset] >= 0)
+ {
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
+ break;
+ }
+ slot += md->name_entry_size;
+ }
+ }
+ goto REF_REPEAT;
+
+ case OP_REF:
+ case OP_REFI:
+ caseless = op == OP_REFI;
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 1 + IMM2_SIZE;
if (offset >= offset_top || md->offset_vector[offset] < 0)
length = (md->jscript_compat)? 0 : -1;
else
@@ -2677,6 +2733,7 @@ for (;;)
/* Set up for repetition, or handle the non-repeated case */
+ REF_REPEAT:
switch (*ecode)
{
case OP_CRSTAR:
@@ -2825,8 +2882,12 @@ for (;;)
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSPLUS:
+ case OP_CRPOSQUERY:
c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
+ if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
+ else possessive = TRUE;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
@@ -2834,7 +2895,9 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
minimize = (*ecode == OP_CRMINRANGE);
+ possessive = (*ecode == OP_CRPOSRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 1 + IMM2_SIZE);
if (max == 0) max = INT_MAX;
@@ -2976,6 +3039,9 @@ for (;;)
if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
eptr += len;
}
+
+ if (possessive) continue; /* No backtracking */
+
for (;;)
{
RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
@@ -3006,6 +3072,9 @@ for (;;)
if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
eptr++;
}
+
+ if (possessive) continue; /* No backtracking */
+
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
@@ -3021,9 +3090,10 @@ for (;;)
/* Control never gets here */
- /* Match an extended character class. This opcode is encountered only
- when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
- mode, because Unicode properties are supported in non-UTF-8 mode. */
+ /* Match an extended character class. In the 8-bit library, this opcode is
+ encountered only when UTF-8 mode mode is supported. In the 16-bit and
+ 32-bit libraries, codepoints greater than 255 may be encountered even when
+ UTF is not supported. */
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
@@ -3039,8 +3109,12 @@ for (;;)
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSPLUS:
+ case OP_CRPOSQUERY:
c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
+ if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
+ else possessive = TRUE;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
@@ -3048,7 +3122,9 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
minimize = (*ecode == OP_CRMINRANGE);
+ possessive = (*ecode == OP_CRPOSRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 1 + IMM2_SIZE);
if (max == 0) max = INT_MAX;
@@ -3120,6 +3196,9 @@ for (;;)
if (!PRIV(xclass)(c, data, utf)) break;
eptr += len;
}
+
+ if (possessive) continue; /* No backtracking */
+
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
@@ -3150,7 +3229,7 @@ for (;;)
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
RRETURN(MATCH_NOMATCH);
}
- while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
}
else
#endif
@@ -3190,7 +3269,7 @@ for (;;)
if (fc < 128)
{
- pcre_uchar cc = RAWUCHAR(eptr);
+ pcre_uint32 cc = UCHAR21(eptr);
if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
ecode++;
eptr++;
@@ -3295,7 +3374,22 @@ for (;;)
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. */
+ /* Common code for all repeated single-character matches. We first check
+ for the minimum number of characters. If the minimum equals the maximum, we
+ are done. Otherwise, if minimizing, check the rest of the pattern for a
+ match; if there isn't one, advance up to the maximum, one character at a
+ time.
+
+ If maximizing, advance up to the maximum number of matching characters,
+ until eptr is past the end of the maximum run. If possessive, we are
+ then done (no backing up). Otherwise, match at this position; anything
+ other than no match is immediately returned. For nomatch, back up one
+ character, unless we are matching \R and the last thing matched was
+ \r\n, in which case, back up two bytes. When we reach the first optional
+ character position, we can save stack by doing a tail recurse.
+
+ The various UTF/non-UTF and caseful/caseless cases are handled separately,
+ for speed. */
REPEATCHAR:
#ifdef SUPPORT_UTF
@@ -3379,13 +3473,12 @@ for (;;)
}
}
- if (possessive) continue;
-
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
#ifdef SUPPORT_UCP
eptr--;
BACKCHAR(eptr);
@@ -3439,14 +3532,13 @@ for (;;)
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc; /* Faster than pcre_uchar */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHARTEST(eptr);
+ cc = UCHAR21TEST(eptr);
if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
eptr++;
}
@@ -3455,8 +3547,7 @@ for (;;)
{
for (fi = min;; fi++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc; /* Faster than pcre_uchar */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max) RRETURN(MATCH_NOMATCH);
@@ -3465,7 +3556,7 @@ for (;;)
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHARTEST(eptr);
+ cc = UCHAR21TEST(eptr);
if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
eptr++;
}
@@ -3476,29 +3567,26 @@ for (;;)
pp = eptr;
for (i = min; i < max; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc; /* Faster than pcre_uchar */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
break;
}
- cc = RAWUCHARTEST(eptr);
+ cc = UCHAR21TEST(eptr);
if (fc != cc && foc != cc) break;
eptr++;
}
-
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
- /* Control never gets here */
}
/* Caseful comparisons (includes all multi-byte characters) */
@@ -3512,7 +3600,7 @@ for (;;)
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
+ if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
@@ -3529,7 +3617,7 @@ for (;;)
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
+ if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -3543,18 +3631,18 @@ for (;;)
SCHECK_PARTIAL();
break;
}
- if (fc != RAWUCHARTEST(eptr)) break;
+ if (fc != UCHAR21TEST(eptr)) break;
eptr++;
}
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
}
/* Control never gets here */
@@ -3726,7 +3814,7 @@ for (;;)
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = 1; i <= min; i++)
@@ -3764,7 +3852,7 @@ for (;;)
}
}
else
-#endif
+#endif /*SUPPORT_UTF */
/* Not UTF mode */
{
for (fi = min;; fi++)
@@ -3806,17 +3894,18 @@ for (;;)
if (fc == d || (unsigned int)foc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = min; i < max; i++)
@@ -3829,18 +3918,17 @@ for (;;)
if (fc == *eptr || foc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
-
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
- /* Control never gets here */
}
/* Caseful comparisons */
@@ -3941,12 +4029,13 @@ for (;;)
if (fc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
@@ -3964,16 +4053,16 @@ for (;;)
if (fc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
-
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
}
/* Control never gets here */
@@ -4155,7 +4244,12 @@ for (;;)
}
break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+
case PT_SPACE: /* Perl space */
+ case PT_PXSPACE: /* POSIX space */
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject)
@@ -4164,26 +4258,18 @@ for (;;)
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_PXSPACE: /* POSIX space */
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject)
+ switch(c)
{
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ break;
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
}
break;
@@ -4225,6 +4311,22 @@ for (;;)
}
break;
+ case PT_UCNC:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
/* This should not occur */
default:
@@ -4284,7 +4386,7 @@ for (;;)
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- RAWUCHAR(eptr) == NLBLOCK->nl[0])
+ UCHAR21(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -4326,7 +4428,7 @@ for (;;)
default: RRETURN(MATCH_NOMATCH);
case CHAR_CR:
- if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
+ if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
break;
case CHAR_LF:
@@ -4430,14 +4532,13 @@ for (;;)
case OP_DIGIT:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHAR(eptr);
+ cc = UCHAR21(eptr);
if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4448,14 +4549,13 @@ for (;;)
case OP_NOT_WHITESPACE:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHAR(eptr);
+ cc = UCHAR21(eptr);
if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4466,14 +4566,13 @@ for (;;)
case OP_WHITESPACE:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHAR(eptr);
+ cc = UCHAR21(eptr);
if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4484,14 +4583,13 @@ for (;;)
case OP_NOT_WORDCHAR:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHAR(eptr);
+ cc = UCHAR21(eptr);
if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4502,14 +4600,13 @@ for (;;)
case OP_WORDCHAR:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = RAWUCHAR(eptr);
+ cc = UCHAR21(eptr);
if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4892,25 +4989,11 @@ for (;;)
}
/* Control never gets here */
- case PT_SPACE: /* Perl space */
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= md->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
for (fi = min;; fi++)
{
@@ -4923,10 +5006,18 @@ for (;;)
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ break;
+ }
}
/* Control never gets here */
@@ -4976,6 +5067,25 @@ for (;;)
}
/* Control never gets here */
+ case PT_UCNC:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
/* This should never occur */
default:
RRETURN(PCRE_ERROR_INTERNAL);
@@ -5057,7 +5167,7 @@ for (;;)
{
default: RRETURN(MATCH_NOMATCH);
case CHAR_CR:
- if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
+ if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
break;
case CHAR_LF:
@@ -5391,7 +5501,12 @@ for (;;)
}
break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+
case PT_SPACE: /* Perl space */
+ case PT_PXSPACE: /* POSIX space */
for (i = min; i < max; i++)
{
int len = 1;
@@ -5401,30 +5516,21 @@ for (;;)
break;
}
GETCHARLENTEST(c, eptr, len);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
break;
- eptr+= len;
- }
- break;
- case PT_PXSPACE: /* POSIX space */
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject)
- {
- SCHECK_PARTIAL();
+ default:
+ if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ goto ENDLOOP99; /* Break the loop */
break;
}
- GETCHARLENTEST(c, eptr, len);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- break;
eptr+= len;
}
+ ENDLOOP99:
break;
case PT_WORD:
@@ -5470,23 +5576,42 @@ for (;;)
GOT_MAX:
break;
+ case PT_UCNC:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == prop_fail_result)
+ break;
+ eptr += len;
+ }
+ break;
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
if (utf) BACKCHAR(eptr);
}
}
- /* Match extended Unicode sequences. We will get here only if the
+ /* Match extended Unicode grapheme clusters. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
@@ -5518,22 +5643,42 @@ for (;;)
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ int lgb, rgb;
+ PCRE_PUCHAR fptr;
+
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- for (;;) /* Move back over one extended */
+
+ /* Backtracking over an extended grapheme cluster involves inspecting
+ the previous two characters (if present) to see if a break is
+ permitted between them. */
+
+ eptr--;
+ if (!utf) c = *eptr; else
{
- if (!utf) c = *eptr; else
+ BACKCHAR(eptr);
+ GETCHAR(c, eptr);
+ }
+ rgb = UCD_GRAPHBREAK(c);
+
+ for (;;)
+ {
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
+ fptr = eptr - 1;
+ if (!utf) c = *fptr; else
{
- BACKCHAR(eptr);
- GETCHAR(c, eptr);
+ BACKCHAR(fptr);
+ GETCHAR(c, fptr);
}
- if (UCD_CATEGORY(c) != ucp_M) break;
- eptr--;
+ lgb = UCD_GRAPHBREAK(c);
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
+ eptr = fptr;
+ rgb = lgb;
}
}
}
@@ -5547,54 +5692,25 @@ for (;;)
switch(ctype)
{
case OP_ANY:
- if (max < INT_MAX)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
+ if (eptr >= md->end_subject)
{
- if (eptr >= md->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- if (IS_NEWLINE(eptr)) break;
- if (md->partial != 0 && /* Take care with CRLF partial */
- eptr + 1 >= md->end_subject &&
- NLBLOCK->nltype == NLTYPE_FIXED &&
- NLBLOCK->nllen == 2 &&
- RAWUCHAR(eptr) == NLBLOCK->nl[0])
- {
- md->hitend = TRUE;
- if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
- }
- eptr++;
- ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
+ SCHECK_PARTIAL();
+ break;
}
- }
-
- /* Handle unlimited UTF-8 repeat */
-
- else
- {
- for (i = min; i < max; i++)
+ if (IS_NEWLINE(eptr)) break;
+ if (md->partial != 0 && /* Take care with CRLF partial */
+ eptr + 1 >= md->end_subject &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ UCHAR21(eptr) == NLBLOCK->nl[0])
{
- if (eptr >= md->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- if (IS_NEWLINE(eptr)) break;
- if (md->partial != 0 && /* Take care with CRLF partial */
- eptr + 1 >= md->end_subject &&
- NLBLOCK->nltype == NLTYPE_FIXED &&
- NLBLOCK->nllen == 2 &&
- RAWUCHAR(eptr) == NLBLOCK->nl[0])
- {
- md->hitend = TRUE;
- if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
- }
- eptr++;
- ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
+ md->hitend = TRUE;
+ if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
}
+ eptr++;
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -5644,7 +5760,7 @@ for (;;)
if (c == CHAR_CR)
{
if (++eptr >= md->end_subject) break;
- if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
+ if (UCHAR21(eptr) == CHAR_LF) eptr++;
}
else
{
@@ -5799,21 +5915,16 @@ for (;;)
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character, unless we are matching \R and the last thing matched was
- \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
- if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
- RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
+ if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
+ UCHAR21(eptr - 1) == CHAR_CR) eptr--;
}
}
else
@@ -6048,15 +6159,10 @@ for (;;)
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character (byte), unless we are matching \R and the last thing matched
- was \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@@ -6065,11 +6171,8 @@ for (;;)
}
}
- /* Get here if we can't make it match with any permitted repetitions */
-
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
- /* Control never gets here */
/* There's been some horrible disaster. Arrival here can only mean there is
something seriously wrong in the code above or the OP_xxx definitions. */
@@ -6103,10 +6206,10 @@ switch (frame->Xwhere)
LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
LBL(65) LBL(66)
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
- LBL(21)
+ LBL(20) LBL(21)
#endif
#ifdef SUPPORT_UTF
- LBL(16) LBL(18) LBL(20)
+ LBL(16) LBL(18)
LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
@@ -6264,6 +6367,7 @@ const pcre_uint8 *start_bits = NULL;
PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
PCRE_PUCHAR end_subject;
PCRE_PUCHAR start_partial = NULL;
+PCRE_PUCHAR match_partial = NULL;
PCRE_PUCHAR req_char_ptr = start_match - 1;
const pcre_study_data *study;
@@ -6393,9 +6497,11 @@ md->callout_data = NULL;
tables = re->tables;
+/* The two limit values override the defaults, whatever their value. */
+
if (extra_data != NULL)
{
- register unsigned int flags = extra_data->flags;
+ unsigned long int flags = extra_data->flags;
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
@@ -6407,6 +6513,15 @@ if (extra_data != NULL)
if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
}
+/* Limits in the regex override only if they are smaller. */
+
+if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
+ md->match_limit = re->limit_match;
+
+if ((re->flags & PCRE_RLSET) != 0 &&
+ re->limit_recursion < md->match_limit_recursion)
+ md->match_limit_recursion = re->limit_recursion;
+
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
is a feature that makes it possible to save compiled regex and re-use them
in other programs later. */
@@ -6432,7 +6547,7 @@ end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
md->use_ucp = (re->options & PCRE_UCP) != 0;
md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
-md->ignore_skip_arg = FALSE;
+md->ignore_skip_arg = 0;
/* Some options are unpacked into BOOL variables in the hope that testing
them will be faster than individual option bits. */
@@ -6542,11 +6657,9 @@ if (re->top_backref > 0 && re->top_backref >= ocount/3)
DPRINTF(("Got memory to hold back references\n"));
}
else md->offset_vector = offsets;
-
md->offset_end = ocount;
md->offset_max = (2*ocount)/3;
-md->offset_overflow = FALSE;
-md->capture_last = -1;
+md->capture_last = 0;
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
@@ -6658,10 +6771,10 @@ for(;;)
if (first_char != first_char2)
while (start_match < end_subject &&
- (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
+ (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
start_match++;
else
- while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
+ while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
start_match++;
}
@@ -6693,7 +6806,7 @@ for(;;)
if (start_match[-1] == CHAR_CR &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
- RAWUCHARTEST(start_match) == CHAR_NL)
+ UCHAR21TEST(start_match) == CHAR_NL)
start_match++;
}
}
@@ -6704,22 +6817,12 @@ for(;;)
{
while (start_match < end_subject)
{
- register pcre_uint32 c = RAWUCHARTEST(start_match);
+ register pcre_uint32 c = UCHAR21TEST(start_match);
#ifndef COMPILE_PCRE8
if (c > 255) c = 255;
#endif
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- {
- start_match++;
-#if defined SUPPORT_UTF && defined COMPILE_PCRE8
- /* In non 8-bit mode, the iteration will stop for
- characters > 255 at the beginning or not stop at all. */
- if (utf)
- ACROSSCHAR(start_match < end_subject, *start_match,
- start_match++);
-#endif
- }
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+ start_match++;
}
}
} /* Starting optimizations */
@@ -6772,7 +6875,7 @@ for(;;)
{
while (p < end_subject)
{
- register pcre_uint32 pp = RAWUCHARINCTEST(p);
+ register pcre_uint32 pp = UCHAR21INCTEST(p);
if (pp == req_char || pp == req_char2) { p--; break; }
}
}
@@ -6780,7 +6883,7 @@ for(;;)
{
while (p < end_subject)
{
- if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
+ if (UCHAR21INCTEST(p) == req_char) { p--; break; }
}
}
@@ -6816,8 +6919,13 @@ for(;;)
md->match_call_count = 0;
md->match_function_type = 0;
md->end_offset_top = 0;
+ md->skip_arg_count = 0;
rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
- if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
+ if (md->hitend && start_partial == NULL)
+ {
+ start_partial = md->start_used_ptr;
+ match_partial = start_match;
+ }
switch(rc)
{
@@ -6830,14 +6938,14 @@ for(;;)
case MATCH_SKIP_ARG:
new_start_match = start_match;
- md->ignore_skip_arg = TRUE;
+ md->ignore_skip_arg = md->skip_arg_count;
break;
- /* SKIP passes back the next starting point explicitly, but if it is the
- same as the match we have just done, treat it as NOMATCH. */
+ /* SKIP passes back the next starting point explicitly, but if it is no
+ greater than the match we have just done, treat it as NOMATCH. */
case MATCH_SKIP:
- if (md->start_match_ptr != start_match)
+ if (md->start_match_ptr > start_match)
{
new_start_match = md->start_match_ptr;
break;
@@ -6845,12 +6953,12 @@ for(;;)
/* Fall through */
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
- exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
+ exactly like PRUNE. Unset ignore SKIP-with-argument. */
case MATCH_NOMATCH:
case MATCH_PRUNE:
case MATCH_THEN:
- md->ignore_skip_arg = FALSE;
+ md->ignore_skip_arg = 0;
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF
if (utf)
@@ -6943,7 +7051,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
(arg_offset_max - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
- if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
+ if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
DPRINTF(("Freeing temporary memory\n"));
(PUBL(free))(md->offset_vector);
}
@@ -6951,7 +7059,8 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
/* Set the return code to the number of captured strings, or 0 if there were
too many to fit into the vector. */
- rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
+ rc = ((md->capture_last & OVFLBIT) != 0 &&
+ md->end_offset_top >= arg_offset_max)?
0 : md->end_offset_top/2;
/* If there is space in the offset vector, set any unused pairs at the end of
@@ -7016,7 +7125,7 @@ if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
/* Handle partial matches - disable any mark data */
-if (start_partial != NULL)
+if (match_partial != NULL)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
md->mark = NULL;
@@ -7024,6 +7133,8 @@ if (start_partial != NULL)
{
offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
+ if (offsetcount > 2)
+ offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
}
rc = PCRE_ERROR_PARTIAL;
}