summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_compile.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r--ext/pcre/pcrelib/pcre_compile.c302
1 files changed, 249 insertions, 53 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c
index 1e0672c5cd..b197c1b548 100644
--- a/ext/pcre/pcrelib/pcre_compile.c
+++ b/ext/pcre/pcrelib/pcre_compile.c
@@ -339,7 +339,9 @@ static const char error_texts[] =
"number is too big\0"
"subpattern name expected\0"
"digit expected after (?+\0"
- "] is an invalid data character in JavaScript compatibility mode";
+ "] is an invalid data character in JavaScript compatibility mode\0"
+ /* 65 */
+ "different names for subpatterns of the same number are not allowed";
/* Table to identify digits and hex digits. This is used when compiling
@@ -1098,6 +1100,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
if (name != NULL && lorn == ptr - thisname &&
strncmp((const char *)name, (const char *)thisname, lorn) == 0)
return *count;
+ term++;
}
}
}
@@ -1132,19 +1135,21 @@ for (; *ptr != 0; ptr++)
BOOL negate_class = FALSE;
for (;;)
{
- int c = *(++ptr);
- if (c == CHAR_BACKSLASH)
+ if (ptr[1] == CHAR_BACKSLASH)
{
- if (ptr[1] == CHAR_E)
- ptr++;
- else if (strncmp((const char *)ptr+1,
+ if (ptr[2] == CHAR_E)
+ ptr+= 2;
+ else if (strncmp((const char *)ptr+2,
STR_Q STR_BACKSLASH STR_E, 3) == 0)
- ptr += 3;
+ ptr += 4;
else
break;
}
- else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
+ else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
+ {
negate_class = TRUE;
+ ptr++;
+ }
else break;
}
@@ -1310,7 +1315,9 @@ for (;;)
case OP_CALLOUT:
case OP_CREF:
+ case OP_NCREF:
case OP_RREF:
+ case OP_NRREF:
case OP_DEF:
code += _pcre_OP_lengths[*code];
break;
@@ -1326,23 +1333,34 @@ for (;;)
/*************************************************
-* Find the fixed length of a pattern *
+* Find the fixed length of a branch *
*************************************************/
-/* Scan a pattern and compute the fixed length of subject that will match it,
+/* Scan a branch and compute the fixed length of subject that will match it,
if the length is fixed. This is needed for dealing with backward assertions.
-In UTF8 mode, the result is in characters rather than bytes.
+In UTF8 mode, the result is in characters rather than bytes. The branch is
+temporarily terminated with OP_END when this function is called.
+
+This function is called when a backward assertion is encountered, so that if it
+fails, the error message can point to the correct place in the pattern.
+However, we cannot do this when the assertion contains subroutine calls,
+because they can be forward references. We solve this by remembering this case
+and doing the check at the end; a flag specifies which mode we are running in.
Arguments:
code points to the start of the pattern (the bracket)
options the compiling options
+ atend TRUE if called when the pattern is complete
+ cd the "compile data" structure
-Returns: the fixed length, or -1 if there is no fixed length,
+Returns: the fixed length,
+ or -1 if there is no fixed length,
or -2 if \C was encountered
+ or -3 if an OP_RECURSE item was encountered and atend is FALSE
*/
static int
-find_fixedlength(uschar *code, int options)
+find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
{
int length = -1;
@@ -1355,6 +1373,7 @@ branch, check the length against that of the other branches. */
for (;;)
{
int d;
+ uschar *ce, *cs;
register int op = *cc;
switch (op)
{
@@ -1362,7 +1381,7 @@ for (;;)
case OP_BRA:
case OP_ONCE:
case OP_COND:
- d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
+ d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -1385,6 +1404,21 @@ for (;;)
branchlength = 0;
break;
+ /* A true recursion implies not fixed length, but a subroutine call may
+ be OK. If the subroutine is a forward reference, we can't deal with
+ it until the end of the pattern, so return -3. */
+
+ case OP_RECURSE:
+ if (!atend) return -3;
+ cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
+ do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
+ if (cc > cs && cc < ce) return -1; /* Recursion */
+ d = find_fixedlength(cs + 2, options, atend, cd);
+ if (d < 0) return d;
+ branchlength += d;
+ cc += 1 + LINK_SIZE;
+ break;
+
/* Skip over assertive subpatterns */
case OP_ASSERT:
@@ -1398,7 +1432,9 @@ for (;;)
case OP_REVERSE:
case OP_CREF:
+ case OP_NCREF:
case OP_RREF:
+ case OP_NRREF:
case OP_DEF:
case OP_OPT:
case OP_CALLOUT:
@@ -1421,10 +1457,8 @@ for (;;)
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while ((*cc & 0xc0) == 0x80) cc++;
- }
+ if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
+ cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
break;
@@ -1435,10 +1469,8 @@ for (;;)
branchlength += GET2(cc,1);
cc += 4;
#ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while((*cc & 0x80) == 0x80) cc++;
- }
+ if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
+ cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
break;
@@ -1517,22 +1549,25 @@ for (;;)
/*************************************************
-* Scan compiled regex for numbered bracket *
+* Scan compiled regex for specific bracket *
*************************************************/
/* This little function scans through a compiled pattern until it finds a
-capturing bracket with the given number.
+capturing bracket with the given number, or, if the number is negative, an
+instance of OP_REVERSE for a lookbehind. The function is global in the C sense
+so that it can be called from pcre_study() when finding the minimum matching
+length.
Arguments:
code points to start of expression
utf8 TRUE in UTF-8 mode
- number the required bracket number
+ number the required bracket number or negative to find a lookbehind
Returns: pointer to the opcode for the bracket, or NULL if not found
*/
-static const uschar *
-find_bracket(const uschar *code, BOOL utf8, int number)
+const uschar *
+_pcre_find_bracket(const uschar *code, BOOL utf8, int number)
{
for (;;)
{
@@ -1545,6 +1580,14 @@ for (;;)
if (c == OP_XCLASS) code += GET(code, 1);
+ /* Handle recursion */
+
+ else if (c == OP_REVERSE)
+ {
+ if (number < 0) return (uschar *)code;
+ code += _pcre_OP_lengths[c];
+ }
+
/* Handle capturing bracket */
else if (c == OP_CBRA)
@@ -1910,10 +1953,13 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
+ if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
+ break;
+
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
+ if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
break;
#endif
}
@@ -3867,10 +3913,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (repeat_max == 0) goto END_REPEAT;
+ /*--------------------------------------------------------------------*/
+ /* This code is obsolete from release 8.00; the restriction was finally
+ removed: */
+
/* All real repeats make it impossible to handle partial matching (maybe
one day we will be able to remove this restriction). */
- if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
+ /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
+ /*--------------------------------------------------------------------*/
/* Combine the op_type with the repeat_type */
@@ -4017,10 +4068,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
goto END_REPEAT;
}
+ /*--------------------------------------------------------------------*/
+ /* This code is obsolete from release 8.00; the restriction was finally
+ removed: */
+
/* All real repeats make it impossible to handle partial matching (maybe
one day we will be able to remove this restriction). */
- if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
+ /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
+ /*--------------------------------------------------------------------*/
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
@@ -4335,11 +4391,20 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (possessive_quantifier)
{
int len;
- if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
- *tempcode == OP_NOTEXACT)
+
+ if (*tempcode == OP_TYPEEXACT)
tempcode += _pcre_OP_lengths[*tempcode] +
- ((*tempcode == OP_TYPEEXACT &&
- (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
+ ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
+
+ else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
+ {
+ tempcode += _pcre_OP_lengths[*tempcode];
+#ifdef SUPPORT_UTF8
+ if (utf8 && tempcode[-1] >= 0xc0)
+ tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
+#endif
+ }
+
len = code - tempcode;
if (len > 0) switch (*tempcode)
{
@@ -4417,8 +4482,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (namelen == verbs[i].len &&
strncmp((char *)name, vn, namelen) == 0)
{
- *code = verbs[i].op;
- if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
+ /* Check for open captures before ACCEPT */
+
+ if (verbs[i].op == OP_ACCEPT)
+ {
+ open_capitem *oc;
+ cd->had_accept = TRUE;
+ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
+ {
+ *code++ = OP_CLOSE;
+ PUT2INC(code, 0, oc->number);
+ }
+ }
+ *code++ = verbs[i].op;
break;
}
vn += verbs[i].len + 1;
@@ -4580,7 +4656,10 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
}
/* Otherwise (did not start with "+" or "-"), start by looking for the
- name. */
+ name. If we find a name, add one to the opcode to change OP_CREF or
+ OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
+ except they record that the reference was originally to a name. The
+ information is used to check duplicate names. */
slot = cd->name_table;
for (i = 0; i < cd->names_found; i++)
@@ -4595,6 +4674,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
{
recno = GET2(slot, 0);
PUT2(code, 2+LINK_SIZE, recno);
+ code[1+LINK_SIZE]++;
}
/* Search the pattern for a forward reference */
@@ -4603,6 +4683,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
(options & PCRE_EXTENDED) != 0)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
+ code[1+LINK_SIZE]++;
}
/* If terminator == 0 it means that the name followed directly after
@@ -4795,11 +4876,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
}
}
- /* In the real compile, create the entry in the table */
+ /* In the real compile, create the entry in the table, maintaining
+ alphabetical order. Duplicate names for different numbers are
+ permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
+ number are always OK. (An existing number can be re-used if (?|
+ appears in the pattern.) In either event, a duplicate name results in
+ a duplicate entry in the table, even if the number is the same. This
+ is because the number of names, and hence the table size, is computed
+ in the pre-compile, and it affects various numbers and pointers which
+ would all have to be modified, and the compiled code moved down, if
+ duplicates with the same number were omitted from the table. This
+ doesn't seem worth the hassle. However, *different* names for the
+ same number are not permitted. */
else
{
+ BOOL dupname = FALSE;
slot = cd->name_table;
+
for (i = 0; i < cd->names_found; i++)
{
int crc = memcmp(name, slot+2, namelen);
@@ -4807,33 +4901,66 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
{
if (slot[2+namelen] == 0)
{
- if ((options & PCRE_DUPNAMES) == 0)
+ if (GET2(slot, 0) != cd->bracount + 1 &&
+ (options & PCRE_DUPNAMES) == 0)
{
*errorcodeptr = ERR43;
goto FAILED;
}
+ else dupname = TRUE;
}
- else crc = -1; /* Current name is substring */
+ else crc = -1; /* Current name is a substring */
}
+
+ /* Make space in the table and break the loop for an earlier
+ name. For a duplicate or later name, carry on. We do this for
+ duplicates so that in the simple case (when ?(| is not used) they
+ are in order of their numbers. */
+
if (crc < 0)
{
memmove(slot + cd->name_entry_size, slot,
(cd->names_found - i) * cd->name_entry_size);
break;
}
+
+ /* Continue the loop for a later or duplicate name */
+
slot += cd->name_entry_size;
}
+ /* For non-duplicate names, check for a duplicate number before
+ adding the new name. */
+
+ if (!dupname)
+ {
+ uschar *cslot = cd->name_table;
+ for (i = 0; i < cd->names_found; i++)
+ {
+ if (cslot != slot)
+ {
+ if (GET2(cslot, 0) == cd->bracount + 1)
+ {
+ *errorcodeptr = ERR65;
+ goto FAILED;
+ }
+ }
+ else i--;
+ cslot += cd->name_entry_size;
+ }
+ }
+
PUT2(slot, 0, cd->bracount + 1);
memcpy(slot + 2, name, namelen);
slot[2+namelen] = 0;
}
}
- /* In both cases, count the number of names we've encountered. */
+ /* In both pre-compile and compile, count the number of names we've
+ encountered. */
- ptr++; /* Move past > or ' */
cd->names_found++;
+ ptr++; /* Move past > or ' */
goto NUMBERED_GROUP;
@@ -5002,7 +5129,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (lengthptr == NULL)
{
*code = OP_END;
- if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
+ if (recno != 0)
+ called = _pcre_find_bracket(cd->start_code, utf8, recno);
/* Forward reference */
@@ -5646,6 +5774,8 @@ uschar *code = *codeptr;
uschar *last_branch = code;
uschar *start_bracket = code;
uschar *reverse_count = NULL;
+open_capitem capitem;
+int capnumber = 0;
int firstbyte, reqbyte;
int branchfirstbyte, branchreqbyte;
int length;
@@ -5672,6 +5802,17 @@ the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */
+/* If this is a capturing subpattern, add to the chain of open capturing items
+so that we can detect them if (*ACCEPT) is encountered. */
+
+if (*code == OP_CBRA)
+ {
+ capnumber = GET2(code, 1 + LINK_SIZE);
+ capitem.number = capnumber;
+ capitem.next = cd->open_caps;
+ cd->open_caps = &capitem;
+ }
+
/* Offset is set zero to mark that this bracket is still open */
PUT(code, 1, 0);
@@ -5766,21 +5907,29 @@ for (;;)
/* If lookbehind, check that this branch matches a fixed-length string, and
put the length into the OP_REVERSE item. Temporarily mark the end of the
- branch with OP_END. */
+ branch with OP_END. If the branch contains OP_RECURSE, the result is -3
+ because there may be forward references that we can't check here. Set a
+ flag to cause another lookbehind check at the end. Why not do it all at the
+ end? Because common, erroneous checks are picked up here and the offset of
+ the problem can be shown. */
if (lookbehind)
{
int fixed_length;
*code = OP_END;
- fixed_length = find_fixedlength(last_branch, options);
+ fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
DPRINTF(("fixed length = %d\n", fixed_length));
- if (fixed_length < 0)
+ if (fixed_length == -3)
+ {
+ cd->check_lookbehind = TRUE;
+ }
+ else if (fixed_length < 0)
{
*errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
*ptrptr = ptr;
return FALSE;
}
- PUT(reverse_count, 0, fixed_length);
+ else { PUT(reverse_count, 0, fixed_length); }
}
}
@@ -5808,6 +5957,10 @@ for (;;)
while (branch_length > 0);
}
+ /* If it was a capturing subpattern, remove it from the chain. */
+
+ if (capnumber > 0) cd->open_caps = cd->open_caps->next;
+
/* Fill in the ket */
*code = OP_KET;
@@ -6010,7 +6163,9 @@ do {
switch (*scode)
{
case OP_CREF:
+ case OP_NCREF:
case OP_RREF:
+ case OP_NRREF:
case OP_DEF:
return FALSE;
@@ -6179,9 +6334,7 @@ int length = 1; /* For final END opcode */
int firstbyte, reqbyte, newline;
int errorcode = 0;
int skipatstart = 0;
-#ifdef SUPPORT_UTF8
-BOOL utf8;
-#endif
+BOOL utf8 = (options & PCRE_UTF8) != 0;
size_t size;
uschar *code;
const uschar *codestart;
@@ -6278,7 +6431,6 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
#ifdef SUPPORT_UTF8
-utf8 = (options & PCRE_UTF8) != 0;
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
{
@@ -6286,7 +6438,7 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
goto PCRE_EARLY_ERROR_RETURN2;
}
#else
-if ((options & PCRE_UTF8) != 0)
+if (utf8)
{
errorcode = ERR32;
goto PCRE_EARLY_ERROR_RETURN;
@@ -6375,6 +6527,7 @@ cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
cd->req_varyopt = 0;
cd->external_options = options;
cd->external_flags = 0;
+cd->open_caps = NULL;
/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
don't need to look at the result of the function here. The initial options have
@@ -6449,6 +6602,8 @@ cd->start_code = codestart;
cd->hwm = cworkspace;
cd->req_varyopt = 0;
cd->had_accept = FALSE;
+cd->check_lookbehind = FALSE;
+cd->open_caps = NULL;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, errorcode will be set non-zero, so we don't need to look at the result
@@ -6487,7 +6642,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)
cd->hwm -= LINK_SIZE;
offset = GET(cd->hwm, 0);
recno = GET(codestart, offset);
- groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
+ groupptr = _pcre_find_bracket(codestart, utf8, recno);
if (groupptr == NULL) errorcode = ERR53;
else PUT(((uschar *)codestart), offset, groupptr - codestart);
}
@@ -6497,6 +6652,47 @@ subpattern. */
if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
+/* If there were any lookbehind assertions that contained OP_RECURSE
+(recursions or subroutine calls), a flag is set for them to be checked here,
+because they may contain forward references. Actual recursions can't be fixed
+length, but subroutine calls can. It is done like this so that those without
+OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
+exceptional ones forgo this. We scan the pattern to check that they are fixed
+length, and set their lengths. */
+
+if (cd->check_lookbehind)
+ {
+ uschar *cc = (uschar *)codestart;
+
+ /* Loop, searching for OP_REVERSE items, and process those that do not have
+ their length set. (Actually, it will also re-process any that have a length
+ of zero, but that is a pathological case, and it does no harm.) When we find
+ one, we temporarily terminate the branch it is in while we scan it. */
+
+ for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
+ cc != NULL;
+ cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
+ {
+ if (GET(cc, 1) == 0)
+ {
+ int fixed_length;
+ uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
+ int end_op = *be;
+ *be = OP_END;
+ fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
+ *be = end_op;
+ DPRINTF(("fixed length = %d\n", fixed_length));
+ if (fixed_length < 0)
+ {
+ errorcode = (fixed_length == -2)? ERR36 : ERR25;
+ break;
+ }
+ PUT(cc, 1, fixed_length);
+ }
+ cc += 1 + LINK_SIZE;
+ }
+ }
+
/* Failed to compile, or error while post-processing */
if (errorcode != 0)