summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_compile.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r--ext/pcre/pcrelib/pcre_compile.c158
1 files changed, 127 insertions, 31 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c
index 53027e603d..b0d81ac94c 100644
--- a/ext/pcre/pcrelib/pcre_compile.c
+++ b/ext/pcre/pcrelib/pcre_compile.c
@@ -406,6 +406,7 @@ static const char error_texts[] =
"different names for subpatterns of the same number are not allowed\0"
"(*MARK) must have an argument\0"
"this version of PCRE is not compiled with PCRE_UCP support\0"
+ "\\c must be followed by an ASCII character\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -839,7 +840,8 @@ else
break;
/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
- This coding is ASCII-specific, but then the whole concept of \cx is
+ An error is given if the byte following \c is not an ASCII character. This
+ coding is ASCII-specific, but then the whole concept of \cx is
ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
case CHAR_c:
@@ -849,11 +851,15 @@ else
*errorcodeptr = ERR2;
break;
}
-
-#ifndef EBCDIC /* ASCII/UTF-8 coding */
+#ifndef EBCDIC /* ASCII/UTF-8 coding */
+ if (c > 127) /* Excludes all non-ASCII in either mode */
+ {
+ *errorcodeptr = ERR68;
+ break;
+ }
if (c >= CHAR_a && c <= CHAR_z) c -= 32;
c ^= 0x40;
-#else /* EBCDIC coding */
+#else /* EBCDIC coding */
if (c >= CHAR_a && c <= CHAR_z) c += 64;
c ^= 0xC0;
#endif
@@ -1097,10 +1103,21 @@ top-level call starts at the beginning of the pattern. All other calls must
start at a parenthesis. It scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. We know that if (?P< is
-encountered, the name will be terminated by '>' because that is checked in the
-first pass. Recursion is used to keep track of subpatterns that reset the
-capturing group numbers - the (?| feature.
+returns when it reaches a given numbered subpattern. Recursion is used to keep
+track of subpatterns that reset the capturing group numbers - the (?| feature.
+
+This function was originally called only from the second pass, in which we know
+that if (?< or (?' or (?P< is encountered, the name will be correctly
+terminated because that is checked in the first pass. There is now one call to
+this function in the first pass, to check for a recursive back reference by
+name (so that we can make the whole group atomic). In this case, we need to check
+only up to the current position in the pattern, and that is still OK because
+any previous occurrences will have been checked. To make this work, the test
+for "end of pattern" is a check against cd->end_pattern in the main loop,
+instead of looking for a binary zero. This means that the special first-pass
+call can adjust cd->end_pattern temporarily. (Checks for binary zero while
+processing items within the loop are OK, because afterwards the main loop will
+terminate.)
Arguments:
ptrptr address of the current character pointer (updated)
@@ -1108,6 +1125,7 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
count pointer to the current capturing subpattern number (updated)
Returns: the number of the named subpattern, or -1 if not found
@@ -1115,7 +1133,7 @@ Returns: the number of the named subpattern, or -1 if not found
static int
find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
+ BOOL xmode, BOOL utf8, int *count)
{
uschar *ptr = *ptrptr;
int start_count = *count;
@@ -1200,9 +1218,11 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
}
/* Past any initial parenthesis handling, scan for parentheses or vertical
-bars. */
+bars. Stop if we get to cd->end_pattern. Note that this is important for the
+first-pass call when this value is temporarily adjusted to stop at the current
+position. So DO NOT change this to a test for binary zero. */
-for (; *ptr != 0; ptr++)
+for (; ptr < cd->end_pattern; ptr++)
{
/* Skip over backslashed characters and also entire \Q...\E */
@@ -1276,7 +1296,15 @@ for (; *ptr != 0; ptr++)
if (xmode && *ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+ ptr++;
+ while (*ptr != 0)
+ {
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
if (*ptr == 0) goto FAIL_EXIT;
continue;
}
@@ -1285,7 +1313,7 @@ for (; *ptr != 0; ptr++)
if (*ptr == CHAR_LEFT_PARENTHESIS)
{
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+ int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
if (rc > 0) return rc;
if (*ptr == 0) goto FAIL_EXIT;
}
@@ -1331,12 +1359,14 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
Returns: the number of the found subpattern, or -1 if not found
*/
static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+ BOOL utf8)
{
uschar *ptr = (uschar *)cd->start_pattern;
int count = 0;
@@ -1349,7 +1379,7 @@ matching closing parens. That is why we have to have a loop. */
for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
if (rc > 0 || *ptr++ == 0) break;
}
@@ -1722,9 +1752,12 @@ for (;;)
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
- case OP_THEN_ARG:
code += code[1];
break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
}
/* Add in the fixed length from the table */
@@ -1825,9 +1858,12 @@ for (;;)
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
- case OP_THEN_ARG:
code += code[1];
break;
+
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
}
/* Add in the fixed length from the table */
@@ -2103,10 +2139,13 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
- case OP_THEN_ARG:
code += code[1];
break;
+ case OP_THEN_ARG:
+ code += code[1+LINK_SIZE];
+ break;
+
/* None of the remaining opcodes are required to match a character. */
default:
@@ -2504,8 +2543,15 @@ if ((options & PCRE_EXTENDED) != 0)
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -2541,8 +2587,15 @@ if ((options & PCRE_EXTENDED) != 0)
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -3115,9 +3168,14 @@ for (;; ptr++)
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
}
if (*ptr != 0) continue;
@@ -3492,9 +3550,14 @@ for (;; ptr++)
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
continue;
+ /* Perl 5.004 onwards omits VT from \s, but we must preserve it
+ if it was previously set by something earlier in the character
+ class. */
+
case ESC_s:
- for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
- classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
+ classbits[0] |= cbits[cbit_space];
+ classbits[1] |= cbits[cbit_space+1] & ~0x08;
+ for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
continue;
case ESC_S:
@@ -4806,7 +4869,12 @@ for (;; ptr++)
*errorcodeptr = ERR66;
goto FAILED;
}
- *code++ = verbs[i].op;
+ *code = verbs[i].op;
+ if (*code++ == OP_THEN)
+ {
+ PUT(code, 0, code - bcptr->current_branch - 1);
+ code += LINK_SIZE;
+ }
}
else
@@ -4816,7 +4884,12 @@ for (;; ptr++)
*errorcodeptr = ERR59;
goto FAILED;
}
- *code++ = verbs[i].op_arg;
+ *code = verbs[i].op_arg;
+ if (*code++ == OP_THEN_ARG)
+ {
+ PUT(code, 0, code - bcptr->current_branch - 1);
+ code += LINK_SIZE;
+ }
*code++ = arglen;
memcpy(code, arg, arglen);
code += arglen;
@@ -5010,7 +5083,7 @@ for (;; ptr++)
/* Search the pattern for a forward reference */
else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) > 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
code[1+LINK_SIZE]++;
@@ -5311,11 +5384,17 @@ for (;; ptr++)
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
namelen = (int)(ptr - name);
- /* In the pre-compile phase, do a syntax check and set a dummy
- reference number. */
+ /* In the pre-compile phase, do a syntax check. We used to just set
+ a dummy reference number, because it was not used in the first pass.
+ However, with the change of recursive back references to be atomic,
+ we have to look for the number so that this state can be identified, as
+ otherwise the incorrect length is computed. If it's not a backwards
+ reference, the dummy number will do. */
if (lengthptr != NULL)
{
+ const uschar *temp;
+
if (namelen == 0)
{
*errorcodeptr = ERR62;
@@ -5331,7 +5410,22 @@ for (;; ptr++)
*errorcodeptr = ERR48;
goto FAILED;
}
- recno = 0;
+
+ /* The name table does not exist in the first pass, so we cannot
+ do a simple search as in the code below. Instead, we have to scan the
+ pattern to find the number. It is important that we scan it only as
+ far as we have got because the syntax of named subpatterns has not
+ been checked for the rest of the pattern, and find_parens() assumes
+ correct syntax. In any case, it's a waste of resources to scan
+ further. We stop the scan at the current point by temporarily
+ adjusting the value of cd->end_pattern. */
+
+ temp = cd->end_pattern;
+ cd->end_pattern = ptr;
+ recno = find_parens(cd, name, namelen,
+ (options & PCRE_EXTENDED) != 0, utf8);
+ cd->end_pattern = temp;
+ if (recno < 0) recno = 0; /* Forward ref; set dummy number */
}
/* In the real compile, seek the name in the table. We check the name
@@ -5356,7 +5450,7 @@ for (;; ptr++)
}
else if ((recno = /* Forward back reference */
find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) <= 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -5467,7 +5561,7 @@ for (;; ptr++)
if (called == NULL)
{
if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0) < 0)
+ (options & PCRE_EXTENDED) != 0, utf8) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -6797,6 +6891,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
{ skipatstart += 7; options |= PCRE_UTF8; continue; }
else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
{ skipatstart += 6; options |= PCRE_UCP; continue; }
+ else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
+ { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }