diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_compile.c | 492 |
1 files changed, 319 insertions, 173 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c index 77ec51cace..ef33cfc388 100644 --- a/ext/pcre/pcrelib/pcre_compile.c +++ b/ext/pcre/pcrelib/pcre_compile.c @@ -42,7 +42,7 @@ POSSIBILITY OF SUCH DAMAGE. supporting internal functions that are not used by other modules. */ -#include <config.h> +#include "config.h" #define NLBLOCK cd /* Block containing newline information */ #define PSSTART start_pattern /* Field containing processed string start */ @@ -138,35 +138,47 @@ static const short int escapes[] = { #endif -/* Table of special "verbs" like (*PRUNE) */ +/* Table of special "verbs" like (*PRUNE). This is a short table, so it is +searched linearly. Put all the names into a single string, in order to reduce +the number of relocations when a shared library is dynamically linked. */ typedef struct verbitem { - const char *name; int len; int op; } verbitem; +static const char verbnames[] = + "ACCEPT\0" + "COMMIT\0" + "F\0" + "FAIL\0" + "PRUNE\0" + "SKIP\0" + "THEN"; + static verbitem verbs[] = { - { "ACCEPT", 6, OP_ACCEPT }, - { "COMMIT", 6, OP_COMMIT }, - { "F", 1, OP_FAIL }, - { "FAIL", 4, OP_FAIL }, - { "PRUNE", 5, OP_PRUNE }, - { "SKIP", 4, OP_SKIP }, - { "THEN", 4, OP_THEN } + { 6, OP_ACCEPT }, + { 6, OP_COMMIT }, + { 1, OP_FAIL }, + { 4, OP_FAIL }, + { 5, OP_PRUNE }, + { 4, OP_SKIP }, + { 4, OP_THEN } }; static int verbcount = sizeof(verbs)/sizeof(verbitem); -/* Tables of names of POSIX character classes and their lengths. The list is -terminated by a zero length entry. The first three must be alpha, lower, upper, -as this is assumed for handling case independence. */ +/* Tables of names of POSIX character classes and their lengths. The names are +now all in a single string, to reduce the number of relocations when a shared +library is dynamically loaded. The list of lengths is terminated by a zero +length entry. The first three must be alpha, lower, upper, as this is assumed +for handling case independence. */ -static const char *const posix_names[] = { - "alpha", "lower", "upper", - "alnum", "ascii", "blank", "cntrl", "digit", "graph", - "print", "punct", "space", "word", "xdigit" }; +static const char posix_names[] = + "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" + "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" + "word\0" "xdigit"; static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; @@ -205,84 +217,90 @@ static const int posix_class_maps[] = { /* The texts of compile-time error messages. These are "char *" because they are passed to the outside world. Do not ever re-use any error number, because they are documented. Always add a new error instead. Messages marked DEAD below -are no longer used. */ - -static const char *error_texts[] = { - "no error", - "\\ at end of pattern", - "\\c at end of pattern", - "unrecognized character follows \\", - "numbers out of order in {} quantifier", +are no longer used. This used to be a table of strings, but in order to reduce +the number of relocations needed when a shared library is loaded dynamically, +it is now one long string. We cannot use a table of offsets, because the +lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we +simply count through to the one we want - this isn't a performance issue +because these strings are used only when there is a compilation error. */ + +static const char error_texts[] = + "no error\0" + "\\ at end of pattern\0" + "\\c at end of pattern\0" + "unrecognized character follows \\\0" + "numbers out of order in {} quantifier\0" /* 5 */ - "number too big in {} quantifier", - "missing terminating ] for character class", - "invalid escape sequence in character class", - "range out of order in character class", - "nothing to repeat", + "number too big in {} quantifier\0" + "missing terminating ] for character class\0" + "invalid escape sequence in character class\0" + "range out of order in character class\0" + "nothing to repeat\0" /* 10 */ - "operand of unlimited repeat could match the empty string", /** DEAD **/ - "internal error: unexpected repeat", - "unrecognized character after (?", - "POSIX named classes are supported only within a class", - "missing )", + "operand of unlimited repeat could match the empty string\0" /** DEAD **/ + "internal error: unexpected repeat\0" + "unrecognized character after (? or (?-\0" + "POSIX named classes are supported only within a class\0" + "missing )\0" /* 15 */ - "reference to non-existent subpattern", - "erroffset passed as NULL", - "unknown option bit(s) set", - "missing ) after comment", - "parentheses nested too deeply", /** DEAD **/ + "reference to non-existent subpattern\0" + "erroffset passed as NULL\0" + "unknown option bit(s) set\0" + "missing ) after comment\0" + "parentheses nested too deeply\0" /** DEAD **/ /* 20 */ - "regular expression is too large", - "failed to get memory", - "unmatched parentheses", - "internal error: code overflow", - "unrecognized character after (?<", + "regular expression is too large\0" + "failed to get memory\0" + "unmatched parentheses\0" + "internal error: code overflow\0" + "unrecognized character after (?<\0" /* 25 */ - "lookbehind assertion is not fixed length", - "malformed number or name after (?(", - "conditional group contains more than two branches", - "assertion expected after (?(", - "(?R or (?[+-]digits must be followed by )", + "lookbehind assertion is not fixed length\0" + "malformed number or name after (?(\0" + "conditional group contains more than two branches\0" + "assertion expected after (?(\0" + "(?R or (?[+-]digits must be followed by )\0" /* 30 */ - "unknown POSIX class name", - "POSIX collating elements are not supported", - "this version of PCRE is not compiled with PCRE_UTF8 support", - "spare error", /** DEAD **/ - "character value in \\x{...} sequence is too large", + "unknown POSIX class name\0" + "POSIX collating elements are not supported\0" + "this version of PCRE is not compiled with PCRE_UTF8 support\0" + "spare error\0" /** DEAD **/ + "character value in \\x{...} sequence is too large\0" /* 35 */ - "invalid condition (?(0)", - "\\C not allowed in lookbehind assertion", - "PCRE does not support \\L, \\l, \\N, \\U, or \\u", - "number after (?C is > 255", - "closing ) for (?C expected", + "invalid condition (?(0)\0" + "\\C not allowed in lookbehind assertion\0" + "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" + "number after (?C is > 255\0" + "closing ) for (?C expected\0" /* 40 */ - "recursive call could loop indefinitely", - "unrecognized character after (?P", - "syntax error in subpattern name (missing terminator)", - "two named subpatterns have the same name", - "invalid UTF-8 string", + "recursive call could loop indefinitely\0" + "unrecognized character after (?P\0" + "syntax error in subpattern name (missing terminator)\0" + "two named subpatterns have the same name\0" + "invalid UTF-8 string\0" /* 45 */ - "support for \\P, \\p, and \\X has not been compiled", - "malformed \\P or \\p sequence", - "unknown property name after \\P or \\p", - "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", - "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", + "support for \\P, \\p, and \\X has not been compiled\0" + "malformed \\P or \\p sequence\0" + "unknown property name after \\P or \\p\0" + "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" + "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ - "repeated subpattern is too long", /** DEAD **/ - "octal value is greater than \\377 (not in UTF-8 mode)", - "internal error: overran compiling workspace", - "internal error: previously-checked referenced subpattern not found", - "DEFINE group contains more than one branch", + "repeated subpattern is too long\0" /** DEAD **/ + "octal value is greater than \\377 (not in UTF-8 mode)\0" + "internal error: overran compiling workspace\0" + "internal error: previously-checked referenced subpattern not found\0" + "DEFINE group contains more than one branch\0" /* 55 */ - "repeating a DEFINE group is not allowed", - "inconsistent NEWLINE options", - "\\g is not followed by a braced name or an optionally braced non-zero number", - "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number", - "(*VERB) with an argument is not supported", + "repeating a DEFINE group is not allowed\0" + "inconsistent NEWLINE options\0" + "\\g is not followed by a braced name or an optionally braced non-zero number\0" + "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0" + "(*VERB) with an argument is not supported\0" /* 60 */ - "(*VERB) not recognized", - "number is too big" -}; + "(*VERB) not recognized\0" + "number is too big\0" + "subpattern name expected\0" + "digit expected after (?+"; /* Table to identify digits and hex digits. This is used when compiling @@ -418,6 +436,28 @@ static BOOL /************************************************* +* Find an error text * +*************************************************/ + +/* The error texts are now all in one long string, to save on relocations. As +some of the text is of unknown length, we can't use a table of offsets. +Instead, just count through the strings. This is not a performance issue +because it happens only when there has been a compilation error. + +Argument: the error number +Returns: pointer to the error string +*/ + +static const char * +find_error_text(int n) +{ +const char *s = error_texts; +for (; n > 0; n--) while (*s++ != 0); +return s; +} + + +/************************************************* * Handle escapes * *************************************************/ @@ -456,16 +496,16 @@ ptr--; /* Set pointer back to the last byte */ if (c == 0) *errorcodeptr = ERR1; -/* Non-alphamerics are literals. For digits or letters, do an initial lookup in -a table. A non-zero result is something that can be returned immediately. +/* Non-alphanumerics are literals. For digits or letters, do an initial lookup +in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. */ #ifndef EBCDIC /* ASCII coding */ -else if (c < '0' || c > 'z') {} /* Not alphameric */ +else if (c < '0' || c > 'z') {} /* Not alphanumeric */ else if ((i = escapes[c - '0']) != 0) c = i; #else /* EBCDIC coding */ -else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */ +else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ else if ((i = escapes[c - 0x48]) != 0) c = i; #endif @@ -682,10 +722,10 @@ else break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any - other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, - for Perl compatibility, it is a literal. This code looks a bit odd, but - there used to be some cases other than the default, and there may be again - in future, so I haven't "optimized" it. */ + other alphanumeric following \ is an error if PCRE_EXTRA was set; + otherwise, for Perl compatibility, it is a literal. This code looks a bit + odd, but there used to be some cases other than the default, and there may + be again in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) @@ -774,7 +814,7 @@ top = _pcre_utt_size; while (bot < top) { i = (bot + top) >> 1; - c = strcmp(name, _pcre_utt[i].name); + c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); if (c == 0) { *dptr = _pcre_utt[i].value; @@ -1466,8 +1506,9 @@ for (;;) can match the empty string or not. It is called from could_be_empty() below and from compile_branch() when checking for an unlimited repeat of a group that can match nothing. Note that first_significant_code() skips over -assertions. If we hit an unclosed bracket, we return "empty" - this means we've -struck an inner bracket whose current branch will already have been scanned. +backward and negative forward assertions when its final argument is TRUE. If we +hit an unclosed bracket, we return "empty" - this means we've struck an inner +bracket whose current branch will already have been scanned. Arguments: code points to start of search @@ -1489,6 +1530,16 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE c = *code; + /* Skip over forward assertions; the other assertions are skipped by + first_significant_code() with a TRUE final argument. */ + + if (c == OP_ASSERT) + { + do code += GET(code, 1); while (*code == OP_ALT); + c = *code; + continue; + } + /* Groups with zero repeats can of course be empty; skip them. */ if (c == OP_BRAZERO || c == OP_BRAMINZERO) @@ -1684,29 +1735,48 @@ return TRUE; *************************************************/ /* This function is called when the sequence "[:" or "[." or "[=" is -encountered in a character class. It checks whether this is followed by an -optional ^ and then a sequence of letters, terminated by a matching ":]" or -".]" or "=]". +encountered in a character class. It checks whether this is followed by a +sequence of characters terminated by a matching ":]" or ".]" or "=]". If we +reach an unescaped ']' without the special preceding character, return FALSE. + +Originally, this function only recognized a sequence of letters between the +terminators, but it seems that Perl recognizes any sequence of characters, +though of course unknown POSIX names are subsequently rejected. Perl gives an +"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE +didn't consider this to be a POSIX class. Likewise for [:1234:]. + +The problem in trying to be exactly like Perl is in the handling of escapes. We +have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX +class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +below handles the special case of \], but does not try to do any other escape +processing. This makes it different from Perl for cases such as [:l\ower:] +where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize +"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does, +I think. -Argument: +Arguments: ptr pointer to the initial [ endptr where to return the end pointer - cd pointer to compile data Returns: TRUE or FALSE */ static BOOL -check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) +check_posix_syntax(const uschar *ptr, const uschar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ -if (*(++ptr) == '^') ptr++; -while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; -if (*ptr == terminator && ptr[1] == ']') +for (++ptr; *ptr != 0; ptr++) { - *endptr = ptr; - return TRUE; + if (*ptr == '\\' && ptr[1] == ']') ptr++; else + { + if (*ptr == ']') return FALSE; + if (*ptr == terminator && ptr[1] == ']') + { + *endptr = ptr; + return TRUE; + } + } } return FALSE; } @@ -1731,11 +1801,13 @@ Returns: a value representing the name, or -1 if unknown static int check_posix_name(const uschar *ptr, int len) { +const char *pn = posix_names; register int yield = 0; while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && - strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield; + strncmp((const char *)ptr, pn, len) == 0) return yield; + pn += posix_name_lengths[yield] + 1; yield++; } return -1; @@ -2341,6 +2413,7 @@ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; for (;; ptr++) { BOOL negate_class; + BOOL should_flip_negation; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; @@ -2564,7 +2637,7 @@ for (;; ptr++) they are encountered at the top level, so we'll do that too. */ if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + check_posix_syntax(ptr, &tempptr)) { *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; goto FAILED; @@ -2589,6 +2662,12 @@ for (;; ptr++) else break; } + /* If a class contains a negative special such as \S, we need to flip the + negation flag at the end, so that support for characters > 255 works + correctly (they are all included in the class). */ + + should_flip_negation = FALSE; + /* Keep a count of chars with values < 256 so that we can optimize the case of just a single character (as long as it's < 256). However, For higher valued UTF-8 characters, we don't yet do any optimization. */ @@ -2644,7 +2723,7 @@ for (;; ptr++) if (c == '[' && (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; @@ -2661,6 +2740,7 @@ for (;; ptr++) if (*ptr == '^') { local_negate = TRUE; + should_flip_negation = TRUE; /* Note negative special */ ptr++; } @@ -2735,7 +2815,7 @@ for (;; ptr++) c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ + if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ @@ -2763,6 +2843,7 @@ for (;; ptr++) continue; case ESC_D: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; continue; @@ -2771,6 +2852,7 @@ for (;; ptr++) continue; case ESC_W: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; continue; @@ -2780,13 +2862,11 @@ for (;; ptr++) continue; case ESC_S: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; - case ESC_E: /* Perl ignores an orphan \E */ - continue; - default: /* Not recognized; fall through */ break; /* Need "default" setting to stop compiler warning. */ } @@ -2974,6 +3054,12 @@ for (;; ptr++) oldptr = ptr; + /* Remember \r or \n */ + + if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; + + /* Check for range */ + if (!inescq && ptr[1] == '-') { int d; @@ -3015,7 +3101,7 @@ for (;; ptr++) d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - /* \b is backslash; \X is literal X; \R is literal R; any other + /* \b is backspace; \X is literal X; \R is literal R; any other special means the '-' was literal */ if (d < 0) @@ -3041,6 +3127,10 @@ for (;; ptr++) if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ + /* Remember \r or \n */ + + if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; + /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless matching for characters > 127 is available only if UCP support is @@ -3194,16 +3284,24 @@ for (;; ptr++) goto FAILED; } + +/* This code has been disabled because it would mean that \s counts as +an explicit \r or \n reference, and that's not really what is wanted. Now +we set the flag only if there is a literal "\r" or "\n" in the class. */ + +#if 0 /* Remember whether \r or \n are in this class */ if (negate_class) { - if ((classbits[1] & 0x24) != 0x24) cd->external_options |= PCRE_HASCRORLF; + if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; } else { - if ((classbits[1] & 0x24) != 0) cd->external_options |= PCRE_HASCRORLF; + if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; } +#endif + /* If class_charcount is 1, we saw precisely one character whose value is less than 256. As long as there were no characters >= 128 and there was no @@ -3267,11 +3365,14 @@ for (;; ptr++) zeroreqbyte = reqbyte; /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode. If there are no characters < 256, - we can omit the bitmap in the actual compiled code. */ + extended class, with its own opcode, unless there was a negated special + such as \S in the class, because in that case all characters > 255 are in + the class, so any that were explicitly given as well can be ignored. If + (when there are explicit characters > 255 that must be listed) there are no + characters < 256, we can omit the bitmap in the actual compiled code. */ #ifdef SUPPORT_UTF8 - if (class_utf8) + if (class_utf8 && !should_flip_negation) { *class_utf8data++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; @@ -3297,20 +3398,19 @@ for (;; ptr++) } #endif - /* If there are no characters > 255, negate the 32-byte map if necessary, - and copy it into the code vector. If this is the first thing in the branch, - there can be no first char setting, whatever the repeat count. Any reqbyte - setting must remain unchanged after any kind of repeat. */ + /* If there are no characters > 255, set the opcode to OP_CLASS or + OP_NCLASS, depending on whether the whole class was negated and whether + there were negative specials such as \S in the class. Then copy the 32-byte + map into the code vector, negating it if necessary. */ + *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) { - *code++ = OP_NCLASS; if (lengthptr == NULL) /* Save time in the pre-compile phase */ for (c = 0; c < 32; c++) code[c] = ~classbits[c]; } else { - *code++ = OP_CLASS; memcpy(code, classbits, 32); } code += 32; @@ -3496,7 +3596,7 @@ for (;; ptr++) /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ - if (repeat_max != 1) cd->nopartial = TRUE; + if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; /* Combine the op_type with the repeat_type */ @@ -3646,7 +3746,7 @@ for (;; ptr++) /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ - if (repeat_max != 1) cd->nopartial = TRUE; + if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; @@ -3946,7 +4046,9 @@ for (;; ptr++) int len; if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || *tempcode == OP_NOTEXACT) - tempcode += _pcre_OP_lengths[*tempcode]; + tempcode += _pcre_OP_lengths[*tempcode] + + ((*tempcode == OP_TYPEEXACT && + (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0); len = code - tempcode; if (len > 0) switch (*tempcode) { @@ -4004,6 +4106,7 @@ for (;; ptr++) if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) { int i, namelen; + const char *vn = verbnames; const uschar *name = ++ptr; previous = NULL; while ((cd->ctypes[*++ptr] & ctype_letter) != 0); @@ -4021,12 +4124,13 @@ for (;; ptr++) for (i = 0; i < verbcount; i++) { if (namelen == verbs[i].len && - strncmp((char *)name, verbs[i].name, namelen) == 0) + strncmp((char *)name, vn, namelen) == 0) { *code = verbs[i].op; if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; break; } + vn += verbs[i].len + 1; } if (i < verbcount) continue; *errorcodeptr = ERR60; @@ -4171,16 +4275,13 @@ for (;; ptr++) *errorcodeptr = ERR58; goto FAILED; } - if (refsign == '-') + recno = (refsign == '-')? + cd->bracount - recno + 1 : recno +cd->bracount; + if (recno <= 0 || recno > cd->final_bracount) { - recno = cd->bracount - recno + 1; - if (recno <= 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } + *errorcodeptr = ERR15; + goto FAILED; } - else recno += cd->bracount; PUT2(code, 2+LINK_SIZE, recno); break; } @@ -4252,9 +4353,10 @@ for (;; ptr++) skipbytes = 1; } - /* Check for the "name" actually being a subpattern number. */ + /* Check for the "name" actually being a subpattern number. We are + in the second pass here, so final_bracount is set. */ - else if (recno > 0) + else if (recno > 0 && recno <= cd->final_bracount) { PUT2(code, 2+LINK_SIZE, recno); } @@ -4448,7 +4550,9 @@ for (;; ptr++) /* We come here from the Python syntax above that handles both references (?P=name) and recursion (?P>name), as well as falling - through from the Perl recursion syntax (?&name). */ + through from the Perl recursion syntax (?&name). We also come here from + the Perl \k<name> or \k'name' back reference syntax and the \k{name} + .NET syntax. */ NAMED_REF_OR_RECURSE: name = ++ptr; @@ -4460,6 +4564,11 @@ for (;; ptr++) if (lengthptr != NULL) { + if (namelen == 0) + { + *errorcodeptr = ERR62; + goto FAILED; + } if (*ptr != terminator) { *errorcodeptr = ERR42; @@ -4473,14 +4582,19 @@ for (;; ptr++) recno = 0; } - /* In the real compile, seek the name in the table */ + /* In the real compile, seek the name in the table. We check the name + first, and then check that we have reached the end of the name in the + table. That way, if the name that is longer than any in the table, + the comparison will fail without reading beyond the table entry. */ else { slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { - if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; + if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && + slot[2+namelen] == 0) + break; slot += cd->name_entry_size; } @@ -4517,7 +4631,15 @@ for (;; ptr++) { const uschar *called; - if ((refsign = *ptr) == '+') ptr++; + if ((refsign = *ptr) == '+') + { + ptr++; + if ((digitab[*ptr] & ctype_digit) == 0) + { + *errorcodeptr = ERR63; + goto FAILED; + } + } else if (refsign == '-') { if ((digitab[ptr[1]] & ctype_digit) == 0) @@ -4643,7 +4765,7 @@ for (;; ptr++) case 'J': /* Record that it changed in the external options */ *optset |= PCRE_DUPNAMES; - cd->external_options |= PCRE_JCHANGED; + cd->external_flags |= PCRE_JCHANGED; break; case 'i': *optset |= PCRE_CASELESS; break; @@ -5063,7 +5185,7 @@ for (;; ptr++) /* Remember if \r or \n were seen */ if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') - cd->external_options |= PCRE_HASCRORLF; + cd->external_flags |= PCRE_HASCRORLF; /* Set the first and required bytes appropriately. If no previous first byte, set it from this character, but revert to none on a zero repeat. @@ -5743,24 +5865,46 @@ cd->fcc = tables + fcc_offset; cd->cbits = tables + cbits_offset; cd->ctypes = tables + ctypes_offset; -/* Check for newline settings at the start of the pattern, and remember the -offset for later. */ +/* Check for global one-time settings at the start of the pattern, and remember +the offset for later. */ -if (ptr[0] == '(' && ptr[1] == '*') +while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') { int newnl = 0; - if (strncmp((char *)(ptr+2), "CR)", 3) == 0) - { skipatstart = 5; newnl = PCRE_NEWLINE_CR; } - else if (strncmp((char *)(ptr+2), "LF)", 3) == 0) - { skipatstart = 5; newnl = PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+2), "CRLF)", 5) == 0) - { skipatstart = 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+2), "ANY)", 4) == 0) - { skipatstart = 6; newnl = PCRE_NEWLINE_ANY; } - else if (strncmp((char *)(ptr+2), "ANYCRLF)", 8) == 0) - { skipatstart = 10; newnl = PCRE_NEWLINE_ANYCRLF; } - if (skipatstart > 0) + int newbsr = 0; + + if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0) + { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } + else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0) + { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } + else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0) + { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } + else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0) + { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } + else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) + { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } + + else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) + { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } + else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) + { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } + + if (newnl != 0) options = (options & ~PCRE_NEWLINE_BITS) | newnl; + else if (newbsr != 0) + options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr; + else break; + } + +/* Check validity of \R options. */ + +switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) + { + case 0: + case PCRE_BSR_ANYCRLF: + case PCRE_BSR_UNICODE: + break; + default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; } /* Handle different types of newline. The three bits give seven cases. The @@ -5822,7 +5966,7 @@ to compile parts of the pattern into; the compiled code is discarded when it is no longer needed, so hopefully this workspace will never overflow, though there is a test for its doing so. */ -cd->bracount = 0; +cd->bracount = cd->final_bracount = 0; cd->names_found = 0; cd->name_entry_size = 0; cd->name_table = NULL; @@ -5832,8 +5976,8 @@ cd->hwm = cworkspace; cd->start_pattern = (const uschar *)pattern; cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); cd->req_varyopt = 0; -cd->nopartial = FALSE; cd->external_options = options; +cd->external_flags = 0; /* Now do the pre-compile. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. The initial options have @@ -5872,14 +6016,16 @@ if (re == NULL) goto PCRE_EARLY_ERROR_RETURN; } -/* Put in the magic number, and save the sizes, initial options, and character -table pointer. NULL is used for the default character tables. The nullpad field -is at the end; it's there to help in the case when a regex compiled on a system -with 4-byte pointers is run on another with 8-byte pointers. */ +/* Put in the magic number, and save the sizes, initial options, internal +flags, and character table pointer. NULL is used for the default character +tables. The nullpad field is at the end; it's there to help in the case when a +regex compiled on a system with 4-byte pointers is run on another with 8-byte +pointers. */ re->magic_number = MAGIC_NUMBER; re->size = size; re->options = cd->external_options; +re->flags = cd->external_flags; re->dummy1 = 0; re->first_byte = 0; re->req_byte = 0; @@ -5897,6 +6043,7 @@ field. Reset the bracket count and the names_found field. Also reset the hwm field; this time it's used for remembering forward references to subpatterns. */ +cd->final_bracount = cd->bracount; /* Save for checking forward references */ cd->bracount = 0; cd->names_found = 0; cd->name_table = (uschar *)re + re->name_table_offset; @@ -5904,7 +6051,6 @@ codestart = cd->name_table + re->name_entry_size * re->name_count; cd->start_code = codestart; cd->hwm = cworkspace; cd->req_varyopt = 0; -cd->nopartial = FALSE; cd->had_accept = FALSE; /* Set up a starting, non-extracting bracket, then compile the expression. On @@ -5918,8 +6064,8 @@ code = (uschar *)codestart; &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; +re->flags = cd->external_flags; -if (cd->nopartial) re->options |= PCRE_NOPARTIAL; if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ /* If not reached end of pattern on success, there's an excess bracket. */ @@ -5962,7 +6108,7 @@ if (errorcode != 0) PCRE_EARLY_ERROR_RETURN: *erroroffset = ptr - (const uschar *)pattern; PCRE_EARLY_ERROR_RETURN2: - *errorptr = error_texts[errorcode]; + *errorptr = find_error_text(errorcode); if (errorcodeptr != NULL) *errorcodeptr = errorcode; return NULL; } @@ -5991,10 +6137,10 @@ if ((re->options & PCRE_ANCHORED) == 0) int ch = firstbyte & 255; re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch)? ch : firstbyte; - re->options |= PCRE_FIRSTSET; + re->flags |= PCRE_FIRSTSET; } else if (is_startline(codestart, 0, cd->backref_map)) - re->options |= PCRE_STARTLINE; + re->flags |= PCRE_STARTLINE; } } @@ -6008,7 +6154,7 @@ if (reqbyte >= 0 && int ch = reqbyte & 255; re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; - re->options |= PCRE_REQCHSET; + re->flags |= PCRE_REQCHSET; } /* Print out the compiled data if debugging is enabled. This is never the @@ -6021,7 +6167,7 @@ printf("Length = %d top_bracket = %d top_backref = %d\n", printf("Options=%08x\n", re->options); -if ((re->options & PCRE_FIRSTSET) != 0) +if ((re->flags & PCRE_FIRSTSET) != 0) { int ch = re->first_byte & 255; const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? @@ -6030,7 +6176,7 @@ if ((re->options & PCRE_FIRSTSET) != 0) else printf("First char = \\x%02x%s\n", ch, caseless); } -if ((re->options & PCRE_REQCHSET) != 0) +if ((re->flags & PCRE_REQCHSET) != 0) { int ch = re->req_byte & 255; const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? @@ -6047,7 +6193,7 @@ was compiled can be seen. */ if (code - codestart > length) { (pcre_free)(re); - *errorptr = error_texts[ERR23]; + *errorptr = find_error_text(ERR23); *erroroffset = ptr - (uschar *)pattern; if (errorcodeptr != NULL) *errorcodeptr = ERR23; return NULL; |