diff options
author | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2019-02-12 17:50:19 +0000 |
---|---|---|
committer | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2019-02-12 17:50:19 +0000 |
commit | 4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68 (patch) | |
tree | 2f92e9bdf9f05dbe278c16ae6162b5dd725f2749 /src | |
parent | 5a5285b1066d191d22eb858cbc9862b6e044ca9e (diff) | |
download | pcre2-4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68.tar.gz |
Implement PCRE2_EXTRA_ALT_BSUX to support ECMAscript 6's \u{hhh..} syntax.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1070 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src')
-rw-r--r-- | src/pcre2.h.in | 1 | ||||
-rw-r--r-- | src/pcre2_compile.c | 137 | ||||
-rw-r--r-- | src/pcre2_internal.h | 4 | ||||
-rw-r--r-- | src/pcre2_substitute.c | 6 | ||||
-rw-r--r-- | src/pcre2test.c | 4 |
5 files changed, 98 insertions, 54 deletions
diff --git a/src/pcre2.h.in b/src/pcre2.h.in index bd1b04f..9415d70 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -150,6 +150,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ #define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ #define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ +#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */ /* These are for pcre2_jit_compile(). */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index d72d9bd..1edcf1a 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -764,7 +764,7 @@ are allowed. */ #define PUBLIC_COMPILE_EXTRA_OPTIONS \ (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \ PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ - PCRE2_EXTRA_ESCAPED_CR_IS_LF) + PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX) /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and @@ -1459,7 +1459,8 @@ Returns: zero => a data character int PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, - int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb) + int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass, + compile_block *cb) { BOOL utf = (options & PCRE2_UTF) != 0; PCRE2_SPTR ptr = *ptrptr; @@ -1495,8 +1496,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0) if (i > 0) { c = (uint32_t)i; - if (cb != NULL && c == CHAR_CR && - (cb->cx->extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0) + if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0) c = CHAR_LF; } else /* Negative table entry */ @@ -1551,22 +1551,28 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0) /* Escapes that need further processing, including those that are unknown, have a zero entry in the lookup table. When called from pcre2_substitute(), only \c, -\o, and \x are recognized (and \u when BSUX is set). */ +\o, and \x are recognized (\u and \U can never appear as they are used for case +forcing). */ else { + int s; PCRE2_SPTR oldptr; BOOL overflow; - int s; + BOOL alt_bsux = + ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0; /* Filter calls from pcre2_substitute(). */ - if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x && - (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0)) + if (cb == NULL) { - *errorcodeptr = ERR3; - return 0; - } + if (c != CHAR_c && c != CHAR_o && c != CHAR_x) + { + *errorcodeptr = ERR3; + return 0; + } + alt_bsux = FALSE; /* Do not modify \x handling */ + } switch (c) { @@ -1579,40 +1585,74 @@ else *errorcodeptr = ERR37; break; - /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated - specially, \u must be followed by four hex digits. Otherwise it is a - lowercase u letter. */ + /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX + is set. Otherwise, \u must be followed by exactly four hex digits or, if + PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces. + Otherwise it is a lowercase u letter. This gives some compatibility with + ECMAScript (aka JavaScript). */ case CHAR_u: - if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else + if (!alt_bsux) *errorcodeptr = ERR37; else { uint32_t xc; - if (ptrend - ptr < 4) break; /* Less than 4 chars */ - if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ - if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; - if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; - if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ - c = (cc << 4) | xc; - ptr += 4; + + if (*ptr == CHAR_LEFT_CURLY_BRACKET && + (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0) + { + PCRE2_SPTR hptr = ptr + 1; + cc = 0; + + while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff) + { + if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */ + { + *errorcodeptr = ERR77; + ptr = hptr; /* Show where */ + break; /* *hptr != } will cause another break below */ + } + cc = (cc << 4) | xc; + hptr++; + } + + if (hptr == ptr + 1 || /* No hex digits */ + hptr >= ptrend || /* Hit end of input */ + *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ + break; /* Hex escape not recognized */ + + c = cc; /* Accept the code point */ + ptr = hptr + 1; + } + + else /* Must be exactly 4 hex digits */ + { + if (ptrend - ptr < 4) break; /* Less than 4 chars */ + if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ + if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ + cc = (cc << 4) | xc; + if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ + cc = (cc << 4) | xc; + if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ + c = (cc << 4) | xc; + ptr += 4; + } + if (utf) { if (c > 0x10ffffU) *errorcodeptr = ERR77; else if (c >= 0xd800 && c <= 0xdfff && - (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) - *errorcodeptr = ERR73; + (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + *errorcodeptr = ERR73; } else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; } break; - /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an - upper case letter. */ + /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, + in which case it is an upper case letter. */ case CHAR_U: - if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; + if (!alt_bsux) *errorcodeptr = ERR37; break; /* In a character class, \g is just a literal "g". Outside a character @@ -1791,8 +1831,8 @@ else } else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) { - if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL || - (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)) + if (utf && c >= 0xd800 && c <= 0xdfff && + (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) { ptr--; *errorcodeptr = ERR73; @@ -1806,11 +1846,11 @@ else } break; - /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by - two hexadecimal digits. Otherwise it is a lowercase x letter. */ + /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed + by two hexadecimal digits. Otherwise it is a lowercase x letter. */ case CHAR_x: - if ((options & PCRE2_ALT_BSUX) != 0) + if (alt_bsux) { uint32_t xc; if (ptrend - ptr < 2) break; /* Less than 2 characters */ @@ -1818,9 +1858,9 @@ else if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ c = (cc << 4) | xc; ptr += 2; - } /* End PCRE2_ALT_BSUX handling */ + } - /* Handle \x in Perl's style. \x{ddd} is a character number which can be + /* Handle \x in Perl's style. \x{ddd} is a character code which can be greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex digits. If not, { used to be treated as a data character. However, Perl seems to read hex digits up to the first non-such, and ignore the rest, so @@ -1864,8 +1904,8 @@ else } else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) { - if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL || - (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)) + if (utf && c >= 0xd800 && c <= 0xdfff && + (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) { ptr--; *errorcodeptr = ERR73; @@ -2438,6 +2478,7 @@ uint32_t *parsed_pattern = cb->parsed_pattern; uint32_t *parsed_pattern_end = cb->parsed_pattern_end; uint32_t meta_quantifier = 0; uint32_t add_after_mark = 0; +uint32_t extra_options = cb->cx->extra_options; uint16_t nest_depth = 0; int after_manual_callout = 0; int expect_cond_assert = 0; @@ -2461,12 +2502,12 @@ nest_save *top_nest, *end_nests; /* Insert leading items for word and line matching (features provided for the benefit of pcre2grep). */ -if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) +if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) { *parsed_pattern++ = META_CIRCUMFLEX; *parsed_pattern++ = META_NOCAPTURE; } -else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) +else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) { *parsed_pattern++ = META_ESCAPE + ESC_b; *parsed_pattern++ = META_NOCAPTURE; @@ -2631,7 +2672,7 @@ while (ptr < ptrend) if ((options & PCRE2_ALT_VERBNAMES) != 0) { escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, - FALSE, cb); + cb->cx->extra_options, FALSE, cb); if (errorcode != 0) goto FAILED; } else escape = 0; /* Treat all as literal */ @@ -2821,11 +2862,11 @@ while (ptr < ptrend) case CHAR_BACKSLASH: tempptr = ptr; escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, - FALSE, cb); + cb->cx->extra_options, FALSE, cb); if (errorcode != 0) { ESCAPE_FAILED: - if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) + if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) goto FAILED; ptr = tempptr; if (ptr >= ptrend) c = CHAR_BACKSLASH; else @@ -3382,12 +3423,12 @@ while (ptr < ptrend) else { tempptr = ptr; - escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, - options, TRUE, cb); + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + cb->cx->extra_options, TRUE, cb); if (errorcode != 0) { - if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) + if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0) goto FAILED; ptr = tempptr; if (ptr >= ptrend) c = CHAR_BACKSLASH; else @@ -4545,12 +4586,12 @@ parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout, /* Insert trailing items for word and line matching (features provided for the benefit of pcre2grep). */ -if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) +if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0) { *parsed_pattern++ = META_KET; *parsed_pattern++ = META_DOLLAR; } -else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) +else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0) { *parsed_pattern++ = META_KET; *parsed_pattern++ = META_ESCAPE + ESC_b; diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index b4efcf0..5669990 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge + New API code Copyright (c) 2016-2019 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -1942,7 +1942,7 @@ is available. */ extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_block *); extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, - int *, uint32_t, BOOL, compile_block *); + int *, uint32_t, uint32_t, BOOL, compile_block *); extern PCRE2_SPTR _pcre2_extuni(uint32_t, PCRE2_SPTR, PCRE2_SPTR, PCRE2_SPTR, BOOL, int *); extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c index d404c93..0b46629 100644 --- a/src/pcre2_substitute.c +++ b/src/pcre2_substitute.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge + New API code Copyright (c) 2016-2019 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -129,7 +129,7 @@ for (; ptr < ptrend; ptr++) ptr += 1; /* Must point after \ */ erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, - code->overall_options, FALSE, NULL); + code->overall_options, code->extra_options, FALSE, NULL); ptr -= 1; /* Back to last code unit of escape */ if (errorcode != 0) { @@ -774,7 +774,7 @@ do ptr++; /* Point after \ */ rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, - code->overall_options, FALSE, NULL); + code->overall_options, code->extra_options, FALSE, NULL); if (errorcode != 0) goto BADESCAPE; switch(rc) diff --git a/src/pcre2test.c b/src/pcre2test.c index 462a254..e493099 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -646,6 +646,7 @@ static modstruct modlist[] = { { "expand", MOD_PAT, MOD_CTL, CTL_EXPAND, PO(control) }, { "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) }, { "extended_more", MOD_PATP, MOD_OPT, PCRE2_EXTENDED_MORE, PO(options) }, + { "extra_alt_bsux", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALT_BSUX, CO(extra_options) }, { "find_limits", MOD_DAT, MOD_CTL, CTL_FINDLIMITS, DO(control) }, { "firstline", MOD_PAT, MOD_OPT, PCRE2_FIRSTLINE, PO(options) }, { "framesize", MOD_PAT, MOD_CTL, CTL_FRAMESIZE, PO(control) }, @@ -4189,10 +4190,11 @@ show_compile_extra_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s <none>%s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s", before, ((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "", ((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "", + ((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " extra_alt_bsux" : "", ((options & PCRE2_EXTRA_MATCH_WORD) != 0)? " match_word" : "", ((options & PCRE2_EXTRA_MATCH_LINE) != 0)? " match_line" : "", ((options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)? " escaped_cr_is_lf" : "", |