diff options
author | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2019-02-06 18:11:36 +0000 |
---|---|---|
committer | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2019-02-06 18:11:36 +0000 |
commit | 03c006cfda40d5218d2248674ddc3824f8169897 (patch) | |
tree | 8bfb007e8adba8eb8e1256afba09001b52509905 /src | |
parent | 2aee0809b4ec6f9c2fdbb33a0c200b17a9fd333c (diff) | |
download | pcre2-03c006cfda40d5218d2248674ddc3824f8169897.tar.gz |
Allow non-ASCII in group names when UTF is set; revise group naming terminology
in documentation to use "capture group", as Perl does.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1066 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src')
-rw-r--r-- | src/pcre2_compile.c | 88 |
-rw-r--r-- | src/pcre2_error.c | 12 |
-rw-r--r-- | src/pcre2test.c | 94 |
3 files changed, 140 insertions, 54 deletions
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index eb45210..0c38b5b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2194,6 +2194,7 @@ so it is simplest just to return both. Arguments: ptrptr points to the character pointer variable ptrend points to the end of the input string + utf true if the input is UTF-encoded terminator the terminator of a subpattern name must be this offsetptr where to put the offset from the start of the pattern nameptr where to put a pointer to the name in the input @@ -2206,13 +2207,12 @@ Returns: TRUE if a name was read */ static BOOL -read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator, +read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator, PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr, int *errorcodeptr, compile_block *cb) { PCRE2_SPTR ptr = *ptrptr; BOOL is_group = (*ptr != CHAR_ASTERISK); -uint32_t namelen = 0; if (++ptr >= ptrend) /* No characters in name */ { @@ -2221,35 +2221,74 @@ if (++ptr >= ptrend) /* No characters in name */ goto FAILED; } -/* A group name must not start with a digit. If either of the others start with -a digit it just won't be recognized. */ +*nameptr = ptr; +*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern); -if (is_group && IS_DIGIT(*ptr)) +/* In UTF mode, a group name may contain letters and decimal digits as defined +by Unicode properties, and underscores, but must not start with a digit. 
*/ + +#ifdef SUPPORT_UNICODE +if (utf && is_group) { - *errorcodeptr = ERR44; - goto FAILED; + uint32_t c, type; + + GETCHAR(c, ptr); + type = UCD_CHARTYPE(c); + + if (type == ucp_Nd) + { + *errorcodeptr = ERR44; + goto FAILED; + } + + for(;;) + { + if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L && + c != CHAR_UNDERSCORE) break; + ptr++; + FORWARDCHAR(ptr); + if (ptr >= ptrend) break; + GETCHAR(c, ptr); + type = UCD_CHARTYPE(c); + } } +else +#else +(void)utf; /* Avoid compiler warning */ +#endif /* SUPPORT_UNICODE */ -*nameptr = ptr; -*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern); +/* Handle non-group names and group names in non-UTF modes. A group name must +not start with a digit. If either of the others start with a digit it just +won't be recognized. */ -while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) { - ptr++; - namelen++; - if (namelen > MAX_NAME_SIZE) + if (is_group && IS_DIGIT(*ptr)) { - *errorcodeptr = ERR48; + *errorcodeptr = ERR44; goto FAILED; } + + while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) + { + ptr++; + } } +/* Check name length */ + +if (ptr > *nameptr + MAX_NAME_SIZE) + { + *errorcodeptr = ERR48; + goto FAILED; + } +*namelenptr = ptr - *nameptr; + /* Subpattern names must not be empty, and their terminator is checked here. (What follows a verb or alpha assertion name is checked separately.) 
*/ if (is_group) { - if (namelen == 0) + if (ptr == *nameptr) { *errorcodeptr = ERR62; /* Subpattern name expected */ goto FAILED; @@ -2262,7 +2301,6 @@ if (is_group) ptr++; } -*namelenptr = namelen; *ptrptr = ptr; return TRUE; @@ -2981,7 +3019,7 @@ while (ptr < ptrend) /* Not a numerical recursion */ - if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, &errorcode, cb)) goto ESCAPE_FAILED; /* \k and \g when used with braces are back references, whereas \g used @@ -3554,8 +3592,8 @@ while (ptr < ptrend) uint32_t meta; vn = alasnames; - if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode, - cb)) goto FAILED; + if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; if (ptr >= ptrend || *ptr != CHAR_COLON) { errorcode = ERR95; /* Malformed */ @@ -3651,8 +3689,8 @@ while (ptr < ptrend) else { vn = verbnames; - if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode, - cb)) goto FAILED; + if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; if (ptr >= ptrend || (*ptr != CHAR_COLON && *ptr != CHAR_RIGHT_PARENTHESIS)) { @@ -3907,7 +3945,7 @@ while (ptr < ptrend) errorcode = ERR41; goto FAILED; } - if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name, + if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, &namelen, &errorcode, cb)) goto FAILED; *parsed_pattern++ = META_BACKREF_BYNAME; *parsed_pattern++ = namelen; @@ -3967,7 +4005,7 @@ while (ptr < ptrend) case CHAR_AMPERSAND: RECURSE_BY_NAME: - if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name, + if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name, &namelen, &errorcode, cb)) goto FAILED; *parsed_pattern++ = META_RECURSE_BYNAME; *parsed_pattern++ = namelen; @@ -4215,7 +4253,7 @@ while (ptr < ptrend) terminator = CHAR_RIGHT_PARENTHESIS; ptr--; /* Point 
to char before name */ } - if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, &errorcode, cb)) goto FAILED; /* Handle (?(R&name) */ @@ -4349,7 +4387,7 @@ while (ptr < ptrend) terminator = CHAR_APOSTROPHE; /* Terminator */ DEFINE_NAME: - if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen, &errorcode, cb)) goto FAILED; /* We have a name for this capturing group. It is also assigned a number, diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 4c8127c..349351d 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -95,7 +95,7 @@ static const unsigned char compile_error_texts[] = /* 25 */ "lookbehind assertion is not fixed length\0" "a relative value of zero is not allowed\0" - "conditional group contains more than two branches\0" + "conditional subpattern contains more than two branches\0" "assertion expected after (?( or (?(?C)\0" "digit expected after (?+ or (?-\0" /* 30 */ @@ -113,21 +113,21 @@ static const unsigned char compile_error_texts[] = /* 40 */ "invalid escape sequence in (*VERB) name\0" "unrecognized character after (?P\0" - "syntax error in subpattern name (missing terminator)\0" + "syntax error in subpattern name (missing terminator?)\0" "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0" - "group name must start with a non-digit\0" + "subpattern name must start with a non-digit\0" /* 45 */ "this version of PCRE2 does not have support for \\P, \\p, or \\X\0" "malformed \\P or \\p sequence\0" "unknown property name after \\P or \\p\0" - "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" + "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0" "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ "invalid range in character class\0" "octal value is greater than \\377 in 8-bit 
non-UTF-8 mode\0" "internal error: overran compiling workspace\0" "internal error: previously-checked referenced subpattern not found\0" - "DEFINE group contains more than one branch\0" + "DEFINE subpattern contains more than one branch\0" /* 55 */ "missing opening brace after \\o\0" "internal error: unknown newline setting\0" @@ -137,7 +137,7 @@ static const unsigned char compile_error_texts[] = "obsolete error (should not occur)\0" /* Was the above */ /* 60 */ "(*VERB) not recognized or malformed\0" - "group number is too big\0" + "subpattern number is too big\0" "subpattern name expected\0" "internal error: parsed pattern overflow\0" "non-octal character in \\o{} (closing brace missing?)\0" diff --git a/src/pcre2test.c b/src/pcre2test.c index 40b4a6f..462a254 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -169,7 +169,7 @@ commented out the original, but kept it around just in case. */ /* void vms_setsymbol( char *, char *, int ); Original code from [1]. */ #endif -/* VC and older compilers don't support %td or %zu, and even some that claim to +/* VC and older compilers don't support %td or %zu, and even some that claim to be C99 don't support it (hence DISABLE_PERCENT_ZT). */ #if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(DISABLE_PERCENT_ZT) @@ -539,7 +539,7 @@ typedef struct patctl { /* Structure for pattern modifiers. */ uint32_t jitstack; /* Must be in same position as datctl */ uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ uint32_t substitute_skip; /* Must be in same position as patctl */ - uint32_t substitute_stop; /* Must be in same position as patctl */ + uint32_t substitute_stop; /* Must be in same position as patctl */ uint32_t jit; uint32_t stackguard_test; uint32_t tables_id; @@ -561,7 +561,7 @@ typedef struct datctl { /* Structure for data line modifiers. 
*/ uint32_t jitstack; /* Must be in same position as patctl */ uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ uint32_t substitute_skip; /* Must be in same position as patctl */ - uint32_t substitute_stop; /* Must be in same position as patctl */ + uint32_t substitute_stop; /* Must be in same position as patctl */ uint32_t startend[2]; uint32_t cerror[2]; uint32_t cfail[2]; @@ -3049,13 +3049,14 @@ return yield; -#ifdef SUPPORT_PCRE2_8 /************************************************* * Convert character value to UTF-8 * *************************************************/ /* This function takes an integer value in the range 0 - 0x7fffffff -and encodes it as a UTF-8 character in 0 to 6 bytes. +and encodes it as a UTF-8 character in 0 to 6 bytes. It is needed even when the +8-bit library is not supported, to generate UTF-8 output for non-ASCII +characters. Arguments: cvalue the character value @@ -3081,7 +3082,6 @@ for (j = i; j > 0; j--) *utf8bytes = utf8_table2[i] | cvalue; return i + 1; } -#endif /* SUPPORT_PCRE2_8 */ @@ -4374,6 +4374,7 @@ static int show_pattern_info(void) { uint32_t compile_options, overall_options, extra_options; +BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0; if ((pat_patctl.control & (CTL_BINCODE|CTL_FULLBINCODE)) != 0) { @@ -4463,7 +4464,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) != 0) return PR_ABEND; - fprintf(outfile, "Capturing subpattern count = %d\n", capture_count); + fprintf(outfile, "Capture group count = %d\n", capture_count); if (backrefmax > 0) fprintf(outfile, "Max back reference = %d\n", backrefmax); @@ -4482,14 +4483,60 @@ if ((pat_patctl.control & CTL_INFO) != 0) if (namecount > 0) { - fprintf(outfile, "Named capturing subpatterns:\n"); + fprintf(outfile, "Named capture groups:\n"); for (; namecount > 0; namecount--) { int imm2_size = test_mode == PCRE8_MODE ? 
2 : 1; uint32_t length = (uint32_t)STRLEN(nametable + imm2_size); fprintf(outfile, " "); - PCHARSV(nametable, imm2_size, length, FALSE, outfile); + + /* In UTF mode the name may be a UTF string containing non-ASCII + letters and digits. We must output it as a UTF-8 string. In non-UTF mode, + use the normal string printing functions, which use escapes for all + non-ASCII characters. */ + + if (utf) + { +#ifdef SUPPORT_PCRE2_32 + if (test_mode == PCRE32_MODE) + { + PCRE2_SPTR32 nameptr = (PCRE2_SPTR32)nametable + imm2_size; + while (*nameptr != 0) + { + uint8_t u8buff[6]; + int len = ord2utf8(*nameptr++, u8buff); + fprintf(outfile, "%.*s", len, u8buff); + } + } +#endif +#ifdef SUPPORT_PCRE2_16 + if (test_mode == PCRE16_MODE) + { + PCRE2_SPTR16 nameptr = (PCRE2_SPTR16)nametable + imm2_size; + while (*nameptr != 0) + { + int len; + uint8_t u8buff[6]; + uint32_t c = *nameptr++ & 0xffff; + if (c >= 0xD800 && c < 0xDC00) + c = ((c & 0x3ff) << 10) + (*nameptr++ & 0x3ff) + 0x10000; + len = ord2utf8(c, u8buff); + fprintf(outfile, "%.*s", len, u8buff); + } + } +#endif +#ifdef SUPPORT_PCRE2_8 + if (test_mode == PCRE8_MODE) + fprintf(outfile, "%s", (PCRE2_SPTR8)nametable + imm2_size); +#endif + } + else /* Not UTF mode */ + { + PCHARSV(nametable, imm2_size, length, FALSE, outfile); + } + while (length++ < nameentrysize - imm2_size) putc(' ', outfile); + #ifdef SUPPORT_PCRE2_32 if (test_mode == PCRE32_MODE) fprintf(outfile, "%3d\n", (int)(((PCRE2_SPTR32)nametable)[0])); @@ -4503,6 +4550,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) fprintf(outfile, "%3d\n", (int)( ((((PCRE2_SPTR8)nametable)[0]) << 8) | ((PCRE2_SPTR8)nametable)[1])); #endif + nametable = (void*)((PCRE2_SPTR8)nametable + nameentrysize * code_unit_size); } } @@ -5971,30 +6019,30 @@ BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0; (void)data_ptr; /* Not used */ fprintf(outfile, "%2d(%d) Old %" SIZ_FORM " %" SIZ_FORM " \"", - scb->subscount, scb->oveccount, + scb->subscount, scb->oveccount, 
SIZ_CAST scb->ovector[0], SIZ_CAST scb->ovector[1]); -PCHARSV(scb->input, scb->ovector[0], scb->ovector[1] - scb->ovector[0], +PCHARSV(scb->input, scb->ovector[0], scb->ovector[1] - scb->ovector[0], utf, outfile); fprintf(outfile, "\" New %" SIZ_FORM " %" SIZ_FORM " \"", SIZ_CAST scb->output_offsets[0], SIZ_CAST scb->output_offsets[1]); -PCHARSV(scb->output, scb->output_offsets[0], +PCHARSV(scb->output, scb->output_offsets[0], scb->output_offsets[1] - scb->output_offsets[0], utf, outfile); -if (scb->subscount == dat_datctl.substitute_stop) +if (scb->subscount == dat_datctl.substitute_stop) { yield = -1; - fprintf(outfile, " STOPPED"); - } -else if (scb->subscount == dat_datctl.substitute_skip) + fprintf(outfile, " STOPPED"); + } +else if (scb->subscount == dat_datctl.substitute_skip) { yield = +1; - fprintf(outfile, " SKIPPED"); - } + fprintf(outfile, " SKIPPED"); + } -fprintf(outfile, "\"\n"); +fprintf(outfile, "\"\n"); return yield; } @@ -6867,11 +6915,11 @@ arg_ulen = ulen; /* Value to use in match arg */ if (p[-1] != 0 && !decode_modifiers(p, CTX_DAT, NULL, &dat_datctl)) return PR_OK; - -/* Setting substitute_{skip,fail} implies a substitute callout. */ + +/* Setting substitute_{skip,fail} implies a substitute callout. */ if (dat_datctl.substitute_skip != 0 || dat_datctl.substitute_stop != 0) - dat_datctl.control2 |= CTL2_SUBSTITUTE_CALLOUT; + dat_datctl.control2 |= CTL2_SUBSTITUTE_CALLOUT; /* Check for mutually exclusive modifiers. At present, these are all in the first control word. */ @@ -8129,7 +8177,7 @@ if (arg != NULL && arg[0] != CHAR_MINUS) break; } -/* For VMS, return the value by setting a symbol, for certain values only. This +/* For VMS, return the value by setting a symbol, for certain values only. This is contributed code which the PCRE2 developers have no means of testing. */ #ifdef __VMS |