summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-06 18:11:36 +0000
committer ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-06 18:11:36 +0000
commit 03c006cfda40d5218d2248674ddc3824f8169897 (patch)
tree 8bfb007e8adba8eb8e1256afba09001b52509905 /src
parent 2aee0809b4ec6f9c2fdbb33a0c200b17a9fd333c (diff)
download pcre2-03c006cfda40d5218d2248674ddc3824f8169897.tar.gz
Allow non-ASCII in group names when UTF is set; revise group naming terminology
in documentation to use "capture group", as Perl does.

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1066 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src')
-rw-r--r-- src/pcre2_compile.c 88
-rw-r--r-- src/pcre2_error.c 12
-rw-r--r-- src/pcre2test.c 94
3 files changed, 140 insertions, 54 deletions
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index eb45210..0c38b5b 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -2194,6 +2194,7 @@ so it is simplest just to return both.
Arguments:
ptrptr points to the character pointer variable
ptrend points to the end of the input string
+ utf true if the input is UTF-encoded
terminator the terminator of a subpattern name must be this
offsetptr where to put the offset from the start of the pattern
nameptr where to put a pointer to the name in the input
@@ -2206,13 +2207,12 @@ Returns: TRUE if a name was read
*/
static BOOL
-read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator,
+read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
int *errorcodeptr, compile_block *cb)
{
PCRE2_SPTR ptr = *ptrptr;
BOOL is_group = (*ptr != CHAR_ASTERISK);
-uint32_t namelen = 0;
if (++ptr >= ptrend) /* No characters in name */
{
@@ -2221,35 +2221,74 @@ if (++ptr >= ptrend) /* No characters in name */
goto FAILED;
}
-/* A group name must not start with a digit. If either of the others start with
-a digit it just won't be recognized. */
+*nameptr = ptr;
+*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
-if (is_group && IS_DIGIT(*ptr))
+/* In UTF mode, a group name may contain letters and decimal digits as defined
+by Unicode properties, and underscores, but must not start with a digit. */
+
+#ifdef SUPPORT_UNICODE
+if (utf && is_group)
{
- *errorcodeptr = ERR44;
- goto FAILED;
+ uint32_t c, type;
+
+ GETCHAR(c, ptr);
+ type = UCD_CHARTYPE(c);
+
+ if (type == ucp_Nd)
+ {
+ *errorcodeptr = ERR44;
+ goto FAILED;
+ }
+
+ for(;;)
+ {
+ if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
+ c != CHAR_UNDERSCORE) break;
+ ptr++;
+ FORWARDCHAR(ptr);
+ if (ptr >= ptrend) break;
+ GETCHAR(c, ptr);
+ type = UCD_CHARTYPE(c);
+ }
}
+else
+#else
+(void)utf; /* Avoid compiler warning */
+#endif /* SUPPORT_UNICODE */
-*nameptr = ptr;
-*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
+/* Handle non-group names and group names in non-UTF modes. A group name must
+not start with a digit. If either of the others start with a digit it just
+won't be recognized. */
-while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
{
- ptr++;
- namelen++;
- if (namelen > MAX_NAME_SIZE)
+ if (is_group && IS_DIGIT(*ptr))
{
- *errorcodeptr = ERR48;
+ *errorcodeptr = ERR44;
goto FAILED;
}
+
+ while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
+ {
+ ptr++;
+ }
}
+/* Check name length */
+
+if (ptr > *nameptr + MAX_NAME_SIZE)
+ {
+ *errorcodeptr = ERR48;
+ goto FAILED;
+ }
+*namelenptr = ptr - *nameptr;
+
/* Subpattern names must not be empty, and their terminator is checked here.
(What follows a verb or alpha assertion name is checked separately.) */
if (is_group)
{
- if (namelen == 0)
+ if (ptr == *nameptr)
{
*errorcodeptr = ERR62; /* Subpattern name expected */
goto FAILED;
@@ -2262,7 +2301,6 @@ if (is_group)
ptr++;
}
-*namelenptr = namelen;
*ptrptr = ptr;
return TRUE;
@@ -2981,7 +3019,7 @@ while (ptr < ptrend)
/* Not a numerical recursion */
- if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
+ if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
&errorcode, cb)) goto ESCAPE_FAILED;
/* \k and \g when used with braces are back references, whereas \g used
@@ -3554,8 +3592,8 @@ while (ptr < ptrend)
uint32_t meta;
vn = alasnames;
- if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
- cb)) goto FAILED;
+ if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
+ &errorcode, cb)) goto FAILED;
if (ptr >= ptrend || *ptr != CHAR_COLON)
{
errorcode = ERR95; /* Malformed */
@@ -3651,8 +3689,8 @@ while (ptr < ptrend)
else
{
vn = verbnames;
- if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
- cb)) goto FAILED;
+ if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
+ &errorcode, cb)) goto FAILED;
if (ptr >= ptrend || (*ptr != CHAR_COLON &&
*ptr != CHAR_RIGHT_PARENTHESIS))
{
@@ -3907,7 +3945,7 @@ while (ptr < ptrend)
errorcode = ERR41;
goto FAILED;
}
- if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
+ if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
&namelen, &errorcode, cb)) goto FAILED;
*parsed_pattern++ = META_BACKREF_BYNAME;
*parsed_pattern++ = namelen;
@@ -3967,7 +4005,7 @@ while (ptr < ptrend)
case CHAR_AMPERSAND:
RECURSE_BY_NAME:
- if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
+ if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
&namelen, &errorcode, cb)) goto FAILED;
*parsed_pattern++ = META_RECURSE_BYNAME;
*parsed_pattern++ = namelen;
@@ -4215,7 +4253,7 @@ while (ptr < ptrend)
terminator = CHAR_RIGHT_PARENTHESIS;
ptr--; /* Point to char before name */
}
- if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
+ if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
&errorcode, cb)) goto FAILED;
/* Handle (?(R&name) */
@@ -4349,7 +4387,7 @@ while (ptr < ptrend)
terminator = CHAR_APOSTROPHE; /* Terminator */
DEFINE_NAME:
- if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
+ if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
&errorcode, cb)) goto FAILED;
/* We have a name for this capturing group. It is also assigned a number,
diff --git a/src/pcre2_error.c b/src/pcre2_error.c
index 4c8127c..349351d 100644
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@@ -95,7 +95,7 @@ static const unsigned char compile_error_texts[] =
/* 25 */
"lookbehind assertion is not fixed length\0"
"a relative value of zero is not allowed\0"
- "conditional group contains more than two branches\0"
+ "conditional subpattern contains more than two branches\0"
"assertion expected after (?( or (?(?C)\0"
"digit expected after (?+ or (?-\0"
/* 30 */
@@ -113,21 +113,21 @@ static const unsigned char compile_error_texts[] =
/* 40 */
"invalid escape sequence in (*VERB) name\0"
"unrecognized character after (?P\0"
- "syntax error in subpattern name (missing terminator)\0"
+ "syntax error in subpattern name (missing terminator?)\0"
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
- "group name must start with a non-digit\0"
+ "subpattern name must start with a non-digit\0"
/* 45 */
"this version of PCRE2 does not have support for \\P, \\p, or \\X\0"
"malformed \\P or \\p sequence\0"
"unknown property name after \\P or \\p\0"
- "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
+ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0"
"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
/* 50 */
"invalid range in character class\0"
"octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
"internal error: overran compiling workspace\0"
"internal error: previously-checked referenced subpattern not found\0"
- "DEFINE group contains more than one branch\0"
+ "DEFINE subpattern contains more than one branch\0"
/* 55 */
"missing opening brace after \\o\0"
"internal error: unknown newline setting\0"
@@ -137,7 +137,7 @@ static const unsigned char compile_error_texts[] =
"obsolete error (should not occur)\0" /* Was the above */
/* 60 */
"(*VERB) not recognized or malformed\0"
- "group number is too big\0"
+ "subpattern number is too big\0"
"subpattern name expected\0"
"internal error: parsed pattern overflow\0"
"non-octal character in \\o{} (closing brace missing?)\0"
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 40b4a6f..462a254 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -169,7 +169,7 @@ commented out the original, but kept it around just in case. */
/* void vms_setsymbol( char *, char *, int ); Original code from [1]. */
#endif
-/* VC and older compilers don't support %td or %zu, and even some that claim to
+/* VC and older compilers don't support %td or %zu, and even some that claim to
be C99 don't support it (hence DISABLE_PERCENT_ZT). */
#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(DISABLE_PERCENT_ZT)
@@ -539,7 +539,7 @@ typedef struct patctl { /* Structure for pattern modifiers. */
uint32_t jitstack; /* Must be in same position as datctl */
uint8_t replacement[REPLACE_MODSIZE]; /* So must this */
uint32_t substitute_skip; /* Must be in same position as patctl */
- uint32_t substitute_stop; /* Must be in same position as patctl */
+ uint32_t substitute_stop; /* Must be in same position as patctl */
uint32_t jit;
uint32_t stackguard_test;
uint32_t tables_id;
@@ -561,7 +561,7 @@ typedef struct datctl { /* Structure for data line modifiers. */
uint32_t jitstack; /* Must be in same position as patctl */
uint8_t replacement[REPLACE_MODSIZE]; /* So must this */
uint32_t substitute_skip; /* Must be in same position as patctl */
- uint32_t substitute_stop; /* Must be in same position as patctl */
+ uint32_t substitute_stop; /* Must be in same position as patctl */
uint32_t startend[2];
uint32_t cerror[2];
uint32_t cfail[2];
@@ -3049,13 +3049,14 @@ return yield;
-#ifdef SUPPORT_PCRE2_8
/*************************************************
* Convert character value to UTF-8 *
*************************************************/
/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 0 to 6 bytes.
+and encodes it as a UTF-8 character in 0 to 6 bytes. It is needed even when the
+8-bit library is not supported, to generate UTF-8 output for non-ASCII
+characters.
Arguments:
cvalue the character value
@@ -3081,7 +3082,6 @@ for (j = i; j > 0; j--)
*utf8bytes = utf8_table2[i] | cvalue;
return i + 1;
}
-#endif /* SUPPORT_PCRE2_8 */
@@ -4374,6 +4374,7 @@ static int
show_pattern_info(void)
{
uint32_t compile_options, overall_options, extra_options;
+BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
if ((pat_patctl.control & (CTL_BINCODE|CTL_FULLBINCODE)) != 0)
{
@@ -4463,7 +4464,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
!= 0)
return PR_ABEND;
- fprintf(outfile, "Capturing subpattern count = %d\n", capture_count);
+ fprintf(outfile, "Capture group count = %d\n", capture_count);
if (backrefmax > 0)
fprintf(outfile, "Max back reference = %d\n", backrefmax);
@@ -4482,14 +4483,60 @@ if ((pat_patctl.control & CTL_INFO) != 0)
if (namecount > 0)
{
- fprintf(outfile, "Named capturing subpatterns:\n");
+ fprintf(outfile, "Named capture groups:\n");
for (; namecount > 0; namecount--)
{
int imm2_size = test_mode == PCRE8_MODE ? 2 : 1;
uint32_t length = (uint32_t)STRLEN(nametable + imm2_size);
fprintf(outfile, " ");
- PCHARSV(nametable, imm2_size, length, FALSE, outfile);
+
+ /* In UTF mode the name may be a UTF string containing non-ASCII
+ letters and digits. We must output it as a UTF-8 string. In non-UTF mode,
+ use the normal string printing functions, which use escapes for all
+ non-ASCII characters. */
+
+ if (utf)
+ {
+#ifdef SUPPORT_PCRE2_32
+ if (test_mode == PCRE32_MODE)
+ {
+ PCRE2_SPTR32 nameptr = (PCRE2_SPTR32)nametable + imm2_size;
+ while (*nameptr != 0)
+ {
+ uint8_t u8buff[6];
+ int len = ord2utf8(*nameptr++, u8buff);
+ fprintf(outfile, "%.*s", len, u8buff);
+ }
+ }
+#endif
+#ifdef SUPPORT_PCRE2_16
+ if (test_mode == PCRE16_MODE)
+ {
+ PCRE2_SPTR16 nameptr = (PCRE2_SPTR16)nametable + imm2_size;
+ while (*nameptr != 0)
+ {
+ int len;
+ uint8_t u8buff[6];
+ uint32_t c = *nameptr++ & 0xffff;
+ if (c >= 0xD800 && c < 0xDC00)
+ c = ((c & 0x3ff) << 10) + (*nameptr++ & 0x3ff) + 0x10000;
+ len = ord2utf8(c, u8buff);
+ fprintf(outfile, "%.*s", len, u8buff);
+ }
+ }
+#endif
+#ifdef SUPPORT_PCRE2_8
+ if (test_mode == PCRE8_MODE)
+ fprintf(outfile, "%s", (PCRE2_SPTR8)nametable + imm2_size);
+#endif
+ }
+ else /* Not UTF mode */
+ {
+ PCHARSV(nametable, imm2_size, length, FALSE, outfile);
+ }
+
while (length++ < nameentrysize - imm2_size) putc(' ', outfile);
+
#ifdef SUPPORT_PCRE2_32
if (test_mode == PCRE32_MODE)
fprintf(outfile, "%3d\n", (int)(((PCRE2_SPTR32)nametable)[0]));
@@ -4503,6 +4550,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
fprintf(outfile, "%3d\n", (int)(
((((PCRE2_SPTR8)nametable)[0]) << 8) | ((PCRE2_SPTR8)nametable)[1]));
#endif
+
nametable = (void*)((PCRE2_SPTR8)nametable + nameentrysize * code_unit_size);
}
}
@@ -5971,30 +6019,30 @@ BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
(void)data_ptr; /* Not used */
fprintf(outfile, "%2d(%d) Old %" SIZ_FORM " %" SIZ_FORM " \"",
- scb->subscount, scb->oveccount,
+ scb->subscount, scb->oveccount,
SIZ_CAST scb->ovector[0], SIZ_CAST scb->ovector[1]);
-PCHARSV(scb->input, scb->ovector[0], scb->ovector[1] - scb->ovector[0],
+PCHARSV(scb->input, scb->ovector[0], scb->ovector[1] - scb->ovector[0],
utf, outfile);
fprintf(outfile, "\" New %" SIZ_FORM " %" SIZ_FORM " \"",
SIZ_CAST scb->output_offsets[0], SIZ_CAST scb->output_offsets[1]);
-PCHARSV(scb->output, scb->output_offsets[0],
+PCHARSV(scb->output, scb->output_offsets[0],
scb->output_offsets[1] - scb->output_offsets[0], utf, outfile);
-if (scb->subscount == dat_datctl.substitute_stop)
+if (scb->subscount == dat_datctl.substitute_stop)
{
yield = -1;
- fprintf(outfile, " STOPPED");
- }
-else if (scb->subscount == dat_datctl.substitute_skip)
+ fprintf(outfile, " STOPPED");
+ }
+else if (scb->subscount == dat_datctl.substitute_skip)
{
yield = +1;
- fprintf(outfile, " SKIPPED");
- }
+ fprintf(outfile, " SKIPPED");
+ }
-fprintf(outfile, "\"\n");
+fprintf(outfile, "\"\n");
return yield;
}
@@ -6867,11 +6915,11 @@ arg_ulen = ulen; /* Value to use in match arg */
if (p[-1] != 0 && !decode_modifiers(p, CTX_DAT, NULL, &dat_datctl))
return PR_OK;
-
-/* Setting substitute_{skip,fail} implies a substitute callout. */
+
+/* Setting substitute_{skip,fail} implies a substitute callout. */
if (dat_datctl.substitute_skip != 0 || dat_datctl.substitute_stop != 0)
- dat_datctl.control2 |= CTL2_SUBSTITUTE_CALLOUT;
+ dat_datctl.control2 |= CTL2_SUBSTITUTE_CALLOUT;
/* Check for mutually exclusive modifiers. At present, these are all in the
first control word. */
@@ -8129,7 +8177,7 @@ if (arg != NULL && arg[0] != CHAR_MINUS)
break;
}
-/* For VMS, return the value by setting a symbol, for certain values only. This
+/* For VMS, return the value by setting a symbol, for certain values only. This
is contributed code which the PCRE2 developers have no means of testing. */
#ifdef __VMS