summaryrefslogtreecommitdiff
path: root/ext/pcre/pcre2lib/pcre2_study.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/pcre/pcre2lib/pcre2_study.c')
-rw-r--r--ext/pcre/pcre2lib/pcre2_study.c392
1 files changed, 291 insertions, 101 deletions
diff --git a/ext/pcre/pcre2lib/pcre2_study.c b/ext/pcre/pcre2lib/pcre2_study.c
index acbf98b41b..9bbb37570f 100644
--- a/ext/pcre/pcre2lib/pcre2_study.c
+++ b/ext/pcre/pcre2lib/pcre2_study.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2018 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -54,11 +54,11 @@ collecting data (e.g. minimum matching length). */
/* Set a bit in the starting code unit bit map. */
-#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
+#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
/* Returns from set_start_bits() */
-enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
+enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
/*************************************************
@@ -88,11 +88,13 @@ Arguments:
countptr pointer to call count (to catch over complexity)
backref_cache vector for caching back references.
+This function is no longer called when the pattern contains (*ACCEPT); however,
+the old code for returning -1 is retained, just in case.
+
Returns: the minimum length
-1 \C in UTF-8 mode
or (*ACCEPT)
or pattern too complicated
- or back reference to duplicate name/number
-2 internal error (missing capturing bracket)
-3 internal error (opcode not listed)
*/
@@ -103,6 +105,7 @@ find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
int *backref_cache)
{
int length = -1;
+int branchlength = 0;
int prev_cap_recno = -1;
int prev_cap_d = 0;
int prev_recurse_recno = -1;
@@ -110,9 +113,9 @@ int prev_recurse_d = 0;
uint32_t once_fudge = 0;
BOOL had_recurse = FALSE;
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
-recurse_check this_recurse;
-int branchlength = 0;
+PCRE2_SPTR nextbranch = code + GET(code, 1);
PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
+recurse_check this_recurse;
/* If this is a "could be empty" group, its minimum length is 0. */
@@ -128,16 +131,20 @@ if ((*countptr)++ > 1000) return -1;
/* Scan along the opcodes for this branch. If we get to the end of the branch,
check the length against that of the other branches. If the accumulated length
-passes 16-bits, stop. */
+passes 16-bits, reset to that value and skip the rest of the branch. */
for (;;)
{
int d, min, recno;
- PCRE2_UCHAR *cs, *ce;
- PCRE2_UCHAR op = *cc;
+ PCRE2_UCHAR op, *cs, *ce;
- if (branchlength >= UINT16_MAX) return UINT16_MAX;
+ if (branchlength >= UINT16_MAX)
+ {
+ branchlength = UINT16_MAX;
+ cc = (PCRE2_UCHAR *)nextbranch;
+ }
+ op = *cc;
switch (op)
{
case OP_COND:
@@ -171,6 +178,7 @@ for (;;)
/* Fall through */
case OP_ONCE:
+ case OP_SCRIPT_RUN:
case OP_SBRA:
case OP_BRAPOS:
case OP_SBRAPOS:
@@ -205,7 +213,9 @@ for (;;)
cc += 1 + LINK_SIZE;
break;
- /* ACCEPT makes things far too complicated; we have to give up. */
+ /* ACCEPT makes things far too complicated; we have to give up. In fact,
+ from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
+ used. However, leave the code in place, just in case. */
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:
@@ -213,9 +223,9 @@ for (;;)
/* Reached end of a branch; if it's a ket it is the end of a nested
call. If it's ALT it is an alternation in a nested call. If it is END it's
- the end of the outer call. All can be handled by the same code. If an
- ACCEPT was previously encountered, use the length that was in force at that
- time, and pass back the shortest ACCEPT length. */
+ the end of the outer call. All can be handled by the same code. If the
+ length of any branch is zero, there is no need to scan any subsequent
+ branches. */
case OP_ALT:
case OP_KET:
@@ -225,7 +235,8 @@ for (;;)
case OP_END:
if (length < 0 || (!had_recurse && branchlength < length))
length = branchlength;
- if (op != OP_ALT) return length;
+ if (op != OP_ALT || length == 0) return length;
+ nextbranch = cc + GET(cc, 1);
cc += 1 + LINK_SIZE;
branchlength = 0;
had_recurse = FALSE;
@@ -237,6 +248,8 @@ for (;;)
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
+ case OP_ASSERT_NA:
+ case OP_ASSERTBACK_NA:
do cc += GET(cc, 1); while (*cc == OP_ALT);
/* Fall through */
@@ -450,15 +463,17 @@ for (;;)
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
matches an empty string (by default it causes a matching failure), so in
- that case we must set the minimum length to zero. */
+ that case we must set the minimum length to zero.
- /* Duplicate named pattern back reference. We cannot reliably find a length
- for this if duplicate numbers are present in the pattern. */
+ For backreferenes, if duplicate numbers are present in the pattern we check
+ for a reference to a duplicate. If it is, we don't know which version will
+ be referenced, so we have to set the minimum length to zero. */
+
+ /* Duplicate named pattern back reference. */
case OP_DNREF:
case OP_DNREFI:
- if (dupcapused) return -1;
- if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
+ if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
int count = GET2(cc, 1+IMM2_SIZE);
PCRE2_UCHAR *slot =
@@ -481,28 +496,32 @@ for (;;)
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- {
- dd = 0;
- had_recurse = TRUE;
- }
- else
+
+ dd = 0;
+ if (!dupcapused ||
+ (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev)
- if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ if (cc > cs && cc < ce) /* Simple recursion */
{
- dd = 0;
had_recurse = TRUE;
}
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- dd = find_minlength(re, cs, startcode, utf, &this_recurse,
- countptr, backref_cache);
- if (dd < 0) return dd;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev)
+ if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ had_recurse = TRUE;
+ }
+ else
+ {
+ this_recurse.prev = recurses; /* No recursion */
+ this_recurse.group = cs;
+ dd = find_minlength(re, cs, startcode, utf, &this_recurse,
+ countptr, backref_cache);
+ if (dd < 0) return dd;
+ }
}
}
@@ -520,48 +539,51 @@ for (;;)
cc += 1 + 2*IMM2_SIZE;
goto REPEAT_BACK_REFERENCE;
- /* Single back reference. We cannot find a length for this if duplicate
- numbers are present in the pattern. */
+ /* Single back reference by number. References by name are converted to by
+ number when there is no duplication. */
case OP_REF:
case OP_REFI:
- if (dupcapused) return -1;
recno = GET2(cc, 1);
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
d = backref_cache[recno];
else
{
int i;
+ d = 0;
+
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- {
- d = 0;
- had_recurse = TRUE;
- }
- else
+
+ if (!dupcapused ||
+ (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ if (cc > cs && cc < ce) /* Simple recursion */
{
- d = 0;
had_recurse = TRUE;
}
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
- backref_cache);
- if (d < 0) return d;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ had_recurse = TRUE;
+ }
+ else /* No recursion */
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
+ backref_cache);
+ if (d < 0) return d;
+ }
}
}
}
- else d = 0;
backref_cache[recno] = d;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
@@ -750,15 +772,19 @@ Arguments:
p points to the first code unit of the character
caseless TRUE if caseless
utf TRUE for UTF mode
+ ucp TRUE for UCP mode
Returns: pointer after the character
*/
static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
+ BOOL ucp)
{
uint32_t c = *p++; /* First code unit */
-(void)utf; /* Stop compiler warning when UTF not supported */
+
+(void)utf; /* Stop compiler warnings when UTF not supported */
+(void)ucp;
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */
@@ -788,22 +814,26 @@ if (utf)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
-#if PCRE2_CODE_UNIT_WIDTH == 8
- PCRE2_UCHAR buff[6];
c = UCD_OTHERCASE(c);
- (void)PRIV(ord2utf)(c, buff);
- SET_BIT(buff[0]);
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (utf)
+ {
+ PCRE2_UCHAR buff[6];
+ (void)PRIV(ord2utf)(c, buff);
+ SET_BIT(buff[0]);
+ }
+ else if (c < 256) SET_BIT(c);
#else /* 16-bit or 32-bit mode */
- c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
+
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF */
+ /* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
}
@@ -842,7 +872,7 @@ for (c = 0; c < table_limit; c++)
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
- if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
+ if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
{
PCRE2_UCHAR buff[6];
(void)PRIV(ord2utf)(c, buff);
@@ -887,7 +917,7 @@ if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
/*************************************************
-* Create bitmap of starting bytes *
+* Create bitmap of starting code units *
*************************************************/
/* This function scans a compiled unanchored expression recursively and
@@ -902,19 +932,26 @@ The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
must continue at the outer level to find at least one mandatory code unit. At
the outermost level, this function fails unless the result is SSB_DONE.
+We restrict recursion (for nested groups) to 1000 to avoid stack overflow
+issues.
+
Arguments:
re points to the compiled regex block
code points to an expression
utf TRUE if in UTF mode
+ ucp TRUE if in UCP mode
+ depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units
SSB_DONE => Found mandatory starting code units
SSB_CONTINUE => Found optional starting code units
SSB_UNKNOWN => Hit an unrecognized opcode
+ SSB_TOODEEP => Recursion is too deep
*/
static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
+ int *depthptr)
{
uint32_t c;
int yield = SSB_DONE;
@@ -925,6 +962,9 @@ int table_limit = utf? 16:32;
int table_limit = 32;
#endif
+*depthptr += 1;
+if (*depthptr > 1000) return SSB_TOODEEP;
+
do
{
BOOL try_next = TRUE;
@@ -937,6 +977,9 @@ do
{
int rc;
uint8_t *classmap = NULL;
+#ifdef SUPPORT_WIDE_CHARS
+ PCRE2_UCHAR xclassflags;
+#endif
switch(*tcode)
{
@@ -1075,14 +1118,20 @@ do
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_ONCE:
+ case OP_SCRIPT_RUN:
case OP_ASSERT:
- rc = set_start_bits(re, tcode, utf);
- if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
- if (rc == SSB_DONE) try_next = FALSE; else
+ case OP_ASSERT_NA:
+ rc = set_start_bits(re, tcode, utf, ucp, depthptr);
+ if (rc == SSB_DONE)
+ {
+ try_next = FALSE;
+ }
+ else if (rc == SSB_CONTINUE)
{
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
}
+ else return rc; /* FAIL, UNKNOWN, or TOODEEP */
break;
/* If we hit ALT or KET, it means we haven't found anything mandatory in
@@ -1118,6 +1167,7 @@ do
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
+ case OP_ASSERTBACK_NA:
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
break;
@@ -1127,8 +1177,8 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
- rc = set_start_bits(re, ++tcode, utf);
- if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
+ rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
+ if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
break;
@@ -1149,7 +1199,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+ tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break;
case OP_STARI:
@@ -1158,7 +1208,7 @@ do
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
- tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+ tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break;
/* Single-char upto sets the bit and tries the next */
@@ -1166,13 +1216,13 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+ tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
- tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+ tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break;
/* At least one single char sets the bit and stops */
@@ -1184,7 +1234,7 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- (void)set_table_bit(re, tcode + 1, FALSE, utf);
+ (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE;
break;
@@ -1195,7 +1245,7 @@ do
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
- (void)set_table_bit(re, tcode + 1, TRUE, utf);
+ (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE;
break;
@@ -1442,20 +1492,59 @@ do
negative XCLASS without a map, give up. If there are no property checks,
there must be wide characters on the XCLASS list, because otherwise an
XCLASS would not have been created. This means that code points >= 255
- are always potential starters. */
+ are potential starters. In the UTF-8 case we can scan them and set bits
+ for the relevant leading bytes. */
#ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS:
- if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
- (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
+ xclassflags = tcode[1 + LINK_SIZE];
+ if ((xclassflags & XCL_HASPROP) != 0 ||
+ (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
return SSB_FAIL;
/* We have a positive XCLASS or a negative one without a map. Set up the
map pointer if there is one, and fall through. */
- classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
+ classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
-#endif
+
+ /* In UTF-8 mode, scan the character list and set bits for leading bytes,
+ then jump to handle the map. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (utf && (xclassflags & XCL_NOT) == 0)
+ {
+ PCRE2_UCHAR b, e;
+ PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
+ tcode += GET(tcode, 1);
+
+ for (;;) switch (*p++)
+ {
+ case XCL_SINGLE:
+ b = *p++;
+ while ((*p & 0xc0) == 0x80) p++;
+ re->start_bitmap[b/8] |= (1u << (b&7));
+ break;
+
+ case XCL_RANGE:
+ b = *p++;
+ while ((*p & 0xc0) == 0x80) p++;
+ e = *p++;
+ while ((*p & 0xc0) == 0x80) p++;
+ for (; b <= e; b++)
+ re->start_bitmap[b/8] |= (1u << (b&7));
+ break;
+
+ case XCL_END:
+ goto HANDLE_CLASSMAP;
+
+ default:
+ return SSB_UNKNOWN; /* Internal error, should not occur */
+ }
+ }
+#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
+#endif /* SUPPORT_WIDE_CHARS */
+
/* It seems that the fall through comment must be outside the #ifdef if
it is to avoid the gcc compiler warning. */
@@ -1497,6 +1586,9 @@ do
greater than 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
+#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
+ HANDLE_CLASSMAP:
+#endif
if (classmap != NULL)
{
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@@ -1505,11 +1597,11 @@ do
for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
for (c = 128; c < 256; c++)
{
- if ((classmap[c/8] & (1 << (c&7))) != 0)
+ if ((classmap[c/8] & (1u << (c&7))) != 0)
{
- int d = (c >> 6) | 0xc0; /* Set bit for this starter */
- re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */
- c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
+ int d = (c >> 6) | 0xc0; /* Set bit for this starter */
+ re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */
+ c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
}
}
}
@@ -1567,7 +1659,9 @@ return yield;
/* This function is handed a compiled expression that it must study to produce
information that will speed up the matching.
-Argument: points to the compiled expression
+Argument:
+ re points to the compiled expression
+
Returns: 0 normally; non-zero should never normally occur
1 unknown opcode in set_start_bits
2 missing capturing bracket
@@ -1577,10 +1671,10 @@ Returns: 0 normally; non-zero should never normally occur
int
PRIV(study)(pcre2_real_code *re)
{
-int min;
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */
@@ -1593,27 +1687,124 @@ code units. */
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
- int rc = set_start_bits(re, code, utf);
+ int depth = 0;
+ int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1;
- if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
+
+ /* If a list of starting code units was set up, scan the list to see if only
+ one or two were listed. Having only one listed is rare because usually a
+ single starting code unit will have been recognized and PCRE2_FIRSTSET set.
+ If two are listed, see if they are caseless versions of the same character;
+ if so we can replace the list with a caseless first code unit. This gives
+ better performance and is plausibly worth doing for patterns such as [Ww]ord
+ or (word|WORD). */
+
+ if (rc == SSB_DONE)
+ {
+ int i;
+ int a = -1;
+ int b = -1;
+ uint8_t *p = re->start_bitmap;
+ uint32_t flags = PCRE2_FIRSTMAPSET;
+
+ for (i = 0; i < 256; p++, i += 8)
+ {
+ uint8_t x = *p;
+ if (x != 0)
+ {
+ int c;
+ uint8_t y = x & (~x + 1); /* Least significant bit */
+ if (y != x) goto DONE; /* More than one bit set */
+
+ /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
+ all wide characters", so we cannot use it here. */
+
+#if PCRE2_CODE_UNIT_WIDTH != 8
+ if (i == 248 && x == 0x80) goto DONE;
+#endif
+
+ /* Compute the character value */
+
+ c = i;
+ switch (x)
+ {
+ case 1: break;
+ case 2: c += 1; break; case 4: c += 2; break;
+ case 8: c += 3; break; case 16: c += 4; break;
+ case 32: c += 5; break; case 64: c += 6; break;
+ case 128: c += 7; break;
+ }
+
+ /* c contains the code unit value, in the range 0-255. In 8-bit UTF
+ mode, only values < 128 can be used. In all the other cases, c is a
+ character value. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (utf && c > 127) goto DONE;
+#endif
+ if (a < 0) a = c; /* First one found, save in a */
+ else if (b < 0) /* Second one found */
+ {
+ int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
+
+#ifdef SUPPORT_UNICODE
+ if (utf || ucp)
+ {
+ if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
+ if (c > 127) d = UCD_OTHERCASE(c);
+ }
+#endif /* SUPPORT_UNICODE */
+
+ if (d != a) goto DONE; /* Not the other case of a */
+ b = c; /* Save second in b */
+ }
+ else goto DONE; /* More than two characters found */
+ }
+ }
+
+ /* Replace the start code unit bits with a first code unit, but only if it
+ is not the same as a required later code unit. This is because a search for
+ a required code unit starts after an explicit first code unit, but at a
+ code unit found from the bitmap. Patterns such as /a*a/ don't work
+ if both the start unit and required unit are the same. */
+
+ if (a >= 0 &&
+ (
+ (re->flags & PCRE2_LASTSET) == 0 ||
+ (
+ re->last_codeunit != (uint32_t)a &&
+ (b < 0 || re->last_codeunit != (uint32_t)b)
+ )
+ ))
+ {
+ re->first_codeunit = a;
+ flags = PCRE2_FIRSTSET;
+ if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
+ }
+
+ DONE:
+ re->flags |= flags;
+ }
}
/* Find the minimum length of subject string. If the pattern can match an empty
-string, the minimum length is already known. If there are more back references
-than the size of the vector we are going to cache them in, do nothing. A
-pattern that complicated will probably take a long time to analyze and may in
-any case turn out to be too complicated. Note that back reference minima are
-held as 16-bit numbers. */
-
-if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
+string, the minimum length is already known. If the pattern contains (*ACCEPT)
+all bets are off, and we don't even try to find a minimum length. If there are
+more back references than the size of the vector we are going to cache them in,
+do nothing. A pattern that complicated will probably take a long time to
+analyze and may in any case turn out to be too complicated. Note that back
+reference minima are held as 16-bit numbers. */
+
+if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
re->top_backref <= MAX_CACHE_BACKREF)
{
+ int min;
int backref_cache[MAX_CACHE_BACKREF+1];
backref_cache[0] = 0; /* Highest one that is set */
min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
switch(min)
{
- case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
+ case -1: /* \C in UTF mode or over-complex regex */
break; /* Leave minlength unchanged (will be zero) */
case -2:
@@ -1623,8 +1814,7 @@ if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
return 3; /* unrecognized opcode */
default:
- if (min > UINT16_MAX) min = UINT16_MAX;
- re->minlength = min;
+ re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
break;
}
}