summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2019-05-28 16:34:28 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2019-05-28 16:34:28 +0000
commit8dc2afce13214b29b1c901bcc6954b71313cdff3 (patch)
tree07a5aec97b22789a02844a8c0dc05ced8be56392 /src
parent0db7752db847fecaf0717312d3fd284b2b69ca26 (diff)
downloadpcre2-8dc2afce13214b29b1c901bcc6954b71313cdff3.tar.gz
Tweak limits on "must have" code unit searches (improves some performance).
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1097 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src')
-rw-r--r--src/pcre2_dfa_match.c52
-rw-r--r--src/pcre2_internal.h9
-rw-r--r--src/pcre2_match.c75
3 files changed, 82 insertions, 54 deletions
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index d20f02a..911e9b9 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -3294,10 +3294,10 @@ time. */
if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
return PCRE2_ERROR_BADOPTION;
-
+
/* Invalid UTF support is not available for DFA matching. */
-if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
+if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
return PCRE2_ERROR_DFA_UINVALID_UTF;
/* Check that the first field in the block is the magic number. If it is not,
@@ -3658,7 +3658,7 @@ for (;;)
while (start_match < end_subject && UCHAR21TEST(start_match) !=
first_cu)
start_match++;
-#else
+#else /* 8-bit code units */
start_match = memchr(start_match, first_cu, end_subject - start_match);
if (start_match == NULL) start_match = end_subject;
#endif
@@ -3745,6 +3745,8 @@ for (;;)
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
{
+ PCRE2_SPTR p;
+
/* The minimum matching length is a lower bound; no actual string of that
length may actually match the pattern. Although the value is, strictly,
in characters, we treat it as code units to avoid spending too much time
@@ -3758,37 +3760,63 @@ for (;;)
point. This optimization can save a huge amount of backtracking in
patterns with nested unlimited repeats that aren't going to match.
Writing separate code for cased/caseless versions makes it go faster, as
- does using an autoincrement and backing off on a match.
+ does using an autoincrement and backing off on a match. As in the case of
+ the first code unit, using memchr() in the 8-bit library gives a big
+ speed up. Unlike the first_cu check above, we do not need to call
+ memchr() twice in the caseless case because we only need to check for the
+ presence of the character in either case, not find the first occurrence.
+
+ The search can be skipped if the code unit was found later than the
+ current starting point in a previous iteration of the bumpalong loop.
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
patterns. This showed up when somebody was matching something like
/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
- sufficiently long. */
+ sufficiently long, but it's worth searching a lot more for unanchored
+ patterns. */
- if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+ p = start_match + (has_first_cu? 1:0);
+ if (has_req_cu && p > req_cu_ptr)
{
- PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
-
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
+ PCRE2_SIZE check_length = end_subject - start_match;
- if (p > req_cu_ptr)
+ if (check_length < REQ_CU_MAX ||
+ (!anchored && check_length < REQ_CU_MAX * 1000))
{
- if (req_cu != req_cu2)
+ if (req_cu != req_cu2) /* Caseless */
{
+#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject)
{
uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; }
}
+#else /* 8-bit code units */
+ PCRE2_SPTR pp = p;
+ p = memchr(pp, req_cu, end_subject - pp);
+ if (p == NULL)
+ {
+ p = memchr(pp, req_cu2, end_subject - pp);
+ if (p == NULL) p = end_subject;
+ }
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
}
+
+ /* The caseful case */
+
else
{
+#if PCRE2_CODE_UNIT_WIDTH != 8
while (p < end_subject)
{
if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
}
+
+#else /* 8-bit code units */
+ p = memchr(p, req_cu, end_subject - p);
+ if (p == NULL) p = end_subject;
+#endif
}
/* If we can't find the required code unit, break the matching loop,
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index 814d91b..7fd8044 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -535,13 +535,14 @@ enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
/* The maximum remaining length of subject we are prepared to search for a
-req_unit match. In 8-bit mode, memchr() is used and is much faster than the
-search loop that has to be used in 16-bit and 32-bit modes. */
+req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is
+much faster than the search loop that has to be used in 16-bit and 32-bit
+modes. */
#if PCRE2_CODE_UNIT_WIDTH == 8
-#define REQ_CU_MAX 2000
+#define REQ_CU_MAX 5000
#else
-#define REQ_CU_MAX 1000
+#define REQ_CU_MAX 2000
#endif
/* Offsets for the bitmap tables in the cbits set of tables. Each table
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index b2dae2d..849bb58 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6783,6 +6783,8 @@ for(;;)
if (!mb->partial)
{
+ PCRE2_SPTR p;
+
/* The minimum matching length is a lower bound; no string of that length
may actually match the pattern. Although the value is, strictly, in
characters, we treat it as code units to avoid spending too much time in
@@ -6806,60 +6808,57 @@ for(;;)
memchr() twice in the caseless case because we only need to check for the
presence of the character in either case, not find the first occurrence.
+ The search can be skipped if the code unit was found later than the
+ current starting point in a previous iteration of the bumpalong loop.
+
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
- patterns. This showed up when somebody was matching something like
- /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
- sufficiently long. */
+ anchored patterns. This showed up when somebody was matching something
+ like /^\d+C/ on a 32-megabyte string... so we don't do this when the
+ string is sufficiently long, but it's worth searching a lot more for
+ unanchored patterns. */
- if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+ p = start_match + (has_first_cu? 1:0);
+ if (has_req_cu && p > req_cu_ptr)
{
- PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+ PCRE2_SIZE check_length = end_subject - start_match;
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it last time round the bumpalong loop. */
-
- if (p > req_cu_ptr)
+ if (check_length < REQ_CU_MAX ||
+ (!anchored && check_length < REQ_CU_MAX * 1000))
{
- if (p < end_subject)
+ if (req_cu != req_cu2) /* Caseless */
{
- if (req_cu != req_cu2) /* Caseless */
- {
#if PCRE2_CODE_UNIT_WIDTH != 8
- do
- {
- uint32_t pp = UCHAR21INCTEST(p);
- if (pp == req_cu || pp == req_cu2) { p--; break; }
- }
- while (p < end_subject);
-
+ while (p < end_subject)
+ {
+ uint32_t pp = UCHAR21INCTEST(p);
+ if (pp == req_cu || pp == req_cu2) { p--; break; }
+ }
#else /* 8-bit code units */
- PCRE2_SPTR pp = p;
- p = memchr(pp, req_cu, end_subject - pp);
- if (p == NULL)
- {
- p = memchr(pp, req_cu2, end_subject - pp);
- if (p == NULL) p = end_subject;
- }
-#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
+ PCRE2_SPTR pp = p;
+ p = memchr(pp, req_cu, end_subject - pp);
+ if (p == NULL)
+ {
+ p = memchr(pp, req_cu2, end_subject - pp);
+ if (p == NULL) p = end_subject;
}
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
+ }
- /* The caseful case */
+ /* The caseful case */
- else
- {
+ else
+ {
#if PCRE2_CODE_UNIT_WIDTH != 8
- do
- {
- if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
- }
- while (p < end_subject);
+ while (p < end_subject)
+ {
+ if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
+ }
#else /* 8-bit code units */
- p = memchr(p, req_cu, end_subject - p);
- if (p == NULL) p = end_subject;
+ p = memchr(p, req_cu, end_subject - p);
+ if (p == NULL) p = end_subject;
#endif
- }
}
/* If we can't find the required code unit, break the bumpalong loop,