summary refs log tree commit diff
path: root/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'regcomp.c')
-rw-r--r-- regcomp.c 109
1 files changed, 59 insertions, 50 deletions
diff --git a/regcomp.c b/regcomp.c
index 9e7960cf84..02a6f75131 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2599,61 +2599,70 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32
* can now analyze for sequences of problematic code points. (Prior to
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding */
+ if (OP(scan) != EXACT) {
+ char *s, *t;
+ char * s0 = STRING(scan);
+ char * const s_end = s0 + STR_LEN(scan);
+
+ /* First we look at the sequences that can occur only in UTF-8 strings.
+ * Each of these sequences is 6 bytes long */
+ if (UTF && STR_LEN(scan) >= 6) {
+
+ /* Two problematic code points in Unicode casefolding of EXACT
+ * nodes:
+ *
+ * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ *
+ * which casefold to
+ *
+ * Unicode UTF-8
+ *
+ * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+ *
+ * This means that in case-insensitive matching (or "loose
+ * matching", as Unicode calls it), an EXACTF of length six (the
+ * UTF-8 encoded byte length of the above casefolded versions) can
+ * match a target string of length two (the byte length of UTF-8
+ * encoded U+0390 or U+03B0). This would rather mess up the
+ * minimum length computation. (There are other code points that
+ * also fold to these two sequences, but the delta is smaller.)
+ *
+ * What we'll do is to look for the tail four bytes, and then peek
+ * at the preceding two bytes to see whether we need to decrease
+ * the minimum length by four (six minus two).
+ *
+ * Thanks to the design of UTF-8, there cannot be false matches:
+ * A sequence of valid UTF-8 bytes cannot be a subsequence of
+ * another valid sequence of UTF-8 bytes. */
- if (UTF
- && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
- && ( STR_LEN(scan) >= 6 ) )
- {
- /*
- Two problematic code points in Unicode casefolding of EXACT nodes:
-
- U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
- U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-
- which casefold to
-
- Unicode UTF-8
-
- U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
- U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
-
- This means that in case-insensitive matching (or "loose matching",
- as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
- length of the above casefolded versions) can match a target string
- of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
- This would rather mess up the minimum length computation.
-
- What we'll do is to look for the tail four bytes, and then peek
- at the preceding two bytes to see whether we need to decrease
- the minimum length by four (six minus two).
-
- Thanks to the design of UTF-8, there cannot be false matches:
- A sequence of valid UTF-8 bytes cannot be a subsequence of
- another valid sequence of UTF-8 bytes.
-
- */
- char * const s0 = STRING(scan), *s, *t;
- char * const s1 = s0 + STR_LEN(scan) - 1;
- char * const s2 = s1 - 4;
#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
- const char t0[] = "\xaf\x49\xaf\x42";
-#else
- const char t0[] = "\xcc\x88\xcc\x81";
-#endif
- const char * const t1 = t0 + 3;
-
- for (s = s0 + 2;
- s < s2 && (t = ninstr(s, s1, t0, t1));
- s = t + 4) {
-#ifdef EBCDIC
- if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
- ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+ const char U390_first_byte = '\xb4';
+ const char U390_2nd_byte = '\x68';
+ const char U3B0_first_byte = '\xb5';
+ const char U3B0_2nd_byte = '\x46';
+ const char tail[] = "\xaf\x49\xaf\x42";
#else
- if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
- ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ const char U390_first_byte = '\xce';
+ const char U390_2nd_byte = '\xb9';
+ const char U3B0_first_byte = '\xcf';
+ const char U3B0_2nd_byte = '\x85';
+ const char tail[] = "\xcc\x88\xcc\x81";
#endif
+ const STRLEN tail_len = sizeof(tail) - 1;
+ for (s = s0 + 2; /* +2 is to skip the non-tail */
+ s <= s_end - tail_len
+ && (t = ninstr(s, s_end, tail, tail + tail_len));
+ s = t + tail_len)
+ {
+ if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+ || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+ {
*min_change -= 4;
- }
+ }
+ }
+ }
}
#ifdef DEBUGGING