summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-12-23 19:46:10 -0700
committerKarl Williamson <public@khwilliamson.com>2012-01-19 11:58:19 -0700
commit86d6fcadb912bd04e5bb511a8188871eb12e4274 (patch)
tree51f71c9c5a88ef9f3ab17af6bdd4500ccb5a1b95 /regcomp.c
parentca600955b124a74a55428f5d252241f4119646d4 (diff)
downloadperl-86d6fcadb912bd04e5bb511a8188871eb12e4274.tar.gz
regcomp.c: Rework join_exact()
This re formats and refactors portions of join_exact() that look for the tricky Greek fold sequences. I renamed various variables, etc, to help me understand what was going on. It turns out that there were two off-by-one bugs that prevented this from working properly. The first bug had the loop quit one too soon The boundary should be "<=", and not strictly less-than. This means that if the sequence is the last thing in the string (or only thing) it will not be found. The other bug had the end-needle parameter be 1 too short, which means that this would succeed with only the first 3 bytes of the sequence (now called 'tail'), thus matching many more things than it should (provided it got the chance to match at all given the first bug).
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c109
1 files changed, 59 insertions, 50 deletions
diff --git a/regcomp.c b/regcomp.c
index 9e7960cf84..02a6f75131 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2599,61 +2599,70 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32
* can now analyze for sequences of problematic code points. (Prior to
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding */
+ if (OP(scan) != EXACT) {
+ char *s, *t;
+ char * s0 = STRING(scan);
+ char * const s_end = s0 + STR_LEN(scan);
+
+ /* First we look at the sequences that can occur only in UTF-8 strings.
+ * The sequences are of length 6 */
+ if (UTF && STR_LEN(scan) >= 6) {
+
+ /* Two problematic code points in Unicode casefolding of EXACT
+ * nodes:
+ *
+ * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ *
+ * which casefold to
+ *
+ * Unicode UTF-8
+ *
+ * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+ *
+ * This means that in case-insensitive matching (or "loose
+ * matching", as Unicode calls it), an EXACTF of length six (the
+ * UTF-8 encoded byte length of the above casefolded versions) can
+ * match a target string of length two (the byte length of UTF-8
+ * encoded U+0390 or U+03B0). This would rather mess up the
+ * minimum length computation. (there are other code points that
+ * also fold to these two sequences, but the delta is smaller)
+ *
+ * What we'll do is to look for the tail four bytes, and then peek
+ * at the preceding two bytes to see whether we need to decrease
+ * the minimum length by four (six minus two).
+ *
+ * Thanks to the design of UTF-8, there cannot be false matches:
+ * A sequence of valid UTF-8 bytes cannot be a subsequence of
+ * another valid sequence of UTF-8 bytes. */
- if (UTF
- && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
- && ( STR_LEN(scan) >= 6 ) )
- {
- /*
- Two problematic code points in Unicode casefolding of EXACT nodes:
-
- U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
- U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-
- which casefold to
-
- Unicode UTF-8
-
- U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
- U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
-
- This means that in case-insensitive matching (or "loose matching",
- as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
- length of the above casefolded versions) can match a target string
- of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
- This would rather mess up the minimum length computation.
-
- What we'll do is to look for the tail four bytes, and then peek
- at the preceding two bytes to see whether we need to decrease
- the minimum length by four (six minus two).
-
- Thanks to the design of UTF-8, there cannot be false matches:
- A sequence of valid UTF-8 bytes cannot be a subsequence of
- another valid sequence of UTF-8 bytes.
-
- */
- char * const s0 = STRING(scan), *s, *t;
- char * const s1 = s0 + STR_LEN(scan) - 1;
- char * const s2 = s1 - 4;
#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
- const char t0[] = "\xaf\x49\xaf\x42";
-#else
- const char t0[] = "\xcc\x88\xcc\x81";
-#endif
- const char * const t1 = t0 + 3;
-
- for (s = s0 + 2;
- s < s2 && (t = ninstr(s, s1, t0, t1));
- s = t + 4) {
-#ifdef EBCDIC
- if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
- ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+ const char U390_first_byte = '\xb4';
+ const char U390_2nd_byte = '\x68';
+ const char U3B0_first_byte = '\xb5';
+ const char U3B0_2nd_byte = '\x46';
+ const char tail[] = "\xaf\x49\xaf\x42";
#else
- if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
- ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ const char U390_first_byte = '\xce';
+ const char U390_2nd_byte = '\xb9';
+ const char U3B0_first_byte = '\xcf';
+ const char U3B0_2nd_byte = '\x85';
+ const char tail[] = "\xcc\x88\xcc\x81";
#endif
+ const STRLEN tail_len = sizeof(tail) - 1;
+ for (s = s0 + 2; /* +2 is to skip the non-tail */
+ s <= s_end - tail_len
+ && (t = ninstr(s, s_end, tail, tail + tail_len));
+ s = t + tail_len)
+ {
+ if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+ || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+ {
*min_change -= 4;
- }
+ }
+ }
+ }
}
#ifdef DEBUGGING