From 86d6fcadb912bd04e5bb511a8188871eb12e4274 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 23 Dec 2011 19:46:10 -0700 Subject: regcomp.c: Rework join_exact() This re formats and refactors portions of join_exact() that look for the tricky Greek fold sequences. I renamed various variables, etc, to help me understand what was going on. It turns out that there were two off-by-one bugs that prevented this from working properly. The first bug had the loop quit one too soon The boundary should be "<=", and not strictly less-than. This means that if the sequence is the last thing in the string (or only thing) it will not be found. The other bug had the end-needle parameter be 1 too short, which means that this would succeed with only the first 3 bytes of the sequence (now called 'tail'), thus matching many more things than it should (provided it got the chance to match at all given the first bug). --- regcomp.c | 109 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 50 deletions(-) (limited to 'regcomp.c') diff --git a/regcomp.c b/regcomp.c index 9e7960cf84..02a6f75131 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2599,61 +2599,70 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 * can now analyze for sequences of problematic code points. (Prior to * this final joining, sequences could have been split over boundaries, and * hence missed). The sequences only happen in folding */ + if (OP(scan) != EXACT) { + char *s, *t; + char * s0 = STRING(scan); + char * const s_end = s0 + STR_LEN(scan); + + /* First we look at the sequences that can occur only in UTF-8 strings. + * The sequences are of length 6 */ + if (UTF && STR_LEN(scan) >= 6) { + + /* Two problematic code points in Unicode casefolding of EXACT + * nodes: + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * What we'll do is to look for the tail four bytes, and then peek + * at the preceding two bytes to see whether we need to decrease + * the minimum length by four (six minus two). + * + * Thanks to the design of UTF-8, there cannot be false matches: + * A sequence of valid UTF-8 bytes cannot be a subsequence of + * another valid sequence of UTF-8 bytes. */ - if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) - && ( STR_LEN(scan) >= 6 ) ) - { - /* - Two problematic code points in Unicode casefolding of EXACT nodes: - - U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - - which casefold to - - Unicode UTF-8 - - U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - - This means that in case-insensitive matching (or "loose matching", - as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte - length of the above casefolded versions) can match a target string - of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). - This would rather mess up the minimum length computation. - - What we'll do is to look for the tail four bytes, and then peek - at the preceding two bytes to see whether we need to decrease - the minimum length by four (six minus two). - - Thanks to the design of UTF-8, there cannot be false matches: - A sequence of valid UTF-8 bytes cannot be a subsequence of - another valid sequence of UTF-8 bytes. - - */ - char * const s0 = STRING(scan), *s, *t; - char * const s1 = s0 + STR_LEN(scan) - 1; - char * const s2 = s1 - 4; #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char t0[] = "\xaf\x49\xaf\x42"; -#else - const char t0[] = "\xcc\x88\xcc\x81"; -#endif - const char * const t1 = t0 + 3; - - for (s = s0 + 2; - s < s2 && (t = ninstr(s, s1, t0, t1)); - s = t + 4) { -#ifdef EBCDIC - if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || - ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) + const char U390_first_byte = '\xb4'; + const char U390_2nd_byte = '\x68'; + const char U3B0_first_byte = '\xb5'; + const char U3B0_2nd_byte = '\x46'; + const char tail[] = "\xaf\x49\xaf\x42"; #else - if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || - ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) + const char U390_first_byte = '\xce'; + const char U390_2nd_byte = '\xb9'; + const char U3B0_first_byte = '\xcf'; + const char U3B0_2nd_byte = '\x85'; + const char tail[] = "\xcc\x88\xcc\x81"; #endif + const STRLEN tail_len = sizeof(tail) - 1; + for (s = s0 + 2; /* +2 is to skip the non-tail */ + s <= s_end - tail_len + && (t = ninstr(s, s_end, tail, tail + tail_len)); + s = t + tail_len) + { + if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte) + || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte)) + { *min_change -= 4; - } + } + } + } } #ifdef DEBUGGING -- cgit v1.2.1