From 86d6fcadb912bd04e5bb511a8188871eb12e4274 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Fri, 23 Dec 2011 19:46:10 -0700
Subject: regcomp.c: Rework join_exact()

This re formats and refactors portions of join_exact() that look for the
tricky Greek fold sequences.  I renamed various variables, etc, to help
me understand what was going on.  It turns out that there were two
off-by-one bugs that prevented this from working properly.

The first bug had the loop quit one too soon  The boundary should be
"<=", and not strictly less-than.  This means that if the sequence is
the last thing in the string (or only thing) it will not be found.
The other bug had the end-needle parameter be 1 too short, which means
that this would succeed with  only the first 3 bytes of the sequence
(now called 'tail'), thus matching many more things than it should
(provided it got the chance to match at all given the first bug).
---
 regcomp.c | 109 ++++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 59 insertions(+), 50 deletions(-)

(limited to 'regcomp.c')

diff --git a/regcomp.c b/regcomp.c
index 9e7960cf84..02a6f75131 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2599,61 +2599,70 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32
      * can now analyze for sequences of problematic code points.  (Prior to
      * this final joining, sequences could have been split over boundaries, and
      * hence missed).  The sequences only happen in folding */
+    if (OP(scan) != EXACT) {
+        char *s, *t;
+        char * s0 = STRING(scan);
+        char * const s_end = s0 + STR_LEN(scan);
+
+	/* First we look at the sequences that can occur only in UTF-8 strings.
+	 * The sequences are of length 6 */
+	if (UTF && STR_LEN(scan) >= 6) {
+
+	    /* Two problematic code points in Unicode casefolding of EXACT
+	     * nodes:
+	     *
+	     * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+	     * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+	     *
+	     * which casefold to
+	     *
+	     * Unicode                      UTF-8
+	     *
+	     * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
+	     * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
+             *
+	     * This means that in case-insensitive matching (or "loose
+	     * matching", as Unicode calls it), an EXACTF of length six (the
+	     * UTF-8 encoded byte length of the above casefolded versions) can
+	     * match a target string of length two (the byte length of UTF-8
+	     * encoded U+0390 or U+03B0).  This would rather mess up the
+	     * minimum length computation.  (there are other code points that
+	     * also fold to these two sequences, but the delta is smaller)
+	     *
+	     * What we'll do is to look for the tail four bytes, and then peek
+	     * at the preceding two bytes to see whether we need to decrease
+	     * the minimum length by four (six minus two).
+	     *
+	     * Thanks to the design of UTF-8, there cannot be false matches:
+	     * A sequence of valid UTF-8 bytes cannot be a subsequence of
+	     * another valid sequence of UTF-8 bytes. */
 
-    if (UTF
-	&& ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
-	&& ( STR_LEN(scan) >= 6 ) )
-    {
-    /*
-    Two problematic code points in Unicode casefolding of EXACT nodes:
-    
-    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
-    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-    
-    which casefold to
-    
-    Unicode                      UTF-8
-    
-    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
-    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
-    
-    This means that in case-insensitive matching (or "loose matching",
-    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
-    length of the above casefolded versions) can match a target string
-    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
-    This would rather mess up the minimum length computation.
-    
-    What we'll do is to look for the tail four bytes, and then peek
-    at the preceding two bytes to see whether we need to decrease
-    the minimum length by four (six minus two).
-    
-    Thanks to the design of UTF-8, there cannot be false matches:
-    A sequence of valid UTF-8 bytes cannot be a subsequence of
-    another valid sequence of UTF-8 bytes.
-    
-    */
-         char * const s0 = STRING(scan), *s, *t;
-         char * const s1 = s0 + STR_LEN(scan) - 1;
-         char * const s2 = s1 - 4;
 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-	 const char t0[] = "\xaf\x49\xaf\x42";
-#else
-         const char t0[] = "\xcc\x88\xcc\x81";
-#endif
-         const char * const t1 = t0 + 3;
-    
-         for (s = s0 + 2;
-              s < s2 && (t = ninstr(s, s1, t0, t1));
-              s = t + 4) {
-#ifdef EBCDIC
-	      if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
-		  ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+	    const char U390_first_byte = '\xb4';
+	    const char U390_2nd_byte = '\x68';
+	    const char U3B0_first_byte = '\xb5';
+	    const char U3B0_2nd_byte = '\x46';
+	    const char tail[] = "\xaf\x49\xaf\x42";
 #else
-              if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
-                  ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+	    const char U390_first_byte = '\xce';
+	    const char U390_2nd_byte = '\xb9';
+	    const char U3B0_first_byte = '\xcf';
+	    const char U3B0_2nd_byte = '\x85';
+	    const char tail[] = "\xcc\x88\xcc\x81";
 #endif
+	    const STRLEN tail_len = sizeof(tail) - 1;
+	    for (s = s0 + 2;    /* +2 is to skip the non-tail */
+		 s <= s_end - tail_len
+		   && (t = ninstr(s, s_end, tail, tail + tail_len));
+		 s = t + tail_len)
+	    {
+		if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+		    || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+		{
 		    *min_change -= 4;
-         }
+		}
+	    }
+	}
     }
 
 #ifdef DEBUGGING
-- 
cgit v1.2.1