More UTF-8 EXACT tweaking, plus a forgotten UTF-8 toggle-on from the encoding pragma.

p4raw-id: //depot/perl@12872
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-11-06 15:18:41 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-11-06 15:18:41 +0000
commit: 5ff6fc6d3e84f8da3756f8b5246037f5e410021e (patch)
tree: e444f6a59e0e558603b22a98b941eb0c9a68481f
parent: c75a1282313a5941ab098f0e18a0c22e45fa4362 (diff)
download: perl-5ff6fc6d3e84f8da3756f8b5246037f5e410021e.tar.gz
2 files changed, 18 insertions, 20 deletions
diff --git a/regcomp.c b/regcomp.c
index cd3857eb2b..12e03959fd 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1764,7 +1764,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
     pm->op_pmflags = RExC_flags16;
     if (UTF)
-	r->reganch |= ROPT_UTF8;
+        r->reganch |= ROPT_UTF8;	/* Unicode in it? */
     r->regstclass = NULL;
     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
 	r->reganch |= ROPT_NAUGHTY;
@@ -3168,6 +3168,7 @@ tryagain:
 	      RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
 	 } else
 	      RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+	 RExC_utf8 = 1;
     }
 
     return(ret);
diff --git a/regexec.c b/regexec.c
index 60d93f7ad7..712c4d9b47 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2204,43 +2204,40 @@ S_regmatch(pTHX_ regnode *prog)
 	    s = STRING(scan);
 	    ln = STR_LEN(scan);
 	    if (do_utf8 != (UTF!=0)) {
+		/* The target and the pattern have differing "utf8ness". */
 		char *l = locinput;
 		char *e = s + ln;
 		STRLEN len;
 
-		if (do_utf8)
+		if (do_utf8) {
+		    /* The target is utf8, the pattern is not utf8. */
 		    while (s < e) {
-			UV uv;
-
 			if (l >= PL_regeol)
-			    sayNO;
-			uv = NATIVE_TO_UNI(*(U8*)s);
-			if (UTF8_IS_START(uv)) {
-			     len = UTF8SKIP(s);
-			     if (memNE(s, l, len))
-				  sayNO;
-			     l += len;
-			     s += len;
-			} else {
-			     if (uv != utf8_to_uvchr((U8*)l, &len))
-				  sayNO;
-			     l += len;
-			     s ++;
-			}
+			     sayNO;
+			if (NATIVE_TO_UNI(*(U8*)s) !=
+			    utf8_to_uvchr((U8*)l, &len))
+			     sayNO;
+			l += len;
+			s ++;
 		    }
-		else
+		}
+		else {
+		    /* The target is not utf8, the pattern is utf8. */
 		    while (s < e) {
 			if (l >= PL_regeol)
 			    sayNO;
-			if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
+			if (NATIVE_TO_UNI(*((U8*)l)) !=
+			    utf8_to_uvchr((U8*)s, &len))
 			    sayNO;
 			s += len;
 			l ++;
 		    }
+		}
 		locinput = l;
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
+	    /* The target and the pattern have the same "utf8ness". */
 	    /* Inline the first character, for speed. */
 	    if (UCHARAT(s) != nextchr)
 		sayNO;
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-11-06 15:18:41 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-11-06 15:18:41 +0000
commit	5ff6fc6d3e84f8da3756f8b5246037f5e410021e (patch)
tree	e444f6a59e0e558603b22a98b941eb0c9a68481f
parent	c75a1282313a5941ab098f0e18a0c22e45fa4362 (diff)
download	perl-5ff6fc6d3e84f8da3756f8b5246037f5e410021e.tar.gz