summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-11-06 15:18:41 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-11-06 15:18:41 +0000
commit: 5ff6fc6d3e84f8da3756f8b5246037f5e410021e (patch)
tree: e444f6a59e0e558603b22a98b941eb0c9a68481f
parent: c75a1282313a5941ab098f0e18a0c22e45fa4362 (diff)
download: perl-5ff6fc6d3e84f8da3756f8b5246037f5e410021e.tar.gz
More UTF-8 EXACT tweaking, plus a forgotten UTF-8 toggle-on from the encoding pragma.
p4raw-id: //depot/perl@12872
-rw-r--r-- regcomp.c 3
-rw-r--r-- regexec.c 35
2 files changed, 18 insertions, 20 deletions
diff --git a/regcomp.c b/regcomp.c
index cd3857eb2b..12e03959fd 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1764,7 +1764,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
pm->op_pmflags = RExC_flags16;
if (UTF)
- r->reganch |= ROPT_UTF8;
+ r->reganch |= ROPT_UTF8; /* Unicode in it? */
r->regstclass = NULL;
if (RExC_naughty >= 10) /* Probably an expensive pattern. */
r->reganch |= ROPT_NAUGHTY;
@@ -3168,6 +3168,7 @@ tryagain:
RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
} else
RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+ RExC_utf8 = 1;
}
return(ret);
diff --git a/regexec.c b/regexec.c
index 60d93f7ad7..712c4d9b47 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2204,43 +2204,40 @@ S_regmatch(pTHX_ regnode *prog)
s = STRING(scan);
ln = STR_LEN(scan);
if (do_utf8 != (UTF!=0)) {
+ /* The target and the pattern have differing "utf8ness". */
char *l = locinput;
char *e = s + ln;
STRLEN len;
- if (do_utf8)
+ if (do_utf8) {
+ /* The target is utf8, the pattern is not utf8. */
while (s < e) {
- UV uv;
-
if (l >= PL_regeol)
- sayNO;
- uv = NATIVE_TO_UNI(*(U8*)s);
- if (UTF8_IS_START(uv)) {
- len = UTF8SKIP(s);
- if (memNE(s, l, len))
- sayNO;
- l += len;
- s += len;
- } else {
- if (uv != utf8_to_uvchr((U8*)l, &len))
- sayNO;
- l += len;
- s ++;
- }
+ sayNO;
+ if (NATIVE_TO_UNI(*(U8*)s) !=
+ utf8_to_uvchr((U8*)l, &len))
+ sayNO;
+ l += len;
+ s ++;
}
- else
+ }
+ else {
+ /* The target is not utf8, the pattern is utf8. */
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
+ if (NATIVE_TO_UNI(*((U8*)l)) !=
+ utf8_to_uvchr((U8*)s, &len))
sayNO;
s += len;
l ++;
}
+ }
locinput = l;
nextchr = UCHARAT(locinput);
break;
}
+ /* The target and the pattern have the same "utf8ness". */
/* Inline the first character, for speed. */
if (UCHARAT(s) != nextchr)
sayNO;