regexec.c: PATCH: [perl #114808]

Commit c72077c4fff72b66cdde1621c62fb4fd383ce093 fixed a place where to_byte_substr() fails, but the code continued as if it had succeeded. There is yet another place where the return is not checked. This commit adds a check there. However, it turns out that there is another underlying problem to [perl #114808]. The function to_byte_substr() tries to downgrade the substr fields in the regex program it is passed. If it fails (because something in it is expressible only in UTF-8), it permanently changes that field to point to PL_sv_undef, thus losing the original information. This is fine as long as the program will be used once and discarded. However, there are places where the program is re-used, as in the test case introduced by this commit, and the original value has been lost. To solve this, this commit also changes to_byte_substr() from returning void to instead returning bool, indicating success or failure. On failure, the original substrs are left intact. The calls to this function are correspondingly changed. One of them had a trace statement when the failure happens, I reworded it to be more general and accurate (it was slightly misleading), and added the trace to every such place, not just the one. In addition, I found the use of the same ternary operation in 3 or 4 consecutive lines very hard to understand; and is inefficient unless compiled under C optimization which avoids recalculating things. So I expanded the several nearly identical places in the code that do that so that I could quickly see what is going on.
author: Karl Williamson <public@khwilliamson.com> 2012-10-06 10:06:57 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-10-06 13:50:14 -0600
commit: 7e0d5ad7c9cdb21b681e611b888acd41d34c4d05 (patch)
tree: 2d99a40ca92780839184bb4783b7948fe81a659a /regexec.c
parent: c72077c4fff72b66cdde1621c62fb4fd383ce093 (diff)
download: perl-7e0d5ad7c9cdb21b681e611b888acd41d34c4d05.tar.gz
1 files changed, 85 insertions, 38 deletions
diff --git a/regexec.c b/regexec.c
index 350f2937ae..ea3a1b05fb 100644
--- a/regexec.c
+++ b/regexec.c
@@ -37,6 +37,11 @@
 #include "re_top.h"
 #endif
 
+/* At least one required character in the target string is expressible only in
+ * UTF-8. */
+const char* const non_utf8_target_but_utf8_required
+                = "Can't match, because target string needs to be in UTF-8\n";
+
 /*
  * pregcomp and pregexec -- regsub and regerror are not used in perl
  *
@@ -630,15 +635,15 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 	    to_utf8_substr(prog);
 	check = prog->check_utf8;
     } else {
-	if (!prog->check_substr && prog->check_utf8)
-	    to_byte_substr(prog);
+	if (!prog->check_substr && prog->check_utf8) {
+	    if (! to_byte_substr(prog)) {
+                DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+                                        non_utf8_target_but_utf8_required));
+                goto fail;
+            }
+        }
 	check = prog->check_substr;
     }
-    if (check == &PL_sv_undef) {
-	DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
-		"Non-utf8 string cannot match utf8 check string\n"));
-	goto fail;
-    }
     if (prog->extflags & RXf_ANCH) {	/* Match at beg-of-str or after \n */
 	ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 		     || ( (prog->extflags & RXf_ANCH_BOL)
@@ -2317,11 +2322,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 #ifdef DEBUGGING
 	int did_match = 0;
 #endif
-	if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
-	    utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
-	ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
-
 	if (utf8_target) {
+            if (! prog->anchored_utf8) {
+                to_utf8_substr(prog);
+            }
+            ch = SvPVX_const(prog->anchored_utf8)[0];
 	    REXEC_FBC_SCAN(
 		if (*s == ch) {
 		    DEBUG_EXECUTE_r( did_match = 1 );
@@ -2331,8 +2336,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 			s += UTF8SKIP(s);
 		}
 	    );
+
 	}
 	else {
+            if (! prog->anchored_substr) {
+                if (! to_byte_substr(prog)) {
+                    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+                                            non_utf8_target_but_utf8_required));
+                    goto phooey;
+                }
+            }
+            ch = SvPVX_const(prog->anchored_substr)[0];
 	    REXEC_FBC_SCAN(
 		if (*s == ch) {
 		    DEBUG_EXECUTE_r( did_match = 1 );
@@ -2361,23 +2375,44 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 	int did_match = 0;
 #endif
 	if (prog->anchored_substr || prog->anchored_utf8) {
-	    if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
-		utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
-	    must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
+	    if (utf8_target) {
+                if (! prog->anchored_utf8) {
+                    to_utf8_substr(prog);
+                }
+                must = prog->anchored_utf8;
+            }
+            else {
+                if (! prog->anchored_substr) {
+                    if (! to_byte_substr(prog)) {
+                        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+                                            non_utf8_target_but_utf8_required));
+                        goto phooey;
+                    }
+                }
+                must = prog->anchored_substr;
+            }
 	    back_max = back_min = prog->anchored_offset;
 	} else {
-	    if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
-		utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
-	    must = utf8_target ? prog->float_utf8 : prog->float_substr;
+	    if (utf8_target) {
+                if (! prog->float_utf8) {
+                    to_utf8_substr(prog);
+                }
+                must = prog->float_utf8;
+            }
+            else {
+                if (! prog->float_substr) {
+                    if (! to_byte_substr(prog)) {
+                        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+                                            non_utf8_target_but_utf8_required));
+                        goto phooey;
+                    }
+                }
+                must = prog->float_substr;
+            }
 	    back_max = prog->float_max_offset;
 	    back_min = prog->float_min_offset;
 	}
-	
 	    
-	if (must == &PL_sv_undef)
-	    /* could not downgrade utf8 check substring, so must fail */
-	    goto phooey;
-
         if (back_min<0) {
 	    last = strend;
 	} else {
@@ -2471,16 +2506,22 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 	    STRLEN len;
 	    const char *little;
 
-	    if (utf8_target && !prog->float_utf8)
-		to_utf8_substr(prog);
-	    else if (!utf8_target && !prog->float_substr) {
-		to_byte_substr(prog);
-		if (prog->float_substr == &PL_sv_undef)
-		    /* downgrading failed, but target is not utf8, so
-		     * matching must fail */
-		    goto phooey;
-	    }
-	    float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
+	    if (utf8_target) {
+                if (! prog->float_utf8) {
+                    to_utf8_substr(prog);
+                }
+                float_real = prog->float_utf8;
+            }
+            else {
+                if (! prog->float_substr) {
+                    if (! to_byte_substr(prog)) {
+                        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+                                            non_utf8_target_but_utf8_required));
+                        goto phooey;
+                    }
+                }
+                float_real = prog->float_substr;
+            }
 
             little = SvPV_const(float_real, len);
 	    if (SvTAIL(float_real)) {
@@ -7357,6 +7398,9 @@ restore_pos(pTHX_ void *arg)
 STATIC void
 S_to_utf8_substr(pTHX_ register regexp *prog)
 {
+    /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
+     * on the converted value */
+
     int i = 1;
 
     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
@@ -7385,9 +7429,12 @@ S_to_utf8_substr(pTHX_ register regexp *prog)
     } while (i--);
 }
 
-STATIC void
+STATIC bool
 S_to_byte_substr(pTHX_ register regexp *prog)
 {
+    /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
+     * on the converted value; returns FALSE if can't be converted. */
+
     dVAR;
     int i = 1;
 
@@ -7397,7 +7444,9 @@ S_to_byte_substr(pTHX_ register regexp *prog)
 	if (prog->substrs->data[i].utf8_substr
 	    && !prog->substrs->data[i].substr) {
 	    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
-	    if (sv_utf8_downgrade(sv, TRUE)) {
+	    if (! sv_utf8_downgrade(sv, TRUE)) {
+                return FALSE;
+            }
 		if (SvVALID(prog->substrs->data[i].utf8_substr)) {
 		    if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
 			/* Trim the trailing \n that fbm_compile added last
@@ -7407,15 +7456,13 @@ S_to_byte_substr(pTHX_ register regexp *prog)
 		    } else
 			fbm_compile(sv, 0);
 		}
-	    } else {
-		SvREFCNT_dec(sv);
-		sv = &PL_sv_undef;
-	    }
 	    prog->substrs->data[i].substr = sv;
 	    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
 		prog->check_substr = sv;
 	}
     } while (i--);
+
+    return TRUE;
 }
 
 /* These constants are for finding GCB=LV and GCB=LVT.  These are for the
author	Karl Williamson <public@khwilliamson.com>	2012-10-06 10:06:57 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-10-06 13:50:14 -0600
commit	7e0d5ad7c9cdb21b681e611b888acd41d34c4d05 (patch)
tree	2d99a40ca92780839184bb4783b7948fe81a659a /regexec.c
parent	c72077c4fff72b66cdde1621c62fb4fd383ce093 (diff)
download	perl-7e0d5ad7c9cdb21b681e611b888acd41d34c4d05.tar.gz