Raise warning on multi-byte char in single-byte locale

See http://nntp.perl.org/group/perl.perl5.porters/211909 Something is quite likely wrong with the logic if say in a Greek locale, Unicode characters (especially Greek ones) are encountered. The same character will be represented by two different code points. This warning alerts the user to this undesirable state of affairs.
author: Karl Williamson <khw@cpan.org> 2014-12-29 13:15:57 -0700
committer: Karl Williamson <khw@cpan.org> 2014-12-29 13:52:57 -0700
commit: 613abc6d16e99bd9834fe6afd79beb61a3a4734d (patch)
tree: 3d2287cfb7ac08c691a642d58b96a2daa76f999d
parent: dbf3c4d788344c8d20eb2549c638ced519d3f0e8 (diff)
download: perl-613abc6d16e99bd9834fe6afd79beb61a3a4734d.tar.gz
10 files changed, 146 insertions, 4 deletions
diff --git a/perl.h b/perl.h
index 35624b5918..89a7d43f03 100644
--- a/perl.h
+++ b/perl.h
@@ -5798,6 +5798,27 @@ typedef struct am_table_short AMTS;
         }  STMT_END
 
 
+    /* These two internal macros are called when a warning should be raised,
+     * and will do so if enabled.  The first takes a single code point
+     * argument; the 2nd, is a pointer to the first byte of the UTF-8 encoded
+     * string, and an end position which it won't try to read past */
+#   define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(cp)                         \
+      Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                           \
+             "Wide character (U+%"UVXf") in %s", (UV) cp, OP_DESC(PL_op));
+
+#  define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(s, send)                   \
+	STMT_START { /* Check if to warn before doing the conversion work */\
+            if (ckWARN(WARN_LOCALE)) {                                      \
+                UV cp = utf8_to_uvchr_buf((U8 *) s, (U8 *) send, NULL);     \
+                Perl_warner(aTHX_ packWARN(WARN_LOCALE),                    \
+                    "Wide character (U+%"UVXf") in %s",                     \
+                    (cp == 0)                                               \
+                     ? UNICODE_REPLACEMENT                                  \
+                     : (UV) cp,                                             \
+                    OP_DESC(PL_op));                                        \
+            }                                                               \
+        }  STMT_END
+
 #   endif   /* PERL_CORE or PERL_IN_XSUB_RE */
 
 #else   /* No locale usage */
@@ -5816,6 +5837,8 @@ typedef struct am_table_short AMTS;
 #   define IN_LC(category)                  0
 
 #   define _CHECK_AND_WARN_PROBLEMATIC_LOCALE
+#   define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(a)
+#   define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(a,b)
 #endif
 
 #ifdef USE_LOCALE_NUMERIC
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index a49456576a..6a830b92ff 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -209,6 +209,10 @@ XXX L<message|perldiag/"message">
 
 XXX L<message|perldiag/"message">
 
+=item *
+
+L<Wide character (U+%X) in %s|perldiag/"Wide character (U+%X) in %s">
+
 =back
 
 =head2 Changes to Existing Diagnostics
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 63df68d591..4979da2da3 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -6964,6 +6964,20 @@ warning is to add C<no warnings 'utf8';> but that is often closer to
 cheating.  In general, you are supposed to explicitly mark the
 filehandle with an encoding, see L<open> and L<perlfunc/binmode>.
 
+=item Wide character (U+%X) in %s
+
+(W locale) While in a single-byte locale (I<i.e.>, a non-UTF-8
+one), a multi-byte character was encountered.   Perl considers this
+character to be the specified Unicode code point.  Combining non-UTF8
+locales and Unicode is dangerous.  Almost certainly some characters
+will have two different representations.  For example, in the ISO 8859-7
+(Greek) locale, the code point 0xC3 represents a Capital Gamma.  But so
+also does 0x393.  This will make string comparisons unreliable.
+
+You likely need to figure out how this multi-byte character got mixed up
+with your single-byte locale (or perhaps you thought you had a UTF-8
+locale, but Perl disagrees).
+
 =item Within []-length '%c' not allowed
 
 (F) The count in the (un)pack template may be replaced by C<[TEMPLATE]>
diff --git a/pod/perllocale.pod b/pod/perllocale.pod
index 17fddcb40d..3b2d79dcc8 100644
--- a/pod/perllocale.pod
+++ b/pod/perllocale.pod
@@ -1519,6 +1519,11 @@ Still another problem is that this approach can lead to two code
 points meaning the same character.  Thus in a Greek locale, both U+03A7
 and U+00D7 are GREEK CAPITAL LETTER CHI.
 
+Because of all these problems, starting in v5.22, Perl will raise a
+warning if a multi-byte (hence Unicode) code point is used when a
+single-byte locale is in effect.  (Although it doesn't check for this if
+doing so would unreasonably slow execution down.)
+
 Vendor locales are notoriously buggy, and it is difficult for Perl to test
 its locale-handling code because this interacts with code that Perl has no
 control over; therefore the locale-handling code in Perl may be buggy as
diff --git a/regexec.c b/regexec.c
index 776cfd517e..e659f4ba35 100644
--- a/regexec.c
+++ b/regexec.c
@@ -512,6 +512,8 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
                         TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
     }
 
+    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
+
     if (classnum < _FIRST_NON_SWASH_CC) {
 
         /* Initialize the swash unless done already */
@@ -1457,6 +1459,9 @@ STMT_START {
     switch (trie_type) {                                                            \
     case trie_flu8:                                                                 \
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
+        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
+        }                                                                           \
         goto do_trie_utf8_fold;                                                     \
     case trie_utf8_exactfa_fold:                                                    \
         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
@@ -1495,6 +1500,9 @@ STMT_START {
         break;                                                                      \
     case trie_utf8l:                                                                \
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
+        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
+        }                                                                           \
         /* FALLTHROUGH */                                                           \
     case trie_utf8:                                                                 \
         uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
@@ -1819,7 +1827,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                        UTF-8 to express.  */
                 break;
             }
-            utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+            utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+                                             | FOLDEQ_S2_FOLDS_SANE;
             goto do_exactf_utf8;
 
     case EXACTFU:
@@ -4185,6 +4194,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+                    if (utf8_target
+                        && UTF8_IS_ABOVE_LATIN1(nextchr)
+                        && scan->flags == EXACTL)
+                    {
+                        /* We only output for EXACTL, as we let the folder
+                         * output this message for EXACTFLU8 to avoid
+                         * duplication */
+                        _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+                                                               reginfo->strend);
+                    }
                 }
                 if (   trie->bitmap
                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
@@ -4461,6 +4480,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	case EXACTL:             /*  /abc/l       */
             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+            /* Complete checking would involve going through every character
+             * matched by the string to see if any is above latin1.  But the
+             * comparision otherwise might very well be a fast assembly
+             * language routine, and I (khw) don't think slowing things down
+             * just to check for this warning is worth it.  So this just checks
+             * the first character */
+            if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+            }
             /* FALLTHROUGH */
 	case EXACT: {            /*  /abc/        */
 	    char *s = STRING(scan);
@@ -4560,7 +4589,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             if (! utf8_target) {
                 sayNO;
             }
-	    fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED;
+            fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
+                                             | FOLDEQ_S1_FOLDS_SANE;
 	    goto do_exactf;
 
 	case EXACTFU_SS:         /*  /\x{df}/iu   */
@@ -4758,6 +4788,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 }
             }
             else { /* Here, must be an above Latin-1 code point */
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
                 goto utf8_posix_above_latin1;
             }
 
@@ -7231,6 +7262,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
 	break;
     case EXACTL:
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+        if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
+        }
         /* FALLTHROUGH */
     case EXACT:
         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
@@ -7318,7 +7352,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         if (! utf8_target) {
             break;
         }
-	utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
+        utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+                                    | FOLDEQ_S2_FOLDS_SANE;
         goto do_exactf;
 
     case EXACTFU_SS:
@@ -7733,6 +7768,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
 		 * UTF8_ALLOW_FFFF */
 	if (c_len == (STRLEN)-1)
 	    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+        if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+            _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
+        }
     }
 
     /* If this character is potentially in the bitmap, check it */
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 73696dfb1d..0c6a16a5ba 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -117,3 +117,29 @@ $_ = 'a' x (2**15+1);
 #
 EXPECT
 
+########
+# NAME Wide character in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+    print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"\x{100}" =~ /\x{100}|\x{101}/il;
+"\x{100}" =~ /\x{100}|\x{101}/l;
+"\x{100}" =~ /\w/l;
+"\x{100}" =~ /\x{100}+/l;
+"\x{100}" =~ /[\x{100}\x{102}]/l;
+no warnings 'locale';
+EXPECT
+Wide character (U+100) in pattern match (m//) at - line 8.
+Wide character (U+100) in pattern match (m//) at - line 8.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 9.
+Wide character (U+100) in pattern match (m//) at - line 10.
+Wide character (U+100) in pattern match (m//) at - line 10.
+Wide character (U+100) in pattern match (m//) at - line 11.
+Wide character (U+100) in pattern match (m//) at - line 12.
+Wide character (U+100) in pattern match (m//) at - line 12.
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index abce3d1365..75f3f25782 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -612,3 +612,31 @@ Can't do fc("\x{FB05}") on non-UTF-8 locale; resolved to "\x{FB06}". at - line 1
 Can't do uc("\x{FB00}") on non-UTF-8 locale; resolved to "\x{FB00}". at - line 13.
 Can't do ucfirst("\x{149}") on non-UTF-8 locale; resolved to "\x{149}". at - line 14.
 Can't do lcfirst("\x{178}") on non-UTF-8 locale; resolved to "\x{178}". at - line 15.
+########
+# NAME Wide character in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+    print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use feature 'fc';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+my $a;
+$a = lc("\x{100}");
+$a = lcfirst("\x{101}");
+$a = fc("\x{102}");
+$a = uc("\x{103}");
+$a = ucfirst("\x{104}");
+no warnings 'locale';
+$a = lc("\x{100}");
+$a = lcfirst("\x{101}");
+$a = fc("\x{102}");
+$a = uc("\x{103}");
+$a = ucfirst("\x{104}");
+EXPECT
+Wide character (U+100) in lc at - line 10.
+Wide character (U+101) in lcfirst at - line 11.
+Wide character (U+102) in fc at - line 12.
+Wide character (U+103) in uc at - line 13.
+Wide character (U+104) in ucfirst at - line 14.
diff --git a/t/re/charset.t b/t/re/charset.t
index 4d0d99cfdc..e06191620d 100644
--- a/t/re/charset.t
+++ b/t/re/charset.t
@@ -9,6 +9,8 @@ BEGIN {
 
 use strict;
 use warnings;
+no warnings 'locale';   # Some /l tests use above-latin1 chars to make sure
+                        # they work, even though they warn.
 use Config;
 
 plan('no_plan');
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index c210e2e06e..19d6fbce07 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -866,6 +866,7 @@ sub run_tests {
         ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED";
         my $loc_re = qq /(?l:^([^X]*)X)/;
         utf8::upgrade ($loc_re);
+        no warnings 'locale';
         ok "\x{100}X" =~ /$loc_re/, "locale, S_cl_and ANYOF_UNICODE & ANYOF_INVERTED";
     }
 
diff --git a/utf8.c b/utf8.c
index b5470a8c71..8551e111de 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1914,7 +1914,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
 	    s += UTF8SKIP(s);
 	}
 
-	/* Here, no characters crossed, result is ok as-is */
+        /* Here, no characters crossed, result is ok as-is, but we warn. */
+        _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
 	return result;
     }
author	Karl Williamson <khw@cpan.org>	2014-12-29 13:15:57 -0700
committer	Karl Williamson <khw@cpan.org>	2014-12-29 13:52:57 -0700
commit	613abc6d16e99bd9834fe6afd79beb61a3a4734d (patch)
tree	3d2287cfb7ac08c691a642d58b96a2daa76f999d
parent	dbf3c4d788344c8d20eb2549c638ced519d3f0e8 (diff)
download	perl-613abc6d16e99bd9834fe6afd79beb61a3a4734d.tar.gz