diff options
author | Karl Williamson <khw@cpan.org> | 2014-12-29 13:15:57 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2014-12-29 13:52:57 -0700 |
commit | 613abc6d16e99bd9834fe6afd79beb61a3a4734d (patch) | |
tree | 3d2287cfb7ac08c691a642d58b96a2daa76f999d | |
parent | dbf3c4d788344c8d20eb2549c638ced519d3f0e8 (diff) | |
download | perl-613abc6d16e99bd9834fe6afd79beb61a3a4734d.tar.gz |
Raise warning on multi-byte char in single-byte locale
See http://nntp.perl.org/group/perl.perl5.porters/211909
Something is quite likely wrong with the logic if say in a Greek locale,
Unicode characters (especially Greek ones) are encountered. The same
character will be represented by two different code points. This
warning alerts the user to this undesirable state of affairs.
-rw-r--r-- | perl.h | 23 | ||||
-rw-r--r-- | pod/perldelta.pod | 4 | ||||
-rw-r--r-- | pod/perldiag.pod | 14 | ||||
-rw-r--r-- | pod/perllocale.pod | 5 | ||||
-rw-r--r-- | regexec.c | 44 | ||||
-rw-r--r-- | t/lib/warnings/regexec | 26 | ||||
-rw-r--r-- | t/lib/warnings/utf8 | 28 | ||||
-rw-r--r-- | t/re/charset.t | 2 | ||||
-rw-r--r-- | t/re/pat_advanced.t | 1 | ||||
-rw-r--r-- | utf8.c | 3 |
10 files changed, 146 insertions, 4 deletions
@@ -5798,6 +5798,27 @@ typedef struct am_table_short AMTS; } STMT_END + /* These two internal macros are called when a warning should be raised, + * and will do so if enabled. The first takes a single code point + * argument; the 2nd, is a pointer to the first byte of the UTF-8 encoded + * string, and an end position which it won't try to read past */ +# define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(cp) \ + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ + "Wide character (U+%"UVXf") in %s", (UV) cp, OP_DESC(PL_op)); + +# define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(s, send) \ + STMT_START { /* Check if to warn before doing the conversion work */\ + if (ckWARN(WARN_LOCALE)) { \ + UV cp = utf8_to_uvchr_buf((U8 *) s, (U8 *) send, NULL); \ + Perl_warner(aTHX_ packWARN(WARN_LOCALE), \ + "Wide character (U+%"UVXf") in %s", \ + (cp == 0) \ + ? UNICODE_REPLACEMENT \ + : (UV) cp, \ + OP_DESC(PL_op)); \ + } \ + } STMT_END + # endif /* PERL_CORE or PERL_IN_XSUB_RE */ #else /* No locale usage */ @@ -5816,6 +5837,8 @@ typedef struct am_table_short AMTS; # define IN_LC(category) 0 # define _CHECK_AND_WARN_PROBLEMATIC_LOCALE +# define _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(a) +# define _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(a,b) #endif #ifdef USE_LOCALE_NUMERIC diff --git a/pod/perldelta.pod b/pod/perldelta.pod index a49456576a..6a830b92ff 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -209,6 +209,10 @@ XXX L<message|perldiag/"message"> XXX L<message|perldiag/"message"> +=item * + +L<Wide character (U+%X) in %s|perldiag/"Wide character (U+%X) in %s"> + =back =head2 Changes to Existing Diagnostics diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 63df68d591..4979da2da3 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -6964,6 +6964,20 @@ warning is to add C<no warnings 'utf8';> but that is often closer to cheating. In general, you are supposed to explicitly mark the filehandle with an encoding, see L<open> and L<perlfunc/binmode>. +=item Wide character (U+%X) in %s + +(W locale) While in a single-byte locale (I<i.e.>, a non-UTF-8 +one), a multi-byte character was encountered. Perl considers this +character to be the specified Unicode code point. Combining non-UTF8 +locales and Unicode is dangerous. Almost certainly some characters +will have two different representations. For example, in the ISO 8859-7 +(Greek) locale, the code point 0xC3 represents a Capital Gamma. But so +also does 0x393. This will make string comparisons unreliable. + +You likely need to figure out how this multi-byte character got mixed up +with your single-byte locale (or perhaps you thought you had a UTF-8 +locale, but Perl disagrees). + =item Within []-length '%c' not allowed (F) The count in the (un)pack template may be replaced by C<[TEMPLATE]> diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 17fddcb40d..3b2d79dcc8 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -1519,6 +1519,11 @@ Still another problem is that this approach can lead to two code points meaning the same character. Thus in a Greek locale, both U+03A7 and U+00D7 are GREEK CAPITAL LETTER CHI. +Because of all these problems, starting in v5.22, Perl will raise a +warning if a multi-byte (hence Unicode) code point is used when a +single-byte locale is in effect. (Although it doesn't check for this if +doing so would unreasonably slow execution down.) + Vendor locales are notoriously buggy, and it is difficult for Perl to test its locale-handling code because this interacts with code that Perl has no control over; therefore the locale-handling code in Perl may be buggy as @@ -512,6 +512,8 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character)); + if (classnum < _FIRST_NON_SWASH_CC) { /* Initialize the swash unless done already */ @@ -1457,6 +1459,9 @@ STMT_START { switch (trie_type) { \ case trie_flu8: \ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \ + } \ goto do_trie_utf8_fold; \ case trie_utf8_exactfa_fold: \ flags |= FOLD_FLAGS_NOMIX_ASCII; \ @@ -1495,6 +1500,9 @@ STMT_START { break; \ case trie_utf8l: \ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \ + } \ /* FALLTHROUGH */ \ case trie_utf8: \ uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ @@ -1819,7 +1827,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, UTF-8 to express. */ break; } - utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED + | FOLDEQ_S2_FOLDS_SANE; goto do_exactf_utf8; case EXACTFU: @@ -4185,6 +4194,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (scan->flags == EXACTL || scan->flags == EXACTFLU8) { _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (utf8_target + && UTF8_IS_ABOVE_LATIN1(nextchr) + && scan->flags == EXACTL) + { + /* We only output for EXACTL, as we let the folder + * output this message for EXACTFLU8 to avoid + * duplication */ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, + reginfo->strend); + } } if ( trie->bitmap && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) @@ -4461,6 +4480,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case EXACTL: /* /abc/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + /* Complete checking would involve going through every character + * matched by the string to see if any is above latin1. But the + * comparision otherwise might very well be a fast assembly + * language routine, and I (khw) don't think slowing things down + * just to check for this warning is worth it. So this just checks + * the first character */ + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) { + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); + } /* FALLTHROUGH */ case EXACT: { /* /abc/ */ char *s = STRING(scan); @@ -4560,7 +4589,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (! utf8_target) { sayNO; } - fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED; + fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED + | FOLDEQ_S1_FOLDS_SANE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ @@ -4758,6 +4788,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { /* Here, must be an above Latin-1 code point */ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); goto utf8_posix_above_latin1; } @@ -7231,6 +7262,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; case EXACTL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); + } /* FALLTHROUGH */ case EXACT: assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); @@ -7318,7 +7352,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! utf8_target) { break; } - utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; + utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED + | FOLDEQ_S2_FOLDS_SANE; goto do_exactf; case EXACTFU_SS: @@ -7733,6 +7768,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_ALLOW_FFFF */ if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) { + _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); + } } /* If this character is potentially in the bitmap, check it */ diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index 73696dfb1d..0c6a16a5ba 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -117,3 +117,29 @@ $_ = 'a' x (2**15+1); # EXPECT +######## +# NAME Wide character in non-UTF-8 locale +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); +"\x{100}" =~ /\x{100}|\x{101}/il; +"\x{100}" =~ /\x{100}|\x{101}/l; +"\x{100}" =~ /\w/l; +"\x{100}" =~ /\x{100}+/l; +"\x{100}" =~ /[\x{100}\x{102}]/l; +no warnings 'locale'; +EXPECT +Wide character (U+100) in pattern match (m//) at - line 8. +Wide character (U+100) in pattern match (m//) at - line 8. +Wide character (U+100) in pattern match (m//) at - line 9. +Wide character (U+100) in pattern match (m//) at - line 9. +Wide character (U+100) in pattern match (m//) at - line 9. +Wide character (U+100) in pattern match (m//) at - line 10. +Wide character (U+100) in pattern match (m//) at - line 10. +Wide character (U+100) in pattern match (m//) at - line 11. +Wide character (U+100) in pattern match (m//) at - line 12. +Wide character (U+100) in pattern match (m//) at - line 12. diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index abce3d1365..75f3f25782 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -612,3 +612,31 @@ Can't do fc("\x{FB05}") on non-UTF-8 locale; resolved to "\x{FB06}". at - line 1 Can't do uc("\x{FB00}") on non-UTF-8 locale; resolved to "\x{FB00}". at - line 13. Can't do ucfirst("\x{149}") on non-UTF-8 locale; resolved to "\x{149}". at - line 14. Can't do lcfirst("\x{178}") on non-UTF-8 locale; resolved to "\x{178}". at - line 15. +######## +# NAME Wide character in non-UTF-8 locale +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +use warnings 'locale'; +use feature 'fc'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); +my $a; +$a = lc("\x{100}"); +$a = lcfirst("\x{101}"); +$a = fc("\x{102}"); +$a = uc("\x{103}"); +$a = ucfirst("\x{104}"); +no warnings 'locale'; +$a = lc("\x{100}"); +$a = lcfirst("\x{101}"); +$a = fc("\x{102}"); +$a = uc("\x{103}"); +$a = ucfirst("\x{104}"); +EXPECT +Wide character (U+100) in lc at - line 10. +Wide character (U+101) in lcfirst at - line 11. +Wide character (U+102) in fc at - line 12. +Wide character (U+103) in uc at - line 13. +Wide character (U+104) in ucfirst at - line 14. diff --git a/t/re/charset.t b/t/re/charset.t index 4d0d99cfdc..e06191620d 100644 --- a/t/re/charset.t +++ b/t/re/charset.t @@ -9,6 +9,8 @@ BEGIN { use strict; use warnings; +no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure + # they work, even though they warn. use Config; plan('no_plan'); diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index c210e2e06e..19d6fbce07 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -866,6 +866,7 @@ sub run_tests { ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; my $loc_re = qq /(?l:^([^X]*)X)/; utf8::upgrade ($loc_re); + no warnings 'locale'; ok "\x{100}X" =~ /$loc_re/, "locale, S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; } @@ -1914,7 +1914,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c s += UTF8SKIP(s); } - /* Here, no characters crossed, result is ok as-is */ + /* Here, no characters crossed, result is ok as-is, but we warn. */ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p)); return result; } |