diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-12-13 22:01:46 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-12-15 16:26:00 -0700 |
commit | 094a2f8c3da82fac9e0698c2daeb7e94d0ae765a (patch) | |
tree | 377042bb7ad310d7b0b1cf66079e80e5d8743e0a /pp.c | |
parent | 81c6c7ce308a6bd705e6d8343eb996df5a938aa5 (diff) | |
download | perl-094a2f8c3da82fac9e0698c2daeb7e94d0ae765a.tar.gz |
pp.c: Changing case of utf8 strings under locale uses locale for < 256
As proposed on p5p and approved, this changes the functions uc(), lc(),
ucfirst(), and lcfirst() to respect locale for code points < 256, and to
use Unicode semantics for those above 255. This gives better, but still
not perfect, results, as noted in the changed pods, and brings these
functions into line with how regular expression pattern matching already
works.
Diffstat (limited to 'pp.c')
-rw-r--r-- | pp.c | 33 |
1 files changed, 29 insertions, 4 deletions
@@ -3481,6 +3481,7 @@ PP(pp_ucfirst) STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or * lowercased) character stored in tmpbuf. May be either * UTF-8 or not, but in either case is the number of bytes */ + bool tainted = FALSE; SvGETMAGIC(source); if (SvOK(source)) { @@ -3508,8 +3509,14 @@ PP(pp_ucfirst) else if (DO_UTF8(source)) { /* Is the source utf8? */ doing_utf8 = TRUE; ulen = UTF8SKIP(s); - if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen); - else toLOWER_utf8(s, tmpbuf, &tculen); + if (op_type == OP_UCFIRST) { + _to_utf8_title_flags(s, tmpbuf, &tculen, + cBOOL(IN_LOCALE_RUNTIME), &tainted); + } + else { + _to_utf8_lower_flags(s, tmpbuf, &tculen, + cBOOL(IN_LOCALE_RUNTIME), &tainted); + } /* we can't do in-place if the length changes. */ if (ulen != tculen) inplace = FALSE; @@ -3641,6 +3648,11 @@ PP(pp_ucfirst) Copy(tmpbuf, d, tculen, U8); SvCUR_set(dest, need - 1); } + + if (tainted) { + TAINT; + SvTAINTED_on(dest); + } } else { /* Neither source nor dest are in or need to be UTF-8 */ if (slen) { @@ -3746,6 +3758,7 @@ PP(pp_uc) if (DO_UTF8(source)) { const U8 *const send = s + len; U8 tmpbuf[UTF8_MAXBYTES+1]; + bool tainted = FALSE; /* All occurrences of these are to be moved to follow any other marks. * This is context-dependent. 
We may not be passed enough context to @@ -3777,7 +3790,8 @@ PP(pp_uc) * and copy it to the output buffer */ u = UTF8SKIP(s); - uv = toUPPER_utf8(s, tmpbuf, &ulen); + uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, + cBOOL(IN_LOCALE_RUNTIME), &tainted); if (uv == GREEK_CAPITAL_LETTER_IOTA && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI) { @@ -3807,7 +3821,12 @@ PP(pp_uc) } SvUTF8_on(dest); *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + if (tainted) { + TAINT; + SvTAINTED_on(dest); + } } else { /* Not UTF-8 */ if (len) { @@ -3976,12 +3995,14 @@ PP(pp_lc) if (DO_UTF8(source)) { const U8 *const send = s + len; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; + bool tainted = FALSE; while (s < send) { const STRLEN u = UTF8SKIP(s); STRLEN ulen; - toLOWER_utf8(s, tmpbuf, &ulen); + _to_utf8_lower_flags(s, tmpbuf, &ulen, + cBOOL(IN_LOCALE_RUNTIME), &tainted); /* Here is where we would do context-sensitive actions. See the * commit message for this comment for why there isn't any */ @@ -4011,6 +4032,10 @@ PP(pp_lc) SvUTF8_on(dest); *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + if (tainted) { + TAINT; + SvTAINTED_on(dest); + } } else { /* Not utf8 */ if (len) { const U8 *const send = s + len; |