summaryrefslogtreecommitdiff
path: root/pp.c
diff options
context:
space:
mode:
author Karl Williamson <public@khwilliamson.com> 2011-12-13 22:01:46 -0700
committer Karl Williamson <public@khwilliamson.com> 2011-12-15 16:26:00 -0700
commit 094a2f8c3da82fac9e0698c2daeb7e94d0ae765a (patch)
tree 377042bb7ad310d7b0b1cf66079e80e5d8743e0a /pp.c
parent 81c6c7ce308a6bd705e6d8343eb996df5a938aa5 (diff)
download perl-094a2f8c3da82fac9e0698c2daeb7e94d0ae765a.tar.gz
pp.c: Changing case of utf8 strings under locale uses locale for < 256
As proposed on p5p and approved, this changes the functions uc(), lc(), ucfirst(), and lcfirst() to respect locale for code points < 256, and to use Unicode semantics for those above 255. This gives better, but not perfect, results, as noted in the changed pods, and brings these functions into line with how regular expression pattern matching already works.
Diffstat (limited to 'pp.c')
-rw-r--r--pp.c33
1 files changed, 29 insertions, 4 deletions
diff --git a/pp.c b/pp.c
index d55c7fdf95..56a69db246 100644
--- a/pp.c
+++ b/pp.c
@@ -3481,6 +3481,7 @@ PP(pp_ucfirst)
STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or
* lowercased) character stored in tmpbuf. May be either
* UTF-8 or not, but in either case is the number of bytes */
+ bool tainted = FALSE;
SvGETMAGIC(source);
if (SvOK(source)) {
@@ -3508,8 +3509,14 @@ PP(pp_ucfirst)
else if (DO_UTF8(source)) { /* Is the source utf8? */
doing_utf8 = TRUE;
ulen = UTF8SKIP(s);
- if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
- else toLOWER_utf8(s, tmpbuf, &tculen);
+ if (op_type == OP_UCFIRST) {
+ _to_utf8_title_flags(s, tmpbuf, &tculen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
+ }
+ else {
+ _to_utf8_lower_flags(s, tmpbuf, &tculen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
+ }
/* we can't do in-place if the length changes. */
if (ulen != tculen) inplace = FALSE;
@@ -3641,6 +3648,11 @@ PP(pp_ucfirst)
Copy(tmpbuf, d, tculen, U8);
SvCUR_set(dest, need - 1);
}
+
+ if (tainted) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
}
else { /* Neither source nor dest are in or need to be UTF-8 */
if (slen) {
@@ -3746,6 +3758,7 @@ PP(pp_uc)
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES+1];
+ bool tainted = FALSE;
/* All occurrences of these are to be moved to follow any other marks.
* This is context-dependent. We may not be passed enough context to
@@ -3777,7 +3790,8 @@ PP(pp_uc)
* and copy it to the output buffer */
u = UTF8SKIP(s);
- uv = toUPPER_utf8(s, tmpbuf, &ulen);
+ uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
if (uv == GREEK_CAPITAL_LETTER_IOTA
&& utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
{
@@ -3807,7 +3821,12 @@ PP(pp_uc)
}
SvUTF8_on(dest);
*d = '\0';
+
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ if (tainted) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
}
else { /* Not UTF-8 */
if (len) {
@@ -3976,12 +3995,14 @@ PP(pp_lc)
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+ bool tainted = FALSE;
while (s < send) {
const STRLEN u = UTF8SKIP(s);
STRLEN ulen;
- toLOWER_utf8(s, tmpbuf, &ulen);
+ _to_utf8_lower_flags(s, tmpbuf, &ulen,
+ cBOOL(IN_LOCALE_RUNTIME), &tainted);
/* Here is where we would do context-sensitive actions. See the
* commit message for this comment for why there isn't any */
@@ -4011,6 +4032,10 @@ PP(pp_lc)
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ if (tainted) {
+ TAINT;
+ SvTAINTED_on(dest);
+ }
} else { /* Not utf8 */
if (len) {
const U8 *const send = s + len;