pp.c: Changing case of UTF-8 strings under locale uses locale for code points < 256

As proposed on p5p and approved, this changes the functions uc(), lc(), ucfirst(), and lcfirst() to respect the locale for code points below 256 (i.e., 0 through 255) and to use Unicode semantics for those above 255. This gives better, though not perfect, results, as noted in the changed pods, and brings these functions into line with how regular expression pattern matching already works.
author: Karl Williamson <public@khwilliamson.com> 2011-12-13 22:01:46 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-12-15 16:26:00 -0700
commit: 094a2f8c3da82fac9e0698c2daeb7e94d0ae765a (patch)
tree: 377042bb7ad310d7b0b1cf66079e80e5d8743e0a /pp.c
parent: 81c6c7ce308a6bd705e6d8343eb996df5a938aa5 (diff)
download: perl-094a2f8c3da82fac9e0698c2daeb7e94d0ae765a.tar.gz
1 files changed, 29 insertions, 4 deletions
diff --git a/pp.c b/pp.c
index d55c7fdf95..56a69db246 100644
--- a/pp.c
+++ b/pp.c
@@ -3481,6 +3481,7 @@ PP(pp_ucfirst)
     STRLEN tculen;  /* tculen is the byte length of the freshly titlecased (or
 		     * lowercased) character stored in tmpbuf.  May be either
 		     * UTF-8 or not, but in either case is the number of bytes */
+    bool tainted = FALSE;
 
     SvGETMAGIC(source);
     if (SvOK(source)) {
@@ -3508,8 +3509,14 @@ PP(pp_ucfirst)
     else if (DO_UTF8(source)) {	/* Is the source utf8? */
 	doing_utf8 = TRUE;
         ulen = UTF8SKIP(s);
-        if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
-        else toLOWER_utf8(s, tmpbuf, &tculen);
+        if (op_type == OP_UCFIRST) {
+	    _to_utf8_title_flags(s, tmpbuf, &tculen,
+				 cBOOL(IN_LOCALE_RUNTIME), &tainted);
+	}
+        else {
+	    _to_utf8_lower_flags(s, tmpbuf, &tculen,
+				 cBOOL(IN_LOCALE_RUNTIME), &tainted);
+	}
 
         /* we can't do in-place if the length changes.  */
         if (ulen != tculen) inplace = FALSE;
@@ -3641,6 +3648,11 @@ PP(pp_ucfirst)
 	    Copy(tmpbuf, d, tculen, U8);
 	    SvCUR_set(dest, need - 1);
 	}
+
+	if (tainted) {
+	    TAINT;
+	    SvTAINTED_on(dest);
+	}
     }
     else {  /* Neither source nor dest are in or need to be UTF-8 */
 	if (slen) {
@@ -3746,6 +3758,7 @@ PP(pp_uc)
     if (DO_UTF8(source)) {
 	const U8 *const send = s + len;
 	U8 tmpbuf[UTF8_MAXBYTES+1];
+	bool tainted = FALSE;
 
 	/* All occurrences of these are to be moved to follow any other marks.
 	 * This is context-dependent.  We may not be passed enough context to
@@ -3777,7 +3790,8 @@ PP(pp_uc)
              * and copy it to the output buffer */
 
             u = UTF8SKIP(s);
-            uv = toUPPER_utf8(s, tmpbuf, &ulen);
+            uv = _to_utf8_upper_flags(s, tmpbuf, &ulen,
+				      cBOOL(IN_LOCALE_RUNTIME), &tainted);
             if (uv == GREEK_CAPITAL_LETTER_IOTA
                 && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
             {
@@ -3807,7 +3821,12 @@ PP(pp_uc)
 	}
 	SvUTF8_on(dest);
 	*d = '\0';
+
 	SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+	if (tainted) {
+	    TAINT;
+	    SvTAINTED_on(dest);
+	}
     }
     else {	/* Not UTF-8 */
 	if (len) {
@@ -3976,12 +3995,14 @@ PP(pp_lc)
     if (DO_UTF8(source)) {
 	const U8 *const send = s + len;
 	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+	bool tainted = FALSE;
 
 	while (s < send) {
 	    const STRLEN u = UTF8SKIP(s);
 	    STRLEN ulen;
 
-	    toLOWER_utf8(s, tmpbuf, &ulen);
+	    _to_utf8_lower_flags(s, tmpbuf, &ulen,
+				 cBOOL(IN_LOCALE_RUNTIME), &tainted);
 
 	    /* Here is where we would do context-sensitive actions.  See the
 	     * commit message for this comment for why there isn't any */
@@ -4011,6 +4032,10 @@ PP(pp_lc)
 	SvUTF8_on(dest);
 	*d = '\0';
 	SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+	if (tainted) {
+	    TAINT;
+	    SvTAINTED_on(dest);
+	}
     } else {	/* Not utf8 */
 	if (len) {
 	    const U8 *const send = s + len;
author	Karl Williamson <public@khwilliamson.com>	2011-12-13 22:01:46 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-12-15 16:26:00 -0700
commit	094a2f8c3da82fac9e0698c2daeb7e94d0ae765a (patch)
tree	377042bb7ad310d7b0b1cf66079e80e5d8743e0a /pp.c
parent	81c6c7ce308a6bd705e6d8343eb996df5a938aa5 (diff)
download	perl-094a2f8c3da82fac9e0698c2daeb7e94d0ae765a.tar.gz