utf8.c: Faster latin1 upper/title casing

This creates a new function to handle upper/title casing code points in the latin1 range, and avoids using a swash to compute the case. This is because the correct values are compiled-in. And it calls this function when appropriate for both title and upper casing, in both utf8 and uni forms, Unlike the similar function for lower casing, it may make sense for this function to be called from outside utf8.c, but inside the core, so it is not static, but its name begins with an underscore.
author: Karl Williamson <public@khwilliamson.com> 2011-11-08 21:51:07 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-11-08 22:38:38 -0700
commit: 3a4c58c9c509b78969231f9f65f289c227188c3f (patch)
tree: 5202ae806607733ff6aa9926ead070b097b08fb2
parent: 968c5e6ab01546e5aa0c2cb7fbb1c06a7c99b6af (diff)
download: perl-3a4c58c9c509b78969231f9f65f289c227188c3f.tar.gz
4 files changed, 89 insertions, 2 deletions
diff --git a/embed.fnc b/embed.fnc
index 035f3db3fe..e2911dd8a0 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -602,6 +602,7 @@ Ap	|UV	|to_uni_upper	|UV c|NN U8 *p|NN STRLEN *lenp
 Ap	|UV	|to_uni_title	|UV c|NN U8 *p|NN STRLEN *lenp
 #ifdef PERL_IN_UTF8_C
 sR	|U8	|to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
+p	|UV	|_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
 #endif
 Ap	|UV	|to_uni_lower	|UV c|NN U8 *p|NN STRLEN *lenp
 Amp	|UV	|to_uni_fold	|UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
index 2c9b827b0d..3d985b5f82 100644
--- a/embed.h
+++ b/embed.h
@@ -1571,6 +1571,7 @@
 #define isa_lookup(a,b,c,d)	S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
+#define _to_upper_title_latin1(a,b,c,d)	Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
 #define is_utf8_char_slow	S_is_utf8_char_slow
 #define is_utf8_common(a,b,c)	S_is_utf8_common(aTHX_ a,b,c)
 #define swash_get(a,b,c)	S_swash_get(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 2b58991d26..7f9621a47e 100644
--- a/proto.h
+++ b/proto.h
@@ -6983,6 +6983,12 @@ STATIC bool	S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
 
 #endif
 #if defined(PERL_IN_UTF8_C)
+PERL_CALLCONV UV	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
+			__attribute__nonnull__(pTHX_2)
+			__attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1	\
+	assert(p); assert(lenp)
+
 STATIC STRLEN	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(1);
diff --git a/utf8.c b/utf8.c
index 8c87d2156a..38f5c6c00c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1314,6 +1314,57 @@ Perl_is_uni_xdigit(pTHX_ UV c)
     return is_utf8_xdigit(tmpbuf);
 }
 
+UV
+Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
+{
+    /* We have the latin1-range values compiled into the core, so just use
+     * those, converting the result to utf8.  The only difference between upper
+     * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
+     * either "SS" or "Ss".  Which one to use is passed into the routine in
+     * 'S_or_s' to avoid a test */
+
+    UV converted = toUPPER_LATIN1_MOD(c);
+
+    PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
+
+    assert(S_or_s == 'S' || S_or_s == 's');
+
+    if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
+					  characters in this range */
+	*p = (U8) converted;
+	*lenp = 1;
+	return converted;
+    }
+
+    /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
+     * which it maps to one of them, so as to only have to have one check for
+     * it in the main case */
+    if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
+	switch (c) {
+	    case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+		converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
+		break;
+	    case MICRO_SIGN:
+		converted = GREEK_CAPITAL_LETTER_MU;
+		break;
+	    case LATIN_SMALL_LETTER_SHARP_S:
+		*(p)++ = 'S';
+		*p = S_or_s;
+		*lenp = 2;
+		return 'S';
+	    default:
+		Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
+		/* NOTREACHED */
+	}
+    }
+
+    *(p)++ = UTF8_TWO_BYTE_HI(converted);
+    *p = UTF8_TWO_BYTE_LO(converted);
+    *lenp = 2;
+
+    return converted;
+}
+
 /* Call the function to convert a UTF-8 encoded character to the specified case.
  * Note that there may be more than one character in the result.
  * INP is a pointer to the first byte of the input character
@@ -1334,6 +1385,8 @@ Perl_is_uni_xdigit(pTHX_ UV c)
 UV
 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 {
+    dVAR;
+
     /* Convert the Unicode character whose ordinal is c to its uppercase
      * version and store that in UTF-8 in p and its length in bytes in lenp.
      * Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
@@ -1344,17 +1397,27 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 
     PERL_ARGS_ASSERT_TO_UNI_UPPER;
 
+    if (c < 256) {
+	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
+    }
+
     uvchr_to_utf8(p, c);
-    return to_utf8_upper(p, p, lenp);
+    return CALL_UPPER_CASE(p, p, lenp);
 }
 
 UV
 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 {
+    dVAR;
+
     PERL_ARGS_ASSERT_TO_UNI_TITLE;
 
+    if (c < 256) {
+	return _to_upper_title_latin1((U8) c, p, lenp, 's');
+    }
+
     uvchr_to_utf8(p, c);
-    return to_utf8_title(p, p, lenp);
+    return CALL_TITLE_CASE(p, p, lenp);
 }
 
 STATIC U8
@@ -2021,6 +2084,14 @@ Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
 
     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
 
+    if (UTF8_IS_INVARIANT(*p)) {
+	return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
+    }
+    else if UTF8_IS_DOWNGRADEABLE_START(*p) {
+	return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
+				      ustrp, lenp, 'S');
+    }
+
     return CALL_UPPER_CASE(p, ustrp, lenp);
 }
 
@@ -2044,6 +2115,14 @@ Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
 
     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
 
+    if (UTF8_IS_INVARIANT(*p)) {
+	return _to_upper_title_latin1(*p, ustrp, lenp, 's');
+    }
+    else if UTF8_IS_DOWNGRADEABLE_START(*p) {
+	return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
+				      ustrp, lenp, 's');
+    }
+
     return CALL_TITLE_CASE(p, ustrp, lenp);
 }
author	Karl Williamson <public@khwilliamson.com>	2011-11-08 21:51:07 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-11-08 22:38:38 -0700
commit	3a4c58c9c509b78969231f9f65f289c227188c3f (patch)
tree	5202ae806607733ff6aa9926ead070b097b08fb2
parent	968c5e6ab01546e5aa0c2cb7fbb1c06a7c99b6af (diff)
download	perl-3a4c58c9c509b78969231f9f65f289c227188c3f.tar.gz