diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-11-08 21:51:07 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-11-08 22:38:38 -0700 |
commit | 3a4c58c9c509b78969231f9f65f289c227188c3f (patch) | |
tree | 5202ae806607733ff6aa9926ead070b097b08fb2 /utf8.c | |
parent | 968c5e6ab01546e5aa0c2cb7fbb1c06a7c99b6af (diff) | |
download | perl-3a4c58c9c509b78969231f9f65f289c227188c3f.tar.gz |
utf8.c: Faster latin1 upper/title casing
This creates a new function to handle upper/title casing code points in
the latin1 range, and avoids using a swash to compute the case. This is
because the correct values are compiled-in.
And it calls this function when appropriate for both title and upper
casing, in both utf8 and uni forms,
Unlike the similar function for lower casing, it may make sense for this function to be
called from outside utf8.c, but inside the core, so it is not static,
but its name begins with an underscore.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 83 |
1 files changed, 81 insertions, 2 deletions
@@ -1314,6 +1314,57 @@ Perl_is_uni_xdigit(pTHX_ UV c) return is_utf8_xdigit(tmpbuf); } +UV +Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s) +{ + /* We have the latin1-range values compiled into the core, so just use + * those, converting the result to utf8. The only difference between upper + * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is + * either "SS" or "Ss". Which one to use is passed into the routine in + * 'S_or_s' to avoid a test */ + + UV converted = toUPPER_LATIN1_MOD(c); + + PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; + + assert(S_or_s == 'S' || S_or_s == 's'); + + if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for + characters in this range */ + *p = (U8) converted; + *lenp = 1; + return converted; + } + + /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, + * which it maps to one of them, so as to only have to have one check for + * it in the main case */ + if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + switch (c) { + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + break; + case MICRO_SIGN: + converted = GREEK_CAPITAL_LETTER_MU; + break; + case LATIN_SMALL_LETTER_SHARP_S: + *(p)++ = 'S'; + *p = S_or_s; + *lenp = 2; + return 'S'; + default: + Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); + /* NOTREACHED */ + } + } + + *(p)++ = UTF8_TWO_BYTE_HI(converted); + *p = UTF8_TWO_BYTE_LO(converted); + *lenp = 2; + + return converted; +} + /* Call the function to convert a UTF-8 encoded character to the specified case. * Note that there may be more than one character in the result. * INP is a pointer to the first byte of the input character @@ -1334,6 +1385,8 @@ Perl_is_uni_xdigit(pTHX_ UV c) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) { + dVAR; + /* Convert the Unicode character whose ordinal is c to its uppercase * version and store that in UTF-8 in p and its length in bytes in lenp. * Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since @@ -1344,17 +1397,27 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UNI_UPPER; + if (c < 256) { + return _to_upper_title_latin1((U8) c, p, lenp, 'S'); + } + uvchr_to_utf8(p, c); - return to_utf8_upper(p, p, lenp); + return CALL_UPPER_CASE(p, p, lenp); } UV Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) { + dVAR; + PERL_ARGS_ASSERT_TO_UNI_TITLE; + if (c < 256) { + return _to_upper_title_latin1((U8) c, p, lenp, 's'); + } + uvchr_to_utf8(p, c); - return to_utf8_title(p, p, lenp); + return CALL_TITLE_CASE(p, p, lenp); } STATIC U8 @@ -2021,6 +2084,14 @@ Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UTF8_UPPER; + if (UTF8_IS_INVARIANT(*p)) { + return _to_upper_title_latin1(*p, ustrp, lenp, 'S'); + } + else if UTF8_IS_DOWNGRADEABLE_START(*p) { + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + ustrp, lenp, 'S'); + } + return CALL_UPPER_CASE(p, ustrp, lenp); } @@ -2044,6 +2115,14 @@ Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UTF8_TITLE; + if (UTF8_IS_INVARIANT(*p)) { + return _to_upper_title_latin1(*p, ustrp, lenp, 's'); + } + else if UTF8_IS_DOWNGRADEABLE_START(*p) { + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + ustrp, lenp, 's'); + } + return CALL_TITLE_CASE(p, ustrp, lenp); } |