diff options
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 83 |
4 files changed, 89 insertions, 2 deletions
@@ -602,6 +602,7 @@ Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp #ifdef PERL_IN_UTF8_C sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp +p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s #endif Ap |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp @@ -1571,6 +1571,7 @@ #define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) +#define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d) #define is_utf8_char_slow S_is_utf8_char_slow #define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c) #define swash_get(a,b,c) S_swash_get(aTHX_ a,b,c) @@ -6983,6 +6983,12 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U #endif #if defined(PERL_IN_UTF8_C) +PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1 \ + assert(p); assert(lenp) + STATIC STRLEN S_is_utf8_char_slow(const U8 *s, const STRLEN len) __attribute__warn_unused_result__ __attribute__nonnull__(1); @@ -1314,6 +1314,57 @@ Perl_is_uni_xdigit(pTHX_ UV c) return is_utf8_xdigit(tmpbuf); } +UV +Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s) +{ + /* We have the latin1-range values compiled into the core, so just use + * those, converting the result to utf8. The only difference between upper + * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is + * either "SS" or "Ss". Which one to use is passed into the routine in + * 'S_or_s' to avoid a test */ + + UV converted = toUPPER_LATIN1_MOD(c); + + PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; + + assert(S_or_s == 'S' || S_or_s == 's'); + + if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for + characters in this range */ + *p = (U8) converted; + *lenp = 1; + return converted; + } + + /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, + * which it maps to one of them, so as to only have to have one check for + * it in the main case */ + if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + switch (c) { + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + break; + case MICRO_SIGN: + converted = GREEK_CAPITAL_LETTER_MU; + break; + case LATIN_SMALL_LETTER_SHARP_S: + *(p)++ = 'S'; + *p = S_or_s; + *lenp = 2; + return 'S'; + default: + Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); + /* NOTREACHED */ + } + } + + *(p)++ = UTF8_TWO_BYTE_HI(converted); + *p = UTF8_TWO_BYTE_LO(converted); + *lenp = 2; + + return converted; +} + /* Call the function to convert a UTF-8 encoded character to the specified case. * Note that there may be more than one character in the result. * INP is a pointer to the first byte of the input character @@ -1334,6 +1385,8 @@ Perl_is_uni_xdigit(pTHX_ UV c) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) { + dVAR; + /* Convert the Unicode character whose ordinal is c to its uppercase * version and store that in UTF-8 in p and its length in bytes in lenp. * Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since @@ -1344,17 +1397,27 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UNI_UPPER; + if (c < 256) { + return _to_upper_title_latin1((U8) c, p, lenp, 'S'); + } + uvchr_to_utf8(p, c); - return to_utf8_upper(p, p, lenp); + return CALL_UPPER_CASE(p, p, lenp); } UV Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) { + dVAR; + PERL_ARGS_ASSERT_TO_UNI_TITLE; + if (c < 256) { + return _to_upper_title_latin1((U8) c, p, lenp, 's'); + } + uvchr_to_utf8(p, c); - return to_utf8_title(p, p, lenp); + return CALL_TITLE_CASE(p, p, lenp); } STATIC U8 @@ -2021,6 +2084,14 @@ Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UTF8_UPPER; + if (UTF8_IS_INVARIANT(*p)) { + return _to_upper_title_latin1(*p, ustrp, lenp, 'S'); + } + else if UTF8_IS_DOWNGRADEABLE_START(*p) { + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + ustrp, lenp, 'S'); + } + return CALL_UPPER_CASE(p, ustrp, lenp); } @@ -2044,6 +2115,14 @@ Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UTF8_TITLE; + if (UTF8_IS_INVARIANT(*p)) { + return _to_upper_title_latin1(*p, ustrp, lenp, 's'); + } + else if UTF8_IS_DOWNGRADEABLE_START(*p) { + return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + ustrp, lenp, 's'); + } + return CALL_TITLE_CASE(p, ustrp, lenp); } |