diff options
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r-- | strings/ctype-utf8.c | 1054 |
1 files changed, 975 insertions, 79 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 91f633e45ce..7de5cdd00ee 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -27,6 +27,16 @@ #define EILSEQ ENOENT #endif + +#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" +#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" +#define MY_UTF8MB3_BIN MY_UTF8MB3 "_bin" +#define MY_UTF8MB4_GENERAL_CI MY_UTF8MB4 "_general_ci" +#define MY_UTF8MB4_GENERAL_CS MY_UTF8MB4 "_general_cs" +#define MY_UTF8MB4_BIN MY_UTF8MB4 "_bin" + + + #ifndef HAVE_CHARSET_utf8 #define HAVE_CHARSET_utf8 #endif @@ -39,6 +49,14 @@ #define HAVE_UNIDATA #endif +#ifdef HAVE_CHARSET_utf16 +#define HAVE_UNIDATA +#endif + +#ifdef HAVE_CHARSET_utf32 +#define HAVE_UNIDATA +#endif + #ifdef HAVE_UNIDATA #include "my_uctype.h" @@ -1702,6 +1720,24 @@ MY_UNICASE_INFO *my_unicase_turkish[256]= }; +#define REPLACEMENT_CHAR 0xFFFD; + + +static inline void +my_tosort_unicode(MY_UNICASE_INFO **uni_plane, my_wc_t *wc) +{ + int page= *wc >> 8; + if (page < 256) + { + if (uni_plane[page]) + *wc= uni_plane[page][*wc & 0xFF].sort; + } + else + { + *wc= REPLACEMENT_CHAR; + } +} + /* ** Compare string against string with wildcard @@ -1712,13 +1748,14 @@ MY_UNICASE_INFO *my_unicase_turkish[256]= ** 1 if matched with wildcard */ -int my_wildcmp_unicode(CHARSET_INFO *cs, - const char *str,const char *str_end, - const char *wildstr,const char *wildend, - int escape, int w_one, int w_many, - MY_UNICASE_INFO **weights) +int +my_wildcmp_unicode(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many, + MY_UNICASE_INFO **weights) { - int result= -1; /* Not found, using wildcards */ + int result= -1; /* Not found, using wildcards */ my_wc_t s_wc, w_wc; int scan, plane; int (*mb_wc)(struct charset_info_st *, my_wc_t *, @@ -1734,14 +1771,14 @@ int my_wildcmp_unicode(CHARSET_INFO *cs, (const uchar*)wildend)) <= 0) return 1; - if (w_wc == (my_wc_t)w_many) + if (w_wc == (my_wc_t) w_many) { - result= 1; /* Found an anchor char */ + result= 1; /* Found an anchor char */ break; } wildstr+= scan; - if (w_wc == (my_wc_t)escape && wildstr < wildend) + if (w_wc == (my_wc_t) escape && wildstr < wildend) { if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, (const uchar*)wildend)) <= 0) @@ -1755,29 +1792,27 @@ int my_wildcmp_unicode(CHARSET_INFO *cs, return 1; str+= scan; - if (!escaped && w_wc == (my_wc_t)w_one) + if (!escaped && w_wc == (my_wc_t) w_one) { - result= 1; /* Found an anchor char */ + result= 1; /* Found an anchor char */ } else { if (weights) { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; + my_tosort_unicode(weights, &s_wc); + my_tosort_unicode(weights, &w_wc); } if (s_wc != w_wc) - return 1; /* No match */ + return 1; /* No match */ } if (wildstr == wildend) - return (str != str_end); /* Match if both are at end */ + return (str != str_end); /* Match if both are at end */ } - if (w_wc == (my_wc_t)w_many) - { /* Found w_many */ + if (w_wc == (my_wc_t) w_many) + { /* Found w_many */ /* Remove any '%' and '_' from the wild search string */ for ( ; wildstr != wildend ; ) @@ -1786,29 +1821,29 @@ int my_wildcmp_unicode(CHARSET_INFO *cs, (const uchar*)wildend)) <= 0) return 1; - if (w_wc == (my_wc_t)w_many) - { - wildstr+= scan; - continue; - } - - if (w_wc == (my_wc_t)w_one) - { - wildstr+= scan; + if (w_wc == (my_wc_t)w_many) + { + wildstr+= scan; + continue; + } + + if (w_wc == (my_wc_t)w_one) + { + wildstr+= scan; if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, (const uchar*)str_end)) <=0) return 1; str+= scan; - continue; - } - break; /* Not a wild character */ + continue; + } + break; /* Not a wild character */ } if (wildstr == wildend) - return 0; /* Ok if w_many is last */ + return 0; /* Ok if w_many is last */ if (str == str_end) - return -1; + return -1; if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, (const uchar*)wildend)) <=0) @@ -1836,10 +1871,8 @@ int my_wildcmp_unicode(CHARSET_INFO *cs, return 1; if (weights) { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; + my_tosort_unicode(weights, &s_wc); + my_tosort_unicode(weights, &w_wc); } if (s_wc == w_wc) @@ -1861,8 +1894,53 @@ int my_wildcmp_unicode(CHARSET_INFO *cs, return (str != str_end ? 1 : 0); } -#endif +/* + This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32 +*/ +size_t +my_strnxfrm_unicode(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + const uchar *src, size_t srclen) +{ + my_wc_t wc; + int res; + uchar *de= dst + dstlen; + uchar *de_beg= de - 1; + const uchar *se = src + srclen; + MY_UNICASE_INFO **uni_plane= (cs->state & MY_CS_BINSORT) ? + NULL : cs->caseinfo; + LINT_INIT(wc); + DBUG_ASSERT(src); + + while (dst < de_beg) + { + if ((res= cs->cset->mb_wc(cs,&wc, src, se)) <= 0) + break; + src+=res; + + if (uni_plane) + my_tosort_unicode(uni_plane, &wc); + + *dst++= (uchar) (wc >> 8); + if (dst < de) + *dst++= (uchar) (wc & 0xFF); + } + + while (dst < de_beg) /* Fill the tail with keys for space character */ + { + *dst++= 0x00; + *dst++= 0x20; + } + + if (dst < de) /* Clear the last byte, if "dstlen" was an odd number */ + *dst= 0x00; + + return dstlen; +} + + +#endif /* HAVE_UNIDATA */ #ifdef HAVE_CHARSET_utf8 @@ -2569,44 +2647,6 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), } -static size_t my_strnxfrm_utf8(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, - const uchar *src, size_t srclen) -{ - my_wc_t wc; - int res; - int plane; - uchar *de= dst + dstlen; - uchar *de_beg= de - 1; - const uchar *se = src + srclen; - MY_UNICASE_INFO **uni_plane= cs->caseinfo; - - while (dst < de_beg) - { - if ((res=my_utf8_uni(cs,&wc, src, se)) <= 0) - break; - src+=res; - - plane=(wc>>8) & 0xFF; - wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc; - - *dst++= (uchar)(wc >> 8); - *dst++= (uchar)(wc & 0xFF); - - } - - while (dst < de_beg) /* Fill the tail with keys for space character */ - { - *dst++= 0x00; - *dst++= 0x20; - } - - if (dst < de) /* Clear the last byte, if "dstlen" was an odd number */ - *dst= 0x00; - - return dstlen; -} - static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { my_wc_t wc; @@ -2642,7 +2682,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = NULL, /* init */ my_strnncoll_utf8, my_strnncollsp_utf8, - my_strnxfrm_utf8, + my_strnxfrm_unicode, my_strnxfrmlen_utf8, my_like_range_mb, my_wildcmp_utf8, @@ -2891,7 +2931,7 @@ static MY_COLLATION_HANDLER my_collation_cs_handler = NULL, /* init */ my_strnncoll_utf8_cs, my_strnncollsp_utf8_cs, - my_strnxfrm_utf8, + my_strnxfrm_unicode, my_strnxfrmlen_utf8, my_like_range_simple, my_wildcmp_mb, @@ -4154,7 +4194,7 @@ static MY_COLLATION_HANDLER my_collation_filename_handler = NULL, /* init */ my_strnncoll_utf8, my_strnncollsp_utf8, - my_strnxfrm_utf8, + my_strnxfrm_unicode, my_strnxfrmlen_utf8, my_like_range_mb, my_wildcmp_utf8, @@ -4284,3 +4324,859 @@ int main() +#ifdef HAVE_CHARSET_utf8mb4 + +/* + We consider bytes with code more than 127 as a letter. + This garantees that word boundaries work fine with regular + expressions. Note, there is no need to mark byte 255 as a + letter, it is illegal byte in UTF8. +*/ +static uchar ctype_utf8mb4[]= +{ + 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16, + 16,129,129,129,129,129,129, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16, + 16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0 +}; + + +static uchar to_lower_utf8mb4[]= +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95, + 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, + 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127, + 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 +}; + + +static uchar to_upper_utf8mb4[]= +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127, + 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, + 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, + 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, + 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207, + 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223, + 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239, + 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 +}; + + +static inline int +bincmp_utf8mb4(const uchar *s, const uchar *se, + const uchar *t, const uchar *te) +{ + int slen= (int) (se - s), tlen= (int) (te - t); + int len= min(slen, tlen); + int cmp= memcmp(s, t, len); + return cmp ? cmp : slen - tlen; +} + + +static int +my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t * pwc, const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0x80) + { + *pwc= c; + return 1; + } + else if (c < 0xc2) + return MY_CS_ILSEQ; + else if (c < 0xe0) + { + if (s + 2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!((s[1] ^ 0x80) < 0x40)) + return MY_CS_ILSEQ; + + *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + return 2; + } + else if (c < 0xf0) + { + if (s + 3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + *pwc= ((my_wc_t) (c & 0x0f) << 12) | + ((my_wc_t) (s[1] ^ 0x80) << 6) | + (my_wc_t) (s[2] ^ 0x80); + return 3; + } + else if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + + if (!((s[1] ^ 0x80) < 0x40 && + (s[2] ^ 0x80) < 0x40 && + (s[3] ^ 0x80) < 0x40 && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + *pwc = ((my_wc_t) (c & 0x07) << 18) | + ((my_wc_t) (s[1] ^ 0x80) << 12) | + ((my_wc_t) (s[2] ^ 0x80) << 6) | + (my_wc_t) (s[3] ^ 0x80); + return 4; + } + return MY_CS_ILSEQ; +} + + +/* + The same as above, but without range check + for example, for a null-terminated string +*/ +static int +my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t *pwc, const uchar *s) +{ + uchar c; + + c= s[0]; + if (c < 0x80) + { + *pwc = c; + return 1; + } + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (!((s[1] ^ 0x80) < 0x40)) + return MY_CS_ILSEQ; + + *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + return 2; + } + + if (c < 0xf0) + { + if (!((s[1] ^ 0x80) < 0x40 && + (s[2] ^ 0x80) < 0x40 && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + *pwc= ((my_wc_t) (c & 0x0f) << 12) | + ((my_wc_t) (s[1] ^ 0x80) << 6) | + (my_wc_t) (s[2] ^ 0x80); + + return 3; + } + else if (c < 0xf5) + { + if (!((s[1] ^ 0x80) < 0x40 && + (s[2] ^ 0x80) < 0x40 && + (s[3] ^ 0x80) < 0x40 && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + *pwc = ((my_wc_t) (c & 0x07) << 18) | + ((my_wc_t) (s[1] ^ 0x80) << 12) | + ((my_wc_t) (s[2] ^ 0x80) << 6) | + (my_wc_t) (s[3] ^ 0x80); + return 4; + } + return MY_CS_ILSEQ; +} + + +static int +my_wc_mb_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t wc, uchar *r, uchar *e) +{ + int count; + + if (r >= e) + return MY_CS_TOOSMALL; + + if (wc < 0x80) + count= 1; + else if (wc < 0x800) + count= 2; + else if (wc < 0x10000) + count= 3; + else if (wc < 0x200000) + count= 4; + else return MY_CS_ILUNI; + + if (r + count > e) + return MY_CS_TOOSMALLN(count); + + switch (count) { + /* Fall through all cases!!! */ + case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000; + case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800; + case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0; + case 1: r[0] = (uchar) wc; + } + return count; +} + + +/* + The same as above, but without range check. +*/ +static int +my_wc_mb_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t wc, uchar *r) +{ + int count; + + if (wc < 0x80) + count= 1; + else if (wc < 0x800) + count= 2; + else if (wc < 0x10000) + count= 3; + else if (wc < 0x200000) + count= 4; + else + return MY_CS_ILUNI; + + switch (count) + { + /* Fall through all cases!!! */ + case 4: r[3]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0x10000; + case 3: r[2]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0x800; + case 2: r[1]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0xc0; + case 1: r[0]= (uchar) wc; + } + return count; +} + + +static inline void +my_tolower_utf8mb4(MY_UNICASE_INFO **uni_plane, my_wc_t *wc) +{ + int page= *wc >> 8; + if (page < 256 && uni_plane[page]) + *wc= uni_plane[page][*wc & 0xFF].tolower; +} + + +static inline void +my_toupper_utf8mb4(MY_UNICASE_INFO **uni_plane, my_wc_t *wc) +{ + int page= *wc >> 8; + if (page < 256 && uni_plane[page]) + *wc= uni_plane[page][*wc & 0xFF].toupper; +} + + +static size_t +my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen, + char *dst, size_t dstlen) +{ + my_wc_t wc; + int srcres, dstres; + char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + DBUG_ASSERT(src != dst || cs->caseup_multiply == 1); + + while ((src < srcend) && + (srcres= my_mb_wc_utf8mb4(cs, &wc, + (uchar *) src, (uchar*) srcend)) > 0) + { + my_toupper_utf8mb4(uni_plane, &wc); + if ((dstres= my_wc_mb_utf8mb4(cs, wc, (uchar*) dst, (uchar*) dstend)) <= 0) + break; + src+= srcres; + dst+= dstres; + } + return (size_t) (dst - dst0); +} + + +static inline void +my_hash_add(ulong *n1, ulong *n2, uint ch) +{ + n1[0]^= (((n1[0] & 63) + n2[0]) * (ch)) + (n1[0] << 8); + n2[0]+= 3; +} + + +static void +my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen, + ulong *n1, ulong *n2) +{ + my_wc_t wc; + int res; + const uchar *e= s + slen; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + + /* + Remove end space. We do this to be able to compare + 'A ' and 'A' as identical + */ + while (e > s && e[-1] == ' ') + e--; + + while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0) + { + my_tosort_unicode(uni_plane, &wc); + my_hash_add(n1, n2, (uint) (wc & 0xFF)); + my_hash_add(n1, n2, (uint) (wc >> 8) & 0xFF); + if (wc > 0xFFFF) + { + /* + Put the highest byte only if it is non-zero, + to make hash functions for utf8mb3 and utf8mb4 + compatible for BMP characters. + This is useful to keep order of records in + test results, e.g. for "SHOW GRANTS". + */ + my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF); + } + s+= res; + } +} + + +static size_t +my_caseup_str_utf8mb4(CHARSET_INFO *cs, char *src) +{ + my_wc_t wc; + int srcres, dstres; + char *dst= src, *dst0= src; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + DBUG_ASSERT(cs->caseup_multiply == 1); + + while (*src && + (srcres= my_mb_wc_utf8mb4_no_range(cs, &wc, (uchar *) src)) > 0) + { + my_toupper_utf8mb4(uni_plane, &wc); + if ((dstres= my_wc_mb_utf8mb4_no_range(cs, wc, (uchar*) dst)) <= 0) + break; + src+= srcres; + dst+= dstres; + } + *dst= '\0'; + return (size_t) (dst - dst0); +} + + +static size_t +my_casedn_utf8mb4(CHARSET_INFO *cs, + char *src, size_t srclen, + char *dst, size_t dstlen) +{ + my_wc_t wc; + int srcres, dstres; + char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + DBUG_ASSERT(src != dst || cs->casedn_multiply == 1); + + while ((src < srcend) && + (srcres= my_mb_wc_utf8mb4(cs, &wc, + (uchar*) src, (uchar*) srcend)) > 0) + { + my_tolower_utf8mb4(uni_plane, &wc); + if ((dstres= my_wc_mb_utf8mb4(cs, wc, (uchar*) dst, (uchar*) dstend)) <= 0) + break; + src+= srcres; + dst+= dstres; + } + return (size_t) (dst - dst0); +} + + +static size_t +my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src) +{ + my_wc_t wc; + int srcres, dstres; + char *dst= src, *dst0= src; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + DBUG_ASSERT(cs->casedn_multiply == 1); + + while (*src && + (srcres= my_mb_wc_utf8mb4_no_range(cs, &wc, (uchar *) src)) > 0) + { + my_tolower_utf8mb4(uni_plane, &wc); + if ((dstres= my_wc_mb_utf8mb4_no_range(cs, wc, (uchar*) dst)) <= 0) + break; + src+= srcres; + dst+= dstres; + } + + /* + In rare cases lower string can be shorter than + the original string, for example: + + "U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE" + (which is 0xC4B0 in utf8, i.e. two bytes) + + is converted into + + "U+0069 LATIN SMALL LETTER I" + (which is 0x69 in utf8, i.e. one byte) + + So, we need to put '\0' terminator after converting. + */ + + *dst= '\0'; + return (size_t) (dst - dst0); +} + + +static int +my_strnncoll_utf8mb4(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool t_is_prefix) +{ + my_wc_t s_wc,t_wc; + const uchar *se= s + slen; + const uchar *te= t + tlen; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + LINT_INIT(s_wc); + LINT_INIT(t_wc); + + while ( s < se && t < te ) + { + int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se); + int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te); + + if ( s_res <= 0 || t_res <= 0 ) + { + /* Incorrect string, compare bytewise */ + return bincmp_utf8mb4(s, se, t, te); + } + + my_tosort_unicode(uni_plane, &s_wc); + my_tosort_unicode(uni_plane, &t_wc); + + if ( s_wc != t_wc ) + { + return s_wc > t_wc ? 1 : -1; + } + + s+= s_res; + t+= t_res; + } + return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t))); +} + + +/** + + Compare strings, discarding end space + + If one string is shorter as the other, then we space extend the other + so that the strings have equal length. + + This will ensure that the following things hold: + + "a" == "a " + "a\0" < "a" + "a\0" < "a " + + @param cs Character set pinter. + @param a First string to compare. + @param a_length Length of 'a'. + @param b Second string to compare. + @param b_length Length of 'b'. + @param diff_if_only_endspace_difference + Set to 1 if the strings should be regarded as different + if they only difference in end space + + @return Comparison result. + @retval Negative number, if a less than b. + @retval 0, if a is equal to b + @retval Positive number, if a > b +*/ + +static int +my_strnncollsp_utf8mb4(CHARSET_INFO *cs, + const uchar *s, size_t slen, + const uchar *t, size_t tlen, + my_bool diff_if_only_endspace_difference) +{ + int res; + my_wc_t s_wc, t_wc; + const uchar *se= s + slen, *te= t + tlen; + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + LINT_INIT(s_wc); + LINT_INIT(t_wc); + +#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE + diff_if_only_endspace_difference= FALSE; +#endif + + while ( s < se && t < te ) + { + int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se); + int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te); + + if ( s_res <= 0 || t_res <= 0 ) + { + /* Incorrect string, compare bytewise */ + return bincmp_utf8mb4(s, se, t, te); + } + + my_tosort_unicode(uni_plane, &s_wc); + my_tosort_unicode(uni_plane, &t_wc); + + if ( s_wc != t_wc ) + { + return s_wc > t_wc ? 1 : -1; + } + + s+=s_res; + t+=t_res; + } + + slen= (size_t) (se-s); + tlen= (size_t) (te-t); + res= 0; + + if (slen != tlen) + { + int swap= 1; + if (diff_if_only_endspace_difference) + res= 1; /* Assume 'a' is bigger */ + if (slen < tlen) + { + slen= tlen; + s= t; + se= te; + swap= -1; + res= -res; + } + /* + This following loop uses the fact that in UTF-8 + all multibyte characters are greater than space, + and all multibyte head characters are greater than + space. It means if we meet a character greater + than space, it always means that the longer string + is greater. So we can reuse the same loop from the + 8bit version, without having to process full multibute + sequences. + */ + for ( ; s < se; s++) + { + if (*s != ' ') + return (*s < ' ') ? -swap : swap; + } + } + return res; +} + + +/** + Compare 0-terminated UTF8 strings. + + @param cs character set handler + @param s First 0-terminated string to compare + @param t Second 0-terminated string to compare + + @return Comparison result. + @retval negative number if s < t + @retval positive number if s > t + @retval 0 is the strings are equal +*/ + +static int +my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t) +{ + MY_UNICASE_INFO **uni_plane= cs->caseinfo; + while (s[0] && t[0]) + { + my_wc_t s_wc,t_wc; + + if ((uchar) s[0] < 128) + { + /* + s[0] is between 0 and 127. + It represents a single byte character. + Convert it into weight according to collation. + */ + s_wc= plane00[(uchar) s[0]].tolower; + s++; + } + else + { + int res= my_mb_wc_utf8mb4_no_range(cs, &s_wc, (const uchar*) s); + + /* + In the case of wrong multibyte sequence we will + call strcmp() for byte-to-byte comparison. + */ + if (res <= 0) + return strcmp(s, t); + s+= res; + + my_tolower_utf8mb4(uni_plane, &s_wc); + } + + + /* Do the same for the second string */ + + if ((uchar) t[0] < 128) + { + /* Convert single byte character into weight */ + t_wc= plane00[(uchar) t[0]].tolower; + t++; + } + else + { + int res= my_mb_wc_utf8mb4_no_range(cs, &t_wc, (const uchar*) t); + if (res <= 0) + return strcmp(s, t); + t+= res; + + my_tolower_utf8mb4(uni_plane, &t_wc); + } + + /* Now we have two weights, let's compare them */ + if ( s_wc != t_wc ) + return ((int) s_wc) - ((int) t_wc); + } + return ((int) (uchar) s[0]) - ((int) (uchar) t[0]); +} + + +static int +my_wildcmp_utf8mb4(CHARSET_INFO *cs, + const char *str, const char *strend, + const char *wildstr, const char *wildend, + int escape, int w_one, int w_many) +{ + return my_wildcmp_unicode(cs, str, strend, wildstr, wildend, + escape, w_one, w_many, cs->caseinfo); +} + + +static size_t +my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len) +{ + /* TODO: fix when working on WL "Unicode new version" */ + return (len * 2 + 2) / 4; +} + + +static uint +my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) +{ + my_wc_t wc; + int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e); + return (res > 1) ? res : 0; +} + + +static uint +my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c) +{ + if (c < 0x80) + return 1; + if (c < 0xc2) + return 0; /* Illegal mb head */ + if (c < 0xe0) + return 2; + if (c < 0xf0) + return 3; + if (c < 0xf8) + return 4; + return 0; /* Illegal mb head */; +} + + +static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= +{ + NULL, /* init */ + my_strnncoll_utf8mb4, + my_strnncollsp_utf8mb4, + my_strnxfrm_unicode, + my_strnxfrmlen_utf8mb4, + my_like_range_mb, + my_wildcmp_utf8mb4, + my_strcasecmp_utf8mb4, + my_instr_mb, + my_hash_sort_utf8mb4, + my_propagate_complex +}; + + +static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = +{ + NULL, /* init */ + my_strnncoll_mb_bin, + my_strnncollsp_mb_bin, + my_strnxfrm_unicode, + my_strnxfrmlen_utf8mb4, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + +MY_CHARSET_HANDLER my_charset_utf8mb4_handler= +{ + NULL, /* init */ + my_ismbchar_utf8mb4, + my_mbcharlen_utf8mb4, + my_numchars_mb, + my_charpos_mb, + my_well_formed_len_mb, + my_lengthsp_8bit, + my_numcells_mb, + my_mb_wc_utf8mb4, + my_wc_mb_utf8mb4, + my_mb_ctype_mb, + my_caseup_str_utf8mb4, + my_casedn_str_utf8mb4, + my_caseup_utf8mb4, + my_casedn_utf8mb4, + my_snprintf_8bit, + my_long10_to_str_8bit, + my_longlong10_to_str_8bit, + my_fill_8bit, + my_strntol_8bit, + my_strntoul_8bit, + my_strntoll_8bit, + my_strntoull_8bit, + my_strntod_8bit, + my_strtoll10_8bit, + my_strntoull10rnd_8bit, + my_scan_8bit +}; + + + +CHARSET_INFO my_charset_utf8mb4_general_ci= +{ + 45,0,0, /* number */ + MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */ + MY_UTF8MB4, /* cs name */ + MY_UTF8MB4_GENERAL_CI,/* name */ + "UTF-8 Unicode", /* comment */ + NULL, /* tailoring */ + ctype_utf8mb4, /* ctype */ + to_lower_utf8mb4, /* to_lower */ + to_upper_utf8mb4, /* to_upper */ + to_upper_utf8mb4, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 1, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 4, /* mbmaxlen */ + 0, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf8mb4_handler, + &my_collation_utf8mb4_general_ci_handler +}; + + +CHARSET_INFO my_charset_utf8mb4_bin= +{ + 46,0,0, /* number */ + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */ + MY_UTF8MB4, /* cs name */ + MY_UTF8MB4_BIN, /* name */ + "UTF-8 Unicode", /* comment */ + NULL, /* tailoring */ + ctype_utf8mb4, /* ctype */ + to_lower_utf8mb4, /* to_lower */ + to_upper_utf8mb4, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 1, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 4, /* mbmaxlen */ + 0, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf8mb4_handler, + &my_collation_utf8mb4_bin_handler +}; + +#endif /* HAVE_CHARSET_utf8mb4 */ |