diff options
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r-- | strings/ctype-utf8.c | 717 |
1 files changed, 219 insertions, 498 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index b7fb7e49cab..3c2c812a004 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -26,8 +26,86 @@ #define EILSEQ ENOENT #endif -#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) +/* Detect special bytes and sequences */ +#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40) +/* + Check MB2 character assuming that b0 is alredy known to be >= 0xC2. + Use this macro if the caller already checked b0 for: + - an MB1 character + - an unused gap between MB1 and MB2HEAD +*/ +#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \ + IS_CONTINUATION_BYTE((uchar) b1)) + +/* + Check MB3 character assuming that b0 is already known to be + in the valid MB3HEAD range [0xE0..0xEF]. +*/ +#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \ + IS_CONTINUATION_BYTE(b2) && \ + ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0)) + +/* + Check MB3 character assuming that b0 is already known to be >= 0xE0, + but is not checked for the high end 0xF0 yet. + Use this macro if the caller already checked b0 for: + - an MB1 character + - an unused gap between MB1 and MB2HEAD + - an MB2HEAD +*/ +#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \ + IS_UTF8MB3_STEP2(b0,b1,b2)) + +/* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] +*/ + +/* + Check MB4 character assuming that b0 is already + known to be in the range [0xF0..0xF4] +*/ +#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \ + IS_CONTINUATION_BYTE(b2) && \ + IS_CONTINUATION_BYTE(b3) && \ + (b0 >= 0xf1 || b1 >= 0x90) && \ + (b0 <= 0xf3 || b1 <= 0x8F)) +#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \ + IS_UTF8MB4_STEP2(b0,b1,b2,b3)) + +/* Convert individual bytes to Unicode code points */ +#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\ + ((my_wc_t) ((uchar) b1 ^ 0x80))) +#define UTF8MB3_CODE(b0,b1,b2) (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\ + ((my_wc_t) ((uchar) b1 ^ 0x80) << 6) |\ + ((my_wc_t) ((uchar) b2 ^ 0x80))) +#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\ + ((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\ + ((my_wc_t) ((uchar) b2 ^ 0x80) << 6) |\ + (my_wc_t) ((uchar) b3 ^ 0x80)) + +/* Definitions for strcoll.ic */ +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80) +#define IS_MB1_MBHEAD_UNUSED_GAP(x) ((uchar) (x) < 0xC2) +#define IS_MB2_CHAR(x,y) IS_UTF8MB2_STEP2(x,y) +#define IS_MB3_CHAR(x,y,z) IS_UTF8MB3_STEP3(x,y,z) + +/* Collation names */ #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" #define MY_UTF8MB3_BIN MY_UTF8MB3 "_bin" @@ -88,8 +166,7 @@ int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && - (c >= 0xe1 || s[1] >= 0xa0))) + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) return MY_CS_ILSEQ; return 3; @@ -4848,7 +4925,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; - *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + *pwc= UTF8MB2_CODE(c, s[1]); return 2; } else if (c < 0xf0) @@ -4856,76 +4933,12 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && - (c >= 0xe1 || s[1] >= 0xa0))) + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) return MY_CS_ILSEQ; - *pwc = ((my_wc_t) (c & 0x0f) << 12) | - ((my_wc_t) (s[1] ^ 0x80) << 6) | - (my_wc_t) (s[2] ^ 0x80); - + *pwc= UTF8MB3_CODE(c, s[1], s[2]); return 3; } -#ifdef UNICODE_32BIT - else if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) - { - if (s+4 > e) /* We need 4 characters */ - return MY_CS_TOOSMALL4; - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - (c >= 0xf1 || s[1] >= 0x90))) - return MY_CS_ILSEQ; - - *pwc = ((my_wc_t) (c & 0x07) << 18) | - ((my_wc_t) (s[1] ^ 0x80) << 12) | - ((my_wc_t) (s[2] ^ 0x80) << 6) | - (my_wc_t) (s[3] ^ 0x80); - - return 4; - } - else if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) - { - if (s+5 >e) /* We need 5 characters */ - return MY_CS_TOOSMALL5; - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - IS_CONTINUATION_BYTE(s[4]) && - (c >= 0xf9 || s[1] >= 0x88))) - return MY_CS_ILSEQ; - - *pwc = ((my_wc_t) (c & 0x03) << 24) | - ((my_wc_t) (s[1] ^ 0x80) << 18) | - ((my_wc_t) (s[2] ^ 0x80) << 12) | - ((my_wc_t) (s[3] ^ 0x80) << 6) | - (my_wc_t) (s[4] ^ 0x80); - return 5; - } - else if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) - { - if ( s+6 >e ) /* We need 6 characters */ - return MY_CS_TOOSMALL6; - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - IS_CONTINUATION_BYTE(s[4]) && - IS_CONTINUATION_BYTE(s[5]) && - (c >= 0xfd || s[1] >= 0x84))) - return MY_CS_ILSEQ; - - *pwc = ((my_wc_t) (c & 0x01) << 30) - | ((my_wc_t) (s[1] ^ 0x80) << 24) - | ((my_wc_t) (s[2] ^ 0x80) << 18) - | ((my_wc_t) (s[3] ^ 0x80) << 12) - | ((my_wc_t) (s[4] ^ 0x80) << 6) - | (my_wc_t) (s[5] ^ 0x80); - return 6; - } -#endif return MY_CS_ILSEQ; } @@ -4954,21 +4967,16 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), if (!((s[1] ^ 0x80) < 0x40)) return MY_CS_ILSEQ; - *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + *pwc= UTF8MB2_CODE(c, s[1]); return 2; } if (c < 0xf0) { - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - (c >= 0xe1 || s[1] >= 0xa0))) + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) return MY_CS_ILSEQ; - *pwc= ((my_wc_t) (c & 0x0f) << 12) | - ((my_wc_t) (s[1] ^ 0x80) << 6) | - (my_wc_t) (s[2] ^ 0x80); - + *pwc= UTF8MB3_CODE(c, s[1], s[2]); return 3; } return MY_CS_ILSEQ; @@ -5193,148 +5201,6 @@ static size_t my_casedn_str_utf8(CHARSET_INFO *cs, char *src) } -static int my_strnncoll_utf8(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - int s_res,t_res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se=s+slen; - const uchar *te=t+tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - while ( s < se && t < te ) - { - s_res=my_utf8_uni(cs,&s_wc, s, se); - t_res=my_utf8_uni(cs,&t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare byte by byte value */ - return bincmp(s, se, t, te); - } - - my_tosort_unicode(uni_plane, &s_wc, cs->state); - my_tosort_unicode(uni_plane, &t_wc, cs->state); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+=s_res; - t+=t_res; - } - return (int) (t_is_prefix ? t-te : ((se-s) - (te-t))); -} - - - -/* - Compare strings, discarding end space - - SYNOPSIS - my_strnncollsp_utf8() - cs character set handler - a First string to compare - a_length Length of 'a' - b Second string to compare - b_length Length of 'b' - diff_if_only_endspace_difference - Set to 1 if the strings should be regarded as different - if they only difference in end space - - IMPLEMENTATION - If one string is shorter as the other, then we space extend the other - so that the strings have equal length. - - This will ensure that the following things hold: - - "a" == "a " - "a\0" < "a" - "a\0" < "a " - - RETURN - < 0 a < b - = 0 a == b - > 0 a > b -*/ - -static int my_strnncollsp_utf8(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference) -{ - int s_res, t_res, res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s+slen, *te= t+tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= 0; -#endif - - while ( s < se && t < te ) - { - s_res=my_utf8_uni(cs,&s_wc, s, se); - t_res=my_utf8_uni(cs,&t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare byte by byte value */ - return bincmp(s, se, t, te); - } - - my_tosort_unicode(uni_plane, &s_wc, cs->state); - my_tosort_unicode(uni_plane, &t_wc, cs->state); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+=s_res; - t+=t_res; - } - - slen= (size_t) (se-s); - tlen= (size_t) (te-t); - res= 0; - - if (slen != tlen) - { - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - if (slen < tlen) - { - slen= tlen; - s= t; - se= te; - swap= -1; - res= -res; - } - /* - This following loop uses the fact that in UTF-8 - all multibyte characters are greater than space, - and all multibyte head characters are greater than - space. It means if we meet a character greater - than space, it always means that the longer string - is greater. So we can reuse the same loop from the - 8bit version, without having to process full multibute - sequences. - */ - for ( ; s < se; s++) - { - if (*s != ' ') - return (*s < ' ') ? -swap : swap; - } - } - return res; -} - - /* Compare 0-terminated UTF8 strings. @@ -5456,50 +5322,6 @@ int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)), if (c < 0xf0) return my_valid_mbcharlen_utf8mb3(s, e); -#ifdef UNICODE_32BIT - if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) - { - if (s+4 > e) /* We need 4 characters */ - return MY_CS_TOOSMALL4; - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - (c >= 0xf1 || s[1] >= 0x90))) - return MY_CS_ILSEQ; - - return 4; - } - if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) - { - if (s+5 >e) /* We need 5 characters */ - return MY_CS_TOOSMALL5; - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - IS_CONTINUATION_BYTE(s[4]) && - (c >= 0xf9 || s[1] >= 0x88))) - return MY_CS_ILSEQ; - - return 5; - } - if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) - { - if ( s+6 >e ) /* We need 6 characters */ - return MY_CS_TOOSMALL6; - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - IS_CONTINUATION_BYTE(s[4]) && - IS_CONTINUATION_BYTE(s[5]) && - (c >= 0xfd || s[1] >= 0x84))) - return MY_CS_ILSEQ; - - return 6; - } -#endif return MY_CS_ILSEQ; } @@ -5535,6 +5357,75 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, /* my_well_formed_char_length_utf8 */ +static inline int my_weight_mb1_utf8_general_ci(uchar b) +{ + return (int) plane00[b & 0xFF].sort; +} + + +static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1) +{ + my_wc_t wc= UTF8MB2_CODE(b0, b1); + MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8]; + return (int) (page ? page[wc & 0xFF].sort : wc); +} + + +static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2) +{ + my_wc_t wc= UTF8MB3_CODE(b0, b1, b2); + MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8]; + return (int) (page ? page[wc & 0xFF].sort : wc); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) +#include "strcoll.ic" + + +static inline int my_weight_mb1_utf8_general_mysql500_ci(uchar b) +{ + return (int) plane00_mysql500[b & 0xFF].sort; +} + + +static inline int my_weight_mb2_utf8_general_mysql500_ci(uchar b0, uchar b1) +{ + my_wc_t wc= UTF8MB2_CODE(b0, b1); + MY_UNICASE_CHARACTER *page= my_unicase_pages_mysql500[wc >> 8]; + return (int) (page ? page[wc & 0xFF].sort : wc); +} + + +static inline int +my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2) +{ + my_wc_t wc= UTF8MB3_CODE(b0, b1, b2); + MY_UNICASE_CHARACTER *page= my_unicase_pages_mysql500[wc >> 8]; + return (int) (page ? page[wc & 0xFF].sort : wc); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) ((int) (uchar) (x)) +#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) +#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) +#include "strcoll.ic" + + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e); @@ -5552,14 +5443,6 @@ static uint my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), return 2; else if (c < 0xf0) return 3; -#ifdef UNICODE_32BIT - else if (c < 0xf8) - return 4; - else if (c < 0xfc) - return 5; - else if (c < 0xfe) - return 6; -#endif return 0; /* Illegal mb head */; } @@ -5567,8 +5450,24 @@ static uint my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler = { NULL, /* init */ - my_strnncoll_utf8, - my_strnncollsp_utf8, + my_strnncoll_utf8_general_ci, + my_strnncollsp_utf8_general_ci, + my_strnxfrm_unicode, + my_strnxfrmlen_unicode, + my_like_range_mb, + my_wildcmp_utf8, + my_strcasecmp_utf8, + my_instr_mb, + my_hash_sort_utf8, + my_propagate_complex +}; + + +static MY_COLLATION_HANDLER my_collation_utf8_general_mysql500_ci_handler = +{ + NULL, /* init */ + my_strnncoll_utf8_general_mysql500_ci, + my_strnncollsp_utf8_general_mysql500_ci, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_mb, @@ -5583,8 +5482,8 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler = static MY_COLLATION_HANDLER my_collation_utf8_bin_handler = { NULL, /* init */ - my_strnncoll_mb_bin, - my_strnncollsp_mb_bin, + my_strnncoll_utf8_bin, + my_strnncollsp_utf8_bin, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_mb, @@ -5627,6 +5526,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_charlen_utf8, my_well_formed_char_length_utf8, my_copy_fix_mb, + my_uni_utf8, }; @@ -5693,7 +5593,7 @@ struct charset_info_st my_charset_utf8_general_mysql500_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf8_handler, - &my_collation_utf8_general_ci_handler + &my_collation_utf8_general_mysql500_ci_handler }; @@ -7166,8 +7066,8 @@ my_ismbchar_filename(CHARSET_INFO *cs, const char *str, const char *end) static MY_COLLATION_HANDLER my_collation_filename_handler = { NULL, /* init */ - my_strnncoll_utf8, - my_strnncollsp_utf8, + my_strnncoll_simple, + my_strnncollsp_simple, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_mb, @@ -7210,6 +7110,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_charlen_filename, my_well_formed_char_length_filename, my_copy_fix_mb, + my_wc_mb_filename, }; @@ -7410,7 +7311,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; - *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + *pwc= UTF8MB2_CODE(c, s[1]); return 2; } else if (c < 0xf0) @@ -7418,13 +7319,10 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && - (c >= 0xe1 || s[1] >= 0xa0))) + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) return MY_CS_ILSEQ; - *pwc= ((my_wc_t) (c & 0x0f) << 12) | - ((my_wc_t) (s[1] ^ 0x80) << 6) | - (my_wc_t) (s[2] ^ 0x80); + *pwc= UTF8MB3_CODE(c, s[1], s[2]); return 3; } else if (c < 0xf5) @@ -7432,35 +7330,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - /* - UTF-8 quick four-byte mask: - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - Encoding allows to encode U+00010000..U+001FFFFF - - The maximum character defined in the Unicode standard is U+0010FFFF. - Higher characters U+00110000..U+001FFFFF are not used. - - 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) - 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) - - Valid codes: - [F0][90..BF][80..BF][80..BF] - [F1][80..BF][80..BF][80..BF] - [F2][80..BF][80..BF][80..BF] - [F3][80..BF][80..BF][80..BF] - [F4][80..8F][80..BF][80..BF] - */ - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - (c >= 0xf1 || s[1] >= 0x90) && - (c <= 0xf3 || s[1] <= 0x8F))) + if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3])) return MY_CS_ILSEQ; - *pwc = ((my_wc_t) (c & 0x07) << 18) | - ((my_wc_t) (s[1] ^ 0x80) << 12) | - ((my_wc_t) (s[2] ^ 0x80) << 6) | - (my_wc_t) (s[3] ^ 0x80); + *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]); return 4; } return MY_CS_ILSEQ; @@ -7492,34 +7364,22 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), if (!IS_CONTINUATION_BYTE(s[1])) return MY_CS_ILSEQ; - *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); + *pwc= UTF8MB2_CODE(c, s[1]); return 2; } if (c < 0xf0) { - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - (c >= 0xe1 || s[1] >= 0xa0))) + if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) return MY_CS_ILSEQ; - *pwc= ((my_wc_t) (c & 0x0f) << 12) | - ((my_wc_t) (s[1] ^ 0x80) << 6) | - (my_wc_t) (s[2] ^ 0x80); - + *pwc= UTF8MB3_CODE(c, s[1], s[2]); return 3; } else if (c < 0xf5) { - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - (c >= 0xf1 || s[1] >= 0x90) && - (c <= 0xf3 || s[1] <= 0x8F))) + if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3])) return MY_CS_ILSEQ; - *pwc = ((my_wc_t) (c & 0x07) << 18) | - ((my_wc_t) (s[1] ^ 0x80) << 12) | - ((my_wc_t) (s[2] ^ 0x80) << 6) | - (my_wc_t) (s[3] ^ 0x80); + *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]); return 4; } return MY_CS_ILSEQ; @@ -7765,146 +7625,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src) } -static int -my_strnncoll_utf8mb4(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s + slen; - const uchar *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - while ( s < se && t < te ) - { - int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se); - int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare bytewise */ - return bincmp_utf8mb4(s, se, t, te); - } - - my_tosort_unicode(uni_plane, &s_wc, cs->state); - my_tosort_unicode(uni_plane, &t_wc, cs->state); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t))); -} - - -/** - - Compare strings, discarding end space - - If one string is shorter as the other, then we space extend the other - so that the strings have equal length. - - This will ensure that the following things hold: - - "a" == "a " - "a\0" < "a" - "a\0" < "a " - - @param cs Character set pinter. - @param a First string to compare. - @param a_length Length of 'a'. - @param b Second string to compare. - @param b_length Length of 'b'. - @param diff_if_only_endspace_difference - Set to 1 if the strings should be regarded as different - if they only difference in end space - - @return Comparison result. - @retval Negative number, if a less than b. - @retval 0, if a is equal to b - @retval Positive number, if a > b -*/ - -static int -my_strnncollsp_utf8mb4(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference) -{ - int res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= FALSE; -#endif - - while ( s < se && t < te ) - { - int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se); - int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare bytewise */ - return bincmp_utf8mb4(s, se, t, te); - } - - my_tosort_unicode(uni_plane, &s_wc, cs->state); - my_tosort_unicode(uni_plane, &t_wc, cs->state); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+=s_res; - t+=t_res; - } - - slen= (size_t) (se-s); - tlen= (size_t) (te-t); - res= 0; - - if (slen != tlen) - { - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - if (slen < tlen) - { - slen= tlen; - s= t; - se= te; - swap= -1; - res= -res; - } - /* - This following loop uses the fact that in UTF-8 - all multibyte characters are greater than space, - and all multibyte head characters are greater than - space. It means if we meet a character greater - than space, it always means that the longer string - is greater. So we can reuse the same loop from the - 8bit version, without having to process full multibute - sequences. - */ - for ( ; s < se; s++) - { - if (*s != ' ') - return (*s < ' ') ? -swap : swap; - } - } - return res; -} - - /** Compare 0-terminated UTF8 strings. @@ -8007,30 +7727,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - /* - UTF-8 quick four-byte mask: - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - Encoding allows to encode U+00010000..U+001FFFFF - - The maximum character defined in the Unicode standard is U+0010FFFF. - Higher characters U+00110000..U+001FFFFF are not used. - - 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) - 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) - - Valid codes: - [F0][90..BF][80..BF][80..BF] - [F1][80..BF][80..BF][80..BF] - [F2][80..BF][80..BF][80..BF] - [F3][80..BF][80..BF][80..BF] - [F4][80..8F][80..BF][80..BF] - */ - - if (!(IS_CONTINUATION_BYTE(s[1]) && - IS_CONTINUATION_BYTE(s[2]) && - IS_CONTINUATION_BYTE(s[3]) && - (c >= 0xf1 || s[1] >= 0x90) && - (c <= 0xf3 || s[1] <= 0x8F))) + if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3])) return MY_CS_ILSEQ; return 4; @@ -8072,6 +7769,29 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN /* my_well_formed_char_length_utf8mb4 */ + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci +#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3) +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1) +#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2) +/* + All non-BMP characters have the same weight. +*/ +#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(b0) ((int) (uchar) (b0)) +#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1)) +#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2)) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3)) +#include "strcoll.ic" + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { @@ -8100,8 +7820,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c) static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= { NULL, /* init */ - my_strnncoll_utf8mb4, - my_strnncollsp_utf8mb4, + my_strnncoll_utf8mb4_general_ci, + my_strnncollsp_utf8mb4_general_ci, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_mb, @@ -8116,8 +7836,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = { NULL, /* init */ - my_strnncoll_mb_bin, - my_strnncollsp_mb_bin, + my_strnncoll_utf8mb4_bin, + my_strnncollsp_utf8mb4_bin, my_strnxfrm_unicode_full_bin, my_strnxfrmlen_unicode_full_bin, my_like_range_mb, @@ -8161,6 +7881,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_charlen_utf8mb4, my_well_formed_char_length_utf8mb4, my_copy_fix_mb, + my_wc_mb_utf8mb4, }; |