From 35d8ac350d97557d06edd1cffe7ecc28fc68930a Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Mon, 6 Jul 2015 10:47:39 +0400 Subject: MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character" --- mysql-test/include/ctype_utf8mb4.inc | 25 ++++- mysql-test/r/ctype_utf8mb4_heap.result | 54 +++++++++- mysql-test/r/ctype_utf8mb4_innodb.result | 54 +++++++++- mysql-test/r/ctype_utf8mb4_myisam.result | 54 +++++++++- strings/ctype-utf8.c | 175 ++++++------------------------- strings/strcoll.ic | 13 +++ unittest/strings/strings-t.c | 49 +++++++++ 7 files changed, 275 insertions(+), 149 deletions(-) diff --git a/mysql-test/include/ctype_utf8mb4.inc b/mysql-test/include/ctype_utf8mb4.inc index 1971cc0c9a1..a1b7d144c5d 100644 --- a/mysql-test/include/ctype_utf8mb4.inc +++ b/mysql-test/include/ctype_utf8mb4.inc @@ -1802,5 +1802,28 @@ DROP TABLE t1; --echo # --echo # ---echo # End of tests +--echo # ctype_utf8mb4.inc: Start of 10.1 tests +--echo # + +--echo # +--echo # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character" +--echo # +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a)); +INSERT INTO t1 VALUES (0x61); +INSERT INTO t1 VALUES (0xC280),(0xDFBF); +INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF); +INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF); +SELECT HEX(a) FROM t1 ORDER BY a; +SELECT HEX(a) FROM t1 ORDER BY a DESC; +ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +SELECT HEX(a) FROM t1 ORDER BY a; +SELECT HEX(a) FROM t1 ORDER BY a DESC; +DROP TABLE t1; + +--echo # +--echo # ctype_utf8mb4.inc: End of 10.1 tests +--echo # + +--echo # +--echo # End of ctype_utf8mb4.inc --echo # diff --git a/mysql-test/r/ctype_utf8mb4_heap.result b/mysql-test/r/ctype_utf8mb4_heap.result index 52030d62047..78cfe1da597 100644 --- a/mysql-test/r/ctype_utf8mb4_heap.result +++ b/mysql-test/r/ctype_utf8mb4_heap.result @@ -2495,5 +2495,57 @@ DROP TABLE t1; # End of 5.5 tests # # -# End of tests +# ctype_utf8mb4.inc: Start of 10.1 tests +# +# +# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character" +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a)); +INSERT INTO t1 VALUES (0x61); +INSERT INTO t1 VALUES (0xC280),(0xDFBF); +INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF); +INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF); +SELECT HEX(a) FROM t1 ORDER BY a; +HEX(a) +61 +C280 +DFBF +E0A080 +EFBFBF +F0908080 +F48FBFBF +SELECT HEX(a) FROM t1 ORDER BY a DESC; +HEX(a) +F48FBFBF +F0908080 +EFBFBF +E0A080 +DFBF +C280 +61 +ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +SELECT HEX(a) FROM t1 ORDER BY a; +HEX(a) +61 +C280 +DFBF +E0A080 +EFBFBF +F0908080 +F48FBFBF +SELECT HEX(a) FROM t1 ORDER BY a DESC; +HEX(a) +F48FBFBF +F0908080 +EFBFBF +E0A080 +DFBF +C280 +61 +DROP TABLE t1; +# +# ctype_utf8mb4.inc: End of 10.1 tests +# +# +# End of ctype_utf8mb4.inc # diff --git a/mysql-test/r/ctype_utf8mb4_innodb.result b/mysql-test/r/ctype_utf8mb4_innodb.result index 243c000b6c4..722c03bdff9 100644 --- a/mysql-test/r/ctype_utf8mb4_innodb.result +++ b/mysql-test/r/ctype_utf8mb4_innodb.result @@ -2642,5 +2642,57 @@ DROP TABLE t1; # End of 5.5 tests # # -# End of tests +# ctype_utf8mb4.inc: Start of 10.1 tests +# +# +# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character" +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a)); +INSERT INTO t1 VALUES (0x61); +INSERT INTO t1 VALUES (0xC280),(0xDFBF); +INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF); +INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF); +SELECT HEX(a) FROM t1 ORDER BY a; +HEX(a) +61 +C280 +DFBF +E0A080 +EFBFBF +F0908080 +F48FBFBF +SELECT HEX(a) FROM t1 ORDER BY a DESC; +HEX(a) +F48FBFBF +F0908080 +EFBFBF +E0A080 +DFBF +C280 +61 +ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +SELECT HEX(a) FROM t1 ORDER BY a; +HEX(a) +61 +C280 +DFBF +E0A080 +EFBFBF +F0908080 +F48FBFBF +SELECT HEX(a) FROM t1 ORDER BY a DESC; +HEX(a) +F48FBFBF +F0908080 +EFBFBF +E0A080 +DFBF +C280 +61 +DROP TABLE t1; +# +# ctype_utf8mb4.inc: End of 10.1 tests +# +# +# End of ctype_utf8mb4.inc # diff --git a/mysql-test/r/ctype_utf8mb4_myisam.result b/mysql-test/r/ctype_utf8mb4_myisam.result index acdd6d36af7..f391f3fbba1 100644 --- a/mysql-test/r/ctype_utf8mb4_myisam.result +++ b/mysql-test/r/ctype_utf8mb4_myisam.result @@ -2642,5 +2642,57 @@ DROP TABLE t1; # End of 5.5 tests # # -# End of tests +# ctype_utf8mb4.inc: Start of 10.1 tests +# +# +# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character" +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a)); +INSERT INTO t1 VALUES (0x61); +INSERT INTO t1 VALUES (0xC280),(0xDFBF); +INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF); +INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF); +SELECT HEX(a) FROM t1 ORDER BY a; +HEX(a) +61 +C280 +DFBF +E0A080 +EFBFBF +F0908080 +F48FBFBF +SELECT HEX(a) FROM t1 ORDER BY a DESC; +HEX(a) +F48FBFBF +F0908080 +EFBFBF +E0A080 +DFBF +C280 +61 +ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +SELECT HEX(a) FROM t1 ORDER BY a; +HEX(a) +61 +C280 +DFBF +E0A080 +EFBFBF +F0908080 +F48FBFBF +SELECT HEX(a) FROM t1 ORDER BY a DESC; +HEX(a) +F48FBFBF +F0908080 +EFBFBF +E0A080 +DFBF +C280 +61 +DROP TABLE t1; +# +# ctype_utf8mb4.inc: End of 10.1 tests +# +# +# End of ctype_utf8mb4.inc # diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index b77580a6ec3..2fc53e84b5c 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -85,7 +85,8 @@ IS_CONTINUATION_BYTE(b3) && \ (b0 >= 0xf1 || b1 >= 0x90) && \ (b0 <= 0xf3 || b1 <= 0x8F)) - +#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \ + IS_UTF8MB4_STEP2(b0,b1,b2,b3)) /* Convert individual bytes to Unicode code points */ #define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\ @@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src) } -static int -my_strnncoll_utf8mb4(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s + slen; - const uchar *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - while ( s < se && t < te ) - { - int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se); - int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare bytewise */ - return bincmp_utf8mb4(s, se, t, te); - } - - my_tosort_unicode(uni_plane, &s_wc, cs->state); - my_tosort_unicode(uni_plane, &t_wc, cs->state); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t))); -} - - -/** - - Compare strings, discarding end space - - If one string is shorter as the other, then we space extend the other - so that the strings have equal length. - - This will ensure that the following things hold: - - "a" == "a " - "a\0" < "a" - "a\0" < "a " - - @param cs Character set pinter. - @param a First string to compare. - @param a_length Length of 'a'. - @param b Second string to compare. - @param b_length Length of 'b'. - @param diff_if_only_endspace_difference - Set to 1 if the strings should be regarded as different - if they only difference in end space - - @return Comparison result. - @retval Negative number, if a less than b. - @retval 0, if a is equal to b - @retval Positive number, if a > b -*/ - -static int -my_strnncollsp_utf8mb4(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference) -{ - int res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= FALSE; -#endif - - while ( s < se && t < te ) - { - int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se); - int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare bytewise */ - return bincmp_utf8mb4(s, se, t, te); - } - - my_tosort_unicode(uni_plane, &s_wc, cs->state); - my_tosort_unicode(uni_plane, &t_wc, cs->state); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+=s_res; - t+=t_res; - } - - slen= (size_t) (se-s); - tlen= (size_t) (te-t); - res= 0; - - if (slen != tlen) - { - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - if (slen < tlen) - { - slen= tlen; - s= t; - se= te; - swap= -1; - res= -res; - } - /* - This following loop uses the fact that in UTF-8 - all multibyte characters are greater than space, - and all multibyte head characters are greater than - space. It means if we meet a character greater - than space, it always means that the longer string - is greater. So we can reuse the same loop from the - 8bit version, without having to process full multibute - sequences. - */ - for ( ; s < se; s++) - { - if (*s != ' ') - return (*s < ' ') ? -swap : swap; - } - } - return res; -} - - /** Compare 0-terminated UTF8 strings. @@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN /* my_well_formed_char_length_utf8mb4 */ + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci +#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3) +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1) +#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2) +/* + There is no mapping between code point and weight for non-BMP characters + in utf8mb4_general_ci. Just using code point as weight. +*/ +#define WEIGHT_MB4(b0,b1,b2,b3) UTF8MB4_CODE(b0,b1,b2,b3) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(b0) ((int) (uchar) (b0)) +#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1)) +#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2)) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3)) +#include "strcoll.ic" + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { @@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c) static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= { NULL, /* init */ - my_strnncoll_utf8mb4, - my_strnncollsp_utf8mb4, + my_strnncoll_utf8mb4_general_ci, + my_strnncollsp_utf8mb4_general_ci, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_mb, @@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = { NULL, /* init */ - my_strnncoll_mb_bin, - my_strnncollsp_mb_bin, + my_strnncoll_utf8mb4_bin, + my_strnncollsp_utf8mb4_bin, my_strnxfrm_unicode_full_bin, my_strnxfrmlen_unicode_full_bin, my_like_range_mb, diff --git a/strings/strcoll.ic b/strings/strcoll.ic index 31f610c4397..5f4ee615d84 100644 --- a/strings/strcoll.ic +++ b/strings/strcoll.ic @@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) } #endif +#ifdef IS_MB4_CHAR + if (str + 4 > end) /* Incomplete four-byte character */ + goto bad; + + if (IS_MB4_CHAR(str[0], str[1], str[2], str[3])) + { + *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]); + return 4; /* A valid four-byte character */ + } + +#endif + bad: *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */ return 1; @@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), #undef WEIGHT_MB1 #undef WEIGHT_MB2 #undef WEIGHT_MB3 +#undef WEIGHT_MB4 #undef WEIGHT_PAD_SPACE diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c index 6da7a0cc72f..4e9ca820981 100644 --- a/unittest/strings/strings-t.c +++ b/unittest/strings/strings-t.c @@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]= }; +STRNNCOLL_PARAM strcoll_utf8mb4_common[]= +{ + /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xC2"), -1}, /* MB4 vs incomplete MB2 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\x7F"),-1}, /* MB4 vs broken MB3 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\xC0"),-1}, /* MB4 vs broken MB3 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0"), -1}, /* MB4 vs incomplete MB3 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80"),-1}, /* MB4 vs incomplete MB4 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */ + + /* Maximum four-byte character: U+10FFFF == _utf8 0xF48FBFBF */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC2"), -1}, /* MB4 vs incomplete MB2 */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\x7F"),-1}, /* MB4 vs broken MB3 */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\xC0"),-1}, /* MB4 vs broken MB3 */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0"), -1}, /* MB4 vs incomplete MB3 */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80"),-1}, /* MB4 vs incomplete MB4 */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */ + + /* Broken MB4 vs incomplete/broken MB3 */ + {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0"), 1}, /* Broken MB4 vs incomplete MB3 */ + {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\x7F"),1}, /* Broken MB4 vs broken MB3 */ + {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\xC0"),1}, /* Broken MB4 vs broken MB3 */ + + /* + Broken MB4 vs incomplete MB4: + The three leftmost bytes are compared binary, the fourth byte is compared + to auto-padded space. + */ + {CSTR("\xF0\x90\x80\x1F"), CSTR("\xF0\x90\x80"),-1}, /* Broken MB4 vs incomplete MB4 */ + {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80"),1}, /* Broken MB4 vs incomplete MB4 */ + + /* Broken MB4 vs broken MB4 */ + {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\x7F"),-1},/* Broken MB4 vs broken MB4 */ + {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\xC0"),-1},/* Broken MB4 vs broken MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + static void str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) { @@ -497,6 +540,12 @@ test_strcollsp() failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_bin, strcoll_utf8mb3_common); +#endif +#ifdef HAVE_CHARSET_utf8mb4 + failed+= strcollsp(&my_charset_utf8mb4_general_ci, strcoll_utf8mb3_common); + failed+= strcollsp(&my_charset_utf8mb4_bin, strcoll_utf8mb3_common); + failed+= strcollsp(&my_charset_utf8mb4_general_ci, strcoll_utf8mb4_common); + failed+= strcollsp(&my_charset_utf8mb4_bin, strcoll_utf8mb4_common); #endif return failed; } -- cgit v1.2.1