From 35d8ac350d97557d06edd1cffe7ecc28fc68930a Mon Sep 17 00:00:00 2001
From: Alexander Barkov <bar@mariadb.org>
Date: Mon, 6 Jul 2015 10:47:39 +0400
Subject: MDEV-8417 utf8mb4: compare broken bytes as "greater than any
 non-broken character"

---
 mysql-test/include/ctype_utf8mb4.inc     |  25 ++++-
 mysql-test/r/ctype_utf8mb4_heap.result   |  54 +++++++++-
 mysql-test/r/ctype_utf8mb4_innodb.result |  54 +++++++++-
 mysql-test/r/ctype_utf8mb4_myisam.result |  54 +++++++++-
 strings/ctype-utf8.c                     | 175 ++++++-------------------------
 strings/strcoll.ic                       |  13 +++
 unittest/strings/strings-t.c             |  49 +++++++++
 7 files changed, 275 insertions(+), 149 deletions(-)

diff --git a/mysql-test/include/ctype_utf8mb4.inc b/mysql-test/include/ctype_utf8mb4.inc
index 1971cc0c9a1..a1b7d144c5d 100644
--- a/mysql-test/include/ctype_utf8mb4.inc
+++ b/mysql-test/include/ctype_utf8mb4.inc
@@ -1802,5 +1802,28 @@ DROP TABLE t1;
 --echo #
 
 --echo #
---echo # End of tests
+--echo # ctype_utf8mb4.inc: Start of 10.1 tests
+--echo #
+
+--echo #
+--echo # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
+--echo #
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
+INSERT INTO t1 VALUES (0x61);
+INSERT INTO t1 VALUES (0xC280),(0xDFBF);
+INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
+INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
+SELECT HEX(a) FROM t1 ORDER BY a;
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+SELECT HEX(a) FROM t1 ORDER BY a;
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+DROP TABLE t1;
+
+--echo #
+--echo # ctype_utf8mb4.inc: End of 10.1 tests
+--echo #
+
+--echo #
+--echo # End of ctype_utf8mb4.inc
 --echo #
diff --git a/mysql-test/r/ctype_utf8mb4_heap.result b/mysql-test/r/ctype_utf8mb4_heap.result
index 52030d62047..78cfe1da597 100644
--- a/mysql-test/r/ctype_utf8mb4_heap.result
+++ b/mysql-test/r/ctype_utf8mb4_heap.result
@@ -2495,5 +2495,57 @@ DROP TABLE t1;
 # End of 5.5 tests
 #
 #
-# End of tests
+# ctype_utf8mb4.inc: Start of 10.1 tests
+#
+#
+# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
+#
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
+INSERT INTO t1 VALUES (0x61);
+INSERT INTO t1 VALUES (0xC280),(0xDFBF);
+INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
+INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
+SELECT HEX(a) FROM t1 ORDER BY a;
+HEX(a)
+61
+C280
+DFBF
+E0A080
+EFBFBF
+F0908080
+F48FBFBF
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+HEX(a)
+F48FBFBF
+F0908080
+EFBFBF
+E0A080
+DFBF
+C280
+61
+ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+SELECT HEX(a) FROM t1 ORDER BY a;
+HEX(a)
+61
+C280
+DFBF
+E0A080
+EFBFBF
+F0908080
+F48FBFBF
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+HEX(a)
+F48FBFBF
+F0908080
+EFBFBF
+E0A080
+DFBF
+C280
+61
+DROP TABLE t1;
+#
+# ctype_utf8mb4.inc: End of 10.1 tests
+#
+#
+# End of ctype_utf8mb4.inc
 #
diff --git a/mysql-test/r/ctype_utf8mb4_innodb.result b/mysql-test/r/ctype_utf8mb4_innodb.result
index 243c000b6c4..722c03bdff9 100644
--- a/mysql-test/r/ctype_utf8mb4_innodb.result
+++ b/mysql-test/r/ctype_utf8mb4_innodb.result
@@ -2642,5 +2642,57 @@ DROP TABLE t1;
 # End of 5.5 tests
 #
 #
-# End of tests
+# ctype_utf8mb4.inc: Start of 10.1 tests
+#
+#
+# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
+#
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
+INSERT INTO t1 VALUES (0x61);
+INSERT INTO t1 VALUES (0xC280),(0xDFBF);
+INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
+INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
+SELECT HEX(a) FROM t1 ORDER BY a;
+HEX(a)
+61
+C280
+DFBF
+E0A080
+EFBFBF
+F0908080
+F48FBFBF
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+HEX(a)
+F48FBFBF
+F0908080
+EFBFBF
+E0A080
+DFBF
+C280
+61
+ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+SELECT HEX(a) FROM t1 ORDER BY a;
+HEX(a)
+61
+C280
+DFBF
+E0A080
+EFBFBF
+F0908080
+F48FBFBF
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+HEX(a)
+F48FBFBF
+F0908080
+EFBFBF
+E0A080
+DFBF
+C280
+61
+DROP TABLE t1;
+#
+# ctype_utf8mb4.inc: End of 10.1 tests
+#
+#
+# End of ctype_utf8mb4.inc
 #
diff --git a/mysql-test/r/ctype_utf8mb4_myisam.result b/mysql-test/r/ctype_utf8mb4_myisam.result
index acdd6d36af7..f391f3fbba1 100644
--- a/mysql-test/r/ctype_utf8mb4_myisam.result
+++ b/mysql-test/r/ctype_utf8mb4_myisam.result
@@ -2642,5 +2642,57 @@ DROP TABLE t1;
 # End of 5.5 tests
 #
 #
-# End of tests
+# ctype_utf8mb4.inc: Start of 10.1 tests
+#
+#
+# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
+#
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
+INSERT INTO t1 VALUES (0x61);
+INSERT INTO t1 VALUES (0xC280),(0xDFBF);
+INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
+INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
+SELECT HEX(a) FROM t1 ORDER BY a;
+HEX(a)
+61
+C280
+DFBF
+E0A080
+EFBFBF
+F0908080
+F48FBFBF
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+HEX(a)
+F48FBFBF
+F0908080
+EFBFBF
+E0A080
+DFBF
+C280
+61
+ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+SELECT HEX(a) FROM t1 ORDER BY a;
+HEX(a)
+61
+C280
+DFBF
+E0A080
+EFBFBF
+F0908080
+F48FBFBF
+SELECT HEX(a) FROM t1 ORDER BY a DESC;
+HEX(a)
+F48FBFBF
+F0908080
+EFBFBF
+E0A080
+DFBF
+C280
+61
+DROP TABLE t1;
+#
+# ctype_utf8mb4.inc: End of 10.1 tests
+#
+#
+# End of ctype_utf8mb4.inc
 #
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index b77580a6ec3..2fc53e84b5c 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -85,7 +85,8 @@
                                        IS_CONTINUATION_BYTE(b3) && \
                                        (b0 >= 0xf1 || b1 >= 0x90) && \
                                        (b0 <= 0xf3 || b1 <= 0x8F))
-
+#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
+                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
 
 /* Convert individual bytes to Unicode code points */
 #define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) b0 & 0x1f) << 6)  |\
@@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
 }
 
 
-static int
-my_strnncoll_utf8mb4(CHARSET_INFO *cs,
-                     const uchar *s, size_t slen,
-                     const uchar *t, size_t tlen,
-                     my_bool t_is_prefix)
-{
-  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
-  const uchar *se= s + slen;
-  const uchar *te= t + tlen;
-  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
-  while ( s < se && t < te )
-  {
-    int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
-    int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
-
-    if ( s_res <= 0 || t_res <= 0 )
-    {
-      /* Incorrect string, compare bytewise */
-      return bincmp_utf8mb4(s, se, t, te);
-    }
-
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
-    if ( s_wc != t_wc )
-    {
-      return s_wc > t_wc ? 1 : -1;
-    }
-
-    s+= s_res;
-    t+= t_res;
-  }
-  return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
-}
-
-
-/**
-  
-  Compare strings, discarding end space
-
-  If one string is shorter as the other, then we space extend the other
-  so that the strings have equal length.
-
-  This will ensure that the following things hold:
-
-    "a"  == "a "
-    "a\0" < "a"
-    "a\0" < "a "
-
-  @param  cs        Character set pinter.
-  @param  a         First string to compare.
-  @param  a_length  Length of 'a'.
-  @param  b         Second string to compare.
-  @param  b_length  Length of 'b'.
-  @param  diff_if_only_endspace_difference
-                    Set to 1 if the strings should be regarded as different
-                    if they only difference in end space
-
-  @return Comparison result.
-    @retval Negative number, if a less than b.
-    @retval 0, if a is equal to b
-    @retval Positive number, if a > b
-*/
-
-static int
-my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
-                       const uchar *s, size_t slen,
-                       const uchar *t, size_t tlen,
-                       my_bool diff_if_only_endspace_difference)
-{
-  int res;
-  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
-  const uchar *se= s + slen, *te= t + tlen;
-  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
-  diff_if_only_endspace_difference= FALSE;
-#endif
-
-  while ( s < se && t < te )
-  {
-    int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
-    int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
-
-    if ( s_res <= 0 || t_res <= 0 )
-    {
-      /* Incorrect string, compare bytewise */
-      return bincmp_utf8mb4(s, se, t, te);
-    }
-
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
-    if ( s_wc != t_wc )
-    {
-      return s_wc > t_wc ? 1 : -1;
-    }
-
-    s+=s_res;
-    t+=t_res;
-  }
-
-  slen= (size_t) (se-s);
-  tlen= (size_t) (te-t);
-  res= 0;
-
-  if (slen != tlen)
-  {
-    int swap= 1;
-    if (diff_if_only_endspace_difference)
-      res= 1;                                   /* Assume 'a' is bigger */
-    if (slen < tlen)
-    {
-      slen= tlen;
-      s= t;
-      se= te;
-      swap= -1;
-      res= -res;
-    }
-    /*
-      This following loop uses the fact that in UTF-8
-      all multibyte characters are greater than space,
-      and all multibyte head characters are greater than
-      space. It means if we meet a character greater
-      than space, it always means that the longer string
-      is greater. So we can reuse the same loop from the
-      8bit version, without having to process full multibute
-      sequences.
-    */
-    for ( ; s < se; s++)
-    {
-      if (*s != ' ')
-	return (*s < ' ') ? -swap : swap;
-    }
-  }
-  return res;
-}
-
-
 /**
   Compare 0-terminated UTF8 strings.
 
@@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
 #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
 /* my_well_formed_char_length_utf8mb4 */
 
+
+#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf8mb4_general_ci
+#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
+#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x)) /* > U+10FFFF */
+#define WEIGHT_MB1(b0)           my_weight_mb1_utf8_general_ci(b0)
+#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf8_general_ci(b0,b1)
+#define WEIGHT_MB3(b0,b1,b2)     my_weight_mb3_utf8_general_ci(b0,b1,b2)
+/*
+  No code point to weight mapping exists for non-BMP characters here, so the
+  code point is the weight. NOTE(review): confirm strnxfrm orders them alike.
+*/
+#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) UTF8MB4_CODE(b0,b1,b2,b3))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf8mb4_bin
+#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x)) /* > U+10FFFF */
+#define WEIGHT_MB1(b0)           ((int) (uchar) (b0))
+#define WEIGHT_MB2(b0,b1)        ((int) UTF8MB2_CODE(b0,b1))
+#define WEIGHT_MB3(b0,b1,b2)     ((int) UTF8MB3_CODE(b0,b1,b2))
+#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) UTF8MB4_CODE(b0,b1,b2,b3))
+#include "strcoll.ic"
+
+
 static uint
 my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
 {
@@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs  __attribute__((unused)), uint c)
 static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
 {
   NULL,               /* init */
-  my_strnncoll_utf8mb4,
-  my_strnncollsp_utf8mb4,
+  my_strnncoll_utf8mb4_general_ci,
+  my_strnncollsp_utf8mb4_general_ci,
   my_strnxfrm_unicode,
   my_strnxfrmlen_unicode,
   my_like_range_mb,
@@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
 static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
 {
     NULL,		/* init */
-    my_strnncoll_mb_bin,
-    my_strnncollsp_mb_bin,
+    my_strnncoll_utf8mb4_bin,
+    my_strnncollsp_utf8mb4_bin,
     my_strnxfrm_unicode_full_bin,
     my_strnxfrmlen_unicode_full_bin,
     my_like_range_mb,
diff --git a/strings/strcoll.ic b/strings/strcoll.ic
index 31f610c4397..5f4ee615d84 100644
--- a/strings/strcoll.ic
+++ b/strings/strcoll.ic
@@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
   }
 #endif
 
+#ifdef IS_MB4_CHAR
+  if (str + 4 > end)                     /* Incomplete four-byte character */
+    goto bad;
+
+  if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
+  {
+    *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
+    return 4;                            /* A valid four-byte character */
+  }
+
+#endif
+
 bad:
   *weight= WEIGHT_ILSEQ(str[0]);         /* Bad byte */
   return 1;
@@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
 #undef WEIGHT_MB1
 #undef WEIGHT_MB2
 #undef WEIGHT_MB3
+#undef WEIGHT_MB4
 #undef WEIGHT_PAD_SPACE
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 6da7a0cc72f..4e9ca820981 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
 };
 
 
+STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
+{
+  /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"),        -1},  /* MB4 vs unused byte */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xC2"),        -1},  /* MB4 vs incomplete MB2 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\x7F"),-1},  /* MB4 vs broken MB3 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\xC0"),-1},  /* MB4 vs broken MB3 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0"),     -1}, /* MB4 vs incomplete MB3 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80"),-1},  /* MB4 vs incomplete MB4 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
+  {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
+
+  /* Maximum four-byte character: U+10FFFF == _utf8 0xF48FBFBF */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC0"),        -1},  /* MB4 vs unused byte */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC2"),        -1},  /* MB4 vs incomplete MB2 */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\x7F"),-1},  /* MB4 vs broken MB3 */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\xC0"),-1},  /* MB4 vs broken MB3 */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0"),     -1}, /* MB4 vs incomplete MB3 */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80"),-1},  /* MB4 vs incomplete MB4 */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
+  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
+
+  /* Broken MB4 vs incomplete/broken MB3 */
+  {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0"),    1},  /* Broken MB4 vs incomplete MB3 */
+  {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\x7F"),1},  /* Broken MB4 vs broken MB3 */
+  {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\xC0"),1},  /* Broken MB4 vs broken MB3 */
+
+  /*
+    Broken MB4 vs incomplete MB4:
+    The three leftmost bytes are compared bytewise; the fourth byte is then
+    compared to the auto-padded space.
+  */
+  {CSTR("\xF0\x90\x80\x1F"), CSTR("\xF0\x90\x80"),-1}, /* Broken MB4 vs incomplete MB4 */
+  {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80"),1},  /* Broken MB4 vs incomplete MB4 */
+
+  /* Broken MB4 vs broken MB4 */
+  {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\x7F"),-1},/* Broken MB4 vs broken MB4 */
+  {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\xC0"),-1},/* Broken MB4 vs broken MB4 */
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
 static void
 str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
 {
@@ -497,6 +540,12 @@ test_strcollsp()
   failed+= strcollsp(&my_charset_utf8_general_ci,          strcoll_utf8mb3_common);
   failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
   failed+= strcollsp(&my_charset_utf8_bin,                 strcoll_utf8mb3_common);
+#endif
+#ifdef HAVE_CHARSET_utf8mb4
+  failed+= strcollsp(&my_charset_utf8mb4_general_ci,          strcoll_utf8mb3_common);
+  failed+= strcollsp(&my_charset_utf8mb4_bin,                 strcoll_utf8mb3_common);
+  failed+= strcollsp(&my_charset_utf8mb4_general_ci,          strcoll_utf8mb4_common);
+  failed+= strcollsp(&my_charset_utf8mb4_bin,                 strcoll_utf8mb4_common);
 #endif
   return failed;
 }
-- 
cgit v1.2.1