1 files changed, 975 insertions, 79 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 91f633e45ce..7de5cdd00ee 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -27,6 +27,16 @@
 #define EILSEQ ENOENT
 #endif
 
+/* Collation names: the MY_UTF8MB3 / MY_UTF8MB4 macro (defined elsewhere) plus a suffix. */
+#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
+#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
+#define MY_UTF8MB3_BIN        MY_UTF8MB3 "_bin"
+#define MY_UTF8MB4_GENERAL_CI MY_UTF8MB4 "_general_ci"
+#define MY_UTF8MB4_GENERAL_CS MY_UTF8MB4 "_general_cs"
+#define MY_UTF8MB4_BIN        MY_UTF8MB4 "_bin"
+
+
+
 #ifndef HAVE_CHARSET_utf8
 #define HAVE_CHARSET_utf8
 #endif
@@ -39,6 +49,14 @@
 #define HAVE_UNIDATA
 #endif
 
+#ifdef HAVE_CHARSET_utf16
+#define HAVE_UNIDATA
+#endif
+
+#ifdef HAVE_CHARSET_utf32
+#define HAVE_UNIDATA
+#endif
+
 #ifdef HAVE_UNIDATA
 
 #include "my_uctype.h"
@@ -1702,6 +1720,24 @@ MY_UNICASE_INFO *my_unicase_turkish[256]=
 };
 
 
+#define REPLACEMENT_CHAR 0xFFFD  /* weight for code points with page >= 256 */
+
+/* Replace *wc with its sort weight; a page without a table leaves *wc as is. */
+static inline void
+my_tosort_unicode(MY_UNICASE_INFO **uni_plane, my_wc_t *wc)
+{
+  int page= *wc >> 8;
+  if (page < 256)
+  {
+    if (uni_plane[page])
+      *wc= uni_plane[page][*wc & 0xFF].sort;
+  }
+  else
+  {
+    *wc= REPLACEMENT_CHAR;  /* uni_plane is only indexed for pages 0..255 */
+  }
+}
+
 
 /*
 ** Compare string against string with wildcard
@@ -1712,13 +1748,14 @@ MY_UNICASE_INFO *my_unicase_turkish[256]=
 **	 1 if matched with wildcard
 */
 
-int my_wildcmp_unicode(CHARSET_INFO *cs,
-		       const char *str,const char *str_end,
-		       const char *wildstr,const char *wildend,
-		       int escape, int w_one, int w_many,
-		       MY_UNICASE_INFO **weights)
+int
+my_wildcmp_unicode(CHARSET_INFO *cs,
+                   const char *str,const char *str_end,
+                   const char *wildstr,const char *wildend,
+                   int escape, int w_one, int w_many,
+                   MY_UNICASE_INFO **weights)
 {
-  int result= -1;			/* Not found, using wildcards */
+  int result= -1;                             /* Not found, using wildcards */
   my_wc_t s_wc, w_wc;
   int scan, plane;
   int (*mb_wc)(struct charset_info_st *, my_wc_t *,
@@ -1734,14 +1771,14 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
                        (const uchar*)wildend)) <= 0)
         return 1;
 
-      if (w_wc == (my_wc_t)w_many)
+      if (w_wc == (my_wc_t) w_many)
       {
-        result= 1;				/* Found an anchor char */
+        result= 1;                                /* Found an anchor char */
         break;
       }
 
       wildstr+= scan;
-      if (w_wc ==  (my_wc_t)escape && wildstr < wildend)
+      if (w_wc ==  (my_wc_t) escape && wildstr < wildend)
       {
         if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
                          (const uchar*)wildend)) <= 0)
@@ -1755,29 +1792,27 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
         return 1;
       str+= scan;
       
-      if (!escaped && w_wc == (my_wc_t)w_one)
+      if (!escaped && w_wc == (my_wc_t) w_one)
       {
-        result= 1;				/* Found an anchor char */
+        result= 1;                                /* Found an anchor char */
       }
       else
       {
         if (weights)
         {
-          plane=(s_wc>>8) & 0xFF;
-          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-          plane=(w_wc>>8) & 0xFF;
-          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+          my_tosort_unicode(weights, &s_wc);
+          my_tosort_unicode(weights, &w_wc);
         }
         if (s_wc != w_wc)
-          return 1;				/* No match */
+          return 1;                               /* No match */
       }
       if (wildstr == wildend)
-	return (str != str_end);		/* Match if both are at end */
+        return (str != str_end);                  /* Match if both are at end */
     }
     
     
-    if (w_wc == (my_wc_t)w_many)
-    {						/* Found w_many */
+    if (w_wc == (my_wc_t) w_many)
+    {                                             /* Found w_many */
     
       /* Remove any '%' and '_' from the wild search string */
       for ( ; wildstr != wildend ; )
@@ -1786,29 +1821,29 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
                          (const uchar*)wildend)) <= 0)
           return 1;
         
-	if (w_wc == (my_wc_t)w_many)
-	{
-	  wildstr+= scan;
-	  continue;
-	} 
-	
-	if (w_wc == (my_wc_t)w_one)
-	{
-	  wildstr+= scan;
+        if (w_wc == (my_wc_t)w_many)
+        {
+          wildstr+= scan;
+          continue;
+        } 
+        
+        if (w_wc == (my_wc_t)w_one)
+        {
+          wildstr+= scan;
           if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
                            (const uchar*)str_end)) <=0)
             return 1;
           str+= scan;
-	  continue;
-	}
-	break;					/* Not a wild character */
+          continue;
+        }
+        break;                                        /* Not a wild character */
       }
       
       if (wildstr == wildend)
-	return 0;				/* Ok if w_many is last */
+        return 0;                                /* Ok if w_many is last */
       
       if (str == str_end)
-	return -1;
+        return -1;
       
       if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
                        (const uchar*)wildend)) <=0)
@@ -1836,10 +1871,8 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
             return 1;
           if (weights)
           {
-            plane=(s_wc>>8) & 0xFF;
-            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-            plane=(w_wc>>8) & 0xFF;
-            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+            my_tosort_unicode(weights, &s_wc);
+            my_tosort_unicode(weights, &w_wc);
           }
           
           if (s_wc == w_wc)
@@ -1861,8 +1894,53 @@ int my_wildcmp_unicode(CHARSET_INFO *cs,
   return (str != str_end ? 1 : 0);
 }
 
-#endif
 
+/* Shared between utf8mb3/utf8mb4/ucs2/utf16/utf32. Writes two key bytes per
+   decoded character into dst, pads the rest with the key for space
+   (0x00 0x20) and always returns dstlen. */
+size_t
+my_strnxfrm_unicode(CHARSET_INFO *cs,
+                    uchar *dst, size_t dstlen,
+                    const uchar *src, size_t srclen)
+{
+  my_wc_t wc;
+  int res;
+  uchar *de= dst + dstlen;
+  uchar *de_beg= de - 1;              /* dst < de_beg: room for two bytes */
+  const uchar *se = src + srclen;
+  MY_UNICASE_INFO **uni_plane= (cs->state & MY_CS_BINSORT) ?
+                                NULL : cs->caseinfo;
+  LINT_INIT(wc);
+  DBUG_ASSERT(src);
+  
+  while (dst < de_beg)
+  {
+    if ((res= cs->cset->mb_wc(cs,&wc, src, se)) <= 0)
+      break;                          /* the decoder reported no character */
+    src+=res;
+    /* MY_CS_BINSORT collations keep the code point, others use its weight */
+    if (uni_plane)
+      my_tosort_unicode(uni_plane, &wc);
+    /* High byte first, then low byte; bits above 15 of wc are not stored */
+    *dst++= (uchar) (wc >> 8);
+    if (dst < de)
+      *dst++= (uchar) (wc & 0xFF);
+  }
+  
+  while (dst < de_beg) /* Fill the tail with keys for space character */
+  {
+    *dst++= 0x00;
+    *dst++= 0x20;
+  }
+  
+  if (dst < de)  /* Clear the last byte, if "dstlen" was an odd number */
+    *dst= 0x00;
+  
+  return dstlen;
+}
+
+
+#endif /* HAVE_UNIDATA */
 
 
 #ifdef HAVE_CHARSET_utf8
@@ -2569,44 +2647,6 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-static size_t my_strnxfrm_utf8(CHARSET_INFO *cs,
-                               uchar *dst, size_t dstlen,
-                               const uchar *src, size_t srclen)
-{
-  my_wc_t wc;
-  int res;
-  int plane;
-  uchar *de= dst + dstlen;
-  uchar *de_beg= de - 1;
-  const uchar *se = src + srclen;
-  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
-
-  while (dst < de_beg)
-  {
-    if ((res=my_utf8_uni(cs,&wc, src, se)) <= 0)
-      break;
-    src+=res;
-
-    plane=(wc>>8) & 0xFF;
-    wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc;
-
-    *dst++= (uchar)(wc >> 8);
-    *dst++= (uchar)(wc & 0xFF);
-    
-  }
-  
-  while (dst < de_beg) /* Fill the tail with keys for space character */
-  {
-    *dst++= 0x00;
-    *dst++= 0x20;
-  }
-  
-  if (dst < de)  /* Clear the last byte, if "dstlen" was an odd number */
-    *dst= 0x00;
-  
-  return dstlen;
-}
-
 static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
 {
   my_wc_t wc;
@@ -2642,7 +2682,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
     NULL,               /* init */
     my_strnncoll_utf8,
     my_strnncollsp_utf8,
-    my_strnxfrm_utf8,
+    my_strnxfrm_unicode,
     my_strnxfrmlen_utf8,
     my_like_range_mb,
     my_wildcmp_utf8,
@@ -2891,7 +2931,7 @@ static MY_COLLATION_HANDLER my_collation_cs_handler =
     NULL,		/* init */
     my_strnncoll_utf8_cs,
     my_strnncollsp_utf8_cs,
-    my_strnxfrm_utf8,
+    my_strnxfrm_unicode,
     my_strnxfrmlen_utf8,
     my_like_range_simple,
     my_wildcmp_mb,
@@ -4154,7 +4194,7 @@ static MY_COLLATION_HANDLER my_collation_filename_handler =
     NULL,               /* init */
     my_strnncoll_utf8,
     my_strnncollsp_utf8,
-    my_strnxfrm_utf8,
+    my_strnxfrm_unicode,
     my_strnxfrmlen_utf8,
     my_like_range_mb,
     my_wildcmp_utf8,
@@ -4284,3 +4324,859 @@ int main()
 
 
 
+#ifdef HAVE_CHARSET_utf8mb4
+
+/*
+  Bytes with a code above 127 are treated as letters.
+  This guarantees that word boundaries work correctly with regular
+  expressions. Note that byte 255 does not need to be marked as a
+  letter, because it is an illegal byte in UTF-8.
+*/
+static uchar ctype_utf8mb4[]=
+{
+    0,
+   32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
+   32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+   72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
+   16,129,129,129,129,129,129,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 16, 16, 16, 16, 16,
+   16,130,130,130,130,130,130,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 16, 16, 16, 16, 32,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  0
+};
+
+/* Byte-to-lowercase map: identity except 'A'..'Z' (65..90) -> 'a'..'z' (97..122). */
+static uchar to_lower_utf8mb4[]=
+{
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
+  112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
+   96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
+  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
+  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
+  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
+  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
+  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
+  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
+  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
+  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
+};
+
+/* Byte-to-uppercase map: identity except 'a'..'z' (97..122) -> 'A'..'Z' (65..90). */
+static uchar to_upper_utf8mb4[]=
+{
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+   96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
+  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
+  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
+  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
+  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
+  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
+  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
+  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
+};
+
+/* Bytewise compare of [s, se) and [t, te); on a common prefix the length decides. */
+static inline int
+bincmp_utf8mb4(const uchar *s, const uchar *se,
+               const uchar *t, const uchar *te)
+{
+  int s_bytes= (int) (se - s);
+  int t_bytes= (int) (te - t);
+  int diff= memcmp(s, t, min(s_bytes, t_bytes));
+  return diff != 0 ? diff : s_bytes - t_bytes;
+}
+
+/* Decode the UTF-8 sequence at s (bounded by e) into *pwc; returns its length or an error code. */
+static int
+my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+                 my_wc_t * pwc, const uchar *s, const uchar *e)
+{
+  uchar c;
+
+  if (s >= e)
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0x80)                 /* single byte, stored unchanged */
+  {
+    *pwc= c;
+    return 1;
+  }
+  else if (c < 0xc2)            /* 0x80..0xc1 is never accepted as a head byte */
+    return MY_CS_ILSEQ;
+  else if (c < 0xe0)            /* two-byte sequence */
+  {
+    if (s + 2 > e) /* We need 2 bytes */
+      return MY_CS_TOOSMALL2;
+
+    if (!((s[1] ^ 0x80) < 0x40))  /* true unless s[1] is in 0x80..0xbf */
+      return MY_CS_ILSEQ;
+
+    *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    return 2;
+  }
+  else if (c < 0xf0)            /* three-byte sequence */
+  {
+    if (s + 3 > e) /* We need 3 bytes */
+      return MY_CS_TOOSMALL3;
+
+    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+          (c >= 0xe1 || s[1] >= 0xa0)))   /* head 0xe0 needs s[1] >= 0xa0 */
+      return MY_CS_ILSEQ;
+
+    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
+          ((my_wc_t) (s[1] ^ 0x80) << 6) |
+           (my_wc_t) (s[2] ^ 0x80);
+    return 3;
+  }
+  else if (c < 0xf5)            /* four-byte sequence */
+  {
+    if (s + 4 > e) /* We need 4 bytes */
+      return MY_CS_TOOSMALL4;
+
+    /*
+      UTF-8 quick four-byte mask:
+      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      Encoding allows to encode U+00010000..U+001FFFFF
+      
+      The maximum character defined in the Unicode standard is U+0010FFFF.
+      Higher characters U+00110000..U+001FFFFF are not used.
+      
+      11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+      11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+      
+      Valid codes:
+      [F0][90..BF][80..BF][80..BF]
+      [F1][80..BF][80..BF][80..BF]
+      [F2][80..BF][80..BF][80..BF]
+      [F3][80..BF][80..BF][80..BF]
+      [F4][80..8F][80..BF][80..BF]
+    */
+
+    if (!((s[1] ^ 0x80) < 0x40 &&
+          (s[2] ^ 0x80) < 0x40 &&
+          (s[3] ^ 0x80) < 0x40 &&
+          (c >= 0xf1 || s[1] >= 0x90) &&
+          (c <= 0xf3 || s[1] <= 0x8F)))
+      return MY_CS_ILSEQ;
+    *pwc = ((my_wc_t) (c & 0x07) << 18)    |
+           ((my_wc_t) (s[1] ^ 0x80) << 12) |
+           ((my_wc_t) (s[2] ^ 0x80) << 6)  |
+            (my_wc_t) (s[3] ^ 0x80);
+    return 4;
+  }
+  return MY_CS_ILSEQ;           /* head bytes 0xf5..0xff */
+}
+
+
+/*
+  Same decoding as my_mb_wc_utf8mb4(), but without an end pointer. A 0 byte
+  fails the trail-byte test, so a null-terminated string is not overrun.
+*/
+static int
+my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
+                          my_wc_t *pwc, const uchar *s)
+{
+  uchar c;
+
+  c= s[0];
+  if (c < 0x80)                 /* single byte, stored unchanged */
+  {
+    *pwc = c;
+    return 1;
+  }
+
+  if (c < 0xc2)                 /* 0x80..0xc1 is never accepted as a head byte */
+    return MY_CS_ILSEQ;
+
+  if (c < 0xe0)                 /* two-byte sequence */
+  {
+    if (!((s[1] ^ 0x80) < 0x40))
+      return MY_CS_ILSEQ;
+
+    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    return 2;
+  }
+  
+  if (c < 0xf0)                 /* three-byte sequence */
+  {
+    if (!((s[1] ^ 0x80) < 0x40 &&
+          (s[2] ^ 0x80) < 0x40 &&
+          (c >= 0xe1 || s[1] >= 0xa0)))   /* head 0xe0 needs s[1] >= 0xa0 */
+      return MY_CS_ILSEQ;
+    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
+          ((my_wc_t) (s[1] ^ 0x80) << 6) |
+           (my_wc_t) (s[2] ^ 0x80);
+
+    return 3;
+  }
+  else if (c < 0xf5)            /* four-byte sequence */
+  {
+    if (!((s[1] ^ 0x80) < 0x40 &&
+          (s[2] ^ 0x80) < 0x40 &&
+          (s[3] ^ 0x80) < 0x40 &&
+          (c >= 0xf1 || s[1] >= 0x90) &&  /* head 0xf0 needs s[1] >= 0x90 */
+          (c <= 0xf3 || s[1] <= 0x8F)))   /* head 0xf4 needs s[1] <= 0x8f */
+      return MY_CS_ILSEQ;
+    *pwc = ((my_wc_t) (c & 0x07) << 18)    |
+           ((my_wc_t) (s[1] ^ 0x80) << 12) |
+           ((my_wc_t) (s[2] ^ 0x80) << 6)  |
+            (my_wc_t) (s[3] ^ 0x80);
+    return 4;
+  }
+  return MY_CS_ILSEQ;           /* head bytes 0xf5..0xff */
+}
+
+/* Encode wc as UTF-8 into [r, e); returns the byte count or an error code. */
+static int
+my_wc_mb_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+                 my_wc_t wc, uchar *r, uchar *e)
+{
+  int count;
+
+  if (r >= e)
+    return MY_CS_TOOSMALL;
+
+  if (wc < 0x80)
+    count= 1;
+  else if (wc < 0x800)
+    count= 2;
+  else if (wc < 0x10000)
+    count= 3;
+  else if (wc < 0x200000)       /* wider than the decoder, which stops at U+10FFFF */
+    count= 4;
+  else return MY_CS_ILUNI;
+
+  if (r + count > e)
+    return MY_CS_TOOSMALLN(count);
+
+  switch (count) {
+    /* Fall through all cases!!! Bytes are written from the last to the first. */
+    case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
+    case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
+    case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
+    case 1: r[0] = (uchar) wc;
+  }
+  return count;
+}
+
+
+/*
+  Same encoding as my_wc_mb_utf8mb4(), but the output buffer is not bounds-checked.
+*/
+static int
+my_wc_mb_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
+                          my_wc_t wc, uchar *r)
+{
+  int count;
+
+  if (wc < 0x80)
+    count= 1;
+  else if (wc < 0x800)
+    count= 2;
+  else if (wc < 0x10000)
+    count= 3;
+  else if (wc < 0x200000)
+    count= 4;
+  else
+    return MY_CS_ILUNI;
+
+  switch (count)
+  {
+    /* Fall through all cases!!! Bytes are written from the last to the first. */
+    case 4: r[3]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0x10000;
+    case 3: r[2]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0x800;
+    case 2: r[1]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0xc0;
+    case 1: r[0]= (uchar) wc;
+  }
+  return count;
+}
+
+/* Lowercase *wc in place when its page (*wc >> 8) is below 256 and has a table. */
+static inline void
+my_tolower_utf8mb4(MY_UNICASE_INFO **uni_plane, my_wc_t *wc)
+{
+  int hi= *wc >> 8;
+  MY_UNICASE_INFO *tab= (hi < 256) ? uni_plane[hi] : NULL;
+  *wc= tab ? tab[*wc & 0xFF].tolower : *wc;
+}
+
+/* Uppercase *wc in place when its page (*wc >> 8) is below 256 and has a table. */
+static inline void
+my_toupper_utf8mb4(MY_UNICASE_INFO **uni_plane, my_wc_t *wc)
+{
+  int hi= *wc >> 8;
+  MY_UNICASE_INFO *tab= (hi < 256) ? uni_plane[hi] : NULL;
+  *wc= tab ? tab[*wc & 0xFF].toupper : *wc;
+}
+
+/* Uppercase src[0..srclen) into dst[0..dstlen); returns the number of bytes written. */
+static size_t
+my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen,
+                  char *dst, size_t dstlen)
+{
+  my_wc_t ch;
+  int nread, nwritten;
+  char *src_end= src + srclen, *dst_end= dst + dstlen, *dst_start= dst;
+  MY_UNICASE_INFO **pages= cs->caseinfo;
+  DBUG_ASSERT(src != dst || cs->caseup_multiply == 1);
+
+  while (src < src_end)
+  {
+    if ((nread= my_mb_wc_utf8mb4(cs, &ch, (uchar *) src, (uchar *) src_end)) <= 0)
+      break;
+    my_toupper_utf8mb4(pages, &ch);
+    if ((nwritten= my_wc_mb_utf8mb4(cs, ch, (uchar *) dst, (uchar *) dst_end)) <= 0)
+      break;
+    src+= nread;
+    dst+= nwritten;
+  }
+  return (size_t) (dst - dst_start);
+}
+
+/* Mix ch into the running hash value *n1 and advance the step counter *n2 by 3. */
+static inline void
+my_hash_add(ulong *n1, ulong *n2, uint ch)
+{
+  *n1^= ((*n1 & 63) + *n2) * ch + (*n1 << 8);
+  *n2+= 3;
+}
+
+/* Hash s[0..slen) by sort weight, ignoring trailing spaces; updates *n1 and *n2. */
+static void
+my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
+                     ulong *n1, ulong *n2)
+{
+  my_wc_t wc;
+  int res;
+  const uchar *e= s + slen;
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+
+  /*
+    Remove end space. We do this to be able to compare
+    'A ' and 'A' as identical
+  */
+  while (e > s && e[-1] == ' ')
+    e--;
+
+  while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
+  {
+    my_tosort_unicode(uni_plane, &wc);
+    my_hash_add(n1, n2, (uint) (wc & 0xFF));        /* low byte first */
+    my_hash_add(n1, n2, (uint) (wc >> 8)  & 0xFF);
+    if (wc > 0xFFFF)
+    {
+      /*
+        Put the highest byte only if it is non-zero, to keep the hash
+        functions for utf8mb3 and utf8mb4 compatible for BMP characters
+        (useful to keep record order in test results, e.g. "SHOW GRANTS").
+        NOTE(review): my_tosort_unicode() has already replaced every wc
+        with a page >= 256, so this branch may be unreachable - confirm.
+      */
+      my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF);
+    }
+    s+= res;
+  }
+}
+
+/* Uppercase the 0-terminated string src in place; returns the new length in bytes. */
+static size_t
+my_caseup_str_utf8mb4(CHARSET_INFO *cs, char *src)
+{
+  my_wc_t wc;
+  int srcres, dstres;
+  char *dst= src, *dst0= src;          /* output is written over the input */
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+  DBUG_ASSERT(cs->caseup_multiply == 1);
+
+  while (*src &&
+         (srcres= my_mb_wc_utf8mb4_no_range(cs, &wc, (uchar *) src)) > 0)
+  {
+    my_toupper_utf8mb4(uni_plane, &wc);
+    if ((dstres= my_wc_mb_utf8mb4_no_range(cs, wc, (uchar*) dst)) <= 0)
+      break;
+    src+= srcres;
+    dst+= dstres;
+  }
+  *dst= '\0';                          /* conversion also stops at a bad sequence */
+  return (size_t) (dst - dst0);
+}
+
+/* Lowercase src[0..srclen) into dst[0..dstlen); returns the number of bytes written. */
+static size_t
+my_casedn_utf8mb4(CHARSET_INFO *cs,
+                  char *src, size_t srclen,
+                  char *dst, size_t dstlen)
+{
+  my_wc_t ch;
+  int nread, nwritten;
+  char *src_end= src + srclen, *dst_end= dst + dstlen, *dst_start= dst;
+  MY_UNICASE_INFO **pages= cs->caseinfo;
+  DBUG_ASSERT(src != dst || cs->casedn_multiply == 1);
+
+  while (src < src_end)
+  {
+    if ((nread= my_mb_wc_utf8mb4(cs, &ch, (uchar *) src, (uchar *) src_end)) <= 0)
+      break;
+    my_tolower_utf8mb4(pages, &ch);
+    if ((nwritten= my_wc_mb_utf8mb4(cs, ch, (uchar *) dst, (uchar *) dst_end)) <= 0)
+      break;
+    src+= nread;
+    dst+= nwritten;
+  }
+  return (size_t) (dst - dst_start);
+}
+
+/* Lowercase the 0-terminated string src in place; returns the new length in bytes. */
+static size_t
+my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
+{
+  my_wc_t wc;
+  int srcres, dstres;
+  char *dst= src, *dst0= src;          /* output is written over the input */
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+  DBUG_ASSERT(cs->casedn_multiply == 1);
+
+  while (*src &&
+         (srcres= my_mb_wc_utf8mb4_no_range(cs, &wc, (uchar *) src)) > 0)
+  {
+    my_tolower_utf8mb4(uni_plane, &wc);
+    if ((dstres= my_wc_mb_utf8mb4_no_range(cs, wc, (uchar*) dst)) <= 0)
+      break;
+    src+= srcres;
+    dst+= dstres;
+  }
+
+  /*
+   In rare cases the lowercased string can be shorter than
+   the original string, for example:
+
+   "U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE"
+   (which is 0xC4B0 in utf8, i.e. two bytes)
+
+   is converted into
+
+   "U+0069 LATIN SMALL LETTER I"
+   (which is 0x69 in utf8, i.e. one byte)
+
+   So, we need to put the '\0' terminator after converting.
+  */
+
+  *dst= '\0';
+  return (size_t) (dst - dst0);
+}
+
+/* Compare s and t by per-character sort weight; undecodable input is compared bytewise. */
+static int
+my_strnncoll_utf8mb4(CHARSET_INFO *cs,
+                     const uchar *s, size_t slen,
+                     const uchar *t, size_t tlen,
+                     my_bool t_is_prefix)
+{
+  my_wc_t s_weight, t_weight;
+  const uchar *s_end= s + slen;
+  const uchar *t_end= t + tlen;
+  MY_UNICASE_INFO **pages= cs->caseinfo;
+  LINT_INIT(s_weight);
+  LINT_INIT(t_weight);
+
+  while (s < s_end && t < t_end)
+  {
+    int s_len= my_mb_wc_utf8mb4(cs, &s_weight, s, s_end);
+    int t_len= my_mb_wc_utf8mb4(cs, &t_weight, t, t_end);
+
+    /* Either side could not be decoded: compare the remaining bytes */
+    if (s_len <= 0 || t_len <= 0)
+      return bincmp_utf8mb4(s, s_end, t, t_end);
+
+    my_tosort_unicode(pages, &s_weight);
+    my_tosort_unicode(pages, &t_weight);
+
+    if (s_weight > t_weight)
+      return 1;
+    if (s_weight < t_weight)
+      return -1;
+
+    s+= s_len;
+    t+= t_len;
+  }
+  if (t_is_prefix)
+    return (int) (t - t_end);
+  return (int) ((s_end - s) - (t_end - t));
+}
+
+
+/**
+  
+  Compare strings, discarding end space
+
+  If one string is shorter than the other, the shorter one is treated as
+  if it were extended with spaces to the length of the longer one.
+
+  This will ensure that the following things hold:
+
+    "a"  == "a "
+    "a\0" < "a"
+    "a\0" < "a "
+
+  @param  cs        Character set pointer.
+  @param  s         First string to compare.
+  @param  slen      Length of 's' in bytes.
+  @param  t         Second string to compare.
+  @param  tlen      Length of 't' in bytes.
+  @param  diff_if_only_endspace_difference
+                    Set to 1 if the strings should be regarded as different
+                    if they differ only in end space
+
+  @return Comparison result.
+    @retval Negative number, if s is less than t.
+    @retval 0, if s is equal to t
+    @retval Positive number, if s > t
+*/
+
+static int
+my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
+                       const uchar *s, size_t slen,
+                       const uchar *t, size_t tlen,
+                       my_bool diff_if_only_endspace_difference)
+{
+  int res;
+  my_wc_t s_wc, t_wc;
+  const uchar *se= s + slen, *te= t + tlen;
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+  LINT_INIT(s_wc);
+  LINT_INIT(t_wc);
+
+#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
+  diff_if_only_endspace_difference= FALSE;
+#endif
+
+  while ( s < se && t < te )
+  {
+    int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
+    int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
+
+    if ( s_res <= 0 || t_res <= 0 )
+    {
+      /* Incorrect string, compare bytewise */
+      return bincmp_utf8mb4(s, se, t, te);
+    }
+
+    my_tosort_unicode(uni_plane, &s_wc);
+    my_tosort_unicode(uni_plane, &t_wc);
+
+    if ( s_wc != t_wc )
+    {
+      return s_wc > t_wc ? 1 : -1;
+    }
+
+    s+=s_res;
+    t+=t_res;
+  }
+
+  slen= (size_t) (se-s);               /* bytes left in s after the common part */
+  tlen= (size_t) (te-t);               /* bytes left in t after the common part */
+  res= 0;
+
+  if (slen != tlen)
+  {
+    int swap= 1;
+    if (diff_if_only_endspace_difference)
+      res= 1;                                   /* Assume 's' is bigger */
+    if (slen < tlen)
+    {
+      slen= tlen;                               /* scan the tail of t instead */
+      s= t;
+      se= te;
+      swap= -1;
+      res= -res;
+    }
+    /*
+      This following loop uses the fact that in UTF-8
+      all multibyte characters are greater than space,
+      and all multibyte head characters are greater than
+      space. It means if we meet a character greater
+      than space, it always means that the longer string
+      is greater. So we can reuse the same loop from the
+      8bit version, without having to process full multibyte
+      sequences.
+    */
+    for ( ; s < se; s++)
+    {
+      if (*s != ' ')
+	return (*s < ' ') ? -swap : swap;
+    }
+  }
+  return res;
+}
+
+
+/**
+  Compare 0-terminated UTF8 strings, ignoring letter case.
+
+  @param  cs                  character set handler
+  @param  s                   First 0-terminated string to compare
+  @param  t                   Second 0-terminated string to compare
+
+  @return Comparison result.
+    @retval negative number if s < t
+    @retval positive number if s > t
+    @retval 0 if the strings are equal
+*/
+
+static int
+my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t)
+{
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+  while (s[0] && t[0])
+  {
+    my_wc_t s_wc,t_wc;
+    const char *s_char= s;      /* Start of the current character of s */
+
+    if ((uchar) s[0] < 128)
+    {
+      /* 
+        s[0] is between 0 and 127.
+        It represents a single byte character.
+        Convert it to lower case using the table for page 0.
+      */
+      s_wc= plane00[(uchar) s[0]].tolower;
+      s++;
+    }
+    else
+    {
+      int res= my_mb_wc_utf8mb4_no_range(cs, &s_wc, (const uchar*) s);
+      
+      /* 
+         In the case of wrong multibyte sequence we will
+         call strcmp() for byte-to-byte comparison.
+      */
+      if (res <= 0)
+        return strcmp(s, t);
+      s+= res;
+      
+      my_tolower_utf8mb4(uni_plane, &s_wc);
+    }
+    
+    /* Do the same for the second string */
+    
+    if ((uchar) t[0] < 128)
+    {
+      /* Convert single byte character to lower case */
+      t_wc= plane00[(uchar) t[0]].tolower;
+      t++;
+    }
+    else
+    {
+      int res= my_mb_wc_utf8mb4_no_range(cs, &t_wc, (const uchar*) t);
+      if (res <= 0)
+        return strcmp(s_char, t);  /* s has moved on; compare from s_char */
+      t+= res;
+      
+      my_tolower_utf8mb4(uni_plane, &t_wc);
+    }
+    
+    /* Now we have two lower-cased code points, let's compare them */
+    if ( s_wc != t_wc )
+      return  ((int) s_wc) - ((int) t_wc);
+  }
+  return ((int) (uchar) s[0]) - ((int) (uchar) t[0]);
+}
+
+/* Wildcard match for utf8mb4: calls my_wildcmp_unicode() with cs->caseinfo as weights. */
+static int
+my_wildcmp_utf8mb4(CHARSET_INFO *cs,
+                   const char *str, const char *strend,
+                   const char *wildstr, const char *wildend,
+                   int escape, int w_one, int w_many)
+{
+  return my_wildcmp_unicode(cs, str, strend, wildstr, wildend,
+                            escape, w_one, w_many, cs->caseinfo); 
+}
+
+/* Returns (len * 2 + 2) / 4; presumably the sort-key size for len bytes - confirm. */
+static size_t
+my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
+{
+  /* TODO: fix when working on WL "Unicode new version" */
+  return (len * 2 + 2) / 4;
+}
+
+/* Returns the byte length of the character at b if it is multi-byte (> 1), else 0. */
+static uint
+my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
+{
+  my_wc_t decoded;
+  int nbytes= my_mb_wc_utf8mb4(cs, &decoded, (const uchar *) b, (const uchar *) e);
+  return nbytes > 1 ? (uint) nbytes : 0;
+}
+
+/* Sequence length implied by head byte c: 1..4, or 0 if c cannot start a character. */
+static uint
+my_mbcharlen_utf8mb4(CHARSET_INFO *cs  __attribute__((unused)), uint c)
+{
+  if (c < 0x80)
+    return 1;
+  if (c < 0xc2)
+    return 0; /* Illegal mb head */
+  if (c < 0xe0)
+    return 2;
+  if (c < 0xf0)
+    return 3;
+  if (c < 0xf8)  /* wider than my_mb_wc_utf8mb4(), which rejects 0xf5..0xf7 */
+    return 4;
+  return 0; /* Illegal mb head */
+}
+
+/* Collation handler referenced by my_charset_utf8mb4_general_ci. */
+static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
+{
+  NULL,               /* init */
+  my_strnncoll_utf8mb4,
+  my_strnncollsp_utf8mb4,
+  my_strnxfrm_unicode,
+  my_strnxfrmlen_utf8mb4,
+  my_like_range_mb,
+  my_wildcmp_utf8mb4,
+  my_strcasecmp_utf8mb4,
+  my_instr_mb,
+  my_hash_sort_utf8mb4,
+  my_propagate_complex
+};
+
+/* Collation handler referenced by my_charset_utf8mb4_bin; mostly generic *_mb_bin routines. */
+static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
+{
+    NULL,		/* init */
+    my_strnncoll_mb_bin,
+    my_strnncollsp_mb_bin,
+    my_strnxfrm_unicode,
+    my_strnxfrmlen_utf8mb4,
+    my_like_range_mb,
+    my_wildcmp_mb_bin,
+    my_strcasecmp_mb_bin,
+    my_instr_mb,
+    my_hash_sort_mb_bin,
+    my_propagate_simple
+};
+
+/* Character-set handler referenced by both utf8mb4 CHARSET_INFO definitions. */
+MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
+{
+  NULL,               /* init */
+  my_ismbchar_utf8mb4,
+  my_mbcharlen_utf8mb4,
+  my_numchars_mb,
+  my_charpos_mb,
+  my_well_formed_len_mb,
+  my_lengthsp_8bit,
+  my_numcells_mb,
+  my_mb_wc_utf8mb4,
+  my_wc_mb_utf8mb4,
+  my_mb_ctype_mb,
+  my_caseup_str_utf8mb4,
+  my_casedn_str_utf8mb4,
+  my_caseup_utf8mb4,
+  my_casedn_utf8mb4,
+  my_snprintf_8bit,
+  my_long10_to_str_8bit,
+  my_longlong10_to_str_8bit,
+  my_fill_8bit,
+  my_strntol_8bit,
+  my_strntoul_8bit,
+  my_strntoll_8bit,
+  my_strntoull_8bit,
+  my_strntod_8bit,
+  my_strtoll10_8bit,
+  my_strntoull10rnd_8bit,
+  my_scan_8bit
+};
+
+
+/* utf8mb4_general_ci: number 45, flagged MY_CS_PRIMARY, 1..4 bytes per character. */
+CHARSET_INFO my_charset_utf8mb4_general_ci=
+{
+  45,0,0,              /* number       */
+  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT,  /* state  */
+  MY_UTF8MB4,         /* cs name      */
+  MY_UTF8MB4_GENERAL_CI,/* name       */
+  "UTF-8 Unicode",    /* comment      */
+  NULL,               /* tailoring    */
+  ctype_utf8mb4,      /* ctype        */
+  to_lower_utf8mb4,   /* to_lower     */
+  to_upper_utf8mb4,   /* to_upper     */
+  to_upper_utf8mb4,   /* sort_order   */
+  NULL,               /* contractions */
+  NULL,               /* sort_order_big*/
+  NULL,               /* tab_to_uni   */
+  NULL,               /* tab_from_uni */
+  my_unicase_default, /* caseinfo     */
+  NULL,               /* state_map    */
+  NULL,               /* ident_map    */
+  1,                  /* strxfrm_multiply */
+  1,                  /* caseup_multiply  */
+  1,                  /* casedn_multiply  */
+  1,                  /* mbminlen     */
+  4,                  /* mbmaxlen     */
+  0,                  /* min_sort_char */
+  0xFFFF,             /* max_sort_char */
+  ' ',                /* pad char      */
+  0,                  /* escape_with_backslash_is_dangerous */
+  &my_charset_utf8mb4_handler,
+  &my_collation_utf8mb4_general_ci_handler
+};
+
+/* utf8mb4_bin: number 46, flagged MY_CS_BINSORT, 1..4 bytes per character. */
+CHARSET_INFO my_charset_utf8mb4_bin=
+{
+  46,0,0,             /* number       */
+  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state  */
+  MY_UTF8MB4,         /* cs name      */
+  MY_UTF8MB4_BIN,     /* name         */
+  "UTF-8 Unicode",    /* comment      */
+  NULL,               /* tailoring    */
+  ctype_utf8mb4,      /* ctype        */
+  to_lower_utf8mb4,   /* to_lower     */
+  to_upper_utf8mb4,   /* to_upper     */
+  NULL,               /* sort_order   */
+  NULL,               /* contractions */
+  NULL,               /* sort_order_big*/
+  NULL,               /* tab_to_uni   */
+  NULL,               /* tab_from_uni */
+  my_unicase_default, /* caseinfo     */
+  NULL,               /* state_map    */
+  NULL,               /* ident_map    */
+  1,                  /* strxfrm_multiply */
+  1,                  /* caseup_multiply  */
+  1,                  /* casedn_multiply  */
+  1,                  /* mbminlen     */
+  4,                  /* mbmaxlen     */
+  0,                  /* min_sort_char */
+  0xFFFF,             /* max_sort_char */
+  ' ',                /* pad char      */
+  0,                  /* escape_with_backslash_is_dangerous */
+  &my_charset_utf8mb4_handler,
+  &my_collation_utf8mb4_bin_handler
+};
+
+#endif /* HAVE_CHARSET_utf8mb4 */