10.0-base merge

author: Sergei Golubchik <sergii@pisem.net> 2014-02-26 15:28:07 +0100
committer: Sergei Golubchik <sergii@pisem.net> 2014-02-26 15:28:07 +0100
commit: 0dc23679c867629ded5f9534d2ab6e8edf238aa0 (patch)
tree: 9cf966507fa2ef0fd17932b600d051df5f7bd2e5 /strings/ctype-utf8.c
parent: 6efa5efa7dd112b6ac2efdd84235a13cca51c4d4 (diff)
parent: 0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2 (diff)
download: mariadb-git-0dc23679c867629ded5f9534d2ab6e8edf238aa0.tar.gz
1 files changed, 231 insertions, 36 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index e564a85c828..aba179b154c 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -26,6 +26,7 @@
 #define EILSEQ ENOENT
 #endif
 
+#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) /* true when unsigned byte c is in 0x80..0xBF */
 
 #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
 #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@@ -56,6 +57,46 @@
 #define HAVE_UNIDATA
 #endif
 
+
+#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
+
+static inline /* Validate one 1..3-byte UTF-8 sequence at s, bounded by e; nothing is decoded. */
+int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
+{ /* Returns 1, 2 or 3 on success, otherwise MY_CS_ILSEQ or MY_CS_TOOSMALL2/3. */
+  uchar c;
+
+  DBUG_ASSERT(s < e); /* the caller must pass a non-empty range */
+  c= s[0];
+  if (c < 0x80) /* single-byte character */
+    return 1;
+
+  if (c < 0xc2) /* 0x80..0xC1: continuation byte or overlong two-byte lead */
+    return MY_CS_ILSEQ;
+
+  if (c < 0xe0) /* lead byte 0xC2..0xDF: two-byte sequence */
+  {
+    if (s+2 > e) /* We need 2 characters */
+      return MY_CS_TOOSMALL2;
+
+    if (!(IS_CONTINUATION_BYTE(s[1])))
+      return MY_CS_ILSEQ;
+
+    return 2;
+  }
+
+  DBUG_ASSERT(c < 0xf0); /* the callers in this file only dispatch here when c < 0xF0 */
+  if (s+3 > e) /* We need 3 characters */
+    return MY_CS_TOOSMALL3;
+
+  if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
+          (c >= 0xe1 || s[1] >= 0xa0))) /* with lead 0xE0 the second byte must be >= 0xA0 */
+    return MY_CS_ILSEQ;
+
+  return 3;
+}
+
+#endif  /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
+
 #ifdef HAVE_UNIDATA
 
 #include "my_uctype.h"
@@ -4806,7 +4847,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
     if (s+2 > e) /* We need 2 characters */
       return MY_CS_TOOSMALL2;
 
-    if (!((s[1] ^ 0x80) < 0x40))
+    if (!(IS_CONTINUATION_BYTE(s[1])))
       return MY_CS_ILSEQ;
 
     *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -4817,7 +4858,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
     if (s+3 > e) /* We need 3 characters */
       return MY_CS_TOOSMALL3;
 
-    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
           (c >= 0xe1 || s[1] >= 0xa0)))
       return MY_CS_ILSEQ;
 
@@ -4833,9 +4874,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
     if (s+4 > e) /* We need 4 characters */
       return MY_CS_TOOSMALL4;
 
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
           (c >= 0xf1 || s[1] >= 0x90)))
       return MY_CS_ILSEQ;
 
@@ -4851,10 +4892,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
     if (s+5 >e) /* We need 5 characters */
       return MY_CS_TOOSMALL5;
 
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
-          (s[4] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
           (c >= 0xf9 || s[1] >= 0x88)))
       return MY_CS_ILSEQ;
 
@@ -4870,11 +4911,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
     if ( s+6 >e ) /* We need 6 characters */
       return MY_CS_TOOSMALL6;
 
-    if (!((s[1] ^ 0x80) < 0x40   &&
-          (s[2] ^ 0x80) < 0x40   &&
-          (s[3] ^ 0x80) < 0x40   &&
-          (s[4] ^ 0x80) < 0x40   &&
-          (s[5] ^ 0x80) < 0x40   &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
+          IS_CONTINUATION_BYTE(s[5]) &&
           (c >= 0xfd || s[1] >= 0x84)))
       return MY_CS_ILSEQ;
 
@@ -4918,11 +4959,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
     *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
     return 2;
   }
-  
+
   if (c < 0xf0)
   {
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
           (c >= 0xe1 || s[1] >= 0xa0)))
       return MY_CS_ILSEQ;
 
@@ -5404,10 +5445,90 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
 }
 
 
+static /* Validate the UTF-8 sequence that starts at s, bounded by e, without decoding it. */
+int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+                            const uchar *s, const uchar *e)
+{ /* Returns the sequence length in bytes, or an MY_CS_TOOSMALL* / MY_CS_ILSEQ code. */
+  uchar c;
+
+  if (s >= e) /* empty input */
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0xf0) /* 1..3-byte forms are handled by the shared helper */
+    return my_valid_mbcharlen_utf8mb3(s, e);
+
+#ifdef UNICODE_32BIT
+  if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) /* lead 0xF0..0xF7: four-byte sequence */
+  {
+    if (s+4 > e) /* We need 4 characters */
+      return MY_CS_TOOSMALL4;
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          (c >= 0xf1 || s[1] >= 0x90))) /* with lead 0xF0 the second byte must be >= 0x90 */
+      return MY_CS_ILSEQ;
+
+    return 4;
+  }
+  if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) /* lead 0xF8..0xFB: five-byte sequence */
+  {
+    if (s+5 >e) /* We need 5 characters */
+      return MY_CS_TOOSMALL5;
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
+          (c >= 0xf9 || s[1] >= 0x88))) /* with lead 0xF8 the second byte must be >= 0x88 */
+      return MY_CS_ILSEQ;
+
+    return 5;
+  }
+  if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) /* lead 0xFC..0xFD: six-byte sequence */
+  {
+    if ( s+6 >e ) /* We need 6 characters */
+      return MY_CS_TOOSMALL6;
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
+          IS_CONTINUATION_BYTE(s[5]) &&
+          (c >= 0xfd || s[1] >= 0x84))) /* with lead 0xFC the second byte must be >= 0x84 */
+      return MY_CS_ILSEQ;
+
+    return 6;
+  }
+#endif /* UNICODE_32BIT */
+  return MY_CS_ILSEQ; /* any remaining lead byte; without UNICODE_32BIT that is every byte >= 0xF0 */
+}
+
+static size_t /* Byte length of the leading run of valid sequences in [b, e), at most pos characters. */
+my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
+                        size_t pos, int *error)
+{ /* *error becomes 1 only when scanning stops on a bad sequence before reaching e. */
+  const char *b_start= b;
+  *error= 0;
+  while (pos) /* pos is the number of characters still allowed */
+  {
+    int mb_len;
+
+    if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) /* non-positive result: stop scanning */
+    {
+      *error= b < e ? 1 : 0; /* simply running out of input is not reported as an error */
+      break;
+    }
+    b+= mb_len;
+    pos--;
+  }
+  return (size_t) (b - b_start);
+}
+
 static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
 {
-  my_wc_t wc;
-  int  res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
+  int  res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); /* length check only; no wide character is produced */
   return (res>1) ? res : 0;
 }
 
@@ -5472,7 +5593,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
     my_mbcharlen_utf8,
     my_numchars_mb,
     my_charpos_mb,
-    my_well_formed_len_mb,
+    my_well_formed_len_utf8,
     my_lengthsp_8bit,
     my_numcells_mb,
     my_utf8_uni,
@@ -7244,7 +7365,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
     if (s + 2 > e) /* We need 2 characters */
       return MY_CS_TOOSMALL2;
 
-    if (!((s[1] ^ 0x80) < 0x40))
+    if (!(IS_CONTINUATION_BYTE(s[1])))
       return MY_CS_ILSEQ;
 
     *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -7255,7 +7376,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
     if (s + 3 > e) /* We need 3 characters */
       return MY_CS_TOOSMALL3;
 
-    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
           (c >= 0xe1 || s[1] >= 0xa0)))
       return MY_CS_ILSEQ;
 
@@ -7288,9 +7409,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
       [F4][80..8F][80..BF][80..BF]
     */
 
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
           (c >= 0xf1 || s[1] >= 0x90) &&
           (c <= 0xf3 || s[1] <= 0x8F)))
       return MY_CS_ILSEQ;
@@ -7326,17 +7447,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
 
   if (c < 0xe0)
   {
-    if (!((s[1] ^ 0x80) < 0x40))
+    if (!IS_CONTINUATION_BYTE(s[1]))
       return MY_CS_ILSEQ;
 
     *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
     return 2;
   }
-  
+
   if (c < 0xf0)
   {
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
           (c >= 0xe1 || s[1] >= 0xa0)))
       return MY_CS_ILSEQ;
     *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
@@ -7347,9 +7468,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
   }
   else if (c < 0xf5)
   {
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
           (c >= 0xf1 || s[1] >= 0x90) &&
           (c <= 0xf3 || s[1] <= 0x8F)))
       return MY_CS_ILSEQ;
@@ -7836,11 +7957,84 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs,
 }
 
 
+static int /* Validate one utf8mb4 sequence (1..4 bytes) at s, bounded by e, without decoding it. */
+my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+                           const uchar *s, const uchar *e)
+{ /* Returns the sequence length in bytes, or an MY_CS_TOOSMALL* / MY_CS_ILSEQ code. */
+  uchar c;
+
+  if (s >= e) /* empty input */
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0xf0) /* 1..3-byte forms are handled by the shared helper */
+    return my_valid_mbcharlen_utf8mb3(s, e);
+
+  if (c < 0xf5) /* lead 0xF0..0xF4: four-byte sequence */
+  {
+    if (s + 4 > e) /* We need 4 characters */
+      return MY_CS_TOOSMALL4;
+
+    /*
+      UTF-8 quick four-byte mask:
+      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      Encoding allows to encode U+00010000..U+001FFFFF
+
+      The maximum character defined in the Unicode standard is U+0010FFFF.
+      Higher characters U+00110000..U+001FFFFF are not used.
+
+      11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+      11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+
+      Valid codes:
+      [F0][90..BF][80..BF][80..BF]
+      [F1][80..BF][80..BF][80..BF]
+      [F2][80..BF][80..BF][80..BF]
+      [F3][80..BF][80..BF][80..BF]
+      [F4][80..8F][80..BF][80..BF]
+    */
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          (c >= 0xf1 || s[1] >= 0x90) && /* lower bound: lead 0xF0 needs s[1] >= 0x90 */
+          (c <= 0xf3 || s[1] <= 0x8F))) /* upper bound: lead 0xF4 needs s[1] <= 0x8F */
+      return MY_CS_ILSEQ;
+
+    return 4;
+  }
+
+  return MY_CS_ILSEQ; /* lead bytes 0xF5..0xFF are rejected */
+}
+
+
+static /* Byte length of the leading run of valid utf8mb4 sequences in [b, e), at most pos characters. */
+size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
+                                  const char *b, const char *e,
+                                  size_t pos, int *error)
+{ /* *error becomes 1 only when scanning stops on a bad sequence before reaching e. */
+  const char *b_start= b;
+  *error= 0;
+  while (pos) /* pos is the number of characters still allowed */
+  {
+    int mb_len;
+
+    if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) /* non-positive result: stop scanning */
+    {
+      *error= b < e ? 1 : 0; /* simply running out of input is not reported as an error */
+      break;
+    }
+    b+= mb_len;
+    pos--;
+  }
+  return (size_t) (b - b_start);
+}
+
+
 static uint
 my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
 {
-  my_wc_t wc;
-  int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
+  int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); /* length check only; no wide character is produced */
   return (res > 1) ? res : 0;
 }
 
@@ -7901,7 +8095,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
   my_mbcharlen_utf8mb4,
   my_numchars_mb,
   my_charpos_mb,
-  my_well_formed_len_mb,
+  my_well_formed_len_utf8mb4,
   my_lengthsp_8bit,
   my_numcells_mb,
   my_mb_wc_utf8mb4,
@@ -7963,7 +8157,8 @@ struct charset_info_st my_charset_utf8mb4_general_ci=
 struct charset_info_st my_charset_utf8mb4_bin=
 {
   46,0,0,             /* number       */
-  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state  */
+  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|
+  MY_CS_UNICODE_SUPPLEMENT, /* state  */
   MY_UTF8MB4,         /* cs name      */
   MY_UTF8MB4_BIN,     /* name         */
   "UTF-8 Unicode",    /* comment      */
author	Sergei Golubchik <sergii@pisem.net>	2014-02-26 15:28:07 +0100
committer	Sergei Golubchik <sergii@pisem.net>	2014-02-26 15:28:07 +0100
commit	0dc23679c867629ded5f9534d2ab6e8edf238aa0 (patch)
tree	9cf966507fa2ef0fd17932b600d051df5f7bd2e5 /strings/ctype-utf8.c
parent	6efa5efa7dd112b6ac2efdd84235a13cca51c4d4 (diff)
parent	0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2 (diff)
download	mariadb-git-0dc23679c867629ded5f9534d2ab6e8edf238aa0.tar.gz