Merging utf16le from MySQL-5.6

added: mysql-test/include/ctype_heap.inc mysql-test/include/ctype_strtoll10.inc mysql-test/r/ctype_utf16le.result mysql-test/t/ctype_utf16le.test modified: cmake/character_sets.cmake include/m_ctype.h mysql-test/r/ctype_ucs.result mysql-test/r/ctype_utf16.result mysql-test/r/ctype_utf32.result mysql-test/suite/funcs_1/r/innodb_func_view.result mysql-test/suite/funcs_1/r/memory_func_view.result mysql-test/suite/funcs_1/r/myisam_func_view.result mysql-test/suite/sys_vars/r/character_set_client_basic.result mysql-test/suite/sys_vars/r/character_set_connection_basic.result mysql-test/suite/sys_vars/r/character_set_database_basic.result mysql-test/suite/sys_vars/r/character_set_filesystem_basic.result mysql-test/suite/sys_vars/r/character_set_results_basic.result mysql-test/t/ctype_ucs.test mysql-test/t/ctype_utf16.test mysql-test/t/ctype_utf32.test mysys/charset-def.c sql/item_func.cc sql/sys_vars.cc strings/ctype-latin1.c strings/ctype-ucs2.c
author: Alexander Barkov <bar@mnogosearch.org> 2013-03-28 17:19:09 +0400
committer: Alexander Barkov <bar@mnogosearch.org> 2013-03-28 17:19:09 +0400
commit: d1e162e011d77e64cc98afa76ed5e67b2bca6381 (patch)
tree: 55335bb289b5712e4bf1589bc9cc10313918d23a /strings
parent: 41013f16a05f45524a01e40cc48bc8e6f9904f55 (diff)
download: mariadb-git-d1e162e011d77e64cc98afa76ed5e67b2bca6381.tar.gz
2 files changed, 395 insertions, 131 deletions
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 80ae11c82c2..fd327925fb8 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -380,7 +380,10 @@ int my_wc_mb_latin1(CHARSET_INFO *cs  __attribute__((unused)),
   if (str >= end)
     return MY_CS_TOOSMALL;
   
-  pl= uni_to_cs[(wc>>8) & 0xFF];
+  if (wc > 0xFFFF)
+    return MY_CS_ILUNI;
+  
+  pl= uni_to_cs[wc >> 8];
   str[0]= pl ? pl[wc & 0xFF] : '\0';
   return (!str[0] && wc) ? MY_CS_ILUNI : 1;
 }
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 6ebbae8fb5a..fa43f36fb84 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -101,6 +101,7 @@ my_strntol_mb2_or_mb4(CHARSET_INFO *cs,
   int      overflow;
   int      cnv;
   my_wc_t  wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   register unsigned int cutlim;
   register uint32 cutoff;
   register uint32 res;
@@ -111,7 +112,7 @@ my_strntol_mb2_or_mb4(CHARSET_INFO *cs,
   *err= 0;
   do
   {
-    if ((cnv= cs->cset->mb_wc(cs, &wc, s, e))>0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       switch (wc)
       {
@@ -141,7 +142,7 @@ bs:
   cutlim= (uint) (((uint32)~0L) % (uint32) base);
   
   do {
-    if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       s+= cnv;
       if (wc >= '0' && wc <= '9')
@@ -212,6 +213,7 @@ my_strntoul_mb2_or_mb4(CHARSET_INFO *cs,
   int      overflow;
   int      cnv;
   my_wc_t  wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   register unsigned int cutlim;
   register uint32 cutoff;
   register uint32 res;
@@ -222,7 +224,7 @@ my_strntoul_mb2_or_mb4(CHARSET_INFO *cs,
   *err= 0;
   do
   {
-    if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       switch (wc)
       {
@@ -253,7 +255,7 @@ bs:
   
   do
   {
-    if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       s+= cnv;
       if (wc >= '0' && wc <= '9')
@@ -316,6 +318,7 @@ my_strntoll_mb2_or_mb4(CHARSET_INFO *cs,
   int      overflow;
   int      cnv;
   my_wc_t  wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   register ulonglong    cutoff;
   register unsigned int cutlim;
   register ulonglong    res;
@@ -326,7 +329,7 @@ my_strntoll_mb2_or_mb4(CHARSET_INFO *cs,
   *err= 0;
   do
   {
-    if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       switch (wc)
       {
@@ -356,7 +359,7 @@ bs:
   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
 
   do {
-    if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       s+=cnv;
       if ( wc>='0' && wc<='9')
@@ -427,6 +430,7 @@ my_strntoull_mb2_or_mb4(CHARSET_INFO *cs,
   int      overflow;
   int      cnv;
   my_wc_t  wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   register ulonglong    cutoff;
   register unsigned int cutlim;
   register ulonglong    res;
@@ -437,7 +441,7 @@ my_strntoull_mb2_or_mb4(CHARSET_INFO *cs,
   *err= 0;
   do
   {
-    if ((cnv= cs->cset->mb_wc(cs,&wc,s,e)) > 0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       switch (wc)
       {
@@ -468,7 +472,7 @@ bs:
 
   do
   {
-    if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
+    if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
     {
       s+=cnv;
       if ( wc>='0' && wc<='9')
@@ -533,6 +537,7 @@ my_strntod_mb2_or_mb4(CHARSET_INFO *cs,
   register const uchar *s= (const uchar*) nptr;
   const uchar *end;
   my_wc_t  wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   int     cnv;
 
   *err= 0;
@@ -541,7 +546,7 @@ my_strntod_mb2_or_mb4(CHARSET_INFO *cs,
     length= sizeof(buf) - 1;
   end= s + length;
 
-  while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
+  while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
   {
     s+= cnv;
     if (wc > (int) (uchar) 'e' || !wc)
@@ -566,6 +571,7 @@ my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs,
   ulonglong res;
   const uchar *end, *s= (const uchar*) nptr;
   my_wc_t  wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   int     cnv;
 
   /* Cut too long strings */
@@ -573,7 +579,7 @@ my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs,
     length= sizeof(buf)-1;
   end= s + length;
 
-  while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
+  while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
   {
     s+= cnv;
     if (wc > (int) (uchar) 'e' || !wc)
@@ -712,23 +718,36 @@ static longlong
 my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
                  const char *nptr, char **endptr, int *error)
 {
-  const char *s, *end, *start, *n_end, *true_end;
+  const uchar *s, *end, *start, *n_end, *true_end;
   uchar c;
   unsigned long i, j, k;
   ulonglong li;
   int negative;
   ulong cutoff, cutoff2, cutoff3;
+  my_wc_t wc;
+  int res;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
 
-  s= nptr;
+  s= (const uchar *) nptr;
   /* If fixed length string */
   if (endptr)
   {
-    /* Make sure string length is even */
-    end= s + ((*endptr - s) / 2) * 2;
-    while (s < end && !s[0] && (s[1] == ' ' || s[1] == '\t'))
-      s+= 2;
-    if (s == end)
-      goto no_conv;
+    /*
+      Make sure string length is even.
+      Odd length indicates a bug in the caller.
+      Assert in debug, round in production.
+    */
+    DBUG_ASSERT((*endptr - (const char *) s) % 2 == 0);
+    end= s + ((*endptr - (const char*) s) / 2) * 2;
+
+    for ( ; ; ) /* Skip leading spaces and tabs */
+    {
+      if ((res= mb_wc(cs, &wc, s, end)) <= 0)
+        goto no_conv;
+      s+= res;
+      if (wc != ' ' && wc != '\t')
+        break;
+    }
   }
   else
   {
@@ -738,13 +757,13 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
 
   /* Check for a sign. */
   negative= 0;
-  if (!s[0] && s[1] == '-')
+  if (wc == '-')
   {
     *error= -1;                                        /* Mark as negative number */
     negative= 1;
-    s+= 2;
-    if (s == end)
+    if ((res= mb_wc(cs, &wc, s, end)) <= 0)
       goto no_conv;
+    s+= res; /* wc is now expected to hold the first digit. */
     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
@@ -752,46 +771,53 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
   else
   {
     *error= 0;
-    if (!s[0] && s[1] == '+')
+    if (wc == '+')
     {
-      s+= 2;
-      if (s == end)
+      if ((res= mb_wc(cs, &wc, s, end)) <= 0)
         goto no_conv;
+      s+= res; /* wc is now expected to hold the first digit. */
     }
     cutoff=  ULONGLONG_MAX / LFACTOR2;
     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
     cutoff3=  ULONGLONG_MAX % 100;
   }
 
-  /* Handle case where we have a lot of pre-zero */
-  if (!s[0] && s[1] == '0')
+  /*
+    The code below assumes that 'wc' holds the first digit
+    and 's' points to the next character after it.
+
+    Scan pre-zeros if any.
+  */
+  if (wc == '0')
   {
     i= 0;
-    do
+    for ( ; ; s+= res)
     {
-      s+= 2;
       if (s == end)
         goto end_i;                                /* Return 0 */
+      if ((res= mb_wc(cs, &wc, s, end)) <= 0)
+        goto no_conv;
+      if (wc != '0')
+        break;
     }
-    while (!s[0] && s[1] == '0');
     n_end= s + 2 * INIT_CNT;
   }
   else
   {
     /* Read first digit to check that it's a valid number */
-    if (s[0] || (c= (s[1]-'0')) > 9)
+    if ((i= (wc - '0')) > 9)
       goto no_conv;
-    i= c;
-    s+= 2;
     n_end= s + 2 * (INIT_CNT-1);
   }
 
   /* Handle first 9 digits and store them in i */
   if (n_end > end)
     n_end= end;
-  for (; s != n_end ; s+= 2)
+  for ( ; ; s+= res)
   {
-    if (s[0] || (c= (s[1]-'0')) > 9)
+    if ((res= mb_wc(cs, &wc, s, n_end)) <= 0)
+      break;
+    if ((c= (wc - '0')) > 9)
       goto end_i;
     i= i*10+c;
   }
@@ -806,10 +832,12 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
     n_end= end;
   do
   {
-    if (s[0] || (c= (s[1]-'0')) > 9)
+    if ((res= mb_wc(cs, &wc, s, end)) <= 0)
+      goto no_conv;
+    if ((c= (wc - '0')) > 9)
       goto end_i_and_j;
-    j= j*10+c;
-    s+= 2;
+    s+= res;
+    j= j * 10 + c;
   } while (s != n_end);
   if (s == end)
   {
@@ -817,20 +845,26 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
       goto end_i_and_j;
     goto end3;
   }
-  if (s[0] || (c= (s[1]-'0')) > 9)
-    goto end3;
 
   /* Handle the next 1 or 2 digits and store them in k */
-  k=c;
-  s+= 2;
-  if (s == end || s[0] || (c= (s[1]-'0')) > 9)
+  if ((res= mb_wc(cs, &wc, s, end)) <= 0)
+    goto no_conv;
+  if ((k= (wc - '0')) > 9)
+    goto end3;
+  s+= res;
+
+  if (s == end)
+    goto end4;
+  if ((res= mb_wc(cs, &wc, s, end)) <= 0)
+    goto no_conv;
+  if ((c= (wc - '0')) > 9)
     goto end4;
+  s+= res;
   k= k*10+c;
-  s+= 2;
   *endptr= (char*) s;
 
   /* number string should have ended here */
-  if (s != end && !s[0] && (c= (s[1]-'0')) <= 9)
+  if (s != end && mb_wc(cs, &wc, s, end) > 0 && ((uchar) (wc - '0')) <= 9)
     goto overflow;
 
   /* Check that we didn't get an overflow with the last digit */
@@ -882,15 +916,18 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)),
             const char *str, const char *end, int sequence_type)
 {
   const char *str0= str;
-  end--; /* for easier loop condition, because of two bytes per character */
-  
+  my_wc_t wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
+  int res;
+
   switch (sequence_type)
   {
   case MY_SEQ_SPACES:
-    for ( ; str < end; str+= 2)
+    for (res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
+         res > 0 && wc == ' ';
+         str+= res,
+         res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end))
     {
-      if (str[0] != '\0' || str[1] != ' ')
-        break;
     }
     return (size_t) (str - str0);
   default:
@@ -900,11 +937,33 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)),
 
 
 static void
-my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)),
-            char *s, size_t l, int fill)
+my_fill_mb2(CHARSET_INFO *cs, char *s, size_t slen, int fill)
 {
-  DBUG_ASSERT(fill <= 0xFFFF);
-  for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
+  char buf[10], *last;     /* buf: "fill" encoded once by wc_mb below */
+  int buflen, remainder;
+  /* Fill s[0..slen) with copies of the character "fill" encoded in "cs". */
+  DBUG_ASSERT((slen % 2) == 0);
+
+  buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
+                          (uchar*) buf + sizeof(buf));
+
+  DBUG_ASSERT(buflen > 0); /* wc_mb is expected to succeed for "fill" */
+
+  /*
+    "last" is the last position where a sequence of "buflen" bytes can start.
+  */
+  for (last= s + slen - buflen; s <= last; s+= buflen)
+  {
+    /* Enough space for the whole character */
+    memcpy(s, buf, (size_t) buflen);
+  }
+
+  /*
+    If some space is left which is not enough for the whole
+    multibyte character, then fill it with trailing zero bytes.
+  */
+  if ((remainder= last + buflen - s) > 0)
+    bzero(s, (size_t) remainder);
 }
 
 
@@ -1018,11 +1077,26 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
   DB80..DBFF - Private surrogate high     (128 pages)
   DC00..DFFF - Surrogate low              (1024 codes in a page)
 */
+#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
+#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
+#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
+#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF
 
 #define MY_UTF16_HIGH_HEAD(x)  ((((uchar) (x)) & 0xFC) == 0xD8)
 #define MY_UTF16_LOW_HEAD(x)   ((((uchar) (x)) & 0xFC) == 0xDC)
 #define MY_UTF16_SURROGATE(x)  (((x) & 0xF800) == 0xD800)
 
+#define MY_UTF16_WC2(a, b)       ((a << 8) + b) /* a: first byte, b: second */
+/* NOTE(review): WC2/WC4 do not parenthesize arguments; pass simple operands */
+/*
+  a= 110110??  (<< 18)  high surrogate, first byte
+  b= ????????  (<< 10)  high surrogate, second byte
+  c= 110111??  (<<  8)  low surrogate, first byte
+  d= ????????  (<<  0)  low surrogate, second byte
+*/
+#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \
+                                  ((c & 3) << 8) + d + 0x10000)
+
 static int
 my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
              my_wc_t *pwc, const uchar *s, const uchar *e)
@@ -1044,23 +1118,14 @@ my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
     if (!MY_UTF16_LOW_HEAD(s[2]))  /* Broken surrigate pair */
       return MY_CS_ILSEQ;
 
-    /*
-      s[0]= 110110??  (<< 18)
-      s[1]= ????????  (<< 10)
-      s[2]= 110111??  (<<  8)
-      s[3]= ????????  (<<  0)
-    */ 
-
-    *pwc= ((s[0] & 3) << 18) + (s[1] << 10) +
-          ((s[2] & 3) << 8) + s[3] + 0x10000;
-
+    *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
     return 4;
   }
 
   if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
     return MY_CS_ILSEQ;
-  
-  *pwc= (s[0] << 8) + s[1];
+
+  *pwc= MY_UTF16_WC2(s[0], s[1]);
   return 2;
 }
 
@@ -1098,7 +1163,7 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
 static inline void
 my_tolower_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 {
-  int page= *wc >> 8;
+  uint page= *wc >> 8;
   if (page < 256 && uni_plane[page])
     *wc= uni_plane[page][*wc & 0xFF].tolower;
 }
@@ -1107,7 +1172,7 @@ my_tolower_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 static inline void
 my_toupper_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 {
-  int page= *wc >> 8;
+  uint page= *wc >> 8;
   if (page < 256 && uni_plane[page])
     *wc= uni_plane[page][*wc & 0xFF].toupper;
 }
@@ -1116,7 +1181,7 @@ my_toupper_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 static inline void
 my_tosort_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 {
-  int page= *wc >> 8;
+  uint page= *wc >> 8;
   if (page < 256)
   {
     if (uni_plane[page])
@@ -1135,16 +1200,18 @@ my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
                 size_t dstlen __attribute__((unused)))
 {
   my_wc_t wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
+  my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
   int res;
   char *srcend= src + srclen;
   MY_UNICASE_INFO *const *uni_plane= cs->caseinfo;
   DBUG_ASSERT(src == dst && srclen == dstlen);
   
   while ((src < srcend) &&
-         (res= my_utf16_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
+         (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
   {
     my_toupper_utf16(uni_plane, &wc);
-    if (res != my_uni_utf16(cs, wc, (uchar*) src, (uchar*) srcend))
+    if (res != wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
       break;
     src+= res;
   }
@@ -1157,14 +1224,12 @@ my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen,
                    ulong *n1, ulong *n2)
 {
   my_wc_t wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   int res;
-  const uchar *e= s+slen;
+  const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen);
   MY_UNICASE_INFO *const *uni_plane= cs->caseinfo;
 
-  while (e > s + 1 && e[-1] == ' ' && e[-2] == '\0')
-    e-= 2;
-
-  while ((s < e) && (res= my_utf16_uni(cs, &wc, (uchar *)s, (uchar*)e)) > 0)
+  while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
   {
     my_tosort_utf16(uni_plane, &wc);
     n1[0]^= (((n1[0] & 63) + n2[0]) * (wc & 0xFF)) + (n1[0] << 8);
@@ -1182,16 +1247,18 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
                 size_t dstlen __attribute__((unused)))
 {
   my_wc_t wc;
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
+  my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
   int res;
   char *srcend= src + srclen;
   MY_UNICASE_INFO *const *uni_plane= cs->caseinfo;
   DBUG_ASSERT(src == dst && srclen == dstlen);
 
   while ((src < srcend) &&
-         (res= my_utf16_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
+         (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
   {
     my_tolower_utf16(uni_plane, &wc);
-    if (res != my_uni_utf16(cs, wc, (uchar*) src, (uchar*) srcend))
+    if (res != wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
       break;
     src+= res;
   }
@@ -1207,14 +1274,15 @@ my_strnncoll_utf16(CHARSET_INFO *cs,
 {
   int s_res, t_res;
   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   const uchar *se= s + slen;
   const uchar *te= t + tlen;
   MY_UNICASE_INFO *const *uni_plane= cs->caseinfo;
 
   while (s < se && t < te)
   {
-    s_res= my_utf16_uni(cs, &s_wc, s, se);
-    t_res= my_utf16_uni(cs, &t_wc, t, te);
+    s_res= mb_wc(cs, &s_wc, s, se);
+    t_res= mb_wc(cs, &t_wc, t, te);
 
     if (s_res <= 0 || t_res <= 0)
     {
@@ -1271,6 +1339,7 @@ my_strnncollsp_utf16(CHARSET_INFO *cs,
 {
   int res;
   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   const uchar *se= s + slen, *te= t + tlen;
   MY_UNICASE_INFO *const *uni_plane= cs->caseinfo;
 
@@ -1283,8 +1352,8 @@ my_strnncollsp_utf16(CHARSET_INFO *cs,
 
   while (s < se && t < te)
   {
-    int s_res= my_utf16_uni(cs, &s_wc, s, se);
-    int t_res= my_utf16_uni(cs, &t_wc, t, te);
+    int s_res= mb_wc(cs, &s_wc, s, se);
+    int t_res= mb_wc(cs, &t_wc, t, te);
 
     if (s_res <= 0 || t_res <= 0)
     {
@@ -1324,7 +1393,7 @@ my_strnncollsp_utf16(CHARSET_INFO *cs,
 
     for ( ; s < se; s+= s_res)
     {
-      if ((s_res= my_utf16_uni(cs, &s_wc, s, se)) < 0)
+      if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0)
       {
         DBUG_ASSERT(0);
         return 0;
@@ -1338,22 +1407,11 @@ my_strnncollsp_utf16(CHARSET_INFO *cs,
 
 
 static uint
-my_ismbchar_utf16(CHARSET_INFO *cs __attribute__((unused)),
-                  const char *b __attribute__((unused)),
-                  const char *e __attribute__((unused)))
+my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e)
 {
-  if (b + 2 > e)
-    return 0;
-  
-  if (MY_UTF16_HIGH_HEAD(*b))
-  {
-    return (b + 4 <= e) && MY_UTF16_LOW_HEAD(b[2]) ? 4 : 0;
-  }
-  
-  if (MY_UTF16_LOW_HEAD(*b))
-    return 0;
-  
-  return 2;
+  my_wc_t wc;
+  int res= cs->cset->mb_wc(cs, &wc, (const uchar *) b, (const uchar *) e);
+  return (uint) (res > 0 ? res : 0);
 }
 
 
@@ -1361,6 +1419,7 @@ static uint
 my_mbcharlen_utf16(CHARSET_INFO *cs  __attribute__((unused)),
                    uint c __attribute__((unused)))
 {
+  DBUG_ASSERT(0);
   return MY_UTF16_HIGH_HEAD(c) ? 4 : 2;
 }
 
@@ -1449,13 +1508,14 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
 {
   int s_res,t_res;
   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   const uchar *se=s+slen;
   const uchar *te=t+tlen;
 
   while ( s < se && t < te )
   {
-    s_res= my_utf16_uni(cs,&s_wc, s, se);
-    t_res= my_utf16_uni(cs,&t_wc, t, te);
+    s_res= mb_wc(cs, &s_wc, s, se);
+    t_res= mb_wc(cs, &t_wc, t, te);
 
     if (s_res <= 0 || t_res <= 0)
     {
@@ -1482,6 +1542,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
 {
   int res;
   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
+  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
   const uchar *se= s + slen, *te= t + tlen;
 
   DBUG_ASSERT((slen % 2) == 0);
@@ -1493,8 +1554,8 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
 
   while (s < se && t < te)
   {
-    int s_res= my_utf16_uni(cs, &s_wc, s, se);
-    int t_res= my_utf16_uni(cs, &t_wc, t, te);
+    int s_res= mb_wc(cs, &s_wc, s, se);
+    int t_res= mb_wc(cs, &t_wc, t, te);
 
     if (s_res <= 0 || t_res <= 0)
     {
@@ -1531,7 +1592,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
 
     for ( ; s < se; s+= s_res)
     {
-      if ((s_res= my_utf16_uni(cs, &s_wc, s, se)) < 0)
+      if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0)
       {
         DBUG_ASSERT(0);
         return 0;
@@ -1545,17 +1606,11 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
 
 
 static void
-my_hash_sort_utf16_bin(CHARSET_INFO *cs __attribute__((unused)),
-                       const uchar *key, size_t len,ulong *nr1, ulong *nr2)
+my_hash_sort_utf16_bin(CHARSET_INFO *cs,
+                       const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
 {
-  const uchar *pos = key;
-  
-  key+= len;
-
-  while (key > pos + 1 && key[-1] == ' ' && key[-2] == '\0')
-    key-= 2;
-
-  for (; pos < (uchar*) key ; pos++)
+  const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len);
+  for ( ; pos < end ; pos++)
   {
     nr1[0]^= (ulong) ((((uint) nr1[0] & 63) + nr2[0]) * 
               ((uint)*pos)) + (nr1[0] << 8);
@@ -1664,7 +1719,7 @@ struct charset_info_st my_charset_utf16_general_ci=
 struct charset_info_st my_charset_utf16_bin=
 {
   55,0,0,              /* number       */
-  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
+  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
   "utf16",             /* cs name      */
   "utf16_bin",         /* name         */
   "UTF-16 Unicode",    /* comment      */
@@ -1693,6 +1748,176 @@ struct charset_info_st my_charset_utf16_bin=
   &my_collation_utf16_bin_handler
 };
 
+/* Decode one utf16le character from [s, e) into *pwc; 2 or 4 on success. */
+static int
+my_utf16le_uni(const CHARSET_INFO *cs __attribute__((unused)),
+               my_wc_t *pwc, const uchar *s, const uchar *e)
+{
+  my_wc_t lo;                  /* Second 16-bit unit of a surrogate pair */
+
+  if (s + 2 > e)
+    return MY_CS_TOOSMALL2;    /* Fewer than two bytes available */
+
+  if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
+      (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
+    return 2; /* [0000-D7FF,E000-FFFF] Not a surrogate: one 16-bit unit */
+
+  if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
+    return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
+
+  if (s + 4  > e)
+    return MY_CS_TOOSMALL4;    /* High surrogate seen, pair does not fit */
+
+  s+= 2;                       /* Step to the unit following the high part */
+
+  if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
+      lo > MY_UTF16_SURROGATE_LOW_LAST)
+    return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
+
+  *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF)); /* 10+10 bits */
+  return 4;
+}
+
+/* Encode wc as utf16le into [s, e); returns 2 or 4, or an MY_CS_* code. */
+static int
+my_uni_utf16le(const CHARSET_INFO *cs __attribute__((unused)),
+               my_wc_t wc, uchar *s, uchar *e)
+{
+  uint32 first, second, total; /* High part, low part, both packed together */
+  if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
+      (wc > MY_UTF16_SURROGATE_LOW_LAST &&
+       wc <= 0xFFFF))
+  {
+    if (s + 2 > e)
+      return MY_CS_TOOSMALL2;
+    int2store(s, wc);
+    return 2; /* [0000-D7FF,E000-FFFF] */
+  }
+
+  if (wc < 0xFFFF || wc > 0x10FFFF)
+    return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+]; FFFF was stored above */
+
+  if (s + 4 > e)
+    return MY_CS_TOOSMALL4;
+
+  wc-= 0x10000;                /* Leaves the 20 bits split into 10+10 below */
+  first=  (0xD800 | ((wc >> 10) & 0x3FF));
+  second= (0xDC00 | (wc & 0x3FF));
+  total=  first | (second << 16); /* High surrogate in the low 16 bits */
+  int4store(s, total);
+  return 4; /* [010000-10FFFF] */
+}
+
+/* Length in bytes of ptr[0..length) without trailing utf16le spaces. */
+static size_t
+my_lengthsp_utf16le(const CHARSET_INFO *cs __attribute__((unused)),
+                    const char *ptr, size_t length)
+{
+  const char *end= ptr + length;
+  while (end > ptr + 1 && uint2korr(end - 2) == ' ') /* Last unit is a space */
+    end-= 2;
+  return (size_t) (end - ptr);
+}
+
+/* utf16le handler: own lengthsp, mb_wc, wc_mb; the rest reuses utf16/mb2 code */
+static MY_CHARSET_HANDLER my_charset_utf16le_handler=
+{
+  NULL,                /* init         */
+  my_ismbchar_utf16,
+  my_mbcharlen_utf16,
+  my_numchars_utf16,
+  my_charpos_utf16,
+  my_well_formed_len_utf16,
+  my_lengthsp_utf16le, /* lengthsp     */
+  my_numcells_mb,
+  my_utf16le_uni,      /* mb_wc        */
+  my_uni_utf16le,      /* wc_mb        */
+  my_mb_ctype_mb,
+  my_caseup_str_mb2_or_mb4,
+  my_casedn_str_mb2_or_mb4,
+  my_caseup_utf16,
+  my_casedn_utf16,
+  my_snprintf_mb2,
+  my_l10tostr_mb2_or_mb4,
+  my_ll10tostr_mb2_or_mb4,
+  my_fill_mb2,
+  my_strntol_mb2_or_mb4,
+  my_strntoul_mb2_or_mb4,
+  my_strntoll_mb2_or_mb4,
+  my_strntoull_mb2_or_mb4,
+  my_strntod_mb2_or_mb4,
+  my_strtoll10_mb2,
+  my_strntoull10rnd_mb2_or_mb4,
+  my_scan_mb2
+};
+
+/* utf16le_general_ci: utf16le handler + the utf16_general_ci collation handler */
+struct charset_info_st my_charset_utf16le_general_ci=
+{
+  56,0,0,              /* number       */
+  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
+  "utf16le",           /* cs name      */
+  "utf16le_general_ci",/* name         */
+  "UTF-16LE Unicode",  /* comment      */
+  NULL,                /* tailoring    */
+  NULL,                /* ctype        */
+  NULL,                /* to_lower     */
+  NULL,                /* to_upper     */
+  NULL,                /* sort_order   */
+  NULL,                /* contractions */
+  NULL,                /* sort_order_big*/
+  NULL,                /* tab_to_uni   */
+  NULL,                /* tab_from_uni */
+  my_unicase_default,  /* caseinfo     */
+  NULL,                /* state_map    */
+  NULL,                /* ident_map    */
+  1,                   /* strxfrm_multiply */
+  1,                   /* caseup_multiply  */
+  1,                   /* casedn_multiply  */
+  2,                   /* mbminlen     */
+  4,                   /* mbmaxlen     */
+  0,                   /* min_sort_char */
+  0xFFFF,              /* max_sort_char */
+  ' ',                 /* pad char      */
+  0,                   /* escape_with_backslash_is_dangerous */
+  &my_charset_utf16le_handler,
+  &my_collation_utf16_general_ci_handler
+};
+
+/* utf16le_bin: utf16le handler + the utf16_bin collation handler */
+struct charset_info_st my_charset_utf16le_bin=
+{
+  62,0,0,              /* number       */
+  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
+  "utf16le",           /* cs name      */
+  "utf16le_bin",       /* name         */
+  "UTF-16LE Unicode",  /* comment      */
+  NULL,                /* tailoring    */
+  NULL,                /* ctype        */
+  NULL,                /* to_lower     */
+  NULL,                /* to_upper     */
+  NULL,                /* sort_order   */
+  NULL,                /* contractions */
+  NULL,                /* sort_order_big*/
+  NULL,                /* tab_to_uni   */
+  NULL,                /* tab_from_uni */
+  my_unicase_default,  /* caseinfo     */
+  NULL,                /* state_map    */
+  NULL,                /* ident_map    */
+  1,                   /* strxfrm_multiply */
+  1,                   /* caseup_multiply  */
+  1,                   /* casedn_multiply  */
+  2,                   /* mbminlen     */
+  4,                   /* mbmaxlen     */
+  0,                   /* min_sort_char */
+  0xFFFF,              /* max_sort_char */
+  ' ',                 /* pad char      */
+  0,                   /* escape_with_backslash_is_dangerous */
+  &my_charset_utf16le_handler,
+  &my_collation_utf16_bin_handler
+};
+
+
 #endif /* HAVE_CHARSET_utf16 */
 
 
@@ -1727,7 +1952,7 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
 static inline void
 my_tolower_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 {
-  int page= *wc >> 8;
+  uint page= *wc >> 8;
   if (page < 256 && uni_plane[page])
     *wc= uni_plane[page][*wc & 0xFF].tolower;
 }
@@ -1736,7 +1961,7 @@ my_tolower_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 static inline void
 my_toupper_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 {
-  int page= *wc >> 8;
+  uint page= *wc >> 8;
   if (page < 256 && uni_plane[page])
     *wc= uni_plane[page][*wc & 0xFF].toupper;
 }
@@ -1745,7 +1970,7 @@ my_toupper_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc)
 static inline void
 my_tosort_utf32(MY_UNICASE_INFO *const* uni_plane, my_wc_t *wc)
 {
-  int page= *wc >> 8;
+  uint page= *wc >> 8;
   if (page < 256)
   {
     if (uni_plane[page])
@@ -2216,7 +2441,7 @@ my_strtoll10_utf32(CHARSET_INFO *cs __attribute__((unused)),
   if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
     goto end4;
   k= k * 10 + c;
-  s+= 2;
+  s+= 4;
   *endptr= (char*) s;
 
   /* number string should have ended here */
@@ -2589,7 +2814,7 @@ struct charset_info_st my_charset_utf32_general_ci=
 struct charset_info_st my_charset_utf32_bin=
 {
   61,0,0,              /* number       */
-  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
+  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
   "utf32",             /* cs name    */
   "utf32_bin",         /* name         */
   "UTF-32 Unicode",    /* comment      */
@@ -2708,6 +2933,35 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
 }
 
 
+static inline void
+my_tolower_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc)
+{ /* Replace *wc with its lower-case mapping when its page is present */
+  uint page= *wc >> 8;       /* High bits select the page in uni_plane */
+  DBUG_ASSERT(page < 256);   /* Callers are expected to pass *wc <= 0xFFFF */
+  if (uni_plane[page])       /* No page: *wc is left unchanged */
+    *wc= uni_plane[page][*wc & 0xFF].tolower;
+}
+
+
+static inline void
+my_toupper_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc)
+{ /* Replace *wc with its upper-case mapping when its page is present */
+  uint page= *wc >> 8;       /* High bits select the page in uni_plane */
+  DBUG_ASSERT(page < 256);   /* Callers are expected to pass *wc <= 0xFFFF */
+  if (uni_plane[page])       /* No page: *wc is left unchanged */
+    *wc= uni_plane[page][*wc & 0xFF].toupper;
+}
+
+
+static inline void
+my_tosort_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc)
+{ /* Replace *wc with the "sort" value of its entry when its page is present */
+  uint page= *wc >> 8;       /* High bits select the page in uni_plane */
+  DBUG_ASSERT(page < 256);   /* Callers are expected to pass *wc <= 0xFFFF */
+  if (uni_plane[page])       /* No page: *wc is left unchanged */
+    *wc= uni_plane[page][*wc & 0xFF].sort;
+}
+
 static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
                            char *dst __attribute__((unused)),
                            size_t dstlen __attribute__((unused)))
@@ -2721,8 +2975,7 @@ static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
   while ((src < srcend) &&
          (res= my_ucs2_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
   {
-    int plane= (wc>>8) & 0xFF;
-    wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].toupper : wc;
+    my_toupper_ucs2(uni_plane, &wc);
     if (res != my_uni_ucs2(cs, wc, (uchar*) src, (uchar*) srcend))
       break;
     src+= res;
@@ -2744,8 +2997,7 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
 
   while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
   {
-    int plane = (wc>>8) & 0xFF;
-    wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc;
+    my_tosort_ucs2(uni_plane, &wc);
     n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8);
     n2[0]+=3;
     n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8);
@@ -2768,8 +3020,7 @@ static size_t my_casedn_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
   while ((src < srcend) &&
          (res= my_ucs2_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
   {
-    int plane= (wc>>8) & 0xFF;
-    wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].tolower : wc;
+    my_tolower_ucs2(uni_plane, &wc);
     if (res != my_uni_ucs2(cs, wc, (uchar*) src, (uchar*) srcend))
       break;
     src+= res;
@@ -2778,6 +3029,18 @@ static size_t my_casedn_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
 }
 
 
+static void
+my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)), /* cs is not used */
+             char *s, size_t l, int fill)
+{
+  char *last= s + l - 2;     /* Last position where a 2-byte unit still fits */
+  uint16 tmp= (fill >> 8) + ((fill & 0xFF) << 8); /* swap bytes */
+  DBUG_ASSERT(fill <= 0xFFFF);
+  for ( ; s <= last; s+= 2)  /* A trailing odd byte, if any, is not written */
+    int2store(s, tmp); /* store little-endian: high byte of "fill" first */
+}
+
+
 static int my_strnncoll_ucs2(CHARSET_INFO *cs, 
 			     const uchar *s, size_t slen, 
                              const uchar *t, size_t tlen,
@@ -2791,7 +3054,6 @@ static int my_strnncoll_ucs2(CHARSET_INFO *cs,
 
   while ( s < se && t < te )
   {
-    int plane;
     s_res=my_ucs2_uni(cs,&s_wc, s, se);
     t_res=my_ucs2_uni(cs,&t_wc, t, te);
     
@@ -2801,10 +3063,9 @@ static int my_strnncoll_ucs2(CHARSET_INFO *cs,
       return ((int)s[0]-(int)t[0]); 
     }
     
-    plane=(s_wc>>8) & 0xFF;
-    s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
-    plane=(t_wc>>8) & 0xFF;
-    t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc;
+    my_tosort_ucs2(uni_plane, &s_wc);
+    my_tosort_ucs2(uni_plane, &t_wc);
+
     if ( s_wc != t_wc )
     {
       return  s_wc > t_wc ? 1 : -1;
@@ -3115,7 +3376,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
     my_snprintf_mb2,
     my_l10tostr_mb2_or_mb4,
     my_ll10tostr_mb2_or_mb4,
-    my_fill_mb2,
+    my_fill_ucs2,
     my_strntol_mb2_or_mb4,
     my_strntoul_mb2_or_mb4,
     my_strntoll_mb2_or_mb4,
author	Alexander Barkov <bar@mnogosearch.org>	2013-03-28 17:19:09 +0400
committer	Alexander Barkov <bar@mnogosearch.org>	2013-03-28 17:19:09 +0400
commit	d1e162e011d77e64cc98afa76ed5e67b2bca6381 (patch)
tree	55335bb289b5712e4bf1589bc9cc10313918d23a /strings
parent	41013f16a05f45524a01e40cc48bc8e6f9904f55 (diff)
download	mariadb-git-d1e162e011d77e64cc98afa76ed5e67b2bca6381.tar.gz