18 files changed, 185 insertions, 89 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 5994816cbfc..f08efb461b7 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -364,6 +364,23 @@ typedef int (*my_charset_conv_wc_mb)(CHARSET_INFO *, my_wc_t,
 typedef size_t (*my_charset_conv_case)(CHARSET_INFO *,
                                        char *, size_t, char *, size_t);
 
+/*
+  A structure to return the statistics of a native string copying,
+  when no Unicode conversion is involved.
+
+  The structure is OK to be uninitialized before calling a copying routine.
+  A copying routine must populate the structure as follows:
+    - m_source_end_pos must be set to a non-NULL value
+      in the range of the input string.
+    - m_well_formed_error_pos must be set to NULL if the string was
+      well formed, or to the position of the leftmost bad byte sequence.
+*/
+typedef struct
+{
+  const char *m_source_end_pos;        /* Position where reading stopped */
+  const char *m_well_formed_error_pos; /* Position where a bad byte was found */
+} MY_STRCOPY_STATUS;
+
 
 /* See strings/CHARSET_INFO.txt about information on this structure  */
 struct my_charset_handler_st
@@ -426,6 +443,23 @@ struct my_charset_handler_st
                                 char **endptr, int *error);
   size_t        (*scan)(CHARSET_INFO *, const char *b, const char *e,
                         int sq);
+
+  /* Copying routines */
+  /*
+    copy_abort() - copy a string, abort if a bad byte sequence was found.
+    Not more than "nchars" characters are copied.
+
+    status->m_source_end_pos is set to a position in the range
+    between "src" and "src + src_length".
+
+    status->m_well_formed_error_pos is set to NULL if the string
+    in the range between "src" and "status->m_source_end_pos" was well
+    formed, or is set to "status->m_source_end_pos" otherwise.
+  */
+  size_t  (*copy_abort)(CHARSET_INFO *,
+                        char *dst, size_t dst_length,
+                        const char *src, size_t src_length,
+                        size_t nchars, MY_STRCOPY_STATUS *status);
 };
 
 extern MY_CHARSET_HANDLER my_charset_8bit_handler;
@@ -558,6 +592,14 @@ extern uint my_instr_simple(CHARSET_INFO *,
                             const char *s, size_t s_length,
                             my_match_t *match, uint nmatch);
 
+size_t my_copy_8bit(CHARSET_INFO *,
+                    char *dst, size_t dst_length,
+                    const char *src, size_t src_length,
+                    size_t nchars, MY_STRCOPY_STATUS *);
+size_t my_copy_abort_mb(CHARSET_INFO *cs,
+                        char *dst, size_t dst_length,
+                        const char *src, size_t src_length,
+                        size_t nchars, MY_STRCOPY_STATUS *);
 
 /* Functions for 8bit */
 extern size_t my_caseup_str_8bit(CHARSET_INFO *, char *);
diff --git a/sql/sql_string.cc b/sql/sql_string.cc
index 5eb55463e85..9fb462e9a9d 100644
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -921,73 +921,9 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
       (to_cs == from_cs) ||
       my_charset_same(from_cs, to_cs))
   {
-    if (to_length < to_cs->mbminlen || !nchars)
-    {
-      m_source_end_pos= from;
-      m_cannot_convert_error_pos= NULL;
-      m_well_formed_error_pos= NULL;
-      return 0;
-    }
-
-    if (to_cs == &my_charset_bin)
-    {
-      res= MY_MIN(MY_MIN(nchars, to_length), from_length);
-      memmove(to, from, res);
-      m_source_end_pos= from + res;
-      m_well_formed_error_pos= NULL;
-      m_cannot_convert_error_pos= NULL;
-    }
-    else
-    {
-      int well_formed_error;
-      uint from_offset;
-
-      if ((from_offset= (from_length % to_cs->mbminlen)) &&
-          (from_cs == &my_charset_bin))
-      {
-        /*
-          Copying from BINARY to UCS2 needs to prepend zeros sometimes:
-          INSERT INTO t1 (ucs2_column) VALUES (0x01);
-          0x01 -> 0x0001
-        */
-        uint pad_length= to_cs->mbminlen - from_offset;
-        bzero(to, pad_length);
-        memmove(to + pad_length, from, from_offset);
-        /*
-          In some cases left zero-padding can create an incorrect character.
-          For example:
-            INSERT INTO t1 (utf32_column) VALUES (0x110000);
-          We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
-          The valid characters range is limited to 0x00000000..0x0010FFFF.
-          
-          Make sure we didn't pad to an incorrect character.
-        */
-        if (to_cs->cset->well_formed_len(to_cs,
-                                         to, to + to_cs->mbminlen, 1,
-                                         &well_formed_error) !=
-                                         to_cs->mbminlen)
-        {
-          m_source_end_pos= m_well_formed_error_pos= from;
-          m_cannot_convert_error_pos= NULL;
-          return 0;
-        }
-        nchars--;
-        from+= from_offset;
-        from_length-= from_offset;
-        to+= to_cs->mbminlen;
-        to_length-= to_cs->mbminlen;
-      }
-
-      set_if_smaller(from_length, to_length);
-      res= to_cs->cset->well_formed_len(to_cs, from, from + from_length,
-                                        nchars, &well_formed_error);
-      memmove(to, from, res);
-      m_source_end_pos= from + res;
-      m_well_formed_error_pos= well_formed_error ? from + res : NULL;
-      m_cannot_convert_error_pos= NULL;
-      if (from_offset)
-        res+= to_cs->mbminlen;
-    }
+    m_cannot_convert_error_pos= NULL;
+    return to_cs->cset->copy_abort(to_cs, to, to_length, from, from_length,
+                                   nchars, this);
   }
   else
   {
diff --git a/sql/sql_string.h b/sql/sql_string.h
index a40ac536f04..d89adb6bf51 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -43,10 +43,8 @@ inline uint32 copy_and_convert(char *to, uint32 to_length,
 }
 
 
-class String_copier
+class String_copier: private MY_STRCOPY_STATUS
 {
-  const char *m_source_end_pos;
-  const char *m_well_formed_error_pos;
   const char *m_cannot_convert_error_pos;
 public:
   const char *source_end_pos() const
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index 38bdf86c64a..a9eb2b1b318 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6922,7 +6922,8 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };
 
 struct charset_info_st my_charset_big5_chinese_ci=
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 2e699db0bd3..6b53b34159a 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -548,7 +548,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_8bit,
 };
 
 
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 86f450718d7..66b352721db 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -34800,7 +34800,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index b7065369258..36d99eec375 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -10007,7 +10007,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 0ce179b3a2d..8c47b666cf4 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -67549,7 +67549,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 0399660d311..b5aeed2088f 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -6410,7 +6410,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index f1b46ca4e6c..d282d96145d 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -10806,7 +10806,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index babf74599ea..099f03460ce 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -421,7 +421,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_8bit,
 };
 
 
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index cc0513dbc90..fc41563324a 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -423,6 +423,29 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e,
 }
 
 
+/*
+  Copy a multi-byte string. Abort if a bad byte sequence was found.
+  Not more than "nchars" characters are copied. Returns the bytes copied.
+*/
+size_t
+my_copy_abort_mb(CHARSET_INFO *cs,
+                 char *dst, size_t dst_length,
+                 const char *src, size_t src_length,
+                 size_t nchars, MY_STRCOPY_STATUS *status)
+{
+  int well_formed_error;
+  size_t res;
+
+  set_if_smaller(src_length, dst_length); /* Scan no more than dst can hold */
+  res= cs->cset->well_formed_len(cs, src, src + src_length,
+                                 nchars, &well_formed_error);
+  memmove(dst, src, res);          /* Copy the length well_formed_len gave */
+  status->m_source_end_pos= src + res;
+  status->m_well_formed_error_pos= well_formed_error ? src + res : NULL;
+  return res;
+}
+
+
 uint my_instr_mb(CHARSET_INFO *cs,
                  const char *b, size_t b_length, 
                  const char *s, size_t s_length,
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 7f13cef4474..b010c528979 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -1108,6 +1108,25 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
+/*
+  Copy an 8-bit string. Not more than "nchars" characters are copied.
+*/
+size_t
+my_copy_8bit(CHARSET_INFO *cs __attribute__((unused)),
+             char *dst, size_t dst_length,
+             const char *src, size_t src_length,
+             size_t nchars, MY_STRCOPY_STATUS *status)
+{
+  set_if_smaller(src_length, dst_length); /* Do not overflow "dst" */
+  set_if_smaller(src_length, nchars);     /* One byte per character */
+  if (src_length)                         /* Skip memmove() for empty input */
+    memmove(dst, src, src_length);
+  status->m_source_end_pos= src + src_length;
+  status->m_well_formed_error_pos= NULL;  /* No byte is rejected here */
+  return src_length;
+}
+
+
 size_t my_lengthsp_8bit(CHARSET_INFO *cs __attribute__((unused)),
                         const char *ptr, size_t length)
 {
@@ -1886,7 +1905,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_8bit,
 };
 
 MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index c6e55879102..2038632c9d3 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -34172,7 +34172,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 61477f177c1..343fb812e20 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -885,7 +885,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_8bit,
 };
 
 
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index a560eb08bae..8f234e9e3a8 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -92,6 +92,65 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
+/*
+  Copy a UCS2/UTF16/UTF32 string.
+  Not more than "nchars" characters are copied. Returns bytes written.
+
+  UCS2/UTF16/UTF32 may need to prepend some zero bytes,
+  e.g. when copying from a BINARY source:
+  INSERT INTO t1 (ucs2_column) VALUES (0x01);
+  0x01 -> 0x0001
+*/
+static size_t
+my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs,
+                         char *dst, size_t dst_length,
+                         const char *src, size_t src_length,
+                         size_t nchars, MY_STRCOPY_STATUS *status)
+{
+  size_t src_offset;  /* Bytes of the incomplete leading character */
+
+  if ((src_offset= (src_length % cs->mbminlen)))
+  {
+    int well_formed_error;
+    size_t pad_length;
+    if (dst_length < cs->mbminlen || !nchars)
+    {
+      status->m_source_end_pos= status->m_well_formed_error_pos= src;
+      return 0;       /* No room or no quota for the padded character */
+    }
+
+    pad_length= cs->mbminlen - src_offset;
+    bzero(dst, pad_length);
+    memmove(dst + pad_length, src, src_offset);
+    /*
+      In some cases left zero-padding can create an incorrect character.
+      For example:
+        INSERT INTO t1 (utf32_column) VALUES (0x110000);
+      We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
+      The valid characters range is limited to 0x00000000..0x0010FFFF.
+      
+      Make sure we didn't pad to an incorrect character.
+    */
+    if (cs->cset->well_formed_len(cs,
+                                  dst, dst + cs->mbminlen, 1,
+                                  &well_formed_error) != cs->mbminlen)
+    {
+      status->m_source_end_pos= status->m_well_formed_error_pos= src;
+      return 0;
+    }
+    nchars--;         /* The padded character uses one of "nchars" */
+    src+= src_offset;
+    src_length-= src_offset;
+    dst+= cs->mbminlen;
+    dst_length-= cs->mbminlen;
+    return
+      cs->mbminlen /* The left-padded character */ +
+      my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+  }
+  return  my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+}
+
+
 static long
 my_strntol_mb2_or_mb4(CHARSET_INFO *cs,
                       const char *nptr, size_t l, int base,
@@ -1682,7 +1741,8 @@ MY_CHARSET_HANDLER my_charset_utf16_handler=
   my_strntod_mb2_or_mb4,
   my_strtoll10_mb2,
   my_strntoull10rnd_mb2_or_mb4,
-  my_scan_mb2
+  my_scan_mb2,
+  my_copy_abort_mb2_or_mb4,
 };
 
 
@@ -1851,7 +1911,8 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler=
   my_strntod_mb2_or_mb4,
   my_strtoll10_mb2,
   my_strntoull10rnd_mb2_or_mb4,
-  my_scan_mb2
+  my_scan_mb2,
+  my_copy_abort_mb2_or_mb4,
 };
 
 
@@ -2765,7 +2826,8 @@ MY_CHARSET_HANDLER my_charset_utf32_handler=
   my_strntod_mb2_or_mb4,
   my_strtoll10_utf32,
   my_strntoull10rnd_mb2_or_mb4,
-  my_scan_utf32
+  my_scan_utf32,
+  my_copy_abort_mb2_or_mb4,
 };
 
 
@@ -3383,7 +3445,8 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
     my_strntod_mb2_or_mb4,
     my_strtoll10_mb2,
     my_strntoull10rnd_mb2_or_mb4,
-    my_scan_mb2
+    my_scan_mb2,
+    my_copy_abort_mb2_or_mb4,
 };
 
 
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index e7dbefe6c1d..f208d15f364 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -67295,7 +67295,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_abort_mb,
 };
 
 
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index d0a64d11c84..1116228f706 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5614,7 +5614,8 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_abort_mb,
 };
 
 
@@ -7167,7 +7168,8 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
     my_strntod_8bit,
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
-    my_scan_8bit
+    my_scan_8bit,
+    my_copy_abort_mb,
 };
 
 
@@ -8110,7 +8112,8 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
   my_strntod_8bit,
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
-  my_scan_8bit
+  my_scan_8bit,
+  my_copy_abort_mb,
 };