diff options
-rw-r--r-- | include/m_ctype.h | 42 | ||||
-rw-r--r-- | sql/sql_string.cc | 70 | ||||
-rw-r--r-- | sql/sql_string.h | 4 | ||||
-rw-r--r-- | strings/ctype-big5.c | 3 | ||||
-rw-r--r-- | strings/ctype-bin.c | 3 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 3 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 3 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 3 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 3 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 3 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 3 | ||||
-rw-r--r-- | strings/ctype-mb.c | 23 | ||||
-rw-r--r-- | strings/ctype-simple.c | 22 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 3 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 3 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 71 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 3 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 9 |
18 files changed, 185 insertions, 89 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 5994816cbfc..f08efb461b7 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -364,6 +364,23 @@ typedef int (*my_charset_conv_wc_mb)(CHARSET_INFO *, my_wc_t, typedef size_t (*my_charset_conv_case)(CHARSET_INFO *, char *, size_t, char *, size_t); +/* + A structure to return the statistics of a native string copying, + when no Unicode conversion is involved. + + The structure is OK to be uninitialized before calling a copying routine. + A copying routine must populate the structure as follows: + - m_source_end_pos must be set to a non-NULL value + in the range of the input string. + - m_well_formed_error_pos must be set to NULL if the string was + well formed, or to the position of the leftmost bad byte sequence. +*/ +typedef struct +{ + const char *m_source_end_pos; /* Position where reading stopped */ + const char *m_well_formed_error_pos; /* Position where a bad byte was found*/ +} MY_STRCOPY_STATUS; + /* See strings/CHARSET_INFO.txt about information on this structure */ struct my_charset_handler_st @@ -426,6 +443,23 @@ struct my_charset_handler_st char **endptr, int *error); size_t (*scan)(CHARSET_INFO *, const char *b, const char *e, int sq); + + /* Copying routines */ + /* + copy_abort() - copy a string, abort if a bad byte sequence was found. + Not more than "nchars" characters are copied. + + status->m_source_end_pos is set to a position in the range + between "src" and "src + src_length". + + status->m_well_formed_error_pos is set to NULL if the string + in the range "src" and "status->m_source_end_pos" was well formed, + or is set to "src + src_length" otherwise. 
+ */ + size_t (*copy_abort)(CHARSET_INFO *, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status); }; extern MY_CHARSET_HANDLER my_charset_8bit_handler; @@ -558,6 +592,14 @@ extern uint my_instr_simple(CHARSET_INFO *, const char *s, size_t s_length, my_match_t *match, uint nmatch); +size_t my_copy_8bit(CHARSET_INFO *, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *); +size_t my_copy_abort_mb(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *); /* Functions for 8bit */ extern size_t my_caseup_str_8bit(CHARSET_INFO *, char *); diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 5eb55463e85..9fb462e9a9d 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -921,73 +921,9 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs, (to_cs == from_cs) || my_charset_same(from_cs, to_cs)) { - if (to_length < to_cs->mbminlen || !nchars) - { - m_source_end_pos= from; - m_cannot_convert_error_pos= NULL; - m_well_formed_error_pos= NULL; - return 0; - } - - if (to_cs == &my_charset_bin) - { - res= MY_MIN(MY_MIN(nchars, to_length), from_length); - memmove(to, from, res); - m_source_end_pos= from + res; - m_well_formed_error_pos= NULL; - m_cannot_convert_error_pos= NULL; - } - else - { - int well_formed_error; - uint from_offset; - - if ((from_offset= (from_length % to_cs->mbminlen)) && - (from_cs == &my_charset_bin)) - { - /* - Copying from BINARY to UCS2 needs to prepend zeros sometimes: - INSERT INTO t1 (ucs2_column) VALUES (0x01); - 0x01 -> 0x0001 - */ - uint pad_length= to_cs->mbminlen - from_offset; - bzero(to, pad_length); - memmove(to + pad_length, from, from_offset); - /* - In some cases left zero-padding can create an incorrect character. - For example: - INSERT INTO t1 (utf32_column) VALUES (0x110000); - We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! 
- The valid characters range is limited to 0x00000000..0x0010FFFF. - - Make sure we didn't pad to an incorrect character. - */ - if (to_cs->cset->well_formed_len(to_cs, - to, to + to_cs->mbminlen, 1, - &well_formed_error) != - to_cs->mbminlen) - { - m_source_end_pos= m_well_formed_error_pos= from; - m_cannot_convert_error_pos= NULL; - return 0; - } - nchars--; - from+= from_offset; - from_length-= from_offset; - to+= to_cs->mbminlen; - to_length-= to_cs->mbminlen; - } - - set_if_smaller(from_length, to_length); - res= to_cs->cset->well_formed_len(to_cs, from, from + from_length, - nchars, &well_formed_error); - memmove(to, from, res); - m_source_end_pos= from + res; - m_well_formed_error_pos= well_formed_error ? from + res : NULL; - m_cannot_convert_error_pos= NULL; - if (from_offset) - res+= to_cs->mbminlen; - } + m_cannot_convert_error_pos= NULL; + return to_cs->cset->copy_abort(to_cs, to, to_length, from, from_length, + nchars, this); } else { diff --git a/sql/sql_string.h b/sql/sql_string.h index a40ac536f04..d89adb6bf51 100644 --- a/sql/sql_string.h +++ b/sql/sql_string.h @@ -43,10 +43,8 @@ inline uint32 copy_and_convert(char *to, uint32 to_length, } -class String_copier +class String_copier: private MY_STRCOPY_STATUS { - const char *m_source_end_pos; - const char *m_well_formed_error_pos; const char *m_cannot_convert_error_pos; public: const char *source_end_pos() const diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index 38bdf86c64a..a9eb2b1b318 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6922,7 +6922,8 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; struct charset_info_st my_charset_big5_chinese_ci= diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 2e699db0bd3..6b53b34159a 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -548,7 +548,8 @@ static MY_CHARSET_HANDLER my_charset_handler= 
my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_8bit, }; diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 86f450718d7..66b352721db 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -34800,7 +34800,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index b7065369258..36d99eec375 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -10007,7 +10007,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 0ce179b3a2d..8c47b666cf4 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -67549,7 +67549,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index 0399660d311..b5aeed2088f 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -6410,7 +6410,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index f1b46ca4e6c..d282d96145d 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -10806,7 +10806,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index babf74599ea..099f03460ce 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -421,7 +421,8 @@ static 
MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_8bit, }; diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index cc0513dbc90..fc41563324a 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -423,6 +423,29 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e, } +/* + Copy a multi-byte string. Abort if a bad byte sequence was found. + Not more than "nchars" characters are copied. +*/ +size_t +my_copy_abort_mb(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + int well_formed_error; + size_t res; + + set_if_smaller(src_length, dst_length); + res= cs->cset->well_formed_len(cs, src, src + src_length, + nchars, &well_formed_error); + memmove(dst, src, res); + status->m_source_end_pos= src + res; + status->m_well_formed_error_pos= well_formed_error ? src + res : NULL; + return res; +} + + uint my_instr_mb(CHARSET_INFO *cs, const char *b, size_t b_length, const char *s, size_t s_length, diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index 7f13cef4474..b010c528979 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1108,6 +1108,25 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)), } +/* + Copy an 8-bit string. Not more than "nchars" characters are copied. 
+*/ +size_t +my_copy_8bit(CHARSET_INFO *cs __attribute__((unused)), + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + set_if_smaller(src_length, dst_length); + set_if_smaller(src_length, nchars); + if (src_length) + memmove(dst, src, src_length); + status->m_source_end_pos= src + src_length; + status->m_well_formed_error_pos= NULL; + return src_length; +} + + size_t my_lengthsp_8bit(CHARSET_INFO *cs __attribute__((unused)), const char *ptr, size_t length) { @@ -1886,7 +1905,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_8bit, }; MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler = diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index c6e55879102..2038632c9d3 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -34172,7 +34172,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 61477f177c1..343fb812e20 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -885,7 +885,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_8bit, }; diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index a560eb08bae..8f234e9e3a8 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -92,6 +92,65 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)), } +/* + Copy a UCS2/UTF16/UTF32 string. + Not more than "nchars" characters are copied. + + UCS2/UTF16/UTF32 may need to prepend some zero bytes, + e.g. 
when copying from a BINARY source: + INSERT INTO t1 (ucs2_column) VALUES (0x01); + 0x01 -> 0x0001 +*/ +static size_t +my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + size_t src_offset; + + if ((src_offset= (src_length % cs->mbminlen))) + { + int well_formed_error; + size_t pad_length; + if (dst_length < cs->mbminlen || !nchars) + { + status->m_source_end_pos= status->m_well_formed_error_pos= src; + return 0; + } + + pad_length= cs->mbminlen - src_offset; + bzero(dst, pad_length); + memmove(dst + pad_length, src, src_offset); + /* + In some cases left zero-padding can create an incorrect character. + For example: + INSERT INTO t1 (utf32_column) VALUES (0x110000); + We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! + The valid characters range is limited to 0x00000000..0x0010FFFF. + + Make sure we didn't pad to an incorrect character. + */ + if (cs->cset->well_formed_len(cs, + dst, dst + cs->mbminlen, 1, + &well_formed_error) != cs->mbminlen) + { + status->m_source_end_pos= status->m_well_formed_error_pos= src; + return 0; + } + nchars--; + src+= src_offset; + src_length-= src_offset; + dst+= cs->mbminlen; + dst_length-= cs->mbminlen; + return + cs->mbminlen /* The left-padded character */ + + my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status); + } + return my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status); +} + + static long my_strntol_mb2_or_mb4(CHARSET_INFO *cs, const char *nptr, size_t l, int base, @@ -1682,7 +1741,8 @@ MY_CHARSET_HANDLER my_charset_utf16_handler= my_strntod_mb2_or_mb4, my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, - my_scan_mb2 + my_scan_mb2, + my_copy_abort_mb2_or_mb4, }; @@ -1851,7 +1911,8 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler= my_strntod_mb2_or_mb4, my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, - my_scan_mb2 + my_scan_mb2, + my_copy_abort_mb2_or_mb4, 
}; @@ -2765,7 +2826,8 @@ MY_CHARSET_HANDLER my_charset_utf32_handler= my_strntod_mb2_or_mb4, my_strtoll10_utf32, my_strntoull10rnd_mb2_or_mb4, - my_scan_utf32 + my_scan_utf32, + my_copy_abort_mb2_or_mb4, }; @@ -3383,7 +3445,8 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_strntod_mb2_or_mb4, my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, - my_scan_mb2 + my_scan_mb2, + my_copy_abort_mb2_or_mb4, }; diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index e7dbefe6c1d..f208d15f364 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -67295,7 +67295,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index d0a64d11c84..1116228f706 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -5614,7 +5614,8 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; @@ -7167,7 +7168,8 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; @@ -8110,7 +8112,8 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_strntod_8bit, my_strtoll10_8bit, my_strntoull10rnd_8bit, - my_scan_8bit + my_scan_8bit, + my_copy_abort_mb, }; |