From 7063bd4d2bfe4688db60b28a15843406299a58f0 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 9 Dec 2005 16:37:58 +0400 Subject: Bug#15377 Valid multibyte sequences are truncated on INSERT ctype-euc_kr.c: ctype-gb2312.c: Adding specific well_formed_length functions for gb2312 and euckr, to allow storing characters which are correct according to the character set specifications but just don't have Unicode mapping. Previously only those which have Unicode mapping could be stored, while unassigned characters lead to data truncation. Many files: new file strings/ctype-gb2312.c: Bug#15377 Valid multibyte sequences are truncated on INSERT Adding specific well_formed_length functions for gb2312 and euckr, to allow storing characters which are correct according to the character set. Previously only those which have Unicode mapping could be stored. strings/ctype-euc_kr.c: Adding specific well_formed_length functions for gb2312 and euckr, to allow storing characters which are correct according to the character set. Previously only those which have Unicode mapping could be stored. --- strings/ctype-euc_kr.c | 37 ++++++++++++++++++++++++++++++++++++- strings/ctype-gb2312.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) (limited to 'strings') diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index f15e97de5be..2863b192f50 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8635,6 +8635,41 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), } +/* + Returns well formed length of a EUC-KR string. +*/ +static uint +my_well_formed_len_euckr(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + uint pos, int *error) +{ + const char *b0= b; + const char *emb= e - 1; /* Last possible end of an MB character */ + + *error= 0; + while (pos-- && b < e) + { + if ((uchar) b[0] < 128) + { + /* Single byte ascii character */ + b++; + } + else if (b < emb && iseuc_kr(*b) && iseuc_kr(b[1])) + { + /* Double byte character */ + b+= 2; + } + else + { + /* Wrong byte sequence */ + *error= 1; + break; + } + } + return (uint) (b - b0); +} + + static MY_COLLATION_HANDLER my_collation_ci_handler = { NULL, /* init */ @@ -8655,7 +8690,7 @@ static MY_CHARSET_HANDLER my_charset_handler= mbcharlen_euc_kr, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_euckr, my_lengthsp_8bit, my_numcells_8bit, my_mb_wc_euc_kr, /* mb_wc */ diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index 0cbad2d1c55..52dd61a8462 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5686,6 +5686,41 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), } +/* + Returns well formed length of a EUC-KR string. +*/ +static uint +my_well_formed_len_gb2312(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + uint pos, int *error) +{ + const char *b0= b; + const char *emb= e - 1; /* Last possible end of an MB character */ + + *error= 0; + while (pos-- && b < e) + { + if ((uchar) b[0] < 128) + { + /* Single byte ascii character */ + b++; + } + else if (b < emb && isgb2312head(*b) && isgb2312tail(b[1])) + { + /* Double byte character */ + b+= 2; + } + else + { + /* Wrong byte sequence */ + *error= 1; + break; + } + } + return (uint) (b - b0); +} + + static MY_COLLATION_HANDLER my_collation_ci_handler = { NULL, /* init */ @@ -5706,7 +5741,7 @@ static MY_CHARSET_HANDLER my_charset_handler= mbcharlen_gb2312, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_gb2312, my_lengthsp_8bit, my_numcells_8bit, my_mb_wc_gb2312, /* mb_wc */ -- cgit v1.2.1