diff options
author | Sergei Golubchik <sergii@pisem.net> | 2014-02-26 15:28:07 +0100 |
---|---|---|
committer | Sergei Golubchik <sergii@pisem.net> | 2014-02-26 15:28:07 +0100 |
commit | 0dc23679c867629ded5f9534d2ab6e8edf238aa0 (patch) | |
tree | 9cf966507fa2ef0fd17932b600d051df5f7bd2e5 /strings/ctype-utf8.c | |
parent | 6efa5efa7dd112b6ac2efdd84235a13cca51c4d4 (diff) | |
parent | 0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2 (diff) | |
download | mariadb-git-0dc23679c867629ded5f9534d2ab6e8edf238aa0.tar.gz |
10.0-base merge
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r-- | strings/ctype-utf8.c | 267 |
1 files changed, 231 insertions, 36 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index e564a85c828..aba179b154c 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -26,6 +26,7 @@ #define EILSEQ ENOENT #endif +#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci" #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs" @@ -56,6 +57,46 @@ #define HAVE_UNIDATA #endif + +#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4) + +static inline +int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) +{ + uchar c; + + DBUG_ASSERT(s < e); + c= s[0]; + if (c < 0x80) + return 1; + + if (c < 0xc2) + return MY_CS_ILSEQ; + + if (c < 0xe0) + { + if (s+2 > e) /* We need 2 characters */ + return MY_CS_TOOSMALL2; + + if (!(IS_CONTINUATION_BYTE(s[1]))) + return MY_CS_ILSEQ; + + return 2; + } + + DBUG_ASSERT(c < 0xf0); + if (s+3 > e) /* We need 3 characters */ + return MY_CS_TOOSMALL3; + + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && + (c >= 0xe1 || s[1] >= 0xa0))) + return MY_CS_ILSEQ; + + return 3; +} + +#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/ + #ifdef HAVE_UNIDATA #include "my_uctype.h" @@ -4806,7 +4847,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -4817,7 +4858,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -4833,9 +4874,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+4 > e) /* We need 4 characters */ return MY_CS_TOOSMALL4; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90))) return MY_CS_ILSEQ; @@ -4851,10 +4892,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if (s+5 >e) /* We need 5 characters */ return MY_CS_TOOSMALL5; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && (c >= 0xf9 || s[1] >= 0x88))) return MY_CS_ILSEQ; @@ -4870,11 +4911,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), if ( s+6 >e ) /* We need 6 characters */ return MY_CS_TOOSMALL6; - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && - (s[4] ^ 0x80) < 0x40 && - (s[5] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && (c >= 0xfd || s[1] >= 0x84))) return MY_CS_ILSEQ; @@ -4918,11 +4959,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -5404,10 +5445,90 @@ int my_wildcmp_utf8(CHARSET_INFO *cs, } +static +int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + +#ifdef UNICODE_32BIT + if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) + { + if (s+4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90))) + return MY_CS_ILSEQ; + + return 4; + } + if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) + { + if (s+5 >e) /* We need 5 characters */ + return MY_CS_TOOSMALL5; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + (c >= 0xf9 || s[1] >= 0x88))) + return MY_CS_ILSEQ; + + return 5; + } + if (c < 0xfe && sizeof(my_wc_t)*8 >= 32) + { + if ( s+6 >e ) /* We need 6 characters */ + return MY_CS_TOOSMALL6; + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + IS_CONTINUATION_BYTE(s[4]) && + IS_CONTINUATION_BYTE(s[5]) && + (c >= 0xfd || s[1] >= 0x84))) + return MY_CS_ILSEQ; + + return 6; + } +#endif + return MY_CS_ILSEQ; +} + +static size_t +my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - my_wc_t wc; - int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); return (res>1) ? res : 0; } @@ -5472,7 +5593,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_mbcharlen_utf8, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8, my_lengthsp_8bit, my_numcells_mb, my_utf8_uni, @@ -7244,7 +7365,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 2 > e) /* We need 2 characters */ return MY_CS_TOOSMALL2; - if (!((s[1] ^ 0x80) < 0x40)) + if (!(IS_CONTINUATION_BYTE(s[1]))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); @@ -7255,7 +7376,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), if (s + 3 > e) /* We need 3 characters */ return MY_CS_TOOSMALL3; - if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -7288,9 +7409,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), [F4][80..8F][80..BF][80..BF] */ - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -7326,17 +7447,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), if (c < 0xe0) { - if (!((s[1] ^ 0x80) < 0x40)) + if (!IS_CONTINUATION_BYTE(s[1])) return MY_CS_ILSEQ; *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80); return 2; } - + if (c < 0xf0) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; *pwc= ((my_wc_t) (c & 0x0f) << 12) | @@ -7347,9 +7468,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), } else if (c < 0xf5) { - if (!((s[1] ^ 0x80) < 0x40 && - (s[2] ^ 0x80) < 0x40 && - (s[3] ^ 0x80) < 0x40 && + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && (c >= 0xf1 || s[1] >= 0x90) && (c <= 0xf3 || s[1] <= 0x8F))) return MY_CS_ILSEQ; @@ -7836,11 +7957,84 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs, } +static int +my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + uchar c; + + if (s >= e) + return MY_CS_TOOSMALL; + + c= s[0]; + if (c < 0xf0) + return my_valid_mbcharlen_utf8mb3(s, e); + + if (c < 0xf5) + { + if (s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + UTF-8 quick four-byte mask: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Encoding allows to encode U+00010000..U+001FFFFF + + The maximum character defined in the Unicode standard is U+0010FFFF. + Higher characters U+00110000..U+001FFFFF are not used. + + 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min) + 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max) + + Valid codes: + [F0][90..BF][80..BF][80..BF] + [F1][80..BF][80..BF][80..BF] + [F2][80..BF][80..BF][80..BF] + [F3][80..BF][80..BF][80..BF] + [F4][80..8F][80..BF][80..BF] + */ + + if (!(IS_CONTINUATION_BYTE(s[1]) && + IS_CONTINUATION_BYTE(s[2]) && + IS_CONTINUATION_BYTE(s[3]) && + (c >= 0xf1 || s[1] >= 0x90) && + (c <= 0xf3 || s[1] <= 0x8F))) + return MY_CS_ILSEQ; + + return 4; + } + + return MY_CS_ILSEQ; +} + + +static +size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, + const char *b, const char *e, + size_t pos, int *error) +{ + const char *b_start= b; + *error= 0; + while (pos) + { + int mb_len; + + if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + { + *error= b < e ? 1 : 0; + break; + } + b+= mb_len; + pos--; + } + return (size_t) (b - b_start); +} + + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - my_wc_t wc; - int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e); + int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); return (res > 1) ? res : 0; } @@ -7901,7 +8095,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_mbcharlen_utf8mb4, my_numchars_mb, my_charpos_mb, - my_well_formed_len_mb, + my_well_formed_len_utf8mb4, my_lengthsp_8bit, my_numcells_mb, my_mb_wc_utf8mb4, @@ -7963,7 +8157,8 @@ struct charset_info_st my_charset_utf8mb4_general_ci= struct charset_info_st my_charset_utf8mb4_bin= { 46,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */ + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE| + MY_CS_UNICODE_SUPPLEMENT, /* state */ MY_UTF8MB4, /* cs name */ MY_UTF8MB4_BIN, /* name */ "UTF-8 Unicode", /* comment */ |