diff options
author | Alexander Barkov <bar@mnogosearch.org> | 2013-03-28 17:19:09 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mnogosearch.org> | 2013-03-28 17:19:09 +0400 |
commit | d1e162e011d77e64cc98afa76ed5e67b2bca6381 (patch) | |
tree | 55335bb289b5712e4bf1589bc9cc10313918d23a /strings | |
parent | 41013f16a05f45524a01e40cc48bc8e6f9904f55 (diff) | |
download | mariadb-git-d1e162e011d77e64cc98afa76ed5e67b2bca6381.tar.gz |
Merging utf16le from MySQL-5.6
added:
mysql-test/include/ctype_heap.inc
mysql-test/include/ctype_strtoll10.inc
mysql-test/r/ctype_utf16le.result
mysql-test/t/ctype_utf16le.test
modified:
cmake/character_sets.cmake
include/m_ctype.h
mysql-test/r/ctype_ucs.result
mysql-test/r/ctype_utf16.result
mysql-test/r/ctype_utf32.result
mysql-test/suite/funcs_1/r/innodb_func_view.result
mysql-test/suite/funcs_1/r/memory_func_view.result
mysql-test/suite/funcs_1/r/myisam_func_view.result
mysql-test/suite/sys_vars/r/character_set_client_basic.result
mysql-test/suite/sys_vars/r/character_set_connection_basic.result
mysql-test/suite/sys_vars/r/character_set_database_basic.result
mysql-test/suite/sys_vars/r/character_set_filesystem_basic.result
mysql-test/suite/sys_vars/r/character_set_results_basic.result
mysql-test/t/ctype_ucs.test
mysql-test/t/ctype_utf16.test
mysql-test/t/ctype_utf32.test
mysys/charset-def.c
sql/item_func.cc
sql/sys_vars.cc
strings/ctype-latin1.c
strings/ctype-ucs2.c
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-latin1.c | 5 |
-rw-r--r-- | strings/ctype-ucs2.c | 521 |
2 files changed, 395 insertions, 131 deletions
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 80ae11c82c2..fd327925fb8 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -380,7 +380,10 @@ int my_wc_mb_latin1(CHARSET_INFO *cs __attribute__((unused)), if (str >= end) return MY_CS_TOOSMALL; - pl= uni_to_cs[(wc>>8) & 0xFF]; + if (wc > 0xFFFF) + return MY_CS_ILUNI; + + pl= uni_to_cs[wc >> 8]; str[0]= pl ? pl[wc & 0xFF] : '\0'; return (!str[0] && wc) ? MY_CS_ILUNI : 1; } diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 6ebbae8fb5a..fa43f36fb84 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -101,6 +101,7 @@ my_strntol_mb2_or_mb4(CHARSET_INFO *cs, int overflow; int cnv; my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; register unsigned int cutlim; register uint32 cutoff; register uint32 res; @@ -111,7 +112,7 @@ my_strntol_mb2_or_mb4(CHARSET_INFO *cs, *err= 0; do { - if ((cnv= cs->cset->mb_wc(cs, &wc, s, e))>0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { switch (wc) { @@ -141,7 +142,7 @@ bs: cutlim= (uint) (((uint32)~0L) % (uint32) base); do { - if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { s+= cnv; if (wc >= '0' && wc <= '9') @@ -212,6 +213,7 @@ my_strntoul_mb2_or_mb4(CHARSET_INFO *cs, int overflow; int cnv; my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; register unsigned int cutlim; register uint32 cutoff; register uint32 res; @@ -222,7 +224,7 @@ my_strntoul_mb2_or_mb4(CHARSET_INFO *cs, *err= 0; do { - if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { switch (wc) { @@ -253,7 +255,7 @@ bs: do { - if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { s+= cnv; if (wc >= '0' && wc <= '9') @@ -316,6 +318,7 @@ my_strntoll_mb2_or_mb4(CHARSET_INFO *cs, int overflow; int cnv; my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; register ulonglong cutoff; register unsigned int cutlim; register ulonglong res; @@ -326,7 
+329,7 @@ my_strntoll_mb2_or_mb4(CHARSET_INFO *cs, *err= 0; do { - if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { switch (wc) { @@ -356,7 +359,7 @@ bs: cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base); do { - if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { s+=cnv; if ( wc>='0' && wc<='9') @@ -427,6 +430,7 @@ my_strntoull_mb2_or_mb4(CHARSET_INFO *cs, int overflow; int cnv; my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; register ulonglong cutoff; register unsigned int cutlim; register ulonglong res; @@ -437,7 +441,7 @@ my_strntoull_mb2_or_mb4(CHARSET_INFO *cs, *err= 0; do { - if ((cnv= cs->cset->mb_wc(cs,&wc,s,e)) > 0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { switch (wc) { @@ -468,7 +472,7 @@ bs: do { - if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0) + if ((cnv= mb_wc(cs, &wc, s, e)) > 0) { s+=cnv; if ( wc>='0' && wc<='9') @@ -533,6 +537,7 @@ my_strntod_mb2_or_mb4(CHARSET_INFO *cs, register const uchar *s= (const uchar*) nptr; const uchar *end; my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; int cnv; *err= 0; @@ -541,7 +546,7 @@ my_strntod_mb2_or_mb4(CHARSET_INFO *cs, length= sizeof(buf) - 1; end= s + length; - while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0) + while ((cnv= mb_wc(cs, &wc, s, end)) > 0) { s+= cnv; if (wc > (int) (uchar) 'e' || !wc) @@ -566,6 +571,7 @@ my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs, ulonglong res; const uchar *end, *s= (const uchar*) nptr; my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; int cnv; /* Cut too long strings */ @@ -573,7 +579,7 @@ my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs, length= sizeof(buf)-1; end= s + length; - while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0) + while ((cnv= mb_wc(cs, &wc, s, end)) > 0) { s+= cnv; if (wc > (int) (uchar) 'e' || !wc) @@ -712,23 +718,36 @@ static longlong my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)), const char *nptr, char **endptr, int *error) { - const char *s, 
*end, *start, *n_end, *true_end; + const uchar *s, *end, *start, *n_end, *true_end; uchar c; unsigned long i, j, k; ulonglong li; int negative; ulong cutoff, cutoff2, cutoff3; + my_wc_t wc; + int res; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; - s= nptr; + s= (const uchar *) nptr; /* If fixed length string */ if (endptr) { - /* Make sure string length is even */ - end= s + ((*endptr - s) / 2) * 2; - while (s < end && !s[0] && (s[1] == ' ' || s[1] == '\t')) - s+= 2; - if (s == end) - goto no_conv; + /* + Make sure string length is even. + Odd length indicates a bug in the caller. + Assert in debug, round in production. + */ + DBUG_ASSERT((*endptr - (const char *) s) % 2 == 0); + end= s + ((*endptr - (const char*) s) / 2) * 2; + + for ( ; ; ) /* Skip leading spaces and tabs */ + { + if ((res= mb_wc(cs, &wc, s, end)) <= 0) + goto no_conv; + s+= res; + if (wc != ' ' && wc != '\t') + break; + } } else { @@ -738,13 +757,13 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)), /* Check for a sign. */ negative= 0; - if (!s[0] && s[1] == '-') + if (wc == '-') { *error= -1; /* Mark as negative number */ negative= 1; - s+= 2; - if (s == end) + if ((res= mb_wc(cs, &wc, s, end)) <= 0) goto no_conv; + s+= res; /* wc is now expected to hold the first digit. */ cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2; cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100; cutoff3= MAX_NEGATIVE_NUMBER % 100; @@ -752,46 +771,53 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)), else { *error= 0; - if (!s[0] && s[1] == '+') + if (wc == '+') { - s+= 2; - if (s == end) + if ((res= mb_wc(cs, &wc, s, end)) <= 0) goto no_conv; + s+= res; /* wc is now expected to hold the first digit. */ } cutoff= ULONGLONG_MAX / LFACTOR2; cutoff2= ULONGLONG_MAX % LFACTOR2 / 100; cutoff3= ULONGLONG_MAX % 100; } - /* Handle case where we have a lot of pre-zero */ - if (!s[0] && s[1] == '0') + /* + The code below assumes that 'wc' holds the first digit + and 's' points to the next character after it. 
+ + Scan pre-zeros if any. + */ + if (wc == '0') { i= 0; - do + for ( ; ; s+= res) { - s+= 2; if (s == end) goto end_i; /* Return 0 */ + if ((res= mb_wc(cs, &wc, s, end)) <= 0) + goto no_conv; + if (wc != '0') + break; } - while (!s[0] && s[1] == '0'); n_end= s + 2 * INIT_CNT; } else { /* Read first digit to check that it's a valid number */ - if (s[0] || (c= (s[1]-'0')) > 9) + if ((i= (wc - '0')) > 9) goto no_conv; - i= c; - s+= 2; n_end= s + 2 * (INIT_CNT-1); } /* Handle first 9 digits and store them in i */ if (n_end > end) n_end= end; - for (; s != n_end ; s+= 2) + for ( ; ; s+= res) { - if (s[0] || (c= (s[1]-'0')) > 9) + if ((res= mb_wc(cs, &wc, s, n_end)) <= 0) + break; + if ((c= (wc - '0')) > 9) goto end_i; i= i*10+c; } @@ -806,10 +832,12 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)), n_end= end; do { - if (s[0] || (c= (s[1]-'0')) > 9) + if ((res= mb_wc(cs, &wc, s, end)) <= 0) + goto no_conv; + if ((c= (wc - '0')) > 9) goto end_i_and_j; - j= j*10+c; - s+= 2; + s+= res; + j= j * 10 + c; } while (s != n_end); if (s == end) { @@ -817,20 +845,26 @@ my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)), goto end_i_and_j; goto end3; } - if (s[0] || (c= (s[1]-'0')) > 9) - goto end3; /* Handle the next 1 or 2 digits and store them in k */ - k=c; - s+= 2; - if (s == end || s[0] || (c= (s[1]-'0')) > 9) + if ((res= mb_wc(cs, &wc, s, end)) <= 0) + goto no_conv; + if ((k= (wc - '0')) > 9) + goto end3; + s+= res; + + if (s == end) + goto end4; + if ((res= mb_wc(cs, &wc, s, end)) <= 0) + goto no_conv; + if ((c= (wc - '0')) > 9) goto end4; + s+= res; k= k*10+c; - s+= 2; *endptr= (char*) s; /* number string should have ended here */ - if (s != end && !s[0] && (c= (s[1]-'0')) <= 9) + if (s != end && mb_wc(cs, &wc, s, end) > 0 && ((uchar) (wc - '0')) <= 9) goto overflow; /* Check that we didn't get an overflow with the last digit */ @@ -882,15 +916,18 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)), const char *str, const char *end, int 
sequence_type) { const char *str0= str; - end--; /* for easier loop condition, because of two bytes per character */ - + my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; + int res; + switch (sequence_type) { case MY_SEQ_SPACES: - for ( ; str < end; str+= 2) + for (res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end); + res > 0 && wc == ' '; + str+= res, + res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end)) { - if (str[0] != '\0' || str[1] != ' ') - break; } return (size_t) (str - str0); default: @@ -900,11 +937,33 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)), static void -my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)), - char *s, size_t l, int fill) +my_fill_mb2(CHARSET_INFO *cs, char *s, size_t slen, int fill) { - DBUG_ASSERT(fill <= 0xFFFF); - for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2); + char buf[10], *last; + int buflen, remainder; + + DBUG_ASSERT((slen % 2) == 0); + + buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf, + (uchar*) buf + sizeof(buf)); + + DBUG_ASSERT(buflen > 0); + + /* + "last" in the last position where a sequence of "buflen" bytes can start. + */ + for (last= s + slen - buflen; s <= last; s+= buflen) + { + /* Enough space for the characer */ + memcpy(s, buf, (size_t) buflen); + } + + /* + If there are some more space which is not enough + for the whole multibyte character, then add trailing zeros. 
+ */ + if ((remainder= last + buflen - s) > 0) + bzero(s, (size_t) remainder); } @@ -1018,11 +1077,26 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), DB80..DBFF - Private surrogate high (128 pages) DC00..DFFF - Surrogate low (1024 codes in a page) */ +#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800 +#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF +#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00 +#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF #define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) #define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) #define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) +#define MY_UTF16_WC2(a, b) ((a << 8) + b) + +/* + a= 110110?? (<< 18) + b= ???????? (<< 10) + c= 110111?? (<< 8) + d= ???????? (<< 0) +*/ +#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \ + ((c & 3) << 8) + d + 0x10000) + static int my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) @@ -1044,23 +1118,14 @@ my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)), if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */ return MY_CS_ILSEQ; - /* - s[0]= 110110?? (<< 18) - s[1]= ???????? (<< 10) - s[2]= 110111?? (<< 8) - s[3]= ???????? 
(<< 0) - */ - - *pwc= ((s[0] & 3) << 18) + (s[1] << 10) + - ((s[2] & 3) << 8) + s[3] + 0x10000; - + *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]); return 4; } if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */ return MY_CS_ILSEQ; - - *pwc= (s[0] << 8) + s[1]; + + *pwc= MY_UTF16_WC2(s[0], s[1]); return 2; } @@ -1098,7 +1163,7 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)), static inline void my_tolower_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) { - int page= *wc >> 8; + uint page= *wc >> 8; if (page < 256 && uni_plane[page]) *wc= uni_plane[page][*wc & 0xFF].tolower; } @@ -1107,7 +1172,7 @@ my_tolower_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) static inline void my_toupper_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) { - int page= *wc >> 8; + uint page= *wc >> 8; if (page < 256 && uni_plane[page]) *wc= uni_plane[page][*wc & 0xFF].toupper; } @@ -1116,7 +1181,7 @@ my_toupper_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) static inline void my_tosort_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) { - int page= *wc >> 8; + uint page= *wc >> 8; if (page < 256) { if (uni_plane[page]) @@ -1135,16 +1200,18 @@ my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen, size_t dstlen __attribute__((unused))) { my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; + my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb; int res; char *srcend= src + srclen; MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && - (res= my_utf16_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0) + (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0) { my_toupper_utf16(uni_plane, &wc); - if (res != my_uni_utf16(cs, wc, (uchar*) src, (uchar*) srcend)) + if (res != wc_mb(cs, wc, (uchar *) src, (uchar *) srcend)) break; src+= res; } @@ -1157,14 +1224,12 @@ my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen, ulong *n1, ulong *n2) { my_wc_t wc; + 
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; int res; - const uchar *e= s+slen; + const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen); MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; - while (e > s + 1 && e[-1] == ' ' && e[-2] == '\0') - e-= 2; - - while ((s < e) && (res= my_utf16_uni(cs, &wc, (uchar *)s, (uchar*)e)) > 0) + while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0) { my_tosort_utf16(uni_plane, &wc); n1[0]^= (((n1[0] & 63) + n2[0]) * (wc & 0xFF)) + (n1[0] << 8); @@ -1182,16 +1247,18 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen, size_t dstlen __attribute__((unused))) { my_wc_t wc; + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; + my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb; int res; char *srcend= src + srclen; MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && - (res= my_utf16_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0) + (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0) { my_tolower_utf16(uni_plane, &wc); - if (res != my_uni_utf16(cs, wc, (uchar*) src, (uchar*) srcend)) + if (res != wc_mb(cs, wc, (uchar *) src, (uchar *) srcend)) break; src+= res; } @@ -1207,14 +1274,15 @@ my_strnncoll_utf16(CHARSET_INFO *cs, { int s_res, t_res; my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; const uchar *se= s + slen; const uchar *te= t + tlen; MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; while (s < se && t < te) { - s_res= my_utf16_uni(cs, &s_wc, s, se); - t_res= my_utf16_uni(cs, &t_wc, t, te); + s_res= mb_wc(cs, &s_wc, s, se); + t_res= mb_wc(cs, &t_wc, t, te); if (s_res <= 0 || t_res <= 0) { @@ -1271,6 +1339,7 @@ my_strnncollsp_utf16(CHARSET_INFO *cs, { int res; my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; const uchar *se= s + slen, *te= t + tlen; MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; @@ -1283,8 +1352,8 @@ 
my_strnncollsp_utf16(CHARSET_INFO *cs, while (s < se && t < te) { - int s_res= my_utf16_uni(cs, &s_wc, s, se); - int t_res= my_utf16_uni(cs, &t_wc, t, te); + int s_res= mb_wc(cs, &s_wc, s, se); + int t_res= mb_wc(cs, &t_wc, t, te); if (s_res <= 0 || t_res <= 0) { @@ -1324,7 +1393,7 @@ my_strnncollsp_utf16(CHARSET_INFO *cs, for ( ; s < se; s+= s_res) { - if ((s_res= my_utf16_uni(cs, &s_wc, s, se)) < 0) + if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0) { DBUG_ASSERT(0); return 0; @@ -1338,22 +1407,11 @@ my_strnncollsp_utf16(CHARSET_INFO *cs, static uint -my_ismbchar_utf16(CHARSET_INFO *cs __attribute__((unused)), - const char *b __attribute__((unused)), - const char *e __attribute__((unused))) +my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e) { - if (b + 2 > e) - return 0; - - if (MY_UTF16_HIGH_HEAD(*b)) - { - return (b + 4 <= e) && MY_UTF16_LOW_HEAD(b[2]) ? 4 : 0; - } - - if (MY_UTF16_LOW_HEAD(*b)) - return 0; - - return 2; + my_wc_t wc; + int res= cs->cset->mb_wc(cs, &wc, (const uchar *) b, (const uchar *) e); + return (uint) (res > 0 ? res : 0); } @@ -1361,6 +1419,7 @@ static uint my_mbcharlen_utf16(CHARSET_INFO *cs __attribute__((unused)), uint c __attribute__((unused))) { + DBUG_ASSERT(0); return MY_UTF16_HIGH_HEAD(c) ? 
4 : 2; } @@ -1449,13 +1508,14 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs, { int s_res,t_res; my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; const uchar *se=s+slen; const uchar *te=t+tlen; while ( s < se && t < te ) { - s_res= my_utf16_uni(cs,&s_wc, s, se); - t_res= my_utf16_uni(cs,&t_wc, t, te); + s_res= mb_wc(cs, &s_wc, s, se); + t_res= mb_wc(cs, &t_wc, t, te); if (s_res <= 0 || t_res <= 0) { @@ -1482,6 +1542,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, { int res; my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); + my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; const uchar *se= s + slen, *te= t + tlen; DBUG_ASSERT((slen % 2) == 0); @@ -1493,8 +1554,8 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, while (s < se && t < te) { - int s_res= my_utf16_uni(cs, &s_wc, s, se); - int t_res= my_utf16_uni(cs, &t_wc, t, te); + int s_res= mb_wc(cs, &s_wc, s, se); + int t_res= mb_wc(cs, &t_wc, t, te); if (s_res <= 0 || t_res <= 0) { @@ -1531,7 +1592,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, for ( ; s < se; s+= s_res) { - if ((s_res= my_utf16_uni(cs, &s_wc, s, se)) < 0) + if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0) { DBUG_ASSERT(0); return 0; @@ -1545,17 +1606,11 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, static void -my_hash_sort_utf16_bin(CHARSET_INFO *cs __attribute__((unused)), - const uchar *key, size_t len,ulong *nr1, ulong *nr2) +my_hash_sort_utf16_bin(CHARSET_INFO *cs, + const uchar *pos, size_t len, ulong *nr1, ulong *nr2) { - const uchar *pos = key; - - key+= len; - - while (key > pos + 1 && key[-1] == ' ' && key[-2] == '\0') - key-= 2; - - for (; pos < (uchar*) key ; pos++) + const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len); + for ( ; pos < end ; pos++) { nr1[0]^= (ulong) ((((uint) nr1[0] & 63) + nr2[0]) * ((uint)*pos)) + (nr1[0] << 8); @@ -1664,7 +1719,7 @@ struct charset_info_st my_charset_utf16_general_ci= struct charset_info_st my_charset_utf16_bin= { 55,0,0, /* number */ - 
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, "utf16", /* cs name */ "utf16_bin", /* name */ "UTF-16 Unicode", /* comment */ @@ -1693,6 +1748,176 @@ struct charset_info_st my_charset_utf16_bin= &my_collation_utf16_bin_handler }; + +static int +my_utf16le_uni(const CHARSET_INFO *cs __attribute__((unused)), + my_wc_t *pwc, const uchar *s, const uchar *e) +{ + my_wc_t lo; + + if (s + 2 > e) + return MY_CS_TOOSMALL2; + + if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST || + (*pwc > MY_UTF16_SURROGATE_LOW_LAST)) + return 2; /* [0000-D7FF,E000-FFFF] */ + + if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST) + return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */ + + if (s + 4 > e) + return MY_CS_TOOSMALL4; + + s+= 2; + + if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST || + lo > MY_UTF16_SURROGATE_LOW_LAST) + return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */ + + *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF)); + return 4; +} + + +static int +my_uni_utf16le(const CHARSET_INFO *cs __attribute__((unused)), + my_wc_t wc, uchar *s, uchar *e) +{ + uint32 first, second, total; + if (wc < MY_UTF16_SURROGATE_HIGH_FIRST || + (wc > MY_UTF16_SURROGATE_LOW_LAST && + wc <= 0xFFFF)) + { + if (s + 2 > e) + return MY_CS_TOOSMALL2; + int2store(s, wc); + return 2; /* [0000-D7FF,E000-FFFF] */ + } + + if (wc < 0xFFFF || wc > 0x10FFFF) + return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */ + + if (s + 4 > e) + return MY_CS_TOOSMALL4; + + wc-= 0x10000; + first= (0xD800 | ((wc >> 10) & 0x3FF)); + second= (0xDC00 | (wc & 0x3FF)); + total= first | (second << 16); + int4store(s, total); + return 4; /* [010000-10FFFF] */ +} + + +static size_t +my_lengthsp_utf16le(const CHARSET_INFO *cs __attribute__((unused)), + const char *ptr, size_t length) +{ + const char *end= ptr + length; + while (end > ptr + 1 && uint2korr(end - 2) == ' ') + end-= 2; + return (size_t) 
(end - ptr); +} + + +static MY_CHARSET_HANDLER my_charset_utf16le_handler= +{ + NULL, /* init */ + my_ismbchar_utf16, + my_mbcharlen_utf16, + my_numchars_utf16, + my_charpos_utf16, + my_well_formed_len_utf16, + my_lengthsp_utf16le, + my_numcells_mb, + my_utf16le_uni, /* mb_wc */ + my_uni_utf16le, /* wc_mb */ + my_mb_ctype_mb, + my_caseup_str_mb2_or_mb4, + my_casedn_str_mb2_or_mb4, + my_caseup_utf16, + my_casedn_utf16, + my_snprintf_mb2, + my_l10tostr_mb2_or_mb4, + my_ll10tostr_mb2_or_mb4, + my_fill_mb2, + my_strntol_mb2_or_mb4, + my_strntoul_mb2_or_mb4, + my_strntoll_mb2_or_mb4, + my_strntoull_mb2_or_mb4, + my_strntod_mb2_or_mb4, + my_strtoll10_mb2, + my_strntoull10rnd_mb2_or_mb4, + my_scan_mb2 +}; + + +struct charset_info_st my_charset_utf16le_general_ci= +{ + 56,0,0, /* number */ + MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, + "utf16le", /* cs name */ + "utf16le_general_ci",/* name */ + "UTF-16LE Unicode", /* comment */ + NULL, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 1, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 4, /* mbmaxlen */ + 0, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf16le_handler, + &my_collation_utf16_general_ci_handler +}; + + +struct charset_info_st my_charset_utf16le_bin= +{ + 62,0,0, /* number */ + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, + "utf16le", /* cs name */ + "utf16le_bin", /* name */ + "UTF-16LE Unicode", /* comment */ + NULL, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* 
sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 1, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 4, /* mbmaxlen */ + 0, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf16le_handler, + &my_collation_utf16_bin_handler +}; + + #endif /* HAVE_CHARSET_utf16 */ @@ -1727,7 +1952,7 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)), static inline void my_tolower_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) { - int page= *wc >> 8; + uint page= *wc >> 8; if (page < 256 && uni_plane[page]) *wc= uni_plane[page][*wc & 0xFF].tolower; } @@ -1736,7 +1961,7 @@ my_tolower_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) static inline void my_toupper_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) { - int page= *wc >> 8; + uint page= *wc >> 8; if (page < 256 && uni_plane[page]) *wc= uni_plane[page][*wc & 0xFF].toupper; } @@ -1745,7 +1970,7 @@ my_toupper_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) static inline void my_tosort_utf32(MY_UNICASE_INFO *const* uni_plane, my_wc_t *wc) { - int page= *wc >> 8; + uint page= *wc >> 8; if (page < 256) { if (uni_plane[page]) @@ -2216,7 +2441,7 @@ my_strtoll10_utf32(CHARSET_INFO *cs __attribute__((unused)), if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9) goto end4; k= k * 10 + c; - s+= 2; + s+= 4; *endptr= (char*) s; /* number string should have ended here */ @@ -2589,7 +2814,7 @@ struct charset_info_st my_charset_utf32_general_ci= struct charset_info_st my_charset_utf32_bin= { 61,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, "utf32", /* cs name */ "utf32_bin", /* name */ "UTF-32 Unicode", /* comment */ @@ -2708,6 +2933,35 @@ static int 
my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) , } +static inline void +my_tolower_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc) +{ + uint page= *wc >> 8; + DBUG_ASSERT(page < 256); + if (uni_plane[page]) + *wc= uni_plane[page][*wc & 0xFF].tolower; +} + + +static inline void +my_toupper_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc) +{ + uint page= *wc >> 8; + DBUG_ASSERT(page < 256); + if (uni_plane[page]) + *wc= uni_plane[page][*wc & 0xFF].toupper; +} + + +static inline void +my_tosort_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc) +{ + uint page= *wc >> 8; + DBUG_ASSERT(page < 256); + if (uni_plane[page]) + *wc= uni_plane[page][*wc & 0xFF].sort; +} + static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, char *dst __attribute__((unused)), size_t dstlen __attribute__((unused))) @@ -2721,8 +2975,7 @@ static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, while ((src < srcend) && (res= my_ucs2_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0) { - int plane= (wc>>8) & 0xFF; - wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].toupper : wc; + my_toupper_ucs2(uni_plane, &wc); if (res != my_uni_ucs2(cs, wc, (uchar*) src, (uchar*) srcend)) break; src+= res; @@ -2744,8 +2997,7 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0) { - int plane = (wc>>8) & 0xFF; - wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc; + my_tosort_ucs2(uni_plane, &wc); n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8); n2[0]+=3; n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8); @@ -2768,8 +3020,7 @@ static size_t my_casedn_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, while ((src < srcend) && (res= my_ucs2_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0) { - int plane= (wc>>8) & 0xFF; - wc= uni_plane[plane] ? 
uni_plane[plane][wc & 0xFF].tolower : wc; + my_tolower_ucs2(uni_plane, &wc); if (res != my_uni_ucs2(cs, wc, (uchar*) src, (uchar*) srcend)) break; src+= res; @@ -2778,6 +3029,18 @@ static size_t my_casedn_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, } +static void +my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)), + char *s, size_t l, int fill) +{ + char *last= s + l - 2; + uint16 tmp= (fill >> 8) + ((fill & 0xFF) << 8); /* swap bytes */ + DBUG_ASSERT(fill <= 0xFFFF); + for ( ; s <= last; s+= 2) + int2store(s, tmp); /* store little-endian */ +} + + static int my_strnncoll_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen, const uchar *t, size_t tlen, @@ -2791,7 +3054,6 @@ static int my_strnncoll_ucs2(CHARSET_INFO *cs, while ( s < se && t < te ) { - int plane; s_res=my_ucs2_uni(cs,&s_wc, s, se); t_res=my_ucs2_uni(cs,&t_wc, t, te); @@ -2801,10 +3063,9 @@ static int my_strnncoll_ucs2(CHARSET_INFO *cs, return ((int)s[0]-(int)t[0]); } - plane=(s_wc>>8) & 0xFF; - s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc; - plane=(t_wc>>8) & 0xFF; - t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc; + my_tosort_ucs2(uni_plane, &s_wc); + my_tosort_ucs2(uni_plane, &t_wc); + if ( s_wc != t_wc ) { return s_wc > t_wc ? 1 : -1; @@ -3115,7 +3376,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_snprintf_mb2, my_l10tostr_mb2_or_mb4, my_ll10tostr_mb2_or_mb4, - my_fill_mb2, + my_fill_ucs2, my_strntol_mb2_or_mb4, my_strntoul_mb2_or_mb4, my_strntoll_mb2_or_mb4, |