summary refs log tree commit diff
path: root/strings/ctype-utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r-- strings/ctype-utf8.c 673
1 files changed, 404 insertions, 269 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index b324a18e358..a73de37bbd7 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -111,7 +111,9 @@
#define MY_UTF8MB3_BIN MY_UTF8MB3 "_bin"
#define MY_UTF8MB4_GENERAL_CI MY_UTF8MB4 "_general_ci"
#define MY_UTF8MB4_GENERAL_CS MY_UTF8MB4 "_general_cs"
+#define MY_UTF8MB4_GENERAL_NOPAD_CI MY_UTF8MB4 "_general_nopad_ci"
#define MY_UTF8MB4_BIN MY_UTF8MB4 "_bin"
+#define MY_UTF8MB4_NOPAD_BIN MY_UTF8MB4 "_nopad_bin"
@@ -4692,30 +4694,19 @@ my_strxfrm_pad_unicode(uchar *str, uchar *strend)
}
-/*
- Store sorting weights using 2 bytes per character.
-
- This function is shared between
- - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
- which support BMP only (U+0000..U+FFFF).
- - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
- which map all supplementary characters to weight 0xFFFD.
-*/
-size_t
-my_strnxfrm_unicode(CHARSET_INFO *cs,
- uchar *dst, size_t dstlen, uint nweights,
- const uchar *src, size_t srclen, uint flags)
+size_t my_strnxfrm_unicode_internal(CHARSET_INFO *cs,
+ uchar *dst, uchar *de, uint *nweights,
+ const uchar *src, const uchar *se)
{
my_wc_t UNINIT_VAR(wc);
int res;
uchar *dst0= dst;
- uchar *de= dst + dstlen;
- const uchar *se= src + srclen;
MY_UNICASE_INFO *uni_plane= (cs->state & MY_CS_BINSORT) ?
NULL : cs->caseinfo;
- DBUG_ASSERT(!srclen || src);
- for (; dst < de && nweights; nweights--)
+ DBUG_ASSERT(src || !se);
+
+ for (; dst < de && *nweights; (*nweights)--)
{
if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
break;
@@ -4728,6 +4719,29 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
if (dst < de)
*dst++= (uchar) (wc & 0xFF);
}
+ return dst - dst0;
+}
+
+
+/*
+ Store sorting weights using 2 bytes per character.
+
+ This function is shared between
+ - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
+ which support BMP only (U+0000..U+FFFF).
+ - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
+ which map all supplementary characters to weight 0xFFFD.
+*/
+size_t
+my_strnxfrm_unicode(CHARSET_INFO *cs,
+ uchar *dst, size_t dstlen, uint nweights,
+ const uchar *src, size_t srclen, uint flags)
+{
+ uchar *dst0= dst;
+ uchar *de= dst + dstlen;
+ dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights,
+ src, src + srclen);
+ DBUG_ASSERT(dst <= de); /* Safety */
if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights);
@@ -4740,6 +4754,35 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
}
+size_t
+my_strnxfrm_unicode_nopad(CHARSET_INFO *cs,
+ uchar *dst, size_t dstlen, uint nweights,
+ const uchar *src, size_t srclen, uint flags)
+{ /* NOPAD counterpart of my_strnxfrm_unicode(): same weight helper, but filler bytes are 0x00 written with memset(). Returns the number of bytes produced in dst. */
+ uchar *dst0= dst;
+ uchar *de= dst + dstlen;
+ dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights, /* the helper decrements nweights through this pointer */
+ src, src + srclen);
+ DBUG_ASSERT(dst <= de); /* Safety */
+ /* Weights still owed and room left: zero-fill; set_if_smaller() presumably clamps len to nweights * 2 (2 bytes per weight). */
+ if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
+ {
+ size_t len= de - dst;
+ set_if_smaller(len, nweights * 2); /* NOTE(review): nweights * 2 is evaluated in uint; confirm callers never pass a count large enough to wrap */
+ memset(dst, 0x00, len);
+ dst+= len;
+ }
+ /* NOTE(review): my_strxfrm_desc_and_reverse() is defined outside this view; presumably it applies the DESC/REVERSE bits of flags to [dst0, dst) - confirm. */
+ my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
+ /* PAD_TO_MAXLEN: zero the unused tail so that exactly dstlen bytes are reported. */
+ if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
+ {
+ memset(dst, 0x00, de - dst);
+ dst= de;
+ }
+ return dst - dst0;
+}
+
/*
For BMP-only collations that use 2 bytes per weight.
*/
@@ -4750,24 +4793,18 @@ my_strnxfrmlen_unicode(CHARSET_INFO *cs, size_t len)
return ((len + cs->mbmaxlen - 1) / cs->mbmaxlen) * 2;
}
-/*
- Store sorting weights using 3 bytes per character.
- This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.
-*/
size_t
-my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
- uchar *dst, size_t dstlen, uint nweights,
- const uchar *src, size_t srclen, uint flags)
+my_strnxfrm_unicode_full_bin_internal(CHARSET_INFO *cs,
+ uchar *dst, uchar *de, uint *nweights,
+ const uchar *src, const uchar *se)
{
my_wc_t UNINIT_VAR(wc);
uchar *dst0= dst;
- uchar *de= dst + dstlen;
- const uchar *se = src + srclen;
- DBUG_ASSERT(!srclen || src);
+ DBUG_ASSERT(src || !se);
DBUG_ASSERT(cs->state & MY_CS_BINSORT);
- for ( ; dst < de && nweights; nweights--)
+ for (; dst < de && *nweights; (*nweights)--)
{
int res;
if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
@@ -4781,6 +4818,25 @@ my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
*dst++= (uchar) (wc & 0xFF);
}
}
+ return dst - dst0;
+}
+
+
+/*
+ Store sorting weights using 3 bytes per character.
+ This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.
+*/
+size_t
+my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
+ uchar *dst, size_t dstlen, uint nweights,
+ const uchar *src, size_t srclen, uint flags)
+{
+ uchar *dst0= dst;
+ uchar *de= dst + dstlen;
+
+ dst+= my_strnxfrm_unicode_full_bin_internal(cs, dst, de, &nweights,
+ src, src + srclen);
+ DBUG_ASSERT(dst <= de); /* Safety */
if (flags & MY_STRXFRM_PAD_WITH_SPACE)
{
@@ -4795,7 +4851,7 @@ my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
}
}
}
-
+
my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
if (flags & MY_STRXFRM_PAD_TO_MAXLEN)
@@ -4816,6 +4872,37 @@ my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
size_t
+my_strnxfrm_unicode_full_nopad_bin(CHARSET_INFO *cs,
+ uchar *dst, size_t dstlen, uint nweights,
+ const uchar *src, size_t srclen, uint flags)
+{ /* NOPAD counterpart of my_strnxfrm_unicode_full_bin(): same weight helper, filler bytes are 0x00. Returns the number of bytes produced in dst. */
+ uchar *dst0= dst;
+ uchar *de= dst + dstlen;
+ /* Store the weights; the helper decrements nweights through the pointer it is given. */
+ dst+= my_strnxfrm_unicode_full_bin_internal(cs, dst, de, &nweights,
+ src, src + srclen);
+ DBUG_ASSERT(dst <= de); /* Safety */
+ /* Weights still owed and room left: zero-fill; set_if_smaller() presumably clamps len to nweights * 3 (3 bytes per weight). */
+ if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
+ {
+ size_t len= de - dst;
+ set_if_smaller(len, nweights * 3); /* NOTE(review): nweights * 3 is evaluated in uint; confirm callers never pass a count large enough to wrap */
+ memset(dst, 0x00, len);
+ dst+= len;
+ }
+ /* NOTE(review): my_strxfrm_desc_and_reverse() is defined outside this view; presumably it applies the DESC/REVERSE bits of flags to [dst0, dst) - confirm. */
+ my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
+ /* PAD_TO_MAXLEN: zero the unused tail so that exactly dstlen bytes are reported. */
+ if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
+ {
+ memset(dst, 0x00, de - dst);
+ dst= de;
+ }
+ return dst - dst0;
+}
+
+
+size_t
my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
{
return ((len + 3) / cs->mbmaxlen) * 3;
@@ -4891,16 +4978,6 @@ static const uchar to_upper_utf8[] = {
240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
-static inline int bincmp(const uchar *s, const uchar *se,
- const uchar *t, const uchar *te)
-{
- int slen= (int) (se-s), tlen= (int) (te-t);
- int len=MY_MIN(slen,tlen);
- int cmp= memcmp(s,t,len);
- return cmp ? cmp : slen-tlen;
-}
-
-
static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
@@ -5063,14 +5140,6 @@ my_toupper_utf8mb3(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
}
-static inline void
-my_tosort_utf8mb3(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
- MY_UNICASE_CHARACTER *page;
- if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
- *wc= page[*wc & 0xFF].sort;
-}
-
static size_t my_caseup_utf8(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst, size_t dstlen)
{
@@ -5093,33 +5162,40 @@ static size_t my_caseup_utf8(CHARSET_INFO *cs, char *src, size_t srclen,
}
-static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen,
- ulong *nr1, ulong *nr2)
+static void my_hash_sort_utf8_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
+ ulong *nr1, ulong *nr2) /* nr1/nr2: hash state, read on entry and written back on exit */
{
my_wc_t wc;
int res;
- const uchar *e=s+slen;
+ const uchar *e= s+slen; /* trailing spaces are not stripped here; my_hash_sort_utf8() trims them before delegating to this function */
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
register ulong m1= *nr1, m2= *nr2;
- /*
- Remove end space. We have to do this to be able to compare
- 'A ' and 'A' as identical
- */
- while (e > s && e[-1] == ' ')
- e--;
-
while ((s < e) && (res=my_utf8_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 )
{
my_tosort_unicode(uni_plane, &wc, cs->state);
MY_HASH_ADD_16(m1, m2, wc);
- s+=res;
+ s+= res; /* advance by the byte count my_utf8_uni() returned */
}
*nr1= m1;
*nr2= m2;
}
+static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen,
+ ulong *nr1, ulong *nr2) /* wrapper: trim trailing spaces, then hash via my_hash_sort_utf8_nopad() */
+{
+ const uchar *e= s+slen; /* one past the last input byte */
+ /*
+ Strip trailing 0x20 bytes so that strings differing only in
+ end spaces, such as 'A ' and 'A', produce the same hash
+ */
+ while (e > s && e[-1] == ' ')
+ e--;
+ my_hash_sort_utf8_nopad(cs, s, e - s, nr1, nr2); /* e - s is the length without the trailing spaces */
+}
+
+
static size_t my_caseup_str_utf8(CHARSET_INFO *cs, char *src)
{
my_wc_t wc;
@@ -5326,27 +5402,6 @@ int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
return MY_CS_ILSEQ;
}
-static size_t
-my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
- size_t pos, int *error)
-{
- const char *b_start= b;
- *error= 0;
- while (pos)
- {
- int mb_len;
-
- if ((mb_len= my_charlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
- {
- *error= b < e ? 1 : 0;
- break;
- }
- b+= mb_len;
- pos--;
- }
- return (size_t) (b - b_start);
-}
-
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8
#define CHARLEN(cs,str,end) my_charlen_utf8(cs,str,end)
@@ -5388,6 +5443,15 @@ static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2)
#include "strcoll.ic"
+#define DEFINE_STRNNCOLLSP_NOPAD /* NOTE(review): presumably selects the NOPAD form of strnncollsp inside strcoll.ic - confirm there */
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_nopad_ci /* expands to my_<x>_utf8_general_nopad_ci */
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) /* 0xFF0000 plus the byte value; presumably the weight of a byte in an ill-formed sequence */
+#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) /* 1-byte weights: same helper as the general_ci group */
+#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) /* 2-byte weights: same helper as the general_ci group */
+#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) /* 3-byte weights: same helper as the general_ci group */
+#include "strcoll.ic" /* template include that consumes the macros defined above */
+
+
static inline int my_weight_mb1_utf8_general_mysql500_ci(uchar b)
{
return (int) plane00_mysql500[b & 0xFF].sort;
@@ -5427,26 +5491,13 @@ my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
#include "strcoll.ic"
-static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
-{
- int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e);
- return (res>1) ? res : 0;
-}
-
-static uint my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
- uint c)
-{
- if (c < 0x80)
- return 1;
- else if (c < 0xc2)
- return 0; /* Illegal mb head */
- else if (c < 0xe0)
- return 2;
- else if (c < 0xf0)
- return 3;
- return 0; /* Illegal mb head */;
-}
-
+#define DEFINE_STRNNCOLLSP_NOPAD /* NOTE(review): presumably selects the NOPAD form of strnncollsp inside strcoll.ic - confirm there */
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_nopad_bin /* expands to my_<x>_utf8_nopad_bin */
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) /* 0xFF0000 plus the byte value; presumably the weight of a byte in an ill-formed sequence */
+#define WEIGHT_MB1(x) ((int) (uchar) (x)) /* binary collation: the byte value itself */
+#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) /* value produced by UTF8MB2_CODE for the 2-byte sequence */
+#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) /* value produced by UTF8MB3_CODE for the 3-byte sequence */
+#include "strcoll.ic" /* template include that consumes the macros defined above */
/*
TODO-10.2: join this with pad_max_char() in ctype-mb.c
@@ -5527,14 +5578,44 @@ static MY_COLLATION_HANDLER my_collation_utf8_bin_handler =
my_propagate_simple
};
+
+static MY_COLLATION_HANDLER my_collation_utf8_general_nopad_ci_handler =
+{ /* utf8 general_ci NOPAD handler: three entries below are *_nopad routines, the others carry non-NOPAD names */
+ NULL, /* init */
+ my_strnncoll_utf8_general_ci, /* name carries no nopad suffix */
+ my_strnncollsp_utf8_general_nopad_ci, /* NOPAD; presumably generated by the strcoll.ic include with the matching MY_FUNCTION_NAME */
+ my_strnxfrm_unicode_nopad, /* NOPAD sort-key routine (2 bytes per filler weight) */
+ my_strnxfrmlen_unicode,
+ my_like_range_mb,
+ my_wildcmp_utf8,
+ my_strcasecmp_utf8,
+ my_instr_mb,
+ my_hash_sort_utf8_nopad, /* NOPAD hash: trailing spaces are not stripped */
+ my_propagate_complex
+};
+
+
+static MY_COLLATION_HANDLER my_collation_utf8_nopad_bin_handler =
+{ /* utf8 binary NOPAD handler: three entries below are *_nopad routines, the others carry non-NOPAD names */
+ NULL, /* init */
+ my_strnncoll_utf8_bin, /* name carries no nopad suffix */
+ my_strnncollsp_utf8_nopad_bin, /* NOPAD; presumably generated by the strcoll.ic include with the matching MY_FUNCTION_NAME */
+ my_strnxfrm_unicode_nopad, /* NOPAD sort-key routine (2 bytes per filler weight) */
+ my_strnxfrmlen_unicode,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_nopad_bin, /* NOPAD hash routine; defined outside this view */
+ my_propagate_simple
+};
+
+
MY_CHARSET_HANDLER my_charset_utf8_handler=
{
NULL, /* init */
- my_ismbchar_utf8,
- my_mbcharlen_utf8,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_utf8,
my_lengthsp_8bit,
my_numcells_mb,
my_utf8_uni,
@@ -5662,6 +5743,73 @@ struct charset_info_st my_charset_utf8_bin=
&my_collation_utf8_bin_handler
};
+
+struct charset_info_st my_charset_utf8_general_nopad_ci=
+{ /* compiled-in utf8 collation flagged MY_CS_NOPAD and wired to the general_nopad_ci collation handler */
+ MY_NOPAD_ID(33),0,0, /* number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NOPAD, /* state */
+ "utf8", /* cs name */
+ "utf8_general_nopad_ci", /* name */
+ "", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8, /* ctype */
+ to_lower_utf8, /* to_lower */
+ to_upper_utf8, /* to_upper */
+ to_upper_utf8, /* sort_order */
+ NULL, /* uca */
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ &my_unicase_default, /* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 1, /* mbminlen */
+ 3, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ 1, /* levels_for_order */
+ &my_charset_utf8_handler, /* character set handler */
+ &my_collation_utf8_general_nopad_ci_handler /* collation handler */
+};
+
+
+struct charset_info_st my_charset_utf8_nopad_bin=
+{ /* compiled-in utf8 binary collation flagged MY_CS_NOPAD and wired to the nopad_bin collation handler */
+ MY_NOPAD_ID(83),0,0,/* number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NOPAD, /* state */
+ "utf8", /* cs name */
+ "utf8_nopad_bin", /* name */
+ "", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8, /* ctype */
+ to_lower_utf8, /* to_lower */
+ to_upper_utf8, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* uca */
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ &my_unicase_default,/* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 1, /* mbminlen */
+ 3, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ 1, /* levels_for_order */
+ &my_charset_utf8_handler, /* character set handler */
+ &my_collation_utf8_nopad_bin_handler /* collation handler */
+};
+
+
#ifdef HAVE_UTF8_GENERAL_CS
/*
@@ -5718,8 +5866,7 @@ static int my_strnncoll_utf8_cs(CHARSET_INFO *cs,
static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs,
const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool diff_if_only_endspace_difference)
+ const uchar *t, size_t tlen)
{
int s_res, t_res, res;
my_wc_t s_wc, t_wc;
@@ -5728,10 +5875,6 @@ static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs,
int save_diff= 0;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= 0;
-#endif
-
while ( s < se && t < te )
{
s_res=my_utf8_uni(cs,&s_wc, s, se);
@@ -5762,37 +5905,22 @@ static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs,
slen= se-s;
tlen= te-t;
- res= 0;
-
- if (slen != tlen)
- {
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- if (slen < tlen)
- {
- slen= tlen;
- s= t;
- se= te;
- swap= -1;
- res= -res;
- }
- /*
- This following loop uses the fact that in UTF-8
- all multibyte characters are greater than space,
- and all multibyte head characters are greater than
- space. It means if we meet a character greater
- than space, it always means that the longer string
- is greater. So we can reuse the same loop from the
- 8bit version, without having to process full multibute
- sequences.
- */
- for ( ; s < se; s++)
- {
- if (*s != (uchar) ' ')
- return (*s < (uchar) ' ') ? -swap : swap;
- }
- }
+
+ /*
+ The following code uses the fact that in UTF-8
+ all multibyte characters are greater than space,
+ and all multibyte head characters are greater than
+ space. It means if we meet a character greater
+ than space, it always means that the longer string
+ is greater. So we can reuse the same loop from the
+ 8bit version, without having to process full multibyte
+ sequences.
+ */
+ if ((res= slen == tlen ? 0 :
+ slen < tlen ?
+ -my_strnncollsp_padspace_bin(t, tlen) :
+ my_strnncollsp_padspace_bin(s, slen)))
+ return res;
return save_diff;
}
@@ -7077,15 +7205,6 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
}
-static uint
-my_ismbchar_filename(CHARSET_INFO *cs, const char *str, const char *end)
-{
- my_wc_t wc;
- int rc= my_mb_wc_filename(cs, &wc, (const uchar *) str, (const uchar *) end);
- return rc > 1 ? rc : 0;
-}
-
-
#define MY_FUNCTION_NAME(x) my_ ## x ## _filename
#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
@@ -7114,11 +7233,8 @@ static MY_COLLATION_HANDLER my_collation_filename_handler =
static MY_CHARSET_HANDLER my_charset_filename_handler=
{
NULL, /* init */
- my_ismbchar_filename,
- my_mbcharlen_utf8,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_mb,
my_lengthsp_8bit,
my_numcells_mb,
my_mb_wc_filename,
@@ -7181,57 +7297,6 @@ struct charset_info_st my_charset_filename=
};
-#ifdef MY_TEST_UTF8
-#include <stdio.h>
-
-static void test_mb(CHARSET_INFO *cs, uchar *s)
-{
- while(*s)
- {
- if (my_ismbhead_utf8(cs,*s))
- {
- uint len=my_mbcharlen_utf8(cs,*s);
- while(len--)
- {
- printf("%c",*s);
- s++;
- }
- printf("\n");
- }
- else
- {
- printf("%c\n",*s);
- s++;
- }
- }
-}
-
-int main()
-{
- char str[1024]=" utf8 test проба ПЕРА по-РУССКИ";
- CHARSET_INFO *cs;
-
- test_mb(cs,(uchar*)str);
-
- printf("orig :'%s'\n",str);
-
- my_caseup_utf8(cs,str,15);
- printf("caseup :'%s'\n",str);
-
- my_caseup_str_utf8(cs,str);
- printf("caseup_str:'%s'\n",str);
-
- my_casedn_utf8(cs,str,15);
- printf("casedn :'%s'\n",str);
-
- my_casedn_str_utf8(cs,str);
- printf("casedn_str:'%s'\n",str);
-
- return 0;
-}
-
-#endif
-
#endif /* HAVE_CHARSET_UTF8 */
@@ -7308,17 +7373,6 @@ static uchar to_upper_utf8mb4[]=
};
-static inline int
-bincmp_utf8mb4(const uchar *s, const uchar *se,
- const uchar *t, const uchar *te)
-{
- int slen= (int) (se - s), tlen= (int) (te - t);
- int len= MY_MIN(slen, tlen);
- int cmp= memcmp(s, t, len);
- return cmp ? cmp : slen - tlen;
-}
-
-
static int
my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
@@ -7537,7 +7591,7 @@ my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen,
static void
-my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
+my_hash_sort_utf8mb4_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
ulong *nr1, ulong *nr2)
{
my_wc_t wc;
@@ -7546,13 +7600,6 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
register ulong m1= *nr1, m2= *nr2;
- /*
- Remove end space. We do this to be able to compare
- 'A ' and 'A' as identical
- */
- while (e > s && e[-1] == ' ')
- e--;
-
while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_unicode(uni_plane, &wc, cs->state);
@@ -7575,6 +7622,21 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
}
+static void
+my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
+ ulong *nr1, ulong *nr2) /* wrapper: trim trailing spaces, then hash via my_hash_sort_utf8mb4_nopad() */
+{
+ const uchar *e= s + slen; /* one past the last input byte */
+ /*
+ Strip trailing 0x20 bytes so that strings differing only in
+ end spaces, such as 'A ' and 'A', produce the same hash
+ */
+ while (e > s && e[-1] == ' ')
+ e--;
+ my_hash_sort_utf8mb4_nopad(cs, s, e - s, nr1, nr2); /* e - s is the length without the trailing spaces */
+}
+
+
static size_t
my_caseup_str_utf8mb4(CHARSET_INFO *cs, char *src)
{
@@ -7774,29 +7836,6 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
}
-static
-size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
- const char *b, const char *e,
- size_t pos, int *error)
-{
- const char *b_start= b;
- *error= 0;
- while (pos)
- {
- int mb_len;
-
- if ((mb_len= my_charlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
- {
- *error= b < e ? 1 : 0;
- break;
- }
- b+= mb_len;
- pos--;
- }
- return (size_t) (b - b_start);
-}
-
-
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4
#define CHARLEN(cs,str,end) my_charlen_utf8mb4(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
@@ -7829,29 +7868,28 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
#include "strcoll.ic"
-static uint
-my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
-{
- int res= my_charlen_utf8mb4(cs, (const uchar*) b, (const uchar*) e);
- return (res > 1) ? res : 0;
-}
+#define DEFINE_STRNNCOLLSP_NOPAD /* NOTE(review): presumably selects the NOPAD form of strnncollsp inside strcoll.ic - confirm there */
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_nopad_ci /* expands to my_<x>_utf8mb4_general_nopad_ci */
+#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3) /* 4-byte sequence test delegated to IS_UTF8MB4_STEP3 */
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) /* 0xFF0000 plus the byte value; presumably the weight of a byte in an ill-formed sequence */
+#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0) /* 1-byte weights: utf8 general_ci helper */
+#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1) /* 2-byte weights: utf8 general_ci helper */
+#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2) /* 3-byte weights: utf8 general_ci helper */
+/*
+ Every 4-byte (non-BMP) character gets the single weight defined below.
+*/
+#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
+#include "strcoll.ic" /* template include that consumes the macros defined above */
-static uint
-my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c)
-{
- if (c < 0x80)
- return 1;
- if (c < 0xc2)
- return 0; /* Illegal mb head */
- if (c < 0xe0)
- return 2;
- if (c < 0xf0)
- return 3;
- if (c < 0xf8)
- return 4;
- return 0; /* Illegal mb head */;
-}
+#define DEFINE_STRNNCOLLSP_NOPAD /* NOTE(review): presumably selects the NOPAD form of strnncollsp inside strcoll.ic - confirm there */
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_nopad_bin /* expands to my_<x>_utf8mb4_nopad_bin */
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) /* 0xFF0000 plus the byte value; presumably the weight of a byte in an ill-formed sequence */
+#define WEIGHT_MB1(b0) ((int) (uchar) (b0)) /* binary collation: the byte value itself */
+#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1)) /* value produced by UTF8MB2_CODE for the 2-byte sequence */
+#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2)) /* value produced by UTF8MB3_CODE for the 3-byte sequence */
+#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3)) /* NOTE(review): IS_MB4_CHAR is not defined in this group; presumably an earlier definition is still in effect - confirm */
+#include "strcoll.ic" /* template include that consumes the macros defined above */
static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
@@ -7872,7 +7910,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
{
- NULL, /* init */
+ NULL, /* init */
my_strnncoll_utf8mb4_bin,
my_strnncollsp_utf8mb4_bin,
my_strnxfrm_unicode_full_bin,
@@ -7886,14 +7924,43 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
};
+static MY_COLLATION_HANDLER my_collation_utf8mb4_general_nopad_ci_handler=
+{ /* utf8mb4 general_ci NOPAD handler: three entries below are *_nopad routines, the others carry non-NOPAD names */
+ NULL, /* init */
+ my_strnncoll_utf8mb4_general_ci, /* name carries no nopad suffix */
+ my_strnncollsp_utf8mb4_general_nopad_ci, /* NOPAD; presumably generated by the strcoll.ic include with the matching MY_FUNCTION_NAME */
+ my_strnxfrm_unicode_nopad, /* NOPAD sort-key routine (2 bytes per filler weight) */
+ my_strnxfrmlen_unicode,
+ my_like_range_mb,
+ my_wildcmp_utf8mb4,
+ my_strcasecmp_utf8mb4,
+ my_instr_mb,
+ my_hash_sort_utf8mb4_nopad, /* NOPAD hash routine */
+ my_propagate_complex
+};
+
+
+static MY_COLLATION_HANDLER my_collation_utf8mb4_nopad_bin_handler =
+{ /* utf8mb4 binary NOPAD handler: three entries below are *_nopad routines, the others carry non-NOPAD names */
+ NULL, /* init */
+ my_strnncoll_utf8mb4_bin, /* name carries no nopad suffix */
+ my_strnncollsp_utf8mb4_nopad_bin, /* NOPAD; presumably generated by the strcoll.ic include with the matching MY_FUNCTION_NAME */
+ my_strnxfrm_unicode_full_nopad_bin, /* NOPAD sort-key routine (3 bytes per filler weight) */
+ my_strnxfrmlen_unicode_full_bin,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_nopad_bin, /* NOPAD hash routine; defined outside this view */
+ my_propagate_simple
+};
+
+
MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
{
NULL, /* init */
- my_ismbchar_utf8mb4,
- my_mbcharlen_utf8mb4,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_utf8mb4,
my_lengthsp_8bit,
my_numcells_mb,
my_mb_wc_utf8mb4,
@@ -7989,4 +8056,72 @@ struct charset_info_st my_charset_utf8mb4_bin=
&my_collation_utf8mb4_bin_handler
};
+
+struct charset_info_st my_charset_utf8mb4_general_nopad_ci=
+{ /* compiled-in utf8mb4 collation flagged MY_CS_NOPAD and wired to the general_nopad_ci collation handler */
+ MY_NOPAD_ID(45),0,0,/* number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT|
+ MY_CS_NOPAD, /* state */
+ MY_UTF8MB4, /* cs name */
+ MY_UTF8MB4_GENERAL_NOPAD_CI,/* name */
+ "UTF-8 Unicode", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8mb4, /* ctype */
+ to_lower_utf8mb4, /* to_lower */
+ to_upper_utf8mb4, /* to_upper */
+ to_upper_utf8mb4, /* sort_order */
+ NULL, /* uca */
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ &my_unicase_default,/* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 1, /* mbminlen */
+ 4, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ 1, /* levels_for_order */
+ &my_charset_utf8mb4_handler, /* character set handler */
+ &my_collation_utf8mb4_general_nopad_ci_handler /* collation handler */
+};
+
+
+struct charset_info_st my_charset_utf8mb4_nopad_bin=
+{ /* compiled-in utf8mb4 binary collation flagged MY_CS_NOPAD and wired to the nopad_bin collation handler */
+ MY_NOPAD_ID(46),0,0, /* number */
+ MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|
+ MY_CS_UNICODE_SUPPLEMENT|MY_CS_NOPAD, /* state */
+ MY_UTF8MB4, /* cs name */
+ MY_UTF8MB4_NOPAD_BIN, /* name */
+ "UTF-8 Unicode", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8mb4, /* ctype */
+ to_lower_utf8mb4, /* to_lower */
+ to_upper_utf8mb4, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* uca */
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ &my_unicase_default,/* caseinfo */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* caseup_multiply */
+ 1, /* casedn_multiply */
+ 1, /* mbminlen */
+ 4, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ ' ', /* pad char */
+ 0, /* escape_with_backslash_is_dangerous */
+ 1, /* levels_for_order */
+ &my_charset_utf8mb4_handler, /* character set handler */
+ &my_collation_utf8mb4_nopad_bin_handler /* collation handler */
+};
+
#endif /* HAVE_CHARSET_utf8mb4 */