summaryrefslogtreecommitdiff
path: root/strings/ctype-utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r--strings/ctype-utf8.c717
1 files changed, 219 insertions, 498 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index b7fb7e49cab..3c2c812a004 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -26,8 +26,86 @@
#define EILSEQ ENOENT
#endif
-#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
+/* Detect special bytes and sequences */
+#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40)
+/*
+  Check MB2 character assuming that b0 is already known to be >= 0xC2.
+  Use this macro if the caller already checked b0 for:
+  - an MB1 character
+  - an unused gap between MB1 and MB2HEAD
+  Each argument is expanded exactly once and is fully parenthesized
+  before being cast to uchar.
+*/
+#define IS_UTF8MB2_STEP2(b0,b1)     (((uchar) (b0) < 0xE0) && \
+                                     IS_CONTINUATION_BYTE(b1))
+
+/*
+  Check MB3 character assuming that b0 is already known to be
+  in the valid MB3HEAD range [0xE0..0xEF].
+  The last condition rejects [E0][80..9F]: with the head byte 0xE0 the
+  first continuation byte must be at least 0xA0.
+  Arguments are parenthesized before the uchar cast, so the cast applies
+  to the whole argument expression.
+*/
+#define IS_UTF8MB3_STEP2(b0,b1,b2)  (IS_CONTINUATION_BYTE(b1) && \
+                                     IS_CONTINUATION_BYTE(b2) && \
+                                     ((uchar) (b0) >= 0xe1 ||     \
+                                      (uchar) (b1) >= 0xa0))
+
+/*
+ Check MB3 character assuming that b0 is already known to be >= 0xE0,
+ but is not checked for the high end 0xF0 yet.
+ Use this macro if the caller already checked b0 for:
+ - an MB1 character
+ - an unused gap between MB1 and MB2HEAD
+ - an MB2HEAD
+*/
+#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
+ IS_UTF8MB3_STEP2(b0,b1,b2))
+
+/*
+ UTF-8 quick four-byte mask:
+ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ Encoding allows to encode U+00010000..U+001FFFFF
+
+ The maximum character defined in the Unicode standard is U+0010FFFF.
+ Higher characters U+00110000..U+001FFFFF are not used.
+
+ 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+ 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+
+ Valid codes:
+ [F0][90..BF][80..BF][80..BF]
+ [F1][80..BF][80..BF][80..BF]
+ [F2][80..BF][80..BF][80..BF]
+ [F3][80..BF][80..BF][80..BF]
+ [F4][80..8F][80..BF][80..BF]
+*/
+
+/*
+  Check MB4 character assuming that b0 is already
+  known to be in the range [0xF0..0xF4]
+
+  All three trailing bytes must be continuation bytes. In addition:
+  - [F0] requires b1 >= 0x90
+  - [F4] requires b1 <= 0x8F
+  b0 and b1 are cast to uchar before the range comparisons, like in the
+  MB2/MB3 macros, so that the result does not depend on the signedness
+  of the argument type.
+*/
+#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
+                                       IS_CONTINUATION_BYTE(b2) && \
+                                       IS_CONTINUATION_BYTE(b3) && \
+                                       ((uchar) (b0) >= 0xf1 ||     \
+                                        (uchar) (b1) >= 0x90) &&    \
+                                       ((uchar) (b0) <= 0xf3 ||     \
+                                        (uchar) (b1) <= 0x8F))
+/*
+  Check MB4 character when b0 is not yet checked for the high end:
+  additionally requires b0 < 0xF5.
+*/
+#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
+                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
+
+/*
+  Convert individual bytes to Unicode code points.
+  The macros do not validate their input: the head byte is masked to its
+  payload bits and every trailing byte is XOR-ed with 0x80.
+  Arguments are parenthesized before the uchar cast, so the cast applies
+  to the whole argument expression.
+*/
+#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) (b0) & 0x1f) << 6)  |\
+                                   ((my_wc_t) ((uchar) (b1) ^ 0x80)))
+#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) (b0) & 0x0f) << 12) |\
+                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 6)  |\
+                                   ((my_wc_t) ((uchar) (b2) ^ 0x80)))
+#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) (b0) & 0x07) << 18) |\
+                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 12) |\
+                                   ((my_wc_t) ((uchar) (b2) ^ 0x80) << 6)  |\
+                                   ((my_wc_t) ((uchar) (b3) ^ 0x80)))
+
+/* Definitions for strcoll.ic */
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
+#define IS_MB1_MBHEAD_UNUSED_GAP(x) ((uchar) (x) < 0xC2)
+#define IS_MB2_CHAR(x,y) IS_UTF8MB2_STEP2(x,y)
+#define IS_MB3_CHAR(x,y,z) IS_UTF8MB3_STEP3(x,y,z)
+
+/* Collation names */
#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
#define MY_UTF8MB3_BIN MY_UTF8MB3 "_bin"
@@ -88,8 +166,7 @@ int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
- (c >= 0xe1 || s[1] >= 0xa0)))
+ if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
return 3;
@@ -4848,7 +4925,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
- *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+ *pwc= UTF8MB2_CODE(c, s[1]);
return 2;
}
else if (c < 0xf0)
@@ -4856,76 +4933,12 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
- (c >= 0xe1 || s[1] >= 0xa0)))
+ if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
- *pwc = ((my_wc_t) (c & 0x0f) << 12) |
- ((my_wc_t) (s[1] ^ 0x80) << 6) |
- (my_wc_t) (s[2] ^ 0x80);
-
+ *pwc= UTF8MB3_CODE(c, s[1], s[2]);
return 3;
}
-#ifdef UNICODE_32BIT
- else if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
- {
- if (s+4 > e) /* We need 4 characters */
- return MY_CS_TOOSMALL4;
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- (c >= 0xf1 || s[1] >= 0x90)))
- return MY_CS_ILSEQ;
-
- *pwc = ((my_wc_t) (c & 0x07) << 18) |
- ((my_wc_t) (s[1] ^ 0x80) << 12) |
- ((my_wc_t) (s[2] ^ 0x80) << 6) |
- (my_wc_t) (s[3] ^ 0x80);
-
- return 4;
- }
- else if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
- {
- if (s+5 >e) /* We need 5 characters */
- return MY_CS_TOOSMALL5;
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- IS_CONTINUATION_BYTE(s[4]) &&
- (c >= 0xf9 || s[1] >= 0x88)))
- return MY_CS_ILSEQ;
-
- *pwc = ((my_wc_t) (c & 0x03) << 24) |
- ((my_wc_t) (s[1] ^ 0x80) << 18) |
- ((my_wc_t) (s[2] ^ 0x80) << 12) |
- ((my_wc_t) (s[3] ^ 0x80) << 6) |
- (my_wc_t) (s[4] ^ 0x80);
- return 5;
- }
- else if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
- {
- if ( s+6 >e ) /* We need 6 characters */
- return MY_CS_TOOSMALL6;
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- IS_CONTINUATION_BYTE(s[4]) &&
- IS_CONTINUATION_BYTE(s[5]) &&
- (c >= 0xfd || s[1] >= 0x84)))
- return MY_CS_ILSEQ;
-
- *pwc = ((my_wc_t) (c & 0x01) << 30)
- | ((my_wc_t) (s[1] ^ 0x80) << 24)
- | ((my_wc_t) (s[2] ^ 0x80) << 18)
- | ((my_wc_t) (s[3] ^ 0x80) << 12)
- | ((my_wc_t) (s[4] ^ 0x80) << 6)
- | (my_wc_t) (s[5] ^ 0x80);
- return 6;
- }
-#endif
return MY_CS_ILSEQ;
}
@@ -4954,21 +4967,16 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
if (!((s[1] ^ 0x80) < 0x40))
return MY_CS_ILSEQ;
- *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+ *pwc= UTF8MB2_CODE(c, s[1]);
return 2;
}
if (c < 0xf0)
{
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- (c >= 0xe1 || s[1] >= 0xa0)))
+ if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
- *pwc= ((my_wc_t) (c & 0x0f) << 12) |
- ((my_wc_t) (s[1] ^ 0x80) << 6) |
- (my_wc_t) (s[2] ^ 0x80);
-
+ *pwc= UTF8MB3_CODE(c, s[1], s[2]);
return 3;
}
return MY_CS_ILSEQ;
@@ -5193,148 +5201,6 @@ static size_t my_casedn_str_utf8(CHARSET_INFO *cs, char *src)
}
-static int my_strnncoll_utf8(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool t_is_prefix)
-{
- int s_res,t_res;
- my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
- const uchar *se=s+slen;
- const uchar *te=t+tlen;
- MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
- while ( s < se && t < te )
- {
- s_res=my_utf8_uni(cs,&s_wc, s, se);
- t_res=my_utf8_uni(cs,&t_wc, t, te);
-
- if ( s_res <= 0 || t_res <= 0 )
- {
- /* Incorrect string, compare byte by byte value */
- return bincmp(s, se, t, te);
- }
-
- my_tosort_unicode(uni_plane, &s_wc, cs->state);
- my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
- if ( s_wc != t_wc )
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+=s_res;
- t+=t_res;
- }
- return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
-}
-
-
-
-/*
- Compare strings, discarding end space
-
- SYNOPSIS
- my_strnncollsp_utf8()
- cs character set handler
- a First string to compare
- a_length Length of 'a'
- b Second string to compare
- b_length Length of 'b'
- diff_if_only_endspace_difference
- Set to 1 if the strings should be regarded as different
- if they only difference in end space
-
- IMPLEMENTATION
- If one string is shorter as the other, then we space extend the other
- so that the strings have equal length.
-
- This will ensure that the following things hold:
-
- "a" == "a "
- "a\0" < "a"
- "a\0" < "a "
-
- RETURN
- < 0 a < b
- = 0 a == b
- > 0 a > b
-*/
-
-static int my_strnncollsp_utf8(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool diff_if_only_endspace_difference)
-{
- int s_res, t_res, res;
- my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
- const uchar *se= s+slen, *te= t+tlen;
- MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= 0;
-#endif
-
- while ( s < se && t < te )
- {
- s_res=my_utf8_uni(cs,&s_wc, s, se);
- t_res=my_utf8_uni(cs,&t_wc, t, te);
-
- if ( s_res <= 0 || t_res <= 0 )
- {
- /* Incorrect string, compare byte by byte value */
- return bincmp(s, se, t, te);
- }
-
- my_tosort_unicode(uni_plane, &s_wc, cs->state);
- my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
- if ( s_wc != t_wc )
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+=s_res;
- t+=t_res;
- }
-
- slen= (size_t) (se-s);
- tlen= (size_t) (te-t);
- res= 0;
-
- if (slen != tlen)
- {
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- if (slen < tlen)
- {
- slen= tlen;
- s= t;
- se= te;
- swap= -1;
- res= -res;
- }
- /*
- This following loop uses the fact that in UTF-8
- all multibyte characters are greater than space,
- and all multibyte head characters are greater than
- space. It means if we meet a character greater
- than space, it always means that the longer string
- is greater. So we can reuse the same loop from the
- 8bit version, without having to process full multibute
- sequences.
- */
- for ( ; s < se; s++)
- {
- if (*s != ' ')
- return (*s < ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
-
/*
Compare 0-terminated UTF8 strings.
@@ -5456,50 +5322,6 @@ int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
if (c < 0xf0)
return my_valid_mbcharlen_utf8mb3(s, e);
-#ifdef UNICODE_32BIT
- if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
- {
- if (s+4 > e) /* We need 4 characters */
- return MY_CS_TOOSMALL4;
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- (c >= 0xf1 || s[1] >= 0x90)))
- return MY_CS_ILSEQ;
-
- return 4;
- }
- if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
- {
- if (s+5 >e) /* We need 5 characters */
- return MY_CS_TOOSMALL5;
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- IS_CONTINUATION_BYTE(s[4]) &&
- (c >= 0xf9 || s[1] >= 0x88)))
- return MY_CS_ILSEQ;
-
- return 5;
- }
- if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
- {
- if ( s+6 >e ) /* We need 6 characters */
- return MY_CS_TOOSMALL6;
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- IS_CONTINUATION_BYTE(s[4]) &&
- IS_CONTINUATION_BYTE(s[5]) &&
- (c >= 0xfd || s[1] >= 0x84)))
- return MY_CS_ILSEQ;
-
- return 6;
- }
-#endif
return MY_CS_ILSEQ;
}
@@ -5535,6 +5357,75 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
/* my_well_formed_char_length_utf8 */
+/* Weight of a single-byte character: the "sort" member of its plane00 entry */
+static inline int my_weight_mb1_utf8_general_ci(uchar b)
+{
+  const unsigned int idx= b & 0xFF;
+  return (int) plane00[idx].sort;
+}
+
+
+/*
+  Weight of a two-byte character: decode the code point and look it up
+  in my_unicase_pages_default. A code point whose page is NULL is its
+  own weight.
+*/
+static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1)
+{
+  const my_wc_t code= UTF8MB2_CODE(b0, b1);
+  MY_UNICASE_CHARACTER *entries= my_unicase_pages_default[code >> 8];
+  if (!entries)
+    return (int) code;
+  return (int) entries[code & 0xFF].sort;
+}
+
+
+/*
+  Weight of a three-byte character: decode the code point and look it up
+  in my_unicase_pages_default. A code point whose page is NULL is its
+  own weight.
+*/
+static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2)
+{
+  const my_wc_t code= UTF8MB3_CODE(b0, b1, b2);
+  MY_UNICASE_CHARACTER *entries= my_unicase_pages_default[code >> 8];
+  if (!entries)
+    return (int) code;
+  return (int) entries[code & 0xFF].sort;
+}
+
+
+/*
+  Parameters for the "strcoll.ic" template, utf8_general_ci flavour:
+  MY_FUNCTION_NAME(x) pastes the name my_<x>_utf8_general_ci, and the
+  WEIGHT_* macros supply the weight of an ill-formed byte and of
+  1-, 2- and 3-byte characters.
+  NOTE(review): presumably strcoll.ic defines the strnncoll/strnncollsp
+  functions used by the collation handler and #undef's these macros
+  afterwards - confirm in strcoll.ic.
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x)
+#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y)
+#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z)
+#include "strcoll.ic"
+
+
+/*
+  Weight of a single-byte character: the "sort" member of its
+  plane00_mysql500 entry.
+*/
+static inline int my_weight_mb1_utf8_general_mysql500_ci(uchar b)
+{
+  const unsigned int idx= b & 0xFF;
+  return (int) plane00_mysql500[idx].sort;
+}
+
+
+/*
+  Weight of a two-byte character: decode the code point and look it up
+  in my_unicase_pages_mysql500. A code point whose page is NULL is its
+  own weight.
+*/
+static inline int my_weight_mb2_utf8_general_mysql500_ci(uchar b0, uchar b1)
+{
+  const my_wc_t code= UTF8MB2_CODE(b0, b1);
+  MY_UNICASE_CHARACTER *entries= my_unicase_pages_mysql500[code >> 8];
+  if (!entries)
+    return (int) code;
+  return (int) entries[code & 0xFF].sort;
+}
+
+
+/*
+  Weight of a three-byte character: decode the code point and look it up
+  in my_unicase_pages_mysql500. A code point whose page is NULL is its
+  own weight.
+*/
+static inline int
+my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
+{
+  const my_wc_t code= UTF8MB3_CODE(b0, b1, b2);
+  MY_UNICASE_CHARACTER *entries= my_unicase_pages_mysql500[code >> 8];
+  if (!entries)
+    return (int) code;
+  return (int) entries[code & 0xFF].sort;
+}
+
+
+/*
+  Parameters for the "strcoll.ic" template, utf8_general_mysql500_ci
+  flavour: same as the utf8_general_ci set, but the weights come from the
+  *_mysql500_ci weight functions.
+  NOTE(review): these macros are defined again without a visible #undef;
+  presumably strcoll.ic #undef's them at its end - confirm in strcoll.ic.
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x)
+#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y)
+#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z)
+#include "strcoll.ic"
+
+
+/*
+  Parameters for the "strcoll.ic" template, utf8_bin flavour:
+  the weight of a 1-byte character is the byte itself, the weight of a
+  2- or 3-byte character is its decoded code point, and an ill-formed
+  byte gets 0xFF0000 plus the byte value.
+  NOTE(review): these macros are defined again without a visible #undef;
+  presumably strcoll.ic #undef's them at its end - confirm in strcoll.ic.
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) ((int) (uchar) (x))
+#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y))
+#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z))
+#include "strcoll.ic"
+
+
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e);
@@ -5552,14 +5443,6 @@ static uint my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
return 2;
else if (c < 0xf0)
return 3;
-#ifdef UNICODE_32BIT
- else if (c < 0xf8)
- return 4;
- else if (c < 0xfc)
- return 5;
- else if (c < 0xfe)
- return 6;
-#endif
return 0; /* Illegal mb head */;
}
@@ -5567,8 +5450,24 @@ static uint my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler =
{
NULL, /* init */
- my_strnncoll_utf8,
- my_strnncollsp_utf8,
+ my_strnncoll_utf8_general_ci,
+ my_strnncollsp_utf8_general_ci,
+ my_strnxfrm_unicode,
+ my_strnxfrmlen_unicode,
+ my_like_range_mb,
+ my_wildcmp_utf8,
+ my_strcasecmp_utf8,
+ my_instr_mb,
+ my_hash_sort_utf8,
+ my_propagate_complex
+};
+
+
+static MY_COLLATION_HANDLER my_collation_utf8_general_mysql500_ci_handler =
+{
+ NULL, /* init */
+ my_strnncoll_utf8_general_mysql500_ci,
+ my_strnncollsp_utf8_general_mysql500_ci,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_mb,
@@ -5583,8 +5482,8 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler =
static MY_COLLATION_HANDLER my_collation_utf8_bin_handler =
{
NULL, /* init */
- my_strnncoll_mb_bin,
- my_strnncollsp_mb_bin,
+ my_strnncoll_utf8_bin,
+ my_strnncollsp_utf8_bin,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_mb,
@@ -5627,6 +5526,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_charlen_utf8,
my_well_formed_char_length_utf8,
my_copy_fix_mb,
+ my_uni_utf8,
};
@@ -5693,7 +5593,7 @@ struct charset_info_st my_charset_utf8_general_mysql500_ci=
0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_utf8_handler,
- &my_collation_utf8_general_ci_handler
+ &my_collation_utf8_general_mysql500_ci_handler
};
@@ -7166,8 +7066,8 @@ my_ismbchar_filename(CHARSET_INFO *cs, const char *str, const char *end)
static MY_COLLATION_HANDLER my_collation_filename_handler =
{
NULL, /* init */
- my_strnncoll_utf8,
- my_strnncollsp_utf8,
+ my_strnncoll_simple,
+ my_strnncollsp_simple,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_mb,
@@ -7210,6 +7110,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
my_charlen_filename,
my_well_formed_char_length_filename,
my_copy_fix_mb,
+ my_wc_mb_filename,
};
@@ -7410,7 +7311,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
- *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+ *pwc= UTF8MB2_CODE(c, s[1]);
return 2;
}
else if (c < 0xf0)
@@ -7418,13 +7319,10 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
- (c >= 0xe1 || s[1] >= 0xa0)))
+ if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
- *pwc= ((my_wc_t) (c & 0x0f) << 12) |
- ((my_wc_t) (s[1] ^ 0x80) << 6) |
- (my_wc_t) (s[2] ^ 0x80);
+ *pwc= UTF8MB3_CODE(c, s[1], s[2]);
return 3;
}
else if (c < 0xf5)
@@ -7432,35 +7330,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
- /*
- UTF-8 quick four-byte mask:
- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- Encoding allows to encode U+00010000..U+001FFFFF
-
- The maximum character defined in the Unicode standard is U+0010FFFF.
- Higher characters U+00110000..U+001FFFFF are not used.
-
- 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
- 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
-
- Valid codes:
- [F0][90..BF][80..BF][80..BF]
- [F1][80..BF][80..BF][80..BF]
- [F2][80..BF][80..BF][80..BF]
- [F3][80..BF][80..BF][80..BF]
- [F4][80..8F][80..BF][80..BF]
- */
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- (c >= 0xf1 || s[1] >= 0x90) &&
- (c <= 0xf3 || s[1] <= 0x8F)))
+ if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
return MY_CS_ILSEQ;
- *pwc = ((my_wc_t) (c & 0x07) << 18) |
- ((my_wc_t) (s[1] ^ 0x80) << 12) |
- ((my_wc_t) (s[2] ^ 0x80) << 6) |
- (my_wc_t) (s[3] ^ 0x80);
+ *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
return 4;
}
return MY_CS_ILSEQ;
@@ -7492,34 +7364,22 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
if (!IS_CONTINUATION_BYTE(s[1]))
return MY_CS_ILSEQ;
- *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+ *pwc= UTF8MB2_CODE(c, s[1]);
return 2;
}
if (c < 0xf0)
{
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- (c >= 0xe1 || s[1] >= 0xa0)))
+ if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
- *pwc= ((my_wc_t) (c & 0x0f) << 12) |
- ((my_wc_t) (s[1] ^ 0x80) << 6) |
- (my_wc_t) (s[2] ^ 0x80);
-
+ *pwc= UTF8MB3_CODE(c, s[1], s[2]);
return 3;
}
else if (c < 0xf5)
{
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- (c >= 0xf1 || s[1] >= 0x90) &&
- (c <= 0xf3 || s[1] <= 0x8F)))
+ if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
return MY_CS_ILSEQ;
- *pwc = ((my_wc_t) (c & 0x07) << 18) |
- ((my_wc_t) (s[1] ^ 0x80) << 12) |
- ((my_wc_t) (s[2] ^ 0x80) << 6) |
- (my_wc_t) (s[3] ^ 0x80);
+ *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
return 4;
}
return MY_CS_ILSEQ;
@@ -7765,146 +7625,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
}
-static int
-my_strnncoll_utf8mb4(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool t_is_prefix)
-{
- my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
- const uchar *se= s + slen;
- const uchar *te= t + tlen;
- MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
- while ( s < se && t < te )
- {
- int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
- int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
-
- if ( s_res <= 0 || t_res <= 0 )
- {
- /* Incorrect string, compare bytewise */
- return bincmp_utf8mb4(s, se, t, te);
- }
-
- my_tosort_unicode(uni_plane, &s_wc, cs->state);
- my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
- if ( s_wc != t_wc )
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+= s_res;
- t+= t_res;
- }
- return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
-}
-
-
-/**
-
- Compare strings, discarding end space
-
- If one string is shorter as the other, then we space extend the other
- so that the strings have equal length.
-
- This will ensure that the following things hold:
-
- "a" == "a "
- "a\0" < "a"
- "a\0" < "a "
-
- @param cs Character set pinter.
- @param a First string to compare.
- @param a_length Length of 'a'.
- @param b Second string to compare.
- @param b_length Length of 'b'.
- @param diff_if_only_endspace_difference
- Set to 1 if the strings should be regarded as different
- if they only difference in end space
-
- @return Comparison result.
- @retval Negative number, if a less than b.
- @retval 0, if a is equal to b
- @retval Positive number, if a > b
-*/
-
-static int
-my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool diff_if_only_endspace_difference)
-{
- int res;
- my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
- const uchar *se= s + slen, *te= t + tlen;
- MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= FALSE;
-#endif
-
- while ( s < se && t < te )
- {
- int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
- int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
-
- if ( s_res <= 0 || t_res <= 0 )
- {
- /* Incorrect string, compare bytewise */
- return bincmp_utf8mb4(s, se, t, te);
- }
-
- my_tosort_unicode(uni_plane, &s_wc, cs->state);
- my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
- if ( s_wc != t_wc )
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+=s_res;
- t+=t_res;
- }
-
- slen= (size_t) (se-s);
- tlen= (size_t) (te-t);
- res= 0;
-
- if (slen != tlen)
- {
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- if (slen < tlen)
- {
- slen= tlen;
- s= t;
- se= te;
- swap= -1;
- res= -res;
- }
- /*
- This following loop uses the fact that in UTF-8
- all multibyte characters are greater than space,
- and all multibyte head characters are greater than
- space. It means if we meet a character greater
- than space, it always means that the longer string
- is greater. So we can reuse the same loop from the
- 8bit version, without having to process full multibute
- sequences.
- */
- for ( ; s < se; s++)
- {
- if (*s != ' ')
- return (*s < ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
-
/**
Compare 0-terminated UTF8 strings.
@@ -8007,30 +7727,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
- /*
- UTF-8 quick four-byte mask:
- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- Encoding allows to encode U+00010000..U+001FFFFF
-
- The maximum character defined in the Unicode standard is U+0010FFFF.
- Higher characters U+00110000..U+001FFFFF are not used.
-
- 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
- 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
-
- Valid codes:
- [F0][90..BF][80..BF][80..BF]
- [F1][80..BF][80..BF][80..BF]
- [F2][80..BF][80..BF][80..BF]
- [F3][80..BF][80..BF][80..BF]
- [F4][80..8F][80..BF][80..BF]
- */
-
- if (!(IS_CONTINUATION_BYTE(s[1]) &&
- IS_CONTINUATION_BYTE(s[2]) &&
- IS_CONTINUATION_BYTE(s[3]) &&
- (c >= 0xf1 || s[1] >= 0x90) &&
- (c <= 0xf3 || s[1] <= 0x8F)))
+ if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
return MY_CS_ILSEQ;
return 4;
@@ -8072,6 +7769,29 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* my_well_formed_char_length_utf8mb4 */
+
+/*
+  Parameters for the "strcoll.ic" template, utf8mb4_general_ci flavour.
+  1-, 2- and 3-byte characters reuse the utf8_general_ci weight functions.
+  IS_MB4_CHAR is defined here in addition, together with WEIGHT_MB4.
+  NOTE(review): presumably defining IS_MB4_CHAR is what makes strcoll.ic
+  handle 4-byte sequences - confirm in strcoll.ic.
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci
+#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0)
+#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1)
+#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2)
+/*
+ All non-BMP characters have the same weight.
+*/
+#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
+#include "strcoll.ic"
+
+
+/*
+  Parameters for the "strcoll.ic" template, utf8mb4_bin flavour:
+  the weight of a 1-byte character is the byte itself and the weight of a
+  2-, 3- or 4-byte character is its decoded code point.
+  NOTE(review): IS_MB4_CHAR is not defined in this stanza; presumably the
+  definition made for utf8mb4_general_ci is still in effect here, which
+  depends on what strcoll.ic #undef's - confirm in strcoll.ic.
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_bin
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(b0) ((int) (uchar) (b0))
+#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1))
+#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2))
+#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3))
+#include "strcoll.ic"
+
+
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
@@ -8100,8 +7820,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c)
static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
{
NULL, /* init */
- my_strnncoll_utf8mb4,
- my_strnncollsp_utf8mb4,
+ my_strnncoll_utf8mb4_general_ci,
+ my_strnncollsp_utf8mb4_general_ci,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_mb,
@@ -8116,8 +7836,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
{
NULL, /* init */
- my_strnncoll_mb_bin,
- my_strnncollsp_mb_bin,
+ my_strnncoll_utf8mb4_bin,
+ my_strnncollsp_utf8mb4_bin,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_mb,
@@ -8161,6 +7881,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_charlen_utf8mb4,
my_well_formed_char_length_utf8mb4,
my_copy_fix_mb,
+ my_wc_mb_utf8mb4,
};