summary | refs | log | tree | commit | diff
path: root/strings/ctype-uca.ic
diff options
context:
space:
mode:
Diffstat (limited to 'strings/ctype-uca.ic')
-rw-r--r-- strings/ctype-uca.ic 276
1 files changed, 174 insertions, 102 deletions
diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic
index cee12cf4d7b..1fc3480e5b5 100644
--- a/strings/ctype-uca.ic
+++ b/strings/ctype-uca.ic
@@ -36,108 +36,9 @@
#error MY_UCA_COLL_INIT is not defined
#endif
-
-static inline int
-MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
-{
- /*
- Return the next weight produced by the scanner.
-
- Check if the weights for the previous character have been
- already fully scanned. If yes, then get the next character and
- initialize wbeg to its weight string.
-
- Return values visible in this function:
- - the next non-zero weight of the current weight string,
- - the weight found by my_uca_context_weight_find(), if any,
- - the result of my_uca_scanner_next_implicit() when the character
- has no weight page,
- - 0xFFFD for a character greater than scanner->level->maxchar,
- - 0xFFFF for a bad or incomplete byte sequence,
- - -1 when there are no more bytes to scan.
- */
-
- if (scanner->wbeg[0]) /* More weights left from the previous step: */
- return *scanner->wbeg++; /* return the next weight from expansion */
-
- do
- {
- const uint16 *wpage;
- my_wc_t wc[MY_UCA_MAX_CONTRACTION];
- int mblen;
-
- /* Get next character */
-#if MY_UCA_ASCII_OPTIMIZE
- /* Get next ASCII character (a single byte below 0x80) */
- if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
- {
- wc[0]= scanner->sbeg[0];
- scanner->sbeg+= 1;
-
-#if MY_UCA_COMPILE_CONTRACTIONS
- if (my_uca_needs_context_handling(scanner->level, wc[0]))
- {
- uint16 *cweight= my_uca_context_weight_find(scanner, wc);
- if (cweight)
- return *cweight;
- }
-#endif
-
- scanner->page= 0;
- scanner->code= (int) wc[0];
- scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
- if (scanner->wbeg[0])
- return *scanner->wbeg++;
- continue; /* Zero first weight: go on to the next character */
- }
- else
-#endif
- /* Get next MB character */
- if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
- scanner->send)) <= 0))
- {
- if (scanner->sbeg >= scanner->send)
- return -1; /* No more bytes, end of line reached */
- /*
- There are some more bytes left. Non-positive mblen means that
- we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
- */
- if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send)
- {
- /* For safety purposes don't go beyond the string range. */
- scanner->sbeg= scanner->send;
- }
- /*
- Treat every complete or incomplete mbminlen unit as a weight which is
- greater than weight for any possible normal character.
- 0xFFFF is greater than any possible weight in the UCA weight table.
- */
- return 0xFFFF;
- }
-
- scanner->sbeg+= mblen;
- if (wc[0] > scanner->level->maxchar)
- {
- /*
- Return 0xFFFD as weight for all characters above level->maxchar
- (the original comment describes these as characters outside BMP).
- */
- scanner->wbeg= nochar;
- return 0xFFFD;
- }
-
-#if MY_UCA_COMPILE_CONTRACTIONS
- if (my_uca_needs_context_handling(scanner->level, wc[0]))
- {
- uint16 *cweight= my_uca_context_weight_find(scanner, wc);
- if (cweight)
- return *cweight;
- }
-#endif
-
- /* Process single character */
- scanner->page= wc[0] >> 8;
- scanner->code= wc[0] & 0xFF;
-
- /* If weight page for wc[0] does not exist, then calculate algorithmically */
- if (!(wpage= scanner->level->weights[scanner->page]))
- return my_uca_scanner_next_implicit(scanner);
-
- /* Calculate pointer to wc[0]'s weight, using page and offset */
- scanner->wbeg= wpage +
- scanner->code * scanner->level->lengths[scanner->page];
- } while (!scanner->wbeg[0]); /* Skip ignorable characters */
-
- return *scanner->wbeg++;
-}
-
-
+#include "ctype-uca-scanner_next.inl"
+#define SCANNER_NEXT_NCHARS
+#include "ctype-uca-scanner_next.inl"
/*
Compares two strings according to the collation
@@ -410,6 +311,173 @@ MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs,
}
+/*
+ Scan the next weight and perform space padding
+ or trimming according to "nchars".
+
+ @param scanner - the scanner to get the next weight from
+ @param nchars - the number of characters the caller still allows
+ this scanner to consume
+ @param generated - incremented by one whenever the returned weight was
+ generated here (by padding or trimming) instead of
+ being scanned from the string
+ @return The weight together with a character count:
+ - a scanned weight and the count reported by
+ scanner_next_with_nchars(), when it fits into "nchars"
+ - the space weight and count 1, when the string has ended
+ - the trimming weight (0 for MY_CS_NOPAD collations, the space
+ weight otherwise) and count "nchars", when the scanned
+ element does not fit or when "nchars" is 0
+*/
+static inline weight_and_nchars_t
+MY_FUNCTION_NAME(scanner_next_pad_trim)(my_uca_scanner *scanner,
+ size_t nchars,
+ uint *generated)
+{
+ weight_and_nchars_t res;
+ if (nchars > 0 ||
+ scanner->wbeg[0] /* Some weights from a previous expansion left */)
+ {
+ if ((res= MY_FUNCTION_NAME(scanner_next_with_nchars)(scanner,
+ nchars)).weight < 0)
+ {
+ /*
+ We reached the end of the string, but the caller wants more weights.
+ Perform space padding.
+ */
+ res.weight= my_space_weight(scanner->level);
+ res.nchars= 1; /* The generated space is reported as one character */
+ (*generated)++;
+ }
+ else if (res.nchars > nchars)
+ {
+ /*
+ We scanned the next collation element, but it does not fit into
+ the "nchars" limit. This is possible in case of:
+ - A contraction, e.g. Czech 'ch' with nchars=1
+ - A sequence of ignorable characters followed by non-ignorable ones,
+ e.g. CONCAT(x'00','a') with nchars=1.
+ Perform trimming.
+ */
+ res.weight= scanner->cs->state & MY_CS_NOPAD ?
+ 0 : my_space_weight(scanner->level);
+ res.nchars= (uint) nchars; /* Report the whole remaining limit as used */
+ (*generated)++;
+ }
+ }
+ else
+ {
+ /* The caller wants nchars==0. Perform trimming. */
+ res.weight= scanner->cs->state & MY_CS_NOPAD ?
+ 0 : my_space_weight(scanner->level);
+ res.nchars= 0;
+ (*generated)++;
+ }
+ return res;
+}
+
+
+static int
+MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs,
+ const MY_UCA_WEIGHT_LEVEL *level,
+ const uchar *s, size_t slen,
+ const uchar *t, size_t tlen,
+ size_t nchars)
+{ /*
+ Compare the strings "s" and "t" on one weight level, letting each
+ side consume at most "nchars" characters. Weights are fetched pairwise
+ with scanner_next_pad_trim(), which pads or trims each side as needed.
+
+ Returns the difference of the first pair of unequal weights.
+ If both weights of a pair were generated (no real data left on
+ either side), returns 0, except for MY_CS_NOPAD collations, where
+ -1 or +1 is returned when the remaining character limits differ.
+ */
+ my_uca_scanner sscanner;
+ my_uca_scanner tscanner;
+ size_t s_nchars_left= nchars; /* Characters "s" may still consume */
+ size_t t_nchars_left= nchars; /* Characters "t" may still consume */
+
+ my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
+ my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
+
+ for ( ; ; )
+ {
+ weight_and_nchars_t s_res;
+ weight_and_nchars_t t_res;
+ uint generated= 0; /* How many of the two weights below were generated */
+ int diff;
+
+ s_res= MY_FUNCTION_NAME(scanner_next_pad_trim)(&sscanner, s_nchars_left,
+ &generated);
+ t_res= MY_FUNCTION_NAME(scanner_next_pad_trim)(&tscanner, t_nchars_left,
+ &generated);
+ if ((diff= (s_res.weight - t_res.weight)))
+ return diff;
+
+ if (generated == 2) /* Both sides were padded or trimmed in this step */
+ {
+ if (cs->state & MY_CS_NOPAD)
+ {
+ /*
+ Both values are auto-generated. There's no real data any more.
+ We need to handle the remaining virtual trailing spaces.
+ The two strings still have s_nchars_left and t_nchars_left imaginary
+ trailing spaces at the end. If s_nchars_left != t_nchars_left,
+ the strings will not be equal in case of a NOPAD collation.
+
+ Example:
+ "B" is German "U+00DF LATIN SMALL LETTER SHARP S"
+ When we have these values in a
+ CHAR(3) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_nopad_ci
+ column:
+ 'B ' (one character, two trailing spaces)
+ 'ss ' (two characters, one trailing space)
+ The 'B ' is greater than the 'ss '.
+ They are compared in the following steps:
+ 1. 'B' == 'ss'
+ 2. ' ' == ' '
+ 3. ' ' > ''
+
+ We need to emulate the same behavior in this function even if
+ it's called with strings 'B' and 'ss' (with space trimmed).
+ The side which has more remaining virtual spaces at the end
+ is greater.
+ */
+ if (s_nchars_left < t_nchars_left)
+ return -1;
+ if (s_nchars_left > t_nchars_left)
+ return +1;
+ }
+ return 0;
+ }
+
+ DBUG_ASSERT(s_nchars_left >= s_res.nchars);
+ DBUG_ASSERT(t_nchars_left >= t_res.nchars);
+ s_nchars_left-= s_res.nchars; /* Account for the characters consumed */
+ t_nchars_left-= t_res.nchars;
+ }
+
+ return 0; /* Not reached: the loop above is left only through "return" */
+}
+
+
+/*
+ One-level collations.
+
+ Compare "s" and "t" limited to "nchars" characters, using only
+ the first weight level of the collation (cs->uca->level[0]).
+ The result is the one returned by strnncollsp_nchars_onelevel().
+*/
+static int
+MY_FUNCTION_NAME(strnncollsp_nchars)(CHARSET_INFO *cs,
+ const uchar *s, size_t slen,
+ const uchar *t, size_t tlen,
+ size_t nchars)
+{
+ return MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(cs, &cs->uca->level[0],
+ s, slen, t, tlen,
+ nchars);
+}
+
+
+/*
+ Multi-level collations.
+
+ Compare "s" and "t" limited to "nchars" characters, level by level:
+ strnncollsp_nchars_onelevel() is called for each of the first
+ cs->levels_for_order levels, starting from level 0.
+ Returns the first non-zero level result, or 0 if the strings
+ are equal on all of these levels.
+*/
+static int
+MY_FUNCTION_NAME(strnncollsp_nchars_multilevel)(CHARSET_INFO *cs,
+ const uchar *s, size_t slen,
+ const uchar *t, size_t tlen,
+ size_t nchars)
+{
+ uint num_level= cs->levels_for_order;
+ uint i;
+ for (i= 0; i != num_level; i++)
+ {
+ int ret= MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(cs,
+ &cs->uca->level[i],
+ s, slen,
+ t, tlen,
+ nchars);
+ if (ret) /* A difference on an earlier level decides the result */
+ return ret;
+ }
+ return 0;
+}
+
/*
Calculates hash value for the given string,
@@ -753,6 +821,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll),
MY_FUNCTION_NAME(strnncollsp),
+ MY_FUNCTION_NAME(strnncollsp_nchars),
MY_FUNCTION_NAME(strnxfrm),
my_strnxfrmlen_any_uca,
MY_LIKE_RANGE,
@@ -776,6 +845,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll),
MY_FUNCTION_NAME(strnncollsp_nopad),
+ MY_FUNCTION_NAME(strnncollsp_nchars),
MY_FUNCTION_NAME(strnxfrm_nopad),
my_strnxfrmlen_any_uca,
MY_LIKE_RANGE, /* my_like_range_mb or my_like_range_generic */
@@ -797,6 +867,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll_multilevel),
MY_FUNCTION_NAME(strnncollsp_multilevel),
+ MY_FUNCTION_NAME(strnncollsp_nchars_multilevel),
MY_FUNCTION_NAME(strnxfrm_multilevel),
my_strnxfrmlen_any_uca_multilevel,
MY_LIKE_RANGE,
@@ -818,6 +889,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll_multilevel),
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel),
+ MY_FUNCTION_NAME(strnncollsp_nchars_multilevel),
MY_FUNCTION_NAME(strnxfrm_multilevel),
my_strnxfrmlen_any_uca_multilevel,
MY_LIKE_RANGE,