summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.com>2022-09-07 10:50:40 +0400
committerAlexander Barkov <bar@mariadb.com>2022-09-09 09:41:23 +0400
commit71d74464a4edf10c6f0584aad533bae1ad664ccd (patch)
tree57ca22d8d3a461b41317562d506ab4ee33883538
parentf6118acda961a7fde17f2984d7600e211cd47ee3 (diff)
downloadmariadb-git-bb-10.10-bar-uca.tar.gz
More UCA collation performance improvementsbb-10.10-bar-uca
-rw-r--r--include/m_ctype.h9
-rw-r--r--strings/ctype-uca.c111
-rw-r--r--strings/ctype-uca.inl45
3 files changed, 129 insertions, 36 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 811b3b71a17..adb8c0551dd 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -190,6 +190,15 @@ typedef struct my_uca_level_booster_t
This array is used for prefix optimization.
*/
MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000];
+
+ /*
+ A helper array to process one-character ASCII strings.
+ Suitable only for stand-alone characters,
+ which are known not to be contraction parts,
+ e.g. the very last character of a string.
+ */
+ uint16 weight_1byte_to_1weight_standalone[0xFF];
+
} MY_UCA_LEVEL_BOOSTER;
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 5cedd8b0a63..55d6d34a0f6 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -32091,6 +32091,19 @@ my_uca_scanner_init_any(my_uca_scanner *scanner,
}
+/*
+ Test if both scanners have reached the end of their strings,
+ so there is no data left to compare.
+*/
+static inline my_bool
+my_uca_scanner_eof2(const my_uca_scanner *scanner0,
+ const my_uca_scanner *scanner1)
+{
+ return scanner0->sbeg >= scanner0->send &&
+ scanner1->sbeg >= scanner1->send;
+}
+
+
static inline int
my_space_weight(const MY_UCA_WEIGHT_LEVEL *level)
{
@@ -34420,10 +34433,34 @@ my_uca_level_booster_weight2_populate(MY_UCA_LEVEL_BOOSTER *booster)
static void
+my_uca_level_booster_1byte_to_1weight_standalone_populate(
+ MY_UCA_LEVEL_BOOSTER *dst,
+ const MY_UCA_WEIGHT_LEVEL *src)
+{
+ size_t i;
+ for (i= 0; i <= 0x7F; i++)
+ {
+ const uint16 *w= src->weights[0] + i * src->lengths[0];
+ dst->weight_1byte_to_1weight_standalone[i]= w[1] ? 0 : w[0];
+ }
+ for (i= 0x80; i <= 0xFF; i++)
+ {
+ /*
+ Invalid utf8 one-byte character.
+ Set weight to near-biggest possible uint16 value.
+ This makes a broken string greater than a non-broken string.
+ */
+ dst->weight_1byte_to_1weight_standalone[i]= 0xFF00 + i;
+ }
+}
+
+
+static void
my_uca_level_booster_populate(MY_UCA_LEVEL_BOOSTER *dst,
const MY_UCA_WEIGHT_LEVEL *src,
CHARSET_INFO *cs)
{
+ my_uca_level_booster_1byte_to_1weight_standalone_populate(dst, src);
my_uca_level_booster_2bytes_populate_pairs(dst, src, cs);
my_uca_level_booster_2bytes_pupulate_ascii2_contractions(dst,
&src->contractions);
@@ -34460,29 +34497,67 @@ my_uca_level_booster_new(MY_CHARSET_LOADER *loader,
/*
- Skip the simple equal prefix of two string using
+ Compare a simple prefix of two string using
"One or two bytes produce one or two weights" optimization.
- Return the prefix length.
+ Return the comparison result.
*/
-static size_t
-my_uca_level_booster_equal_prefix_length(const MY_UCA_LEVEL_BOOSTER *booster,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen)
-{
- const uchar *s0= s;
- size_t simple_count= MY_MIN(slen, tlen) >> 1;
- for ( ; simple_count; s+= 2, t+= 2, simple_count--)
+static int
+my_uca_level_booster_simple_prefix_cmp(my_uca_scanner *sscanner,
+ my_uca_scanner *tscanner,
+ const MY_UCA_LEVEL_BOOSTER *booster)
+{
+ size_t slen= sscanner->send - sscanner->sbeg;
+ size_t tlen= tscanner->send - tscanner->sbeg;
+ size_t length= MY_MIN(slen, tlen);
+ for ( ; length >= 2; sscanner->sbeg+= 2, tscanner->sbeg+= 2, length-= 2)
{
+ int cmp;
const MY_UCA_WEIGHT2 *ws, *wt;
- ws= my_uca_level_booster_simple_weight2_addr_const(booster, s[0], s[1]);
- wt= my_uca_level_booster_simple_weight2_addr_const(booster, t[0], t[1]);
- if (ws->weight[0] &&
- ws->weight[0] == wt->weight[0] &&
- ws->weight[1] == wt->weight[1])
- continue;
- break;
+ ws= my_uca_level_booster_simple_weight2_addr_const(booster,
+ sscanner->sbeg[0],
+ sscanner->sbeg[1]);
+ wt= my_uca_level_booster_simple_weight2_addr_const(booster,
+ tscanner->sbeg[0],
+ tscanner->sbeg[1]);
+ if (!ws->weight[0] || !wt->weight[0])
+ return 0; /* Can't optimize this */
+
+ if ((cmp= (int) ws->weight[0] - (int) wt->weight[0]))
+ return cmp; /* A difference on the first weight found */
+
+ if ((cmp= (int) ws->weight[1] - (int) wt->weight[1]))
+ {
+ /*
+ A difference on the second weight found.
+ If either of the second weights is zero then the two sides have
+ different number of weights: one weight vs two weights.
+ Evaluate two_second_weights to:
+ - 1 if both sides have two weights
+ - 0 otherwise (i.e. one weight vs two weights)
+ */
+ int two_second_weights= ws->weight[1] != 0 && wt->weight[1] != 0;
+ return cmp * two_second_weights;
+ }
+ }
+ if (length == 1 &&
+ sscanner->send - sscanner->sbeg == 1 &&
+ tscanner->send - tscanner->sbeg == 1) /* One byte left on both sides */
+ {
+ uint16 ws, wt;
+ if ((ws= booster->weight_1byte_to_1weight_standalone[sscanner->sbeg[0]]) &&
+ (wt= booster->weight_1byte_to_1weight_standalone[tscanner->sbeg[0]]))
+ {
+ /*
+ The increment is important only if the two weights are equal.
+ But it's not harmful even of the weights are different.
+ So let's always increment, to avoid a conditional jump.
+ */
+ sscanner->sbeg++;
+ tscanner->sbeg++;
+ return (int) ws - (int) wt;
+ }
}
- return s - s0;
+ return 0;
}
diff --git a/strings/ctype-uca.inl b/strings/ctype-uca.inl
index 9b8c271ab50..0da2c9730f0 100644
--- a/strings/ctype-uca.inl
+++ b/strings/ctype-uca.inl
@@ -97,18 +97,21 @@ MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
int s_res;
int t_res;
+ my_uca_scanner_init_any(&sscanner, s, slen);
+ my_uca_scanner_init_any(&tscanner, t, tlen);
+
#if MY_UCA_ASCII_OPTIMIZE
{
- size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
- s, slen, t, tlen);
- s+= prefix, slen-= prefix;
- t+= prefix, tlen-= prefix;
+ int cmp= my_uca_level_booster_simple_prefix_cmp(&sscanner, &tscanner,
+ level->booster);
+ if (cmp)
+ return cmp;
+ if (my_uca_scanner_eof2(&sscanner, &tscanner))
+ return 0;
}
#endif
my_uca_scanner_param_init(&param, cs, level);
- my_uca_scanner_init_any(&sscanner, s, slen);
- my_uca_scanner_init_any(&tscanner, t, tlen);
do
{
@@ -216,18 +219,21 @@ MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
my_uca_scanner_param param;
int s_res, t_res;
+ my_uca_scanner_init_any(&sscanner, s, slen);
+ my_uca_scanner_init_any(&tscanner, t, tlen);
+
#if MY_UCA_ASCII_OPTIMIZE
{
- size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
- s, slen, t, tlen);
- s+= prefix, slen-= prefix;
- t+= prefix, tlen-= prefix;
+ int cmp= my_uca_level_booster_simple_prefix_cmp(&sscanner, &tscanner,
+ level->booster);
+ if (cmp)
+ return cmp;
+ if (my_uca_scanner_eof2(&sscanner, &tscanner))
+ return 0;
}
#endif
my_uca_scanner_param_init(&param, cs, level);
- my_uca_scanner_init_any(&sscanner, s, slen);
- my_uca_scanner_init_any(&tscanner, t, tlen);
do
{
@@ -456,21 +462,24 @@ MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs,
size_t s_nchars_left= nchars;
size_t t_nchars_left= nchars;
+ my_uca_scanner_init_any(&sscanner, s, slen);
+ my_uca_scanner_init_any(&tscanner, t, tlen);
+
/*
TODO: strnncollsp_nchars_onelevel
#if MY_UCA_ASCII_OPTIMIZE
{
- size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
- s, slen, t, tlen);
- s+= prefix, slen-= prefix;
- t+= prefix, tlen-= prefix;
+ int cmp= my_uca_level_booster_simple_prefix_cmp(&sscanner, &tscanner,
+ level->booster);
+ if (cmp)
+ return cmp;
+ if (my_uca_scanner_eof2(&sscanner, &tscanner))
+ return 0;
}
#endif
*/
my_uca_scanner_param_init(&param, cs, level);
- my_uca_scanner_init_any(&sscanner, s, slen);
- my_uca_scanner_init_any(&tscanner, t, tlen);
for ( ; ; )
{