diff options
author | Alexander Barkov <bar@mariadb.com> | 2022-09-07 10:50:40 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.com> | 2022-09-09 09:41:23 +0400 |
commit | 71d74464a4edf10c6f0584aad533bae1ad664ccd (patch) | |
tree | 57ca22d8d3a461b41317562d506ab4ee33883538 | |
parent | f6118acda961a7fde17f2984d7600e211cd47ee3 (diff) | |
download | mariadb-git-bb-10.10-bar-uca.tar.gz |
More UCA collation performance improvementsbb-10.10-bar-uca
-rw-r--r-- | include/m_ctype.h | 9 | ||||
-rw-r--r-- | strings/ctype-uca.c | 111 | ||||
-rw-r--r-- | strings/ctype-uca.inl | 45 |
3 files changed, 129 insertions, 36 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 811b3b71a17..adb8c0551dd 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -190,6 +190,15 @@ typedef struct my_uca_level_booster_t This array is used for prefix optimization. */ MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000]; + + /* + A helper array to process one-character ASCII strings. + Suitable only for stand-alone characters, + which are known not to be contraction parts, + e.g. the very last character of a string. + */ + uint16 weight_1byte_to_1weight_standalone[0xFF]; + } MY_UCA_LEVEL_BOOSTER; diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 5cedd8b0a63..55d6d34a0f6 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -32091,6 +32091,19 @@ my_uca_scanner_init_any(my_uca_scanner *scanner, } +/* + Test if both scanners have reached the end of their strings, + so there is no data left to compare. +*/ +static inline my_bool +my_uca_scanner_eof2(const my_uca_scanner *scanner0, + const my_uca_scanner *scanner1) +{ + return scanner0->sbeg >= scanner0->send && + scanner1->sbeg >= scanner1->send; +} + + static inline int my_space_weight(const MY_UCA_WEIGHT_LEVEL *level) { @@ -34420,10 +34433,34 @@ my_uca_level_booster_weight2_populate(MY_UCA_LEVEL_BOOSTER *booster) static void +my_uca_level_booster_1byte_to_1weight_standalone_populate( + MY_UCA_LEVEL_BOOSTER *dst, + const MY_UCA_WEIGHT_LEVEL *src) +{ + size_t i; + for (i= 0; i <= 0x7F; i++) + { + const uint16 *w= src->weights[0] + i * src->lengths[0]; + dst->weight_1byte_to_1weight_standalone[i]= w[1] ? 0 : w[0]; + } + for (i= 0x80; i <= 0xFF; i++) + { + /* + Invalid utf8 one-byte character. + Set weight to near-biggest possible uint16 value. + This makes a broken string greater than a non-broken string. + */ + dst->weight_1byte_to_1weight_standalone[i]= 0xFF00 + i; + } +} + + +static void my_uca_level_booster_populate(MY_UCA_LEVEL_BOOSTER *dst, const MY_UCA_WEIGHT_LEVEL *src, CHARSET_INFO *cs) { + my_uca_level_booster_1byte_to_1weight_standalone_populate(dst, src); my_uca_level_booster_2bytes_populate_pairs(dst, src, cs); my_uca_level_booster_2bytes_pupulate_ascii2_contractions(dst, &src->contractions); @@ -34460,29 +34497,67 @@ my_uca_level_booster_new(MY_CHARSET_LOADER *loader, /* - Skip the simple equal prefix of two string using + Compare a simple prefix of two string using "One or two bytes produce one or two weights" optimization. - Return the prefix length. + Return the comparison result. */ -static size_t -my_uca_level_booster_equal_prefix_length(const MY_UCA_LEVEL_BOOSTER *booster, - const uchar *s, size_t slen, - const uchar *t, size_t tlen) -{ - const uchar *s0= s; - size_t simple_count= MY_MIN(slen, tlen) >> 1; - for ( ; simple_count; s+= 2, t+= 2, simple_count--) +static int +my_uca_level_booster_simple_prefix_cmp(my_uca_scanner *sscanner, + my_uca_scanner *tscanner, + const MY_UCA_LEVEL_BOOSTER *booster) +{ + size_t slen= sscanner->send - sscanner->sbeg; + size_t tlen= tscanner->send - tscanner->sbeg; + size_t length= MY_MIN(slen, tlen); + for ( ; length >= 2; sscanner->sbeg+= 2, tscanner->sbeg+= 2, length-= 2) { + int cmp; const MY_UCA_WEIGHT2 *ws, *wt; - ws= my_uca_level_booster_simple_weight2_addr_const(booster, s[0], s[1]); - wt= my_uca_level_booster_simple_weight2_addr_const(booster, t[0], t[1]); - if (ws->weight[0] && - ws->weight[0] == wt->weight[0] && - ws->weight[1] == wt->weight[1]) - continue; - break; + ws= my_uca_level_booster_simple_weight2_addr_const(booster, + sscanner->sbeg[0], + sscanner->sbeg[1]); + wt= my_uca_level_booster_simple_weight2_addr_const(booster, + tscanner->sbeg[0], + tscanner->sbeg[1]); + if (!ws->weight[0] || !wt->weight[0]) + return 0; /* Can't optimize this */ + + if ((cmp= (int) ws->weight[0] - (int) wt->weight[0])) + return cmp; /* A difference on the first weight found */ + + if ((cmp= (int) ws->weight[1] - (int) wt->weight[1])) + { + /* + A difference on the second weight found. + If either of the second weights is zero then the two sides have + different number of weights: one weight vs two weights. + Evaluate two_second_weights to: + - 1 if both sides have two weights + - 0 otherwise (i.e. one weight vs two weights) + */ + int two_second_weights= ws->weight[1] != 0 && wt->weight[1] != 0; + return cmp * two_second_weights; + } + } + if (length == 1 && + sscanner->send - sscanner->sbeg == 1 && + tscanner->send - tscanner->sbeg == 1) /* One byte left on both sides */ + { + uint16 ws, wt; + if ((ws= booster->weight_1byte_to_1weight_standalone[sscanner->sbeg[0]]) && + (wt= booster->weight_1byte_to_1weight_standalone[tscanner->sbeg[0]])) + { + /* + The increment is important only if the two weights are equal. + But it's not harmful even of the weights are different. + So let's always increment, to avoid a conditional jump. + */ + sscanner->sbeg++; + tscanner->sbeg++; + return (int) ws - (int) wt; + } } - return s - s0; + return 0; } diff --git a/strings/ctype-uca.inl b/strings/ctype-uca.inl index 9b8c271ab50..0da2c9730f0 100644 --- a/strings/ctype-uca.inl +++ b/strings/ctype-uca.inl @@ -97,18 +97,21 @@ MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs, int s_res; int t_res; + my_uca_scanner_init_any(&sscanner, s, slen); + my_uca_scanner_init_any(&tscanner, t, tlen); + #if MY_UCA_ASCII_OPTIMIZE { - size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, - s, slen, t, tlen); - s+= prefix, slen-= prefix; - t+= prefix, tlen-= prefix; + int cmp= my_uca_level_booster_simple_prefix_cmp(&sscanner, &tscanner, + level->booster); + if (cmp) + return cmp; + if (my_uca_scanner_eof2(&sscanner, &tscanner)) + return 0; } #endif my_uca_scanner_param_init(¶m, cs, level); - my_uca_scanner_init_any(&sscanner, s, slen); - my_uca_scanner_init_any(&tscanner, t, tlen); do { @@ -216,18 +219,21 @@ MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs, my_uca_scanner_param param; int s_res, t_res; + my_uca_scanner_init_any(&sscanner, s, slen); + my_uca_scanner_init_any(&tscanner, t, tlen); + #if MY_UCA_ASCII_OPTIMIZE { - size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, - s, slen, t, tlen); - s+= prefix, slen-= prefix; - t+= prefix, tlen-= prefix; + int cmp= my_uca_level_booster_simple_prefix_cmp(&sscanner, &tscanner, + level->booster); + if (cmp) + return cmp; + if (my_uca_scanner_eof2(&sscanner, &tscanner)) + return 0; } #endif my_uca_scanner_param_init(¶m, cs, level); - my_uca_scanner_init_any(&sscanner, s, slen); - my_uca_scanner_init_any(&tscanner, t, tlen); do { @@ -456,21 +462,24 @@ MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs, size_t s_nchars_left= nchars; size_t t_nchars_left= nchars; + my_uca_scanner_init_any(&sscanner, s, slen); + my_uca_scanner_init_any(&tscanner, t, tlen); + /* TODO: strnncollsp_nchars_onelevel #if MY_UCA_ASCII_OPTIMIZE { - size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster, - s, slen, t, tlen); - s+= prefix, slen-= prefix; - t+= prefix, tlen-= prefix; + int cmp= my_uca_level_booster_simple_prefix_cmp(&sscanner, &tscanner, + level->booster); + if (cmp) + return cmp; + if (my_uca_scanner_eof2(&sscanner, &tscanner)) + return 0; } #endif */ my_uca_scanner_param_init(¶m, cs, level); - my_uca_scanner_init_any(&sscanner, s, slen); - my_uca_scanner_init_any(&tscanner, t, tlen); for ( ; ; ) { |