diff options
author | unknown <monty@hundin.mysql.fi> | 2002-05-17 16:45:00 +0300 |
---|---|---|
committer | unknown <monty@hundin.mysql.fi> | 2002-05-17 16:45:00 +0300 |
commit | 387e77d104887d45a4574a9bd1b826cfa20f01c1 (patch) | |
tree | 138e0662f05b06e5000a3e7e6fa8383eea492a3b /sql | |
parent | 66f426c0635d16f7f594f6bf867e13e5cba18500 (diff) | |
download | mariadb-git-387e77d104887d45a4574a9bd1b826cfa20f01c1.tar.gz |
Optimize LIKE with turbo-boyer-more algoritm
Docs/manual.texi:
Added info about LIKE optimization
mysql-test/r/func_like.result:
Test of new LIKE optimization
mysql-test/t/func_like.test:
Test of new LIKE optimization
Diffstat (limited to 'sql')
-rw-r--r-- | sql/item_cmpfunc.cc | 266 | ||||
-rw-r--r-- | sql/item_cmpfunc.h | 29 | ||||
-rw-r--r-- | sql/unireg.h | 7 |
3 files changed, 295 insertions, 7 deletions
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc index 09c2bdc593a..c21ad18f08e 100644 --- a/sql/item_cmpfunc.cc +++ b/sql/item_cmpfunc.cc @@ -1228,23 +1228,23 @@ void Item_func_like::fix_length_and_dec() // cmp_type=STRING_RESULT; // For quick select } - longlong Item_func_like::val_int() { - String *res,*res2; - res=args[0]->val_str(&tmp_value1); + String* res = args[0]->val_str(&tmp_value1); if (args[0]->null_value) { null_value=1; return 0; } - res2=args[1]->val_str(&tmp_value2); + String* res2 = args[1]->val_str(&tmp_value2); if (args[1]->null_value) { null_value=1; return 0; } null_value=0; + if (canDoTurboBM) + return turboBM_matches(res->ptr(), res->length()) ? 1 : 0; if (binary) return wild_compare(*res,*res2,escape) ? 0 : 1; else @@ -1268,6 +1268,51 @@ Item_func::optimize_type Item_func_like::select_optimize() const return OPTIMIZE_NONE; } +bool Item_func_like::fix_fields(THD *thd,struct st_table_list *tlist) +{ + if (Item_bool_func2::fix_fields(thd, tlist)) + return 1; + + /* + TODO--we could do it for non-const, but we'd have to + recompute the tables for each row--probably not worth it. + */ + if (args[1]->const_item() && !(specialflag & SPECIAL_NO_NEW_FUNC)) + { + String* res2 = args[1]->val_str(&tmp_value2); + const size_t len = res2->length(); + const char* first = res2->ptr(); + const char* last = first + len - 1; + /* + len must be > 2 ('%pattern%') + heuristic: only do TurboBM for pattern_len > 2 + */ + + if (len > MIN_TURBOBM_PATTERN_LEN + 2 && + *first == wild_many && + *last == wild_many) + { + const char* tmp = first + 1; + for ( ; *tmp != wild_many && *tmp != wild_one && *tmp != escape; tmp++) ; + canDoTurboBM = tmp == last; + } + + if (canDoTurboBM) + { + pattern = first + 1; + pattern_len = len - 2; + DBUG_PRINT("TurboBM", ("Initializing pattern: '%s'...", first)); + int* suff = (int*)thd->alloc(sizeof(int[pattern_len + 1])); + bmGs = (int*)thd->alloc(sizeof(int[pattern_len + 1])); + bmBc = (int*)thd->alloc(sizeof(int[alphabet_size])); + turboBM_compute_good_suffix_shifts(suff); + turboBM_compute_bad_character_shifts(); + DBUG_PRINT("turboBM",("done")); + } + } + return 0; +} + #ifdef USE_REGEX bool @@ -1307,7 +1352,6 @@ Item_func_regex::fix_fields(THD *thd,TABLE_LIST *tables) return 0; } - longlong Item_func_regex::val_int() { char buff[MAX_FIELD_WIDTH]; @@ -1364,3 +1408,215 @@ Item_func_regex::~Item_func_regex() } #endif /* USE_REGEX */ + + +#ifdef LIKE_CMP_TOUPPER +#define likeconv(A) (uchar) toupper(A) +#else +#define likeconv(A) (uchar) my_sort_order[(uchar) (A)] +#endif + + +/********************************************************************** + turboBM_compute_suffixes() + Precomputation dependent only on pattern_len. +**********************************************************************/ + +void Item_func_like::turboBM_compute_suffixes(int* suff) +{ + const int plm1 = pattern_len - 1; + int f = 0; + int g = plm1; + int* const splm1 = suff + plm1; + + *splm1 = pattern_len; + + if (binary) + { + int i; + for (i = pattern_len - 2; i >= 0; i--) + { + int tmp = *(splm1 + i - f); + if (g < i && tmp < i - g) + suff[i] = tmp; + else + { + if (i < g) + g = i; // g = min(i, g) + f = i; + while (g >= 0 && pattern[g] == pattern[g + plm1 - f]) + g--; + suff[i] = f - g; + } + } + } + else + { + int i; + for (i = pattern_len - 2; 0 <= i; --i) + { + int tmp = *(splm1 + i - f); + if (g < i && tmp < i - g) + suff[i] = tmp; + else + { + if (i < g) + g = i; // g = min(i, g) + f = i; + while (g >= 0 && likeconv(pattern[g]) == likeconv(pattern[g + plm1 - f])) + g--; + suff[i] = f - g; + } + } + } +} + + +/********************************************************************** + turboBM_compute_good_suffix_shifts() + Precomputation dependent only on pattern_len. +**********************************************************************/ + +void Item_func_like::turboBM_compute_good_suffix_shifts(int* suff) +{ + turboBM_compute_suffixes(suff); + + int* end = bmGs + pattern_len; + int* k; + for (k = bmGs; k < end; k++) + *k = pattern_len; + + int tmp; + int i; + int j = 0; + const int plm1 = pattern_len - 1; + for (i = plm1; i > -1; i--) + { + if (suff[i] == i + 1) + { + for (tmp = plm1 - i; j < tmp; j++) + { + int* tmp2 = bmGs + j; + if (*tmp2 == pattern_len) + *tmp2 = tmp; + } + } + } + + int* tmp2; + for (tmp = plm1 - i; j < tmp; j++) + { + tmp2 = bmGs + j; + if (*tmp2 == pattern_len) + *tmp2 = tmp; + } + + tmp2 = bmGs + plm1; + for (i = 0; i <= pattern_len - 2; i++) + *(tmp2 - suff[i]) = plm1 - i; +} + + +/********************************************************************** + turboBM_compute_bad_character_shifts() + Precomputation dependent on pattern_len. +**********************************************************************/ + +void Item_func_like::turboBM_compute_bad_character_shifts() +{ + int* i; + int* end = bmBc + alphabet_size; + for (i = bmBc; i < end; i++) + *i = pattern_len; + + int j; + const int plm1 = pattern_len - 1; + if (binary) + for (j = 0; j < plm1; j++) + bmBc[pattern[j]] = plm1 - j; + else + for (j = 0; j < plm1; j++) + bmBc[likeconv(pattern[j])] = plm1 - j; +} + + +/********************************************************************** + turboBM_matches() + Search for pattern in text, returns true/false for match/no match +**********************************************************************/ + +bool Item_func_like::turboBM_matches(const char* text, int text_len) const +{ + register int bcShift; + register int turboShift; + int shift = pattern_len; + int j = 0; + int u = 0; + + const int plm1 = pattern_len - 1; + const int tlmpl = text_len - pattern_len; + + /* Searching */ + if (binary) + { + while (j <= tlmpl) + { + register int i = plm1; + while (i >= 0 && pattern[i] == text[i + j]) + { + i--; + if (i == plm1 - shift) + i -= u; + } + if (i < 0) + return true; + + register const int v = plm1 - i; + turboShift = u - v; + bcShift = bmBc[text[i + j]] - plm1 + i; + shift = max(turboShift, bcShift); + shift = max(shift, bmGs[i]); + if (shift == bmGs[i]) + u = min(pattern_len - shift, v); + else + { + if (turboShift < bcShift) + shift = max(shift, u + 1); + u = 0; + } + j += shift; + } + return false; + } + else + { + while (j <= tlmpl) + { + register int i = plm1; + while (i >= 0 && likeconv(pattern[i]) == likeconv(text[i + j])) + { + i--; + if (i == plm1 - shift) + i -= u; + } + if (i < 0) + return true; + + register const int v = plm1 - i; + turboShift = u - v; + bcShift = bmBc[likeconv(text[i + j])] - plm1 + i; + shift = max(turboShift, bcShift); + shift = max(shift, bmGs[i]); + if (shift == bmGs[i]) + u = min(pattern_len - shift, v); + else + { + if (turboShift < bcShift) + shift = max(shift, u + 1); + u = 0; + } + j += shift; + } + return false; + } +} diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h index 2048c4baea8..cd6e3d6e414 100644 --- a/sql/item_cmpfunc.h +++ b/sql/item_cmpfunc.h @@ -478,15 +478,40 @@ public: class Item_func_like :public Item_bool_func2 { char escape; -public: - Item_func_like(Item *a,Item *b, char* escape_arg) :Item_bool_func2(a,b),escape(*escape_arg) + + // Turbo Boyer-Moore data + bool canDoTurboBM; // pattern is '%abcd%' case + const char* pattern; + int pattern_len; + + // TurboBM buffers, *this is owner + int* bmGs; // good suffix shift table, size is pattern_len + 1 + int* bmBc; // bad character shift table, size is alphabet_size + + void turboBM_compute_suffixes(int* suff); + void turboBM_compute_good_suffix_shifts(int* suff); + void turboBM_compute_bad_character_shifts(); + bool turboBM_matches(const char* text, int text_len) const; + enum { alphabet_size = 256 }; + +public: + Item_func_like::Item_func_like(Item *a,Item *b, char* escape_arg) : + Item_bool_func2(a,b), + escape(*escape_arg), + canDoTurboBM(false), + pattern(0), + pattern_len(0), + bmGs(0), + bmBc(0) {} + longlong val_int(); enum Functype functype() const { return LIKE_FUNC; } optimize_type select_optimize() const; cond_result eq_cmp_result() const { return COND_TRUE; } const char *func_name() const { return "like"; } void fix_length_and_dec(); + bool fix_fields(THD *thd,struct st_table_list *tlist); }; #ifdef USE_REGEX diff --git a/sql/unireg.h b/sql/unireg.h index c4d2052d1da..5a61f4a6c12 100644 --- a/sql/unireg.h +++ b/sql/unireg.h @@ -122,6 +122,13 @@ bfill((A)->null_flags,(A)->null_bytes,255);\ #define TE_INFO_LENGTH 3 #define MTYP_NOEMPTY_BIT 128 +/* + * Minimum length pattern before Turbo Boyer-Moore is used + * for SELECT "text" LIKE "%pattern%", excluding the two + * wildcards in class Item_func_like. + */ +#define MIN_TURBOBM_PATTERN_LEN 3 + /* Include prototypes for unireg */ #include "mysqld_error.h" |