summaryrefslogtreecommitdiff
path: root/storage/innobase/fts/fts0que.cc
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/fts/fts0que.cc')
-rw-r--r--storage/innobase/fts/fts0que.cc571
1 files changed, 395 insertions, 176 deletions
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 58b429a8406..5c757b4f176 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,6 +24,7 @@ Created 2007/03/27 Sunny Bains
Completed 2011/7/10 Sunny and Jimmy Yang
*******************************************************/
+#include "dict0dict.h" /* dict_table_get_n_rows() */
#include "ut0rbt.h"
#include "row0sel.h"
#include "fts0fts.h"
@@ -57,15 +58,10 @@ static const double FTS_NORMALIZE_COEFF = 0.0115F;
/* For parsing the search phrase */
static const char* FTS_PHRASE_DELIMITER = "\t ";
-typedef struct fts_match_struct fts_match_t;
-typedef struct fts_query_struct fts_query_t;
-typedef struct fts_phrase_struct fts_phrase_t;
-typedef struct fts_select_struct fts_select_t;
-typedef struct fts_doc_freq_struct fts_doc_freq_t;
-typedef struct fts_word_freq_struct fts_word_freq_t;
+struct fts_word_freq_t;
/** State of an FTS query. */
-struct fts_query_struct {
+struct fts_query_t {
mem_heap_t* heap; /*!< Heap to use for allocations */
trx_t* trx; /*!< The query transaction */
@@ -126,11 +122,11 @@ struct fts_query_struct {
position info for each matched word
in the word list */
- ulint total_docs; /*!< The total number of documents */
+ ib_uint64_t total_docs; /*!< The total number of documents */
ulint total_words; /*!< The total number of words */
- ulint error; /*!< Error code if any, that is
+ dberr_t error; /*!< Error code if any, that is
encountered during query processing */
ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
@@ -144,7 +140,7 @@ struct fts_query_struct {
/** For phrase matching, first we collect the documents and the positions
then we match. */
-struct fts_match_struct {
+struct fts_match_t {
doc_id_t doc_id; /*!< Document id */
ulint start; /*!< Start the phrase match from
@@ -158,7 +154,7 @@ struct fts_match_struct {
/** For matching tokens in a phrase search. We use this data structure in
the callback that determines whether a document should be accepted or
rejected for a phrase search. */
-struct fts_select_struct {
+struct fts_select_t {
doc_id_t doc_id; /*!< The document id to match */
ulint min_pos; /*!< For found to be TRUE at least
@@ -173,8 +169,23 @@ struct fts_select_struct {
the FTS index */
};
+/** structure defines a set of ranges for original documents, each of which
+has a minimum position and maximum position. Text in such range should
+contain all words in the proximity search. We will need to count the
+words in such range to make sure it is less than the specified distance
+of the proximity search */
+struct fts_proximity_t {
+ ulint n_pos; /*!< number of position set, defines
+ a range (min to max) containing all
+ matching words */
+ ulint* min_pos; /*!< the minimum position (in bytes)
+ of the range */
+ ulint* max_pos; /*!< the maximum position (in bytes)
+ of the range */
+};
+
/** The match positions and tokesn to match */
-struct fts_phrase_struct {
+struct fts_phrase_t {
ibool found; /*!< Match result */
const fts_match_t*
@@ -188,23 +199,26 @@ struct fts_phrase_struct {
CHARSET_INFO* charset; /*!< Phrase match charset */
mem_heap_t* heap; /*!< Heap for word processing */
ulint zip_size; /*!< row zip size */
+ fts_proximity_t*proximity_pos; /*!< position info for proximity
+ search verification. Records the min
+ and max position of words matched */
};
/** For storing the frequncy of a word/term in a document */
-struct fts_doc_freq_struct {
+struct fts_doc_freq_t {
doc_id_t doc_id; /*!< Document id */
ulint freq; /*!< Frequency of a word in a document */
};
/** To determine the word frequency per document. */
-struct fts_word_freq_struct {
+struct fts_word_freq_t {
byte* word; /*!< Word for which we need the freq,
it's allocated on the query heap */
ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
word frequencies. The elements are
of type fts_doc_freq_t */
- ulint doc_count; /*!< Total number of documents that
+ ib_uint64_t doc_count; /*!< Total number of documents that
contain this word */
double idf; /*!< Inverse document frequency */
};
@@ -257,37 +271,46 @@ search arguments to search the document again, thus "expand"
the search result set.
@return DB_SUCCESS if success, otherwise the error code */
static
-ulint
+dberr_t
fts_expand_query(
/*=============*/
dict_index_t* index, /*!< in: FTS index to search */
- fts_query_t* query); /*!< in: query result, to be freed
+ fts_query_t* query) /*!< in: query result, to be freed
by the client */
+ __attribute__((nonnull, warn_unused_result));
/*************************************************************//**
This function finds documents that contain all words in a
phrase or proximity search. And if proximity search, verify
-the words are close to each other enough, as in specified distance.
+the words are close enough to each other, as in specified distance.
This function is called for phrase and proximity search.
@return TRUE if documents are found, FALSE if otherwise */
static
ibool
-fts_check_phrase_proximity(
-/*=======================*/
- fts_query_t* query, /*!< in: query instance */
+fts_phrase_or_proximity_search(
+/*===========================*/
+ fts_query_t* query, /*!< in/out: query instance
+ query->doc_ids might be instantiated
+ with qualified doc IDs */
ib_vector_t* tokens); /*!< in: Tokens contain words */
/*************************************************************//**
-This function check the words in result document are close to each
-other enough (within proximity rnage). This is used for proximity search.
-@return TRUE if words are close to each other, FALSE if otherwise */
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false if otherwise */
static
-ulint
-fts_proximity_check_position(
-/*=========================*/
- fts_match_t** match, /*!< in: query instance */
- ulint num_match, /*!< in: number of matching
- items */
- ulint distance); /*!< in: distance value
- for proximity search */
+bool
+fts_proximity_get_positions(
+/*========================*/
+ fts_match_t** match, /*!< in: query instance */
+ ulint num_match, /*!< in: number of matching
+ items */
+ ulint distance, /*!< in: distance value
+ for proximity search */
+ fts_proximity_t* qualified_pos); /*!< out: the position info
+ records ranges containing
+ all matching words. */
#if 0
/********************************************************************
Get the total number of words in a documents. */
@@ -954,8 +977,8 @@ cont_search:
/*****************************************************************//**
Set difference.
@return DB_SUCCESS if all went well */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_difference(
/*=================*/
fts_query_t* query, /*!< in: query instance */
@@ -993,15 +1016,21 @@ fts_query_difference(
ut_a(index_cache != NULL);
/* Search the cache for a matching word first. */
- nodes = fts_cache_find_word(index_cache, token);
+ if (query->cur_node->term.wildcard
+ && query->flags != FTS_PROXIMITY
+ && query->flags != FTS_PHRASE) {
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ nodes = fts_cache_find_word(index_cache, token);
- for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
- const fts_node_t* node;
+ for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+ const fts_node_t* node;
- node = static_cast<const fts_node_t*>(
- ib_vector_get_const(nodes, i));
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
- fts_query_check_node(query, token, node);
+ fts_query_check_node(query, token, node);
+ }
}
rw_lock_x_unlock(&cache->lock);
@@ -1026,8 +1055,8 @@ fts_query_difference(
/*****************************************************************//**
Intersect the token doc ids with the current set.
@return DB_SUCCESS if all went well */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_intersect(
/*================*/
fts_query_t* query, /*!< in: query instance */
@@ -1216,8 +1245,8 @@ fts_query_cache(
/*****************************************************************//**
Set union.
@return DB_SUCCESS if all went well */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_union(
/*============*/
fts_query_t* query, /*!< in: query instance */
@@ -1248,13 +1277,7 @@ fts_query_union(
/* Single '%' would confuse parser in pars_like_rebind(). In addition,
our wildcard search only supports prefix search */
- if (*token->f_str == '%') {
- if (token->f_len == 1) {
- return(query->error);
- }
- token->f_str++;
- token->f_len--;
- }
+ ut_ad(*token->f_str != '%');
fts_query_cache(query, token);
@@ -1485,6 +1508,67 @@ fts_query_match_phrase_terms(
}
/*****************************************************************//**
+Callback function to count the number of words in position ranges,
+and see whether the word count is in specified "phrase->distance"
+@return true if the number of characters is less than the "distance" */
+static
+bool
+fts_proximity_is_word_in_range(
+/*===========================*/
+ const fts_phrase_t*
+ phrase, /*!< in: phrase with the search info */
+ byte* start, /*!< in: text to search */
+ ulint total_len) /*!< in: length of text */
+{
+ fts_proximity_t* proximity_pos = phrase->proximity_pos;
+
+ /* Search each matched position pair (with min and max positions)
+ and count the number of words in the range */
+ for (ulint i = 0; i < proximity_pos->n_pos; i++) {
+ ulint cur_pos = proximity_pos->min_pos[i];
+ ulint n_word = 0;
+
+ ut_ad(proximity_pos->max_pos[i] <= total_len);
+
+ /* Walk through words in the range and count them */
+ while (cur_pos <= proximity_pos->max_pos[i]) {
+ ulint len;
+ fts_string_t str;
+ ulint offset = 0;
+
+ len = innobase_mysql_fts_get_token(
+ phrase->charset,
+ start + cur_pos,
+ start + total_len, &str, &offset);
+
+ if (len == 0) {
+ break;
+ }
+
+ /* Advances position with "len" bytes */
+ cur_pos += len;
+
+ /* Record the number of words */
+ if (str.f_n_char > 0) {
+ n_word++;
+ }
+
+ if (n_word > phrase->distance) {
+ break;
+ }
+ }
+
+ /* Check if the number of words is less than specified
+ "distance" */
+ if (n_word && n_word <= phrase->distance) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/*****************************************************************//**
Callback function to fetch and search the document.
@return TRUE if matched else FALSE */
static
@@ -1594,31 +1678,77 @@ fts_query_fetch_document(
sel_node_t* node = static_cast<sel_node_t*>(row);
fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
ulint prev_len = 0;
+ ulint total_len = 0;
+ byte* document_text = NULL;
exp = node->select_list;
phrase->found = FALSE;
+ /* For proximity search, we will need to get the whole document
+ from all fields, so first count the total length of the document
+ from all the fields */
+ if (phrase->proximity_pos) {
+ while (exp) {
+ ulint field_len;
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ if (dfield_is_ext(dfield)) {
+ ulint local_len = dfield_get_len(dfield);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ field_len = mach_read_from_4(
+ data + local_len + BTR_EXTERN_LEN + 4);
+ } else {
+ field_len = dfield_get_len(dfield);
+ }
+
+ if (field_len != UNIV_SQL_NULL) {
+ total_len += field_len + 1;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ document_text = static_cast<byte*>(mem_heap_zalloc(
+ phrase->heap, total_len));
+
+ if (!document_text) {
+ return(FALSE);
+ }
+ }
+
+ exp = node->select_list;
+
while (exp) {
dfield_t* dfield = que_node_get_val(exp);
- void* data = NULL;
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
ulint cur_len;
if (dfield_is_ext(dfield)) {
data = btr_copy_externally_stored_field(
- &cur_len, static_cast<const byte*>(data),
- phrase->zip_size,
+ &cur_len, data, phrase->zip_size,
dfield_get_len(dfield), phrase->heap);
} else {
- data = dfield_get_data(dfield);
cur_len = dfield_get_len(dfield);
}
if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
- phrase->found =
- fts_query_match_phrase(
- phrase, static_cast<byte*>(data),
- cur_len, prev_len, phrase->heap);
+ if (phrase->proximity_pos) {
+ memcpy(document_text + prev_len, data, cur_len);
+ } else {
+ /* For phrase search */
+ phrase->found =
+ fts_query_match_phrase(
+ phrase,
+ static_cast<byte*>(data),
+ cur_len, prev_len,
+ phrase->heap);
+ }
}
if (phrase->found) {
@@ -1633,6 +1763,13 @@ fts_query_fetch_document(
exp = que_node_get_next(exp);
}
+ if (phrase->proximity_pos) {
+ ut_ad(prev_len <= total_len);
+
+ phrase->found = fts_proximity_is_word_in_range(
+ phrase, document_text, total_len);
+ }
+
return(phrase->found);
}
@@ -1689,13 +1826,12 @@ fts_query_select(
/********************************************************************
Read the rows from the FTS index, that match word and where the
-doc id is between first and last doc id. */
-static
-ulint
+doc id is between first and last doc id.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_find_term(
/*================*/
- /*!< out: DB_SUCCESS if all went well
- else error code */
fts_query_t* query, /*!< in: FTS query state */
que_t** graph, /*!< in: prepared statement */
const fts_string_t* word, /*!< in: the word to fetch */
@@ -1705,7 +1841,7 @@ fts_query_find_term(
ibool* found) /*!< out: TRUE if found else FALSE */
{
pars_info_t* info;
- ulint error;
+ dberr_t error;
fts_select_t select;
doc_id_t match_doc_id;
trx_t* trx = query->trx;
@@ -1830,19 +1966,18 @@ fts_query_sum(
}
/********************************************************************
-Calculate the total documents that contain a particular word (term). */
-static
-ulint
+Calculate the total documents that contain a particular word (term).
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_total_docs_containing_term(
/*=================================*/
- /*!< out: DB_SUCCESS if all went well
- else error code */
fts_query_t* query, /*!< in: FTS query state */
const fts_string_t* word, /*!< in: the word to check */
ulint* total) /*!< out: documents containing word */
{
pars_info_t* info;
- ulint error;
+ dberr_t error;
que_t* graph;
ulint selected;
trx_t* trx = query->trx;
@@ -1910,19 +2045,18 @@ fts_query_total_docs_containing_term(
}
/********************************************************************
-Get the total number of words in a documents. */
-static
-ulint
+Get the total number of words in a documents.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_terms_in_document(
/*========================*/
- /*!< out: DB_SUCCESS if all went well
- else error code */
fts_query_t* query, /*!< in: FTS query state */
doc_id_t doc_id, /*!< in: the word to check */
ulint* total) /*!< out: total words in document */
{
pars_info_t* info;
- ulint error;
+ dberr_t error;
que_t* graph;
doc_id_t read_doc_id;
trx_t* trx = query->trx;
@@ -1993,9 +2127,9 @@ fts_query_terms_in_document(
/*****************************************************************//**
Retrieve the document and match the phrase tokens.
-@return TRUE if matches else FALSE */
-static
-ulint
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_match_document(
/*=====================*/
ib_vector_t* tokens, /*!< in: phrase tokens */
@@ -2004,7 +2138,7 @@ fts_query_match_document(
ulint distance, /*!< in: proximity distance */
ibool* found) /*!< out: TRUE if phrase found */
{
- ulint error;
+ dberr_t error;
fts_phrase_t phrase;
memset(&phrase, 0x0, sizeof(phrase));
@@ -2025,8 +2159,8 @@ fts_query_match_document(
if (error != DB_SUCCESS) {
ut_print_timestamp(stderr);
- fprintf(stderr, "InnoDB: Error: (%lu) matching document.\n",
- error);
+ fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
+ ut_strerr(error));
} else {
*found = phrase.found;
}
@@ -2037,11 +2171,66 @@ fts_query_match_document(
}
/*****************************************************************//**
+This function fetches the original documents and count the
+words in between matching words to see that is in specified distance
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+bool
+fts_query_is_in_proximity_range(
+/*============================*/
+ const fts_query_t* query, /*!< in: query instance */
+ fts_match_t** match, /*!< in: query instance */
+ fts_proximity_t* qualified_pos) /*!< in: position info for
+ qualified ranges */
+{
+ fts_get_doc_t get_doc;
+ fts_cache_t* cache = query->index->table->fts->cache;
+ dberr_t err;
+ fts_phrase_t phrase;
+
+ memset(&get_doc, 0x0, sizeof(get_doc));
+ memset(&phrase, 0x0, sizeof(phrase));
+
+ rw_lock_x_lock(&cache->lock);
+ get_doc.index_cache = fts_find_index_cache(cache, query->index);
+ rw_lock_x_unlock(&cache->lock);
+ ut_a(get_doc.index_cache != NULL);
+
+ phrase.distance = query->distance;
+ phrase.charset = get_doc.index_cache->charset;
+ phrase.zip_size = dict_table_zip_size(
+ get_doc.index_cache->index->table);
+ phrase.heap = mem_heap_create(512);
+ phrase.proximity_pos = qualified_pos;
+ phrase.found = FALSE;
+
+ err = fts_doc_fetch_by_doc_id(
+ &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_fetch_document, &phrase);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error: (%s) in verification phase of proximity "
+ "search", ut_strerr(err));
+ }
+
+ /* Free the prepared statement. */
+ if (get_doc.get_document_graph) {
+ fts_que_graph_free(get_doc.get_document_graph);
+ get_doc.get_document_graph = NULL;
+ }
+
+ mem_heap_free(phrase.heap);
+
+ return(err == DB_SUCCESS && phrase.found);
+}
+
+/*****************************************************************//**
Iterate over the matched document ids and search the for the
actual phrase in the text.
@return DB_SUCCESS if all OK */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_search_phrase(
/*====================*/
fts_query_t* query, /*!< in: query instance */
@@ -2050,8 +2239,6 @@ fts_query_search_phrase(
ulint i;
fts_get_doc_t get_doc;
ulint n_matched;
- // FIXME: Debug code
- ulint searched = 0;
fts_cache_t* cache = query->index->table->fts->cache;
n_matched = ib_vector_size(query->matched);
@@ -2061,9 +2248,7 @@ fts_query_search_phrase(
rw_lock_x_lock(&cache->lock);
- // FIXME: We shouldn't have to cast here.
- get_doc.index_cache = (fts_index_cache_t*)
- fts_find_index_cache(cache, query->index);
+ get_doc.index_cache = fts_find_index_cache(cache, query->index);
/* Must find the index cache */
ut_a(get_doc.index_cache != NULL);
@@ -2089,9 +2274,6 @@ fts_query_search_phrase(
an earlier pass. */
if (match->doc_id != 0) {
- // FIXME: Debug code
- ++searched;
-
query->error = fts_query_match_document(
tokens, &get_doc,
match, query->distance, &found);
@@ -2119,18 +2301,14 @@ fts_query_search_phrase(
get_doc.get_document_graph = NULL;
}
- // FIXME: Debug code
- ut_print_timestamp(stderr);
- printf(" End: %lu, %lu\n", searched, ib_vector_size(query->matched));
-
return(query->error);
}
/*****************************************************************//**
Text/Phrase search.
-@return count of doc ids added */
-static
-ulint
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_phrase_search(
/*====================*/
fts_query_t* query, /*!< in: query instance */
@@ -2290,7 +2468,7 @@ fts_query_phrase_search(
/* If we are doing proximity search, verify the distance
between all words, and check they are in specified distance. */
if (query->flags & FTS_PROXIMITY) {
- fts_check_phrase_proximity(query, tokens);
+ fts_phrase_or_proximity_search(query, tokens);
} else {
ibool matched;
@@ -2301,7 +2479,7 @@ fts_query_phrase_search(
and then doing a search through the text. Isolated
testing shows this also helps in mitigating disruption
of the buffer cache. */
- matched = fts_check_phrase_proximity(query, tokens);
+ matched = fts_phrase_or_proximity_search(query, tokens);
query->matched = query->match_array[0];
/* Read the actual text in and search for the phrase. */
@@ -2329,8 +2507,8 @@ func_exit:
/*****************************************************************//**
Find the word and evaluate.
@return DB_SUCCESS if all went well */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_query_execute(
/*==============*/
fts_query_t* query, /*!< in: query instance */
@@ -2477,13 +2655,12 @@ fts_query_visitor(
/*****************************************************************//**
Process (nested) sub-expression, create a new result set to store the
sub-expression result by processing nodes under current sub-expression
-list. Merge the sub-expression result with that of parent expression list. */
-
-ulint
+list. Merge the sub-expression result with that of parent expression list.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
fts_ast_visit_sub_exp(
/*==================*/
- /*!< out: DB_SUCCESS if all
- went well */
fts_ast_node_t* node, /*!< in,out: current root node */
fts_ast_callback visitor, /*!< in: callback function */
void* arg) /*!< in,out: arg for callback */
@@ -2492,8 +2669,9 @@ fts_ast_visit_sub_exp(
fts_query_t* query = static_cast<fts_query_t*>(arg);
ib_rbt_t* parent_doc_ids;
ib_rbt_t* subexpr_doc_ids;
- ulint error = DB_SUCCESS;
+ dberr_t error = DB_SUCCESS;
ibool inited = query->inited;
+ bool will_be_ignored = false;
ut_a(node->type == FTS_AST_SUBEXP_LIST);
@@ -2521,7 +2699,8 @@ fts_ast_visit_sub_exp(
/* Process nodes in current sub-expression and store its
result set in query->doc_ids we created above. */
- error = fts_ast_visit(FTS_NONE, node->next, visitor, arg);
+ error = fts_ast_visit(FTS_NONE, node->next, visitor,
+ arg, &will_be_ignored);
/* Reinstate parent node state and prepare for merge. */
query->inited = inited;
@@ -2757,6 +2936,8 @@ fts_query_read_node(
ut_a(query->cur_node->type == FTS_AST_TERM ||
query->cur_node->type == FTS_AST_TEXT);
+ memset(&node, 0, sizeof(node));
+
/* Need to consider the wildcard search case, the word frequency
is created on the search string not the actual word. So we need
to assign the frequency on search string behalf. */
@@ -2879,8 +3060,8 @@ fts_query_calculate_idf(
/*====================*/
fts_query_t* query) /*!< in: Query state */
{
- const ib_rbt_node_t* node;
- double total_docs = query->total_docs;
+ const ib_rbt_node_t* node;
+ ib_uint64_t total_docs = query->total_docs;
/* We need to free any instances of fts_doc_freq_t that we
may have allocated. */
@@ -2893,7 +3074,7 @@ fts_query_calculate_idf(
word_freq = rbt_value(fts_word_freq_t, node);
if (word_freq->doc_count > 0) {
- if (total_docs == (double) word_freq->doc_count) {
+ if (total_docs == word_freq->doc_count) {
/* QP assume ranking > 0 if we find
a match. Since Log10(1) = 0, we cannot
make IDF a zero value if do find a
@@ -2907,10 +3088,13 @@ fts_query_calculate_idf(
}
}
- fprintf(stderr,"'%s' -> %lu/%lu %6.5lf\n",
- word_freq->word,
- query->total_docs, word_freq->doc_count,
- word_freq->idf);
+ if (fts_enable_diag_print) {
+ fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
+ " %6.5lf\n",
+ word_freq->word,
+ query->total_docs, word_freq->doc_count,
+ word_freq->idf);
+ }
}
}
@@ -3017,7 +3201,7 @@ fts_retrieve_ranking(
ranking = rbt_value(fts_ranking_t, parent.last);
- return (ranking->rank);
+ return(ranking->rank);
}
return(0);
@@ -3184,7 +3368,7 @@ fts_query_parse(
FTS Query entry point.
@return DB_SUCCESS if successful otherwise error code */
UNIV_INTERN
-ulint
+dberr_t
fts_query(
/*======*/
trx_t* trx, /*!< in: transaction */
@@ -3196,7 +3380,7 @@ fts_query(
fts_result_t** result) /*!< in/out: result doc ids */
{
fts_query_t query;
- ulint error;
+ dberr_t error = DB_SUCCESS;
byte* lc_query_str;
ulint lc_query_str_len;
ulint result_len;
@@ -3204,6 +3388,7 @@ fts_query(
trx_t* query_trx;
CHARSET_INFO* charset;
ulint start_time_ms;
+ bool will_be_ignored = false;
boolean_mode = flags & FTS_BOOL;
@@ -3237,20 +3422,24 @@ fts_query(
/* Setup the RB tree that will be used to collect per term
statistics. */
query.word_freqs = rbt_create_arg_cmp(
- sizeof(fts_word_freq_t), innobase_fts_string_cmp, charset);
+ sizeof(fts_word_freq_t), innobase_fts_string_cmp,
+ (void*) charset);
- query.total_docs = fts_get_total_document_count(index->table);
+ query.total_docs = dict_table_get_n_rows(index->table);
- error = fts_get_total_word_count(trx, query.index, &query.total_words);
+#ifdef FTS_DOC_STATS_DEBUG
+ if (ft_enable_diag_print) {
+ error = fts_get_total_word_count(
+ trx, query.index, &query.total_words);
- if (error != DB_SUCCESS) {
- goto func_exit;
- }
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
-#ifdef FTS_INTERNAL_DIAG_PRINT
- fprintf(stderr, "Total docs: %lu Total words: %lu\n",
- query.total_docs, query.total_words);
-#endif
+ fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
+ query.total_docs, query.total_words);
+ }
+#endif /* FTS_DOC_STATS_DEBUG */
query.fts_common_table.suffix = "DELETED";
@@ -3299,13 +3488,14 @@ fts_query(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
/* Parse the input query string. */
- if (fts_query_parse(&query, lc_query_str, query_len)) {
+ if (fts_query_parse(&query, lc_query_str, result_len)) {
fts_ast_node_t* ast = query.root;
/* Traverse the Abstract Syntax Tree (AST) and execute
the query. */
query.error = fts_ast_visit(
- FTS_NONE, ast, fts_query_visitor, &query);
+ FTS_NONE, ast, fts_query_visitor,
+ &query, &will_be_ignored);
/* If query expansion is requested, extend the search
with first search pass result */
@@ -3453,8 +3643,8 @@ words in documents found in the first search pass will be used as
search arguments to search the document again, thus "expand"
the search result set.
@return DB_SUCCESS if success, otherwise the error code */
-static
-ulint
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
fts_expand_query(
/*=============*/
dict_index_t* index, /*!< in: FTS index to search */
@@ -3463,7 +3653,7 @@ fts_expand_query(
const ib_rbt_node_t* node;
const ib_rbt_node_t* token_node;
fts_doc_t result_doc;
- ulint error = DB_SUCCESS;
+ dberr_t error = DB_SUCCESS;
const fts_index_cache_t*index_cache;
/* If no doc is found in first search pass, return */
@@ -3482,7 +3672,7 @@ fts_expand_query(
result_doc.tokens = rbt_create_arg_cmp(
sizeof(fts_token_t), innobase_fts_text_cmp,
- index_cache->charset);
+ (void*) index_cache->charset);
result_doc.charset = index_cache->charset;
@@ -3557,14 +3747,16 @@ fts_expand_query(
/*************************************************************//**
This function finds documents that contain all words in a
phrase or proximity search. And if proximity search, verify
-the words are close to each other enough, as in specified distance.
+the words are close enough to each other, as in specified distance.
This function is called for phrase and proximity search.
@return TRUE if documents are found, FALSE if otherwise */
static
ibool
-fts_check_phrase_proximity(
-/*=======================*/
- fts_query_t* query, /*!< in: query instance */
+fts_phrase_or_proximity_search(
+/*===========================*/
+ fts_query_t* query, /*!< in/out: query instance.
+ query->doc_ids might be instantiated
+ with qualified doc IDs */
ib_vector_t* tokens) /*!< in: Tokens contain words */
{
ulint n_matched;
@@ -3581,8 +3773,13 @@ fts_check_phrase_proximity(
walk through the list and find common documents that
contain all the matching words. */
for (i = 0; i < n_matched; i++) {
- ulint j;
- ulint k = 0;
+ ulint j;
+ ulint k = 0;
+ fts_proximity_t qualified_pos;
+ ulint qualified_pos_buf[MAX_PROXIMITY_ITEM * 2];
+
+ qualified_pos.min_pos = &qualified_pos_buf[0];
+ qualified_pos.max_pos = &qualified_pos_buf[MAX_PROXIMITY_ITEM];
match[0] = static_cast<fts_match_t*>(
ib_vector_get(query->match_array[0], i));
@@ -3647,24 +3844,31 @@ fts_check_phrase_proximity(
/* For this matching doc, we need to further
verify whether the words in the doc are close
- to each other, and with in distance specified
+ to each other, and within the distance specified
in the proximity search */
if (query->flags & FTS_PHRASE) {
matched = TRUE;
- } else if (fts_proximity_check_position(
- match, num_token, query->distance)) {
- ulint z;
- /* If so, mark we find a matching doc */
- fts_query_process_doc_id(query, match[0]->doc_id, 0);
+ } else if (fts_proximity_get_positions(
+ match, num_token, ULINT_MAX, &qualified_pos)) {
+
+ /* Fetch the original documents and count the
+ words in between matching words to see that is in
+ specified distance */
+ if (fts_query_is_in_proximity_range(
+ query, match, &qualified_pos)) {
+ /* If so, mark we find a matching doc */
+ fts_query_process_doc_id(
+ query, match[0]->doc_id, 0);
- matched = TRUE;
- for (z = 0; z < num_token; z++) {
- fts_string_t* token;
- token = static_cast<fts_string_t*>(
- ib_vector_get(tokens, z));
- fts_query_add_word_to_document(
- query, match[0]->doc_id,
- token->f_str);
+ matched = TRUE;
+ for (ulint z = 0; z < num_token; z++) {
+ fts_string_t* token;
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, z));
+ fts_query_add_word_to_document(
+ query, match[0]->doc_id,
+ token->f_str);
+ }
}
}
@@ -3678,24 +3882,32 @@ func_exit:
}
/*************************************************************//**
-This function check the words in result document are close to each
-other (within proximity range). This is used for proximity search.
-@return TRUE if words are close to each other, FALSE if otherwise */
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false if otherwise */
static
-ulint
-fts_proximity_check_position(
-/*=========================*/
- fts_match_t** match, /*!< in: query instance */
- ulint num_match, /*!< in: number of matching
- items */
- ulint distance) /*!< in: distance value
- for proximity search */
+bool
+fts_proximity_get_positions(
+/*========================*/
+ fts_match_t** match, /*!< in: query instance */
+ ulint num_match, /*!< in: number of matching
+ items */
+ ulint distance, /*!< in: distance value
+ for proximity search */
+ fts_proximity_t* qualified_pos) /*!< out: the position info
+ records ranges containing
+ all matching words. */
{
ulint i;
ulint idx[MAX_PROXIMITY_ITEM];
ulint num_pos[MAX_PROXIMITY_ITEM];
ulint min_idx;
+ qualified_pos->n_pos = 0;
+
ut_a(num_match < MAX_PROXIMITY_ITEM);
/* Each word could appear multiple times in a doc. So
@@ -3747,14 +3959,21 @@ fts_proximity_check_position(
find a good match */
if (max_pos - min_pos <= distance
&& (i >= num_match || position[i] != ULINT_UNDEFINED)) {
- return(TRUE);
- } else {
- /* Otherwise, move to the next position is the
- list for the word with the smallest position */
- idx[min_idx]++;
+ /* The charset has variable character
+ length encoding, record the min_pos and
+ max_pos, we will need to verify the actual
+ number of characters */
+ qualified_pos->min_pos[qualified_pos->n_pos] = min_pos;
+ qualified_pos->max_pos[qualified_pos->n_pos] = max_pos;
+ qualified_pos->n_pos++;
}
+
+ /* Otherwise, move to the next position is the
+ list for the word with the smallest position */
+ idx[min_idx]++;
}
- /* Failed to find all words within the range for the doc */
- return(FALSE);
+ ut_ad(qualified_pos->n_pos <= MAX_PROXIMITY_ITEM);
+
+ return(qualified_pos->n_pos != 0);
}