diff options
author | Mikael Ottela <mikael.ottela@ixonos.com> | 2010-02-25 14:43:56 +0100 |
---|---|---|
committer | Jürg Billeter <j@bitron.ch> | 2010-02-25 15:01:20 +0100 |
commit | 46e49d7845cd2846e933bf816fe7473403c7e0f8 (patch) | |
tree | 811295d3e9393d73585a713c088e955397901fd6 | |
parent | 2e29b7812a488dc8d294427888d41f99249c461e (diff) | |
download | tracker-46e49d7845cd2846e933bf816fe7473403c7e0f8.tar.gz |
libtracker-fts: Do not limit word length in prefix queries
Index short words for properties that specify tracker:fulltextNoLimit.
Limit the word length in exact match queries but not in prefix ones.
-rw-r--r-- | src/libtracker-data/tracker-data-update.c | 7 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-fts.c | 63 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-fts.h | 2 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser.c | 9 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser.h | 6 |
5 files changed, 48 insertions, 39 deletions
diff --git a/src/libtracker-data/tracker-data-update.c b/src/libtracker-data/tracker-data-update.c index a574ad61e..f2713b71d 100644 --- a/src/libtracker-data/tracker-data-update.c +++ b/src/libtracker-data/tracker-data-update.c @@ -666,7 +666,9 @@ tracker_data_resource_buffer_flush (GError **error) g_string_append (fts, g_value_get_string (g_value_array_get_nth (values, i))); g_string_append_c (fts, ' '); } - tracker_fts_update_text (resource_buffer->id, tracker_data_query_resource_id (tracker_property_get_uri (prop)), fts->str); + tracker_fts_update_text (resource_buffer->id, + tracker_data_query_resource_id (tracker_property_get_uri (prop)), + fts->str, !tracker_property_get_fulltext_no_limit (prop)); g_string_free (fts, TRUE); } } @@ -1036,7 +1038,8 @@ get_old_property_values (TrackerProperty *property, /* delete old fts entries */ for (i = 0; i < old_values->n_values; i++) { tracker_fts_update_text (resource_buffer->id, -1, - g_value_get_string (g_value_array_get_nth (old_values, i))); + g_value_get_string (g_value_array_get_nth (old_values, i)), + !tracker_property_get_fulltext_no_limit (prop)); } } } diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c index 2b8964932..b80f10267 100644 --- a/src/libtracker-fts/tracker-fts.c +++ b/src/libtracker-fts/tracker-fts.c @@ -2323,6 +2323,7 @@ struct fulltext_vtab { TrackerParser *parser; /* tokenizer for inserts and queries */ gboolean stop_words; int max_words; + int min_word_length; /* Precompiled statements which we keep as long as the table is ** open. @@ -3358,8 +3359,8 @@ static int constructVtab( max_len = tracker_fts_config_get_max_word_length (config); v->max_words = tracker_fts_config_get_max_words_to_index (config); - - v->parser = tracker_parser_new (language, max_len, min_len); + v->min_word_length = min_len; + v->parser = tracker_parser_new (language, max_len); /* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests */ v->stop_words = g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") != 0; @@ -4332,14 +4333,19 @@ static int tokenizeSegment( pToken = tracker_parser_next (parser, &iPos, - &iBegin, - &iEnd, - &stop_word, - &nToken); + &iBegin, + &iEnd, + &stop_word, + &nToken); if (!pToken) { break; } + /* If prefix search ignore the word lenght limit */ + if( nToken < v->min_word_length && !(iEnd<nSegment && pSegment[iEnd]=='*') ){ + continue; + } + // printf("token being indexed is %s, pos is %d, begin is %d, end is %d and length is %d\n", pToken, iPos, iBegin, iEnd, nToken); if( !inPhrase && @@ -4363,10 +4369,10 @@ static int tokenizeSegment( continue; } if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4 - && pToken[0]=='n' - && pToken[1]=='e' - && pToken[2]=='a' - && pToken[3]=='r' + && pToken[0]=='n' + && pToken[1]=='e' + && pToken[2]=='a' + && pToken[3]=='r' ){ QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1]; if( (iBegin+6)<nSegment @@ -4380,10 +4386,10 @@ static int tokenizeSegment( iEnd++; } pToken = tracker_parser_next (parser, &iPos, - &iBegin, - &iEnd, - &stop_word, - &nToken); + &iBegin, + &iEnd, + &stop_word, + &nToken); if (!pToken) { break; } @@ -4756,7 +4762,8 @@ static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid, #ifdef STORE_CATEGORY int Catid, #endif - const char *zText, int iColumn){ + const char *zText, int iColumn, + gboolean limit_word_length){ const char *pToken; int nTokenBytes; @@ -4773,24 +4780,24 @@ int Catid, while( 1 ){ pToken = tracker_parser_next (parser, &iPosition, - &iStartOffset, - &iEndOffset, - &stop_word, - &nTokenBytes); + &iStartOffset, + &iEndOffset, + &stop_word, + &nTokenBytes); if (!pToken) { break; } + if (limit_word_length && nTokenBytes < v->min_word_length) { + continue; + } + // printf("token being indexed is %s, begin is %d, end is %d and length is %d\n", pToken, iStartOffset, iEndOffset, nTokenBytes); if (stop_word) { continue; } - - - - /* Positions can't be negative; we use -1 as a terminator * internally. Token can't be NULL or empty. */ if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){ @@ -4947,7 +4954,7 @@ static int index_update(fulltext_vtab *v, sqlite_int64 iRow, /* tracker - as for col id we want col 0 to be the default metadata field (file:contents or email:body) , col 1 to be meatdata id 1, col 2 to be metadat id 2 etc so need to decrement i here */ - int rc = buildTerms(v, iRow, sqlite3_value_int (pValues[0]), zText, delete ? -1 : (i-1)); + int rc = buildTerms(v, iRow, sqlite3_value_int (pValues[0]), zText, delete ? -1 : (i-1), TRUE); if( rc!=SQLITE_OK ) return rc; } @@ -4955,7 +4962,7 @@ static int index_update(fulltext_vtab *v, sqlite_int64 iRow, for(i = 0; i < v->nColumn ; ++i){ char *zText = (char*)sqlite3_value_text(pValues[i]); - rc = buildTerms(v, iRow, zText, delete ? -1 : i); + rc = buildTerms(v, iRow, zText, delete ? -1 : i, TRUE); if( rc!=SQLITE_OK ) return rc; } @@ -7775,8 +7782,10 @@ int tracker_fts_update_init(int id){ return initPendingTerms(tracker_fts_vtab, id); } -int tracker_fts_update_text(int id, int column_id, const char *text){ - return buildTerms(tracker_fts_vtab, id, text, column_id); +int tracker_fts_update_text(int id, int column_id, + const char *text, gboolean limit_word_length){ + return buildTerms(tracker_fts_vtab, id, text, + column_id, limit_word_length); } void tracker_fts_update_commit(void){ diff --git a/src/libtracker-fts/tracker-fts.h b/src/libtracker-fts/tracker-fts.h index b39c4c573..491404d6f 100644 --- a/src/libtracker-fts/tracker-fts.h +++ b/src/libtracker-fts/tracker-fts.h @@ -24,7 +24,7 @@ G_BEGIN_DECLS int tracker_fts_init (sqlite3 *db); int tracker_fts_update_init (int id); -int tracker_fts_update_text (int id, int column_id, const char *text); +int tracker_fts_update_text (int id, int column_id, const char *text, gboolean limit_word_length); void tracker_fts_update_commit (void); void tracker_fts_update_rollback (void); gchar * tracker_fts_get_create_fts_table_query (void); diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c index cf1d1e34d..84eda5a2e 100644 --- a/src/libtracker-fts/tracker-parser.c +++ b/src/libtracker-fts/tracker-parser.c @@ -79,7 +79,6 @@ struct TrackerParser { gboolean enable_stop_words; guint max_words_to_index; guint max_word_length; - guint min_word_length; gboolean delimit_words; gboolean parse_reserved_words; @@ -323,7 +322,6 @@ parser_next (TrackerParser *parser, } if (!is_valid || - length < parser->min_word_length || word_type == TRACKER_PARSER_WORD_NUM) { word_type = TRACKER_PARSER_WORD_IGNORE; is_valid = TRUE; @@ -460,21 +458,18 @@ parser_next (TrackerParser *parser, TrackerParser * tracker_parser_new (TrackerLanguage *language, - gint max_word_length, - gint min_word_length) + gint max_word_length) { TrackerParser *parser; g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL); - g_return_val_if_fail (min_word_length > 0, NULL); - g_return_val_if_fail (min_word_length < max_word_length, NULL); + g_return_val_if_fail (max_word_length > 0, NULL); parser = g_new0 (TrackerParser, 1); parser->language = g_object_ref (language); parser->max_word_length = max_word_length; - parser->min_word_length = min_word_length; parser->word_length = 0; parser->attrs = NULL; diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h index 6e6b7fae4..f6503ac2e 100644 --- a/src/libtracker-fts/tracker-parser.h +++ b/src/libtracker-fts/tracker-parser.h @@ -30,8 +30,8 @@ G_BEGIN_DECLS typedef struct TrackerParser TrackerParser; TrackerParser *tracker_parser_new (TrackerLanguage *language, - gint max_word_length, - gint min_word_length); + gint max_word_length); + void tracker_parser_reset (TrackerParser *parser, const gchar *txt, gint txt_size, @@ -39,12 +39,14 @@ void tracker_parser_reset (TrackerParser *parser, gboolean enable_stemmer, gboolean enable_stop_words, gboolean parse_reserved_words); + const gchar * tracker_parser_next (TrackerParser *parser, gint *position, gint *byte_offset_start, gint *byte_offset_end, gboolean *stop_word, gint *word_length); + gchar * tracker_parser_process_word (TrackerParser *parser, const char *word, gint length, |