diff options
author | Philip Van Hoof <philip@codeminded.be> | 2014-12-01 14:07:16 +0100 |
---|---|---|
committer | Philip Van Hoof <philip@codeminded.be> | 2014-12-01 14:07:16 +0100 |
commit | 5d05f68da10ef5a2c25fba4246120ce525035d51 (patch) | |
tree | f40db3e46953ef15f0dc7db5a5685711cbd79f18 /src/libtracker-fts | |
parent | 8d64b9783d3aebff2d9259068c402a9e1c1a81e4 (diff) | |
download | tracker-5d05f68da10ef5a2c25fba4246120ce525035d51.tar.gz |
Refactor tracker-parser to be located in libtracker-common
This also fixes the unaccenting SPARQL function in case of --disable-tracker-fts
Diffstat (limited to 'src/libtracker-fts')
-rw-r--r-- | src/libtracker-fts/Makefile.am | 13 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-fts-tokenizer.c | 3 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser-libicu.c | 749 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser-libunistring.c | 546 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser-utils.c | 91 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser-utils.h | 82 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser.h | 60 |
7 files changed, 3 insertions, 1541 deletions
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am index e583c391c..86b7ac34d 100644 --- a/src/libtracker-fts/Makefile.am +++ b/src/libtracker-fts/Makefile.am @@ -33,23 +33,12 @@ libtracker_fts_la_SOURCES = \ tracker-fts-config.c \ tracker-fts-config.h \ tracker-fts-tokenizer.c \ - tracker-fts-tokenizer.h \ - tracker-parser-utils.c \ - tracker-parser-utils.h \ - tracker-parser.h + tracker-fts-tokenizer.h if !HAVE_BUILTIN_FTS libtracker_fts_la_SOURCES += $(fts4_sources) endif -if BUILD_LIBUNISTRING_PARSER - libtracker_fts_la_SOURCES += tracker-parser-libunistring.c -endif - -if BUILD_LIBICU_PARSER - libtracker_fts_la_SOURCES += tracker-parser-libicu.c -endif - libtracker_fts_la_LIBADD = \ $(top_builddir)/src/libtracker-common/libtracker-common.la \ $(BUILD_LIBS) \ diff --git a/src/libtracker-fts/tracker-fts-tokenizer.c b/src/libtracker-fts/tracker-fts-tokenizer.c index d61ae79e2..c45d73d9a 100644 --- a/src/libtracker-fts/tracker-fts-tokenizer.c +++ b/src/libtracker-fts/tracker-fts-tokenizer.c @@ -26,9 +26,10 @@ #include <assert.h> #include <string.h> +#include <libtracker-common/tracker-parser.h> + #include "tracker-fts-tokenizer.h" #include "tracker-fts-config.h" -#include "tracker-parser.h" #include "fts3_tokenizer.h" typedef struct TrackerTokenizer TrackerTokenizer; diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c deleted file mode 100644 index b26722c96..000000000 --- a/src/libtracker-fts/tracker-parser-libicu.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org> - * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301 USA - */ - -#include "config.h" - -#include <stdio.h> -#include <string.h> -#include <locale.h> - -#include <unicode/utypes.h> -#include <unicode/ucnv.h> -#include <unicode/ubrk.h> -#include <unicode/ustring.h> -#include <unicode/uchar.h> -#include <unicode/unorm.h> - -#include "tracker-parser.h" -#include "tracker-parser-utils.h" - -/* Type of words detected */ -typedef enum { - TRACKER_PARSER_WORD_TYPE_ASCII, - TRACKER_PARSER_WORD_TYPE_OTHER_UNAC, - TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC, -} TrackerParserWordType; - -/* Max possible length of a UChar encoded string (just a safety limit) */ -#define WORD_BUFFER_LENGTH 512 - -struct TrackerParser { - const gchar *txt; - gint txt_size; - - TrackerLanguage *language; - guint max_word_length; - gboolean enable_stemmer; - gboolean enable_unaccent; - gboolean ignore_stop_words; - gboolean ignore_reserved_words; - gboolean ignore_numbers; - gboolean enable_forced_wordbreaks; - - /* Private members */ - gchar *word; - gint word_length; - guint word_position; - - /* Text as UChars */ - UChar *utxt; - gint utxt_size; - /* Original offset of each UChar in the input txt string */ - gint32 *offsets; - - /* The word-break iterator */ - UBreakIterator *bi; - - /* Cursor, as index of the utxt array of bytes */ - gsize cursor; -}; - - -static gboolean -get_word_info (const UChar *word, - gsize word_length, - gboolean ignore_numbers, - gboolean *p_is_allowed_word_start, - TrackerParserWordType *p_word_type) -{ - UCharIterator iter; - UChar32 unichar; - guint8 unichar_gc; - - /* Get first character of the word as UCS4 */ - uiter_setString (&iter, word, word_length); - unichar = uiter_current32 (&iter); - if (unichar == U_SENTINEL) { - return FALSE; - } - - /* We only want the words where the first character - * in the word is either a letter, a number or a symbol. - * - * This is needed because the word break algorithm also - * considers word breaks after for example commas or other - * punctuation marks. - * - * Note that looking at the first character in the string - * should be compatible with all Unicode normalization - * methods. - */ - unichar_gc = u_charType (unichar); - if (unichar_gc == U_UPPERCASE_LETTER || - unichar_gc == U_LOWERCASE_LETTER || - unichar_gc == U_TITLECASE_LETTER || - unichar_gc == U_MODIFIER_LETTER || - unichar_gc == U_OTHER_LETTER || - IS_UNDERSCORE_UCS4 ((guint32)unichar) || - (!ignore_numbers && - (unichar_gc == U_DECIMAL_DIGIT_NUMBER || - unichar_gc == U_LETTER_NUMBER || - unichar_gc == U_OTHER_NUMBER))) { - *p_is_allowed_word_start = TRUE; - } else { - *p_is_allowed_word_start = FALSE; - return TRUE; - } - - /* Word starts with a CJK character? */ - if (IS_CJK_UCS4 ((guint32)unichar)) { - *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; - return TRUE; - } - - /* Is ASCII-only string? */ - while (unichar != U_SENTINEL) { - if (!IS_ASCII_UCS4 ((guint32)unichar)) { - *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; - return TRUE; - } - unichar = uiter_next32 (&iter); - } - - *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; - return TRUE; -} - -/* The input word in this method MUST be normalized in NFKD form, - * and given in UChars, where str_length is the number of UChars - * (not the number of bytes) */ -gboolean -tracker_parser_unaccent_nfkd_string (gpointer str, - gsize *str_length) -{ - UChar *word; - gsize word_length; - gsize i; - gsize j; - - g_return_val_if_fail (str != NULL, FALSE); - g_return_val_if_fail (str_length != NULL, FALSE); - g_return_val_if_fail (*str_length > 0, FALSE); - - word = (UChar *)str; - word_length = *str_length; - - i = 0; - j = 0; - while (i < word_length) { - UChar32 unichar; - gint utf16_len; /* given in UChars */ - gsize aux_i; - - /* Get next character of the word as UCS4 */ - aux_i = i; - U16_NEXT (word, aux_i, word_length, unichar); - utf16_len = aux_i - i; - - /* Invalid UTF-16 character or end of original string. */ - if (utf16_len <= 0) { - break; - } - - /* If the given unichar is a combining diacritical mark, - * just update the original index, not the output one */ - if (IS_CDM_UCS4 ((guint32) unichar)) { - i += utf16_len; - continue; - } - - /* If already found a previous combining - * diacritical mark, indexes are different so - * need to copy characters. As output and input - * buffers may overlap, need to use memmove - * instead of memcpy */ - if (i != j) { - memmove (&word[j], &word[i], sizeof (UChar) * utf16_len); - } - - /* Update both indexes */ - i += utf16_len; - j += utf16_len; - } - - /* Force proper string end */ - word[j] = (UChar) 0; - - /* Set new output length */ - *str_length = j; - - return TRUE; -} - -static gchar * -convert_UChar_to_utf8 (const UChar *word, - gsize uchar_len, - gsize *utf8_len) -{ - gchar *utf8_str; - UErrorCode icu_error = U_ZERO_ERROR; - UConverter *converter; - gsize new_utf8_len; - - g_return_val_if_fail (word, NULL); - g_return_val_if_fail (utf8_len, NULL); - - /* Open converter UChar to UTF-16BE */ - converter = ucnv_open ("UTF-8", &icu_error); - if (!converter) { - g_warning ("Cannot open UTF-8 converter: '%s'", - U_FAILURE (icu_error) ? u_errorName (icu_error) : "none"); - return NULL; - } - - /* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes - * in UTF-8. */ - utf8_str = g_malloc (2 * uchar_len * sizeof (UChar) + 1); - - /* Convert from UChar to UTF-8 (NIL-terminated) */ - new_utf8_len = ucnv_fromUChars (converter, - utf8_str, - 2 * uchar_len * sizeof (UChar) + 1, - word, - uchar_len, - &icu_error); - if (U_FAILURE (icu_error)) { - g_warning ("Cannot convert from UChar to UTF-8: '%s'", - u_errorName (icu_error)); - g_free (utf8_str); - ucnv_close (converter); - return NULL; - } - - *utf8_len = new_utf8_len; - ucnv_close (converter); - - return utf8_str; -} - -static gchar * -process_word_uchar (TrackerParser *parser, - const UChar *word, - gint length, - TrackerParserWordType type, - gboolean *stop_word) -{ - UErrorCode error = U_ZERO_ERROR; - UChar normalized_buffer[WORD_BUFFER_LENGTH]; - gchar *utf8_str = NULL; - gsize new_word_length; - - /* Log original word */ - tracker_parser_message_hex ("ORIGINAL word", - (guint8 *)word, - length * sizeof (UChar)); - - - if (type != TRACKER_PARSER_WORD_TYPE_ASCII) { - UChar casefolded_buffer [WORD_BUFFER_LENGTH]; - - /* Casefold... */ - new_word_length = u_strFoldCase (casefolded_buffer, - WORD_BUFFER_LENGTH, - word, - length, - U_FOLD_CASE_DEFAULT, - &error); - if (U_FAILURE (error)) { - g_warning ("Error casefolding: '%s'", - u_errorName (error)); - return NULL; - } - if (new_word_length > WORD_BUFFER_LENGTH) - new_word_length = WORD_BUFFER_LENGTH; - - /* Log after casefolding */ - tracker_parser_message_hex (" After Casefolding", - (guint8 *)casefolded_buffer, - new_word_length * sizeof (UChar)); - - /* NFKD normalization... */ - new_word_length = unorm_normalize (casefolded_buffer, - new_word_length, - UNORM_NFKD, - 0, - normalized_buffer, - WORD_BUFFER_LENGTH, - &error); - if (U_FAILURE (error)) { - g_warning ("Error normalizing: '%s'", - u_errorName (error)); - return NULL; - } - - if (new_word_length > WORD_BUFFER_LENGTH) - new_word_length = WORD_BUFFER_LENGTH; - - /* Log after casefolding */ - tracker_parser_message_hex (" After Normalization", - (guint8 *) normalized_buffer, - new_word_length * sizeof (UChar)); - } else { - /* For ASCII-only, just tolower() each character */ - new_word_length = u_strToLower (normalized_buffer, - WORD_BUFFER_LENGTH, - word, - length, - NULL, - &error); - if (U_FAILURE (error)) { - g_warning ("Error lowercasing: '%s'", - u_errorName (error)); - return NULL; - } - - /* Log after casefolding */ - tracker_parser_message_hex (" After lowercase", - (guint8 *) normalized_buffer, - new_word_length * sizeof (UChar)); - } - - /* UNAC stripping needed? (for non-CJK and non-ASCII) */ - if (parser->enable_unaccent && - type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && - tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) { - /* Log after unaccenting */ - tracker_parser_message_hex (" After UNAC", - (guint8 *) normalized_buffer, - new_word_length * sizeof (UChar)); - } - - /* Finally, convert to UTF-8 */ - utf8_str = convert_UChar_to_utf8 (normalized_buffer, - new_word_length, - &new_word_length); - - /* Log after unaccenting */ - tracker_parser_message_hex (" After UTF8 conversion", - utf8_str, - new_word_length); - - /* Check if stop word */ - if (parser->ignore_stop_words) { - *stop_word = tracker_language_is_stop_word (parser->language, - utf8_str); - } - - /* Stemming needed? */ - if (utf8_str && - parser->enable_stemmer) { - gchar *stemmed; - - /* Input for stemmer ALWAYS in UTF-8, as well as output */ - stemmed = tracker_language_stem_word (parser->language, - utf8_str, - new_word_length); - - /* Log after stemming */ - tracker_parser_message_hex (" After stemming", - stemmed, strlen (stemmed)); - - /* If stemmed wanted and succeeded, free previous and return it */ - if (stemmed) { - g_free (utf8_str); - return stemmed; - } - } - - return utf8_str; -} - -static gboolean -parser_check_forced_wordbreaks (const UChar *buffer, - gsize current, - gsize *next) -{ - gsize unicode_word_length = *next - current; - gsize word_length = 0; - UCharIterator iter; - UChar32 unichar; - - uiter_setString (&iter, &buffer[current], unicode_word_length); - - /* Iterate over the string looking for forced word breaks */ - while ((unichar = uiter_next32 (&iter)) != U_SENTINEL && - word_length < unicode_word_length) { - - if (IS_FORCED_WORDBREAK_UCS4 ((guint32) unichar)) { - /* Support word starting with a forced wordbreak */ - if (word_length == 0) { - word_length = 1; - } - break; - } - - word_length ++; - } - - /* g_debug ("current: %" G_GSIZE_FORMAT ", " */ - /* "next: %" G_GSIZE_FORMAT ", " */ - /* "now: %" G_GSIZE_FORMAT, */ - /* current, */ - /* *next, */ - /* current + word_length); */ - - if (word_length != unicode_word_length) { - *next = current + word_length; - return TRUE; - } - return FALSE; -} - -static gboolean -parser_next (TrackerParser *parser, - gint *byte_offset_start, - gint *byte_offset_end, - gboolean *stop_word) -{ - gsize word_length_uchar = 0; - gsize word_length_utf8 = 0; - gchar *processed_word = NULL; - gsize current_word_offset_utf8; - - *byte_offset_start = 0; - *byte_offset_end = 0; - - g_return_val_if_fail (parser, FALSE); - - /* Loop to look for next valid word */ - while (!processed_word && - parser->cursor < parser->utxt_size) { - TrackerParserWordType type; - gboolean is_allowed; - gsize next_word_offset_uchar; - gsize next_word_offset_utf8; - gsize truncated_length; - - /* Set current word offset in the original UTF-8 string */ - current_word_offset_utf8 = parser->offsets[parser->cursor]; - - /* Find next word break. */ - next_word_offset_uchar = ubrk_next (parser->bi); - - /* Check if any forced wordbreaks here... */ - if (parser->enable_forced_wordbreaks) { - /* Returns TRUE if next word offset changed */ - if (parser_check_forced_wordbreaks (parser->utxt, - parser->cursor, - &next_word_offset_uchar)) { - /* We need to reset the iterator so that next word - * actually returns the same result */ - ubrk_previous (parser->bi); - } - } - - if (next_word_offset_uchar >= parser->utxt_size) { - /* Last word support... */ - next_word_offset_uchar = parser->utxt_size; - next_word_offset_utf8 = parser->txt_size; - } else { - next_word_offset_utf8 = parser->offsets[next_word_offset_uchar]; - } - - /* Word end is the first byte after the word, which is either the - * start of next word or the end of the string */ - word_length_uchar = next_word_offset_uchar - parser->cursor; - word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8; - - /* g_debug ("word_length_uchar: %" G_GSIZE_FORMAT, word_length_uchar); */ - /* g_debug ("next_word_offset_uchar: %" G_GSIZE_FORMAT, next_word_offset_uchar); */ - /* g_debug ("current_word_offset_uchar: %" G_GSIZE_FORMAT, parser->cursor); */ - /* g_debug ("word_length_utf8: %" G_GSIZE_FORMAT, word_length_utf8); */ - /* g_debug ("next_word_offset_utf8: %" G_GSIZE_FORMAT, next_word_offset_utf8); */ - /* g_debug ("current_word_offset_utf8: %" G_GSIZE_FORMAT, current_word_offset_utf8); */ - - /* Ignore the word if longer than the maximum allowed */ - if (word_length_utf8 >= parser->max_word_length) { - /* Ignore this word and keep on looping */ - parser->cursor = next_word_offset_uchar; - continue; - } - - /* Get word info... */ - if (!get_word_info (&parser->utxt[parser->cursor], - word_length_uchar, - parser->ignore_numbers, - &is_allowed, - &type)) { - /* Quit loop just in case */ - parser->cursor = parser->utxt_size; - break; - } - - /* Ignore the word if not an allowed word start */ - if (!is_allowed) { - /* Ignore this word and keep on looping */ - parser->cursor = next_word_offset_uchar; - continue; - } - - /* check if word is reserved (looking at ORIGINAL UTF-8 buffer here! */ - if (parser->ignore_reserved_words && - tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8], - word_length_utf8)) { - /* Ignore this word and keep on looping */ - parser->cursor = next_word_offset_uchar; - continue; - } - - /* compute truncated word length (in UChar bytes) if needed (to - * avoid extremely long words) */ - truncated_length = (word_length_uchar < 2 * WORD_BUFFER_LENGTH ? - word_length_uchar : - 2 * WORD_BUFFER_LENGTH); - - /* Process the word here. If it fails, we can still go - * to the next one. Returns newly allocated UTF-8 - * string always. - * Enable UNAC stripping only if no ASCII and no CJK - * Note we are passing UChar encoded string here! - */ - processed_word = process_word_uchar (parser, - &(parser->utxt[parser->cursor]), - truncated_length, - type, - stop_word); - if (!processed_word) { - /* Ignore this word and keep on looping */ - parser->cursor = next_word_offset_uchar; - continue; - } - } - - /* If we got a word here, set output */ - if (processed_word) { - /* Set outputs */ - *byte_offset_start = current_word_offset_utf8; - *byte_offset_end = current_word_offset_utf8 + word_length_utf8; - - /* Update cursor */ - parser->cursor += word_length_uchar; - - parser->word_length = strlen (processed_word); - parser->word = processed_word; - - return TRUE; - } - - /* No more words... */ - return FALSE; -} - -TrackerParser * -tracker_parser_new (TrackerLanguage *language) -{ - TrackerParser *parser; - - g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL); - - parser = g_new0 (TrackerParser, 1); - - parser->language = g_object_ref (language); - - return parser; -} - -void -tracker_parser_free (TrackerParser *parser) -{ - g_return_if_fail (parser != NULL); - - if (parser->language) { - g_object_unref (parser->language); - } - - if (parser->bi) { - ubrk_close (parser->bi); - } - - g_free (parser->utxt); - g_free (parser->offsets); - - g_free (parser->word); - - g_free (parser); -} - -void -tracker_parser_reset (TrackerParser *parser, - const gchar *txt, - gint txt_size, - guint max_word_length, - gboolean enable_stemmer, - gboolean enable_unaccent, - gboolean ignore_stop_words, - gboolean ignore_reserved_words, - gboolean ignore_numbers) -{ - UErrorCode error = U_ZERO_ERROR; - UConverter *converter; - UChar *last_uchar; - const gchar *last_utf8; - - g_return_if_fail (parser != NULL); - g_return_if_fail (txt != NULL); - - parser->max_word_length = max_word_length; - parser->enable_stemmer = enable_stemmer; - parser->enable_unaccent = enable_unaccent; - parser->ignore_stop_words = ignore_stop_words; - parser->ignore_reserved_words = ignore_reserved_words; - parser->ignore_numbers = ignore_numbers; - - /* Note: We're forcing some unicode characters to behave - * as wordbreakers: e.g, the '.' The main reason for this - * is to enable FTS searches matching file extension. */ - parser->enable_forced_wordbreaks = TRUE; - - parser->txt_size = txt_size; - parser->txt = txt; - - g_free (parser->word); - parser->word = NULL; - - if (parser->bi) { - ubrk_close (parser->bi); - parser->bi = NULL; - } - g_free (parser->utxt); - parser->utxt = NULL; - g_free (parser->offsets); - parser->offsets = NULL; - - parser->word_position = 0; - - parser->cursor = 0; - - /* Open converter UTF-8 to UChar */ - converter = ucnv_open ("UTF-8", &error); - if (!converter) { - g_warning ("Cannot open UTF-8 converter: '%s'", - U_FAILURE (error) ? u_errorName (error) : "none"); - return; - } - - /* Allocate UChars and offsets buffers */ - parser->utxt_size = txt_size + 1; - parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar)); - parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32)); - - /* last_uchar and last_utf8 will be also an output parameter! */ - last_uchar = parser->utxt; - last_utf8 = parser->txt; - - /* Convert to UChars storing offsets */ - ucnv_toUnicode (converter, - &last_uchar, - &parser->utxt[txt_size], - &last_utf8, - &parser->txt[txt_size], - parser->offsets, - FALSE, - &error); - if (U_SUCCESS (error)) { - /* Proper UChar array size is now given by 'last_uchar' */ - parser->utxt_size = last_uchar - parser->utxt; - - /* Open word-break iterator */ - parser->bi = ubrk_open(UBRK_WORD, - setlocale (LC_CTYPE, NULL), - parser->utxt, - parser->utxt_size, - &error); - if (U_SUCCESS (error)) { - /* Find FIRST word in the UChar array */ - parser->cursor = ubrk_first (parser->bi); - } - } - - /* If any error happened, reset buffers */ - if (U_FAILURE (error)) { - g_warning ("Error initializing libicu support: '%s'", - u_errorName (error)); - /* Reset buffers */ - g_free (parser->utxt); - parser->utxt = NULL; - g_free (parser->offsets); - parser->offsets = NULL; - parser->utxt_size = 0; - if (parser->bi) { - ubrk_close (parser->bi); - parser->bi = NULL; - } - } - - /* Close converter */ - ucnv_close (converter); -} - -const gchar * -tracker_parser_next (TrackerParser *parser, - gint *position, - gint *byte_offset_start, - gint *byte_offset_end, - gboolean *stop_word, - gint *word_length) -{ - const gchar *str; - gint byte_start = 0, byte_end = 0; - - str = NULL; - - g_free (parser->word); - parser->word = NULL; - - *stop_word = FALSE; - - if (parser_next (parser, &byte_start, &byte_end, stop_word)) { - str = parser->word; - } - - if (!*stop_word) { - parser->word_position++; - } - - *word_length = parser->word_length; - *position = parser->word_position; - *byte_offset_start = byte_start; - *byte_offset_end = byte_end; - - return str; -} - diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c deleted file mode 100644 index 9de6e46f7..000000000 --- a/src/libtracker-fts/tracker-parser-libunistring.c +++ /dev/null @@ -1,546 +0,0 @@ -/* - * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org> - * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301 USA - */ - -#include "config.h" - -#include <stdio.h> -#include <string.h> - -/* libunistring versions prior to 9.1.2 need this hack */ -#define _UNUSED_PARAMETER_ -#include <unistr.h> -#include <uniwbrk.h> -#include <unictype.h> -#include <unicase.h> - -#include "tracker-parser.h" -#include "tracker-parser-utils.h" - -/* Type of words detected */ -typedef enum { - TRACKER_PARSER_WORD_TYPE_ASCII, - TRACKER_PARSER_WORD_TYPE_OTHER_UNAC, - TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC, -} TrackerParserWordType; - -/* Max possible length of a UTF-8 encoded string (just a safety limit) */ -#define WORD_BUFFER_LENGTH 512 - -struct TrackerParser { - const gchar *txt; - gint txt_size; - - TrackerLanguage *language; - guint max_word_length; - gboolean enable_stemmer; - gboolean enable_unaccent; - gboolean ignore_stop_words; - gboolean ignore_reserved_words; - gboolean ignore_numbers; - gboolean enable_forced_wordbreaks; - - /* Private members */ - gchar *word; - gint word_length; - guint word_position; - - /* Cursor, as index of the input array of bytes */ - gsize cursor; - /* libunistring flags array */ - gchar *word_break_flags; - /* general category of the start character in words */ - uc_general_category_t allowed_start; -}; - -static gboolean -get_word_info (TrackerParser *parser, - gsize *p_word_length, - gboolean *p_is_allowed_word_start, - TrackerParserWordType *p_word_type) -{ - ucs4_t first_unichar; - gint first_unichar_len; - gboolean ascii_only; - - /* Defaults */ - *p_is_allowed_word_start = TRUE; - - /* Get first character of the word as UCS4 */ - first_unichar_len = u8_strmbtouc (&first_unichar, - &(parser->txt[parser->cursor])); - if (first_unichar_len <= 0) { - /* This should only happen if NIL was passed to u8_strmbtouc, - * so better just force stop here */ - return FALSE; - } else { - /* If first character has length 1, it's ASCII-7 */ - ascii_only = first_unichar_len == 1 ? TRUE : FALSE; - } - - /* Consider word starts with a forced wordbreak */ - if (parser->enable_forced_wordbreaks && - IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) { - *p_word_length = first_unichar_len; - } else { - gsize i; - - /* Find next word break, and in the same loop checking if only ASCII - * characters */ - i = parser->cursor + first_unichar_len; - while (1) { - /* Text bounds reached? */ - if (i >= parser->txt_size) - break; - /* Proper unicode word break detected? */ - if (parser->word_break_flags[i]) - break; - /* Forced word break detected? */ - if (parser->enable_forced_wordbreaks && - IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i])) - break; - - if (ascii_only && - !IS_ASCII_UCS4 ((guint32)parser->txt[i])) { - ascii_only = FALSE; - } - - i++; - } - - /* Word end is the first byte after the word, which is either the - * start of next word or the end of the string */ - *p_word_length = i - parser->cursor; - } - - /* We only want the words where the first character - * in the word is either a letter, a number or a symbol. - * This is needed because the word break algorithm also - * considers word breaks after for example commas or other - * punctuation marks. - * Note that looking at the first character in the string - * should be compatible with all Unicode normalization - * methods. - */ - if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) && - !uc_is_general_category (first_unichar, - parser->allowed_start)) { - *p_is_allowed_word_start = FALSE; - return TRUE; - } - - /* Decide word type */ - if (ascii_only) { - *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; - } else if (IS_CJK_UCS4 (first_unichar)) { - *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; - } else { - *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; - } - return TRUE; -} - -/* The input word in this method MUST be normalized in NFKD form, - * and given in UTF-8, where str_length is the byte-length */ -gboolean -tracker_parser_unaccent_nfkd_string (gpointer str, - gsize *str_length) -{ - gchar *word; - gsize word_length; - gsize i; - gsize j; - - g_return_val_if_fail (str != NULL, FALSE); - g_return_val_if_fail (str_length != NULL, FALSE); - g_return_val_if_fail (*str_length > 0, FALSE); - - word = (gchar *)str; - word_length = *str_length; - - i = 0; - j = 0; - while (i < word_length) { - ucs4_t unichar; - gint utf8_len; - - /* Get next character of the word as UCS4 */ - utf8_len = u8_strmbtouc (&unichar, &word[i]); - - /* Invalid UTF-8 character or end of original string. */ - if (utf8_len <= 0) { - break; - } - - /* If the given unichar is a combining diacritical mark, - * just update the original index, not the output one */ - if (IS_CDM_UCS4 ((guint32) unichar)) { - i += utf8_len; - continue; - } - - /* If already found a previous combining - * diacritical mark, indexes are different so - * need to copy characters. As output and input - * buffers may overlap, need to use memmove - * instead of memcpy */ - if (i != j) { - memmove (&word[j], &word[i], utf8_len); - } - - /* Update both indexes */ - i += utf8_len; - j += utf8_len; - } - - /* Force proper string end */ - word[j] = '\0'; - - /* Set new output length */ - *str_length = j; - - return TRUE; -} - -static gchar * -process_word_utf8 (TrackerParser *parser, - const gchar *word, - gint length, - TrackerParserWordType type, - gboolean *stop_word) -{ - gchar word_buffer [WORD_BUFFER_LENGTH]; - gchar *normalized = NULL; - gchar *stemmed = NULL; - size_t new_word_length; - - g_return_val_if_fail (parser != NULL, NULL); - g_return_val_if_fail (word != NULL, NULL); - - /* If length is set as -1, the input word MUST be NIL-terminated. - * Otherwise, this restriction is not needed as the length to process - * is given as input argument */ - if (length < 0) { - length = strlen (word); - } - - /* Log original word */ - tracker_parser_message_hex ("ORIGINAL word", - word, length); - - /* Normalization and case-folding ONLY for non-ASCII */ - if (type != TRACKER_PARSER_WORD_TYPE_ASCII) { - /* Leave space for last NIL */ - new_word_length = WORD_BUFFER_LENGTH - 1; - - /* Casefold and NFKD normalization in output. - * NOTE: if the output buffer is not big enough, u8_casefold will - * return a newly-allocated buffer. */ - normalized = u8_casefold ((const uint8_t *)word, - length, - uc_locale_language (), - UNINORM_NFKD, - word_buffer, - &new_word_length); - - /* Case folding + Normalization failed, ignore this word */ - g_return_val_if_fail (normalized != NULL, NULL); - - /* If output buffer is not the same as the one passed to - * u8_casefold, we know it was newly-allocated, so need - * to resize it in 1 byte to add last NIL */ - if (normalized != word_buffer) { - normalized = g_realloc (normalized, new_word_length + 1); - } - - /* Log after Normalization */ - tracker_parser_message_hex (" After Casefolding and NFKD normalization", - normalized, new_word_length); - } else { - /* For ASCII-only, just tolower() each character */ - gsize i; - - normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer; - - for (i = 0; i < length; i++) { - normalized[i] = g_ascii_tolower (word[i]); - } - - new_word_length = length; - - /* Log after tolower */ - tracker_parser_message_hex (" After Lowercasing", - normalized, new_word_length); - } - - /* Set output NIL */ - normalized[new_word_length] = '\0'; - - /* UNAC stripping needed? (for non-CJK and non-ASCII) */ - if (parser->enable_unaccent && - type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && - tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) { - /* Log after UNAC stripping */ - tracker_parser_message_hex (" After UNAC stripping", - normalized, new_word_length); - } - - /* Check if stop word */ - if (parser->ignore_stop_words) { - *stop_word = tracker_language_is_stop_word (parser->language, - normalized); - } - - /* Stemming needed? */ - if (parser->enable_stemmer) { - stemmed = tracker_language_stem_word (parser->language, - normalized, - new_word_length); - - /* Log after stemming */ - tracker_parser_message_hex (" After stemming", - stemmed, strlen (stemmed)); - } - - /* If stemmed wanted and succeeded, free previous and return it */ - if (stemmed) { - if (normalized != word_buffer) { - g_free (normalized); - } - return stemmed; - } - - /* It may be the case that no stripping and no stemming was needed, and - * that the output buffer in stack was enough for case-folding and - * normalization. In this case, need to strdup() the string to return it */ - return normalized == word_buffer ? g_strdup (word_buffer) : normalized; -} - -static gboolean -parser_next (TrackerParser *parser, - gint *byte_offset_start, - gint *byte_offset_end, - gboolean *stop_word) -{ - gsize word_length = 0; - gchar *processed_word = NULL; - - *byte_offset_start = 0; - *byte_offset_end = 0; - - g_return_val_if_fail (parser, FALSE); - - /* Loop to look for next valid word */ - while (!processed_word && - parser->cursor < parser->txt_size) { - TrackerParserWordType type; - gsize truncated_length; - gboolean is_allowed; - - /* Get word info */ - if (!get_word_info (parser, - &word_length, - &is_allowed, - &type)) { - /* Quit loop just in case */ - parser->cursor = parser->txt_size; - break; - } - - /* Ignore the word if not an allowed word start */ - if (!is_allowed) { - /* Ignore this word and keep on looping */ - parser->cursor += word_length; - continue; - } - - /* Ignore the word if longer than the maximum allowed */ - if (word_length >= parser->max_word_length) { - /* Ignore this word and keep on looping */ - parser->cursor += word_length; - continue; - } - - /* check if word is reserved and ignore it if so */ - if (parser->ignore_reserved_words && - tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor], - word_length)) { - /* Ignore this word and keep on looping */ - parser->cursor += word_length; - continue; - } - - /* compute truncated word length if needed (to avoid extremely - * long words)*/ - truncated_length = (word_length < WORD_BUFFER_LENGTH ? - word_length : - WORD_BUFFER_LENGTH - 1); - - /* Process the word here. If it fails, we can still go - * to the next one. Returns newly allocated string - * always */ - processed_word = process_word_utf8 (parser, - &(parser->txt[parser->cursor]), - truncated_length, - type, - stop_word); - if (!processed_word) { - /* Ignore this word and keep on looping */ - parser->cursor += word_length; - continue; - } - } - - /* If we got a word here, set output */ - if (processed_word) { - /* Set outputs */ - *byte_offset_start = parser->cursor; - *byte_offset_end = parser->cursor + word_length; - - /* Update cursor */ - parser->cursor += word_length; - - parser->word_length = strlen (processed_word); - parser->word = processed_word; - - return TRUE; - } - - /* No more words... */ - return FALSE; -} - -TrackerParser * -tracker_parser_new (TrackerLanguage *language) -{ - TrackerParser *parser; - - g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL); - - parser = g_new0 (TrackerParser, 1); - - parser->language = g_object_ref (language); - - return parser; -} - -void -tracker_parser_free (TrackerParser *parser) -{ - g_return_if_fail (parser != NULL); - - if (parser->language) { - g_object_unref (parser->language); - } - - g_free (parser->word_break_flags); - - g_free (parser->word); - - g_free (parser); -} - -void -tracker_parser_reset (TrackerParser *parser, - const gchar *txt, - gint txt_size, - guint max_word_length, - gboolean enable_stemmer, - gboolean enable_unaccent, - gboolean ignore_stop_words, - gboolean ignore_reserved_words, - gboolean ignore_numbers) -{ - g_return_if_fail (parser != NULL); - g_return_if_fail (txt != NULL); - - parser->max_word_length = max_word_length; - parser->enable_stemmer = enable_stemmer; - parser->enable_unaccent = enable_unaccent; - parser->ignore_stop_words = ignore_stop_words; - parser->ignore_reserved_words = ignore_reserved_words; - parser->ignore_numbers = ignore_numbers; - - /* Note: We're forcing some unicode characters to behave - * as wordbreakers: e.g, the '.' The main reason for this - * is to enable FTS searches matching file extension. */ - parser->enable_forced_wordbreaks = TRUE; - - parser->txt_size = txt_size; - parser->txt = txt; - - g_free (parser->word); - parser->word = NULL; - - parser->word_position = 0; - - parser->cursor = 0; - - g_free (parser->word_break_flags); - - /* Create array of flags, same size as original text. */ - parser->word_break_flags = g_malloc (txt_size); - - /* Get wordbreak flags in the whole string */ - u8_wordbreaks ((const uint8_t *)txt, - (size_t) txt_size, - (char *)parser->word_break_flags); - - /* Prepare a custom category which is a combination of the - * desired ones */ - parser->allowed_start = UC_LETTER; - if (!parser->ignore_numbers) { - parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER); - } -} - -const gchar * -tracker_parser_next (TrackerParser *parser, - gint *position, - gint *byte_offset_start, - gint *byte_offset_end, - gboolean *stop_word, - gint *word_length) -{ - const gchar *str; - gint byte_start = 0, byte_end = 0; - - str = NULL; - - g_free (parser->word); - parser->word = NULL; - - *stop_word = FALSE; - - if (parser_next (parser, &byte_start, &byte_end, stop_word)) { - str = parser->word; - } - - if (!*stop_word) { - parser->word_position++; - } - - *word_length = parser->word_length; - *position = parser->word_position; - *byte_offset_start = byte_start; - *byte_offset_end = byte_end; - - return str; -} - diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c deleted file mode 100644 index dec597747..000000000 --- a/src/libtracker-fts/tracker-parser-utils.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (C) 2010, Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301 USA - */ - -#include "config.h" - -#include <string.h> - -#include <libtracker-common/tracker-utils.h> - -#include "tracker-parser-utils.h" - -/* - * Definition of the possible reserved words. - * Length of word is explicitly given to avoid strlen() calls - */ -typedef struct { - const gchar *word; - gsize word_length; -} TrackerParserReservedWord; - -static const TrackerParserReservedWord reserved_words[] = { - { "or", 2 }, - { NULL, 0 } -}; - -gboolean -tracker_parser_is_reserved_word_utf8 (const gchar *word, - gsize word_length) -{ - gint i = 0; - - /* Loop the array of predefined reserved words */ - while (reserved_words[i].word != NULL) { - if (word_length == reserved_words[i].word_length && - strncmp (word, - reserved_words[i].word, - word_length) == 0) { - return TRUE; - } - i++; - } - - return FALSE; -} - - -#if TRACKER_PARSER_DEBUG_HEX -void -tracker_parser_message_hex (const gchar *message, - const gchar *str, - gsize str_length) -{ - gchar *hex_aux; - gchar *str_aux; - - g_return_if_fail (message); - g_return_if_fail (str); - g_return_if_fail (str_length != 0); - - /* String may not come NIL-terminated */ - str_aux = g_malloc (str_length + 1); - memcpy (str_aux, str, str_length); - str_aux[str_length] = '\0'; - - /* Get hexadecimal representation of the input string */ - hex_aux = tracker_strhex (str, str_length, ':'); - - /* Log it */ - g_message ("%s: '%s' (%s)", - message, str_aux, hex_aux); - - g_free (str_aux); - g_free (hex_aux); -} -#endif diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h deleted file mode 100644 index 614740f81..000000000 --- a/src/libtracker-fts/tracker-parser-utils.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (C) 2010, Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301 USA - */ - -#ifndef __TRACKER_PARSER_UTILS_H__ -#define __TRACKER_PARSER_UTILS_H__ - -#include "config.h" - -#include <glib.h> - -#ifdef HAVE_LIBICU -#include <unicode/utypes.h> -#endif - -G_BEGIN_DECLS - -/* ASCII-7 is in range [0x00,0x7F] */ -#define IS_ASCII_UCS4(c) ((c) <= 0x7F) - -/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */ -#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \ - ((c) >= 0x4E00 && (c) <= 0x9FA5) || \ - ((c) >= 0x20000 && (c) <= 0x2A6D6)) - -/* ASCII undescore? */ -#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F) - -/* Combining diacritical mark? - * Basic range: [0x0300,0x036F] - * Supplement: [0x1DC0,0x1DFF] - * For Symbols: [0x20D0,0x20FF] - * Half marks: [0xFE20,0xFE2F] - */ -#define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F) || \ - ((c) >= 0x1DC0 && (c) <= 0x1DFF) || \ - ((c) >= 0x20D0 && (c) <= 0x20FF) || \ - ((c) >= 0xFE20 && (c) <= 0xFE2F)) - -/* Forced word breaks in Unicode parsers. - * If any of these is found INSIDE a properly delimited Unicode word, a new word - * break is forced and the Unicode word is split in two words. - * Current forced wordbreaks: - * - 0x002E: DOT ('.') - */ -#define IS_FORCED_WORDBREAK_UCS4(c) ((c) == 0x002E) - - -gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word, - gsize word_length); - - -/* Define to 1 if you want to enable debugging logs showing HEX contents - * of the words being parsed */ -#define TRACKER_PARSER_DEBUG_HEX 0 - -#if TRACKER_PARSER_DEBUG_HEX -void tracker_parser_message_hex (const gchar *message, - const gchar *str, - gsize str_length); -#else -#define tracker_parser_message_hex(a,b,c) -#endif - -G_END_DECLS - -#endif /* __TRACKER_PARSER_UTILS_H__ */ diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h deleted file mode 100644 index e6cb10e06..000000000 --- a/src/libtracker-fts/tracker-parser.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org> - * Copyright (C) 2008, Nokia <ivan.frade@nokia.com> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301 USA - */ - -#ifndef __LIBTRACKER_FTS_PARSER_H__ -#define __LIBTRACKER_FTS_PARSER_H__ - -#include <glib.h> - -#include <libtracker-common/tracker-language.h> - -G_BEGIN_DECLS - -typedef struct TrackerParser TrackerParser; - -TrackerParser *tracker_parser_new (TrackerLanguage *language); - -void tracker_parser_reset (TrackerParser *parser, - const gchar *txt, - gint txt_size, - guint max_word_length, - gboolean enable_stemmer, - gboolean enable_unaccent, - gboolean ignore_stop_words, - gboolean ignore_reserved_words, - gboolean ignore_numbers); - -const gchar * tracker_parser_next (TrackerParser *parser, - gint *position, - gint *byte_offset_start, - gint *byte_offset_end, - gboolean *stop_word, - gint *word_length); - -void tracker_parser_free (TrackerParser *parser); - -/* Other helper methods */ - -gboolean tracker_parser_unaccent_nfkd_string (gpointer str, - gsize *str_length); - -G_END_DECLS - -#endif /* __LIBTRACKER_FTS_PARSER_H__ */ |