summaryrefslogtreecommitdiff
path: root/src/libtracker-fts
diff options
context:
space:
mode:
authorPhilip Van Hoof <philip@codeminded.be>2014-12-01 14:07:16 +0100
committerPhilip Van Hoof <philip@codeminded.be>2014-12-01 14:07:16 +0100
commit5d05f68da10ef5a2c25fba4246120ce525035d51 (patch)
treef40db3e46953ef15f0dc7db5a5685711cbd79f18 /src/libtracker-fts
parent8d64b9783d3aebff2d9259068c402a9e1c1a81e4 (diff)
downloadtracker-5d05f68da10ef5a2c25fba4246120ce525035d51.tar.gz
Refactor tracker-parser to be located in libtracker-common
This also fixes the unaccenting SPARQL function in case of --disable-tracker-fts
Diffstat (limited to 'src/libtracker-fts')
-rw-r--r--src/libtracker-fts/Makefile.am13
-rw-r--r--src/libtracker-fts/tracker-fts-tokenizer.c3
-rw-r--r--src/libtracker-fts/tracker-parser-libicu.c749
-rw-r--r--src/libtracker-fts/tracker-parser-libunistring.c546
-rw-r--r--src/libtracker-fts/tracker-parser-utils.c91
-rw-r--r--src/libtracker-fts/tracker-parser-utils.h82
-rw-r--r--src/libtracker-fts/tracker-parser.h60
7 files changed, 3 insertions, 1541 deletions
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index e583c391c..86b7ac34d 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -33,23 +33,12 @@ libtracker_fts_la_SOURCES = \
tracker-fts-config.c \
tracker-fts-config.h \
tracker-fts-tokenizer.c \
- tracker-fts-tokenizer.h \
- tracker-parser-utils.c \
- tracker-parser-utils.h \
- tracker-parser.h
+ tracker-fts-tokenizer.h
if !HAVE_BUILTIN_FTS
libtracker_fts_la_SOURCES += $(fts4_sources)
endif
-if BUILD_LIBUNISTRING_PARSER
- libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
-endif
-
-if BUILD_LIBICU_PARSER
- libtracker_fts_la_SOURCES += tracker-parser-libicu.c
-endif
-
libtracker_fts_la_LIBADD = \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(BUILD_LIBS) \
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.c b/src/libtracker-fts/tracker-fts-tokenizer.c
index d61ae79e2..c45d73d9a 100644
--- a/src/libtracker-fts/tracker-fts-tokenizer.c
+++ b/src/libtracker-fts/tracker-fts-tokenizer.c
@@ -26,9 +26,10 @@
#include <assert.h>
#include <string.h>
+#include <libtracker-common/tracker-parser.h>
+
#include "tracker-fts-tokenizer.h"
#include "tracker-fts-config.h"
-#include "tracker-parser.h"
#include "fts3_tokenizer.h"
typedef struct TrackerTokenizer TrackerTokenizer;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
deleted file mode 100644
index b26722c96..000000000
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-#include "config.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <locale.h>
-
-#include <unicode/utypes.h>
-#include <unicode/ucnv.h>
-#include <unicode/ubrk.h>
-#include <unicode/ustring.h>
-#include <unicode/uchar.h>
-#include <unicode/unorm.h>
-
-#include "tracker-parser.h"
-#include "tracker-parser-utils.h"
-
-/* Type of words detected */
-typedef enum {
- TRACKER_PARSER_WORD_TYPE_ASCII,
- TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
- TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
-} TrackerParserWordType;
-
-/* Max possible length of a UChar encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
-struct TrackerParser {
- const gchar *txt;
- gint txt_size;
-
- TrackerLanguage *language;
- guint max_word_length;
- gboolean enable_stemmer;
- gboolean enable_unaccent;
- gboolean ignore_stop_words;
- gboolean ignore_reserved_words;
- gboolean ignore_numbers;
- gboolean enable_forced_wordbreaks;
-
- /* Private members */
- gchar *word;
- gint word_length;
- guint word_position;
-
- /* Text as UChars */
- UChar *utxt;
- gint utxt_size;
- /* Original offset of each UChar in the input txt string */
- gint32 *offsets;
-
- /* The word-break iterator */
- UBreakIterator *bi;
-
- /* Cursor, as index of the utxt array of bytes */
- gsize cursor;
-};
-
-
-static gboolean
-get_word_info (const UChar *word,
- gsize word_length,
- gboolean ignore_numbers,
- gboolean *p_is_allowed_word_start,
- TrackerParserWordType *p_word_type)
-{
- UCharIterator iter;
- UChar32 unichar;
- guint8 unichar_gc;
-
- /* Get first character of the word as UCS4 */
- uiter_setString (&iter, word, word_length);
- unichar = uiter_current32 (&iter);
- if (unichar == U_SENTINEL) {
- return FALSE;
- }
-
- /* We only want the words where the first character
- * in the word is either a letter, a number or a symbol.
- *
- * This is needed because the word break algorithm also
- * considers word breaks after for example commas or other
- * punctuation marks.
- *
- * Note that looking at the first character in the string
- * should be compatible with all Unicode normalization
- * methods.
- */
- unichar_gc = u_charType (unichar);
- if (unichar_gc == U_UPPERCASE_LETTER ||
- unichar_gc == U_LOWERCASE_LETTER ||
- unichar_gc == U_TITLECASE_LETTER ||
- unichar_gc == U_MODIFIER_LETTER ||
- unichar_gc == U_OTHER_LETTER ||
- IS_UNDERSCORE_UCS4 ((guint32)unichar) ||
- (!ignore_numbers &&
- (unichar_gc == U_DECIMAL_DIGIT_NUMBER ||
- unichar_gc == U_LETTER_NUMBER ||
- unichar_gc == U_OTHER_NUMBER))) {
- *p_is_allowed_word_start = TRUE;
- } else {
- *p_is_allowed_word_start = FALSE;
- return TRUE;
- }
-
- /* Word starts with a CJK character? */
- if (IS_CJK_UCS4 ((guint32)unichar)) {
- *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
- return TRUE;
- }
-
- /* Is ASCII-only string? */
- while (unichar != U_SENTINEL) {
- if (!IS_ASCII_UCS4 ((guint32)unichar)) {
- *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
- return TRUE;
- }
- unichar = uiter_next32 (&iter);
- }
-
- *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
- return TRUE;
-}
-
-/* The input word in this method MUST be normalized in NFKD form,
- * and given in UChars, where str_length is the number of UChars
- * (not the number of bytes) */
-gboolean
-tracker_parser_unaccent_nfkd_string (gpointer str,
- gsize *str_length)
-{
- UChar *word;
- gsize word_length;
- gsize i;
- gsize j;
-
- g_return_val_if_fail (str != NULL, FALSE);
- g_return_val_if_fail (str_length != NULL, FALSE);
- g_return_val_if_fail (*str_length > 0, FALSE);
-
- word = (UChar *)str;
- word_length = *str_length;
-
- i = 0;
- j = 0;
- while (i < word_length) {
- UChar32 unichar;
- gint utf16_len; /* given in UChars */
- gsize aux_i;
-
- /* Get next character of the word as UCS4 */
- aux_i = i;
- U16_NEXT (word, aux_i, word_length, unichar);
- utf16_len = aux_i - i;
-
- /* Invalid UTF-16 character or end of original string. */
- if (utf16_len <= 0) {
- break;
- }
-
- /* If the given unichar is a combining diacritical mark,
- * just update the original index, not the output one */
- if (IS_CDM_UCS4 ((guint32) unichar)) {
- i += utf16_len;
- continue;
- }
-
- /* If already found a previous combining
- * diacritical mark, indexes are different so
- * need to copy characters. As output and input
- * buffers may overlap, need to use memmove
- * instead of memcpy */
- if (i != j) {
- memmove (&word[j], &word[i], sizeof (UChar) * utf16_len);
- }
-
- /* Update both indexes */
- i += utf16_len;
- j += utf16_len;
- }
-
- /* Force proper string end */
- word[j] = (UChar) 0;
-
- /* Set new output length */
- *str_length = j;
-
- return TRUE;
-}
-
-static gchar *
-convert_UChar_to_utf8 (const UChar *word,
- gsize uchar_len,
- gsize *utf8_len)
-{
- gchar *utf8_str;
- UErrorCode icu_error = U_ZERO_ERROR;
- UConverter *converter;
- gsize new_utf8_len;
-
- g_return_val_if_fail (word, NULL);
- g_return_val_if_fail (utf8_len, NULL);
-
- /* Open converter UChar to UTF-16BE */
- converter = ucnv_open ("UTF-8", &icu_error);
- if (!converter) {
- g_warning ("Cannot open UTF-8 converter: '%s'",
- U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
- return NULL;
- }
-
- /* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
- * in UTF-8. */
- utf8_str = g_malloc (2 * uchar_len * sizeof (UChar) + 1);
-
- /* Convert from UChar to UTF-8 (NIL-terminated) */
- new_utf8_len = ucnv_fromUChars (converter,
- utf8_str,
- 2 * uchar_len * sizeof (UChar) + 1,
- word,
- uchar_len,
- &icu_error);
- if (U_FAILURE (icu_error)) {
- g_warning ("Cannot convert from UChar to UTF-8: '%s'",
- u_errorName (icu_error));
- g_free (utf8_str);
- ucnv_close (converter);
- return NULL;
- }
-
- *utf8_len = new_utf8_len;
- ucnv_close (converter);
-
- return utf8_str;
-}
-
-static gchar *
-process_word_uchar (TrackerParser *parser,
- const UChar *word,
- gint length,
- TrackerParserWordType type,
- gboolean *stop_word)
-{
- UErrorCode error = U_ZERO_ERROR;
- UChar normalized_buffer[WORD_BUFFER_LENGTH];
- gchar *utf8_str = NULL;
- gsize new_word_length;
-
- /* Log original word */
- tracker_parser_message_hex ("ORIGINAL word",
- (guint8 *)word,
- length * sizeof (UChar));
-
-
- if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
- UChar casefolded_buffer [WORD_BUFFER_LENGTH];
-
- /* Casefold... */
- new_word_length = u_strFoldCase (casefolded_buffer,
- WORD_BUFFER_LENGTH,
- word,
- length,
- U_FOLD_CASE_DEFAULT,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error casefolding: '%s'",
- u_errorName (error));
- return NULL;
- }
- if (new_word_length > WORD_BUFFER_LENGTH)
- new_word_length = WORD_BUFFER_LENGTH;
-
- /* Log after casefolding */
- tracker_parser_message_hex (" After Casefolding",
- (guint8 *)casefolded_buffer,
- new_word_length * sizeof (UChar));
-
- /* NFKD normalization... */
- new_word_length = unorm_normalize (casefolded_buffer,
- new_word_length,
- UNORM_NFKD,
- 0,
- normalized_buffer,
- WORD_BUFFER_LENGTH,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error normalizing: '%s'",
- u_errorName (error));
- return NULL;
- }
-
- if (new_word_length > WORD_BUFFER_LENGTH)
- new_word_length = WORD_BUFFER_LENGTH;
-
- /* Log after casefolding */
- tracker_parser_message_hex (" After Normalization",
- (guint8 *) normalized_buffer,
- new_word_length * sizeof (UChar));
- } else {
- /* For ASCII-only, just tolower() each character */
- new_word_length = u_strToLower (normalized_buffer,
- WORD_BUFFER_LENGTH,
- word,
- length,
- NULL,
- &error);
- if (U_FAILURE (error)) {
- g_warning ("Error lowercasing: '%s'",
- u_errorName (error));
- return NULL;
- }
-
- /* Log after casefolding */
- tracker_parser_message_hex (" After lowercase",
- (guint8 *) normalized_buffer,
- new_word_length * sizeof (UChar));
- }
-
- /* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (parser->enable_unaccent &&
- type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
- tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) {
- /* Log after unaccenting */
- tracker_parser_message_hex (" After UNAC",
- (guint8 *) normalized_buffer,
- new_word_length * sizeof (UChar));
- }
-
- /* Finally, convert to UTF-8 */
- utf8_str = convert_UChar_to_utf8 (normalized_buffer,
- new_word_length,
- &new_word_length);
-
- /* Log after unaccenting */
- tracker_parser_message_hex (" After UTF8 conversion",
- utf8_str,
- new_word_length);
-
- /* Check if stop word */
- if (parser->ignore_stop_words) {
- *stop_word = tracker_language_is_stop_word (parser->language,
- utf8_str);
- }
-
- /* Stemming needed? */
- if (utf8_str &&
- parser->enable_stemmer) {
- gchar *stemmed;
-
- /* Input for stemmer ALWAYS in UTF-8, as well as output */
- stemmed = tracker_language_stem_word (parser->language,
- utf8_str,
- new_word_length);
-
- /* Log after stemming */
- tracker_parser_message_hex (" After stemming",
- stemmed, strlen (stemmed));
-
- /* If stemmed wanted and succeeded, free previous and return it */
- if (stemmed) {
- g_free (utf8_str);
- return stemmed;
- }
- }
-
- return utf8_str;
-}
-
-static gboolean
-parser_check_forced_wordbreaks (const UChar *buffer,
- gsize current,
- gsize *next)
-{
- gsize unicode_word_length = *next - current;
- gsize word_length = 0;
- UCharIterator iter;
- UChar32 unichar;
-
- uiter_setString (&iter, &buffer[current], unicode_word_length);
-
- /* Iterate over the string looking for forced word breaks */
- while ((unichar = uiter_next32 (&iter)) != U_SENTINEL &&
- word_length < unicode_word_length) {
-
- if (IS_FORCED_WORDBREAK_UCS4 ((guint32) unichar)) {
- /* Support word starting with a forced wordbreak */
- if (word_length == 0) {
- word_length = 1;
- }
- break;
- }
-
- word_length ++;
- }
-
- /* g_debug ("current: %" G_GSIZE_FORMAT ", " */
- /* "next: %" G_GSIZE_FORMAT ", " */
- /* "now: %" G_GSIZE_FORMAT, */
- /* current, */
- /* *next, */
- /* current + word_length); */
-
- if (word_length != unicode_word_length) {
- *next = current + word_length;
- return TRUE;
- }
- return FALSE;
-}
-
-static gboolean
-parser_next (TrackerParser *parser,
- gint *byte_offset_start,
- gint *byte_offset_end,
- gboolean *stop_word)
-{
- gsize word_length_uchar = 0;
- gsize word_length_utf8 = 0;
- gchar *processed_word = NULL;
- gsize current_word_offset_utf8;
-
- *byte_offset_start = 0;
- *byte_offset_end = 0;
-
- g_return_val_if_fail (parser, FALSE);
-
- /* Loop to look for next valid word */
- while (!processed_word &&
- parser->cursor < parser->utxt_size) {
- TrackerParserWordType type;
- gboolean is_allowed;
- gsize next_word_offset_uchar;
- gsize next_word_offset_utf8;
- gsize truncated_length;
-
- /* Set current word offset in the original UTF-8 string */
- current_word_offset_utf8 = parser->offsets[parser->cursor];
-
- /* Find next word break. */
- next_word_offset_uchar = ubrk_next (parser->bi);
-
- /* Check if any forced wordbreaks here... */
- if (parser->enable_forced_wordbreaks) {
- /* Returns TRUE if next word offset changed */
- if (parser_check_forced_wordbreaks (parser->utxt,
- parser->cursor,
- &next_word_offset_uchar)) {
- /* We need to reset the iterator so that next word
- * actually returns the same result */
- ubrk_previous (parser->bi);
- }
- }
-
- if (next_word_offset_uchar >= parser->utxt_size) {
- /* Last word support... */
- next_word_offset_uchar = parser->utxt_size;
- next_word_offset_utf8 = parser->txt_size;
- } else {
- next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
- }
-
- /* Word end is the first byte after the word, which is either the
- * start of next word or the end of the string */
- word_length_uchar = next_word_offset_uchar - parser->cursor;
- word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
-
- /* g_debug ("word_length_uchar: %" G_GSIZE_FORMAT, word_length_uchar); */
- /* g_debug ("next_word_offset_uchar: %" G_GSIZE_FORMAT, next_word_offset_uchar); */
- /* g_debug ("current_word_offset_uchar: %" G_GSIZE_FORMAT, parser->cursor); */
- /* g_debug ("word_length_utf8: %" G_GSIZE_FORMAT, word_length_utf8); */
- /* g_debug ("next_word_offset_utf8: %" G_GSIZE_FORMAT, next_word_offset_utf8); */
- /* g_debug ("current_word_offset_utf8: %" G_GSIZE_FORMAT, current_word_offset_utf8); */
-
- /* Ignore the word if longer than the maximum allowed */
- if (word_length_utf8 >= parser->max_word_length) {
- /* Ignore this word and keep on looping */
- parser->cursor = next_word_offset_uchar;
- continue;
- }
-
- /* Get word info... */
- if (!get_word_info (&parser->utxt[parser->cursor],
- word_length_uchar,
- parser->ignore_numbers,
- &is_allowed,
- &type)) {
- /* Quit loop just in case */
- parser->cursor = parser->utxt_size;
- break;
- }
-
- /* Ignore the word if not an allowed word start */
- if (!is_allowed) {
- /* Ignore this word and keep on looping */
- parser->cursor = next_word_offset_uchar;
- continue;
- }
-
- /* check if word is reserved (looking at ORIGINAL UTF-8 buffer here! */
- if (parser->ignore_reserved_words &&
- tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
- word_length_utf8)) {
- /* Ignore this word and keep on looping */
- parser->cursor = next_word_offset_uchar;
- continue;
- }
-
- /* compute truncated word length (in UChar bytes) if needed (to
- * avoid extremely long words) */
- truncated_length = (word_length_uchar < 2 * WORD_BUFFER_LENGTH ?
- word_length_uchar :
- 2 * WORD_BUFFER_LENGTH);
-
- /* Process the word here. If it fails, we can still go
- * to the next one. Returns newly allocated UTF-8
- * string always.
- * Enable UNAC stripping only if no ASCII and no CJK
- * Note we are passing UChar encoded string here!
- */
- processed_word = process_word_uchar (parser,
- &(parser->utxt[parser->cursor]),
- truncated_length,
- type,
- stop_word);
- if (!processed_word) {
- /* Ignore this word and keep on looping */
- parser->cursor = next_word_offset_uchar;
- continue;
- }
- }
-
- /* If we got a word here, set output */
- if (processed_word) {
- /* Set outputs */
- *byte_offset_start = current_word_offset_utf8;
- *byte_offset_end = current_word_offset_utf8 + word_length_utf8;
-
- /* Update cursor */
- parser->cursor += word_length_uchar;
-
- parser->word_length = strlen (processed_word);
- parser->word = processed_word;
-
- return TRUE;
- }
-
- /* No more words... */
- return FALSE;
-}
-
-TrackerParser *
-tracker_parser_new (TrackerLanguage *language)
-{
- TrackerParser *parser;
-
- g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
- parser = g_new0 (TrackerParser, 1);
-
- parser->language = g_object_ref (language);
-
- return parser;
-}
-
-void
-tracker_parser_free (TrackerParser *parser)
-{
- g_return_if_fail (parser != NULL);
-
- if (parser->language) {
- g_object_unref (parser->language);
- }
-
- if (parser->bi) {
- ubrk_close (parser->bi);
- }
-
- g_free (parser->utxt);
- g_free (parser->offsets);
-
- g_free (parser->word);
-
- g_free (parser);
-}
-
-void
-tracker_parser_reset (TrackerParser *parser,
- const gchar *txt,
- gint txt_size,
- guint max_word_length,
- gboolean enable_stemmer,
- gboolean enable_unaccent,
- gboolean ignore_stop_words,
- gboolean ignore_reserved_words,
- gboolean ignore_numbers)
-{
- UErrorCode error = U_ZERO_ERROR;
- UConverter *converter;
- UChar *last_uchar;
- const gchar *last_utf8;
-
- g_return_if_fail (parser != NULL);
- g_return_if_fail (txt != NULL);
-
- parser->max_word_length = max_word_length;
- parser->enable_stemmer = enable_stemmer;
- parser->enable_unaccent = enable_unaccent;
- parser->ignore_stop_words = ignore_stop_words;
- parser->ignore_reserved_words = ignore_reserved_words;
- parser->ignore_numbers = ignore_numbers;
-
- /* Note: We're forcing some unicode characters to behave
- * as wordbreakers: e.g, the '.' The main reason for this
- * is to enable FTS searches matching file extension. */
- parser->enable_forced_wordbreaks = TRUE;
-
- parser->txt_size = txt_size;
- parser->txt = txt;
-
- g_free (parser->word);
- parser->word = NULL;
-
- if (parser->bi) {
- ubrk_close (parser->bi);
- parser->bi = NULL;
- }
- g_free (parser->utxt);
- parser->utxt = NULL;
- g_free (parser->offsets);
- parser->offsets = NULL;
-
- parser->word_position = 0;
-
- parser->cursor = 0;
-
- /* Open converter UTF-8 to UChar */
- converter = ucnv_open ("UTF-8", &error);
- if (!converter) {
- g_warning ("Cannot open UTF-8 converter: '%s'",
- U_FAILURE (error) ? u_errorName (error) : "none");
- return;
- }
-
- /* Allocate UChars and offsets buffers */
- parser->utxt_size = txt_size + 1;
- parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
- parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));
-
- /* last_uchar and last_utf8 will be also an output parameter! */
- last_uchar = parser->utxt;
- last_utf8 = parser->txt;
-
- /* Convert to UChars storing offsets */
- ucnv_toUnicode (converter,
- &last_uchar,
- &parser->utxt[txt_size],
- &last_utf8,
- &parser->txt[txt_size],
- parser->offsets,
- FALSE,
- &error);
- if (U_SUCCESS (error)) {
- /* Proper UChar array size is now given by 'last_uchar' */
- parser->utxt_size = last_uchar - parser->utxt;
-
- /* Open word-break iterator */
- parser->bi = ubrk_open(UBRK_WORD,
- setlocale (LC_CTYPE, NULL),
- parser->utxt,
- parser->utxt_size,
- &error);
- if (U_SUCCESS (error)) {
- /* Find FIRST word in the UChar array */
- parser->cursor = ubrk_first (parser->bi);
- }
- }
-
- /* If any error happened, reset buffers */
- if (U_FAILURE (error)) {
- g_warning ("Error initializing libicu support: '%s'",
- u_errorName (error));
- /* Reset buffers */
- g_free (parser->utxt);
- parser->utxt = NULL;
- g_free (parser->offsets);
- parser->offsets = NULL;
- parser->utxt_size = 0;
- if (parser->bi) {
- ubrk_close (parser->bi);
- parser->bi = NULL;
- }
- }
-
- /* Close converter */
- ucnv_close (converter);
-}
-
-const gchar *
-tracker_parser_next (TrackerParser *parser,
- gint *position,
- gint *byte_offset_start,
- gint *byte_offset_end,
- gboolean *stop_word,
- gint *word_length)
-{
- const gchar *str;
- gint byte_start = 0, byte_end = 0;
-
- str = NULL;
-
- g_free (parser->word);
- parser->word = NULL;
-
- *stop_word = FALSE;
-
- if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
- str = parser->word;
- }
-
- if (!*stop_word) {
- parser->word_position++;
- }
-
- *word_length = parser->word_length;
- *position = parser->word_position;
- *byte_offset_start = byte_start;
- *byte_offset_end = byte_end;
-
- return str;
-}
-
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
deleted file mode 100644
index 9de6e46f7..000000000
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-#include "config.h"
-
-#include <stdio.h>
-#include <string.h>
-
-/* libunistring versions prior to 9.1.2 need this hack */
-#define _UNUSED_PARAMETER_
-#include <unistr.h>
-#include <uniwbrk.h>
-#include <unictype.h>
-#include <unicase.h>
-
-#include "tracker-parser.h"
-#include "tracker-parser-utils.h"
-
-/* Type of words detected */
-typedef enum {
- TRACKER_PARSER_WORD_TYPE_ASCII,
- TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
- TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
-} TrackerParserWordType;
-
-/* Max possible length of a UTF-8 encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
-struct TrackerParser {
- const gchar *txt;
- gint txt_size;
-
- TrackerLanguage *language;
- guint max_word_length;
- gboolean enable_stemmer;
- gboolean enable_unaccent;
- gboolean ignore_stop_words;
- gboolean ignore_reserved_words;
- gboolean ignore_numbers;
- gboolean enable_forced_wordbreaks;
-
- /* Private members */
- gchar *word;
- gint word_length;
- guint word_position;
-
- /* Cursor, as index of the input array of bytes */
- gsize cursor;
- /* libunistring flags array */
- gchar *word_break_flags;
- /* general category of the start character in words */
- uc_general_category_t allowed_start;
-};
-
-static gboolean
-get_word_info (TrackerParser *parser,
- gsize *p_word_length,
- gboolean *p_is_allowed_word_start,
- TrackerParserWordType *p_word_type)
-{
- ucs4_t first_unichar;
- gint first_unichar_len;
- gboolean ascii_only;
-
- /* Defaults */
- *p_is_allowed_word_start = TRUE;
-
- /* Get first character of the word as UCS4 */
- first_unichar_len = u8_strmbtouc (&first_unichar,
- &(parser->txt[parser->cursor]));
- if (first_unichar_len <= 0) {
- /* This should only happen if NIL was passed to u8_strmbtouc,
- * so better just force stop here */
- return FALSE;
- } else {
- /* If first character has length 1, it's ASCII-7 */
- ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
- }
-
- /* Consider word starts with a forced wordbreak */
- if (parser->enable_forced_wordbreaks &&
- IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
- *p_word_length = first_unichar_len;
- } else {
- gsize i;
-
- /* Find next word break, and in the same loop checking if only ASCII
- * characters */
- i = parser->cursor + first_unichar_len;
- while (1) {
- /* Text bounds reached? */
- if (i >= parser->txt_size)
- break;
- /* Proper unicode word break detected? */
- if (parser->word_break_flags[i])
- break;
- /* Forced word break detected? */
- if (parser->enable_forced_wordbreaks &&
- IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
- break;
-
- if (ascii_only &&
- !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
- ascii_only = FALSE;
- }
-
- i++;
- }
-
- /* Word end is the first byte after the word, which is either the
- * start of next word or the end of the string */
- *p_word_length = i - parser->cursor;
- }
-
- /* We only want the words where the first character
- * in the word is either a letter, a number or a symbol.
- * This is needed because the word break algorithm also
- * considers word breaks after for example commas or other
- * punctuation marks.
- * Note that looking at the first character in the string
- * should be compatible with all Unicode normalization
- * methods.
- */
- if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
- !uc_is_general_category (first_unichar,
- parser->allowed_start)) {
- *p_is_allowed_word_start = FALSE;
- return TRUE;
- }
-
- /* Decide word type */
- if (ascii_only) {
- *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
- } else if (IS_CJK_UCS4 (first_unichar)) {
- *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
- } else {
- *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
- }
- return TRUE;
-}
-
-/* The input word in this method MUST be normalized in NFKD form,
- * and given in UTF-8, where str_length is the byte-length */
-gboolean
-tracker_parser_unaccent_nfkd_string (gpointer str,
- gsize *str_length)
-{
- gchar *word;
- gsize word_length;
- gsize i;
- gsize j;
-
- g_return_val_if_fail (str != NULL, FALSE);
- g_return_val_if_fail (str_length != NULL, FALSE);
- g_return_val_if_fail (*str_length > 0, FALSE);
-
- word = (gchar *)str;
- word_length = *str_length;
-
- i = 0;
- j = 0;
- while (i < word_length) {
- ucs4_t unichar;
- gint utf8_len;
-
- /* Get next character of the word as UCS4 */
- utf8_len = u8_strmbtouc (&unichar, &word[i]);
-
- /* Invalid UTF-8 character or end of original string. */
- if (utf8_len <= 0) {
- break;
- }
-
- /* If the given unichar is a combining diacritical mark,
- * just update the original index, not the output one */
- if (IS_CDM_UCS4 ((guint32) unichar)) {
- i += utf8_len;
- continue;
- }
-
- /* If already found a previous combining
- * diacritical mark, indexes are different so
- * need to copy characters. As output and input
- * buffers may overlap, need to use memmove
- * instead of memcpy */
- if (i != j) {
- memmove (&word[j], &word[i], utf8_len);
- }
-
- /* Update both indexes */
- i += utf8_len;
- j += utf8_len;
- }
-
- /* Force proper string end */
- word[j] = '\0';
-
- /* Set new output length */
- *str_length = j;
-
- return TRUE;
-}
-
-static gchar *
-process_word_utf8 (TrackerParser *parser,
- const gchar *word,
- gint length,
- TrackerParserWordType type,
- gboolean *stop_word)
-{
- gchar word_buffer [WORD_BUFFER_LENGTH];
- gchar *normalized = NULL;
- gchar *stemmed = NULL;
- size_t new_word_length;
-
- g_return_val_if_fail (parser != NULL, NULL);
- g_return_val_if_fail (word != NULL, NULL);
-
- /* If length is set as -1, the input word MUST be NIL-terminated.
- * Otherwise, this restriction is not needed as the length to process
- * is given as input argument */
- if (length < 0) {
- length = strlen (word);
- }
-
- /* Log original word */
- tracker_parser_message_hex ("ORIGINAL word",
- word, length);
-
- /* Normalization and case-folding ONLY for non-ASCII */
- if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
- /* Leave space for last NIL */
- new_word_length = WORD_BUFFER_LENGTH - 1;
-
- /* Casefold and NFKD normalization in output.
- * NOTE: if the output buffer is not big enough, u8_casefold will
- * return a newly-allocated buffer. */
- normalized = u8_casefold ((const uint8_t *)word,
- length,
- uc_locale_language (),
- UNINORM_NFKD,
- word_buffer,
- &new_word_length);
-
- /* Case folding + Normalization failed, ignore this word */
- g_return_val_if_fail (normalized != NULL, NULL);
-
- /* If output buffer is not the same as the one passed to
- * u8_casefold, we know it was newly-allocated, so need
- * to resize it in 1 byte to add last NIL */
- if (normalized != word_buffer) {
- normalized = g_realloc (normalized, new_word_length + 1);
- }
-
- /* Log after Normalization */
- tracker_parser_message_hex (" After Casefolding and NFKD normalization",
- normalized, new_word_length);
- } else {
- /* For ASCII-only, just tolower() each character */
- gsize i;
-
- normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
-
- for (i = 0; i < length; i++) {
- normalized[i] = g_ascii_tolower (word[i]);
- }
-
- new_word_length = length;
-
- /* Log after tolower */
- tracker_parser_message_hex (" After Lowercasing",
- normalized, new_word_length);
- }
-
- /* Set output NIL */
- normalized[new_word_length] = '\0';
-
- /* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (parser->enable_unaccent &&
- type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
- tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
- /* Log after UNAC stripping */
- tracker_parser_message_hex (" After UNAC stripping",
- normalized, new_word_length);
- }
-
- /* Check if stop word */
- if (parser->ignore_stop_words) {
- *stop_word = tracker_language_is_stop_word (parser->language,
- normalized);
- }
-
- /* Stemming needed? */
- if (parser->enable_stemmer) {
- stemmed = tracker_language_stem_word (parser->language,
- normalized,
- new_word_length);
-
- /* Log after stemming */
- tracker_parser_message_hex (" After stemming",
- stemmed, strlen (stemmed));
- }
-
- /* If stemmed wanted and succeeded, free previous and return it */
- if (stemmed) {
- if (normalized != word_buffer) {
- g_free (normalized);
- }
- return stemmed;
- }
-
- /* It may be the case that no stripping and no stemming was needed, and
- * that the output buffer in stack was enough for case-folding and
- * normalization. In this case, need to strdup() the string to return it */
- return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
-}
-
-static gboolean
-parser_next (TrackerParser *parser,
- gint *byte_offset_start,
- gint *byte_offset_end,
- gboolean *stop_word)
-{
- gsize word_length = 0;
- gchar *processed_word = NULL;
-
- *byte_offset_start = 0;
- *byte_offset_end = 0;
-
- g_return_val_if_fail (parser, FALSE);
-
- /* Loop to look for next valid word */
- while (!processed_word &&
- parser->cursor < parser->txt_size) {
- TrackerParserWordType type;
- gsize truncated_length;
- gboolean is_allowed;
-
- /* Get word info */
- if (!get_word_info (parser,
- &word_length,
- &is_allowed,
- &type)) {
- /* Quit loop just in case */
- parser->cursor = parser->txt_size;
- break;
- }
-
- /* Ignore the word if not an allowed word start */
- if (!is_allowed) {
- /* Ignore this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
-
- /* Ignore the word if longer than the maximum allowed */
- if (word_length >= parser->max_word_length) {
- /* Ignore this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
-
- /* check if word is reserved and ignore it if so */
- if (parser->ignore_reserved_words &&
- tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
- word_length)) {
- /* Ignore this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
-
- /* compute truncated word length if needed (to avoid extremely
- * long words)*/
- truncated_length = (word_length < WORD_BUFFER_LENGTH ?
- word_length :
- WORD_BUFFER_LENGTH - 1);
-
- /* Process the word here. If it fails, we can still go
- * to the next one. Returns newly allocated string
- * always */
- processed_word = process_word_utf8 (parser,
- &(parser->txt[parser->cursor]),
- truncated_length,
- type,
- stop_word);
- if (!processed_word) {
- /* Ignore this word and keep on looping */
- parser->cursor += word_length;
- continue;
- }
- }
-
- /* If we got a word here, set output */
- if (processed_word) {
- /* Set outputs */
- *byte_offset_start = parser->cursor;
- *byte_offset_end = parser->cursor + word_length;
-
- /* Update cursor */
- parser->cursor += word_length;
-
- parser->word_length = strlen (processed_word);
- parser->word = processed_word;
-
- return TRUE;
- }
-
- /* No more words... */
- return FALSE;
-}
-
-TrackerParser *
-tracker_parser_new (TrackerLanguage *language)
-{
- TrackerParser *parser;
-
- g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
- parser = g_new0 (TrackerParser, 1);
-
- parser->language = g_object_ref (language);
-
- return parser;
-}
-
-void
-tracker_parser_free (TrackerParser *parser)
-{
- g_return_if_fail (parser != NULL);
-
- if (parser->language) {
- g_object_unref (parser->language);
- }
-
- g_free (parser->word_break_flags);
-
- g_free (parser->word);
-
- g_free (parser);
-}
-
-void
-tracker_parser_reset (TrackerParser *parser,
- const gchar *txt,
- gint txt_size,
- guint max_word_length,
- gboolean enable_stemmer,
- gboolean enable_unaccent,
- gboolean ignore_stop_words,
- gboolean ignore_reserved_words,
- gboolean ignore_numbers)
-{
- g_return_if_fail (parser != NULL);
- g_return_if_fail (txt != NULL);
-
- parser->max_word_length = max_word_length;
- parser->enable_stemmer = enable_stemmer;
- parser->enable_unaccent = enable_unaccent;
- parser->ignore_stop_words = ignore_stop_words;
- parser->ignore_reserved_words = ignore_reserved_words;
- parser->ignore_numbers = ignore_numbers;
-
- /* Note: We're forcing some unicode characters to behave
- * as wordbreakers: e.g, the '.' The main reason for this
- * is to enable FTS searches matching file extension. */
- parser->enable_forced_wordbreaks = TRUE;
-
- parser->txt_size = txt_size;
- parser->txt = txt;
-
- g_free (parser->word);
- parser->word = NULL;
-
- parser->word_position = 0;
-
- parser->cursor = 0;
-
- g_free (parser->word_break_flags);
-
- /* Create array of flags, same size as original text. */
- parser->word_break_flags = g_malloc (txt_size);
-
- /* Get wordbreak flags in the whole string */
- u8_wordbreaks ((const uint8_t *)txt,
- (size_t) txt_size,
- (char *)parser->word_break_flags);
-
- /* Prepare a custom category which is a combination of the
- * desired ones */
- parser->allowed_start = UC_LETTER;
- if (!parser->ignore_numbers) {
- parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
- }
-}
-
-const gchar *
-tracker_parser_next (TrackerParser *parser,
- gint *position,
- gint *byte_offset_start,
- gint *byte_offset_end,
- gboolean *stop_word,
- gint *word_length)
-{
- const gchar *str;
- gint byte_start = 0, byte_end = 0;
-
- str = NULL;
-
- g_free (parser->word);
- parser->word = NULL;
-
- *stop_word = FALSE;
-
- if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
- str = parser->word;
- }
-
- if (!*stop_word) {
- parser->word_position++;
- }
-
- *word_length = parser->word_length;
- *position = parser->word_position;
- *byte_offset_start = byte_start;
- *byte_offset_end = byte_end;
-
- return str;
-}
-
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
deleted file mode 100644
index dec597747..000000000
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-#include "config.h"
-
-#include <string.h>
-
-#include <libtracker-common/tracker-utils.h>
-
-#include "tracker-parser-utils.h"
-
-/*
- * Definition of the possible reserved words.
- * Length of word is explicitly given to avoid strlen() calls
- */
-typedef struct {
- const gchar *word;
- gsize word_length;
-} TrackerParserReservedWord;
-
-static const TrackerParserReservedWord reserved_words[] = {
- { "or", 2 },
- { NULL, 0 }
-};
-
-gboolean
-tracker_parser_is_reserved_word_utf8 (const gchar *word,
- gsize word_length)
-{
- gint i = 0;
-
- /* Loop the array of predefined reserved words */
- while (reserved_words[i].word != NULL) {
- if (word_length == reserved_words[i].word_length &&
- strncmp (word,
- reserved_words[i].word,
- word_length) == 0) {
- return TRUE;
- }
- i++;
- }
-
- return FALSE;
-}
-
-
-#if TRACKER_PARSER_DEBUG_HEX
-void
-tracker_parser_message_hex (const gchar *message,
- const gchar *str,
- gsize str_length)
-{
- gchar *hex_aux;
- gchar *str_aux;
-
- g_return_if_fail (message);
- g_return_if_fail (str);
- g_return_if_fail (str_length != 0);
-
- /* String may not come NIL-terminated */
- str_aux = g_malloc (str_length + 1);
- memcpy (str_aux, str, str_length);
- str_aux[str_length] = '\0';
-
- /* Get hexadecimal representation of the input string */
- hex_aux = tracker_strhex (str, str_length, ':');
-
- /* Log it */
- g_message ("%s: '%s' (%s)",
- message, str_aux, hex_aux);
-
- g_free (str_aux);
- g_free (hex_aux);
-}
-#endif
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
deleted file mode 100644
index 614740f81..000000000
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-#ifndef __TRACKER_PARSER_UTILS_H__
-#define __TRACKER_PARSER_UTILS_H__
-
-#include "config.h"
-
-#include <glib.h>
-
-#ifdef HAVE_LIBICU
-#include <unicode/utypes.h>
-#endif
-
-G_BEGIN_DECLS
-
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
-#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
- ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
- ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
-/* ASCII undescore? */
-#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
-
-/* Combining diacritical mark?
- * Basic range: [0x0300,0x036F]
- * Supplement: [0x1DC0,0x1DFF]
- * For Symbols: [0x20D0,0x20FF]
- * Half marks: [0xFE20,0xFE2F]
- */
-#define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F) || \
- ((c) >= 0x1DC0 && (c) <= 0x1DFF) || \
- ((c) >= 0x20D0 && (c) <= 0x20FF) || \
- ((c) >= 0xFE20 && (c) <= 0xFE2F))
-
-/* Forced word breaks in Unicode parsers.
- * If any of these is found INSIDE a properly delimited Unicode word, a new word
- * break is forced and the Unicode word is split in two words.
- * Current forced wordbreaks:
- * - 0x002E: DOT ('.')
- */
-#define IS_FORCED_WORDBREAK_UCS4(c) ((c) == 0x002E)
-
-
-gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
- gsize word_length);
-
-
-/* Define to 1 if you want to enable debugging logs showing HEX contents
- * of the words being parsed */
-#define TRACKER_PARSER_DEBUG_HEX 0
-
-#if TRACKER_PARSER_DEBUG_HEX
-void tracker_parser_message_hex (const gchar *message,
- const gchar *str,
- gsize str_length);
-#else
-#define tracker_parser_message_hex(a,b,c)
-#endif
-
-G_END_DECLS
-
-#endif /* __TRACKER_PARSER_UTILS_H__ */
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
deleted file mode 100644
index e6cb10e06..000000000
--- a/src/libtracker-fts/tracker-parser.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-#ifndef __LIBTRACKER_FTS_PARSER_H__
-#define __LIBTRACKER_FTS_PARSER_H__
-
-#include <glib.h>
-
-#include <libtracker-common/tracker-language.h>
-
-G_BEGIN_DECLS
-
-typedef struct TrackerParser TrackerParser;
-
-TrackerParser *tracker_parser_new (TrackerLanguage *language);
-
-void tracker_parser_reset (TrackerParser *parser,
- const gchar *txt,
- gint txt_size,
- guint max_word_length,
- gboolean enable_stemmer,
- gboolean enable_unaccent,
- gboolean ignore_stop_words,
- gboolean ignore_reserved_words,
- gboolean ignore_numbers);
-
-const gchar * tracker_parser_next (TrackerParser *parser,
- gint *position,
- gint *byte_offset_start,
- gint *byte_offset_end,
- gboolean *stop_word,
- gint *word_length);
-
-void tracker_parser_free (TrackerParser *parser);
-
-/* Other helper methods */
-
-gboolean tracker_parser_unaccent_nfkd_string (gpointer str,
- gsize *str_length);
-
-G_END_DECLS
-
-#endif /* __LIBTRACKER_FTS_PARSER_H__ */