Refactor tracker-parser to be located in libtracker-common

This also fixes the unaccenting SPARQL function in case of --disable-tracker-fts
author: Philip Van Hoof <philip@codeminded.be> 2014-12-01 14:07:16 +0100
committer: Philip Van Hoof <philip@codeminded.be> 2014-12-01 14:07:16 +0100
commit: 5d05f68da10ef5a2c25fba4246120ce525035d51 (patch)
tree: f40db3e46953ef15f0dc7db5a5685711cbd79f18 /src/libtracker-fts
parent: 8d64b9783d3aebff2d9259068c402a9e1c1a81e4 (diff)
download: tracker-5d05f68da10ef5a2c25fba4246120ce525035d51.tar.gz
7 files changed, 3 insertions, 1541 deletions
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index e583c391c..86b7ac34d 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -33,23 +33,12 @@ libtracker_fts_la_SOURCES =                            \
 	tracker-fts-config.c                           \
 	tracker-fts-config.h                           \
 	tracker-fts-tokenizer.c                        \
-	tracker-fts-tokenizer.h                        \
-	tracker-parser-utils.c                         \
-	tracker-parser-utils.h                         \
-	tracker-parser.h
+	tracker-fts-tokenizer.h
 
 if !HAVE_BUILTIN_FTS
   libtracker_fts_la_SOURCES += $(fts4_sources)
 endif
 
-if BUILD_LIBUNISTRING_PARSER
-  libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
-endif
-
-if BUILD_LIBICU_PARSER
-  libtracker_fts_la_SOURCES += tracker-parser-libicu.c
-endif
-
 libtracker_fts_la_LIBADD =                             \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la \
 	$(BUILD_LIBS)                                  \
diff --git a/src/libtracker-fts/tracker-fts-tokenizer.c b/src/libtracker-fts/tracker-fts-tokenizer.c
index d61ae79e2..c45d73d9a 100644
--- a/src/libtracker-fts/tracker-fts-tokenizer.c
+++ b/src/libtracker-fts/tracker-fts-tokenizer.c
@@ -26,9 +26,10 @@
 #include <assert.h>
 #include <string.h>
 
+#include <libtracker-common/tracker-parser.h>
+
 #include "tracker-fts-tokenizer.h"
 #include "tracker-fts-config.h"
-#include "tracker-parser.h"
 #include "fts3_tokenizer.h"
 
 typedef struct TrackerTokenizer TrackerTokenizer;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
deleted file mode 100644
index b26722c96..000000000
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301  USA
- */
-
-#include "config.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <locale.h>
-
-#include <unicode/utypes.h>
-#include <unicode/ucnv.h>
-#include <unicode/ubrk.h>
-#include <unicode/ustring.h>
-#include <unicode/uchar.h>
-#include <unicode/unorm.h>
-
-#include "tracker-parser.h"
-#include "tracker-parser-utils.h"
-
-/* Type of words detected */
-typedef enum {
-	TRACKER_PARSER_WORD_TYPE_ASCII,
-	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
-	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
-} TrackerParserWordType;
-
-/* Max possible length of a UChar encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
-struct TrackerParser {
-	const gchar           *txt;
-	gint                   txt_size;
-
-	TrackerLanguage       *language;
-	guint                  max_word_length;
-	gboolean               enable_stemmer;
-	gboolean               enable_unaccent;
-	gboolean               ignore_stop_words;
-	gboolean               ignore_reserved_words;
-	gboolean               ignore_numbers;
-	gboolean               enable_forced_wordbreaks;
-
-	/* Private members */
-	gchar                 *word;
-	gint                   word_length;
-	guint                  word_position;
-
-	/* Text as UChars */
-	UChar                 *utxt;
-	gint                   utxt_size;
-	/* Original offset of each UChar in the input txt string */
-	gint32                *offsets;
-
-	/* The word-break iterator */
-	UBreakIterator        *bi;
-
-	/* Cursor, as index of the utxt array of bytes */
-	gsize                  cursor;
-};
-
-
-static gboolean
-get_word_info (const UChar           *word,
-               gsize                  word_length,
-               gboolean               ignore_numbers,
-               gboolean              *p_is_allowed_word_start,
-               TrackerParserWordType *p_word_type)
-{
-	UCharIterator iter;
-	UChar32 unichar;
-	guint8 unichar_gc;
-
-	/* Get first character of the word as UCS4 */
-	uiter_setString (&iter, word, word_length);
-	unichar = uiter_current32 (&iter);
-	if (unichar == U_SENTINEL) {
-		return FALSE;
-	}
-
-	/* We only want the words where the first character
-	 * in the word is either a letter, a number or a symbol.
-	 *
-	 * This is needed because the word break algorithm also
-	 * considers word breaks after for example commas or other
-	 * punctuation marks.
-	 *
-	 * Note that looking at the first character in the string
-	 * should be compatible with all Unicode normalization
-	 * methods.
-	 */
-	unichar_gc = u_charType (unichar);
-	if (unichar_gc == U_UPPERCASE_LETTER ||
-	    unichar_gc == U_LOWERCASE_LETTER ||
-	    unichar_gc == U_TITLECASE_LETTER ||
-	    unichar_gc == U_MODIFIER_LETTER ||
-	    unichar_gc == U_OTHER_LETTER ||
-	    IS_UNDERSCORE_UCS4 ((guint32)unichar) ||
-	    (!ignore_numbers &&
-	     (unichar_gc == U_DECIMAL_DIGIT_NUMBER ||
-	      unichar_gc == U_LETTER_NUMBER ||
-	      unichar_gc == U_OTHER_NUMBER))) {
-		*p_is_allowed_word_start = TRUE;
-	} else {
-		*p_is_allowed_word_start = FALSE;
-		return TRUE;
-	}
-
-	/* Word starts with a CJK character? */
-	if (IS_CJK_UCS4 ((guint32)unichar)) {
-		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
-		return TRUE;
-	}
-
-	/* Is ASCII-only string? */
-	while (unichar != U_SENTINEL) {
-		if (!IS_ASCII_UCS4 ((guint32)unichar)) {
-			*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
-			return TRUE;
-		}
-		unichar = uiter_next32 (&iter);
-	}
-
-	*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
-	return TRUE;
-}
-
-/* The input word in this method MUST be normalized in NFKD form,
- * and given in UChars, where str_length is the number of UChars
- * (not the number of bytes) */
-gboolean
-tracker_parser_unaccent_nfkd_string (gpointer  str,
-                                     gsize    *str_length)
-{
-	UChar *word;
-	gsize word_length;
-	gsize i;
-	gsize j;
-
-	g_return_val_if_fail (str != NULL, FALSE);
-	g_return_val_if_fail (str_length != NULL, FALSE);
-	g_return_val_if_fail (*str_length > 0, FALSE);
-
-	word = (UChar *)str;
-	word_length = *str_length;
-
-	i = 0;
-	j = 0;
-	while (i < word_length) {
-		UChar32 unichar;
-		gint utf16_len; /* given in UChars */
-		gsize aux_i;
-
-		/* Get next character of the word as UCS4 */
-		aux_i = i;
-		U16_NEXT (word, aux_i, word_length, unichar);
-		utf16_len = aux_i - i;
-
-		/* Invalid UTF-16 character or end of original string. */
-		if (utf16_len <= 0) {
-			break;
-		}
-
-		/* If the given unichar is a combining diacritical mark,
-		 * just update the original index, not the output one */
-		if (IS_CDM_UCS4 ((guint32) unichar)) {
-			i += utf16_len;
-			continue;
-		}
-
-		/* If already found a previous combining
-		 * diacritical mark, indexes are different so
-		 * need to copy characters. As output and input
-		 * buffers may overlap, need to use memmove
-		 * instead of memcpy */
-		if (i != j) {
-			memmove (&word[j], &word[i], sizeof (UChar) * utf16_len);
-		}
-
-		/* Update both indexes */
-		i += utf16_len;
-		j += utf16_len;
-	}
-
-	/* Force proper string end */
-	word[j] = (UChar) 0;
-
-	/* Set new output length */
-	*str_length = j;
-
-	return TRUE;
-}
-
-static gchar *
-convert_UChar_to_utf8 (const UChar *word,
-                       gsize        uchar_len,
-                       gsize       *utf8_len)
-{
-	gchar *utf8_str;
-	UErrorCode icu_error = U_ZERO_ERROR;
-	UConverter *converter;
-	gsize new_utf8_len;
-
-	g_return_val_if_fail (word, NULL);
-	g_return_val_if_fail (utf8_len, NULL);
-
-	/* Open converter UChar to UTF-16BE */
-	converter = ucnv_open ("UTF-8", &icu_error);
-	if (!converter) {
-		g_warning ("Cannot open UTF-8 converter: '%s'",
-		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
-		return NULL;
-	}
-
-	/* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
-	 *  in UTF-8. */
-	utf8_str = g_malloc (2 * uchar_len * sizeof (UChar) + 1);
-
-	/* Convert from UChar to UTF-8 (NIL-terminated) */
-	new_utf8_len = ucnv_fromUChars (converter,
-	                                utf8_str,
-	                                2 * uchar_len * sizeof (UChar) + 1,
-	                                word,
-	                                uchar_len,
-	                                &icu_error);
-	if (U_FAILURE (icu_error)) {
-		g_warning ("Cannot convert from UChar to UTF-8: '%s'",
-		           u_errorName (icu_error));
-		g_free (utf8_str);
-		ucnv_close (converter);
-		return NULL;
-	}
-
-	*utf8_len = new_utf8_len;
-	ucnv_close (converter);
-
-	return utf8_str;
-}
-
-static gchar *
-process_word_uchar (TrackerParser         *parser,
-                    const UChar           *word,
-                    gint                   length,
-                    TrackerParserWordType  type,
-                    gboolean              *stop_word)
-{
-	UErrorCode error = U_ZERO_ERROR;
-	UChar normalized_buffer[WORD_BUFFER_LENGTH];
-	gchar *utf8_str = NULL;
-	gsize new_word_length;
-
-	/* Log original word */
-	tracker_parser_message_hex ("ORIGINAL word",
-	                            (guint8 *)word,
-	                            length * sizeof (UChar));
-
-
-	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
-		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
-
-		/* Casefold... */
-		new_word_length = u_strFoldCase (casefolded_buffer,
-		                                 WORD_BUFFER_LENGTH,
-		                                 word,
-		                                 length,
-		                                 U_FOLD_CASE_DEFAULT,
-		                                 &error);
-		if (U_FAILURE (error)) {
-			g_warning ("Error casefolding: '%s'",
-			           u_errorName (error));
-			return NULL;
-		}
-		if (new_word_length > WORD_BUFFER_LENGTH)
-			new_word_length = WORD_BUFFER_LENGTH;
-
-		/* Log after casefolding */
-		tracker_parser_message_hex (" After Casefolding",
-		                            (guint8 *)casefolded_buffer,
-		                            new_word_length * sizeof (UChar));
-
-		/* NFKD normalization... */
-		new_word_length = unorm_normalize (casefolded_buffer,
-		                                   new_word_length,
-		                                   UNORM_NFKD,
-		                                   0,
-		                                   normalized_buffer,
-		                                   WORD_BUFFER_LENGTH,
-		                                   &error);
-		if (U_FAILURE (error)) {
-			g_warning ("Error normalizing: '%s'",
-			           u_errorName (error));
-			return NULL;
-		}
-
-		if (new_word_length > WORD_BUFFER_LENGTH)
-			new_word_length = WORD_BUFFER_LENGTH;
-
-		/* Log after casefolding */
-		tracker_parser_message_hex (" After Normalization",
-		                            (guint8 *) normalized_buffer,
-		                            new_word_length * sizeof (UChar));
-	} else {
-		/* For ASCII-only, just tolower() each character */
-		new_word_length = u_strToLower (normalized_buffer,
-		                                WORD_BUFFER_LENGTH,
-		                                word,
-		                                length,
-		                                NULL,
-		                                &error);
-		if (U_FAILURE (error)) {
-			g_warning ("Error lowercasing: '%s'",
-			           u_errorName (error));
-			return NULL;
-		}
-
-		/* Log after casefolding */
-		tracker_parser_message_hex (" After lowercase",
-		                            (guint8 *) normalized_buffer,
-		                            new_word_length * sizeof (UChar));
-	}
-
-	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
-	if (parser->enable_unaccent &&
-	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
-	    tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) {
-		/* Log after unaccenting */
-		tracker_parser_message_hex ("  After UNAC",
-		                            (guint8 *) normalized_buffer,
-		                            new_word_length * sizeof (UChar));
-	}
-
-	/* Finally, convert to UTF-8 */
-	utf8_str = convert_UChar_to_utf8 (normalized_buffer,
-	                                  new_word_length,
-	                                  &new_word_length);
-
-	/* Log after unaccenting */
-	tracker_parser_message_hex ("   After UTF8 conversion",
-	                            utf8_str,
-	                            new_word_length);
-
-	/* Check if stop word */
-	if (parser->ignore_stop_words) {
-		*stop_word = tracker_language_is_stop_word (parser->language,
-		                                            utf8_str);
-	}
-
-	/* Stemming needed? */
-	if (utf8_str &&
-	    parser->enable_stemmer) {
-		gchar *stemmed;
-
-		/* Input for stemmer ALWAYS in UTF-8, as well as output */
-		stemmed = tracker_language_stem_word (parser->language,
-		                                      utf8_str,
-		                                      new_word_length);
-
-		/* Log after stemming */
-		tracker_parser_message_hex ("    After stemming",
-		                            stemmed, strlen (stemmed));
-
-		/* If stemmed wanted and succeeded, free previous and return it */
-		if (stemmed) {
-			g_free (utf8_str);
-			return stemmed;
-		}
-	}
-
-	return utf8_str;
-}
-
-static gboolean
-parser_check_forced_wordbreaks (const UChar *buffer,
-                                gsize        current,
-                                gsize       *next)
-{
-	gsize unicode_word_length = *next - current;
-	gsize word_length = 0;
-	UCharIterator iter;
-	UChar32 unichar;
-
-	uiter_setString (&iter, &buffer[current], unicode_word_length);
-
-	/* Iterate over the string looking for forced word breaks */
-	while ((unichar = uiter_next32 (&iter)) != U_SENTINEL &&
-	       word_length < unicode_word_length) {
-
-		if (IS_FORCED_WORDBREAK_UCS4 ((guint32) unichar)) {
-			/* Support word starting with a forced wordbreak */
-			if (word_length == 0) {
-				word_length = 1;
-			}
-			break;
-		}
-
-		word_length ++;
-	}
-
-	/* g_debug ("current: %" G_GSIZE_FORMAT ", " */
-	/*          "next: %" G_GSIZE_FORMAT ", " */
-	/*          "now: %" G_GSIZE_FORMAT, */
-	/*          current, */
-	/*          *next, */
-	/*          current + word_length); */
-
-	if (word_length != unicode_word_length) {
-		*next = current + word_length;
-		return TRUE;
-	}
-	return FALSE;
-}
-
-static gboolean
-parser_next (TrackerParser *parser,
-             gint          *byte_offset_start,
-             gint          *byte_offset_end,
-             gboolean      *stop_word)
-{
-	gsize word_length_uchar = 0;
-	gsize word_length_utf8 = 0;
-	gchar *processed_word = NULL;
-	gsize current_word_offset_utf8;
-
-	*byte_offset_start = 0;
-	*byte_offset_end = 0;
-
-	g_return_val_if_fail (parser, FALSE);
-
-	/* Loop to look for next valid word */
-	while (!processed_word &&
-	       parser->cursor < parser->utxt_size) {
-		TrackerParserWordType type;
-		gboolean is_allowed;
-		gsize next_word_offset_uchar;
-		gsize next_word_offset_utf8;
-		gsize truncated_length;
-
-		/* Set current word offset in the original UTF-8 string */
-		current_word_offset_utf8 = parser->offsets[parser->cursor];
-
-		/* Find next word break. */
-		next_word_offset_uchar = ubrk_next (parser->bi);
-
-		/* Check if any forced wordbreaks here... */
-		if (parser->enable_forced_wordbreaks) {
-			/* Returns TRUE if next word offset changed */
-			if (parser_check_forced_wordbreaks (parser->utxt,
-			                                    parser->cursor,
-			                                    &next_word_offset_uchar)) {
-				/* We need to reset the iterator so that next word
-				 * actually returns the same result */
-				ubrk_previous (parser->bi);
-			}
-		}
-
-		if (next_word_offset_uchar >= parser->utxt_size) {
-			/* Last word support... */
-			next_word_offset_uchar = parser->utxt_size;
-			next_word_offset_utf8 = parser->txt_size;
-		} else {
-			next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
-		}
-
-		/* Word end is the first byte after the word, which is either the
-		 *  start of next word or the end of the string */
-		word_length_uchar = next_word_offset_uchar - parser->cursor;
-		word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
-
-		/* g_debug ("word_length_uchar: %" G_GSIZE_FORMAT, word_length_uchar); */
-		/* g_debug ("next_word_offset_uchar: %" G_GSIZE_FORMAT, next_word_offset_uchar); */
-		/* g_debug ("current_word_offset_uchar: %" G_GSIZE_FORMAT, parser->cursor); */
-		/* g_debug ("word_length_utf8: %" G_GSIZE_FORMAT, word_length_utf8); */
-		/* g_debug ("next_word_offset_utf8: %" G_GSIZE_FORMAT, next_word_offset_utf8); */
-		/* g_debug ("current_word_offset_utf8: %" G_GSIZE_FORMAT, current_word_offset_utf8); */
-
-		/* Ignore the word if longer than the maximum allowed */
-		if (word_length_utf8 >= parser->max_word_length) {
-			/* Ignore this word and keep on looping */
-			parser->cursor = next_word_offset_uchar;
-			continue;
-		}
-
-		/* Get word info... */
-		if (!get_word_info (&parser->utxt[parser->cursor],
-		                    word_length_uchar,
-		                    parser->ignore_numbers,
-		                    &is_allowed,
-		                    &type)) {
-			/* Quit loop just in case */
-			parser->cursor = parser->utxt_size;
-			break;
-		}
-
-		/* Ignore the word if not an allowed word start */
-		if (!is_allowed) {
-			/* Ignore this word and keep on looping */
-			parser->cursor = next_word_offset_uchar;
-			continue;
-		}
-
-		/* check if word is reserved (looking at ORIGINAL UTF-8 buffer here! */
-		if (parser->ignore_reserved_words &&
-		    tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
-		                                          word_length_utf8)) {
-			/* Ignore this word and keep on looping */
-			parser->cursor = next_word_offset_uchar;
-			continue;
-		}
-
-		/* compute truncated word length (in UChar bytes) if needed (to
-		 * avoid extremely long words) */
-		truncated_length = (word_length_uchar < 2 * WORD_BUFFER_LENGTH ?
-		                    word_length_uchar :
-		                    2 * WORD_BUFFER_LENGTH);
-
-		/* Process the word here. If it fails, we can still go
-		 *  to the next one. Returns newly allocated UTF-8
-		 *  string always.
-		 * Enable UNAC stripping only if no ASCII and no CJK
-		 * Note we are passing UChar encoded string here!
-		 */
-		processed_word = process_word_uchar (parser,
-		                                     &(parser->utxt[parser->cursor]),
-		                                     truncated_length,
-		                                     type,
-		                                     stop_word);
-		if (!processed_word) {
-			/* Ignore this word and keep on looping */
-			parser->cursor = next_word_offset_uchar;
-			continue;
-		}
-	}
-
-	/* If we got a word here, set output */
-	if (processed_word) {
-		/* Set outputs */
-		*byte_offset_start = current_word_offset_utf8;
-		*byte_offset_end = current_word_offset_utf8 + word_length_utf8;
-
-		/* Update cursor */
-		parser->cursor += word_length_uchar;
-
-		parser->word_length = strlen (processed_word);
-		parser->word = processed_word;
-
-		return TRUE;
-	}
-
-	/* No more words... */
-	return FALSE;
-}
-
-TrackerParser *
-tracker_parser_new (TrackerLanguage *language)
-{
-	TrackerParser *parser;
-
-	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
-	parser = g_new0 (TrackerParser, 1);
-
-	parser->language = g_object_ref (language);
-
-	return parser;
-}
-
-void
-tracker_parser_free (TrackerParser *parser)
-{
-	g_return_if_fail (parser != NULL);
-
-	if (parser->language) {
-		g_object_unref (parser->language);
-	}
-
-	if (parser->bi) {
-		ubrk_close (parser->bi);
-	}
-
-	g_free (parser->utxt);
-	g_free (parser->offsets);
-
-	g_free (parser->word);
-
-	g_free (parser);
-}
-
-void
-tracker_parser_reset (TrackerParser *parser,
-                      const gchar   *txt,
-                      gint           txt_size,
-                      guint          max_word_length,
-                      gboolean       enable_stemmer,
-                      gboolean       enable_unaccent,
-                      gboolean       ignore_stop_words,
-                      gboolean       ignore_reserved_words,
-                      gboolean       ignore_numbers)
-{
-	UErrorCode error = U_ZERO_ERROR;
-	UConverter *converter;
-	UChar *last_uchar;
-	const gchar *last_utf8;
-
-	g_return_if_fail (parser != NULL);
-	g_return_if_fail (txt != NULL);
-
-	parser->max_word_length = max_word_length;
-	parser->enable_stemmer = enable_stemmer;
-	parser->enable_unaccent = enable_unaccent;
-	parser->ignore_stop_words = ignore_stop_words;
-	parser->ignore_reserved_words = ignore_reserved_words;
-	parser->ignore_numbers = ignore_numbers;
-
-	/* Note: We're forcing some unicode characters to behave
-	 * as wordbreakers: e.g, the '.' The main reason for this
-	 * is to enable FTS searches matching file extension. */
-	parser->enable_forced_wordbreaks = TRUE;
-
-	parser->txt_size = txt_size;
-	parser->txt = txt;
-
-	g_free (parser->word);
-	parser->word = NULL;
-
-	if (parser->bi) {
-		ubrk_close (parser->bi);
-		parser->bi = NULL;
-	}
-	g_free (parser->utxt);
-	parser->utxt = NULL;
-	g_free (parser->offsets);
-	parser->offsets = NULL;
-
-	parser->word_position = 0;
-
-	parser->cursor = 0;
-
-	/* Open converter UTF-8 to UChar */
-	converter = ucnv_open ("UTF-8", &error);
-	if (!converter) {
-		g_warning ("Cannot open UTF-8 converter: '%s'",
-		           U_FAILURE (error) ? u_errorName (error) : "none");
-		return;
-	}
-
-	/* Allocate UChars and offsets buffers */
-	parser->utxt_size = txt_size + 1;
-	parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
-	parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));
-
-	/* last_uchar and last_utf8 will be also an output parameter! */
-	last_uchar = parser->utxt;
-	last_utf8 = parser->txt;
-
-	/* Convert to UChars storing offsets */
-	ucnv_toUnicode (converter,
-	                &last_uchar,
-	                &parser->utxt[txt_size],
-	                &last_utf8,
-	                &parser->txt[txt_size],
-	                parser->offsets,
-	                FALSE,
-	                &error);
-	if (U_SUCCESS (error)) {
-		/* Proper UChar array size is now given by 'last_uchar' */
-		parser->utxt_size = last_uchar - parser->utxt;
-
-		/* Open word-break iterator */
-		parser->bi = ubrk_open(UBRK_WORD,
-		                       setlocale (LC_CTYPE, NULL),
-		                       parser->utxt,
-		                       parser->utxt_size,
-		                       &error);
-		if (U_SUCCESS (error)) {
-			/* Find FIRST word in the UChar array */
-			parser->cursor = ubrk_first (parser->bi);
-		}
-	}
-
-	/* If any error happened, reset buffers */
-	if (U_FAILURE (error)) {
-		g_warning ("Error initializing libicu support: '%s'",
-		           u_errorName (error));
-		/* Reset buffers */
-		g_free (parser->utxt);
-		parser->utxt = NULL;
-		g_free (parser->offsets);
-		parser->offsets = NULL;
-		parser->utxt_size = 0;
-		if (parser->bi) {
-			ubrk_close (parser->bi);
-			parser->bi = NULL;
-		}
-	}
-
-	/* Close converter */
-	ucnv_close (converter);
-}
-
-const gchar *
-tracker_parser_next (TrackerParser *parser,
-                     gint          *position,
-                     gint          *byte_offset_start,
-                     gint          *byte_offset_end,
-                     gboolean      *stop_word,
-                     gint          *word_length)
-{
-	const gchar  *str;
-	gint byte_start = 0, byte_end = 0;
-
-	str = NULL;
-
-	g_free (parser->word);
-	parser->word = NULL;
-
-	*stop_word = FALSE;
-
-	if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
-		str = parser->word;
-	}
-
-	if (!*stop_word) {
-		parser->word_position++;
-	}
-
-	*word_length = parser->word_length;
-	*position = parser->word_position;
-	*byte_offset_start = byte_start;
-	*byte_offset_end = byte_end;
-
-	return str;
-}
-
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
deleted file mode 100644
index 9de6e46f7..000000000
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301  USA
- */
-
-#include "config.h"
-
-#include <stdio.h>
-#include <string.h>
-
-/* libunistring versions prior to 9.1.2 need this hack */
-#define _UNUSED_PARAMETER_
-#include <unistr.h>
-#include <uniwbrk.h>
-#include <unictype.h>
-#include <unicase.h>
-
-#include "tracker-parser.h"
-#include "tracker-parser-utils.h"
-
-/* Type of words detected */
-typedef enum {
-	TRACKER_PARSER_WORD_TYPE_ASCII,
-	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
-	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
-} TrackerParserWordType;
-
-/* Max possible length of a UTF-8 encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
-struct TrackerParser {
-	const gchar           *txt;
-	gint                   txt_size;
-
-	TrackerLanguage       *language;
-	guint                  max_word_length;
-	gboolean               enable_stemmer;
-	gboolean               enable_unaccent;
-	gboolean               ignore_stop_words;
-	gboolean               ignore_reserved_words;
-	gboolean               ignore_numbers;
-	gboolean               enable_forced_wordbreaks;
-
-	/* Private members */
-	gchar                 *word;
-	gint                   word_length;
-	guint                  word_position;
-
-	/* Cursor, as index of the input array of bytes */
-	gsize                  cursor;
-	/* libunistring flags array */
-	gchar                 *word_break_flags;
-	/* general category of the  start character in words */
-	uc_general_category_t  allowed_start;
-};
-
-static gboolean
-get_word_info (TrackerParser         *parser,
-               gsize                 *p_word_length,
-               gboolean              *p_is_allowed_word_start,
-               TrackerParserWordType *p_word_type)
-{
-	ucs4_t first_unichar;
-	gint first_unichar_len;
-	gboolean ascii_only;
-
-	/* Defaults */
-	*p_is_allowed_word_start = TRUE;
-
-	/* Get first character of the word as UCS4 */
-	first_unichar_len = u8_strmbtouc (&first_unichar,
-	                                  &(parser->txt[parser->cursor]));
-	if (first_unichar_len <= 0) {
-		/* This should only happen if NIL was passed to u8_strmbtouc,
-		 *  so better just force stop here */
-		return FALSE;
-	} else  {
-		/* If first character has length 1, it's ASCII-7 */
-		ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
-	}
-
-	/* Consider word starts with a forced wordbreak */
-	if (parser->enable_forced_wordbreaks &&
-	    IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
-		*p_word_length = first_unichar_len;
-	} else {
-		gsize i;
-
-		/* Find next word break, and in the same loop checking if only ASCII
-		 *  characters */
-		i = parser->cursor + first_unichar_len;
-		while (1) {
-			/* Text bounds reached? */
-			if (i >= parser->txt_size)
-				break;
-			/* Proper unicode word break detected? */
-			if (parser->word_break_flags[i])
-				break;
-			/* Forced word break detected? */
-			if (parser->enable_forced_wordbreaks &&
-			    IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
-				break;
-
-			if (ascii_only &&
-			    !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
-				ascii_only = FALSE;
-			}
-
-			i++;
-		}
-
-		/* Word end is the first byte after the word, which is either the
-		 *  start of next word or the end of the string */
-		*p_word_length = i - parser->cursor;
-	}
-
-	/* We only want the words where the first character
-	 *  in the word is either a letter, a number or a symbol.
-	 * This is needed because the word break algorithm also
-	 *  considers word breaks after for example commas or other
-	 *  punctuation marks.
-	 * Note that looking at the first character in the string
-	 *  should be compatible with all Unicode normalization
-	 *  methods.
-	 */
-	if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
-	    !uc_is_general_category (first_unichar,
-	                             parser->allowed_start)) {
-		*p_is_allowed_word_start = FALSE;
-		return TRUE;
-	}
-
-	/* Decide word type */
-	if (ascii_only) {
-		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
-	} else if (IS_CJK_UCS4 (first_unichar)) {
-		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
-	} else {
-		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
-	}
-	return TRUE;
-}
-
-/* The input word in this method MUST be normalized in NFKD form,
- * and given in UTF-8, where str_length is the byte-length */
-gboolean
-tracker_parser_unaccent_nfkd_string (gpointer  str,
-                                     gsize    *str_length)
-{
-	gchar *word;
-	gsize word_length;
-	gsize i;
-	gsize j;
-
-	g_return_val_if_fail (str != NULL, FALSE);
-	g_return_val_if_fail (str_length != NULL, FALSE);
-	g_return_val_if_fail (*str_length > 0, FALSE);
-
-	word = (gchar *)str;
-	word_length = *str_length;
-
-	i = 0;
-	j = 0;
-	while (i < word_length) {
-		ucs4_t unichar;
-		gint utf8_len;
-
-		/* Get next character of the word as UCS4 */
-		utf8_len = u8_strmbtouc (&unichar, &word[i]);
-
-		/* Invalid UTF-8 character or end of original string. */
-		if (utf8_len <= 0) {
-			break;
-		}
-
-		/* If the given unichar is a combining diacritical mark,
-		 * just update the original index, not the output one */
-		if (IS_CDM_UCS4 ((guint32) unichar)) {
-			i += utf8_len;
-			continue;
-		}
-
-		/* If already found a previous combining
-		 * diacritical mark, indexes are different so
-		 * need to copy characters. As output and input
-		 * buffers may overlap, need to use memmove
-		 * instead of memcpy */
-		if (i != j) {
-			memmove (&word[j], &word[i], utf8_len);
-		}
-
-		/* Update both indexes */
-		i += utf8_len;
-		j += utf8_len;
-	}
-
-	/* Force proper string end */
-	word[j] = '\0';
-
-	/* Set new output length */
-	*str_length = j;
-
-	return TRUE;
-}
-
-static gchar *
-process_word_utf8 (TrackerParser         *parser,
-                   const gchar           *word,
-                   gint                   length,
-                   TrackerParserWordType  type,
-                   gboolean              *stop_word)
-{
-	gchar word_buffer [WORD_BUFFER_LENGTH];
-	gchar *normalized = NULL;
-	gchar *stemmed = NULL;
-	size_t new_word_length;
-
-	g_return_val_if_fail (parser != NULL, NULL);
-	g_return_val_if_fail (word != NULL, NULL);
-
-	/* If length is set as -1, the input word MUST be NIL-terminated.
-	 * Otherwise, this restriction is not needed as the length to process
-	 * is given as input argument */
-	if (length < 0) {
-		length = strlen (word);
-	}
-
-	/* Log original word */
-	tracker_parser_message_hex ("ORIGINAL word",
-	                            word, length);
-
-	/* Normalization and case-folding ONLY for non-ASCII */
-	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
-		/* Leave space for last NIL */
-		new_word_length = WORD_BUFFER_LENGTH - 1;
-
-		/* Casefold and NFKD normalization in output.
-		 * NOTE: if the output buffer is not big enough, u8_casefold will
-		 * return a newly-allocated buffer. */
-		normalized = u8_casefold ((const uint8_t *)word,
-		                          length,
-		                          uc_locale_language (),
-		                          UNINORM_NFKD,
-		                          word_buffer,
-		                          &new_word_length);
-
-		/* Case folding + Normalization failed, ignore this word */
-		g_return_val_if_fail (normalized != NULL, NULL);
-
-		/* If output buffer is not the same as the one passed to
-		 * u8_casefold, we know it was newly-allocated, so need
-		 * to resize it in 1 byte to add last NIL */
-		if (normalized != word_buffer) {
-			normalized = g_realloc (normalized, new_word_length + 1);
-		}
-
-		/* Log after Normalization */
-		tracker_parser_message_hex (" After Casefolding and NFKD normalization",
-		                            normalized, new_word_length);
-	} else {
-		/* For ASCII-only, just tolower() each character */
-		gsize i;
-
-		normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
-
-		for (i = 0; i < length; i++) {
-			normalized[i] = g_ascii_tolower (word[i]);
-		}
-
-		new_word_length = length;
-
-		/* Log after tolower */
-		tracker_parser_message_hex (" After Lowercasing",
-		                            normalized, new_word_length);
-	}
-
-	/* Set output NIL */
-	normalized[new_word_length] = '\0';
-
-	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
-	if (parser->enable_unaccent &&
-	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
-	    tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
-		/* Log after UNAC stripping */
-		tracker_parser_message_hex ("  After UNAC stripping",
-		                            normalized, new_word_length);
-	}
-
-	/* Check if stop word */
-	if (parser->ignore_stop_words) {
-		*stop_word = tracker_language_is_stop_word (parser->language,
-		                                            normalized);
-	}
-
-	/* Stemming needed? */
-	if (parser->enable_stemmer) {
-		stemmed = tracker_language_stem_word (parser->language,
-		                                      normalized,
-		                                      new_word_length);
-
-		/* Log after stemming */
-		tracker_parser_message_hex ("   After stemming",
-		                            stemmed, strlen (stemmed));
-	}
-
-	/* If stemmed wanted and succeeded, free previous and return it */
-	if (stemmed) {
-		if (normalized != word_buffer) {
-			g_free (normalized);
-		}
-		return stemmed;
-	}
-
-	/* It may be the case that no stripping and no stemming was needed, and
-	 * that the output buffer in stack was enough for case-folding and
-	 * normalization. In this case, need to strdup() the string to return it */
-	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
-}
-
-static gboolean
-parser_next (TrackerParser *parser,
-             gint          *byte_offset_start,
-             gint          *byte_offset_end,
-             gboolean      *stop_word)
-{
-	gsize word_length = 0;
-	gchar *processed_word = NULL;
-
-	*byte_offset_start = 0;
-	*byte_offset_end = 0;
-
-	g_return_val_if_fail (parser, FALSE);
-
-	/* Loop to look for next valid word */
-	while (!processed_word &&
-	       parser->cursor < parser->txt_size) {
-		TrackerParserWordType type;
-		gsize truncated_length;
-		gboolean is_allowed;
-
-		/* Get word info */
-		if (!get_word_info (parser,
-		                    &word_length,
-		                    &is_allowed,
-		                    &type)) {
-			/* Quit loop just in case */
-			parser->cursor = parser->txt_size;
-			break;
-		}
-
-		/* Ignore the word if not an allowed word start */
-		if (!is_allowed) {
-			/* Ignore this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-
-		/* Ignore the word if longer than the maximum allowed */
-		if (word_length >= parser->max_word_length) {
-			/* Ignore this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-
-		/* check if word is reserved and ignore it if so */
-		if (parser->ignore_reserved_words &&
-		    tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
-		                                          word_length)) {
-			/* Ignore this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-
-		/* compute truncated word length if needed (to avoid extremely
-		 *  long words)*/
-		truncated_length = (word_length < WORD_BUFFER_LENGTH ?
-		                    word_length :
-		                    WORD_BUFFER_LENGTH - 1);
-
-		/* Process the word here. If it fails, we can still go
-		 *  to the next one. Returns newly allocated string
-		 *  always */
-		processed_word = process_word_utf8 (parser,
-		                                    &(parser->txt[parser->cursor]),
-		                                    truncated_length,
-		                                    type,
-		                                    stop_word);
-		if (!processed_word) {
-			/* Ignore this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-	}
-
-	/* If we got a word here, set output */
-	if (processed_word) {
-		/* Set outputs */
-		*byte_offset_start = parser->cursor;
-		*byte_offset_end = parser->cursor + word_length;
-
-		/* Update cursor */
-		parser->cursor += word_length;
-
-		parser->word_length = strlen (processed_word);
-		parser->word = processed_word;
-
-		return TRUE;
-	}
-
-	/* No more words... */
-	return FALSE;
-}
-
-TrackerParser *
-tracker_parser_new (TrackerLanguage *language)
-{
-	TrackerParser *parser;
-
-	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
-
-	parser = g_new0 (TrackerParser, 1);
-
-	parser->language = g_object_ref (language);
-
-	return parser;
-}
-
-void
-tracker_parser_free (TrackerParser *parser)
-{
-	g_return_if_fail (parser != NULL);
-
-	if (parser->language) {
-		g_object_unref (parser->language);
-	}
-
-	g_free (parser->word_break_flags);
-
-	g_free (parser->word);
-
-	g_free (parser);
-}
-
-void
-tracker_parser_reset (TrackerParser *parser,
-                      const gchar   *txt,
-                      gint           txt_size,
-                      guint          max_word_length,
-                      gboolean       enable_stemmer,
-                      gboolean       enable_unaccent,
-                      gboolean       ignore_stop_words,
-                      gboolean       ignore_reserved_words,
-                      gboolean       ignore_numbers)
-{
-	g_return_if_fail (parser != NULL);
-	g_return_if_fail (txt != NULL);
-
-	parser->max_word_length = max_word_length;
-	parser->enable_stemmer = enable_stemmer;
-	parser->enable_unaccent = enable_unaccent;
-	parser->ignore_stop_words = ignore_stop_words;
-	parser->ignore_reserved_words = ignore_reserved_words;
-	parser->ignore_numbers = ignore_numbers;
-
-	/* Note: We're forcing some unicode characters to behave
-	 * as wordbreakers: e.g, the '.' The main reason for this
-	 * is to enable FTS searches matching file extension. */
-	parser->enable_forced_wordbreaks = TRUE;
-
-	parser->txt_size = txt_size;
-	parser->txt = txt;
-
-	g_free (parser->word);
-	parser->word = NULL;
-
-	parser->word_position = 0;
-
-	parser->cursor = 0;
-
-	g_free (parser->word_break_flags);
-
-	/* Create array of flags, same size as original text. */
-	parser->word_break_flags = g_malloc (txt_size);
-
-	/* Get wordbreak flags in the whole string */
-	u8_wordbreaks ((const uint8_t *)txt,
-	               (size_t) txt_size,
-	               (char *)parser->word_break_flags);
-
-	/* Prepare a custom category which is a combination of the
-	 * desired ones */
-	parser->allowed_start = UC_LETTER;
-	if (!parser->ignore_numbers) {
-		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
-	}
-}
-
-const gchar *
-tracker_parser_next (TrackerParser *parser,
-                     gint          *position,
-                     gint          *byte_offset_start,
-                     gint          *byte_offset_end,
-                     gboolean      *stop_word,
-                     gint          *word_length)
-{
-	const gchar  *str;
-	gint byte_start = 0, byte_end = 0;
-
-	str = NULL;
-
-	g_free (parser->word);
-	parser->word = NULL;
-
-	*stop_word = FALSE;
-
-	if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
-		str = parser->word;
-	}
-
-	if (!*stop_word) {
-		parser->word_position++;
-	}
-
-	*word_length = parser->word_length;
-	*position = parser->word_position;
-	*byte_offset_start = byte_start;
-	*byte_offset_end = byte_end;
-
-	return str;
-}
-
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
deleted file mode 100644
index dec597747..000000000
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301  USA
- */
-
-#include "config.h"
-
-#include <string.h>
-
-#include <libtracker-common/tracker-utils.h>
-
-#include "tracker-parser-utils.h"
-
-/*
- * Definition of the possible reserved words.
- *  Length of word is explicitly given to avoid strlen() calls
- */
-typedef struct {
-	const gchar *word;
-	gsize        word_length;
-} TrackerParserReservedWord;
-
-static const TrackerParserReservedWord reserved_words[] = {
-	{ "or", 2 },
-	{ NULL, 0 }
-};
-
-gboolean
-tracker_parser_is_reserved_word_utf8 (const gchar *word,
-                                      gsize word_length)
-{
-	gint i = 0;
-
-	/* Loop the array of predefined reserved words */
-	while (reserved_words[i].word != NULL) {
-		if (word_length == reserved_words[i].word_length &&
-		    strncmp (word,
-		             reserved_words[i].word,
-		             word_length) == 0) {
-			return TRUE;
-		}
-		i++;
-	}
-
-	return FALSE;
-}
-
-
-#if TRACKER_PARSER_DEBUG_HEX
-void
-tracker_parser_message_hex (const gchar  *message,
-                            const gchar  *str,
-                            gsize         str_length)
-{
-	gchar *hex_aux;
-	gchar *str_aux;
-
-	g_return_if_fail (message);
-	g_return_if_fail (str);
-	g_return_if_fail (str_length != 0);
-
-	/* String may not come NIL-terminated */
-	str_aux = g_malloc (str_length + 1);
-	memcpy (str_aux, str, str_length);
-	str_aux[str_length] = '\0';
-
-	/* Get hexadecimal representation of the input string */
-	hex_aux = tracker_strhex (str, str_length, ':');
-
-	/* Log it */
-	g_message ("%s: '%s' (%s)",
-	           message, str_aux, hex_aux);
-
-	g_free (str_aux);
-	g_free (hex_aux);
-}
-#endif
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
deleted file mode 100644
index 614740f81..000000000
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301  USA
- */
-
-#ifndef __TRACKER_PARSER_UTILS_H__
-#define __TRACKER_PARSER_UTILS_H__
-
-#include "config.h"
-
-#include <glib.h>
-
-#ifdef HAVE_LIBICU
-#include <unicode/utypes.h>
-#endif
-
-G_BEGIN_DECLS
-
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_UCS4(c)      ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
-#define IS_CJK_UCS4(c)        (((c) >= 0x3400 && (c) <= 0x4DB5)  ||	\
-                               ((c) >= 0x4E00 && (c) <= 0x9FA5)  ||	\
-                               ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
-/* ASCII undescore? */
-#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
-
-/* Combining diacritical mark?
- * Basic range: [0x0300,0x036F]
- * Supplement:  [0x1DC0,0x1DFF]
- * For Symbols: [0x20D0,0x20FF]
- * Half marks:  [0xFE20,0xFE2F]
- */
-#define IS_CDM_UCS4(c)        (((c) >= 0x0300 && (c) <= 0x036F)  ||	\
-                               ((c) >= 0x1DC0 && (c) <= 0x1DFF)  ||	\
-                               ((c) >= 0x20D0 && (c) <= 0x20FF)  ||	\
-                               ((c) >= 0xFE20 && (c) <= 0xFE2F))
-
-/* Forced word breaks in Unicode parsers.
- * If any of these is found INSIDE a properly delimited Unicode word, a new word
- * break is forced and the Unicode word is split in two words.
- * Current forced wordbreaks:
- *   - 0x002E: DOT ('.')
- */
-#define IS_FORCED_WORDBREAK_UCS4(c) ((c) == 0x002E)
-
-
-gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
-                                               gsize word_length);
-
-
-/* Define to 1 if you want to enable debugging logs showing HEX contents
- * of the words being parsed */
-#define TRACKER_PARSER_DEBUG_HEX 0
-
-#if TRACKER_PARSER_DEBUG_HEX
-void    tracker_parser_message_hex (const gchar  *message,
-                                    const gchar  *str,
-                                    gsize         str_length);
-#else
-#define tracker_parser_message_hex(a,b,c)
-#endif
-
-G_END_DECLS
-
-#endif /* __TRACKER_PARSER_UTILS_H__ */
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
deleted file mode 100644
index e6cb10e06..000000000
--- a/src/libtracker-fts/tracker-parser.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
- * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301  USA
- */
-
-#ifndef __LIBTRACKER_FTS_PARSER_H__
-#define __LIBTRACKER_FTS_PARSER_H__
-
-#include <glib.h>
-
-#include <libtracker-common/tracker-language.h>
-
-G_BEGIN_DECLS
-
-typedef struct TrackerParser TrackerParser;
-
-TrackerParser *tracker_parser_new             (TrackerLanguage *language);
-
-void           tracker_parser_reset           (TrackerParser   *parser,
-                                               const gchar     *txt,
-                                               gint             txt_size,
-                                               guint            max_word_length,
-                                               gboolean         enable_stemmer,
-                                               gboolean         enable_unaccent,
-                                               gboolean         ignore_stop_words,
-                                               gboolean         ignore_reserved_words,
-                                               gboolean         ignore_numbers);
-
-const gchar *  tracker_parser_next            (TrackerParser   *parser,
-                                               gint            *position,
-                                               gint            *byte_offset_start,
-                                               gint            *byte_offset_end,
-                                               gboolean        *stop_word,
-                                               gint            *word_length);
-
-void           tracker_parser_free            (TrackerParser   *parser);
-
-/* Other helper methods */
-
-gboolean       tracker_parser_unaccent_nfkd_string (gpointer  str,
-                                                    gsize    *str_length);
-
-G_END_DECLS
-
-#endif /* __LIBTRACKER_FTS_PARSER_H__ */
author	Philip Van Hoof <philip@codeminded.be>	2014-12-01 14:07:16 +0100
committer	Philip Van Hoof <philip@codeminded.be>	2014-12-01 14:07:16 +0100
commit	5d05f68da10ef5a2c25fba4246120ce525035d51 (patch)
tree	f40db3e46953ef15f0dc7db5a5685711cbd79f18 /src/libtracker-fts
parent	8d64b9783d3aebff2d9259068c402a9e1c1a81e4 (diff)
download	tracker-5d05f68da10ef5a2c25fba4246120ce525035d51.tar.gz