summaryrefslogtreecommitdiff
path: root/src/libtracker-fts
diff options
context:
space:
mode:
author Aleksander Morgado <aleksander@aleksander.es> 2014-01-15 15:52:07 +0100
committer Aleksander Morgado <aleksander@aleksander.es> 2014-01-21 12:18:39 +0100
commit b3fb86ea9cfc7062240aa8527dbc0a9ac897396e (patch)
tree 00c46ca6f101f3e0341e39fb089c0591454366f9 /src/libtracker-fts
parent 8e00e18156328725c03210edb1a7585922c32984 (diff)
download tracker-b3fb86ea9cfc7062240aa8527dbc0a9ac897396e.tar.gz
libtracker-data: new 'tracker:unaccent' method
https://bugzilla.gnome.org/show_bug.cgi?id=722254 This method allows removing combining diacritical marks (accents) from strings used in SPARQL queries. It expects a single argument, the string to be unaccented. Note that the output string will also be NFKD-normalized. Example: 1) First, insert a new element which has accents in the nie:title. In the example we insert the word 'école', which in UTF-8 NFC looks like "0xC3 0xA9 0x63 0x6F 0x6C 0x65": $ tracker-sparql -u -q " INSERT { <abc> a nie:InformationElement . <abc> nie:title 'école' }" 2) Second, get a hexdump of the result of querying nie:title; we should get the original string in UTF-8 with NFC normalization: $ tracker-sparql -q " SELECT ?title WHERE { <abc> nie:title ?title }" | hexdump 0000000 6552 7573 746c 3a73 200a c320 63a9 6c6f 0000010 0a65 000a 0000013 Or, without the hexdump... $ tracker-sparql -q " SELECT ?title WHERE { <abc> nie:title ?title }" Results: école 3) Last, apply the unaccenting method. The expected string should look like "0x65 0x63 0x6F 0x6C 0x65" (i.e. without the combining diacritical mark): $ tracker-sparql -q " SELECT tracker:unaccent(?title) WHERE { <abc> nie:title ?title }" | hexdump 0000000 6552 7573 746c 3a73 200a 6520 6f63 656c 0000010 0a0a 0000012 Or, without the hexdump... $ tracker-sparql -q " SELECT tracker:unaccent(?title) WHERE { <abc> nie:title ?title }" Results: ecole
Diffstat (limited to 'src/libtracker-fts')
-rw-r--r-- src/libtracker-fts/tracker-parser-libicu.c 29
-rw-r--r-- src/libtracker-fts/tracker-parser-libunistring.c 26
-rw-r--r-- src/libtracker-fts/tracker-parser.h 5
3 files changed, 39 insertions, 21 deletions
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 69f75ed20..b26722c96 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -141,28 +141,35 @@ get_word_info (const UChar *word,
return TRUE;
}
-static gboolean
-parser_unaccent_nfkd_word (UChar *word,
- gsize *word_length)
+/* The input word in this method MUST be normalized in NFKD form,
+ * and given in UChars, where str_length is the number of UChars
+ * (not the number of bytes) */
+gboolean
+tracker_parser_unaccent_nfkd_string (gpointer str,
+ gsize *str_length)
{
- /* The input word in this method MUST be normalized in NFKD form */
+ UChar *word;
+ gsize word_length;
gsize i;
gsize j;
- g_return_val_if_fail (word, FALSE);
- g_return_val_if_fail (word_length, FALSE);
- g_return_val_if_fail (*word_length > 0, FALSE);
+ g_return_val_if_fail (str != NULL, FALSE);
+ g_return_val_if_fail (str_length != NULL, FALSE);
+ g_return_val_if_fail (*str_length > 0, FALSE);
+
+ word = (UChar *)str;
+ word_length = *str_length;
i = 0;
j = 0;
- while (i < *word_length) {
+ while (i < word_length) {
UChar32 unichar;
gint utf16_len; /* given in UChars */
gsize aux_i;
/* Get next character of the word as UCS4 */
aux_i = i;
- U16_NEXT (word, aux_i, *word_length, unichar);
+ U16_NEXT (word, aux_i, word_length, unichar);
utf16_len = aux_i - i;
/* Invalid UTF-16 character or end of original string. */
@@ -195,7 +202,7 @@ parser_unaccent_nfkd_word (UChar *word,
word[j] = (UChar) 0;
/* Set new output length */
- *word_length = j;
+ *str_length = j;
return TRUE;
}
@@ -331,7 +338,7 @@ process_word_uchar (TrackerParser *parser,
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
if (parser->enable_unaccent &&
type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
- parser_unaccent_nfkd_word (normalized_buffer, &new_word_length)) {
+ tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) {
/* Log after unaccenting */
tracker_parser_message_hex (" After UNAC",
(guint8 *) normalized_buffer,
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 1824528f1..9de6e46f7 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -156,21 +156,27 @@ get_word_info (TrackerParser *parser,
return TRUE;
}
-static gboolean
-parser_unaccent_nfkd_word (gchar *word,
- gsize *word_length)
+/* The input word in this method MUST be normalized in NFKD form,
+ * and given in UTF-8, where str_length is the byte-length */
+gboolean
+tracker_parser_unaccent_nfkd_string (gpointer str,
+ gsize *str_length)
{
- /* The input word in this method MUST be normalized in NFKD form */
+ gchar *word;
+ gsize word_length;
gsize i;
gsize j;
- g_return_val_if_fail (word, FALSE);
- g_return_val_if_fail (word_length, FALSE);
- g_return_val_if_fail (*word_length > 0, FALSE);
+ g_return_val_if_fail (str != NULL, FALSE);
+ g_return_val_if_fail (str_length != NULL, FALSE);
+ g_return_val_if_fail (*str_length > 0, FALSE);
+
+ word = (gchar *)str;
+ word_length = *str_length;
i = 0;
j = 0;
- while (i < *word_length) {
+ while (i < word_length) {
ucs4_t unichar;
gint utf8_len;
@@ -207,7 +213,7 @@ parser_unaccent_nfkd_word (gchar *word,
word[j] = '\0';
/* Set new output length */
- *word_length = j;
+ *str_length = j;
return TRUE;
}
@@ -289,7 +295,7 @@ process_word_utf8 (TrackerParser *parser,
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
if (parser->enable_unaccent &&
type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
- parser_unaccent_nfkd_word (normalized, &new_word_length)) {
+ tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
/* Log after UNAC stripping */
tracker_parser_message_hex (" After UNAC stripping",
normalized, new_word_length);
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index cc1239836..e6cb10e06 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -50,6 +50,11 @@ const gchar * tracker_parser_next (TrackerParser *parser,
void tracker_parser_free (TrackerParser *parser);
+/* Other helper methods */
+
+gboolean tracker_parser_unaccent_nfkd_string (gpointer str,
+ gsize *str_length);
+
G_END_DECLS
#endif /* __LIBTRACKER_FTS_PARSER_H__ */