diff options
author | Aleksander Morgado <aleksander@aleksander.es> | 2014-01-15 15:52:07 +0100 |
---|---|---|
committer | Aleksander Morgado <aleksander@aleksander.es> | 2014-01-21 12:18:39 +0100 |
commit | b3fb86ea9cfc7062240aa8527dbc0a9ac897396e (patch) | |
tree | 00c46ca6f101f3e0341e39fb089c0591454366f9 /src/libtracker-fts | |
parent | 8e00e18156328725c03210edb1a7585922c32984 (diff) | |
download | tracker-b3fb86ea9cfc7062240aa8527dbc0a9ac897396e.tar.gz |
libtracker-data: new 'tracker:unaccent' method
https://bugzilla.gnome.org/show_bug.cgi?id=722254
This method allows removing combining diacritical marks (accents) from strings
used in SPARQL queries. It expects a single argument, the string to be
unaccented.
Note that the output string will also be NFKD-normalized.
Example:
1) First, insert a new element which has accents in the nie:title. In the
example we insert the word 'école' which in UTF-8 NFC looks like
"0xC3 0xA9 0x63 0x6F 0x6C 0x65":
$ tracker-sparql -u -q "
INSERT { <abc> a nie:InformationElement .
<abc> nie:title 'école' }"
2) Second, get a hexdump of the output of querying nie:title; we should get the
original string in UTF-8 with NFC normalization:
$ tracker-sparql -q "
SELECT ?title
WHERE { <abc> nie:title ?title }" | hexdump
0000000 6552 7573 746c 3a73 200a c320 63a9 6c6f
0000010 0a65 000a
0000013
Or, without the hexdump...
$ tracker-sparql -q "
SELECT ?title
WHERE { <abc> nie:title ?title }"
Results:
école
3) Last, apply the unaccenting method. The expected string should look like
"0x65 0x63 0x6F 0x6C 0x65" (i.e. without the combining diacritical mark):
$ tracker-sparql -q "
SELECT tracker:unaccent(?title)
WHERE { <abc> nie:title ?title }" | hexdump
0000000 6552 7573 746c 3a73 200a 6520 6f63 656c
0000010 0a0a
0000012
Or, without the hexdump...
$ tracker-sparql -q "
SELECT tracker:unaccent(?title)
WHERE { <abc> nie:title ?title }"
Results:
ecole
Diffstat (limited to 'src/libtracker-fts')
-rw-r--r-- | src/libtracker-fts/tracker-parser-libicu.c | 29 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser-libunistring.c | 26 | ||||
-rw-r--r-- | src/libtracker-fts/tracker-parser.h | 5 |
3 files changed, 39 insertions, 21 deletions
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c index 69f75ed20..b26722c96 100644 --- a/src/libtracker-fts/tracker-parser-libicu.c +++ b/src/libtracker-fts/tracker-parser-libicu.c @@ -141,28 +141,35 @@ get_word_info (const UChar *word, return TRUE; } -static gboolean -parser_unaccent_nfkd_word (UChar *word, - gsize *word_length) +/* The input word in this method MUST be normalized in NFKD form, + * and given in UChars, where str_length is the number of UChars + * (not the number of bytes) */ +gboolean +tracker_parser_unaccent_nfkd_string (gpointer str, + gsize *str_length) { - /* The input word in this method MUST be normalized in NFKD form */ + UChar *word; + gsize word_length; gsize i; gsize j; - g_return_val_if_fail (word, FALSE); - g_return_val_if_fail (word_length, FALSE); - g_return_val_if_fail (*word_length > 0, FALSE); + g_return_val_if_fail (str != NULL, FALSE); + g_return_val_if_fail (str_length != NULL, FALSE); + g_return_val_if_fail (*str_length > 0, FALSE); + + word = (UChar *)str; + word_length = *str_length; i = 0; j = 0; - while (i < *word_length) { + while (i < word_length) { UChar32 unichar; gint utf16_len; /* given in UChars */ gsize aux_i; /* Get next character of the word as UCS4 */ aux_i = i; - U16_NEXT (word, aux_i, *word_length, unichar); + U16_NEXT (word, aux_i, word_length, unichar); utf16_len = aux_i - i; /* Invalid UTF-16 character or end of original string. */ @@ -195,7 +202,7 @@ parser_unaccent_nfkd_word (UChar *word, word[j] = (UChar) 0; /* Set new output length */ - *word_length = j; + *str_length = j; return TRUE; } @@ -331,7 +338,7 @@ process_word_uchar (TrackerParser *parser, /* UNAC stripping needed? 
(for non-CJK and non-ASCII) */ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && - parser_unaccent_nfkd_word (normalized_buffer, &new_word_length)) { + tracker_parser_unaccent_nfkd_string (normalized_buffer, &new_word_length)) { /* Log after unaccenting */ tracker_parser_message_hex (" After UNAC", (guint8 *) normalized_buffer, diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c index 1824528f1..9de6e46f7 100644 --- a/src/libtracker-fts/tracker-parser-libunistring.c +++ b/src/libtracker-fts/tracker-parser-libunistring.c @@ -156,21 +156,27 @@ get_word_info (TrackerParser *parser, return TRUE; } -static gboolean -parser_unaccent_nfkd_word (gchar *word, - gsize *word_length) +/* The input word in this method MUST be normalized in NFKD form, + * and given in UTF-8, where str_length is the byte-length */ +gboolean +tracker_parser_unaccent_nfkd_string (gpointer str, + gsize *str_length) { - /* The input word in this method MUST be normalized in NFKD form */ + gchar *word; + gsize word_length; gsize i; gsize j; - g_return_val_if_fail (word, FALSE); - g_return_val_if_fail (word_length, FALSE); - g_return_val_if_fail (*word_length > 0, FALSE); + g_return_val_if_fail (str != NULL, FALSE); + g_return_val_if_fail (str_length != NULL, FALSE); + g_return_val_if_fail (*str_length > 0, FALSE); + + word = (gchar *)str; + word_length = *str_length; i = 0; j = 0; - while (i < *word_length) { + while (i < word_length) { ucs4_t unichar; gint utf8_len; @@ -207,7 +213,7 @@ parser_unaccent_nfkd_word (gchar *word, word[j] = '\0'; /* Set new output length */ - *word_length = j; + *str_length = j; return TRUE; } @@ -289,7 +295,7 @@ process_word_utf8 (TrackerParser *parser, /* UNAC stripping needed? 
(for non-CJK and non-ASCII) */ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && - parser_unaccent_nfkd_word (normalized, &new_word_length)) { + tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) { /* Log after UNAC stripping */ tracker_parser_message_hex (" After UNAC stripping", normalized, new_word_length); diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h index cc1239836..e6cb10e06 100644 --- a/src/libtracker-fts/tracker-parser.h +++ b/src/libtracker-fts/tracker-parser.h @@ -50,6 +50,11 @@ const gchar * tracker_parser_next (TrackerParser *parser, void tracker_parser_free (TrackerParser *parser); +/* Other helper methods */ + +gboolean tracker_parser_unaccent_nfkd_string (gpointer str, + gsize *str_length); + G_END_DECLS #endif /* __LIBTRACKER_FTS_PARSER_H__ */ |