diff options
author | Carlos Garnacho <carlosg@gnome.org> | 2017-07-19 23:05:04 +0200 |
---|---|---|
committer | Carlos Garnacho <carlosg@gnome.org> | 2017-07-24 10:19:11 +0200 |
commit | 0dab83673b48e20747f00cabc2bfac7cfc3fab56 (patch) | |
tree | ba8e059060c245af96173e3d6b55ef095975d46b /src/libtracker-data | |
parent | 1f39141f3e0cac24fde66e3a4462d0aff3ad7c31 (diff) | |
download | tracker-0dab83673b48e20747f00cabc2bfac7cfc3fab56.tar.gz |
libtracker-data: Don't rely on spaces as separators on title collation
Skip non-alphanumeric characters both at the beginning of titles and after
the prefix match. At least one non-alphanumeric character is still required
after the prefix match, so that a prefix is never stripped from the beginning
of a longer word (e.g. a prefix "the" must not match the title "Theory").
https://bugzilla.gnome.org/show_bug.cgi?id=785146
Diffstat (limited to 'src/libtracker-data')
-rw-r--r-- | src/libtracker-data/tracker-collation.c | 60 |
1 files changed, 51 insertions, 9 deletions
diff --git a/src/libtracker-data/tracker-collation.c b/src/libtracker-data/tracker-collation.c index d7d2b525d..8bd3e6dab 100644 --- a/src/libtracker-data/tracker-collation.c +++ b/src/libtracker-data/tracker-collation.c @@ -242,6 +242,36 @@ tracker_collation_utf8 (gpointer collator, #endif static gboolean +skip_non_alphanumeric (const gchar **str, + gint *len) +{ + GUnicodeBreakType break_type; + const gchar *remaining = *str, *end = &remaining[*len]; + gboolean found = FALSE, is_alnum; + gunichar unichar; + + do { + unichar = g_utf8_get_char (remaining); + is_alnum = g_unichar_isalnum (unichar); + if (!is_alnum) { + found = TRUE; + remaining = g_utf8_next_char (remaining); + } + } while (!is_alnum && remaining < end); + + /* The string must not be left empty */ + if (remaining == end) + return FALSE; + + if (found) { + *len = end - remaining; + *str = remaining; + } + + return found; +} + +static gboolean check_remove_prefix (const gchar *str, gint len, const gchar *prefix, @@ -249,22 +279,33 @@ check_remove_prefix (const gchar *str, const gchar **str_out, gint *len_out) { - gboolean substituted = FALSE; + const gchar *remaining; gchar *strstart; + gint remaining_len; if (len <= prefix_len) return FALSE; + /* Check that the prefix matches */ strstart = g_utf8_casefold (str, prefix_len); - if (strcmp (strstart, prefix) == 0) { - *str_out = str + prefix_len; - *len_out = len - prefix_len; - substituted = TRUE; + if (strcmp (strstart, prefix) != 0) { + g_free (strstart); + return FALSE; } + /* Check that the following letter is a break + * character. 
+ */ g_free (strstart); + remaining = &str[prefix_len]; + remaining_len = len - prefix_len; - return substituted; + if (!skip_non_alphanumeric (&remaining, &remaining_len)) + return FALSE; + + *len_out = remaining_len; + *str_out = remaining; + return TRUE; } /* Helper function valid for all implementations */ @@ -280,6 +321,9 @@ tracker_collation_utf8_title (gpointer collator, const gchar *res1 = NULL, *res2 = NULL; gint i; + skip_non_alphanumeric ((const gchar **) &str1, &len1); + skip_non_alphanumeric ((const gchar **) &str2, &len2); + /* Translators: this is a '|' (U+007C) separated list of common * title beginnings. Meant to be skipped for sorting purposes, * case doesn't matter. Given English media is quite common, it is @@ -295,8 +339,7 @@ tracker_collation_utf8_title (gpointer collator, gchar *prefix, *str; gint prefix_len; - str = g_strdup_printf ("%s ", title_beginnings[i]); - prefix = g_utf8_casefold (str, -1); + prefix = g_utf8_casefold (title_beginnings[i], -1); prefix_len = strlen (prefix); if (!res1) @@ -306,7 +349,6 @@ tracker_collation_utf8_title (gpointer collator, check_remove_prefix (str2, len2, prefix, prefix_len, &res2, &len2); g_free (prefix); - g_free (str); } if (!res1) |