summary | refs | log | tree | commit | diff
path: root/src/libtracker-data
diff options
context:
space:
mode:
author: Carlos Garnacho <carlosg@gnome.org>, 2017-07-19 23:05:04 +0200
committer: Carlos Garnacho <carlosg@gnome.org>, 2017-07-24 10:19:11 +0200
commit: 0dab83673b48e20747f00cabc2bfac7cfc3fab56 (patch)
tree: ba8e059060c245af96173e3d6b55ef095975d46b /src/libtracker-data
parent: 1f39141f3e0cac24fde66e3a4462d0aff3ad7c31 (diff)
download: tracker-0dab83673b48e20747f00cabc2bfac7cfc3fab56.tar.gz
libtracker-data: Don't rely on spaces as separators on title collation
Skip non-alphanumeric characters both at the beginning of titles and after the prefix match. At least one such non-alphanumeric character is required after the prefix match, in order to avoid matching the beginnings of words.
https://bugzilla.gnome.org/show_bug.cgi?id=785146
Diffstat (limited to 'src/libtracker-data')
-rw-r--r-- src/libtracker-data/tracker-collation.c | 60
1 file changed, 51 insertions, 9 deletions
diff --git a/src/libtracker-data/tracker-collation.c b/src/libtracker-data/tracker-collation.c
index d7d2b525d..8bd3e6dab 100644
--- a/src/libtracker-data/tracker-collation.c
+++ b/src/libtracker-data/tracker-collation.c
@@ -242,6 +242,36 @@ tracker_collation_utf8 (gpointer collator,
#endif
static gboolean
+skip_non_alphanumeric (const gchar **str,
+ gint *len)
+{
+ GUnicodeBreakType break_type;
+ const gchar *remaining = *str, *end = &remaining[*len];
+ gboolean found = FALSE, is_alnum;
+ gunichar unichar;
+
+ do {
+ unichar = g_utf8_get_char (remaining);
+ is_alnum = g_unichar_isalnum (unichar);
+ if (!is_alnum) {
+ found = TRUE;
+ remaining = g_utf8_next_char (remaining);
+ }
+ } while (!is_alnum && remaining < end);
+
+ /* The string must not be left empty */
+ if (remaining == end)
+ return FALSE;
+
+ if (found) {
+ *len = end - remaining;
+ *str = remaining;
+ }
+
+ return found;
+}
+
+static gboolean
check_remove_prefix (const gchar *str,
gint len,
const gchar *prefix,
@@ -249,22 +279,33 @@ check_remove_prefix (const gchar *str,
const gchar **str_out,
gint *len_out)
{
- gboolean substituted = FALSE;
+ const gchar *remaining;
gchar *strstart;
+ gint remaining_len;
if (len <= prefix_len)
return FALSE;
+ /* Check that the prefix matches */
strstart = g_utf8_casefold (str, prefix_len);
- if (strcmp (strstart, prefix) == 0) {
- *str_out = str + prefix_len;
- *len_out = len - prefix_len;
- substituted = TRUE;
+ if (strcmp (strstart, prefix) != 0) {
+ g_free (strstart);
+ return FALSE;
}
+ /* Check that the following letter is a break
+ * character.
+ */
g_free (strstart);
+ remaining = &str[prefix_len];
+ remaining_len = len - prefix_len;
- return substituted;
+ if (!skip_non_alphanumeric (&remaining, &remaining_len))
+ return FALSE;
+
+ *len_out = remaining_len;
+ *str_out = remaining;
+ return TRUE;
}
/* Helper function valid for all implementations */
@@ -280,6 +321,9 @@ tracker_collation_utf8_title (gpointer collator,
const gchar *res1 = NULL, *res2 = NULL;
gint i;
+ skip_non_alphanumeric ((const gchar **) &str1, &len1);
+ skip_non_alphanumeric ((const gchar **) &str2, &len2);
+
/* Translators: this is a '|' (U+007C) separated list of common
* title beginnings. Meant to be skipped for sorting purposes,
* case doesn't matter. Given English media is quite common, it is
@@ -295,8 +339,7 @@ tracker_collation_utf8_title (gpointer collator,
gchar *prefix, *str;
gint prefix_len;
- str = g_strdup_printf ("%s ", title_beginnings[i]);
- prefix = g_utf8_casefold (str, -1);
+ prefix = g_utf8_casefold (title_beginnings[i], -1);
prefix_len = strlen (prefix);
if (!res1)
@@ -306,7 +349,6 @@ tracker_collation_utf8_title (gpointer collator,
check_remove_prefix (str2, len2, prefix, prefix_len,
&res2, &len2);
g_free (prefix);
- g_free (str);
}
if (!res1)