/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2013 Intel Corporation
*
* This library is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Authors: Tristan Van Berkom
*/
/**
* SECTION: e-collator
* @include: libedataserver/libedataserver.h
* @short_description: Collation services for locale sensitive sorting
*
* The #ECollator is a wrapper object around ICU collation services and
* provides features to sort words in locale specific ways. The collator
* also provides some API for determining features of the active alphabet
* in the user's locale, and which words should be sorted under which
* letter in the user's alphabet.
*/
#ifdef HAVE_CONFIG_H
# include
#endif
#include
#include
/* ICU includes */
#include
#include
#include
#include "e-collator.h"
#include "e-alphabet-index-private.h"
#include "e-transliterator-private.h"
/* Number of UChar units in the on-stack buffer that e_collator_generate_key()
 * hands to convert_to_ustring() for the UTF-8 -> UTF-16 conversion.
 */
#define CONVERT_BUFFER_LEN 512
/* Size in bytes of the on-stack buffer that e_collator_generate_key() first
 * offers to ucol_getSortKey() before falling back to a heap allocation.
 */
#define COLLATION_KEY_BUFFER_LEN 1024
/* Size in bytes of the on-stack buffer that canonicalize_locale() offers
 * to uloc_canonicalize().
 */
#define LOCALE_BUFFER_LEN 256
/* Set to a non-zero value to compile in print_available_locales() and have
 * e_collator_new_interpret_country() call it on every invocation.
 */
#define ENABLE_DEBUGGING 0
/* Error domain quark for this module (GLib expands this into
 * e_collator_error_quark()).
 */
G_DEFINE_QUARK (e-collator-error-quark, e_collator_error)
/* Registers ECollator as a boxed type whose copy operation is
 * e_collator_ref() and whose free operation is e_collator_unref().
 */
G_DEFINE_BOXED_TYPE (ECollator,
e_collator,
e_collator_ref,
e_collator_unref)
/* Private instance data behind the opaque, reference counted #ECollator. */
struct _ECollator
{
/* ICU collator handle; opened with ucol_open() and closed in e_collator_unref() */
UCollator *coll;
/* Reference count; only modified through the g_atomic_int_*() calls in
 * e_collator_ref() and e_collator_unref()
 */
volatile gint ref_count;
/* Alphabetic index helper created for the locale's language code */
EAlphabetIndex *alpha_index;
/* Index labels and their count, as reported by _e_alphabet_index_cxx_get_labels();
 * released with g_strfreev()
 */
gchar **labels;
gint n_labels;
/* Underflow / inflow / overflow index positions, reported together with the labels */
gint underflow;
gint inflow;
gint overflow;
/* Only created when the language code is "zh" ("Han-Latin"); NULL otherwise */
ETransliterator *transliterator;
};
/*****************************************************
* ICU Helper Functions *
*****************************************************/
#if ENABLE_DEBUGGING
/* Debugging aid, only compiled when ENABLE_DEBUGGING is non-zero.
 *
 * Prints every locale ICU knows about to stderr, one per line, together
 * with its display name and the keywords embedded in the locale identifier.
 */
static void
print_available_locales (void)
{
	UErrorCode status = U_ZERO_ERROR;
	UChar result[100];
	gchar printable[100 * 4];
	gint count, i;

	u_init (&status);
	if (U_FAILURE (status)) {
		g_printerr ("Failed to initialize ICU (%s)\n", u_errorName (status));
		return;
	}

	g_printerr ("List of available locales (default locale is: %s)\n", uloc_getDefault ());

	count = uloc_countAvailable ();
	for (i = 0; i < count; i++) {
		const gchar *locale = uloc_getAvailable (i);
		UEnumeration *keywords;
		const gchar *keyword;

		/* ICU functions return immediately when handed a failure status,
		 * so start every lookup from a clean state; otherwise one bad
		 * locale would silence the output for all the following ones.
		 */
		status = U_ZERO_ERROR;

		/* Keep the last slot free so the name can always be terminated,
		 * even when it fills the buffer exactly.
		 */
		uloc_getDisplayName (locale, NULL, result, G_N_ELEMENTS (result) - 1, &status);
		if (U_FAILURE (status))
			result[0] = 0;
		result[G_N_ELEMENTS (result) - 1] = 0;

		/* u_austrncpy() only terminates when the source is shorter than
		 * the limit, so terminate explicitly.
		 */
		u_austrncpy (printable, result, sizeof (printable) - 1);
		printable[sizeof (printable) - 1] = '\0';

		/* print result */
		g_printerr ("\t%s - %s", locale, printable);

		status = U_ZERO_ERROR;
		keywords = uloc_openKeywords (locale, &status);
		if (keywords) {
			UErrorCode kstatus = U_ZERO_ERROR;

			g_printerr ("[");
			while ((keyword = uenum_next (keywords, NULL, &kstatus)) != NULL)
				g_printerr (" %s ", keyword);
			g_printerr ("]");

			uenum_close (keywords);
		}

		g_printerr ("\n");
	}
}
#endif
/* Converts @posix_locale into an ICU locale identifier suitable for
 * ucol_open().
 *
 * @posix_locale: the locale name to interpret
 * @language_code: (optional) return location for a newly allocated copy of
 *    the language code, only set on success
 * @country_code: (optional) return location for a newly allocated copy of
 *    the country code, only set on success
 * @error: return location for an E_COLLATOR_ERROR_INVALID_LOCALE error
 *
 * For the "de" and "fi" languages the "phonebook" collation keyword is
 * added to the returned identifier.
 *
 * Returns: a newly allocated ICU locale identifier, or NULL with @error set.
 */
static gchar *
canonicalize_locale (const gchar *posix_locale,
                     gchar **language_code,
                     gchar **country_code,
                     GError **error)
{
	UErrorCode status = U_ZERO_ERROR;
	gchar locale_buffer[LOCALE_BUFFER_LEN];
	/* One spare, zeroed byte each: ICU only warns (it does not fail) when
	 * the result fills the offered capacity exactly, leaving it unterminated.
	 */
	gchar language_buffer[8 + 1] = { 0 };
	gchar country_buffer[8 + 1] = { 0 };
	gchar *icu_locale = NULL;
	gchar *final_locale;
	gint len;
	const gchar *collation_type = NULL;

	len = uloc_canonicalize (posix_locale, locale_buffer, LOCALE_BUFFER_LEN, &status);

	if (status == U_BUFFER_OVERFLOW_ERROR) {
		/* The stack buffer was too small; 'len' holds the required
		 * length, so retry in a heap buffer with room for the terminator.
		 */
		status = U_ZERO_ERROR;
		icu_locale = g_malloc (len + 1);
		uloc_canonicalize (posix_locale, icu_locale, len + 1, &status);
		icu_locale[len] = '\0';
	} else if (!U_FAILURE (status)) {
		/* g_strndup() terminates even if ICU could not */
		icu_locale = g_strndup (locale_buffer, len);
	}

	if (U_FAILURE (status)) {
		g_set_error (
			error, E_COLLATOR_ERROR,
			E_COLLATOR_ERROR_INVALID_LOCALE,
			"Failed to interpret locale '%s' (%s)",
			posix_locale,
			u_errorName (status));
		g_free (icu_locale);
		return NULL;
	}

	status = U_ZERO_ERROR;
	uloc_getLanguage (icu_locale, language_buffer, sizeof (language_buffer) - 1, &status);
	if (U_FAILURE (status)) {
		g_set_error (
			error, E_COLLATOR_ERROR,
			E_COLLATOR_ERROR_INVALID_LOCALE,
			"Failed to interpret language for locale '%s': %s",
			icu_locale,
			u_errorName (status));
		g_free (icu_locale);
		return NULL;
	}

	status = U_ZERO_ERROR;
	uloc_getCountry (icu_locale, country_buffer, sizeof (country_buffer) - 1, &status);
	if (U_FAILURE (status)) {
		g_set_error (
			error, E_COLLATOR_ERROR,
			E_COLLATOR_ERROR_INVALID_LOCALE,
			"Failed to interpret country for locale '%s': %s",
			icu_locale,
			u_errorName (status));
		g_free (icu_locale);
		return NULL;
	}

	/* Add 'phonebook' tailoring to certain locales; both buffers are
	 * guaranteed to be terminated at this point so strcmp() is safe.
	 */
	if (strcmp (language_buffer, "de") == 0 ||
	    strcmp (language_buffer, "fi") == 0)
		collation_type = "phonebook";

	if (collation_type != NULL) {
		/* In an ICU locale identifier the keyword section starts with
		 * '@' and further keywords are separated with ';'. Canonicalizing
		 * may already have produced a keyword section.
		 */
		const gchar *separator = strchr (icu_locale, '@') != NULL ? ";" : "@";

		final_locale = g_strconcat (
			icu_locale, separator, "collation=", collation_type, NULL);
		g_free (icu_locale);
	} else {
		/* Hand ownership of the canonical name straight to the caller */
		final_locale = icu_locale;
	}

	if (language_code)
		*language_code = g_strdup (language_buffer);

	if (country_code)
		*country_code = g_strdup (country_buffer);

	return final_locale;
}
/* All purpose character encoding function: makes sure that @string is
 * valid UTF-8 (repairing it with e_util_utf8_make_valid() if it is not)
 * and then converts it to UChar units.
 *
 * @string: the NUL-terminated text to convert
 * @buffer: caller provided buffer which is tried first
 * @buffer_len: capacity of @buffer, in UChar units
 * @result_len: return location for the number of UChar units converted
 * @free_me: set to a newly allocated buffer when @buffer was too small;
 *    the caller must g_free() it once done with the result. It is left
 *    untouched otherwise, so the caller should initialize it to NULL.
 * @error: return location for an E_COLLATOR_ERROR_CONVERSION error
 *
 * The result is not guaranteed to be NUL-terminated when it fills @buffer
 * exactly; always use it together with @result_len.
 *
 * Returns: @buffer or *@free_me, whichever holds the converted text,
 *    or NULL with @error set.
 */
static const UChar *
convert_to_ustring (const gchar *string,
                    UChar *buffer,
                    gint buffer_len,
                    gint *result_len,
                    UChar **free_me,
                    GError **error)
{
	UErrorCode status = U_ZERO_ERROR;
	const gchar *source_utf8;
	gchar *alloc_utf8 = NULL;
	gint converted_len = 0;
	UChar *converted_buffer;
	UChar *heap_buffer = NULL;

	/* First make sure we're dealing with utf8 */
	if (g_utf8_validate (string, -1, NULL))
		source_utf8 = string;
	else {
		alloc_utf8 = e_util_utf8_make_valid (string);
		source_utf8 = alloc_utf8;
	}

	/* First pass, try converting to UChar in the given buffer */
	converted_buffer = u_strFromUTF8Lenient (
		buffer,
		buffer_len,
		&converted_len,
		source_utf8,
		-1,
		&status);

	/* Second pass: ICU reports a too small destination as the *failure*
	 * U_BUFFER_OVERFLOW_ERROR, with the required length in converted_len.
	 * Clear the status and convert again into a buffer that is big enough
	 * (plus one unit so that ICU can terminate the result).
	 */
	if (status == U_BUFFER_OVERFLOW_ERROR) {
		gint heap_len = converted_len + 1;

		status = U_ZERO_ERROR;
		heap_buffer = g_new (UChar, heap_len);
		converted_buffer = u_strFromUTF8Lenient (
			heap_buffer,
			heap_len,
			&converted_len,
			source_utf8,
			-1,
			&status);
	}

	if (U_FAILURE (status)) {
		g_free (heap_buffer);
		heap_buffer = NULL;
		converted_buffer = NULL;

		g_set_error (
			error, E_COLLATOR_ERROR,
			E_COLLATOR_ERROR_CONVERSION,
			"Error occured while converting character encoding (%s)",
			u_errorName (status));
	}

	/* Only hand out the heap buffer once the conversion into it succeeded */
	if (heap_buffer != NULL)
		*free_me = heap_buffer;

	*result_len = converted_len;

	g_free (alloc_utf8);

	return converted_buffer;
}
/*****************************************************
* API *
*****************************************************/
/**
 * e_collator_new:
 * @locale: The locale under which to sort
 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
 *
 * Creates a new #ECollator for the given @locale. This is a convenience
 * wrapper around e_collator_new_interpret_country() for callers which
 * have no use for the interpreted country code.
 *
 * The returned collator should be freed with e_collator_unref().
 *
 * Returns: (transfer full): A newly created #ECollator.
 *
 * Since: 3.12
 */
ECollator *
e_collator_new (const gchar *locale,
                GError **error)
{
	ECollator *collator;

	/* Passing NULL discards the interpreted country code */
	collator = e_collator_new_interpret_country (locale, NULL, error);

	return collator;
}
/**
 * e_collator_new_interpret_country:
 * @locale: The locale under which to sort
 * @country_code: (allow-none) (out) (transfer full): A location to store the interpreted country code from @locale
 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
 *
 * Creates a new #ECollator for the given @locale,
 * the returned collator should be freed with e_collator_unref().
 *
 * In addition, the country code is interpreted from the @locale
 * string and, on success, stored to @country_code.
 *
 * Returns: (transfer full): A newly created #ECollator.
 *
 * Since: 3.12
 */
ECollator *
e_collator_new_interpret_country (const gchar *locale,
                                  gchar **country_code,
                                  GError **error)
{
	UErrorCode icu_status = U_ZERO_ERROR;
	UCollator *icu_collator;
	ECollator *self;
	gchar *canonical;
	gchar *language = NULL;
	gchar *country = NULL;

	g_return_val_if_fail (locale && locale[0], NULL);

#if ENABLE_DEBUGGING
	print_available_locales ();
#endif

	/* Translate the given locale name into what ICU understands */
	canonical = canonicalize_locale (locale, &language, &country, error);
	if (canonical == NULL)
		return NULL;

	icu_collator = ucol_open (canonical, &icu_status);

	if (U_FAILURE (icu_status)) {
		g_set_error (
			error, E_COLLATOR_ERROR,
			E_COLLATOR_ERROR_OPEN,
			"Unable to open collator for locale '%s' (%s)",
			canonical,
			u_errorName (icu_status));

		ucol_close (icu_collator);
		g_free (canonical);
		g_free (country);
		g_free (language);

		return NULL;
	}

	/* The canonical name was only needed to open the collator */
	g_free (canonical);
	ucol_setStrength (icu_collator, UCOL_DEFAULT_STRENGTH);

	self = g_slice_new0 (ECollator);
	self->ref_count = 1;
	self->coll = icu_collator;

	/* In Chinese we use transliteration services to sort latin
	 * names interleaved with Chinese names in a latin AlphabeticIndex
	 */
	if (g_strcmp0 (language, "zh") == 0)
		self->transliterator = _e_transliterator_cxx_new ("Han-Latin");

	/* Build the alphabetic index and cache its labels and special positions */
	self->alpha_index = _e_alphabet_index_cxx_new_for_language (language);
	self->labels = _e_alphabet_index_cxx_get_labels (
		self->alpha_index,
		&self->n_labels,
		&self->underflow,
		&self->inflow,
		&self->overflow);

	g_free (language);

	/* Either give the country code to the caller, or drop it */
	if (country_code != NULL)
		*country_code = country;
	else
		g_free (country);

	return self;
}
/**
 * e_collator_ref:
 * @collator: An #ECollator
 *
 * Atomically adds one reference to @collator.
 *
 * Returns: (transfer full): @collator
 *
 * Since: 3.12
 */
ECollator *
e_collator_ref (ECollator *collator)
{
	g_return_val_if_fail (collator != NULL, NULL);

	/* The previous count returned by the atomic add is of no interest */
	(void) g_atomic_int_add (&collator->ref_count, 1);

	return collator;
}
/**
 * e_collator_unref:
 * @collator: An #ECollator
 *
 * Atomically drops one reference from @collator. When the last
 * reference is dropped, the collator and everything it owns is freed.
 *
 * Since: 3.12
 */
void
e_collator_unref (ECollator *collator)
{
	g_return_if_fail (collator != NULL);

	/* Somebody else still holds a reference, nothing to tear down */
	if (!g_atomic_int_dec_and_test (&collator->ref_count))
		return;

	if (collator->coll != NULL)
		ucol_close (collator->coll);

	_e_alphabet_index_cxx_free (collator->alpha_index);
	g_strfreev (collator->labels);

	/* The transliterator is only used for specialized sorting in some locales,
	 * notably Chinese locales
	 */
	if (collator->transliterator != NULL)
		_e_transliterator_cxx_free (collator->transliterator);

	g_slice_free (ECollator, collator);
}
/**
 * e_collator_generate_key:
 * @collator: An #ECollator
 * @str: The string to generate a collation key for
 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
 *
 * Generates a collation key for @str, the result of comparing
 * two collation keys with strcmp() will be the same result
 * of calling e_collator_collate() on the same original strings.
 *
 * The key consists of the alphabetic index of @str, formatted as
 * a zero padded decimal number followed by a dash, followed by
 * the ICU sort key of @str.
 *
 * This function will first ensure that @str is valid UTF-8 encoded.
 *
 * Returns: (transfer full): A collation key for @str, or %NULL on failure with @error set.
 *
 * Since: 3.12
 */
gchar *
e_collator_generate_key (ECollator *collator,
                         const gchar *str,
                         GError **error)
{
	UChar source_buffer[CONVERT_BUFFER_LEN];
	UChar *free_me = NULL;
	const UChar *source;
	gchar stack_buffer[COLLATION_KEY_BUFFER_LEN];
	gchar prefix[16];
	gchar *collation_key;
	gint key_len, prefix_len, source_len = 0;
	gint alphabet_index;
	gchar *translit_str = NULL;
	const gchar *input_str;

	g_return_val_if_fail (collator != NULL, NULL);
	g_return_val_if_fail (str != NULL, NULL);

	/* We may need to perform a conversion before generating the sort key */
	if (collator->transliterator) {
		translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
		input_str = translit_str;
	} else {
		input_str = str;
	}

	source = convert_to_ustring (
		input_str,
		source_buffer,
		CONVERT_BUFFER_LEN,
		&source_len,
		&free_me,
		error);

	if (!source) {
		g_free (translit_str);
		return NULL;
	}

	/* Get the numerical index for this string */
	alphabet_index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);

	/* Format the alphabetic index prefix once and remember how long it
	 * really is, instead of assuming that it always takes 4 characters.
	 * 16 bytes hold any formatted 32-bit integer, the dash and the NUL.
	 */
	prefix_len = snprintf (prefix, sizeof (prefix), "%03d-", alphabet_index);

	/* First try to generate a key in a predefined buffer size */
	key_len = ucol_getSortKey (
		collator->coll, source, source_len,
		(guchar *) stack_buffer, COLLATION_KEY_BUFFER_LEN);

	if (key_len > COLLATION_KEY_BUFFER_LEN) {
		/* Stack buffer wasn't large enough, regenerate into a new buffer
		 * holding the prefix, the sort key and one extra byte for a
		 * trailing null character.
		 */
		collation_key = g_malloc (prefix_len + key_len + 1);

		/* Copy the whole prefix, dash included. The size argument of
		 * snprintf() counts the terminating null character; that null
		 * lands on collation_key[prefix_len] and is overwritten by the
		 * first byte of the sort key right below.
		 */
		snprintf (collation_key, prefix_len + 1, "%s", prefix);

		/* Get the sort key and put it right behind the prefix */
		ucol_getSortKey (
			collator->coll, source, source_len,
			(guchar *) (collation_key + prefix_len), key_len);

		/* Just being paranoid, make sure we're null terminated since the API
		 * doesn't specify if the result length is null character inclusive
		 */
		collation_key[prefix_len + key_len] = '\0';
	} else {
		GString *string = g_string_new (prefix);

		/* Insert the sort key from the stack buffer right behind the prefix */
		g_string_insert_len (string, prefix_len, stack_buffer, key_len);

		collation_key = g_string_free (string, FALSE);
	}

	g_free (free_me);
	g_free (translit_str);

	return collation_key;
}
/**
 * e_collator_generate_key_for_index:
 * @collator: An #ECollator
 * @index: An index into the alphabetic labels
 *
 * Generates a sort key for the given alphabetic @index.
 *
 * The generated sort key is guaranteed to sort below
 * any sort keys for words beginning with any variant of
 * the given letter.
 *
 * For instance, a sort key generated for the index 5 of
 * a latin alphabet, where the fifth index is 'E' will sort
 * below any sort keys generated for words starting with
 * the characters 'e', 'E', 'é', 'É', 'è' or 'È'. It will also
 * sort above any sort keys generated for words starting with
 * the characters 'd' or 'D'.
 *
 * Returns: (transfer full): A sort key for the given index
 *
 * Since: 3.12
 */
gchar *
e_collator_generate_key_for_index (ECollator *collator,
                                   gint index)
{
	gchar *index_key;

	g_return_val_if_fail (collator != NULL, NULL);
	g_return_val_if_fail (index >= 0 && index < collator->n_labels, NULL);

	/* Only the zero padded index, without the dash and sort key
	 * that e_collator_generate_key() appends
	 */
	index_key = g_strdup_printf ("%03d", index);

	return index_key;
}
/**
 * e_collator_collate:
 * @collator: An #ECollator
 * @str_a: (allow-none): A string to compare
 * @str_b: (allow-none): The string to compare with @str_a
 * @result: (out): A location to store the comparison result
 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
 *
 * Compares @str_a with @str_b, the order of strings is determined by the parameters of @collator.
 *
 * The @result will be set to integer less than, equal to, or greater than zero if @str_a is found,
 * respectively, to be less than, to match, or be greater than @str_b.
 *
 * Either @str_a or @str_b can be %NULL, %NULL strings are considered to sort below other strings.
 *
 * This function will first ensure that both strings are valid UTF-8.
 *
 * Returns: %TRUE on success, otherwise if %FALSE is returned then @error will be set.
 * %FALSE is also returned, without setting @error, if @collator or @result is %NULL.
 *
 * Since: 3.12
 */
gboolean
e_collator_collate (ECollator *collator,
                    const gchar *str_a,
                    const gchar *str_b,
                    gint *result,
                    GError **error)
{
	gchar *sort_key_a, *sort_key_b;

	/* Must fail with FALSE: any non-zero value (such as -1) would read as
	 * success to the caller while *result was never written.
	 */
	g_return_val_if_fail (collator != NULL, FALSE);
	g_return_val_if_fail (result != NULL, FALSE);

	/* g_strcmp0() sorts NULL below any other string */
	if (!str_a || !str_b) {
		*result = g_strcmp0 (str_a, str_b);
		return TRUE;
	}

	sort_key_a = e_collator_generate_key (collator, str_a, error);
	if (!sort_key_a)
		return FALSE;

	sort_key_b = e_collator_generate_key (collator, str_b, error);
	if (!sort_key_b) {
		g_free (sort_key_a);
		return FALSE;
	}

	/* The generated keys are directly comparable as C strings */
	*result = strcmp (sort_key_a, sort_key_b);

	g_free (sort_key_a);
	g_free (sort_key_b);

	return TRUE;
}
/**
 * e_collator_get_index_labels:
 * @collator: An #ECollator
 * @n_labels: (out): The number of labels/indexes available for @collator
 * @underflow: (allow-none) (out): The underflow index, for any words which sort below the active alphabet(s)
 * @inflow: (allow-none) (out): The inflow index, for any words which sort between the active alphabets (if there is more than one)
 * @overflow: (allow-none) (out): The overflow index, for any words which sort above the active alphabet(s)
 *
 * Fetches the displayable labels and index positions for the active alphabet.
 * All values are the ones cached in @collator at creation time.
 *
 * Returns: (array zero-terminated=1) (element-type utf8) (transfer none):
 *   The array of displayable labels for each index in the active alphabet(s).
 *
 * Since: 3.12
 */
const gchar *const *
e_collator_get_index_labels (ECollator *collator,
                             gint *n_labels,
                             gint *underflow,
                             gint *inflow,
                             gint *overflow)
{
	const gchar *const *labels;

	g_return_val_if_fail (collator != NULL, NULL);

	/* Every output location is optional, only fill in those provided */
	if (n_labels != NULL) {
		*n_labels = collator->n_labels;
	}

	if (underflow != NULL) {
		*underflow = collator->underflow;
	}

	if (inflow != NULL) {
		*inflow = collator->inflow;
	}

	if (overflow != NULL) {
		*overflow = collator->overflow;
	}

	/* The array stays owned by the collator */
	labels = (const gchar *const *) collator->labels;

	return labels;
}
/**
 * e_collator_get_index:
 * @collator: An #ECollator
 * @str: A string
 *
 * Checks which index, as determined by e_collator_get_index_labels(),
 * that @str should sort under. If @collator has a transliterator,
 * the lookup is performed on the transliterated form of @str.
 *
 * Returns: The alphabetic index under which @str would sort
 *
 * Since: 3.12
 */
gint
e_collator_get_index (ECollator *collator,
                      const gchar *str)
{
	const gchar *lookup = str;
	gchar *transliterated = NULL;
	gint position;

	g_return_val_if_fail (collator != NULL, -1);
	g_return_val_if_fail (str != NULL, -1);

	/* Some collators look up the transliterated text instead of the original */
	if (collator->transliterator != NULL) {
		transliterated = _e_transliterator_cxx_transliterate (collator->transliterator, str);
		lookup = transliterated;
	}

	position = _e_alphabet_index_cxx_get_index (collator->alpha_index, lookup);

	g_free (transliterated);

	return position;
}