/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* camel-iconv.c * * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com) * * This library is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library. If not, see . * * Authors: Michael Zucchi * Jeffery Stedfast */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #ifdef HAVE_CODESET #include #endif #include "camel-iconv.h" #include "iconv-detect.h" #define cd(x) G_LOCK_DEFINE_STATIC (iconv); struct _iconv_cache_node { struct _iconv_cache *parent; gint busy; iconv_t ip; }; struct _iconv_cache { gchar *conv; GQueue open; /* stores iconv_cache_nodes, busy ones up front */ }; #define E_ICONV_CACHE_SIZE (16) static GQueue iconv_cache_list = G_QUEUE_INIT; static GHashTable *iconv_cache; static GHashTable *iconv_cache_open; static GHashTable *iconv_charsets = NULL; static gchar *locale_charset = NULL; static gchar *locale_lang = NULL; struct { const gchar *charset; const gchar *iconv_name; } known_iconv_charsets[] = { #if 0 /* charset name, iconv-friendly charset name */ { "iso-8859-1", "iso-8859-1" }, { "iso8859-1", "iso-8859-1" }, /* the above mostly serves as an example for iso-style charsets, * but we have code that will populate the iso-*'s if/when they * show up in camel_iconv_charset_name () so I'm * not going to bother putting them all in here... */ { "windows-cp1251", "cp1251" }, { "windows-1251", "cp1251" }, { "cp1251", "cp1251" }, /* the above mostly serves as an example for windows-style * charsets, but we have code that will parse and convert them * to their cp#### equivalents if/when they show up in * camel_iconv_charset_name () so I'm not going to bother * putting them all in here either... */ #endif /* charset name (lowercase!), iconv-friendly name (sometimes case sensitive) */ { "utf-8", "UTF-8" }, /* 10646 is a special case, its usually UCS-2 big endian */ /* This might need some checking but should be ok for solaris/linux */ { "iso-10646-1", "UCS-2BE" }, { "iso_10646-1", "UCS-2BE" }, { "iso10646-1", "UCS-2BE" }, { "iso-10646", "UCS-2BE" }, { "iso_10646", "UCS-2BE" }, { "iso10646", "UCS-2BE" }, { "ks_c_5601-1987", "EUC-KR" }, /* FIXME: Japanese/Korean/Chinese stuff needs checking */ { "euckr-0", "EUC-KR" }, { "5601", "EUC-KR" }, { "zh_TW-euc", "EUC-TW" }, { "zh_CN.euc", "gb18030" }, { "zh_TW-big5", "BIG5" }, { "euc-cn", "gb18030" }, { "big5-0", "BIG5" }, { "big5.eten-0", "BIG5" }, { "big5hkscs-0", "BIG5HKSCS" }, { "gb2312-0", "gb18030" }, { "gb2312.1980-0", "gb18030" }, { "gb-2312", "gb18030" }, { "gb2312", "gb18030" }, { "gb18030-0", "gb18030" }, { "gbk-0", "GBK" }, { "eucjp-0", "eucJP" }, { "ujis-0", "ujis" }, { "jisx0208.1983-0","SJIS" }, { "jisx0212.1990-0","SJIS" }, { "pck", "SJIS" }, { NULL, NULL } }; static const gchar * e_strdown (gchar *str) { register gchar *s = str; while (*s) { if (*s >= 'A' && *s <= 'Z') *s += 0x20; s++; } return str; } static const gchar * e_strup (gchar *str) { register gchar *s = str; while (*s) { if (*s >= 'a' && *s <= 'z') *s -= 0x20; s++; } return str; } static void locale_parse_lang (const gchar *locale) { gchar *codeset, *lang; if ((codeset = strchr (locale, '.'))) lang = g_strndup (locale, codeset - locale); else lang = g_strdup (locale); /* validate the language */ if (strlen (lang) >= 2) { if (lang[2] == '-' || lang[2] == '_') { /* canonicalise the lang */ e_strdown (lang); /* validate the country code */ if (strlen (lang + 3) > 2) { /* invalid country code */ lang[2] = '\0'; } else { lang[2] = '-'; e_strup (lang + 3); } } else if (lang[2] != '\0') { /* invalid language */ g_free (lang); lang = NULL; } locale_lang = lang; } else { /* invalid language */ locale_lang = NULL; g_free (lang); } } /* NOTE: Owns the lock on return if keep is TRUE !*/ static void iconv_init (gint keep) { gchar *from, *to, *locale; gint i; G_LOCK (iconv); if (iconv_charsets != NULL) { if (!keep) G_UNLOCK (iconv); return; } iconv_charsets = g_hash_table_new (g_str_hash, g_str_equal); for (i = 0; known_iconv_charsets[i].charset != NULL; i++) { from = g_strdup (known_iconv_charsets[i].charset); to = g_strdup (known_iconv_charsets[i].iconv_name); e_strdown (from); g_hash_table_insert (iconv_charsets, from, to); } iconv_cache = g_hash_table_new (g_str_hash, g_str_equal); iconv_cache_open = g_hash_table_new (NULL, NULL); #ifndef G_OS_WIN32 locale = setlocale (LC_ALL, NULL); #else locale = g_win32_getlocale (); #endif if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) { /* The locale "C" or "POSIX" is a portable locale; its * LC_CTYPE part corresponds to the 7-bit ASCII character * set. */ locale_charset = NULL; locale_lang = NULL; } else { #ifdef G_OS_WIN32 g_get_charset (&locale_charset); locale_charset = g_strdup (locale_charset); e_strdown (locale_charset); #else #ifdef HAVE_CODESET locale_charset = g_strdup (nl_langinfo (CODESET)); e_strdown (locale_charset); #else /* A locale name is typically of the form language[_terri- * tory][.codeset][@modifier], where language is an ISO 639 * language code, territory is an ISO 3166 country code, and * codeset is a character set or encoding identifier like * ISO-8859-1 or UTF-8. */ gchar *codeset, *p; codeset = strchr (locale, '.'); if (codeset) { codeset++; /* ; is a hack for debian systems and / is a hack for Solaris systems */ for (p = codeset; *p && !strchr ("@;/", *p); p++); locale_charset = g_strndup (codeset, p - codeset); e_strdown (locale_charset); } else { /* charset unknown */ locale_charset = NULL; } #endif #endif /* !G_OS_WIN32 */ /* parse the locale lang */ locale_parse_lang (locale); } #ifdef G_OS_WIN32 g_free (locale); #endif if (!keep) G_UNLOCK (iconv); } const gchar * camel_iconv_charset_name (const gchar *charset) { gchar *name, *ret, *tmp; gsize name_len; if (charset == NULL) return NULL; name_len = strlen (charset) + 1; name = g_alloca (name_len); g_strlcpy (name, charset, name_len); e_strdown (name); iconv_init (TRUE); ret = g_hash_table_lookup (iconv_charsets, name); if (ret != NULL) { G_UNLOCK (iconv); return ret; } /* Unknown, try canonicalise some basic charset types to something that should work */ if (strncmp (name, "iso", 3) == 0) { /* Convert iso-nnnn-n or isonnnn-n or iso_nnnn-n to iso-nnnn-n or isonnnn-n */ gint iso, codepage; gchar *p; tmp = name + 3; if (*tmp == '-' || *tmp == '_') tmp++; iso = strtoul (tmp, &p, 10); if (iso == 10646) { /* they all become ICONV_10646 */ ret = g_strdup (ICONV_10646); } else { tmp = p; if (*tmp == '-' || *tmp == '_') tmp++; codepage = strtoul (tmp, &p, 10); if (p > tmp) { /* codepage is numeric */ #ifdef __aix__ if (codepage == 13) ret = g_strdup ("IBM-921"); else #endif /* __aix__ */ ret = g_strdup_printf (ICONV_ISO_D_FORMAT, iso, codepage); } else { /* codepage is a string - probably iso-2022-jp or something */ ret = g_strdup_printf (ICONV_ISO_S_FORMAT, iso, p); } } } else if (strncmp (name, "windows-", 8) == 0) { /* Convert windows-nnnnn or windows-cpnnnnn to cpnnnn */ tmp = name + 8; if (!strncmp (tmp, "cp", 2)) tmp+=2; ret = g_strdup_printf ("CP%s", tmp); } else if (strncmp (name, "microsoft-", 10) == 0) { /* Convert microsoft-nnnnn or microsoft-cpnnnnn to cpnnnn */ tmp = name + 10; if (!strncmp (tmp, "cp", 2)) tmp+=2; ret = g_strdup_printf ("CP%s", tmp); } else { /* Just assume its ok enough as is, case and all */ ret = g_strdup (charset); } g_hash_table_insert (iconv_charsets, g_strdup (name), ret); G_UNLOCK (iconv); return ret; } static void flush_entry (struct _iconv_cache *ic) { struct _iconv_cache_node *in; while ((in = g_queue_pop_head (&ic->open)) != NULL) { if (in->ip != (iconv_t) - 1) { g_hash_table_remove (iconv_cache_open, in->ip); iconv_close (in->ip); } g_free (in); } g_free (ic->conv); g_free (ic); } /* This should run pretty quick, its called a lot */ iconv_t camel_iconv_open (const gchar *oto, const gchar *ofrom) { const gchar *to, *from; gchar *tofrom; gsize tofrom_len; struct _iconv_cache *ic; struct _iconv_cache_node *in; gint errnosav; iconv_t ip; if (oto == NULL || ofrom == NULL) { errno = EINVAL; return (iconv_t) -1; } to = camel_iconv_charset_name (oto); from = camel_iconv_charset_name (ofrom); tofrom_len = strlen (to) + strlen (from) + 2; tofrom = g_alloca (tofrom_len); g_snprintf (tofrom, tofrom_len, "%s%%%s", to, from); G_LOCK (iconv); ic = g_hash_table_lookup (iconv_cache, tofrom); if (ic) { g_queue_remove (&iconv_cache_list, ic); } else { GList *link; link = g_queue_peek_tail_link (&iconv_cache_list); while (link != NULL && iconv_cache_list.length > E_ICONV_CACHE_SIZE) { GList *prev = g_list_previous (link); ic = (struct _iconv_cache *) link->data; in = g_queue_peek_head (&ic->open); if (in != NULL && !in->busy) { cd (printf ("Flushing iconv converter '%s'\n", ic->conv)); g_queue_delete_link (&iconv_cache_list, link); g_hash_table_remove (iconv_cache, ic->conv); flush_entry (ic); } link = prev; } ic = g_malloc (sizeof (*ic)); g_queue_init (&ic->open); ic->conv = g_strdup (tofrom); g_hash_table_insert (iconv_cache, ic->conv, ic); cd (printf ("Creating iconv converter '%s'\n", ic->conv)); } g_queue_push_head (&iconv_cache_list, ic); /* If we have a free iconv, use it */ in = g_queue_peek_tail (&ic->open); if (in != NULL && !in->busy) { cd (printf ("using existing iconv converter '%s'\n", ic->conv)); ip = in->ip; if (ip != (iconv_t) - 1) { /* work around some broken iconv implementations * that die if the length arguments are NULL */ gsize buggy_iconv_len = 0; gchar *buggy_iconv_buf = NULL; /* resets the converter */ iconv (ip, &buggy_iconv_buf, &buggy_iconv_len, &buggy_iconv_buf, &buggy_iconv_len); in->busy = TRUE; g_queue_remove (&ic->open, in); g_queue_push_head (&ic->open, in); } } else { cd (printf ("creating new iconv converter '%s'\n", ic->conv)); ip = iconv_open (to, from); in = g_malloc (sizeof (*in)); in->ip = ip; in->parent = ic; g_queue_push_head (&ic->open, in); if (ip != (iconv_t) - 1) { g_hash_table_insert (iconv_cache_open, ip, in); in->busy = TRUE; } else { errnosav = errno; g_warning ("Could not open converter for '%s' to '%s' charset", from, to); in->busy = FALSE; errno = errnosav; } } G_UNLOCK (iconv); return ip; } gsize camel_iconv (iconv_t cd, const gchar **inbuf, gsize *inbytesleft, gchar **outbuf, gsize *outbytesleft) { return iconv (cd, (gchar **) inbuf, inbytesleft, outbuf, outbytesleft); } void camel_iconv_close (iconv_t ip) { struct _iconv_cache_node *in; if (ip == (iconv_t) - 1) return; G_LOCK (iconv); in = g_hash_table_lookup (iconv_cache_open, ip); if (in) { cd (printf ("closing iconv converter '%s'\n", in->parent->conv)); g_queue_remove (&in->parent->open, in); in->busy = FALSE; g_queue_push_tail (&in->parent->open, in); } else { g_warning ("trying to close iconv i dont know about: %p", ip); iconv_close (ip); } G_UNLOCK (iconv); } const gchar * camel_iconv_locale_charset (void) { iconv_init (FALSE); return locale_charset; } const gchar * camel_iconv_locale_language (void) { iconv_init (FALSE); return locale_lang; } /* map CJKR charsets to their language code */ /* NOTE: only support charset names that will be returned by * camel_iconv_charset_name() so that we don't have to keep track of all * the aliases too. */ static struct { const gchar *charset; const gchar *lang; } cjkr_lang_map[] = { { "Big5", "zh" }, { "BIG5HKSCS", "zh" }, { "gb2312", "zh" }, { "gb18030", "zh" }, { "gbk", "zh" }, { "euc-tw", "zh" }, { "iso-2022-jp", "ja" }, { "sjis", "ja" }, { "ujis", "ja" }, { "eucJP", "ja" }, { "euc-jp", "ja" }, { "euc-kr", "ko" }, { "koi8-r", "ru" }, { "koi8-u", "uk" } }; const gchar * camel_iconv_charset_language (const gchar *charset) { gint i; if (!charset) return NULL; charset = camel_iconv_charset_name (charset); for (i = 0; i < G_N_ELEMENTS (cjkr_lang_map); i++) { if (!g_ascii_strcasecmp (cjkr_lang_map[i].charset, charset)) return cjkr_lang_map[i].lang; } return NULL; }