/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */ /* * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com) * * This library is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library. If not, see . * * Authors: Michael Zucchi * Jeffrey Stedfast * Dan Winship */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #ifdef HAVE_CODESET #include #endif #include #include #include "camel-iconv.h" /* * if you want to build the charset map, compile this with something like: * gcc -DBUILD_MAP camel-charset-map.c `pkg-config --cflags --libs glib-2.0` * (plus any -I/-L/-l flags you need for iconv), then run it as * ./a.out > camel-charset-map-private.h * * Note that the big-endian variant isn't tested... * * The tables genereated work like this: * * An indirect array for each page of unicode character * Each array element has an indirect pointer to one of the bytes of * the generated bitmask. */ #ifdef BUILD_MAP static struct { gchar *name; /* charset name */ gint multibyte; /* charset type */ guint bit; /* assigned bit */ } tables[] = { /* These are the 8bit character sets (other than iso-8859-1, * which is special-cased) which are supported by both other * mailers and the GNOME environment. Note that the order * they're listed in is the order they'll be tried in, so put * the more-popular ones first. */ { "iso-8859-2", 0, 0 }, /* Central/Eastern European */ { "iso-8859-4", 0, 0 }, /* Baltic */ { "koi8-r", 0, 0 }, /* Russian */ { "koi8-u", 0, 0 }, /* Ukranian */ { "iso-8859-5", 0, 0 }, /* Least-popular Russian encoding */ { "iso-8859-6", 0, 0 }, /* Arabic */ { "iso-8859-7", 0, 0 }, /* Greek */ { "iso-8859-8", 0, 0 }, /* Hebrew; Visual */ { "iso-8859-9", 0, 0 }, /* Turkish */ { "iso-8859-13", 0, 0 }, /* Baltic again */ { "iso-8859-15", 0, 0 }, /* New-and-improved iso-8859-1, but most * programs that support this support UTF8 */ { "windows-1251", 0, 0 }, /* Russian */ /* These are the multibyte character sets which are commonly * supported by other mail clients. Note: order for multibyte * charsets does not affect priority unlike the 8bit charsets * listed above. */ { "iso-2022-jp", 1, 0 }, /* Japanese designed for use over the Net */ { "Shift-JIS", 1, 0 }, /* Japanese as used by Windows and MacOS systems */ { "euc-jp", 1, 0 }, /* Japanese traditionally used on Unix systems */ { "euc-kr", 1, 0 }, /* Korean */ { "iso-2022-kr", 1, 0 }, /* Korean (less popular than euc-kr) */ { "gb2312", 1, 0 }, /* Simplified Chinese */ { "Big5", 1, 0 }, /* Traditional Chinese */ { "euc-tw", 1, 0 }, { NULL, 0, 0 } }; guint encoding_map[256 * 256]; #if G_BYTE_ORDER == G_BIG_ENDIAN #define UCS "UCS-4BE" #else #define UCS "UCS-4LE" #endif static guint block_hash (gconstpointer v) { const gchar *p = v; guint32 h = *p++; gint i; for (i = 0; i < 256; i++) h = (h << 5) - h + *p++; return h; } static gint block_equal (gconstpointer v1, gconstpointer v2) { return !memcmp (v1, v2, 256); } gint main (gint argc, gchar **argv) { guchar *block = NULL; guint bit = 0x01; GHashTable *table_hash; gsize inleft, outleft; gchar *inbuf, *outbuf; guint32 out[128], c; gchar in[128]; gint i, j, k; gint bytes; iconv_t cd; /* dont count the terminator */ bytes = (G_N_ELEMENTS (tables) + 7 - 1) / 8; g_return_val_if_fail (bytes <= 4, -1); for (i = 0; i < 128; i++) in[i] = i + 128; for (j = 0; tables[j].name && !tables[j].multibyte; j++) { cd = iconv_open (UCS, tables[j].name); inbuf = in; inleft = sizeof (in); outbuf = (gchar *) out; outleft = sizeof (out); while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) { if (errno == EILSEQ) { inbuf++; inleft--; } else { g_warning ( "iconv (%s->UCS4, ..., %d, ..., %d): %s", tables[j].name, inleft, outleft, g_strerror (errno)); exit (1); } } iconv_close (cd); for (i = 0; i < 128 - outleft / 4; i++) { encoding_map[i] |= bit; encoding_map[out[i]] |= bit; } tables[j].bit = bit; bit <<= 1; } /* Mutibyte tables */ for (; tables[j].name && tables[j].multibyte; j++) { cd = iconv_open (tables[j].name, UCS); if (cd == (iconv_t) -1) continue; for (c = 128, i = 0; c < 65535 && i < 65535; c++) { inbuf = (gchar *) &c; inleft = sizeof (c); outbuf = in; outleft = sizeof (in); if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (gsize) -1) { /* this is a legal character in charset table[j].name */ iconv (cd, NULL, NULL, &outbuf, &outleft); encoding_map[i++] |= bit; encoding_map[c] |= bit; } else { /* reset the iconv descriptor */ iconv (cd, NULL, NULL, NULL, NULL); } } iconv_close (cd); tables[j].bit = bit; bit <<= 1; } printf ("/* This file is automatically generated: DO NOT EDIT */\n\n"); table_hash = g_hash_table_new_full (block_hash, block_equal, g_free, g_free); for (i = 0; i < 256; i++) { for (k = 0; k < bytes; k++) { gchar name[32], *alias; gint has_bits = FALSE; if (!block) { /* we reuse malloc'd blocks that are not added to the * hash table to avoid unnecessary malloc/free's */ block = g_malloc (256); } for (j = 0; j < 256; j++) { if ((block[j] = (encoding_map[i * 256 + j] >> (k * 8)) & 0xff)) has_bits = TRUE; } if (!has_bits) continue; g_snprintf (name, sizeof (name), "m%02x%x", i, k); if ((alias = g_hash_table_lookup (table_hash, block))) { /* this block is identical to an earlier block, just alias it */ printf ("#define %s %s\n\n", name, alias); } else { /* unique block, dump it */ g_hash_table_insert (table_hash, block, g_strdup (name)); printf ("static guchar %s[256] = {\n\t", name); for (j = 0; j < 256; j++) { printf ("0x%02x, ", block[j]); if (((j + 1) & 7) == 0 && j < 255) printf ("\n\t"); } printf ("\n};\n\n"); /* force the next loop to malloc a new block */ block = NULL; } } } g_hash_table_destroy (table_hash); g_free (block); printf ("static const struct {\n"); for (k = 0; k < bytes; k++) printf ("\tconst guchar *bits%d;\n", k); printf ("} camel_charmap[256] = {\n\t"); for (i = 0; i < 256; i++) { printf ("{ "); for (k = 0; k < bytes; k++) { for (j = 0; j < 256; j++) { if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0) break; } if (j < 256) printf ("m%02x%x, ", i, k); else printf ("NULL, "); } printf ("}, "); if (((i + 1) & 3) == 0 && i < 255) printf ("\n\t"); } printf ("\n};\n\n"); printf ( "static const struct {\n" "\tconst gchar *name;\n" "\tguint bit;\n" "} camel_charinfo[] = {\n"); for (j = 0; tables[j].name; j++) printf ( "\t{ \"%s\", 0x%08x },\n", tables[j].name, tables[j].bit); printf ("};\n\n"); printf ("#define charset_mask(x) \\\n"); for (k = 0; k < bytes; k++) { if (k != 0) printf ("\t| "); else printf ("\t"); printf ( "(camel_charmap[(x) >> 8].bits%d ? " "camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)", k, k, k * 8); if (k < bytes - 1) printf ("\t\\\n"); } printf ("\n\n"); return 0; } #else #include "camel-charset-map.h" #include "camel-charset-map-private.h" #include "camel-utf8.h" void camel_charset_init (CamelCharset *c) { c->mask = (guint) ~0; c->level = 0; } void camel_charset_step (CamelCharset *cc, const gchar *in, gint len) { const guchar *inptr = (const guchar *) in; const guchar *inend = inptr + len; register guint mask; register gint level; register guint32 c; mask = cc->mask; level = cc->level; /* check what charset a given string will fit in */ while ((c = camel_utf8_getc_limit (&inptr, inend)) != 0xffff) { if (c < 0xffff) { mask &= charset_mask (c); if (c >= 128 && c < 256) level = MAX (level, 1); else if (c >= 256) level = 2; } else { mask = 0; level = 2; break; } } cc->mask = mask; cc->level = level; } /* gets the best charset from the mask of chars in it */ static const gchar * camel_charset_best_mask (guint mask) { const gchar *locale_lang, *lang; gint i; locale_lang = camel_iconv_locale_language (); for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) { if (camel_charinfo[i].bit & mask) { lang = camel_iconv_charset_language (camel_charinfo[i].name); if (!locale_lang || (lang && !strncmp (locale_lang, lang, 2))) return camel_charinfo[i].name; } } return "UTF-8"; } const gchar * camel_charset_best_name (CamelCharset *charset) { if (charset->level == 1) return "ISO-8859-1"; else if (charset->level == 2) return camel_charset_best_mask (charset->mask); else return NULL; } /* finds the minimum charset for this string NULL means US-ASCII */ const gchar * camel_charset_best (const gchar *in, gint len) { CamelCharset charset; camel_charset_init (&charset); camel_charset_step (&charset, in, len); return camel_charset_best_name (&charset); } /** * camel_charset_iso_to_windows: * @isocharset: a canonicalised ISO charset * * Returns: the equivalent Windows charset. **/ const gchar * camel_charset_iso_to_windows (const gchar *isocharset) { /* According to http://czyborra.com/charsets/codepages.html, * the charset mapping is as follows: * * us-ascii maps to windows-cp1252 * iso-8859-1 maps to windows-cp1252 * iso-8859-2 maps to windows-cp1250 * iso-8859-3 maps to windows-cp???? * iso-8859-4 maps to windows-cp???? * iso-8859-5 maps to windows-cp1251 * iso-8859-6 maps to windows-cp1256 * iso-8859-7 maps to windows-cp1253 * iso-8859-8 maps to windows-cp1255 * iso-8859-9 maps to windows-cp1254 * iso-8859-10 maps to windows-cp???? * iso-8859-11 maps to windows-cp???? * iso-8859-12 maps to windows-cp???? * iso-8859-13 maps to windows-cp1257 * * Assumptions: * - I'm going to assume that since iso-8859-4 and * iso-8859-13 are Baltic that it also maps to * windows-cp1257. */ if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii")) return "windows-cp1252"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2")) return "windows-cp1250"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4")) return "windows-cp1257"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5")) return "windows-cp1251"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6")) return "windows-cp1256"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7")) return "windows-cp1253"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8")) return "windows-cp1255"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9")) return "windows-cp1254"; else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13")) return "windows-cp1257"; return isocharset; } #endif /* BUILD_MAP */