/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
/*
* Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
*
* This library is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see .
*
* Authors: Michael Zucchi
* Jeffrey Stedfast
* Dan Winship
*/
#ifdef HAVE_CONFIG_H
#include
#endif
#include
#include
#include
#include
#ifdef HAVE_CODESET
#include
#endif
#include
#include
#include "camel-iconv.h"
/*
* if you want to build the charset map, compile this with something like:
* gcc -DBUILD_MAP camel-charset-map.c `pkg-config --cflags --libs glib-2.0`
* (plus any -I/-L/-l flags you need for iconv), then run it as
* ./a.out > camel-charset-map-private.h
*
* Note that the big-endian variant isn't tested...
*
* The tables genereated work like this:
*
* An indirect array for each page of unicode character
* Each array element has an indirect pointer to one of the bytes of
* the generated bitmask.
*/
#ifdef BUILD_MAP
static struct {
gchar *name; /* charset name */
gint multibyte; /* charset type */
guint bit; /* assigned bit */
} tables[] = {
/* These are the 8bit character sets (other than iso-8859-1,
* which is special-cased) which are supported by both other
* mailers and the GNOME environment. Note that the order
* they're listed in is the order they'll be tried in, so put
* the more-popular ones first.
*/
{ "iso-8859-2", 0, 0 }, /* Central/Eastern European */
{ "iso-8859-4", 0, 0 }, /* Baltic */
{ "koi8-r", 0, 0 }, /* Russian */
{ "koi8-u", 0, 0 }, /* Ukranian */
{ "iso-8859-5", 0, 0 }, /* Least-popular Russian encoding */
{ "iso-8859-6", 0, 0 }, /* Arabic */
{ "iso-8859-7", 0, 0 }, /* Greek */
{ "iso-8859-8", 0, 0 }, /* Hebrew; Visual */
{ "iso-8859-9", 0, 0 }, /* Turkish */
{ "iso-8859-13", 0, 0 }, /* Baltic again */
{ "iso-8859-15", 0, 0 }, /* New-and-improved iso-8859-1, but most
* programs that support this support UTF8
*/
{ "windows-1251", 0, 0 }, /* Russian */
/* These are the multibyte character sets which are commonly
* supported by other mail clients. Note: order for multibyte
* charsets does not affect priority unlike the 8bit charsets
* listed above.
*/
{ "iso-2022-jp", 1, 0 }, /* Japanese designed for use over the Net */
{ "Shift-JIS", 1, 0 }, /* Japanese as used by Windows and MacOS systems */
{ "euc-jp", 1, 0 }, /* Japanese traditionally used on Unix systems */
{ "euc-kr", 1, 0 }, /* Korean */
{ "iso-2022-kr", 1, 0 }, /* Korean (less popular than euc-kr) */
{ "gb2312", 1, 0 }, /* Simplified Chinese */
{ "Big5", 1, 0 }, /* Traditional Chinese */
{ "euc-tw", 1, 0 },
{ NULL, 0, 0 }
};
guint encoding_map[256 * 256];
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define UCS "UCS-4BE"
#else
#define UCS "UCS-4LE"
#endif
static guint
block_hash (gconstpointer v)
{
const gchar *p = v;
guint32 h = *p++;
gint i;
for (i = 0; i < 256; i++)
h = (h << 5) - h + *p++;
return h;
}
static gint
block_equal (gconstpointer v1,
gconstpointer v2)
{
return !memcmp (v1, v2, 256);
}
gint main (gint argc, gchar **argv)
{
guchar *block = NULL;
guint bit = 0x01;
GHashTable *table_hash;
gsize inleft, outleft;
gchar *inbuf, *outbuf;
guint32 out[128], c;
gchar in[128];
gint i, j, k;
gint bytes;
iconv_t cd;
/* dont count the terminator */
bytes = (G_N_ELEMENTS (tables) + 7 - 1) / 8;
g_return_val_if_fail (bytes <= 4, -1);
for (i = 0; i < 128; i++)
in[i] = i + 128;
for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
cd = iconv_open (UCS, tables[j].name);
inbuf = in;
inleft = sizeof (in);
outbuf = (gchar *) out;
outleft = sizeof (out);
while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) {
if (errno == EILSEQ) {
inbuf++;
inleft--;
} else {
g_warning (
"iconv (%s->UCS4, ..., %d, ..., %d): %s",
tables[j].name, inleft, outleft,
g_strerror (errno));
exit (1);
}
}
iconv_close (cd);
for (i = 0; i < 128 - outleft / 4; i++) {
encoding_map[i] |= bit;
encoding_map[out[i]] |= bit;
}
tables[j].bit = bit;
bit <<= 1;
}
/* Mutibyte tables */
for (; tables[j].name && tables[j].multibyte; j++) {
cd = iconv_open (tables[j].name, UCS);
if (cd == (iconv_t) -1)
continue;
for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
inbuf = (gchar *) &c;
inleft = sizeof (c);
outbuf = in;
outleft = sizeof (in);
if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (gsize) -1) {
/* this is a legal character in charset table[j].name */
iconv (cd, NULL, NULL, &outbuf, &outleft);
encoding_map[i++] |= bit;
encoding_map[c] |= bit;
} else {
/* reset the iconv descriptor */
iconv (cd, NULL, NULL, NULL, NULL);
}
}
iconv_close (cd);
tables[j].bit = bit;
bit <<= 1;
}
printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
table_hash = g_hash_table_new_full (block_hash, block_equal, g_free, g_free);
for (i = 0; i < 256; i++) {
for (k = 0; k < bytes; k++) {
gchar name[32], *alias;
gint has_bits = FALSE;
if (!block) {
/* we reuse malloc'd blocks that are not added to the
* hash table to avoid unnecessary malloc/free's */
block = g_malloc (256);
}
for (j = 0; j < 256; j++) {
if ((block[j] = (encoding_map[i * 256 + j] >> (k * 8)) & 0xff))
has_bits = TRUE;
}
if (!has_bits)
continue;
g_snprintf (name, sizeof (name), "m%02x%x", i, k);
if ((alias = g_hash_table_lookup (table_hash, block))) {
/* this block is identical to an earlier block, just alias it */
printf ("#define %s %s\n\n", name, alias);
} else {
/* unique block, dump it */
g_hash_table_insert (table_hash, block, g_strdup (name));
printf ("static guchar %s[256] = {\n\t", name);
for (j = 0; j < 256; j++) {
printf ("0x%02x, ", block[j]);
if (((j + 1) & 7) == 0 && j < 255)
printf ("\n\t");
}
printf ("\n};\n\n");
/* force the next loop to malloc a new block */
block = NULL;
}
}
}
g_hash_table_destroy (table_hash);
g_free (block);
printf ("static const struct {\n");
for (k = 0; k < bytes; k++)
printf ("\tconst guchar *bits%d;\n", k);
printf ("} camel_charmap[256] = {\n\t");
for (i = 0; i < 256; i++) {
printf ("{ ");
for (k = 0; k < bytes; k++) {
for (j = 0; j < 256; j++) {
if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
break;
}
if (j < 256)
printf ("m%02x%x, ", i, k);
else
printf ("NULL, ");
}
printf ("}, ");
if (((i + 1) & 3) == 0 && i < 255)
printf ("\n\t");
}
printf ("\n};\n\n");
printf (
"static const struct {\n"
"\tconst gchar *name;\n"
"\tguint bit;\n"
"} camel_charinfo[] = {\n");
for (j = 0; tables[j].name; j++)
printf (
"\t{ \"%s\", 0x%08x },\n",
tables[j].name, tables[j].bit);
printf ("};\n\n");
printf ("#define charset_mask(x) \\\n");
for (k = 0; k < bytes; k++) {
if (k != 0)
printf ("\t| ");
else
printf ("\t");
printf (
"(camel_charmap[(x) >> 8].bits%d ? "
"camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)",
k, k, k * 8);
if (k < bytes - 1)
printf ("\t\\\n");
}
printf ("\n\n");
return 0;
}
#else
#include "camel-charset-map.h"
#include "camel-charset-map-private.h"
#include "camel-utf8.h"
void
camel_charset_init (CamelCharset *c)
{
c->mask = (guint) ~0;
c->level = 0;
}
void
camel_charset_step (CamelCharset *cc,
const gchar *in,
gint len)
{
const guchar *inptr = (const guchar *) in;
const guchar *inend = inptr + len;
register guint mask;
register gint level;
register guint32 c;
mask = cc->mask;
level = cc->level;
/* check what charset a given string will fit in */
while ((c = camel_utf8_getc_limit (&inptr, inend)) != 0xffff) {
if (c < 0xffff) {
mask &= charset_mask (c);
if (c >= 128 && c < 256)
level = MAX (level, 1);
else if (c >= 256)
level = 2;
} else {
mask = 0;
level = 2;
break;
}
}
cc->mask = mask;
cc->level = level;
}
/* gets the best charset from the mask of chars in it */
static const gchar *
camel_charset_best_mask (guint mask)
{
const gchar *locale_lang, *lang;
gint i;
locale_lang = camel_iconv_locale_language ();
for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) {
if (camel_charinfo[i].bit & mask) {
lang = camel_iconv_charset_language (camel_charinfo[i].name);
if (!locale_lang || (lang && !strncmp (locale_lang, lang, 2)))
return camel_charinfo[i].name;
}
}
return "UTF-8";
}
const gchar *
camel_charset_best_name (CamelCharset *charset)
{
if (charset->level == 1)
return "ISO-8859-1";
else if (charset->level == 2)
return camel_charset_best_mask (charset->mask);
else
return NULL;
}
/* finds the minimum charset for this string NULL means US-ASCII */
const gchar *
camel_charset_best (const gchar *in,
gint len)
{
CamelCharset charset;
camel_charset_init (&charset);
camel_charset_step (&charset, in, len);
return camel_charset_best_name (&charset);
}
/**
* camel_charset_iso_to_windows:
* @isocharset: a canonicalised ISO charset
*
* Returns: the equivalent Windows charset.
**/
const gchar *
camel_charset_iso_to_windows (const gchar *isocharset)
{
/* According to http://czyborra.com/charsets/codepages.html,
* the charset mapping is as follows:
*
* us-ascii maps to windows-cp1252
* iso-8859-1 maps to windows-cp1252
* iso-8859-2 maps to windows-cp1250
* iso-8859-3 maps to windows-cp????
* iso-8859-4 maps to windows-cp????
* iso-8859-5 maps to windows-cp1251
* iso-8859-6 maps to windows-cp1256
* iso-8859-7 maps to windows-cp1253
* iso-8859-8 maps to windows-cp1255
* iso-8859-9 maps to windows-cp1254
* iso-8859-10 maps to windows-cp????
* iso-8859-11 maps to windows-cp????
* iso-8859-12 maps to windows-cp????
* iso-8859-13 maps to windows-cp1257
*
* Assumptions:
* - I'm going to assume that since iso-8859-4 and
* iso-8859-13 are Baltic that it also maps to
* windows-cp1257.
*/
if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii"))
return "windows-cp1252";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2"))
return "windows-cp1250";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4"))
return "windows-cp1257";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5"))
return "windows-cp1251";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6"))
return "windows-cp1256";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7"))
return "windows-cp1253";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8"))
return "windows-cp1255";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9"))
return "windows-cp1254";
else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13"))
return "windows-cp1257";
return isocharset;
}
#endif /* BUILD_MAP */