/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
*
* This library is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Authors: Michael Zucchi
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <string.h>

#include <glib.h>

#include "camel-utf8.h"
/**
 * camel_utf8_putc:
 * @ptr: address of the write cursor; on return it points just past the
 *       octets that were written
 * @c: the character value to encode
 *
 * Writes @c at *@ptr as UTF-8: one octet for values up to 0x7f, two up to
 * 0x7ff, three up to 0xffff and four for anything larger. The caller must
 * make room for up to 4 octets. No NUL terminator is written.
 **/
void
camel_utf8_putc (guchar **ptr,
                 guint32 c)
{
	guchar *dest = *ptr;

	if (c > 0xffff) {
		/* see unicode standard 3.0, S 3.8, max 4 octets */
		dest[0] = 0xf0 | (c >> 18);
		dest[1] = 0x80 | ((c >> 12) & 0x3f);
		dest[2] = 0x80 | ((c >> 6) & 0x3f);
		dest[3] = 0x80 | (c & 0x3f);
		dest += 4;
	} else if (c > 0x7ff) {
		dest[0] = 0xe0 | (c >> 12);
		dest[1] = 0x80 | ((c >> 6) & 0x3f);
		dest[2] = 0x80 | (c & 0x3f);
		dest += 3;
	} else if (c > 0x7f) {
		dest[0] = 0xc0 | (c >> 6);
		dest[1] = 0x80 | (c & 0x3f);
		dest += 2;
	} else {
		/* 7 bit values are stored as-is */
		*dest++ = c;
	}

	*ptr = dest;
}
/**
* camel_utf8_getc:
* @ptr:
*
* Get a Unicode character from a utf8 stream. @ptr will be advanced
* to the next character position. Invalid utf8 characters will be
* silently skipped. @ptr should point to a NUL terminated array.
*
* Returns: The next Unicode character. @ptr will be advanced to
* the next character always.
**/
guint32
camel_utf8_getc (const guchar **ptr)
{
register guchar *p = (guchar *) * ptr;
register guchar c, r;
register guint32 v, m;
again:
r = *p++;
loop:
if (r < 0x80) {
*ptr = p;
v = r;
} else if (r < 0xf8) { /* valid start char? (max 4 octets) */
v = r;
m = 0x7f80; /* used to mask out the length bits */
do {
c = *p++;
if ((c & 0xc0) != 0x80) {
r = c;
goto loop;
}
v = (v << 6) | (c & 0x3f);
r <<= 1;
m <<= 5;
} while (r & 0x40);
*ptr = p;
v &= ~m;
} else {
goto again;
}
return v;
}
/**
* camel_utf8_getc_limit:
* @ptr:
* @end: must not be NULL.
*
* Get the next utf8 gchar at @ptr, and return it, advancing @ptr to
* the next character. If @end is reached before a full utf8
* character can be read, then the invalid Unicode gchar 0xffff is
* returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
* advanced.
*
* Returns: The next utf8 char, or 0xffff.
**/
guint32
camel_utf8_getc_limit (const guchar **ptr,
const guchar *end)
{
register guchar *p = (guchar *) * ptr;
register guchar c, r;
register guint32 v = 0xffff, m;
again:
while (p < end) {
r = *p++;
loop:
if (r < 0x80) {
*ptr = p;
return r;
} else if (r < 0xf8) { /* valid start char? (max 4 octets) */
v = r;
m = 0x7f80; /* used to mask out the length bits */
do {
if (p >= end)
return 0xffff;
c = *p++;
if ((c & 0xc0) != 0x80) {
r = c;
goto loop;
}
v = (v << 6) | (c & 0x3f);
r <<= 1;
m <<= 5;
} while (r & 0x40);
*ptr = p;
v &= ~m;
return v;
} else {
goto again;
}
}
return 0xffff;
}
/* Appends the UTF-8 encoding of @c to @out.
 *
 * The octets are appended as a C string, so a @c of 0 appends nothing. */
void
g_string_append_u (GString *out,
                   guint32 c)
{
	/* zero filled, so whatever camel_utf8_putc() writes (at most 4
	 * octets) is already NUL terminated */
	guchar encoded[8] = { 0 };
	guchar *tail = encoded;

	camel_utf8_putc (&tail, c);
	g_string_append (out, (const gchar *) encoded);
}
/* Encoding table for the base64 variant used by the utf7 routines in this
 * file: indexed by a 6 bit value, it gives the character to output. The
 * last entry (value 63) is ',' where standard base64 uses '/'. */
static const gchar utf7_alphabet[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* Decoding table, the inverse of utf7_alphabet: indexed by an input octet,
 * it gives that character's 6 bit value, or 0xff when the octet is not part
 * of the alphabet. Each row below covers 16 consecutive octet values. */
static const guchar utf7_rank[256] = {
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};
/* Appends U+FFFD in place of a high surrogate that never received its
 * partner, and clears the pending slot. Does nothing when *@pending is 0. */
static void
utf7_flush_surrogate (GString *out,
                      guint32 *pending)
{
	if (*pending != 0) {
		g_string_append_u (out, 0xfffd);
		*pending = 0;
	}
}

/* Appends one decoded UTF-16 code unit to @out as UTF-8. A high surrogate
 * is parked in *@pending until the following unit shows whether it forms a
 * pair; a completed pair is written as a single character and any unpaired
 * surrogate is written as U+FFFD. */
static void
utf7_append_utf16 (GString *out,
                   guint32 unit,
                   guint32 *pending)
{
	if (*pending != 0 && unit >= 0xdc00 && unit <= 0xdfff) {
		guint32 cp = 0x10000 + ((*pending - 0xd800) << 10) + (unit - 0xdc00);

		*pending = 0;
		g_string_append_u (out, cp);
		return;
	}

	/* whatever was pending did not get a low surrogate */
	utf7_flush_surrogate (out, pending);

	if (unit >= 0xd800 && unit <= 0xdbff)
		*pending = unit;
	else if (unit >= 0xdc00 && unit <= 0xdfff)
		g_string_append_u (out, 0xfffd);
	else
		g_string_append_u (out, unit);
}

/**
 * camel_utf7_utf8:
 * @ptr: NUL terminated string in modified utf7
 *
 * Convert a modified utf7 string to utf8. If the utf7 string
 * contains 8 bit characters, they are treated as iso-8859-1.
 *
 * The IMAP rules [rfc2060] are used in the utf7 encoding.
 *
 * "&-" decodes to a literal '&'. Inside a base64 run every 16 bits form
 * one UTF-16 code unit; surrogate pairs are combined into one character
 * and unpaired surrogates are replaced by U+FFFD. A character outside the
 * base64 alphabet ends the run and is itself copied to the output. An '&'
 * followed by anything other than '-' or a base64 character is written
 * as "&-" and the following character is dropped.
 *
 * Returns: The converted string, newly allocated; free it with g_free().
 **/
gchar *
camel_utf7_utf8 (const gchar *ptr)
{
	const guchar *p = (const guchar *) ptr;
	guint c;
	guint32 v = 0;		/* base64 bit accumulator */
	guint32 x;		/* one complete UTF-16 code unit */
	guint32 high = 0;	/* high surrogate awaiting its partner, or 0 */
	GString *out;
	gint i = 0;		/* number of not yet delivered bits in v */
	gint state = 0;		/* 0: direct text, 1: just saw '&', 2: in base64 */
	gchar *ret;

	out = g_string_new ("");

	/* the terminating NUL is fed through the state machine as well, so
	 * that a dangling '&' or base64 run is closed off */
	do {
		c = *p++;
		switch (state) {
		case 0:
			if (c == '&')
				state = 1;
			else
				/* encodes 8 bit input as iso-8859-1; appends
				 * nothing for the terminating NUL */
				g_string_append_u (out, c);
			break;
		case 1:
			if (c == '-') {
				g_string_append_c (out, '&');
				state = 0;
			} else if (utf7_rank[c] != 0xff) {
				v = utf7_rank[c];
				i = 6;
				state = 2;
			} else {
				/* invalid */
				g_string_append (out, "&-");
				state = 0;
			}
			break;
		case 2:
			if (c == '-') {
				utf7_flush_surrogate (out, &high);
				state = 0;
			} else if (utf7_rank[c] != 0xff) {
				v = (v << 6) | utf7_rank[c];
				i += 6;
				if (i >= 16) {
					x = (v >> (i - 16)) & 0xffff;
					utf7_append_utf16 (out, x, &high);
					i -= 16;
				}
			} else {
				/* not base64: the run ends here and the
				 * character is passed through */
				utf7_flush_surrogate (out, &high);
				g_string_append_u (out, c);
				state = 0;
			}
			break;
		}
	} while (c);

	ret = g_strdup (out->str);
	g_string_free (out, TRUE);

	return ret;
}
/* Ends a base64 run in @out. If @i leftover bits remain in the low end of
 * @v they are shifted up into one last, zero padded, 6 bit symbol; then the
 * closing '-' is written. The callers in this file pass @i below 6. */
static void utf7_closeb64 (GString *out, guint32 v, guint32 i)
{
	if (i != 0) {
		guint32 sextet = (v << (6 - i)) & 0x3f;

		g_string_append_c (out, utf7_alphabet[sextet]);
	}

	g_string_append_c (out, '-');
}
/**
 * camel_utf8_utf7:
 * @ptr: NUL terminated utf8 string
 *
 * Convert a utf8 string to a modified utf7 format.
 *
 * The IMAP rules [rfc2060] are used in the utf7 encoding.
 *
 * Characters 0x20 to 0x7e are copied directly, with '&' written as "&-".
 * Everything else is written as UTF-16 code units in a base64 run that is
 * opened with '&' and closed with '-'. Characters above 0xffff are written
 * as a surrogate pair; values above 0x10ffff, which UTF-16 cannot
 * represent, are written as U+FFFD.
 *
 * Returns: The converted string, newly allocated; free it with g_free().
 **/
gchar *
camel_utf8_utf7 (const gchar *ptr)
{
	const guchar *p = (const guchar *) ptr;
	guint32 c;
	guint32 units[2];	/* UTF-16 code units of the current character */
	gint n_units, u;
	guint32 x, v = 0;	/* v: bit accumulator for the base64 run */
	gint state = 0;		/* 0: direct text, 1: inside a base64 run */
	GString *out;
	gint i = 0;		/* number of not yet written bits in v */
	gchar *ret;

	out = g_string_new ("");

	while ((c = camel_utf8_getc (&p))) {
		if (c >= 0x20 && c <= 0x7e) {
			if (state == 1) {
				utf7_closeb64 (out, v, i);
				state = 0;
				i = 0;
			}
			if (c == '&')
				g_string_append (out, "&-");
			else
				g_string_append_c (out, c);
		} else {
			if (state == 0) {
				g_string_append_c (out, '&');
				state = 1;
			}

			if (c > 0x10ffff)
				c = 0xfffd;

			if (c > 0xffff) {
				c -= 0x10000;
				units[0] = 0xd800 | (c >> 10);
				units[1] = 0xdc00 | (c & 0x3ff);
				n_units = 2;
			} else {
				units[0] = c;
				n_units = 1;
			}

			/* drain after every unit: at most 5 bits are left
			 * over, so 16 more always fit in the accumulator */
			for (u = 0; u < n_units; u++) {
				v = (v << 16) | units[u];
				i += 16;
				while (i >= 6) {
					x = (v >> (i - 6)) & 0x3f;
					g_string_append_c (out, utf7_alphabet[x]);
					i -= 6;
				}
			}
		}
	}

	if (state == 1)
		utf7_closeb64 (out, v, i);

	ret = g_strdup (out->str);
	g_string_free (out, TRUE);

	return ret;
}
/**
 * camel_utf8_ucs2:
 * @pptr: NUL terminated utf8 string
 *
 * Convert a utf8 string into a ucs2 one. The ucs string will be in
 * network byte order, and terminated with a 16 bit NUL.
 *
 * Characters above 0xffff do not fit in a 16 bit unit and are written as
 * U+FFFD rather than being truncated to an unrelated character.
 *
 * Returns: The converted string, newly allocated; free it with g_free().
 **/
gchar *
camel_utf8_ucs2 (const gchar *pptr)
{
	GByteArray *work = g_byte_array_new ();
	guint32 c;
	gchar *out;
	const guchar *ptr = (const guchar *) pptr;

	while ((c = camel_utf8_getc (&ptr))) {
		guint16 s;

		if (c > 0xffff)
			c = 0xfffd;

		s = g_htons (c);
		g_byte_array_append (work, (guchar *) &s, 2);
	}

	g_byte_array_append (work, (guchar *) "\000\000", 2);

	/* hand back a plain buffer holding exactly the bytes collected */
	out = g_malloc (work->len);
	memcpy (out, work->data, work->len);
	g_byte_array_free (work, TRUE);

	return out;
}
/**
* camel_ucs2_utf8:
* @ptr:
*
* Convert a ucs2 string into a utf8 one. The ucs2 string is treated
* as network byte ordered, and terminated with a 16 bit NUL.
*
* Returns:
**/
gchar *camel_ucs2_utf8 (const gchar *ptr)
{
guint16 *ucs = (guint16 *) ptr;
guint32 c;
GString *work = g_string_new ("");
gchar *out;
while ((c = *ucs++))
g_string_append_u (work, g_ntohs (c));
out = g_strdup (work->str);
g_string_free (work, TRUE);
return out;
}
/**
 * camel_utf8_make_valid:
 * @text: the string to sanitise
 *
 * Returns a copy of @text in which every octet that g_utf8_validate()
 * rejects has been overwritten with a question mark, so the result is a
 * valid UTF-8 string of the same length. The returned pointer should be
 * freed with g_free. Whatever g_strdup() yields for a NULL @text is
 * returned unchanged.
 *
 * Since: 2.26
 **/
gchar *
camel_utf8_make_valid (const gchar *text)
{
	gchar *copy = g_strdup (text);
	const gchar *invalid = copy;

	if (!copy)
		return copy;

	/* every failed validation reports the first offending octet; patch
	 * it and carry on validating from that same position */
	while (!g_utf8_validate (invalid, -1, &invalid))
		copy[invalid - copy] = '?';

	return copy;
}