diff options
Diffstat (limited to 'src/cr-utils.c')
-rw-r--r-- | src/cr-utils.c | 1492 |
1 files changed, 1492 insertions, 0 deletions
diff --git a/src/cr-utils.c b/src/cr-utils.c new file mode 100644 index 0000000..4c7c00a --- /dev/null +++ b/src/cr-utils.c @@ -0,0 +1,1492 @@ +/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */ + +/* + * This file is part of The Croco Library + * + * Copyright (C) 2002-2003 Dodji Seketeli <dodji at seketeli.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + */ + +/* + *$Id$ + */ + +#include "cr-utils.h" + +/** + *@file: + *Some misc utility functions used + *in the libcroco. + *Note that troughout this file I will + *refer to the CSS SPECIFICATIONS DOCUMENTATION + *written by the w3c guys. You can find that document + *at http://www.w3.org/TR/REC-CSS2/ . + */ + + +/**************************** + *Encoding transformations and + *encoding helpers + ****************************/ + +/* + *Here is the correspondance between the ucs-4 charactere codes + *and there matching utf-8 encoding pattern as dscribed by RFC 2279: + * + *UCS-4 range (hex.) UTF-8 octet sequence (binary) + *------------------ ----------------------------- + *0000 0000-0000 007F 0xxxxxxx + *0000 0080-0000 07FF 110xxxxx 10xxxxxx + *0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + *0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + *0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + *0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 
10xxxxxx
 */



/**
 *Given an utf8 string buffer, calculates
 *the length of this string if it was encoded
 *in ucs4.
 *@param a_in_start a pointer to the beginning of
 *the input utf8 string.
 *@param a_in_end a pointer to the end of the input
 *utf8 string (points to the last byte of the buffer,
 *not one past it).
 *@param a_len out parameter. The calculated length, in characters.
 *It is reset to 0 on entry and only updated on success.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if
 *a parameter is NULL, CR_ENCODING_ERROR if the buffer contains an
 *invalid lead byte, an invalid continuation byte, or a multi-byte
 *sequence that is cut short by the end of the buffer.
 */
enum CRStatus
cr_utils_utf8_str_len_as_ucs4 (const guchar *a_in_start,
                               const guchar *a_in_end,
                               gulong *a_len)
{
        const guchar *byte_ptr = NULL;
        gulong len = 0;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);
        *a_len = 0;

        for (byte_ptr = a_in_start; byte_ptr <= a_in_end; byte_ptr++) {
                gint nb_bytes_2_decode = 0;

                /*
                 *The lead byte alone tells how many bytes
                 *the current character is encoded over.
                 */
                if (*byte_ptr <= 0x7F) {
                        /*0xxx xxxx*/
                        nb_bytes_2_decode = 1;
                } else if ((*byte_ptr & 0xE0) == 0xC0) {
                        /*110x xxxx + 1 continuation byte*/
                        nb_bytes_2_decode = 2;
                } else if ((*byte_ptr & 0xF0) == 0xE0) {
                        /*1110 xxxx + 2 continuation bytes*/
                        nb_bytes_2_decode = 3;
                } else if ((*byte_ptr & 0xF8) == 0xF0) {
                        /*1111 0xxx + 3 continuation bytes*/
                        nb_bytes_2_decode = 4;
                } else if ((*byte_ptr & 0xFC) == 0xF8) {
                        /*1111 10xx + 4 continuation bytes*/
                        nb_bytes_2_decode = 5;
                } else if ((*byte_ptr & 0xFE) == 0xFC) {
                        /*1111 110x + 5 continuation bytes*/
                        nb_bytes_2_decode = 6;
                } else {
                        /*BAD ENCODING*/
                        return CR_ENCODING_ERROR;
                }

                /*
                 *Never read past a_in_end: a sequence announcing
                 *more continuation bytes than what is left in the
                 *buffer is a truncated (hence invalid) sequence.
                 */
                if (nb_bytes_2_decode - 1 > a_in_end - byte_ptr) {
                        return CR_ENCODING_ERROR;
                }

                /*
                 *Walk over the continuation byte(s) (if any).
                 *Each one must match the pattern 10xx xxxx.
                 */
                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        byte_ptr++;

                        if ((*byte_ptr & 0xC0) != 0x80) {
                                return CR_ENCODING_ERROR;
                        }
                }

                len++;
        }

        *a_len = len;

        return CR_OK;
}



/**
 *Given an ucs4 string, this function
 *returns the size (in bytes) this string
 *would have occupied if it was encoded in utf-8.
 *@param a_in_start a pointer to the beginning of the input
 *buffer.
 *@param a_in_end a pointer to the end of the input buffer.
 *@param a_len out parameter. The computed length.
 *@return CR_OK upon successful completion, an error code otherwise.
+ */ +enum CRStatus +cr_utils_ucs4_str_len_as_utf8 (const guint32 *a_in_start, + const guint32 *a_in_end, + gulong *a_len) +{ + gint len = 0 ; + guint32 *char_ptr = NULL ; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR) ; + + for (char_ptr = (guint32*)a_in_start ; + char_ptr <= a_in_end ; + char_ptr ++) + { + if (*char_ptr <= 0x7F) + { + /*the utf-8 char would take 1 byte*/ + len += 1 ; + } + else if (*char_ptr <= 0x7FF) + { + /*the utf-8 char would take 2 bytes*/ + len += 2 ; + } + else if (*char_ptr <= 0xFFFF) + { + len += 3 ; + } + else if (*char_ptr <= 0x1FFFFF) + { + len += 4 ; + } + else if (*char_ptr <= 0x3FFFFFF) + { + len += 5 ; + } + else if (*char_ptr <= 0x7FFFFFFF) + { + len+= 6 ; + } + } + + *a_len = len ; + return CR_OK ; +} + + +/** + *Given an ucsA string, this function + *returns the size (in bytes) this string + *would have occupied if it was encoded in utf-8. + *@param a_in_start a pointer to the beginning of the input + *buffer. + *@param a_in_end a pointer to the end of the input buffer. + *@param a_len out parameter. The computed length. + *@return CR_OK upon successfull completion, an error code otherwise. + */ +enum CRStatus +cr_utils_ucs1_str_len_as_utf8 (const guchar *a_in_start, + const guchar *a_in_end, + gulong *a_len) +{ + gint len = 0 ; + guchar *char_ptr = NULL ; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR) ; + + for (char_ptr = (guchar *)a_in_start ; + char_ptr <= a_in_end ; + char_ptr ++) + { + if (*char_ptr <= 0x7F) + { + /*the utf-8 char would take 1 byte*/ + len += 1 ; + } + else + { + /*the utf-8 char would take 2 bytes*/ + len += 2 ; + } + } + + *a_len = len ; + return CR_OK ; +} + +/** + *Converts an utf8 buffer into an ucs4 buffer. + * + *@param a_in the input utf8 buffer to convert. + *@param a_in_len in/out parameter. The size of the + *input buffer to convert. After return, this parameter contains + *the actual number of bytes consumed. 
+ *@param a_out the output converted ucs4 buffer. Must be allocated by + *the caller. + *@param a_out_len in/out parameter. The size of the output buffer. + *If this size is actually smaller than the real needed size, the function + *just converts what it can and returns a success status. After return, + *this param points to the actual number of characters decoded. + *@return CR_OK upon successfull completion, an error code otherwise. + */ +enum CRStatus +cr_utils_utf8_to_ucs4 (const guchar * a_in, + gulong *a_in_len, + guint32 *a_out, + gulong *a_out_len) +{ + gulong in_len = 0, out_len = 0, in_index = 0, out_index = 0 ; + enum CRStatus status = CR_OK ; + + /* + *to store the final decoded + *unicode char + */ + guint c = 0 ; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, + CR_BAD_PARAM_ERROR) ; + + if (*a_in_len < 1) + { + status = CR_OK ; + goto end ; + } + + in_len = *a_in_len ; + out_len = *a_out_len ; + + for (in_index = 0, out_index = 0 ; + (in_index < in_len) && (out_index < out_len) ; + in_index++, out_index++) + { + gint nb_bytes_2_decode = 0 ; + + if (a_in[in_index] <= 0x7F) + { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = a_in[in_index] ; + nb_bytes_2_decode = 1 ; + + } + else if ((a_in[in_index] & 0xE0) == 0xC0) + { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x1F ; + nb_bytes_2_decode = 2 ; + + } + else if ((a_in[in_index] & 0xF0) == 0xE0) + { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x0F ; + nb_bytes_2_decode = 3 ; + + } + else if ((a_in[in_index] & 0xF8) == 0xF0) + { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x7 ; + nb_bytes_2_decode = 4 ; + + } + else if ((a_in[in_index] & 0xFC) == 0xF8) + { + /* + *up to 26 bits long char + *encoded over 5 bytes. 
+ *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 3 ; + nb_bytes_2_decode = 5 ; + + } + else if ((a_in[in_index] & 0xFE) == 0xFC) + { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 1 ; + nb_bytes_2_decode = 6 ; + + } + else + { + /*BAD ENCODING*/ + goto end ; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + for ( ; + nb_bytes_2_decode > 1 ; + nb_bytes_2_decode --) + { + /*decode the next byte*/ + in_index ++ ; + + /*byte pattern must be: 10xx xxxx*/ + if ((a_in[in_index] & 0xC0) != 0x80) + { + goto end ; + } + + c = (c << 6) | (a_in[in_index] & 0x3F) ; + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + /************************ + *Some security tests + ***********************/ + + /*be sure c is a char*/ + if (c == 0xFFFF || c == 0xFFFE) goto end ; + + /*be sure c is inferior to the max ucs4 char value*/ + if (c > 0x10FFFF) goto end ; + + /* + *c must be less than UTF16 "lower surrogate begin" + *or higher than UTF16 "High surrogate end" + */ + if (c >= 0xD800 && c <= 0xDFFF) goto end ; + + /*Avoid characters that equals zero*/ + if (c == 0) goto end ; + + + a_out[out_index] = c ; + } + + end: + *a_out_len = out_index + 1; + *a_in_len = in_index + 1; + + return status ; +} + + +/** + *Reads a character from an utf8 buffer. + *Actually decode the next character code (unicode character code) + *and returns it. + *@param a_in the starting address of the utf8 buffer. + *@param a_in_len the length of the utf8 buffer. + *@param a_out output parameter. The resulting read char. + *@param a_consumed the number of the bytes consumed to + *decode the returned character code. + *@return CR_OK upon successfull completion, an error code otherwise. 
+ */ +enum CRStatus +cr_utils_read_char_from_utf8_buf (const guchar * a_in, + gulong a_in_len, + guint32 *a_out, gulong *a_consumed) +{ + gulong in_len = 0, in_index = 0, nb_bytes_2_decode = 0 ; + enum CRStatus status = CR_OK ; + + /* + *to store the final decoded + *unicode char + */ + guint32 c = 0 ; + + g_return_val_if_fail (a_in && a_out && a_out + && a_consumed, CR_BAD_PARAM_ERROR) ; + + if (a_in_len < 1) + { + status = CR_OK ; + goto end ; + } + + in_len = a_in_len ; + + if (*a_in <= 0x7F) + { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = *a_in ; + nb_bytes_2_decode = 1 ; + + } + else if ((*a_in & 0xE0) == 0xC0) + { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = *a_in & 0x1F ; + nb_bytes_2_decode = 2 ; + + } + else if ((*a_in & 0xF0) == 0xE0) + { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = *a_in & 0x0F ; + nb_bytes_2_decode = 3 ; + + } + else if ((*a_in & 0xF8) == 0xF0) + { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *a_in & 0x7 ; + nb_bytes_2_decode = 4 ; + + } + else if ((*a_in & 0xFC) == 0xF8) + { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = *a_in & 3 ; + nb_bytes_2_decode = 5 ; + + } + else if ((*a_in & 0xFE) == 0xFC) + { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *a_in & 1 ; + nb_bytes_2_decode = 6 ; + + } + else + { + /*BAD ENCODING*/ + goto end ; + } + + if (nb_bytes_2_decode > a_in_len) + { + status = CR_END_OF_INPUT_ERROR ; + goto end ; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. 
+ */ + for ( in_index = 1 ; + in_index < nb_bytes_2_decode ; + in_index ++) + { + /*byte pattern must be: 10xx xxxx*/ + if ((a_in[in_index] & 0xC0) != 0x80) + { + goto end ; + } + + c = (c << 6) | (a_in[in_index] & 0x3F) ; + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + /************************ + *Some security tests + ***********************/ + + /*be sure c is a char*/ + if (c == 0xFFFF || c == 0xFFFE) goto end ; + + /*be sure c is inferior to the max ucs4 char value*/ + if (c > 0x10FFFF) goto end ; + + /* + *c must be less than UTF16 "lower surrogate begin" + *or higher than UTF16 "High surrogate end" + */ + if (c >= 0xD800 && c <= 0xDFFF) goto end ; + + /*Avoid characters that equals zero*/ + if (c == 0) goto end ; + + *a_out = c ; + + end: + *a_consumed = nb_bytes_2_decode ; + + return status ; +} + + +/** + * + */ +enum CRStatus +cr_utils_utf8_str_len_as_ucs1 (const guchar *a_in_start, + const guchar *a_in_end, + gulong *a_len) +{ + /* + *Note: this function can be made shorter + *but it considers all the cases of the utf8 encoding + *to ease further extensions ... + */ + + guchar *byte_ptr = NULL ; + gint len = 0 ; + + /* + *to store the final decoded + *unicode char + */ + guint c = 0 ; + + g_return_val_if_fail (a_in_start && a_in_end && a_len, + CR_BAD_PARAM_ERROR) ; + *a_len = 0 ; + + for (byte_ptr = (guchar*)a_in_start ; + byte_ptr <= a_in_end ; + byte_ptr++) + { + gint nb_bytes_2_decode = 0 ; + + if (*byte_ptr <= 0x7F) + { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = *byte_ptr ; + nb_bytes_2_decode = 1 ; + + } + else if ((*byte_ptr & 0xE0) == 0xC0) + { + /* + *up to 11 bits long char. 
+ *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = *byte_ptr & 0x1F ; + nb_bytes_2_decode = 2 ; + + } + else if ((*byte_ptr & 0xF0) == 0xE0) + { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 0x0F ; + nb_bytes_2_decode = 3 ; + + } + else if ((*byte_ptr & 0xF8) == 0xF0) + { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 0x7 ; + nb_bytes_2_decode = 4 ; + + } + else if ((*byte_ptr & 0xFC) == 0xF8) + { + /* + *up to 26 bits long char + *encoded over 5 bytes. + *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 3 ; + nb_bytes_2_decode = 5 ; + + } + else if ((*byte_ptr & 0xFE) == 0xFC) + { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = *byte_ptr & 1 ; + nb_bytes_2_decode = 6 ; + + } + else + { + /* + *BAD ENCODING + */ + return CR_ENCODING_ERROR ; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + for ( ; + nb_bytes_2_decode > 1 ; + nb_bytes_2_decode --) + { + /*decode the next byte*/ + byte_ptr ++ ; + + /*byte pattern must be: 10xx xxxx*/ + if ((*byte_ptr & 0xC0) != 0x80) + { + return CR_ENCODING_ERROR ; + } + + c = (c << 6) | (*byte_ptr & 0x3F) ; + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + if (c <= 0xFF) {/*Add other conditions to support + *other char sets (ucs2, ucs3, ucs4). + */ + len ++ ; + } else { + /*the char is too long to fit + *into the supposed charset len. + */ + return CR_ENCODING_ERROR ; + } + } + + *a_len = len ; + + return CR_OK ; +} + +/** + *Converts an utf8 string into an ucs4 string. + *@param a_in the input string to convert. + *@param a_in_len in/out parameter. The length of the input + *string. After return, points to the actual number of bytes + *consumed. 
This can be usefull to debug the input stream in case + *of encoding error. + *@param a_out out parameter. Points to the output string. It is allocated + *by this function and must be freed by the caller. + *@param a_out_len out parameter. The length of the output string. + *@return CR_OK upon successfull completion, an error code otherwise. + * + */ +enum CRStatus +cr_utils_utf8_str_to_ucs4 (const guchar * a_in, + gulong *a_in_len, + guint32 **a_out, gulong *a_out_len) +{ + enum CRStatus status = CR_OK ; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, + CR_BAD_PARAM_ERROR) ; + + status = + cr_utils_utf8_str_len_as_ucs4 (a_in, + &a_in[*a_in_len - 1], + a_out_len) ; + + g_return_val_if_fail (status == CR_OK, status) ; + + *a_out = g_malloc0 (*a_out_len * sizeof (guint32)) ; + + status = + cr_utils_utf8_to_ucs4 (a_in, a_in_len, + *a_out, a_out_len) ; + + return status ; +} + +/** + *Converts an ucs4 buffer into an utf8 buffer. + * + *@param a_in the input ucs4 buffer to convert. + *@param a_in_len in/out parameter. The size of the + *input buffer to convert. After return, this parameter contains + *the actual number of characters consumed. + *@param a_out the output converted utf8 buffer. Must be allocated by + *the caller. + *@param a_out_len in/out parameter. The size of the output buffer. + *If this size is actually smaller than the real needed size, the function + *just converts what it can and returns a success status. After return, + *this param points to the actual number of bytes in the buffer. + *@return CR_OK upon successfull completion, an error code otherwise. 
+ */ +enum CRStatus +cr_utils_ucs4_to_utf8 (const guint32 *a_in, + gulong *a_in_len, + guchar *a_out, + gulong *a_out_len) +{ + gulong in_len = 0, in_index = 0, out_index = 0 ; + enum CRStatus status = CR_OK ; + + g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, + CR_BAD_PARAM_ERROR) ; + + if (*a_in_len < 1) + { + status = CR_OK ; + goto end ; + } + + in_len = *a_in_len ; + + for (in_index = 0 ; + in_index < in_len ; + in_index++) + { + /* + *FIXME: return whenever we encounter forbidden char values. + */ + + if (a_in[in_index] <= 0x7F) + { + a_out[out_index] = a_in[in_index] ; + out_index ++ ; + } + else if (a_in[in_index] <= 0x7FF) + { + a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)) ; + a_out[out_index + 1] = (0x80 | (a_in[in_index] & 0x3F)); + out_index += 2 ; + } + else if (a_in[in_index] <= 0xFFFF) + { + a_out[out_index] = (0xE0 | (a_in[in_index] >> 12)) ; + a_out[out_index + 1] = + (0x80 | ((a_in[in_index] >> 6) & 0x3F)) ; + a_out[out_index + 2] = (0x80 | (a_in[in_index] & 0x3F)) ; + out_index += 3 ; + } + else if (a_in[in_index] <= 0x1FFFFF) + { + a_out[out_index] = (0xF0 | (a_in[in_index] >> 18)) ; + a_out[out_index + 1] + = (0x80 | ((a_in[in_index] >> 12) & 0x3F)) ; + a_out[out_index + 2] + = (0x80 | ((a_in[in_index] >> 6) & 0x3F)) ; + a_out[out_index + 3] + = (0x80 | (a_in[in_index] & 0x3F)) ; + out_index += 4 ; + } + else if (a_in[in_index] <= 0x3FFFFFF) + { + a_out[out_index] = (0xF8 | (a_in[in_index] >> 24)) ; + a_out[out_index + 1] = (0x80 | (a_in[in_index] >> 18)) ; + a_out[out_index + 2] + = (0x80 | ((a_in[in_index] >> 12) & 0x3F)) ; + a_out[out_index + 3] + = (0x80 | ((a_in[in_index] >> 6) & 0x3F)) ; + a_out[out_index + 4] + = (0x80 | (a_in[in_index] & 0x3F)) ; + out_index += 5 ; + } + else if (a_in[in_index] <= 0x7FFFFFFF) + { + a_out[out_index] = (0xFC | (a_in[in_index] >> 30)) ; + a_out[out_index + 1] = (0x80 | (a_in[in_index] >> 24)) ; + a_out[out_index + 2] + = (0x80 | ((a_in[in_index] >> 18) & 0x3F)) ; + a_out[out_index + 3] 
+ = (0x80 | ((a_in[in_index] >> 12) & 0x3F)) ; + a_out[out_index + 4] + = (0x80 | ((a_in[in_index] >> 6) & 0x3F)) ; + a_out[out_index + 4] + = (0x80 | (a_in[in_index] & 0x3F)) ; + out_index += 6 ; + } + else + { + status = CR_ENCODING_ERROR ; + goto end ; + } + }/*end for*/ + + end: + *a_in_len = in_index + 1 ; + *a_out_len = out_index + 1 ; + + return status ; +} + + +/** + *Converts an ucs4 string into an utf8 string. + *@param a_in the input string to convert. + *@param a_in_len in/out parameter. The length of the input + *string. After return, points to the actual number of characters + *consumed. This can be usefull to debug the input string in case + *of encoding error. + *@param a_out out parameter. Points to the output string. It is allocated + *by this function and must be freed by the caller. + *@param a_out_len out parameter. The length (in bytes) of the output string. + *@return CR_OK upon successfull completion, an error code otherwise. + */ +enum CRStatus +cr_utils_ucs4_str_to_utf8 (const guint32 *a_in, + gulong *a_in_len, + guchar **a_out, gulong *a_out_len) +{ + enum CRStatus status = CR_OK ; + + g_return_val_if_fail (a_in && a_in_len && a_out + && a_out_len, CR_BAD_PARAM_ERROR) ; + + status = + cr_utils_ucs4_str_len_as_utf8 (a_in, + &a_in[*a_out_len -1], + a_out_len) ; + + g_return_val_if_fail (status == CR_OK, status) ; + + status = + cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len) ; + + return status ; +} + + +/** + *Converts an ucs1 buffer into an utf8 buffer. + *The caller must know the size of the resulting buffer and + *allocate it prior to calling this function. + * + *@param a_in the input ucs1 buffer. + * + *@param a_in_len in/out parameter. The length of the input buffer. + *After return, points to the number of bytes actually consumed even + *in case of encoding error. + * + *@param a_out out parameter. The output utf8 converted buffer. + * + *@param a_out_len in/out parameter. The size of the output buffer. 
+ *If the output buffer size is shorter than the actual needed size, + *this function just convert what it can. + * + *@return CR_OK upon successfull completion, an error code otherwise. + * + */ +enum CRStatus +cr_utils_ucs1_to_utf8 (const guchar *a_in, + gulong *a_in_len, + guchar *a_out, + gulong *a_out_len) +{ + gulong out_index = 0, in_index = 0, in_len = 0, out_len = 0 ; + enum CRStatus status = CR_OK ; + + g_return_val_if_fail (a_in && a_in_len && a_out + && a_out_len, CR_BAD_PARAM_ERROR) ; + + if (*a_in_len < 1) + { + status = CR_OK ; + goto end ; + } + + in_len = *a_in_len ; + out_len = *a_out_len ; + + for (in_index = 0, out_index = 0 ; + (in_index < in_len) && (out_index < out_len) ; + in_index ++) + { + /* + *FIXME: return whenever we encounter forbidden char values. + */ + + if (a_in[in_index] <= 0x7F) + { + a_out[out_index] = a_in[in_index] ; + out_index ++ ; + } + else + { + a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)) ; + a_out[out_index + 1] = (0x80 | (a_in[in_index] & 0x3F)); + out_index += 2 ; + } + }/*end for*/ + + end: + *a_in_len = in_index ; + *a_out_len = out_index ; + + return CR_OK ; +} + + +/** + *Converts an ucs1 string into an utf8 string. + *@param a_in_start the beginning of the input string to convert. + *@param a_in_end the end of the input string to convert. + *@param a_out out parameter. The converted string. + *@param a_out out parameter. The length of the converted string. + *@return CR_OK upon successfull completion, an error code otherwise. 
+ * + */ +enum CRStatus +cr_utils_ucs1_str_to_utf8 (const guchar *a_in, + gulong *a_in_len, + guchar **a_out, + gulong *a_out_len) +{ + gulong in_len = 0, out_len = 0 ; + enum CRStatus status = CR_OK ; + + g_return_val_if_fail (a_in && a_in_len && a_out + && a_out_len, CR_BAD_PARAM_ERROR) ; + + if (*a_in_len < 1) + { + *a_out_len = 0 ; + *a_out = NULL ; + return CR_OK ; + } + + status = + cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len -1], + &out_len) ; + + g_return_val_if_fail (status == CR_OK, status) ; + + in_len = *a_in_len ; + + *a_out = g_malloc0 (out_len) ; + + status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, + *a_out, &out_len) ; + + *a_out_len = out_len ; + + return status ; +} + + +/** + *Converts an utf8 buffer into an ucs1 buffer. + *The caller must know the size of the resulting + *converted buffer, and allocated it prior to calling this + *function. + * + *@param a_in the input utf8 buffer to convert. + * + *@param a_in_len in/out parameter. The size of the input utf8 buffer. + *After return, points to the number of bytes consumed + *by the function even in case of encoding error. + * + *@param a_out out parameter. Points to the resulting buffer. + *Must be allocated by the caller. If the size of a_out is shorter + *than its required size, this function converts what it can and return + *a successfull status. + * + *@param a_out_len in/out parameter. The size of the output buffer. + *After return, points to the number of bytes consumed even in case of + *encoding error. + * + *@return CR_OK upon successfull completion, an error code otherwise. 
+ */ +enum CRStatus +cr_utils_utf8_to_ucs1 (const guchar * a_in, + gulong * a_in_len, + guchar *a_out, + gulong *a_out_len) +{ + gulong in_index = 0, out_index = 0, in_len = 0, out_len = 0 ; + enum CRStatus status = CR_OK ; + + /* + *to store the final decoded + *unicode char + */ + guint32 c = 0 ; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, + CR_BAD_PARAM_ERROR) ; + + if (*a_in_len < 1) + { + status = CR_OK ; + goto end ; + } + + in_len = *a_in_len ; + out_len = *a_out_len ; + + for (in_index = 0 , out_index = 0 ; + (in_index < in_len) && (out_index < out_len) ; + in_index ++, out_index++) + { + gint nb_bytes_2_decode = 0 ; + + if (a_in[in_index] <= 0x7F) + { + /* + *7 bits long char + *encoded over 1 byte: + * 0xxx xxxx + */ + c = a_in[in_index] ; + nb_bytes_2_decode = 1 ; + + } + else if ((a_in[in_index] & 0xE0) == 0xC0) + { + /* + *up to 11 bits long char. + *encoded over 2 bytes: + *110x xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x1F ; + nb_bytes_2_decode = 2 ; + + } + else if ((a_in[in_index] & 0xF0) == 0xE0) + { + /* + *up to 16 bit long char + *encoded over 3 bytes: + *1110 xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x0F ; + nb_bytes_2_decode = 3 ; + + } + else if ((a_in[in_index] & 0xF8) == 0xF0) + { + /* + *up to 21 bits long char + *encoded over 4 bytes: + *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 0x7 ; + nb_bytes_2_decode = 4 ; + + } + else if ((a_in[in_index] & 0xFC) == 0xF8) + { + /* + *up to 26 bits long char + *encoded over 5 bytes. 
+ *1111 10xx 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 3 ; + nb_bytes_2_decode = 5 ; + + } + else if ((a_in[in_index] & 0xFE) == 0xFC) + { + /* + *up to 31 bits long char + *encoded over 6 bytes: + *1111 110x 10xx xxxx 10xx xxxx + *10xx xxxx 10xx xxxx 10xx xxxx + */ + c = a_in[in_index] & 1 ; + nb_bytes_2_decode = 6 ; + + } + else + { + /*BAD ENCODING*/ + status = CR_ENCODING_ERROR ; + goto end ; + } + + /* + *Go and decode the remaining byte(s) + *(if any) to get the current character. + */ + if (in_index + nb_bytes_2_decode - 1 >= in_len) + { + status = CR_OK ; + goto end ; + } + + for ( ; + nb_bytes_2_decode > 1 ; + nb_bytes_2_decode --) + { + /*decode the next byte*/ + in_index ++ ; + + /*byte pattern must be: 10xx xxxx*/ + if ((a_in[in_index] & 0xC0) != 0x80) + { + status = CR_ENCODING_ERROR ; + goto end ; + } + + c = (c << 6) | (a_in[in_index] & 0x3F) ; + } + + /* + *The decoded ucs4 char is now + *in c. + */ + + if (c > 0xFF) + { + status = CR_ENCODING_ERROR ; + goto end ; + } + + a_out[out_index] = c ; + } + + end: + *a_out_len = out_index ; + *a_in_len = in_index ; + + return CR_OK ; +} + + +/** + *Converts an utf8 buffer into an + *ucs1 buffer. + *@param a_in_start the start of the input buffer. + *@param a_in_end the end of the input buffer. + *@param a_out out parameter. The resulting converted ucs4 buffer. + *Must be freed by the caller. + *@param a_out_len out parameter. The length of the converted buffer. + *@return CR_OK upon successfull completion, an error code otherwise. + *Note that out parameters are valid if and only if this function + *returns CR_OK. 
+ */ +enum CRStatus +cr_utils_utf8_str_to_ucs1 (const guchar * a_in, + gulong * a_in_len, + guchar **a_out, + gulong *a_out_len) +{ + enum CRStatus status = CR_OK ; + + g_return_val_if_fail (a_in && a_in_len + && a_out && a_out_len, + CR_BAD_PARAM_ERROR) ; + + if (*a_in_len < 1) + { + *a_out_len = 0 ; + *a_out = NULL ; + return CR_OK ; + } + + status = + cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], + a_out_len) ; + + g_return_val_if_fail (status == CR_OK, status) ; + + *a_out = g_malloc0 (*a_out_len * sizeof (guint32)) ; + + status = + cr_utils_utf8_to_ucs1 (a_in, a_in_len, + *a_out, a_out_len) ; + return status ; +} + + +/***************************************** + *CSS basic types identification utilities + *****************************************/ + + +/** + *Returns TRUE if a_char is a white space as + *defined in the css spec in chap 4.1.1. + * + *white-space ::= ' '| \t|\r|\n|\f + * + *@param a_char the character to test. + *return TRUE if is a white space, false otherwise. + */ +gboolean +cr_utils_is_white_space (guint32 a_char) +{ + switch (a_char) + { + case ' ': + case '\t': + case '\r': + case '\n': + case '\f': + return TRUE ; + break ; + default: + return FALSE ; + } +} + +/** + *Returns true if the character is a newline + *as defined in the css spec in the chap 4.1.1. + * + *nl ::= \n|\r\n|\r|\f + * + *@param a_char the character to test. + *@return TRUE if the character is a newline, FALSE otherwise. 
+ */ +gboolean +cr_utils_is_newline (guint32 a_char) +{ + switch (a_char) + { + case '\n': + case '\r': + case '\f': + return TRUE ; + break; + default: + return FALSE ; + } +} + +/** + *returns TRUE if the char is part of an hexa num char: + *i.e hexa_char ::= [0-9A-F] + */ +gboolean +cr_utils_is_hexa_char (guint32 a_char) +{ + if ((a_char >= '0' && a_char <= '9') + || (a_char >= 'A' && a_char <= 'F')) + { + return TRUE ; + } + return FALSE ; +} + +/** + *Returns true if the character is a nonascii + *character (as defined in the css spec chap 4.1.1): + * + *nonascii ::= [^\0-\177] + * + *@param a_char the character to test. + *@return TRUE if the character is a nonascii char, + *FALSE otherwise. + */ +gboolean +cr_utils_is_nonascii (guint32 a_char) +{ + if (a_char <= 177) + { + return FALSE ; + } + + return TRUE ; +} + +/** + *Dumps a character a_nb times on a file. + *@param a_char the char to dump + *@param a_fp the destination file pointer + *@param a_nb the number of times a_char is to be dumped. + */ +void +cr_utils_dump_n_chars (guchar a_char, FILE *a_fp, glong a_nb) +{ + glong i = 0 ; + + for (i = 0 ; i < a_nb ; i++) + { + fprintf (a_fp, "%c", a_char) ; + } +} + +void +cr_utils_dump_n_chars2 (guchar a_char, + GString *a_string, + glong a_nb) +{ + glong i = 0 ; + + g_return_if_fail (a_string) ; + + for (i = 0 ; i < a_nb ; i++) + { + g_string_append_printf (a_string, "%c", a_char) ; + } +} + +gdouble +cr_utils_n_to_0_dot_n (glong a_n) +{ + gdouble result = a_n ; + + while (ABS (result) > 1) + { + result = result / 10 ; + } + + return result ; +} + +/** + *Duplicates a list of GString instances. + *@return the duplicated list of GString instances or NULL if + *something bad happened. + *@param a_list_of_strings the list of strings to be duplicated. 
+ */ +GList * +cr_dup_glist_of_string (GList *a_list_of_strings) +{ + GList *cur = NULL, *result = NULL ; + + g_return_val_if_fail (a_list_of_strings, NULL) ; + + for (cur = a_list_of_strings ; cur ; cur = cur->next) + { + GString *str = NULL ; + + str = g_string_new_len (((GString *)cur->data)->str, + ((GString *)cur->data)->len) ; + if (str) + result = g_list_append (result, str) ; + } + + return result ; +} |