diff options
| author | Sterling Hughes <sterling@php.net> | 2003-05-06 19:38:49 +0000 |
|---|---|---|
| committer | Sterling Hughes <sterling@php.net> | 2003-05-06 19:38:49 +0000 |
| commit | a0351b093f43f9c2be5b45d6e08542adb4de419a (patch) | |
| tree | c40e915b27205178d43cd88ee35cebf3d12e9eb1 /bundle/libxml/encoding.c | |
| parent | 33a10b342ec8104133699e796fb77e9c55c8eb38 (diff) | |
| download | php-git-a0351b093f43f9c2be5b45d6e08542adb4de419a.tar.gz | |
Bundle libxml and add compatibility layer
Diffstat (limited to 'bundle/libxml/encoding.c')
| -rw-r--r-- | bundle/libxml/encoding.c | 2340 |
1 files changed, 2340 insertions, 0 deletions
diff --git a/bundle/libxml/encoding.c b/bundle/libxml/encoding.c new file mode 100644 index 0000000000..69d67cd6b9 --- /dev/null +++ b/bundle/libxml/encoding.c @@ -0,0 +1,2340 @@ +/* + * encoding.c : implements the encoding conversion functions needed for XML + * + * Related specs: + * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies + * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau + * [ISO-10646] UTF-8 and UTF-16 in Annexes + * [ISO-8859-1] ISO Latin-1 characters codes. + * [UNICODE] The Unicode Consortium, "The Unicode Standard -- + * Worldwide Character Encoding -- Version 1.0", Addison- + * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is + * described in Unicode Technical Report #4. + * [US-ASCII] Coded Character Set--7-bit American Standard Code for + * Information Interchange, ANSI X3.4-1986. + * + * See Copyright for the status of this software. + * + * daniel@veillard.com + * + * UTF8 string routines from: + * "William M. Brack" <wbrack@mmm.com.hk> + * + * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org> + */ + +#define IN_LIBXML +#include "libxml.h" + +#include <string.h> + +#ifdef HAVE_CTYPE_H +#include <ctype.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif +#ifdef LIBXML_ICONV_ENABLED +#ifdef HAVE_ERRNO_H +#include <errno.h> +#endif +#endif +#include <libxml/encoding.h> +#include <libxml/xmlmemory.h> +#ifdef LIBXML_HTML_ENABLED +#include <libxml/HTMLparser.h> +#endif +#include <libxml/globals.h> +#include <libxml/xmlerror.h> + +static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; +static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; + +typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias; +typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr; +struct _xmlCharEncodingAlias { + const char *name; + const char *alias; +}; + +static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; +static int xmlCharEncodingAliasesNb = 0; +static int xmlCharEncodingAliasesMax = 0; + +#ifdef LIBXML_ICONV_ENABLED +#if 0 +#define DEBUG_ENCODING /* Define this to get encoding traces */ +#endif +#endif + +static int xmlLittleEndian = 1; + +/************************************************************************ + * * + * Generic UTF8 handling routines * + * * + * From rfc2044: encoding of the Unicode values on UTF-8: * + * * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) * + * 0000 0000-0000 007F 0xxxxxxx * + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * + * * + * I hope we won't use values > 0xFFFF anytime soon ! * + * * + ************************************************************************/ + +/** + * xmlUTF8Strlen: + * @utf: a sequence of UTF-8 encoded bytes + * + * compute the length of an UTF8 string, it doesn't do a full UTF8 + * checking of the content of the string. + * + * Returns the number of characters in the string or -1 in case of error + */ +int +xmlUTF8Strlen(const xmlChar *utf) { + int ret = 0; + + if (utf == NULL) + return(-1); + + while (*utf != 0) { + if (utf[0] & 0x80) { + if ((utf[1] & 0xc0) != 0x80) + return(-1); + if ((utf[0] & 0xe0) == 0xe0) { + if ((utf[2] & 0xc0) != 0x80) + return(-1); + if ((utf[0] & 0xf0) == 0xf0) { + if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) + return(-1); + utf += 4; + } else { + utf += 3; + } + } else { + utf += 2; + } + } else { + utf++; + } + ret++; + } + return(ret); +} + +/** + * xmlGetUTF8Char: + * @utf: a sequence of UTF-8 encoded bytes + * @len: a pointer to @bytes len + * + * Read one UTF8 Char from @utf + * + * Returns the char value or -1 in case of error and update @len with the + * number of bytes used + */ +int +xmlGetUTF8Char(const unsigned char *utf, int *len) { + unsigned int c; + + if (utf == NULL) + goto error; + if (len == NULL) + goto error; + if (*len < 1) + goto error; + + c = utf[0]; + if (c & 0x80) { + if (*len < 2) + goto error; + if ((utf[1] & 0xc0) != 0x80) + goto error; + if ((c & 0xe0) == 0xe0) { + if (*len < 3) + goto error; + if ((utf[2] & 0xc0) != 0x80) + goto error; + if ((c & 0xf0) == 0xf0) { + if (*len < 4) + goto error; + if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) + goto error; + *len = 4; + /* 4-byte code */ + c = (utf[0] & 0x7) << 18; + c |= (utf[1] & 0x3f) << 12; + c |= (utf[2] & 0x3f) << 6; + c |= utf[3] & 0x3f; + } else { + /* 3-byte code */ + *len = 3; + c = (utf[0] & 0xf) << 12; + c |= (utf[1] & 0x3f) << 6; + c |= utf[2] & 0x3f; + } + } else { + /* 2-byte code */ + *len = 2; + c = (utf[0] & 0x1f) << 6; + c |= utf[1] & 0x3f; + } + } else { + /* 1-byte code */ + *len = 1; + } + return(c); + +error: + *len = 0; + return(-1); +} + +/** + * xmlCheckUTF8: + * @utf: Pointer to putative utf-8 encoded string. + * + * Checks @utf for being valid utf-8. @utf is assumed to be + * null-terminated. This function is not super-strict, as it will + * allow longer utf-8 sequences than necessary. Note that Java is + * capable of producing these sequences if provoked. Also note, this + * routine checks for the 4-byte maximum size, but does not check for + * 0x10ffff maximum value. + * + * Return value: true if @utf is valid. + **/ +int +xmlCheckUTF8(const unsigned char *utf) +{ + int ix; + unsigned char c; + + for (ix = 0; (c = utf[ix]);) { + if (c & 0x80) { + if ((utf[ix + 1] & 0xc0) != 0x80) + return(0); + if ((c & 0xe0) == 0xe0) { + if ((utf[ix + 2] & 0xc0) != 0x80) + return(0); + if ((c & 0xf0) == 0xf0) { + if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80) + return(0); + ix += 4; + /* 4-byte code */ + } else + /* 3-byte code */ + ix += 3; + } else + /* 2-byte code */ + ix += 2; + } else + /* 1-byte code */ + ix++; + } + return(1); +} + +/** + * xmlUTF8Strsize: + * @utf: a sequence of UTF-8 encoded bytes + * @len: the number of characters in the array + * + * storage size of an UTF8 string + * + * Returns the storage size of + * the first 'len' characters of ARRAY + * + */ + +int +xmlUTF8Strsize(const xmlChar *utf, int len) { + const xmlChar *ptr=utf; + xmlChar ch; + + if (len <= 0) + return(0); + + while ( len-- > 0) { + if ( !*ptr ) + break; + if ( (ch = *ptr++) & 0x80) + while ( (ch<<=1) & 0x80 ) + ptr++; + } + return (ptr - utf); +} + + +/** + * xmlUTF8Strndup: + * @utf: the input UTF8 * + * @len: the len of @utf (in chars) + * + * a strndup for array of UTF8's + * + * Returns a new UTF8 * or NULL + */ +xmlChar * +xmlUTF8Strndup(const xmlChar *utf, int len) { + xmlChar *ret; + int i; + + if ((utf == NULL) || (len < 0)) return(NULL); + i = xmlUTF8Strsize(utf, len); + ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar)); + if (ret == NULL) { + xmlGenericError(xmlGenericErrorContext, + "malloc of %ld byte failed\n", + (len + 1) * (long)sizeof(xmlChar)); + return(NULL); + } + memcpy(ret, utf, i * sizeof(xmlChar)); + ret[i] = 0; + return(ret); +} + +/** + * xmlUTF8Strpos: + * @utf: the input UTF8 * + * @pos: the position of the desired UTF8 char (in chars) + * + * a function to provide the equivalent of fetching a + * character from a string array + * + * Returns a pointer to the UTF8 character or NULL + */ +xmlChar * +xmlUTF8Strpos(const xmlChar *utf, int pos) { + xmlChar ch; + + if (utf == NULL) return(NULL); + if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) ) + return(NULL); + while (pos--) { + if ((ch=*utf++) == 0) return(NULL); + if ( ch & 0x80 ) { + /* if not simple ascii, verify proper format */ + if ( (ch & 0xc0) != 0xc0 ) + return(NULL); + /* then skip over remaining bytes for this char */ + while ( (ch <<= 1) & 0x80 ) + if ( (*utf++ & 0xc0) != 0x80 ) + return(NULL); + } + } + return((xmlChar *)utf); +} + +/** + * xmlUTF8Strloc: + * @utf: the input UTF8 * + * @utfchar: the UTF8 character to be found + * + * a function to provide relative location of a UTF8 char + * + * Returns the relative character position of the desired char + * or -1 if not found + */ +int +xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { + int i, size; + xmlChar ch; + + if (utf==NULL || utfchar==NULL) return -1; + size = xmlUTF8Strsize(utfchar, 1); + for(i=0; (ch=*utf) != 0; i++) { + if (xmlStrncmp(utf, utfchar, size)==0) + return(i); + utf++; + if ( ch & 0x80 ) { + /* if not simple ascii, verify proper format */ + if ( (ch & 0xc0) != 0xc0 ) + return(-1); + /* then skip over remaining bytes for this char */ + while ( (ch <<= 1) & 0x80 ) + if ( (*utf++ & 0xc0) != 0x80 ) + return(-1); + } + } + + return(-1); +} +/** + * xmlUTF8Strsub: + * @utf: a sequence of UTF-8 encoded bytes + * @start: relative pos of first char + * @len: total number to copy + * + * Note: positions are given in units of UTF-8 chars + * + * Returns a pointer to a newly created string + * or NULL if any problem + */ + +xmlChar * +xmlUTF8Strsub(const xmlChar *utf, int start, int len) { + int i; + xmlChar ch; + + if (utf == NULL) return(NULL); + if (start < 0) return(NULL); + if (len < 0) return(NULL); + + /* + * Skip over any leading chars + */ + for (i = 0;i < start;i++) { + if ((ch=*utf++) == 0) return(NULL); + if ( ch & 0x80 ) { + /* if not simple ascii, verify proper format */ + if ( (ch & 0xc0) != 0xc0 ) + return(NULL); + /* then skip over remaining bytes for this char */ + while ( (ch <<= 1) & 0x80 ) + if ( (*utf++ & 0xc0) != 0x80 ) + return(NULL); + } + } + + return(xmlUTF8Strndup(utf, len)); +} + +/************************************************************************ + * * + * Conversions To/From UTF8 encoding * + * * + ************************************************************************/ + +/** + * asciiToUTF8: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ASCII chars + * @inlen: the length of @in + * + * Take a block of ASCII chars in and try to convert it to an UTF-8 + * block of chars out. + * Returns 0 if success, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +static int +asciiToUTF8(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + unsigned char* outstart = out; + const unsigned char* base = in; + const unsigned char* processed = in; + unsigned char* outend = out + *outlen; + const unsigned char* inend; + unsigned int c; + int bits; + + inend = in + (*inlen); + while ((in < inend) && (out - outstart + 5 < *outlen)) { + c= *in++; + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else { + *outlen = out - outstart; + *inlen = processed - base; + return(-1); + } + + for ( ; bits >= 0; bits-= 6) { + if (out >= outend) + break; + *out++= ((c >> bits) & 0x3F) | 0x80; + } + processed = (const unsigned char*) in; + } + *outlen = out - outstart; + *inlen = processed - base; + return(0); +} + +/** + * UTF8Toascii: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an ASCII + * block of chars out. + * + * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +static int +UTF8Toascii(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + const unsigned char* processed = in; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); + while (in < inend) { + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c < 0x80) { + if (out >= outend) + break; + *out++ = c; + } else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + processed = in; + } + *outlen = out - outstart; + *inlen = processed - instart; + return(0); +} + +/** + * isolat1ToUTF8: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ISO Latin 1 chars + * @inlen: the length of @in + * + * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 + * block of chars out. + * Returns 0 if success, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +int +isolat1ToUTF8(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + unsigned char* outstart = out; + const unsigned char* base = in; + unsigned char* outend = out + *outlen; + const unsigned char* inend; + const unsigned char* instop; + xmlChar c = *in; + + inend = in + (*inlen); + instop = inend; + + while (in < inend && out < outend - 1) { + if (c >= 0x80) { + *out++= ((c >> 6) & 0x1F) | 0xC0; + *out++= (c & 0x3F) | 0x80; + ++in; + c = *in; + } + if (instop - in > outend - out) instop = in + (outend - out); + while (c < 0x80 && in < instop) { + *out++ = c; + ++in; + c = *in; + } + } + if (in < inend && out < outend && c < 0x80) { + *out++ = c; + ++in; + } + *outlen = out - outstart; + *inlen = in - base; + return(0); +} + + +/** + * UTF8Toisolat1: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 + * block of chars out. + * + * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +int +UTF8Toisolat1(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen) { + const unsigned char* processed = in; + const unsigned char* outend; + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend; + unsigned int c, d; + int trailing; + + if (in == NULL) { + /* + * initialization nothing to do + */ + *outlen = 0; + *inlen = 0; + return(0); + } + inend = in + (*inlen); + outend = out + (*outlen); + while (in < inend) { + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in IsoLat1 */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if (in >= inend) + break; + if (((d= *in++) & 0xC0) != 0x80) { + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c <= 0xFF) { + if (out >= outend) + break; + *out++ = c; + } else { + /* no chance for this in IsoLat1 */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + processed = in; + } + *outlen = out - outstart; + *inlen = processed - instart; + return(0); +} + +/** + * UTF16LEToUTF8: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @inb: a pointer to an array of UTF-16LE passwd as a byte array + * @inlenb: the length of @in in UTF-16LE chars + * + * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8 + * block of chars out. This function assume the endian property + * is the same between the native type of this machine and the + * inputed one. + * + * Returns the number of byte written, or -1 by lack of space, or -2 + * if the transcoding fails (for *in is not valid utf16 string) + * The value of *inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + */ +static int +UTF16LEToUTF8(unsigned char* out, int *outlen, + const unsigned char* inb, int *inlenb) +{ + unsigned char* outstart = out; + const unsigned char* processed = inb; + unsigned char* outend = out + *outlen; + unsigned short* in = (unsigned short*) inb; + unsigned short* inend; + unsigned int c, d, inlen; + unsigned char *tmp; + int bits; + + if ((*inlenb % 2) == 1) + (*inlenb)--; + inlen = *inlenb / 2; + inend = in + inlen; + while ((in < inend) && (out - outstart + 5 < *outlen)) { + if (xmlLittleEndian) { + c= *in++; + } else { + tmp = (unsigned char *) in; + c = *tmp++; + c = c | (((unsigned int)*tmp) << 8); + in++; + } + if ((c & 0xFC00) == 0xD800) { /* surrogates */ + if (in >= inend) { /* (in > inend) shouldn't happens */ + break; + } + if (xmlLittleEndian) { + d = *in++; + } else { + tmp = (unsigned char *) in; + d = *tmp++; + d = d | (((unsigned int)*tmp) << 8); + in++; + } + if ((d & 0xFC00) == 0xDC00) { + c &= 0x03FF; + c <<= 10; + c |= d & 0x03FF; + c += 0x10000; + } + else { + *outlen = out - outstart; + *inlenb = processed - inb; + return(-2); + } + } + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + if (out >= outend) + break; + *out++= ((c >> bits) & 0x3F) | 0x80; + } + processed = (const unsigned char*) in; + } + *outlen = out - outstart; + *inlenb = processed - inb; + return(0); +} + +/** + * UTF8ToUTF16LE: + * @outb: a pointer to an array of bytes to store the result + * @outlen: the length of @outb + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE + * block of chars out. + * + * Returns the number of byte written, or -1 by lack of space, or -2 + * if the transcoding failed. + */ +static int +UTF8ToUTF16LE(unsigned char* outb, int *outlen, + const unsigned char* in, int *inlen) +{ + unsigned short* out = (unsigned short*) outb; + const unsigned char* processed = in; + unsigned short* outstart= out; + unsigned short* outend; + const unsigned char* inend= in+*inlen; + unsigned int c, d; + int trailing; + unsigned char *tmp; + unsigned short tmp1, tmp2; + + if (in == NULL) { + /* + * initialization, add the Byte Order Mark + */ + if (*outlen >= 2) { + outb[0] = 0xFF; + outb[1] = 0xFE; + *outlen = 2; + *inlen = 0; +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Added FFFE Byte Order Mark\n"); +#endif + return(2); + } + *outlen = 0; + *inlen = 0; + return(0); + } + outend = out + (*outlen / 2); + while (in < inend) { + d= *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = (out - outstart) * 2; + *inlen = processed - in; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in UTF-16 */ + *outlen = (out - outstart) * 2; + *inlen = processed - in; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) + break; + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c < 0x10000) { + if (out >= outend) + break; + if (xmlLittleEndian) { + *out++ = c; + } else { + tmp = (unsigned char *) out; + *tmp = c ; + *(tmp + 1) = c >> 8 ; + out++; + } + } + else if (c < 0x110000) { + if (out+1 >= outend) + break; + c -= 0x10000; + if (xmlLittleEndian) { + *out++ = 0xD800 | (c >> 10); + *out++ = 0xDC00 | (c & 0x03FF); + } else { + tmp1 = 0xD800 | (c >> 10); + tmp = (unsigned char *) out; + *tmp = (unsigned char) tmp1; + *(tmp + 1) = tmp1 >> 8; + out++; + + tmp2 = 0xDC00 | (c & 0x03FF); + tmp = (unsigned char *) out; + *tmp = (unsigned char) tmp2; + *(tmp + 1) = tmp2 >> 8; + out++; + } + } + else + break; + processed = in; + } + *outlen = (out - outstart) * 2; + *inlen = processed - in; + return(0); +} + +/** + * UTF16BEToUTF8: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @inb: a pointer to an array of UTF-16 passwd as a byte array + * @inlenb: the length of @in in UTF-16 chars + * + * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8 + * block of chars out. This function assume the endian property + * is the same between the native type of this machine and the + * inputed one. + * + * Returns the number of byte written, or -1 by lack of space, or -2 + * if the transcoding fails (for *in is not valid utf16 string) + * The value of *inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + */ +static int +UTF16BEToUTF8(unsigned char* out, int *outlen, + const unsigned char* inb, int *inlenb) +{ + unsigned char* outstart = out; + const unsigned char* processed = inb; + unsigned char* outend = out + *outlen; + unsigned short* in = (unsigned short*) inb; + unsigned short* inend; + unsigned int c, d, inlen; + unsigned char *tmp; + int bits; + + if ((*inlenb % 2) == 1) + (*inlenb)--; + inlen = *inlenb / 2; + inend= in + inlen; + while (in < inend) { + if (xmlLittleEndian) { + tmp = (unsigned char *) in; + c = *tmp++; + c = c << 8; + c = c | (unsigned int) *tmp; + in++; + } else { + c= *in++; + } + if ((c & 0xFC00) == 0xD800) { /* surrogates */ + if (in >= inend) { /* (in > inend) shouldn't happens */ + *outlen = out - outstart; + *inlenb = processed - inb; + return(-2); + } + if (xmlLittleEndian) { + tmp = (unsigned char *) in; + d = *tmp++; + d = d << 8; + d = d | (unsigned int) *tmp; + in++; + } else { + d= *in++; + } + if ((d & 0xFC00) == 0xDC00) { + c &= 0x03FF; + c <<= 10; + c |= d & 0x03FF; + c += 0x10000; + } + else { + *outlen = out - outstart; + *inlenb = processed - inb; + return(-2); + } + } + + /* assertion: c is a single UTF-4 value */ + if (out >= outend) + break; + if (c < 0x80) { *out++= c; bits= -6; } + else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } + else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } + else { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; } + + for ( ; bits >= 0; bits-= 6) { + if (out >= outend) + break; + *out++= ((c >> bits) & 0x3F) | 0x80; + } + processed = (const unsigned char*) in; + } + *outlen = out - outstart; + *inlenb = processed - inb; + return(0); +} + +/** + * UTF8ToUTF16BE: + * @outb: a pointer to an array of bytes to store the result + * @outlen: the length of @outb + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * + * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE + * block of chars out. + * + * Returns the number of byte written, or -1 by lack of space, or -2 + * if the transcoding failed. + */ +static int +UTF8ToUTF16BE(unsigned char* outb, int *outlen, + const unsigned char* in, int *inlen) +{ + unsigned short* out = (unsigned short*) outb; + const unsigned char* processed = in; + unsigned short* outstart= out; + unsigned short* outend; + const unsigned char* inend= in+*inlen; + unsigned int c, d; + int trailing; + unsigned char *tmp; + unsigned short tmp1, tmp2; + + if (in == NULL) { + /* + * initialization, add the Byte Order Mark + */ + if (*outlen >= 2) { + outb[0] = 0xFE; + outb[1] = 0xFF; + *outlen = 2; + *inlen = 0; +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Added FEFF Byte Order Mark\n"); +#endif + return(2); + } + *outlen = 0; + *inlen = 0; + return(0); + } + outend = out + (*outlen / 2); + while (in < inend) { + d= *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - in; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in UTF-16 */ + *outlen = out - outstart; + *inlen = processed - in; + return(-2); + } + + if (inend - in < trailing) { + break; + } + + for ( ; trailing; trailing--) { + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break; + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UTF-4 value */ + if (c < 0x10000) { + if (out >= outend) break; + if (xmlLittleEndian) { + tmp = (unsigned char *) out; + *tmp = c >> 8; + *(tmp + 1) = c; + out++; + } else { + *out++ = c; + } + } + else if (c < 0x110000) { + if (out+1 >= outend) break; + c -= 0x10000; + if (xmlLittleEndian) { + tmp1 = 0xD800 | (c >> 10); + tmp = (unsigned char *) out; + *tmp = tmp1 >> 8; + *(tmp + 1) = (unsigned char) tmp1; + out++; + + tmp2 = 0xDC00 | (c & 0x03FF); + tmp = (unsigned char *) out; + *tmp = tmp2 >> 8; + *(tmp + 1) = (unsigned char) tmp2; + out++; + } else { + *out++ = 0xD800 | (c >> 10); + *out++ = 0xDC00 | (c & 0x03FF); + } + } + else + break; + processed = in; + } + *outlen = (out - outstart) * 2; + *inlen = processed - in; + return(0); +} + +/************************************************************************ + * * + * Generic encoding handling routines * + * * + ************************************************************************/ + +/** + * xmlDetectCharEncoding: + * @in: a pointer to the first bytes of the XML entity, must be at least + * 4 bytes long. + * @len: pointer to the length of the buffer + * + * Guess the encoding of the entity using the first bytes of the entity content + * accordingly of the non-normative appendix F of the XML-1.0 recommendation. + * + * Returns one of the XML_CHAR_ENCODING_... values. + */ +xmlCharEncoding +xmlDetectCharEncoding(const unsigned char* in, int len) +{ + if (len >= 4) { + if ((in[0] == 0x00) && (in[1] == 0x00) && + (in[2] == 0x00) && (in[3] == 0x3C)) + return(XML_CHAR_ENCODING_UCS4BE); + if ((in[0] == 0x3C) && (in[1] == 0x00) && + (in[2] == 0x00) && (in[3] == 0x00)) + return(XML_CHAR_ENCODING_UCS4LE); + if ((in[0] == 0x00) && (in[1] == 0x00) && + (in[2] == 0x3C) && (in[3] == 0x00)) + return(XML_CHAR_ENCODING_UCS4_2143); + if ((in[0] == 0x00) && (in[1] == 0x3C) && + (in[2] == 0x00) && (in[3] == 0x00)) + return(XML_CHAR_ENCODING_UCS4_3412); + if ((in[0] == 0x4C) && (in[1] == 0x6F) && + (in[2] == 0xA7) && (in[3] == 0x94)) + return(XML_CHAR_ENCODING_EBCDIC); + if ((in[0] == 0x3C) && (in[1] == 0x3F) && + (in[2] == 0x78) && (in[3] == 0x6D)) + return(XML_CHAR_ENCODING_UTF8); + } + if (len >= 3) { + /* + * Errata on XML-1.0 June 20 2001 + * We now allow an UTF8 encoded BOM + */ + if ((in[0] == 0xEF) && (in[1] == 0xBB) && + (in[2] == 0xBF)) + return(XML_CHAR_ENCODING_UTF8); + } + if (len >= 2) { + if ((in[0] == 0xFE) && (in[1] == 0xFF)) + return(XML_CHAR_ENCODING_UTF16BE); + if ((in[0] == 0xFF) && (in[1] == 0xFE)) + return(XML_CHAR_ENCODING_UTF16LE); + } + return(XML_CHAR_ENCODING_NONE); +} + +/** + * xmlCleanupEncodingAliases: + * + * Unregisters all aliases + */ +void +xmlCleanupEncodingAliases(void) { + int i; + + if (xmlCharEncodingAliases == NULL) + return; + + for (i = 0;i < xmlCharEncodingAliasesNb;i++) { + if (xmlCharEncodingAliases[i].name != NULL) + xmlFree((char *) xmlCharEncodingAliases[i].name); + if (xmlCharEncodingAliases[i].alias != NULL) + xmlFree((char *) xmlCharEncodingAliases[i].alias); + } + xmlCharEncodingAliasesNb = 0; + xmlCharEncodingAliasesMax = 0; + xmlFree(xmlCharEncodingAliases); + xmlCharEncodingAliases = NULL; +} + +/** + * xmlGetEncodingAlias: + * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) + * + * Lookup an encoding name for the given alias. + * + * Returns NULL if not found the original name otherwise + */ +const char * +xmlGetEncodingAlias(const char *alias) { + int i; + char upper[100]; + + if (alias == NULL) + return(NULL); + + if (xmlCharEncodingAliases == NULL) + return(NULL); + + for (i = 0;i < 99;i++) { + upper[i] = toupper(alias[i]); + if (upper[i] == 0) break; + } + upper[i] = 0; + + /* + * Walk down the list looking for a definition of the alias + */ + for (i = 0;i < xmlCharEncodingAliasesNb;i++) { + if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { + return(xmlCharEncodingAliases[i].name); + } + } + return(NULL); +} + +/** + * xmlAddEncodingAlias: + * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) + * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) + * + * Registers and alias @alias for an encoding named @name. Existing alias + * will be overwritten. + * + * Returns 0 in case of success, -1 in case of error + */ +int +xmlAddEncodingAlias(const char *name, const char *alias) { + int i; + char upper[100]; + + if ((name == NULL) || (alias == NULL)) + return(-1); + + for (i = 0;i < 99;i++) { + upper[i] = toupper(alias[i]); + if (upper[i] == 0) break; + } + upper[i] = 0; + + if (xmlCharEncodingAliases == NULL) { + xmlCharEncodingAliasesNb = 0; + xmlCharEncodingAliasesMax = 20; + xmlCharEncodingAliases = (xmlCharEncodingAliasPtr) + xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias)); + if (xmlCharEncodingAliases == NULL) + return(-1); + } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) { + xmlCharEncodingAliasesMax *= 2; + xmlCharEncodingAliases = (xmlCharEncodingAliasPtr) + xmlRealloc(xmlCharEncodingAliases, + xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias)); + } + /* + * Walk down the list looking for a definition of the alias + */ + for (i = 0;i < xmlCharEncodingAliasesNb;i++) { + if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { + /* + * Replace the definition. + */ + xmlFree((char *) xmlCharEncodingAliases[i].name); + xmlCharEncodingAliases[i].name = xmlMemStrdup(name); + return(0); + } + } + /* + * Add the definition + */ + xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name); + xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper); + xmlCharEncodingAliasesNb++; + return(0); +} + +/** + * xmlDelEncodingAlias: + * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) + * + * Unregisters an encoding alias @alias + * + * Returns 0 in case of success, -1 in case of error + */ +int +xmlDelEncodingAlias(const char *alias) { + int i; + + if (alias == NULL) + return(-1); + + if (xmlCharEncodingAliases == NULL) + return(-1); + /* + * Walk down the list looking for a definition of the alias + */ + for (i = 0;i < xmlCharEncodingAliasesNb;i++) { + if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) { + xmlFree((char *) xmlCharEncodingAliases[i].name); + xmlFree((char *) xmlCharEncodingAliases[i].alias); + xmlCharEncodingAliasesNb--; + memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1], + sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i)); + return(0); + } + } + return(-1); +} + +/** + * xmlParseCharEncoding: + * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) + * + * Compare the string to the known encoding schemes already known. Note + * that the comparison is case insensitive accordingly to the section + * [XML] 4.3.3 Character Encoding in Entities. + * + * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE + * if not recognized. + */ +xmlCharEncoding +xmlParseCharEncoding(const char* name) +{ + const char *alias; + char upper[500]; + int i; + + if (name == NULL) + return(XML_CHAR_ENCODING_NONE); + + /* + * Do the alias resolution + */ + alias = xmlGetEncodingAlias(name); + if (alias != NULL) + name = alias; + + for (i = 0;i < 499;i++) { + upper[i] = toupper(name[i]); + if (upper[i] == 0) break; + } + upper[i] = 0; + + if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE); + if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8); + if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8); + + /* + * NOTE: if we were able to parse this, the endianness of UTF16 is + * already found and in use + */ + if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE); + if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE); + + if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2); + if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2); + if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2); + + /* + * NOTE: if we were able to parse this, the endianness of UCS4 is + * already found and in use + */ + if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE); + if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE); + if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE); + + + if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1); + if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1); + if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1); + + if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2); + if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2); + if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2); + + if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3); + if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4); + if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5); + if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6); + if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7); + if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8); + if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9); + + if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP); + if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS); + if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP); + +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name); +#endif + return(XML_CHAR_ENCODING_ERROR); +} + +/** + * xmlGetCharEncodingName: + * @enc: the encoding + * + * The "canonical" name for XML encoding. + * C.f. http://www.w3.org/TR/REC-xml#charencoding + * Section 4.3.3 Character Encoding in Entities + * + * Returns the canonical name for the given encoding + */ + +const char* +xmlGetCharEncodingName(xmlCharEncoding enc) { + switch (enc) { + case XML_CHAR_ENCODING_ERROR: + return(NULL); + case XML_CHAR_ENCODING_NONE: + return(NULL); + case XML_CHAR_ENCODING_UTF8: + return("UTF-8"); + case XML_CHAR_ENCODING_UTF16LE: + return("UTF-16"); + case XML_CHAR_ENCODING_UTF16BE: + return("UTF-16"); + case XML_CHAR_ENCODING_EBCDIC: + return("EBCDIC"); + case XML_CHAR_ENCODING_UCS4LE: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4BE: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4_2143: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS4_3412: + return("ISO-10646-UCS-4"); + case XML_CHAR_ENCODING_UCS2: + return("ISO-10646-UCS-2"); + case XML_CHAR_ENCODING_8859_1: + return("ISO-8859-1"); + case XML_CHAR_ENCODING_8859_2: + return("ISO-8859-2"); + case XML_CHAR_ENCODING_8859_3: + return("ISO-8859-3"); + case XML_CHAR_ENCODING_8859_4: + return("ISO-8859-4"); + case XML_CHAR_ENCODING_8859_5: + return("ISO-8859-5"); + case XML_CHAR_ENCODING_8859_6: + return("ISO-8859-6"); + case XML_CHAR_ENCODING_8859_7: + return("ISO-8859-7"); + case XML_CHAR_ENCODING_8859_8: + return("ISO-8859-8"); + case XML_CHAR_ENCODING_8859_9: + return("ISO-8859-9"); + case XML_CHAR_ENCODING_2022_JP: + return("ISO-2022-JP"); + case XML_CHAR_ENCODING_SHIFT_JIS: + return("Shift-JIS"); + case XML_CHAR_ENCODING_EUC_JP: + return("EUC-JP"); + case XML_CHAR_ENCODING_ASCII: + return(NULL); + } + return(NULL); +} + +/************************************************************************ + * * + * Char encoding handlers * + * * + ************************************************************************/ + + +/* the size should be growable, but it's not a big deal ... */ +#define MAX_ENCODING_HANDLERS 50 +static xmlCharEncodingHandlerPtr *handlers = NULL; +static int nbCharEncodingHandler = 0; + +/* + * The default is UTF-8 for XML, that's also the default used for the + * parser internals, so the default encoding handler is NULL + */ + +static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL; + +/** + * xmlNewCharEncodingHandler: + * @name: the encoding name, in UTF-8 format (ASCII actually) + * @input: the xmlCharEncodingInputFunc to read that encoding + * @output: the xmlCharEncodingOutputFunc to write that encoding + * + * Create and registers an xmlCharEncodingHandler. + * + * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error). + */ +xmlCharEncodingHandlerPtr +xmlNewCharEncodingHandler(const char *name, + xmlCharEncodingInputFunc input, + xmlCharEncodingOutputFunc output) { + xmlCharEncodingHandlerPtr handler; + const char *alias; + char upper[500]; + int i; + char *up = 0; + + /* + * Do the alias resolution + */ + alias = xmlGetEncodingAlias(name); + if (alias != NULL) + name = alias; + + /* + * Keep only the uppercase version of the encoding. + */ + if (name == NULL) { + xmlGenericError(xmlGenericErrorContext, + "xmlNewCharEncodingHandler : no name !\n"); + return(NULL); + } + for (i = 0;i < 499;i++) { + upper[i] = toupper(name[i]); + if (upper[i] == 0) break; + } + upper[i] = 0; + up = xmlMemStrdup(upper); + if (up == NULL) { + xmlGenericError(xmlGenericErrorContext, + "xmlNewCharEncodingHandler : out of memory !\n"); + return(NULL); + } + + /* + * allocate and fill-up an handler block. + */ + handler = (xmlCharEncodingHandlerPtr) + xmlMalloc(sizeof(xmlCharEncodingHandler)); + if (handler == NULL) { + xmlGenericError(xmlGenericErrorContext, + "xmlNewCharEncodingHandler : out of memory !\n"); + return(NULL); + } + handler->input = input; + handler->output = output; + handler->name = up; + +#ifdef LIBXML_ICONV_ENABLED + handler->iconv_in = NULL; + handler->iconv_out = NULL; +#endif /* LIBXML_ICONV_ENABLED */ + + /* + * registers and returns the handler. + */ + xmlRegisterCharEncodingHandler(handler); +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Registered encoding handler for %s\n", name); +#endif + return(handler); +} + +/** + * xmlInitCharEncodingHandlers: + * + * Initialize the char encoding support, it registers the default + * encoding supported. + * NOTE: while public, this function usually doesn't need to be called + * in normal processing. + */ +void +xmlInitCharEncodingHandlers(void) { + unsigned short int tst = 0x1234; + unsigned char *ptr = (unsigned char *) &tst; + + if (handlers != NULL) return; + + handlers = (xmlCharEncodingHandlerPtr *) + xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr)); + + if (*ptr == 0x12) xmlLittleEndian = 0; + else if (*ptr == 0x34) xmlLittleEndian = 1; + else xmlGenericError(xmlGenericErrorContext, + "Odd problem at endianness detection\n"); + + if (handlers == NULL) { + xmlGenericError(xmlGenericErrorContext, + "xmlInitCharEncodingHandlers : out of memory !\n"); + return; + } + xmlNewCharEncodingHandler("UTF-8", NULL, NULL); + xmlUTF16LEHandler = + xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE); + xmlUTF16BEHandler = + xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE); + xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1); + xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii); + xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii); +#ifdef LIBXML_HTML_ENABLED + xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml); +#endif +} + +/** + * xmlCleanupCharEncodingHandlers: + * + * Cleanup the memory allocated for the char encoding support, it + * unregisters all the encoding handlers and the aliases. + */ +void +xmlCleanupCharEncodingHandlers(void) { + xmlCleanupEncodingAliases(); + + if (handlers == NULL) return; + + for (;nbCharEncodingHandler > 0;) { + nbCharEncodingHandler--; + if (handlers[nbCharEncodingHandler] != NULL) { + if (handlers[nbCharEncodingHandler]->name != NULL) + xmlFree(handlers[nbCharEncodingHandler]->name); + xmlFree(handlers[nbCharEncodingHandler]); + } + } + xmlFree(handlers); + handlers = NULL; + nbCharEncodingHandler = 0; + xmlDefaultCharEncodingHandler = NULL; +} + +/** + * xmlRegisterCharEncodingHandler: + * @handler: the xmlCharEncodingHandlerPtr handler block + * + * Register the char encoding handler, surprising, isn't it ? + */ +void +xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { + if (handlers == NULL) xmlInitCharEncodingHandlers(); + if (handler == NULL) { + xmlGenericError(xmlGenericErrorContext, + "xmlRegisterCharEncodingHandler: NULL handler !\n"); + return; + } + + if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { + xmlGenericError(xmlGenericErrorContext, + "xmlRegisterCharEncodingHandler: Too many handler registered\n"); + xmlGenericError(xmlGenericErrorContext, + "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__); + return; + } + handlers[nbCharEncodingHandler++] = handler; +} + +/** + * xmlGetCharEncodingHandler: + * @enc: an xmlCharEncoding value. + * + * Search in the registered set the handler able to read/write that encoding. + * + * Returns the handler or NULL if not found + */ +xmlCharEncodingHandlerPtr +xmlGetCharEncodingHandler(xmlCharEncoding enc) { + xmlCharEncodingHandlerPtr handler; + + if (handlers == NULL) xmlInitCharEncodingHandlers(); + switch (enc) { + case XML_CHAR_ENCODING_ERROR: + return(NULL); + case XML_CHAR_ENCODING_NONE: + return(NULL); + case XML_CHAR_ENCODING_UTF8: + return(NULL); + case XML_CHAR_ENCODING_UTF16LE: + return(xmlUTF16LEHandler); + case XML_CHAR_ENCODING_UTF16BE: + return(xmlUTF16BEHandler); + case XML_CHAR_ENCODING_EBCDIC: + handler = xmlFindCharEncodingHandler("EBCDIC"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("ebcdic"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_UCS4BE: + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS4"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_UCS4LE: + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS4"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_UCS4_2143: + break; + case XML_CHAR_ENCODING_UCS4_3412: + break; + case XML_CHAR_ENCODING_UCS2: + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-2"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS2"); + if (handler != NULL) return(handler); + break; + + /* + * We used to keep ISO Latin encodings native in the + * generated data. This led to so many problems that + * this has been removed. One can still change this + * back by registering no-ops encoders for those + */ + case XML_CHAR_ENCODING_8859_1: + handler = xmlFindCharEncodingHandler("ISO-8859-1"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_2: + handler = xmlFindCharEncodingHandler("ISO-8859-2"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_3: + handler = xmlFindCharEncodingHandler("ISO-8859-3"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_4: + handler = xmlFindCharEncodingHandler("ISO-8859-4"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_5: + handler = xmlFindCharEncodingHandler("ISO-8859-5"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_6: + handler = xmlFindCharEncodingHandler("ISO-8859-6"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_7: + handler = xmlFindCharEncodingHandler("ISO-8859-7"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_8: + handler = xmlFindCharEncodingHandler("ISO-8859-8"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_8859_9: + handler = xmlFindCharEncodingHandler("ISO-8859-9"); + if (handler != NULL) return(handler); + break; + + + case XML_CHAR_ENCODING_2022_JP: + handler = xmlFindCharEncodingHandler("ISO-2022-JP"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_SHIFT_JIS: + handler = xmlFindCharEncodingHandler("SHIFT-JIS"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("SHIFT_JIS"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("Shift_JIS"); + if (handler != NULL) return(handler); + break; + case XML_CHAR_ENCODING_EUC_JP: + handler = xmlFindCharEncodingHandler("EUC-JP"); + if (handler != NULL) return(handler); + break; + default: + break; + } + +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "No handler found for encoding %d\n", enc); +#endif + return(NULL); +} + +/** + * xmlFindCharEncodingHandler: + * @name: a string describing the char encoding. + * + * Search in the registered set the handler able to read/write that encoding. + * + * Returns the handler or NULL if not found + */ +xmlCharEncodingHandlerPtr +xmlFindCharEncodingHandler(const char *name) { + const char *nalias; + const char *norig; + xmlCharEncoding alias; +#ifdef LIBXML_ICONV_ENABLED + xmlCharEncodingHandlerPtr enc; + iconv_t icv_in, icv_out; +#endif /* LIBXML_ICONV_ENABLED */ + char upper[100]; + int i; + + if (handlers == NULL) xmlInitCharEncodingHandlers(); + if (name == NULL) return(xmlDefaultCharEncodingHandler); + if (name[0] == 0) return(xmlDefaultCharEncodingHandler); + + /* + * Do the alias resolution + */ + norig = name; + nalias = xmlGetEncodingAlias(name); + if (nalias != NULL) + name = nalias; + + /* + * Check first for directly registered encoding names + */ + for (i = 0;i < 99;i++) { + upper[i] = toupper(name[i]); + if (upper[i] == 0) break; + } + upper[i] = 0; + + for (i = 0;i < nbCharEncodingHandler; i++) + if (!strcmp(upper, handlers[i]->name)) { +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Found registered handler for encoding %s\n", name); +#endif + return(handlers[i]); + } + +#ifdef LIBXML_ICONV_ENABLED + /* check whether iconv can handle this */ + icv_in = iconv_open("UTF-8", name); + icv_out = iconv_open(name, "UTF-8"); + if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) { + enc = (xmlCharEncodingHandlerPtr) + xmlMalloc(sizeof(xmlCharEncodingHandler)); + if (enc == NULL) { + iconv_close(icv_in); + iconv_close(icv_out); + return(NULL); + } + enc->name = xmlMemStrdup(name); + enc->input = NULL; + enc->output = NULL; + enc->iconv_in = icv_in; + enc->iconv_out = icv_out; +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Found iconv handler for encoding %s\n", name); +#endif + return enc; + } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) { + xmlGenericError(xmlGenericErrorContext, + "iconv : problems with filters for '%s'\n", name); + } +#endif /* LIBXML_ICONV_ENABLED */ + +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "No handler found for encoding %s\n", name); +#endif + + /* + * Fallback using the canonical names + */ + alias = xmlParseCharEncoding(norig); + if (alias != XML_CHAR_ENCODING_ERROR) { + const char* canon; + canon = xmlGetCharEncodingName(alias); + if ((canon != NULL) && (strcmp(name, canon))) { + return(xmlFindCharEncodingHandler(canon)); + } + } + + return(NULL); +} + +/************************************************************************ + * * + * ICONV based generic conversion functions * + * * + ************************************************************************/ + +#ifdef LIBXML_ICONV_ENABLED +/** + * xmlIconvWrapper: + * @cd: iconv converter data structure + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ISO Latin 1 chars + * @inlen: the length of @in + * + * Returns 0 if success, or + * -1 by lack of space, or + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + * -3 if there the last byte can't form a single output char. + * + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +static int +xmlIconvWrapper(iconv_t cd, + unsigned char *out, int *outlen, + const unsigned char *in, int *inlen) { + + size_t icv_inlen = *inlen, icv_outlen = *outlen; + const char *icv_in = (const char *) in; + char *icv_out = (char *) out; + int ret; + + ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen); + if (in != NULL) { + *inlen -= icv_inlen; + *outlen -= icv_outlen; + } else { + *inlen = 0; + *outlen = 0; + } + if ((icv_inlen != 0) || (ret == -1)) { +#ifdef EILSEQ + if (errno == EILSEQ) { + return -2; + } else +#endif +#ifdef E2BIG + if (errno == E2BIG) { + return -1; + } else +#endif +#ifdef EINVAL + if (errno == EINVAL) { + return -3; + } else +#endif + { + return -3; + } + } + return 0; +} +#endif /* LIBXML_ICONV_ENABLED */ + +/************************************************************************ + * * + * The real API used by libxml for on-the-fly conversion * + * * + ************************************************************************/ + +/** + * xmlCharEncFirstLine: + * @handler: char enconding transformation data structure + * @out: an xmlBuffer for the output. + * @in: an xmlBuffer for the input + * + * Front-end for the encoding handler input function, but handle only + * the very first line, i.e. limit itself to 45 chars. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in) { + int ret = -2; + int written; + int toconv; + + if (handler == NULL) return(-1); + if (out == NULL) return(-1); + if (in == NULL) return(-1); + + written = out->size - out->use; + toconv = in->use; + if (toconv * 2 >= written) { + xmlBufferGrow(out, toconv); + written = out->size - out->use - 1; + } + + /* + * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 + * 45 chars should be sufficient to reach the end of the encoding + * declaration without going too far inside the document content. + */ + written = 45; + + if (handler->input != NULL) { + ret = handler->input(&out->content[out->use], &written, + in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_in != NULL) { + ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + switch (ret) { + case 0: + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input\n", + toconv, written); + break; + case -1: + xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); + break; + case -2: + xmlGenericError(xmlGenericErrorContext, + "input conversion failed due to input error\n"); + break; + case -3: + xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); + break; + default: + xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret); + } +#endif /* DEBUG_ENCODING */ + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) ret = 0; + if (ret == -1) ret = 0; + return(ret); +} + +/** + * xmlCharEncInFunc: + * @handler: char encoding transformation data structure + * @out: an xmlBuffer for the output. + * @in: an xmlBuffer for the input + * + * Generic front-end for the encoding handler input function + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, + xmlBufferPtr in) +{ + int ret = -2; + int written; + int toconv; + + if (handler == NULL) + return (-1); + if (out == NULL) + return (-1); + if (in == NULL) + return (-1); + + toconv = in->use; + if (toconv == 0) + return (0); + written = out->size - out->use; + if (toconv * 2 >= written) { + xmlBufferGrow(out, out->size + toconv * 2); + written = out->size - out->use - 1; + } + if (handler->input != NULL) { + ret = handler->input(&out->content[out->use], &written, + in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_in != NULL) { + ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input\n", + toconv, written); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); +#endif + break; + case -2: + xmlGenericError(xmlGenericErrorContext, + "input conversion failed due to input error\n"); + xmlGenericError(xmlGenericErrorContext, + "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + in->content[0], in->content[1], + in->content[2], in->content[3]); + } + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) + ret = 0; + return (written); +} + +/** + * xmlCharEncOutFunc: + * @handler: char enconding transformation data structure + * @out: an xmlBuffer for the output. + * @in: an xmlBuffer for the input + * + * Generic front-end for the encoding handler output function + * a first call with @in == NULL has to be made firs to initiate the + * output in case of non-stateless encoding needing to initiate their + * state or the output (like the BOM in UTF16). + * In case of UTF8 sequence conversion errors for the given encoder, + * the content will be automatically remapped to a CharRef sequence. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in) { + int ret = -2; + int written; + int writtentot = 0; + int toconv; + int output = 0; + + if (handler == NULL) return(-1); + if (out == NULL) return(-1); + +retry: + + written = out->size - out->use; + + /* + * First specific handling of in = NULL, i.e. the initialization call + */ + if (in == NULL) { + toconv = 0; + if (handler->output != NULL) { + ret = handler->output(&out->content[out->use], &written, + NULL, &toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_out != NULL) { + ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], + &written, NULL, &toconv); + out->use += written; + out->content[out->use] = 0; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "initialized encoder\n"); +#endif + return(0); + } + + /* + * Conversion itself. + */ + toconv = in->use; + if (toconv == 0) + return(0); + if (toconv * 2 >= written) { + xmlBufferGrow(out, toconv * 2); + written = out->size - out->use - 1; + } + if (handler->output != NULL) { + ret = handler->output(&out->content[out->use], &written, + in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + writtentot += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_out != NULL) { + ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + writtentot += written; + out->content[out->use] = 0; + if (ret == -1) { + if (written > 0) { + /* + * Can be a limitation of iconv + */ + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICONV_ENABLED */ + else { + xmlGenericError(xmlGenericErrorContext, + "xmlCharEncOutFunc: no output function !\n"); + return(-1); + } + + if (ret >= 0) output += ret; + + /* + * Attempt to handle error cases + */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of output\n", + toconv, written); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "output conversion failed by lack of space\n"); +#endif + break; + case -3: + xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n", + toconv, written, in->use); + break; + case -2: { + int len = in->use; + const xmlChar *utf = (const xmlChar *) in->content; + int cur; + + cur = xmlGetUTF8Char(utf, &len); + if (cur > 0) { + xmlChar charref[20]; + +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "handling output conversion error\n"); + xmlGenericError(xmlGenericErrorContext, + "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + in->content[0], in->content[1], + in->content[2], in->content[3]); +#endif + /* + * Removes the UTF8 sequence, and replace it by a charref + * and continue the transcoding phase, hoping the error + * did not mangle the encoder state. + */ + snprintf((char *) charref, sizeof(charref), "&#%d;", cur); + xmlBufferShrink(in, len); + xmlBufferAddHead(in, charref, -1); + + goto retry; + } else { + xmlGenericError(xmlGenericErrorContext, + "output conversion failed due to conv error\n"); + xmlGenericError(xmlGenericErrorContext, + "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + in->content[0], in->content[1], + in->content[2], in->content[3]); + in->content[0] = ' '; + } + break; + } + } + return(ret); +} + +/** + * xmlCharEncCloseFunc: + * @handler: char enconding transformation data structure + * + * Generic front-end for encoding handler close function + * + * Returns 0 if success, or -1 in case of error + */ +int +xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { + int ret = 0; + if (handler == NULL) return(-1); + if (handler->name == NULL) return(-1); +#ifdef LIBXML_ICONV_ENABLED + /* + * Iconv handlers can be used only once, free the whole block. + * and the associated icon resources. + */ + if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) { + if (handler->name != NULL) + xmlFree(handler->name); + handler->name = NULL; + if (handler->iconv_out != NULL) { + if (iconv_close(handler->iconv_out)) + ret = -1; + handler->iconv_out = NULL; + } + if (handler->iconv_in != NULL) { + if (iconv_close(handler->iconv_in)) + ret = -1; + handler->iconv_in = NULL; + } + xmlFree(handler); + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + if (ret) + xmlGenericError(xmlGenericErrorContext, + "failed to close the encoding handler\n"); + else + xmlGenericError(xmlGenericErrorContext, + "closed the encoding handler\n"); +#endif + + return(ret); +} + |
