diff options
-rw-r--r-- | i18n/unix/i18n.h | 89 | ||||
-rw-r--r-- | i18n/unix/utf8_ucs2.c | 280 | ||||
-rw-r--r-- | include/arch/unix/i18n.h | 89 |
3 files changed, 458 insertions, 0 deletions
diff --git a/i18n/unix/i18n.h b/i18n/unix/i18n.h new file mode 100644 index 000000000..da5f58f2c --- /dev/null +++ b/i18n/unix/i18n.h @@ -0,0 +1,89 @@ +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2000 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" must + * not be used to endorse or promote products derived from this + * software without prior written permission. For written + * permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * nor may "Apache" appear in their name, without prior written + * permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +#ifndef I18N_H +#define I18N_H + +#include "apr.h" +#include "apr_xlate.h" + +/* If we ever support anything more exciting than char... this could move. + */ +typedef apr_int16_t apr_wchar_t; + +/** + * An APR internal function for fast utf-8 octet-encoded Unicode conversion + * to the ucs-2 wide Unicode format. This function is used for filename and + * other resource conversions for platforms providing native Unicode support. + * + * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former + * when the character code is invalid (in or out of context) and the later + * when more characters were expected, but insufficient characters remain. + */ +apr_status_t conv_utf8_to_ucs2(const char *in, apr_size_t *inbytes, + apr_wchar_t *out, apr_size_t *outwords); + +/** + * An APR internal function for fast ucs-2 wide Unicode format conversion to + * the utf-8 octet-encoded Unicode. This function is used for filename and + * other resource conversions for platforms providing native Unicode support. + * + * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former + * when the character code is invalid (in or out of context) and the later + * when more words were expected, but insufficient words remain. + */ +apr_status_t conv_ucs2_to_utf8(const apr_wchar_t *in, apr_size_t *inwords, + char *out, apr_size_t *outbytes); + +#endif /* def I18N_H */ diff --git a/i18n/unix/utf8_ucs2.c b/i18n/unix/utf8_ucs2.c new file mode 100644 index 000000000..a3dddf467 --- /dev/null +++ b/i18n/unix/utf8_ucs2.c @@ -0,0 +1,280 @@ +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2000 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" must + * not be used to endorse or promote products derived from this + * software without prior written permission. For written + * permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * nor may "Apache" appear in their name, without prior written + * permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +#include "i18n.h" + +/* Implement the design principal specified by RFC 2718 2.2.5 + * Guidelines for new URL Schemes - within the APR. + * + * Since many architectures support unicode, and UCS2 is the most + * efficient storage used by those archictures, these functions + * exist to validate a UCS string. It is up to the operating system + * to determine the validitity of the string in the context of it's + * native language support. File systems that support filename + * characters of 0x80-0xff but have no support of Unicode will find + * this function useful only for validating the character sequences + * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is + * desired. + * + * from RFC 2279 UTF-8, a transformation format of ISO 10646 + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 1:2 0000 0000-0000 007F 0xxxxxxx + * 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx + * 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx + * 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx + * inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx + * inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * One of the X values must be one for the encoding length to be legit. + * Neither the z bit, nor the final two forms, are used for ucs-2 + * + * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in + * Unicode parlance), being actually UCS-4 characters transformed + * through UTF-16, need special treatment: the UTF-16 transformation + * must be undone, yielding a UCS-4 character that is then transformed + * as above." + * + * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask + * + * U' = U - 0x10000 + * U' = 000000000000yyyyyyyyyyxxxxxxxxxx + * W1 = 110110yyyyyyyyyy + * W2 = 110111xxxxxxxxxx + * + * conv_utf8_to_ucs2 out bytes: sizeof(in) * 1 <= Req <= sizeof(in) * 2 + * + * conv_ucs2_to_utf8 out words: sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2 + */ + +apr_status_t conv_utf8_to_ucs2(const char *in, apr_size_t *inbytes, + apr_wchar_t *out, apr_size_t *outwords) +{ + apr_int64_t newch, mask; + int ch, expect, eating; + + while (*inbytes && *outwords) + { + ch = (unsigned char)(*in++); + if (!(ch & 0200)) { + /* US-ASCII-7 plain text + */ + --*inbytes; + --*outwords; + *(out++) = ch; + } + else + { + if ((ch & 0300) != 0300) { + /* Multibyte Continuation is out of place + */ + return APR_EINVAL; + } + else + { + /* Multibyte Sequence Lead Character + * + * Compute the expected bytes while adjusting + * or lead byte and leading zeros mask. + */ + mask = 0340; + expect = 1; + while ((ch & mask) == mask) { + mask |= mask >> 1; + if (++expect > 3) /* (truly 5 for ucs-4) */ + return APR_EINVAL; + } + newch = ch & ~mask; + eating = expect + 1; + if (*inbytes <= expect) + return APR_INCOMPLETE; + /* Reject values of excessive leading 0 bits + * utf-8 _demands_ the shortest possible byte length + */ + if (expect == 1) { + if (!(newch & 0036)) + return APR_EINVAL; + } + else { + /* Reject values of excessive leading 0 bits + */ + if (!newch && !((unsigned char)*in & 0077 & (mask << 1))) + return APR_EINVAL; + if (expect == 2) { + /* Reject values D800-DFFF when not utf16 encoded + * (may not be an appropriate restriction for ucs-4) + */ + if (newch == 0015 && ((unsigned char)*in & 0040)) + return APR_EINVAL; + } + else if (expect == 3) { + /* Short circuit values > 110000 + */ + if (newch > 4) + return APR_EINVAL; + if (newch == 4 && ((unsigned char)*in & 0060)) + return APR_EINVAL; + } + } + if (*outwords < (expect > 2) + 1) + break; /* buffer full */ + while (expect--) + { + /* Multibyte Continuation must be legal */ + if (((ch = (unsigned char)*(in++)) & 0300) != 0200) + return APR_EINVAL; + newch <<= 6; + newch |= (ch & 0077); + } + *inbytes -= eating; + /* newch is now a true ucs-4 character + * + * now we need to fold to ucs-2 + */ + if (newch < 0x10000) + { + --*outwords; + *(out++) = (apr_wchar_t) newch; + } + else + { + *outwords -= 2; + newch -= 0x10000; + *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10)); + *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF)); + } + } + } + } + /* Buffer full 'errors' aren't errors, the client must inspect both + * the inbytes and outwords values + */ + return APR_SUCCESS; +} + +apr_status_t conv_ucs2_to_utf8(const apr_wchar_t *in, apr_size_t *inwords, + char *out, apr_size_t *outbytes) +{ + apr_int64_t newch, require; + char *invout; + int ch, need; + + while (*inwords && *outbytes) + { + ch = (unsigned short)(*in++); + if (ch < 0x80) + { + --*inwords; + --*outbytes; + *(out++) = (unsigned char) ch; + } + else + { + if ((ch & 0xFC00) == 0xDC00) { + /* Invalid Leading ucs-2 Multiword Continuation Character + */ + return APR_EINVAL; + } + if ((ch & 0xFC00) == 0xD800) { + /* Leading ucs-2 Multiword Character + */ + if (*inwords < 2) { + /* Missing ucs-2 Multiword Continuation Character + */ + return APR_INCOMPLETE; + } + if (((unsigned short)(*in) & 0xFC00) != 0xDC00) { + /* Invalid ucs-2 Multiword Continuation Character + */ + return APR_EINVAL; + } + newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF); + newch += 0x10000; + } + else { + /* ucs-2 Single Word Character + */ + newch = ch; + } + /* Determine the absolute minimum utf-8 bytes required + */ + require = newch >> 11; + need = 1; + while (require) + require >>= 5, ++need; + if (need >= *outbytes) + break; /* Insufficient buffer */ + *inwords -= (need > 2) + 1; + *outbytes -= need + 1; + /* Compute the utf-8 characters in last to first order, + * calculating the lead character length bits along the way. + */ + ch = 0200; + out += need + 1; + invout = out; + while (need--) { + ch |= ch >> 1; + *(--invout) = (unsigned char)(0200 | (newch & 0077)); + newch >>= 6; + } + /* Compute the lead utf-8 character and move the dest offset + */ + *(--invout) = (unsigned char)(ch | newch); + } + } + /* Buffer full 'errors' aren't errors, the client must inspect both + * the inwords and outbytes values + */ + return APR_SUCCESS; +} diff --git a/include/arch/unix/i18n.h b/include/arch/unix/i18n.h new file mode 100644 index 000000000..da5f58f2c --- /dev/null +++ b/include/arch/unix/i18n.h @@ -0,0 +1,89 @@ +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2000 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" must + * not be used to endorse or promote products derived from this + * software without prior written permission. For written + * permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * nor may "Apache" appear in their name, without prior written + * permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +#ifndef I18N_H +#define I18N_H + +#include "apr.h" +#include "apr_xlate.h" + +/* If we ever support anything more exciting than char... this could move. + */ +typedef apr_int16_t apr_wchar_t; + +/** + * An APR internal function for fast utf-8 octet-encoded Unicode conversion + * to the ucs-2 wide Unicode format. This function is used for filename and + * other resource conversions for platforms providing native Unicode support. + * + * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former + * when the character code is invalid (in or out of context) and the later + * when more characters were expected, but insufficient characters remain. + */ +apr_status_t conv_utf8_to_ucs2(const char *in, apr_size_t *inbytes, + apr_wchar_t *out, apr_size_t *outwords); + +/** + * An APR internal function for fast ucs-2 wide Unicode format conversion to + * the utf-8 octet-encoded Unicode. This function is used for filename and + * other resource conversions for platforms providing native Unicode support. + * + * @tip Only the errors APR_EINVAL and APR_INCOMPLETE may occur, the former + * when the character code is invalid (in or out of context) and the later + * when more words were expected, but insufficient words remain. + */ +apr_status_t conv_ucs2_to_utf8(const apr_wchar_t *in, apr_size_t *inwords, + char *out, apr_size_t *outbytes); + +#endif /* def I18N_H */ |