/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

#include "unix/i18n.h"

/* Implement the design principal specified by RFC 2718 2.2.5 
 * Guidelines for new URL Schemes - within the APR.
 *
 * Since many architectures support unicode, and UCS2 is the most
 * efficient storage used by those archictures, these functions
 * exist to validate a UCS string.  It is up to the operating system
 * to determine the validitity of the string in the context of it's
 * native language support.  File systems that support filename 
 * characters of 0x80-0xff but have no support of Unicode will find 
 * this function useful only for validating the character sequences 
 * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
 * desired.
 *
 * from RFC 2279 UTF-8, a transformation format of ISO 10646
 *
 *     UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 1:2 0000 0000-0000 007F   0xxxxxxx
 * 2:2 0000 0080-0000 07FF   110XXXXx 10xxxxxx
 * 3:2 0000 0800-0000 FFFF   1110XXXX 10Xxxxxx 10xxxxxx
 * 4:4 0001 0000-001F FFFF   11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
 * inv 0020 0000-03FF FFFF   111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * inv 0400 0000-7FFF FFFF   1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * One of the X values must be one for the encoding length to be legit.
 * Neither the z bit, nor the final two forms, are used for ucs-2
 *
 *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in 
 *   Unicode parlance), being actually UCS-4 characters transformed 
 *   through UTF-16, need special treatment: the UTF-16 transformation 
 *   must be undone, yielding a UCS-4 character that is then transformed 
 *   as above."
 *
 * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
 *
 *  U' = U - 0x10000
 *  U' = 000000000000yyyyyyyyyyxxxxxxxxxx
 *                  W1 = 110110yyyyyyyyyy
 *                  W2 = 110111xxxxxxxxxx
 *
 * conv_utf8_to_ucs2 out bytes:  sizeof(in) * 1 <= Req <= sizeof(in) * 2
 *
 * conv_ucs2_to_utf8 out words:  sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
 */

apr_status_t conv_utf8_to_ucs2(const char *in, apr_size_t *inbytes,
                               apr_wchar_t *out, apr_size_t *outwords)
{
    apr_int64_t newch, mask;
    int ch, expect, eating;
    
    while (*inbytes && *outwords) 
    {
        ch = (unsigned char)(*in++);
        if (!(ch & 0200)) {
            /* US-ASCII-7 plain text
             */
            --*inbytes;
            --*outwords;
            *(out++) = ch;
        }
        else
        {
            if ((ch & 0300) != 0300) { 
                /* Multibyte Continuation is out of place
                 */
                return APR_EINVAL;
            }
            else
            {
                /* Multibyte Sequence Lead Character
                 *
                 * Compute the expected bytes while adjusting
                 * or lead byte and leading zeros mask.
                 */
                mask = 0340;
                expect = 1;
                while ((ch & mask) == mask) {
                    mask |= mask >> 1;
                    if (++expect > 3) /* (truly 5 for ucs-4) */
                        return APR_EINVAL;
                }
                newch = ch & ~mask;
                eating = expect + 1;
                if (*inbytes <= expect)
                    return APR_INCOMPLETE;
                /* Reject values of excessive leading 0 bits
                 * utf-8 _demands_ the shortest possible byte length
                 */
                if (expect == 1) {
                    if (!(newch & 0036))
                        return APR_EINVAL;
                }
                else {
                    /* Reject values of excessive leading 0 bits
                     */
                    if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
                        return APR_EINVAL;
                    if (expect == 2) {
                        /* Reject values D800-DFFF when not utf16 encoded
                         * (may not be an appropriate restriction for ucs-4)
                         */
                        if (newch == 0015 && ((unsigned char)*in & 0040))
                            return APR_EINVAL;
                    }
                    else if (expect == 3) {
                        /* Short circuit values > 110000
                         */
                        if (newch > 4)
                            return APR_EINVAL;
                        if (newch == 4 && ((unsigned char)*in & 0060))
                            return APR_EINVAL;
                    }
                }
                if (*outwords < (expect > 2) + 1) 
                    break; /* buffer full */
                while (expect--)
                {
                    /* Multibyte Continuation must be legal */
                    if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
                        return APR_EINVAL;
                    newch <<= 6;
                    newch |= (ch & 0077);
                }
                *inbytes -= eating;
                /* newch is now a true ucs-4 character
                 *
                 * now we need to fold to ucs-2
                 */
                if (newch < 0x10000) 
                {
                    --*outwords;
                    *(out++) = (apr_wchar_t) newch;
                }
                else 
                {
                    *outwords -= 2;
                    newch -= 0x10000;
                    *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
                    *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));                    
                }
            }
        }
    }
    /* Buffer full 'errors' aren't errors, the client must inspect both
     * the inbytes and outwords values
     */
    return APR_SUCCESS;
}

apr_status_t conv_ucs2_to_utf8(const apr_wchar_t *in, apr_size_t *inwords,
                               char *out, apr_size_t *outbytes)
{
    apr_int64_t newch, require;
    char *invout;
    int ch, need;
    
    while (*inwords && *outbytes) 
    {
        ch = (unsigned short)(*in++);
        if (ch < 0x80)
        {
            --*inwords;
            --*outbytes;
            *(out++) = (unsigned char) ch;
        }
        else 
        {
            if ((ch & 0xFC00) == 0xDC00) {
                /* Invalid Leading ucs-2 Multiword Continuation Character
                 */
                return APR_EINVAL;
            }
            if ((ch & 0xFC00) == 0xD800) {
                /* Leading ucs-2 Multiword Character
                 */
                if (*inwords < 2) {
                    /* Missing ucs-2 Multiword Continuation Character
                     */
                    return APR_INCOMPLETE;
                }
                if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
                    /* Invalid ucs-2 Multiword Continuation Character
                     */
                    return APR_EINVAL;
                }
                newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
                newch += 0x10000;
            }
            else {
                /* ucs-2 Single Word Character
                 */
                newch = ch;
            }
            /* Determine the absolute minimum utf-8 bytes required
             */
            require = newch >> 11;
            need = 1;
            while (require)
                require >>= 5, ++need;
            if (need >= *outbytes)
                break; /* Insufficient buffer */
            *inwords -= (need > 2) + 1;
            *outbytes -= need + 1;
            /* Compute the utf-8 characters in last to first order,
             * calculating the lead character length bits along the way.
             */
            ch = 0200;
            out += need + 1;
            invout = out;
            while (need--) {
                ch |= ch >> 1;
                *(--invout) = (unsigned char)(0200 | (newch & 0077));
                newch >>= 6;
            }
            /* Compute the lead utf-8 character and move the dest offset
             */
            *(--invout) = (unsigned char)(ch | newch);
        }
    }
    /* Buffer full 'errors' aren't errors, the client must inspect both
     * the inwords and outbytes values
     */
    return APR_SUCCESS;    
}