diff options
author | Ulrich Drepper <drepper@redhat.com> | 2000-09-11 20:33:59 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2000-09-11 20:33:59 +0000 |
commit | f6ad47269a0bcfb8d037d19579bd801efd7d926c (patch) | |
tree | 80d87ad6a4f4d48d0f03ea04bcaca5beebbb4912 /iconvdata/utf-7.c | |
parent | b81c896174dc98cb15cc80844751fb23cd9e02d1 (diff) | |
download | glibc-f6ad47269a0bcfb8d037d19579bd801efd7d926c.tar.gz |
Update.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-collate.c (collate_read): Fix typo in handling
of decimal ellipsis.
2000-09-11 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-collate.c (collate_read): Always initialize
error_section.next.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-collate.c (collate_finish): Upper bound for
ruleidx is 128, not 256.
2000-09-11 Ulrich Drepper <drepper@redhat.com>
* locale/programs/ld-collate.c (collate_read): Correct check for
already inserted entries.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconv/skeleton.c (FUNCTION_NAME): Handle unaligned access in
second try as well.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconv/skeleton.c (FUNCTION_NAME): Optimize an `if' if
MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconv/skeleton.c (gconv_init): Replace all uses of RESET_STATE with
SAVE_RESET_STATE.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconvdata/utf-7.c: New file.
* iconvdata/gconv-modules (UTF-7): New module entries.
* iconvdata/Makefile (modules): Add UTF-7.
(distribute): Add utf-7.c.
* iconvdata/testdata/UTF-7: New file.
* iconvdata/testdata/UTF-7..UTF8: New file.
* iconvdata/TESTS (UTF-7): New entry.
* iconvdata/run-iconv-test.sh: Fix confusing output.
Diffstat (limited to 'iconvdata/utf-7.c')
-rw-r--r-- | iconvdata/utf-7.c | 559 |
1 files changed, 559 insertions, 0 deletions
diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c new file mode 100644 index 0000000000..f02063d334 --- /dev/null +++ b/iconvdata/utf-7.c @@ -0,0 +1,559 @@ +/* Conversion module for UTF-7. + Copyright (C) 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Bruno Haible <haible@clisp.cons.org>, 2000. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* UTF-7 is a legacy encoding used for transmitting Unicode within the + ASCII character set, used primarily by mail agents. New programs + are encouraged to use UTF-8 instead. + + UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642). The + original Base64 encoding is defined in RFC 2045. */ + +#include <dlfcn.h> +#include <gconv.h> +#include <stdint.h> +#include <stdlib.h> + + +/* Define this to 1 if you want the so-called "optional direct" characters + ! " # $ % & * ; < = > @ [ ] ^ _ ` { | } + to be encoded. Define to 0 if you want them to be passed straight + through, like the so-called "direct" characters. + We set this to 1 because it's safer. + */ +#define UTF7_ENCODE_OPTIONAL_CHARS 1 + + +/* The set of "direct characters": + A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr +*/ + +static const unsigned char direct_tab[128/8] = + { + 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87, + 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07 + }; + +static inline int +isdirect (uint32_t ch) +{ + return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1)); +} + + +/* The set of "direct and optional direct characters": + A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr + ! " # $ % & * ; < = > @ [ ] ^ _ ` { | } +*/ + +static const unsigned char xdirect_tab[128/8] = + { + 0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f + }; + +static inline int +isxdirect (uint32_t ch) +{ + return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1)); +} + + +/* The set of "extended base64 characters": + A-Z a-z 0-9 + / - +*/ + +static const unsigned char xbase64_tab[128/8] = + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03, + 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07 + }; + +static inline int +isxbase64 (uint32_t ch) +{ + return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1)); +} + + +/* Converts a value in the range 0..63 to a base64 encoded char. */ +static inline unsigned char +base64 (unsigned int i) +{ + if (i < 26) + return i + 'A'; + else if (i < 52) + return i - 26 + 'a'; + else if (i < 62) + return i - 52 + '0'; + else if (i == 62) + return '+'; + else if (i == 63) + return '/'; + else + abort (); +} + + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "UTF-7//" +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define FROM_LOOP from_utf7_loop +#define TO_LOOP to_utf7_loop +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 4 +#define MAX_NEEDED_TO 4 +#define PREPARE_LOOP \ + mbstate_t saved_state; \ + mbstate_t *statep = data->__statep; +#define EXTRA_LOOP_ARGS , statep + + +/* Since we might have to reset input pointer we must be able to save + and restore the state. */ +#define SAVE_RESET_STATE(Save) \ + if (Save) \ + saved_state = *statep; \ + else \ + *statep = saved_state + + +/* First define the conversion function from UTF-7 to UCS4. + The state is structured as follows: + __count bit 2..0: zero + __count bit 8..3: shift + __wch: data + Precise meaning: + shift data + 0 -- not inside base64 encoding + 1..32 XX..XX00..00 inside base64, (32 - shift) bits pending + This state layout is simpler than relying on STORE_REST/UNPACK_BYTES. + + When shift = 0, __wch needs to store at most one lookahead byte (see + __GCONV_INCOMPLETE_INPUT below). +*/ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint_fast8_t ch = *inptr; \ + \ + if ((statep->__count >> 3) == 0) \ + { \ + /* base64 encoding inactive. */ \ + if (isxdirect (ch)) \ + { \ + inptr++; \ + put32 (outptr, ch); \ + outptr += 4; \ + } \ + else if (__builtin_expect (ch == '+', 1)) \ + { \ + if (__builtin_expect (inptr + 2 >= inend, 0)) \ + { \ + /* Not enough input available. */ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + if (inptr[1] == '-') \ + { \ + inptr += 2; \ + put32 (outptr, ch); \ + outptr += 4; \ + } \ + else \ + { \ + /* Switch into base64 mode. */ \ + inptr++; \ + statep->__count = (32 << 3); \ + statep->__value.__wch = 0; \ + } \ + } \ + else \ + { \ + /* The input is invalid. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + } \ + } \ + else \ + { \ + /* base64 encoding active. */ \ + uint32_t i; \ + int shift; \ + \ + if (ch >= 'A' && ch <= 'Z') \ + i = ch - 'A'; \ + else if (ch >= 'a' && ch <= 'z') \ + i = ch - 'a' + 26; \ + else if (ch >= '0' && ch <= '9') \ + i = ch - '0' + 52; \ + else if (ch == '+') \ + i = 62; \ + else if (ch == '/') \ + i = 63; \ + else \ + { \ + /* Terminate base64 encoding. */ \ + \ + /* If accumulated data is nonzero, the input is invalid. */ \ + /* Also, partial UTF-16 characters are invalid. */ \ + if (__builtin_expect (statep->__value.__wch != 0, 0) \ + || __builtin_expect ((statep->__count >> 3) <= 26, 0)) \ + { \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + statep->__count = 0; \ + continue; \ + } \ + \ + if (ch == '-') \ + inptr++; \ + \ + statep->__count = 0; \ + continue; \ + } \ + \ + /* Concatenate the base64 integer i to the accumulator. */ \ + shift = (statep->__count >> 3); \ + if (shift > 6) \ + { \ + uint32_t wch; \ + \ + shift -= 6; \ + wch = statep->__value.__wch | (i << shift); \ + \ + if (shift <= 16 && shift > 10) \ + { \ + /* An UTF-16 character has just been completed. */ \ + uint32_t wc1 = wch >> 16; \ + \ + /* UTF-16: When we see a High Surrogate, we must also decode \ + the following Low Surrogate. */ \ + if (!(wc1 >= 0xd800 && wc1 < 0xdc00)) \ + { \ + wch = wch << 16; \ + shift += 16; \ + put32 (outptr, wc1); \ + outptr += 4; \ + } \ + } \ + else if (shift <= 10 && shift > 4) \ + { \ + /* After a High Surrogate, verify that the next 16 bit \ + indeed form a Low Surrogate. */ \ + uint32_t wc2 = wch & 0xffff; \ + \ + if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \ + { \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + statep->__count = 0; \ + continue; \ + } \ + } \ + \ + statep->__value.__wch = wch; \ + } \ + else \ + { \ + /* An UTF-16 surrogate pair has just been completed. */ \ + uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16; \ + uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff) \ + | (i >> (6 - shift)); \ + \ + statep->__value.__wch = (i << shift) << 26; \ + shift += 26; \ + \ + assert (wc1 >= 0xd800 && wc1 < 0xdc00); \ + assert (wc2 >= 0xdc00 && wc2 < 0xe000); \ + put32 (outptr, \ + 0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00)); \ + outptr += 4; \ + } \ + \ + statep->__count = shift << 3; \ + \ + /* Now that we digested the input increment the input pointer. */ \ + inptr++; \ + } \ + } +#define LOOP_NEED_FLAGS +#define EXTRA_LOOP_DECLS , mbstate_t *statep +#include <iconv/loop.c> + + +/* Next, define the conversion from UCS4 to UTF-7. + The state is structured as follows: + __count bit 2..0: zero + __count bit 4..3: shift + __count bit 8..5: data + Precise meaning: + shift data + 0 0 not inside base64 encoding + 1 0 inside base64, no pending bits + 2 XX00 inside base64, 2 bits known for next byte + 3 XXXX inside base64, 4 bits known for next byte + + __count bit 2..0 and __wch are always zero, because this direction + never returns __GCONV_INCOMPLETE_INPUT. +*/ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MAX_NEEDED_INPUT MAX_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + uint32_t ch = get32 (inptr); \ + \ + if ((statep->__count & 0x18) == 0) \ + { \ + /* base64 encoding inactive */ \ + if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \ + { \ + *outptr++ = (unsigned char) ch; \ + } \ + else \ + { \ + size_t count; \ + \ + if (ch == '+') \ + count = 2; \ + else if (ch < 0x10000) \ + count = 3; \ + else if (ch < 0x110000) \ + count = 6; \ + else \ + STANDARD_ERR_HANDLER (4); \ + \ + if (__builtin_expect (outptr + count > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *outptr++ = '+'; \ + if (ch == '+') \ + *outptr++ = '-'; \ + else if (ch < 0x10000) \ + { \ + *outptr++ = base64 (ch >> 10); \ + *outptr++ = base64 ((ch >> 4) & 0x3f); \ + statep->__count = ((ch & 15) << 5) | (3 << 3); \ + } \ + else if (ch < 0x110000) \ + { \ + uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \ + uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \ + \ + ch = (ch1 << 16) | ch2; \ + *outptr++ = base64 (ch >> 26); \ + *outptr++ = base64 ((ch >> 20) & 0x3f); \ + *outptr++ = base64 ((ch >> 14) & 0x3f); \ + *outptr++ = base64 ((ch >> 8) & 0x3f); \ + *outptr++ = base64 ((ch >> 2) & 0x3f); \ + statep->__count = ((ch & 3) << 7) | (2 << 3); \ + } \ + else \ + abort (); \ + } \ + } \ + else \ + { \ + /* base64 encoding active */ \ + if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \ + { \ + /* deactivate base64 encoding */ \ + size_t count; \ + \ + count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \ + if (__builtin_expect (outptr + count > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + if ((statep->__count & 0x18) >= 0x10) \ + *outptr++ = base64 ((statep->__count >> 3) & ~3); \ + if (isxbase64 (ch)) \ + *outptr++ = '-'; \ + *outptr++ = (unsigned char) ch; \ + statep->__count = 0; \ + } \ + else \ + { \ + size_t count; \ + \ + if (ch < 0x10000) \ + count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2); \ + else if (ch < 0x110000) \ + count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5); \ + else \ + STANDARD_ERR_HANDLER (4); \ + \ + if (__builtin_expect (outptr + count > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + if (ch < 0x10000) \ + { \ + switch ((statep->__count >> 3) & 3) \ + { \ + case 1: \ + *outptr++ = base64 (ch >> 10); \ + *outptr++ = base64 ((ch >> 4) & 0x3f); \ + statep->__count = ((ch & 15) << 5) | (3 << 3); \ + break; \ + case 2: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 12)); \ + *outptr++ = base64 ((ch >> 6) & 0x3f); \ + *outptr++ = base64 (ch & 0x3f); \ + statep->__count = (1 << 3); \ + break; \ + case 3: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 14)); \ + *outptr++ = base64 ((ch >> 8) & 0x3f); \ + *outptr++ = base64 ((ch >> 2) & 0x3f); \ + statep->__count = ((ch & 3) << 7) | (2 << 3); \ + break; \ + default: \ + abort (); \ + } \ + } \ + else if (ch < 0x110000) \ + { \ + uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \ + uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \ + \ + ch = (ch1 << 16) | ch2; \ + switch ((statep->__count >> 3) & 3) \ + { \ + case 1: \ + *outptr++ = base64 (ch >> 26); \ + *outptr++ = base64 ((ch >> 20) & 0x3f); \ + *outptr++ = base64 ((ch >> 14) & 0x3f); \ + *outptr++ = base64 ((ch >> 8) & 0x3f); \ + *outptr++ = base64 ((ch >> 2) & 0x3f); \ + statep->__count = ((ch & 3) << 7) | (2 << 3); \ + break; \ + case 2: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 28)); \ + *outptr++ = base64 ((ch >> 22) & 0x3f); \ + *outptr++ = base64 ((ch >> 16) & 0x3f); \ + *outptr++ = base64 ((ch >> 10) & 0x3f); \ + *outptr++ = base64 ((ch >> 4) & 0x3f); \ + statep->__count = ((ch & 15) << 5) | (3 << 3); \ + break; \ + case 3: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 30)); \ + *outptr++ = base64 ((ch >> 24) & 0x3f); \ + *outptr++ = base64 ((ch >> 18) & 0x3f); \ + *outptr++ = base64 ((ch >> 12) & 0x3f); \ + *outptr++ = base64 ((ch >> 6) & 0x3f); \ + *outptr++ = base64 (ch & 0x3f); \ + statep->__count = (1 << 3); \ + break; \ + default: \ + abort (); \ + } \ + } \ + else \ + abort (); \ + } \ + } \ + \ + /* Now that we wrote the output increment the input pointer. */ \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#define EXTRA_LOOP_DECLS , mbstate_t *statep +#include <iconv/loop.c> + + +/* Since this is a stateful encoding we have to provide code which resets + the output state to the initial state. This has to be done during the + flushing. */ +#define EMIT_SHIFT_TO_INIT \ + if (FROM_DIRECTION) \ + /* Nothing to emit. */ \ + memset (data->__statep, '\0', sizeof (mbstate_t)); \ + else \ + { \ + /* The "to UTF-7" direction. Flush the remaining bits and terminate \ + with a '-' byte. This will guarantee correct decoding if more \ + UTF-7 encoded text is added afterwards. */ \ + int state = data->__statep->__count; \ + \ + if (state & 0x18) \ + { \ + /* Deactivate base64 encoding. */ \ + unsigned char *outbuf = data->__outbuf; \ + size_t count = ((state & 0x18) >= 0x10) + 1; \ + \ + if (__builtin_expect (outbuf + count > data->__outbufend, 0)) \ + /* We don't have enough room in the output buffer. */ \ + status = __GCONV_FULL_OUTPUT; \ + else \ + { \ + /* Write out the shift sequence. */ \ + if ((state & 0x18) >= 0x10) \ + *outbuf++ = base64 ((state >> 3) & ~3); \ + *outbuf++ = '-'; \ + \ + data->__outbuf = outbuf; \ + data->__statep->__count = 0; \ + } \ + } \ + else \ + data->__statep->__count = 0; \ + } + + +/* Now define the toplevel functions. */ +#include <iconv/skeleton.c> |