diff options
author | Ulrich Drepper <drepper@redhat.com> | 2000-09-11 20:33:59 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2000-09-11 20:33:59 +0000 |
commit | f6ad47269a0bcfb8d037d19579bd801efd7d926c (patch) | |
tree | 80d87ad6a4f4d48d0f03ea04bcaca5beebbb4912 | |
parent | b81c896174dc98cb15cc80844751fb23cd9e02d1 (diff) | |
download | glibc-f6ad47269a0bcfb8d037d19579bd801efd7d926c.tar.gz |
Update.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-collate.c (collate_read): Fix typo in handling
of decimal ellipsis.
2000-09-11 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-collate.c (collate_read): Always initialize
error_section.next.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* locale/programs/ld-collate.c (collate_finish): Upper bound for
ruleidx is 128, not 256.
2000-09-11 Ulrich Drepper <drepper@redhat.com>
* locale/programs/ld-collate.c (collate_read): Correct check for
already inserted entries.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconv/skeleton.c (FUNCTION_NAME): Handle unaligned access in
second try as well.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconv/skeleton.c (FUNCTION_NAME): Optimize an `if' if
MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconv/skeleton.c (gconv_init): Replace all uses of RESET_STATE with
SAVE_RESET_STATE.
2000-09-10 Bruno Haible <haible@clisp.cons.org>
* iconvdata/utf-7.c: New file.
* iconvdata/gconv-modules (UTF-7): New module entries.
* iconvdata/Makefile (modules): Add UTF-7.
(distribute): Add utf-7.c.
* iconvdata/testdata/UTF-7: New file.
* iconvdata/testdata/UTF-7..UTF8: New file.
* iconvdata/TESTS (UTF-7): New entry.
* iconvdata/run-iconv-test.sh: Fix confusing output.
-rw-r--r-- | ChangeLog | 46 | ||||
-rw-r--r-- | iconv/skeleton.c | 78 | ||||
-rw-r--r-- | iconvdata/Makefile | 7 | ||||
-rw-r--r-- | iconvdata/TESTS | 1 | ||||
-rw-r--r-- | iconvdata/gconv-modules | 4 | ||||
-rwxr-xr-x | iconvdata/run-iconv-test.sh | 2 | ||||
-rw-r--r-- | iconvdata/testdata/UTF-7 | 25 | ||||
-rw-r--r-- | iconvdata/testdata/UTF-7..UTF8 | 25 | ||||
-rw-r--r-- | iconvdata/utf-7.c | 559 | ||||
-rw-r--r-- | locale/programs/ld-collate.c | 85 |
10 files changed, 774 insertions, 58 deletions
@@ -1,3 +1,49 @@ +2000-09-10 Bruno Haible <haible@clisp.cons.org> + + * locale/programs/ld-collate.c (collate_read): Fix typo in handling + of decimal ellipsis. + +2000-09-11 Bruno Haible <haible@clisp.cons.org> + + * locale/programs/ld-collate.c (collate_read): Always initialize + error_section.next. + +2000-09-10 Bruno Haible <haible@clisp.cons.org> + + * locale/programs/ld-collate.c (collate_finish): Upper bound for + ruleidx is 128, not 256. + +2000-09-11 Ulrich Drepper <drepper@redhat.com> + + * locale/programs/ld-collate.c (collate_read): Correct check for + already inserted entries. + +2000-09-10 Bruno Haible <haible@clisp.cons.org> + + * iconv/skeleton.c (FUNCTION_NAME): Handle unaligned access in + second try as well. + +2000-09-10 Bruno Haible <haible@clisp.cons.org> + + * iconv/skeleton.c (FUNCTION_NAME): Optimize an `if' if + MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1. + +2000-09-10 Bruno Haible <haible@clisp.cons.org> + + * iconv/skeleton.c (gconv_init): Replace all uses of RESET_STATE with + SAVE_RESET_STATE. + +2000-09-10 Bruno Haible <haible@clisp.cons.org> + + * iconvdata/utf-7.c: New file. + * iconvdata/gconv-modules (UTF-7): New module entries. + * iconvdata/Makefile (modules): Add UTF-7. + (distribute): Add utf-7.c. + * iconvdata/testdata/UTF-7: New file. + * iconvdata/testdata/UTF-7..UTF8: New file. + * iconvdata/TESTS (UTF-7): New entry. + * iconvdata/run-iconv-test.sh: Fix confusing output. + 2000-09-11 Ulrich Drepper <drepper@redhat.com> * sysdeps/posix/tempname.c (__gen_tempname): Use __lxstat and __xstat. diff --git a/iconv/skeleton.c b/iconv/skeleton.c index 76189b56a9..f459ed588f 100644 --- a/iconv/skeleton.c +++ b/iconv/skeleton.c @@ -57,7 +57,12 @@ from the current characters. TO_LOOP likewise for the other direction - RESET_STATE in case of an error we must reset the state for + ONE_DIRECTION optional. If defined to 1, only one conversion + direction is defined instead of two. In this + case, FROM_DIRECTION should be defined to 1, and + FROM_LOOP and TO_LOOP should have the same value. + + SAVE_RESET_STATE in case of an error we must reset the state for the rerun so this macro must be defined for stateful encodings. It takes an argument which is nonzero when saving. @@ -184,8 +189,8 @@ static int to_object; #endif -/* For conversions from a fixed width character sets to another fixed width - character set we we can define RESET_INPUT_BUFFER is necessary. */ +/* For conversions from a fixed width character set to another fixed width + character set we can define RESET_INPUT_BUFFER in a very fast way. */ #if !defined RESET_INPUT_BUFFER && !defined SAVE_RESET_STATE # if MIN_NEEDED_FROM == MAX_NEEDED_FROM && MIN_NEEDED_TO == MAX_NEEDED_TO /* We have to use these `if's here since the compiler cannot know that @@ -233,7 +238,7 @@ gconv_init (struct __gconv_step *step) else return __GCONV_NOCONV; -#ifdef RESET_STATE +#ifdef SAVE_RESET_STATE step->__stateful = 1; #else step->__stateful = 0; @@ -245,8 +250,8 @@ gconv_init (struct __gconv_step *step) /* The default destructor function does nothing in the moment and so - be define it at all. But we still provide the macro just in case - we need it some day. */ + we don't define it at all. But we still provide the macro just in + case we need it some day. */ #if DEFINE_FINI #endif @@ -339,7 +344,8 @@ FUNCTION_NAME (struct __gconv_step *step, struct __gconv_step_data *data, /* If the function is used to implement the mb*towc*() or wc*tomb*() functions we must test whether any bytes from the last call are stored in the `state' object. */ - if (((MAX_NEEDED_FROM > 1 && FROM_DIRECTION) + if (((MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1) + || (MAX_NEEDED_FROM > 1 && FROM_DIRECTION) || (MAX_NEEDED_TO > 1 && !FROM_DIRECTION)) && consume_incomplete && (data->__statep->__count & 7) != 0) { @@ -491,23 +497,44 @@ FUNCTION_NAME (struct __gconv_step *step, struct __gconv_step_data *data, SAVE_RESET_STATE (0); # endif - /* XXX Handle unaligned access here as well. */ - if (FROM_DIRECTION) - /* Run the conversion loop. */ - nstatus = FROM_LOOP (step, data, - (const unsigned char **) inptrp, - (const unsigned char *) inend, - (unsigned char **) &outbuf, - (unsigned char *) outerr, - lirreversiblep EXTRA_LOOP_ARGS); + if (__builtin_expect (!unaligned, 1)) + { + if (FROM_DIRECTION) + /* Run the conversion loop. */ + nstatus = FROM_LOOP (step, data, inptrp, inend, + &outbuf, + (unsigned char *) outerr, + lirreversiblep + EXTRA_LOOP_ARGS); + else + /* Run the conversion loop. */ + nstatus = TO_LOOP (step, data, inptrp, inend, + &outbuf, + (unsigned char *) outerr, + lirreversiblep + EXTRA_LOOP_ARGS); + } +# if !defined _STRING_ARCH_unaligned \ + && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \ + && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0 else - /* Run the conversion loop. */ - nstatus = TO_LOOP (step, data, - (const unsigned char **) inptrp, - (const unsigned char *) inend, - (unsigned char **) &outbuf, - (unsigned char *) outerr, - lirreversiblep EXTRA_LOOP_ARGS); + { + if (FROM_DIRECTION) + /* Run the conversion loop. */ + nstatus = GEN_unaligned (FROM_LOOP) (step, data, + inptrp, inend, + (unsigned char *) outerr, + lirreversiblep + EXTRA_LOOP_ARGS); + else + /* Run the conversion loop. */ + nstatus = GEN_unaligned (TO_LOOP) (step, data, + inptrp, inend, + (unsigned char *) outerr, + lirreversiblep + EXTRA_LOOP_ARGS); + } +# endif /* We must run out of output buffer space in this rerun. */ @@ -540,7 +567,8 @@ FUNCTION_NAME (struct __gconv_step *step, struct __gconv_step_data *data, /* If we are supposed to consume all character store now all of the remaining characters in the `state' object. */ #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1 - if (((MAX_NEEDED_FROM > 1 && FROM_DIRECTION) + if (((MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1) + || (MAX_NEEDED_FROM > 1 && FROM_DIRECTION) || (MAX_NEEDED_TO > 1 && !FROM_DIRECTION)) && __builtin_expect (consume_incomplete, 0) && status == __GCONV_INCOMPLETE_INPUT) @@ -580,7 +608,7 @@ FUNCTION_NAME (struct __gconv_step *step, struct __gconv_step_data *data, #undef EMIT_SHIFT_TO_INIT #undef FROM_LOOP #undef TO_LOOP -#undef RESET_STATE +#undef SAVE_RESET_STATE #undef RESET_INPUT_BUFFER #undef FUNCTION_NAME #undef PREPARE_LOOP diff --git a/iconvdata/Makefile b/iconvdata/Makefile index 687ca4e156..44e3f8f531 100644 --- a/iconvdata/Makefile +++ b/iconvdata/Makefile @@ -45,7 +45,8 @@ modules := ISO8859-1 ISO8859-2 ISO8859-3 ISO8859-4 ISO8859-5 \ INIS-CYRILLIC ISO_6937-2 ISO_2033 ISO_5427 ISO_5427-EXT \ ISO_5428 ISO_10367-BOX MAC-IS MAC-UK NATS-DANO NATS-SEFI \ SAMI-WS2 ISO-IR-197 TIS-620 KOI8-U GBK ISIRI-3342 GBGBK \ - ISO-2022-CN libISOIR165 UTF-16 UNICODE BIG5HKSCS GB18030 + ISO-2022-CN libISOIR165 UTF-16 UNICODE UTF-7 BIG5HKSCS \ + GB18030 modules.so := $(addsuffix .so, $(modules)) @@ -125,8 +126,8 @@ distribute := gconv-modules extra-module.mk gap.awk gaptab.awk \ macintosh.c mac-is.c mac-uk.c nats-dano.c nats-sefi.c sjis.c \ t.61.c uhc.c sami-ws2.c iso-ir-197.c tis-620.c koi8-u.c \ isiri-3342.c isiri-3342.h gbgbk.c iso-2022-cn.c cns11643l2.h \ - iso8859-16.c utf-16.c unicode.c big5hkscs.c iso-ir-165.c \ - iso-ir-165.h gb18030.c + iso8859-16.c utf-16.c unicode.c utf-7.c big5hkscs.c \ + iso-ir-165.c iso-ir-165.h gb18030.c # We build the transformation modules only when we build shared libs. ifeq (yes,$(build-shared)) diff --git a/iconvdata/TESTS b/iconvdata/TESTS index 87ecfc96d1..34a4fc33e7 100644 --- a/iconvdata/TESTS +++ b/iconvdata/TESTS @@ -81,3 +81,4 @@ EUC-KR EUC-KR Y UTF8 EUC-CN EUC-CN Y UTF8 GBK GBK Y UTF8 BIG5HKSCS BIG5HKSCS Y UTF8 +UTF-7 UTF-7 N UTF8 diff --git a/iconvdata/gconv-modules b/iconvdata/gconv-modules index 725f45d3c9..0df538848e 100644 --- a/iconvdata/gconv-modules +++ b/iconvdata/gconv-modules @@ -1195,5 +1195,9 @@ module UNICODE// INTERNAL UNICODE 1 module INTERNAL UNICODE// UNICODE 1 # from to module cost +module UTF-7// INTERNAL UTF-7 1 +module INTERNAL UTF-7// UTF-7 1 + +# from to module cost module GB18030// INTERNAL GB18030 1 module INTERNAL GB18030// GB18030 1 diff --git a/iconvdata/run-iconv-test.sh b/iconvdata/run-iconv-test.sh index a16f667c67..a000b68b14 100755 --- a/iconvdata/run-iconv-test.sh +++ b/iconvdata/run-iconv-test.sh @@ -100,7 +100,7 @@ while read from to subset targets; do { echo "/FAILED"; failed=1; continue; } else - echo $ac_n " suntzu: $from -> ASCII -> $to $ac_c" + echo $ac_n " suntzu: ASCII -> $to -> ASCII $ac_c" $PROG -f ASCII -t $to testdata/suntzus | $PROG -f $to -t ASCII > $temp1 || { if test $? -gt 128; then exit 1; fi diff --git a/iconvdata/testdata/UTF-7 b/iconvdata/testdata/UTF-7 new file mode 100644 index 0000000000..06097f42ff --- /dev/null +++ b/iconvdata/testdata/UTF-7 @@ -0,0 +1,25 @@ ++EqASGxItEps Amharic ++AQ0-esky Czech +Dansk Danish +English English +Suomi Finnish +Fran+AOc-ais French +Deutsch German ++A5UDuwO7A7cDvQO5A7oDrA Greek ++BeIF0QXoBdkF6g Hebrew +Italiano Italian +Norsk Norwegian ++BCAEQwRBBEEEOgQ4BDk Russian +Espa+APE-ol Spanish +Svenska Swedish ++DiAOMg4pDjIORA4XDiI Thai +T+APw-rk+AOc-e Turkish +Ti+Hr8-ng Vi+Hsc-t Vietnamese ++ZeVnLIqe Japanese ++Ti1lhw Chinese ++1VyuAA Korean + +// The last line of this file is missing the end-of-line terminator +// on purpose, in order to test that the conversion empties the bit buffer +// and shifts back to the initial state at the end of the conversion. +A+ImIDkQ-
\ No newline at end of file diff --git a/iconvdata/testdata/UTF-7..UTF8 b/iconvdata/testdata/UTF-7..UTF8 new file mode 100644 index 0000000000..3b362e578c --- /dev/null +++ b/iconvdata/testdata/UTF-7..UTF8 @@ -0,0 +1,25 @@ +አማርኛ Amharic +česky Czech +Dansk Danish +English English +Suomi Finnish +Français French +Deutsch German +Ελληνικά Greek +עברית Hebrew +Italiano Italian +Norsk Norwegian +Русский Russian +Español Spanish +Svenska Swedish +ภาษาไทย Thai +Türkçe Turkish +Tiếng Việt Vietnamese +日本語 Japanese +中文 Chinese +한글 Korean + +// The last line of this file is missing the end-of-line terminator +// on purpose, in order to test that the conversion empties the bit buffer +// and shifts back to the initial state at the end of the conversion. +A≢Α
\ No newline at end of file diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c new file mode 100644 index 0000000000..f02063d334 --- /dev/null +++ b/iconvdata/utf-7.c @@ -0,0 +1,559 @@ +/* Conversion module for UTF-7. + Copyright (C) 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Bruno Haible <haible@clisp.cons.org>, 2000. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* UTF-7 is a legacy encoding used for transmitting Unicode within the + ASCII character set, used primarily by mail agents. New programs + are encouraged to use UTF-8 instead. + + UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642). The + original Base64 encoding is defined in RFC 2045. */ + +#include <dlfcn.h> +#include <gconv.h> +#include <stdint.h> +#include <stdlib.h> + + +/* Define this to 1 if you want the so-called "optional direct" characters + ! " # $ % & * ; < = > @ [ ] ^ _ ` { | } + to be encoded. Define to 0 if you want them to be passed straight + through, like the so-called "direct" characters. + We set this to 1 because it's safer. + */ +#define UTF7_ENCODE_OPTIONAL_CHARS 1 + + +/* The set of "direct characters": + A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr +*/ + +static const unsigned char direct_tab[128/8] = + { + 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87, + 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07 + }; + +static inline int +isdirect (uint32_t ch) +{ + return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1)); +} + + +/* The set of "direct and optional direct characters": + A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr + ! " # $ % & * ; < = > @ [ ] ^ _ ` { | } +*/ + +static const unsigned char xdirect_tab[128/8] = + { + 0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f + }; + +static inline int +isxdirect (uint32_t ch) +{ + return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1)); +} + + +/* The set of "extended base64 characters": + A-Z a-z 0-9 + / - +*/ + +static const unsigned char xbase64_tab[128/8] = + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03, + 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07 + }; + +static inline int +isxbase64 (uint32_t ch) +{ + return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1)); +} + + +/* Converts a value in the range 0..63 to a base64 encoded char. */ +static inline unsigned char +base64 (unsigned int i) +{ + if (i < 26) + return i + 'A'; + else if (i < 52) + return i - 26 + 'a'; + else if (i < 62) + return i - 52 + '0'; + else if (i == 62) + return '+'; + else if (i == 63) + return '/'; + else + abort (); +} + + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "UTF-7//" +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define FROM_LOOP from_utf7_loop +#define TO_LOOP to_utf7_loop +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 4 +#define MAX_NEEDED_TO 4 +#define PREPARE_LOOP \ + mbstate_t saved_state; \ + mbstate_t *statep = data->__statep; +#define EXTRA_LOOP_ARGS , statep + + +/* Since we might have to reset input pointer we must be able to save + and restore the state. */ +#define SAVE_RESET_STATE(Save) \ + if (Save) \ + saved_state = *statep; \ + else \ + *statep = saved_state + + +/* First define the conversion function from UTF-7 to UCS4. + The state is structured as follows: + __count bit 2..0: zero + __count bit 8..3: shift + __wch: data + Precise meaning: + shift data + 0 -- not inside base64 encoding + 1..32 XX..XX00..00 inside base64, (32 - shift) bits pending + This state layout is simpler than relying on STORE_REST/UNPACK_BYTES. + + When shift = 0, __wch needs to store at most one lookahead byte (see + __GCONV_INCOMPLETE_INPUT below). +*/ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint_fast8_t ch = *inptr; \ + \ + if ((statep->__count >> 3) == 0) \ + { \ + /* base64 encoding inactive. */ \ + if (isxdirect (ch)) \ + { \ + inptr++; \ + put32 (outptr, ch); \ + outptr += 4; \ + } \ + else if (__builtin_expect (ch == '+', 1)) \ + { \ + if (__builtin_expect (inptr + 2 >= inend, 0)) \ + { \ + /* Not enough input available. */ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + if (inptr[1] == '-') \ + { \ + inptr += 2; \ + put32 (outptr, ch); \ + outptr += 4; \ + } \ + else \ + { \ + /* Switch into base64 mode. */ \ + inptr++; \ + statep->__count = (32 << 3); \ + statep->__value.__wch = 0; \ + } \ + } \ + else \ + { \ + /* The input is invalid. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + } \ + } \ + else \ + { \ + /* base64 encoding active. */ \ + uint32_t i; \ + int shift; \ + \ + if (ch >= 'A' && ch <= 'Z') \ + i = ch - 'A'; \ + else if (ch >= 'a' && ch <= 'z') \ + i = ch - 'a' + 26; \ + else if (ch >= '0' && ch <= '9') \ + i = ch - '0' + 52; \ + else if (ch == '+') \ + i = 62; \ + else if (ch == '/') \ + i = 63; \ + else \ + { \ + /* Terminate base64 encoding. */ \ + \ + /* If accumulated data is nonzero, the input is invalid. */ \ + /* Also, partial UTF-16 characters are invalid. */ \ + if (__builtin_expect (statep->__value.__wch != 0, 0) \ + || __builtin_expect ((statep->__count >> 3) <= 26, 0)) \ + { \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + statep->__count = 0; \ + continue; \ + } \ + \ + if (ch == '-') \ + inptr++; \ + \ + statep->__count = 0; \ + continue; \ + } \ + \ + /* Concatenate the base64 integer i to the accumulator. */ \ + shift = (statep->__count >> 3); \ + if (shift > 6) \ + { \ + uint32_t wch; \ + \ + shift -= 6; \ + wch = statep->__value.__wch | (i << shift); \ + \ + if (shift <= 16 && shift > 10) \ + { \ + /* An UTF-16 character has just been completed. */ \ + uint32_t wc1 = wch >> 16; \ + \ + /* UTF-16: When we see a High Surrogate, we must also decode \ + the following Low Surrogate. */ \ + if (!(wc1 >= 0xd800 && wc1 < 0xdc00)) \ + { \ + wch = wch << 16; \ + shift += 16; \ + put32 (outptr, wc1); \ + outptr += 4; \ + } \ + } \ + else if (shift <= 10 && shift > 4) \ + { \ + /* After a High Surrogate, verify that the next 16 bit \ + indeed form a Low Surrogate. */ \ + uint32_t wc2 = wch & 0xffff; \ + \ + if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \ + { \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++inptr; \ + ++*irreversible; \ + statep->__count = 0; \ + continue; \ + } \ + } \ + \ + statep->__value.__wch = wch; \ + } \ + else \ + { \ + /* An UTF-16 surrogate pair has just been completed. */ \ + uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16; \ + uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff) \ + | (i >> (6 - shift)); \ + \ + statep->__value.__wch = (i << shift) << 26; \ + shift += 26; \ + \ + assert (wc1 >= 0xd800 && wc1 < 0xdc00); \ + assert (wc2 >= 0xdc00 && wc2 < 0xe000); \ + put32 (outptr, \ + 0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00)); \ + outptr += 4; \ + } \ + \ + statep->__count = shift << 3; \ + \ + /* Now that we digested the input increment the input pointer. */ \ + inptr++; \ + } \ + } +#define LOOP_NEED_FLAGS +#define EXTRA_LOOP_DECLS , mbstate_t *statep +#include <iconv/loop.c> + + +/* Next, define the conversion from UCS4 to UTF-7. + The state is structured as follows: + __count bit 2..0: zero + __count bit 4..3: shift + __count bit 8..5: data + Precise meaning: + shift data + 0 0 not inside base64 encoding + 1 0 inside base64, no pending bits + 2 XX00 inside base64, 2 bits known for next byte + 3 XXXX inside base64, 4 bits known for next byte + + __count bit 2..0 and __wch are always zero, because this direction + never returns __GCONV_INCOMPLETE_INPUT. +*/ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MAX_NEEDED_INPUT MAX_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + uint32_t ch = get32 (inptr); \ + \ + if ((statep->__count & 0x18) == 0) \ + { \ + /* base64 encoding inactive */ \ + if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \ + { \ + *outptr++ = (unsigned char) ch; \ + } \ + else \ + { \ + size_t count; \ + \ + if (ch == '+') \ + count = 2; \ + else if (ch < 0x10000) \ + count = 3; \ + else if (ch < 0x110000) \ + count = 6; \ + else \ + STANDARD_ERR_HANDLER (4); \ + \ + if (__builtin_expect (outptr + count > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *outptr++ = '+'; \ + if (ch == '+') \ + *outptr++ = '-'; \ + else if (ch < 0x10000) \ + { \ + *outptr++ = base64 (ch >> 10); \ + *outptr++ = base64 ((ch >> 4) & 0x3f); \ + statep->__count = ((ch & 15) << 5) | (3 << 3); \ + } \ + else if (ch < 0x110000) \ + { \ + uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \ + uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \ + \ + ch = (ch1 << 16) | ch2; \ + *outptr++ = base64 (ch >> 26); \ + *outptr++ = base64 ((ch >> 20) & 0x3f); \ + *outptr++ = base64 ((ch >> 14) & 0x3f); \ + *outptr++ = base64 ((ch >> 8) & 0x3f); \ + *outptr++ = base64 ((ch >> 2) & 0x3f); \ + statep->__count = ((ch & 3) << 7) | (2 << 3); \ + } \ + else \ + abort (); \ + } \ + } \ + else \ + { \ + /* base64 encoding active */ \ + if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \ + { \ + /* deactivate base64 encoding */ \ + size_t count; \ + \ + count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \ + if (__builtin_expect (outptr + count > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + if ((statep->__count & 0x18) >= 0x10) \ + *outptr++ = base64 ((statep->__count >> 3) & ~3); \ + if (isxbase64 (ch)) \ + *outptr++ = '-'; \ + *outptr++ = (unsigned char) ch; \ + statep->__count = 0; \ + } \ + else \ + { \ + size_t count; \ + \ + if (ch < 0x10000) \ + count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2); \ + else if (ch < 0x110000) \ + count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5); \ + else \ + STANDARD_ERR_HANDLER (4); \ + \ + if (__builtin_expect (outptr + count > outend, 0)) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + if (ch < 0x10000) \ + { \ + switch ((statep->__count >> 3) & 3) \ + { \ + case 1: \ + *outptr++ = base64 (ch >> 10); \ + *outptr++ = base64 ((ch >> 4) & 0x3f); \ + statep->__count = ((ch & 15) << 5) | (3 << 3); \ + break; \ + case 2: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 12)); \ + *outptr++ = base64 ((ch >> 6) & 0x3f); \ + *outptr++ = base64 (ch & 0x3f); \ + statep->__count = (1 << 3); \ + break; \ + case 3: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 14)); \ + *outptr++ = base64 ((ch >> 8) & 0x3f); \ + *outptr++ = base64 ((ch >> 2) & 0x3f); \ + statep->__count = ((ch & 3) << 7) | (2 << 3); \ + break; \ + default: \ + abort (); \ + } \ + } \ + else if (ch < 0x110000) \ + { \ + uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \ + uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \ + \ + ch = (ch1 << 16) | ch2; \ + switch ((statep->__count >> 3) & 3) \ + { \ + case 1: \ + *outptr++ = base64 (ch >> 26); \ + *outptr++ = base64 ((ch >> 20) & 0x3f); \ + *outptr++ = base64 ((ch >> 14) & 0x3f); \ + *outptr++ = base64 ((ch >> 8) & 0x3f); \ + *outptr++ = base64 ((ch >> 2) & 0x3f); \ + statep->__count = ((ch & 3) << 7) | (2 << 3); \ + break; \ + case 2: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 28)); \ + *outptr++ = base64 ((ch >> 22) & 0x3f); \ + *outptr++ = base64 ((ch >> 16) & 0x3f); \ + *outptr++ = base64 ((ch >> 10) & 0x3f); \ + *outptr++ = base64 ((ch >> 4) & 0x3f); \ + statep->__count = ((ch & 15) << 5) | (3 << 3); \ + break; \ + case 3: \ + *outptr++ = \ + base64 (((statep->__count >> 3) & ~3) | (ch >> 30)); \ + *outptr++ = base64 ((ch >> 24) & 0x3f); \ + *outptr++ = base64 ((ch >> 18) & 0x3f); \ + *outptr++ = base64 ((ch >> 12) & 0x3f); \ + *outptr++ = base64 ((ch >> 6) & 0x3f); \ + *outptr++ = base64 (ch & 0x3f); \ + statep->__count = (1 << 3); \ + break; \ + default: \ + abort (); \ + } \ + } \ + else \ + abort (); \ + } \ + } \ + \ + /* Now that we wrote the output increment the input pointer. */ \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#define EXTRA_LOOP_DECLS , mbstate_t *statep +#include <iconv/loop.c> + + +/* Since this is a stateful encoding we have to provide code which resets + the output state to the initial state. This has to be done during the + flushing. */ +#define EMIT_SHIFT_TO_INIT \ + if (FROM_DIRECTION) \ + /* Nothing to emit. */ \ + memset (data->__statep, '\0', sizeof (mbstate_t)); \ + else \ + { \ + /* The "to UTF-7" direction. Flush the remaining bits and terminate \ + with a '-' byte. This will guarantee correct decoding if more \ + UTF-7 encoded text is added afterwards. */ \ + int state = data->__statep->__count; \ + \ + if (state & 0x18) \ + { \ + /* Deactivate base64 encoding. */ \ + unsigned char *outbuf = data->__outbuf; \ + size_t count = ((state & 0x18) >= 0x10) + 1; \ + \ + if (__builtin_expect (outbuf + count > data->__outbufend, 0)) \ + /* We don't have enough room in the output buffer. */ \ + status = __GCONV_FULL_OUTPUT; \ + else \ + { \ + /* Write out the shift sequence. */ \ + if ((state & 0x18) >= 0x10) \ + *outbuf++ = base64 ((state >> 3) & ~3); \ + *outbuf++ = '-'; \ + \ + data->__outbuf = outbuf; \ + data->__statep->__count = 0; \ + } \ + } \ + else \ + data->__statep->__count = 0; \ + } + + +/* Now define the toplevel functions. */ +#include <iconv/skeleton.c> diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c index f7aaf0d34f..da3cfa7744 100644 --- a/locale/programs/ld-collate.c +++ b/locale/programs/ld-collate.c @@ -65,7 +65,9 @@ struct element_t; /* Data type for list of strings. */ struct section_list { + /* Successor in the known_sections list. */ struct section_list *def_next; + /* Successor in the sections list. */ struct section_list *next; /* Name of the section. */ const char *name; @@ -291,6 +293,7 @@ make_seclist_elem (struct locale_collate_t *collate, const char *string, newp->next = next; newp->name = string; newp->first = NULL; + newp->last = NULL; return newp; } @@ -336,6 +339,10 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen, newp->used_in_level = 0; newp->is_character = is_character; + /* Will be assigned later. XXX */ + newp->mbseqorder = 0; + newp->wcseqorder = 0; + /* Will be allocated later. */ newp->weights = NULL; @@ -350,6 +357,9 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen, newp->mbnext = NULL; newp->mblast = NULL; + newp->wcnext = NULL; + newp->wclast = NULL; + return newp; } @@ -619,9 +629,8 @@ find_element (struct linereader *ldfile, struct locale_collate_t *collate, /* It's also no collation element. So it is a character element defined later. */ result = new_element (collate, NULL, 0, NULL, str, len, 1); - if (result != NULL) - /* Insert it into the sequence table. */ - insert_entry (&collate->seq_table, str, len, result); + /* Insert it into the sequence table. */ + insert_entry (&collate->seq_table, str, len, result); } } @@ -660,11 +669,11 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, /* Initialize all the fields. */ elem->file = ldfile->fname; elem->line = ldfile->lineno; + elem->last = collate->cursor; elem->next = collate->cursor ? collate->cursor->next : NULL; if (collate->cursor != NULL && collate->cursor->next != NULL) collate->cursor->next->last = elem; - elem->section = collate->current_section; if (collate->cursor != NULL) collate->cursor->next = elem; if (collate->start == NULL) @@ -672,9 +681,8 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, assert (collate->cursor == NULL); collate->start = elem; } - elem->weights = (struct element_list_t *) - obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t)); - memset (elem->weights, '\0', nrules * sizeof (struct element_list_t)); + + elem->section = collate->current_section; if (collate->current_section->first == NULL) collate->current_section->first = elem; @@ -683,6 +691,10 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, collate->cursor = elem; + elem->weights = (struct element_list_t *) + obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t)); + memset (elem->weights, '\0', nrules * sizeof (struct element_list_t)); + weight_cnt = 0; arg = lr_token (ldfile, charmap, repertoire); @@ -839,8 +851,8 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, %s: weights must use the same ellipsis symbol as the name"), "LC_COLLATE"); - /* The weight for this level has to be ignored. We use the - null pointer to indicate this. */ + /* The weight for this level will depend on the element + iterating over the range. Put a placeholder. */ elem->weights[weight_cnt].w = (struct element_t **) obstack_alloc (&collate->mempool, sizeof (struct element_t *)); elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2; @@ -988,8 +1000,7 @@ insert_value (struct linereader *ldfile, const char *symstr, size_t symlen, } /* Test whether this element is not already in the list. */ - if (elem->next != NULL || (collate->cursor != NULL - && elem->next == collate->cursor)) + if (elem->next != NULL || elem == collate->cursor) { lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"), (int) symlen, symstr, elem->file, elem->line); @@ -1434,6 +1445,7 @@ collate_startup (struct linereader *ldfile, struct localedef_t *locale, collate->col_weight_max = -1; } else + /* Reuse the copy_locale's data structures. */ collate = locale->categories[LC_COLLATE].collate = copy_locale->categories[LC_COLLATE].collate; } @@ -1788,9 +1800,9 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap) while (sect != NULL && sect->rules == NULL); } while (sect != NULL); - /* We are currently not prepared for more than 256 rulesets. But this + /* We are currently not prepared for more than 128 rulesets. But this should never really be a problem. */ - assert (ruleidx <= 256); + assert (ruleidx <= 128); } @@ -2529,9 +2541,18 @@ collate_read (struct linereader *ldfile, struct localedef_t *result, struct token *now; struct token *arg = NULL; enum token_t nowtok; - int state = 0; enum token_t was_ellipsis = tok_none; struct localedef_t *copy_locale = NULL; + /* Parsing state: + 0 - start + 1 - between `order-start' and `order-end' + 2 - after `order-end' + 3 - after `reorder-after', waiting for `reorder-end' + 4 - after `reorder-end' + 5 - after `reorder-sections-after', waiting for `reorder-sections-end' + 6 - after `reorder-sections-end' + */ + int state = 0; /* Get the repertoire we have to use. */ if (repertoire_name != NULL) @@ -2828,9 +2849,10 @@ collate_read (struct linereader *ldfile, struct localedef_t *result, } else if (ellipsis == tok_none) { - /* The name is already defined. */ + /* A single symbol, no ellipsis. */ if (check_duplicate (ldfile, collate, charmap, repertoire, symbol, symbol_len)) + /* The name is already defined. */ goto col_sym_free; insert_entry (&collate->sym_table, symbol, symbol_len, @@ -2884,13 +2906,13 @@ collate_read (struct linereader *ldfile, struct localedef_t *result, /* Create the name. */ sprintf (symbuf, ellipsis == tok_ellipsis2 - ? "%.*s%.*lX" : "%.*s%.*lX", + ? "%.*s%.*lX" : "%.*s%.*lu", (int) prefixlen, symbol, (int) (symbol_len - prefixlen), from); - /* The name is already defined. */ if (check_duplicate (ldfile, collate, charmap, repertoire, symbuf, symbol_len)) + /* The name is already defined. */ goto col_sym_free; insert_entry (&collate->sym_table, symbuf, @@ -3021,8 +3043,8 @@ error while adding equivalent collating symbol")); } runp = (struct section_list *) xcalloc (1, sizeof (*runp)); - name = strncpy (xmalloc (arg->val.str.lenmb + 1), - arg->val.str.startmb, arg->val.str.lenmb); + name = (char *) xmalloc (arg->val.str.lenmb + 1); + memcpy (name, arg->val.str.startmb, arg->val.str.lenmb); name[arg->val.str.lenmb] = '\0'; runp->name = name; @@ -3070,6 +3092,8 @@ error while adding equivalent collating symbol")); if (collate->error_section.first == NULL) { + /* Insert &collate->error_section at the end of + the collate->sections list. */ if (collate->sections == NULL) collate->sections = &collate->error_section; else @@ -3078,9 +3102,9 @@ error while adding equivalent collating symbol")); while (sp->next != NULL) sp = sp->next; - collate->error_section.next = NULL; sp->next = &collate->error_section; } + collate->error_section.next = NULL; } } else @@ -3093,6 +3117,8 @@ error while adding equivalent collating symbol")); "LC_COLLATE", sp->name); else { + /* Insert sp in the collate->sections list, + right after collate->current_section. */ if (collate->current_section == NULL) collate->current_section = sp; else @@ -3141,6 +3167,8 @@ error while adding equivalent collating symbol")); "LC_COLLATE"); else { + /* Insert &collate->unnamed_section at the beginning of + the collate->sections list. */ collate->unnamed_section.next = collate->sections; collate->sections = &collate->unnamed_section; } @@ -3149,7 +3177,7 @@ error while adding equivalent collating symbol")); /* Now read the direction names. */ read_directions (ldfile, arg, charmap, repertoire, collate); - /* From now be need the strings untranslated. */ + /* From now we need the strings untranslated. */ ldfile->translate_strings = 0; break; @@ -3231,7 +3259,7 @@ error while adding equivalent collating symbol")); (void **) &insp) == 0) /* Yes, the symbol exists. Simply point the cursor to it. */ - collate->cursor = insp; + collate->cursor = insp; else { struct symbol_t *symbp; @@ -3428,7 +3456,7 @@ error while adding equivalent collating symbol")); /* We are outside an `order_start' region. This means we must only accept definitions of values for collation symbols since these are purely abstract - values and don't need dorections associated. */ + values and don't need directions associated. */ struct element_t *seqp; if (find_entry (&collate->seq_table, symstr, symlen, @@ -3510,7 +3538,7 @@ error while adding equivalent collating symbol")); if (seqp->section->first == seqp) { if (seqp->section->first == seqp->section->last) - /* This setion has no content anymore. */ + /* This section has no content anymore. */ seqp->section->first = seqp->section->last = NULL; else seqp->section->first = seqp->next; @@ -3616,8 +3644,7 @@ error while adding equivalent collating symbol")); /* See whether UNDEFINED already appeared somewhere. */ if (collate->undefined.next != NULL - || (collate->cursor != NULL - && collate->undefined.next == collate->cursor)) + || &collate->undefined == collate->cursor) { lr_error (ldfile, _("%s: order for `%.*s' already defined at %s:%Zu"), @@ -3632,9 +3659,9 @@ error while adding equivalent collating symbol")); repertoire, collate, tok_none); break; - case tok_ellipsis2: - case tok_ellipsis3: - case tok_ellipsis4: + case tok_ellipsis2: /* symbolic hexadecimal ellipsis */ + case tok_ellipsis3: /* absolute ellipsis */ + case tok_ellipsis4: /* symbolic decimal ellipsis */ /* This is the symbolic (decimal or hexadecimal) or absolute ellipsis. */ if (was_ellipsis != tok_none) |