diff options
author | Ulrich Drepper <drepper@redhat.com> | 1999-12-29 07:32:44 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 1999-12-29 07:32:44 +0000 |
commit | 15a2315cb457be0599d7a662e64aa54e560f96f0 (patch) | |
tree | 0e9915fb5d494269e1637674e5a78368d0700f81 /iconvdata/iso-2022-cn.c | |
parent | 6d110ca1858feba75e4b5528d7941770c59d6b5d (diff) | |
download | glibc-15a2315cb457be0599d7a662e64aa54e560f96f0.tar.gz |
Update.
* iconvdata/Makefile (modules): Add ISO-2022-CN.
Add link rules for this module.
(distribute): Add iso-2022-cn.c and cns11643l2.h.
* iconvdata/cns11643l2.h: New file.
* iconvdata/iso-2022-cn.c: New file.
* iconvdata/gconv-modules: Add entries for ISO-2022-CN module.
* iconvdata/cns11643l1.h (cns11643l1_to_ucs4): Make string argument
unsigned.
(ucs4_to_cns11643l1): Likewise.
* iconvdata/euc-tw.c: Correct parameter passed to cns11643l1_to_ucs4.
* iconvdata/iso-2022-kr.c: Remove unnecessary test from conversion
loop to UCS4.
Optimize recognition of escape sequences a bit.
Diffstat (limited to 'iconvdata/iso-2022-cn.c')
-rw-r--r-- | iconvdata/iso-2022-cn.c | 398 |
1 files changed, 398 insertions, 0 deletions
diff --git a/iconvdata/iso-2022-cn.c b/iconvdata/iso-2022-cn.c new file mode 100644 index 0000000000..563d173d66 --- /dev/null +++ b/iconvdata/iso-2022-cn.c @@ -0,0 +1,398 @@ +/* Conversion module for ISO-2022-CN. + Copyright (C) 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include <gconv.h> +#include <stdint.h> +#include <string.h> +#include "gb2312.h" +#include "cns11643l1.h" +#include "cns11643l2.h" + +#include <assert.h> + +/* This makes obvious what everybody knows: 0x1b is the Esc character. */ +#define ESC 0x1b + +/* We have single-byte shift-in and shift-out sequences, and the single + shift sequence SS2 which replaces the SS2 designation for the next + two bytes. */ +#define SI 0x0f +#define SO 0x0e +#define SS2_0 ESC +#define SS2_1 0x4e + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "ISO-2022-CN//" +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define FROM_LOOP from_iso2022cn_loop +#define TO_LOOP to_iso2022cn_loop +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 4 +#define MAX_NEEDED_TO 4 +#define PREPARE_LOOP \ + int save_set; \ + int *setp = &data->__statep->count; +#define EXTRA_LOOP_ARGS , setp + + +/* The COUNT element of the state keeps track of the currently selected + character set. The possible values are: */ +enum +{ + ASCII_set = 0, + GB2312_set, + CNS11643_1_set, + CNS11643_2_set, + CURRENT_MASK = 3, + GB2312_ann = 4, + CNS11643_1_ann = 8, + CNS11643_2_ann = 16 +}; + + +/* Since this is a stateful encoding we have to provide code which resets + the output state to the initial state. This has to be done during the + flushing. */ +#define EMIT_SHIFT_TO_INIT \ + if (data->__statep->count != ASCII_set) \ + { \ + if (FROM_DIRECTION) \ + /* It's easy, we don't have to emit anything, we just reset the \ + state for the input. */ \ + data->__statep->count = ASCII_set; \ + else \ + { \ + unsigned char *outbuf = data->__outbuf; \ + \ + /* We are not in the initial state. To switch back we have \ + to emit `SI'. */ \ + if (outbuf == data->__outbufend) \ + /* We don't have enough room in the output buffer. */ \ + status = __GCONV_FULL_OUTPUT; \ + else \ + { \ + /* Write out the shift sequence. */ \ + *outbuf++ = SI; \ + if (data->__is_last) \ + *written += 1; \ + data->__outbuf = outbuf; \ + data->__statep->count = ASCII_set; \ + } \ + } \ + } + + +/* Since we might have to reset input pointer we must be able to save + and retore the state. */ +#define SAVE_RESET_STATE(Save) \ + if (Save) \ + save_set = *setp; \ + else \ + *setp = save_set + + +/* First define the conversion function from ISO-2022-CN to UCS4. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint32_t ch = *inptr; \ + \ + /* This is a 7bit character set, disallow all 8bit characters. */ \ + if (ch > 0x7f) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + /* Recognize escape sequences. */ \ + if (ch == ESC) \ + { \ + /* There are two kinds of escape sequences we have to handle: \ + - those announcing the use of GB and CNS characters on the \ + line; we can simply ignore them \ + - the initial byte of the SS2 sequence. \ + */ \ + if (NEED_LENGTH_TEST \ + && (inptr + 1 > inend \ + || (inptr[1] == '$' \ + && (inptr + 2 > inend \ + || (inptr[2] == ')' && inptr + 3 > inend) \ + || (inptr[2] == '*' && inptr + 3 > inend))) \ + || (inptr[1] == SS2_1 && inptr + 3 > inend))) \ + { \ + result = __GCONV_EMPTY_INPUT; \ + break; \ + } \ + if (inptr[1] == '$' \ + && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \ + || (inptr[2] == '*' && inptr[3] == 'H'))) \ + { \ + /* OK, we accept those character sets. */ \ + if (inptr[3] == 'A') \ + ann = GB2312_ann; \ + else if (inptr[3] == 'G') \ + ann = CNS11643_1_ann; \ + inptr += 4; \ + continue; \ + } \ + } \ + else if (ch == SO) \ + { \ + /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \ + S0 designation came last. The only problem is what to do with \ + faulty input files where no designator came. \ + XXX For now I'll default to use GB2312. If this is not the \ + best behaviour (e.g., we should flag an error) let me know. */ \ + ++inptr; \ + set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \ + continue; \ + } \ + else if (ch == SI) \ + { \ + /* Switch to use ASCII. */ \ + ++inptr; \ + set = ASCII_set; \ + continue; \ + } \ + \ + if (ch == ESC && inptr[1] == SS2_1) \ + { \ + /* This is a character from CNS 11643 plane 2. \ + XXX We could test here whether the use of this character \ + set was announced. */ \ + inptr += 2; \ + ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \ + if (ch == __UNKNOWN_10646_CHAR) \ + { \ + inptr -= 2; \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + else if (set == ASCII_set) \ + { \ + /* Almost done, just advance the input pointer. */ \ + ++inptr; \ + } \ + else \ + { \ + /* That's pretty easy, we have a dedicated functions for this. */ \ + if (set == GB2312_set) \ + ch = gb2312_to_ucs4 (&inptr, \ + NEED_LENGTH_TEST ? inend - inptr : 2, 0); \ + else \ + { \ + assert (set == CNS11643_1_set); \ + ch = cns11643l1_to_ucs4 (&inptr, \ + NEED_LENGTH_TEST ? inend - inptr : 2, 0);\ + } \ + \ + if (NEED_LENGTH_TEST && ch == 0) \ + { \ + result = __GCONV_EMPTY_INPUT; \ + break; \ + } \ + else if (ch == __UNKNOWN_10646_CHAR) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + \ + *((uint32_t *) outptr)++ = ch; \ + } +#define EXTRA_LOOP_DECLS , int *setp +#define INIT_PARAMS int set = *setp & CURRENT_MASK; \ + int ann = *setp & ~CURRENT_MASK +#define UPDATE_PARAMS *setp = set | ann +#include <iconv/loop.c> + + +/* Next, define the other direction. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + uint32_t ch; \ + size_t written = 0; \ + \ + ch = *((uint32_t *) inptr); \ + \ + /* First see whether we can write the character using the currently \ + selected character set. */ \ + if (ch < 0x80) \ + { \ + if (set != ASCII_set) \ + { \ + *outptr++ = SI; \ + set = ASCII_set; \ + if (NEED_LENGTH_TEST && outptr == outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + } \ + \ + *outptr++ = ch; \ + written = 1; \ + \ + /* At the end of the line we have to clear the `ann' flags since \ + every line must contain this information again. */ \ + if (ch == L'\n') \ + ann = 0; \ + } \ + else \ + { \ + char buf[2]; \ + int used; \ + \ + if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \ + { \ + written = ucs4_to_gb2312 (ch, buf, 2); \ + used = GB2312_set; \ + } \ + else \ + { \ + written = ucs4_to_cns11643l1 (ch, buf, 2); \ + used = CNS11643_1_set; \ + } \ + \ + if (written == __UNKNOWN_10646_CHAR) \ + { \ + /* Cannot convert it using the currently selected SO set. \ + Next try the SS2 set. */ \ + written = ucs4_to_cns11643l2 (ch, buf, 2); \ + if (written != __UNKNOWN_10646_CHAR) \ + /* Yep, that worked. */ \ + used = CNS11643_2_set; \ + else \ + { \ + /* Well, see whether we have to change the SO set. */ \ + if (set == GB2312_set) \ + written = ucs4_to_cns11643l1 (ch, buf, 2); \ + else \ + written = ucs4_to_gb2312 (ch, buf, 2); \ + \ + if (written != __UNKNOWN_10646_CHAR) \ + /* Oh well, then switch SO. */ \ + used = GB2312_set + CNS11643_1_set - set; \ + else \ + { \ + /* Even this does not work. Error. */ \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + } \ + assert (written == 2); \ + \ + /* See whether we have to emit an escape sequence. */ \ + if (set != used) \ + { \ + /* First see whether we announced that we use this \ + character set. */ \ + if ((ann & (2 << used)) == 0) \ + { \ + const char *escseq; \ + \ + if (NEED_LENGTH_TEST && outptr + 4 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + assert (used >= 1 && used <= 3); \ + escseq = "\e$)A\e$)G\e$*H" + (used - 1) * 4; \ + *outptr++ = *escseq++; \ + *outptr++ = *escseq++; \ + *outptr++ = *escseq++; \ + *outptr++ = *escseq++; \ + \ + if (used == GB2312_set) \ + ann = (ann & CNS11643_2_ann) | GB2312_ann; \ + else if (used == CNS11643_1_set) \ + ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \ + else \ + ann |= CNS11643_2_ann; \ + } \ + \ + if (used == CNS11643_2_set) \ + { \ + if (outptr + 2 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + *outptr++ = SS2_0; \ + *outptr++ = SS2_1; \ + } \ + else \ + { \ + /* We only have to emit something is currently ASCII is \ + selected. Otherwise we are switching within the \ + SO charset. */ \ + if (set == ASCII_set) \ + { \ + if (outptr + 1 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + *outptr++ = SO; \ + } \ + } \ + \ + /* Always test the length here since we have used up all the \ + guaranteed output buffer slots. */ \ + if (outptr + 2 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + } \ + else if (NEED_LENGTH_TEST && outptr + 2 > outend) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *outptr++ = buf[0]; \ + *outptr++ = buf[1]; \ + } \ + \ + /* Now that we wrote the output increment the input pointer. */ \ + inptr += 4; \ + } +#define EXTRA_LOOP_DECLS , int *setp +#define INIT_PARAMS int set = *setp & CURRENT_MASK; \ + int ann = *setp & ~CURRENT_MASK +#define UPDATE_PARAMS *setp = set | ann +#include <iconv/loop.c> + + +/* Now define the toplevel functions. */ +#include <iconv/skeleton.c> |