diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2009-02-07 16:32:56 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2009-02-07 16:32:56 +0000 |
commit | a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch) | |
tree | a966aeee62e69ae3ad13275d07ddb15049b14e0e /src-diclib/xstr.c | |
download | anthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz |
anthy-9100hHEADanthy-9100hmaster
Diffstat (limited to 'src-diclib/xstr.c')
-rw-r--r-- | src-diclib/xstr.c | 643 |
1 files changed, 643 insertions, 0 deletions
diff --git a/src-diclib/xstr.c b/src-diclib/xstr.c new file mode 100644 index 0000000..646a0cb --- /dev/null +++ b/src-diclib/xstr.c @@ -0,0 +1,643 @@ +/* + * Anthy内部で使う文字列の処理 + * typedef struct xstr_ { + * xstr *str; int len; + * } xstr; + * + * malloc(0);の意味は考えないで0文字の文字列を扱えるような + * コーディングをする。free(0)は良い。 + * + * デフォルトの設定では + * cstrはCの普通のEUC文字列 + * + * Copyright (C) 2000-2007 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "config.h" +/* for ANTHY_*_ENCODING */ +#include <anthy/anthy.h> + +#include <anthy/xstr.h> +#include <anthy/xchar.h> +#include "diclib_inner.h" + +/* 画面に出力するときのエンコーディング */ +static int print_encoding; + +#define MAX_BYTES_PER_XCHAR 10 + +static int +xc_isprint(xchar xc) +{ + return xc > 0; +} + +/** Cの文字列に対応するxstrの長さを計算する + */ +static int +xlengthofcstr(const char *c) +{ + int ll = 0; + int len = strlen(c); + int i; + for (i = 0; i < len; i++) { + ll ++; + if ((c[i] & 0x80)) { + i++; + } + } + return ll; +} + +const char * +anthy_utf8_to_ucs4_xchar(const char *s, xchar *res) +{ + const unsigned char *str = (const unsigned char *)s; + int i, len; + xchar cur; + cur = str[0]; + if (str[0] < 0x80) { + len = 1; + } else if (str[0] < 0xe0) { + cur &= 0x1f; + len = 2; + } else if (str[0] < 0xf0) { + cur &= 0x0f; + len = 3; + } else if (str[0] < 0xf8) { + cur &= 0x07; + len = 4; + } else if (str[0] < 0xfc) { + cur &= 0x03; + len = 5; + } else { + cur &= 0x01; + len = 6; + } + str ++; + for (i = 1; i < len; i++) { + cur <<= 6; + cur |= (str[0] & 0x3f); + str++; + } + *res = cur; + return (const char *)str; +} + +static xstr * +utf8_to_ucs4_xstr(const char *s) +{ + const unsigned char *str = (const unsigned char *)s; + xstr res; + res.str = (xchar *)alloca(sizeof(xchar) * strlen(s)); + res.len = 0; + + while (*str) { + xchar cur; + str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str, + &cur); + res.str[res.len] = cur; + res.len ++; + } + return anthy_xstr_dup(&res); +} + +static int +put_xchar_to_utf8_str(xchar xc, char *buf_) +{ + int i, len; + unsigned char *buf = (unsigned char *)buf_; + if (xc < 0x80) { + buf[0] = 0; + len = 1; + } else if (xc < 0x800) { + buf[0] = 0xc0; + len = 2; + } else if (xc < 0x10000) { + buf[0] = 0xe0; + len = 3; + } else if (xc < 0x200000) { + buf[0] = 0xf0; + len = 4; + } else if (xc < 0x400000) { + buf[0] = 0xf8; + len = 5; + } else { + buf[0] = 0xfc; + len = 6; + } + for (i = len - 1; i > 0; i--) { + buf[i] = (xc & 0x3f) | 0x80; + xc >>= 6; + } + buf[0] += xc; + buf[len] = 0; + return len; +} + +static char * +ucs4_xstr_to_utf8(xstr *xs) +{ + char *buf = alloca(xs->len * 6 + 1); + int i, t = 0; + buf[0] = 0; + for (i = 0; i < xs->len; i++) { + xchar xc = xs->str[i]; + put_xchar_to_utf8_str(xc, &buf[t]); + t = strlen(buf); + } + return strdup(buf); +} + +/** Cの文字列をxstrに変更する + */ +xstr * +anthy_cstr_to_xstr(const char *c, int encoding) +{ + xstr *x; + int i, j, l; + if (encoding == ANTHY_UTF8_ENCODING) { + return utf8_to_ucs4_xstr(c); + } + l = xlengthofcstr(c); + x = (xstr *)malloc(sizeof(struct xstr_)); + if (!x) { + return NULL; + } + x->len = l; + x->str = malloc(sizeof(xchar)*l); + for (i = 0, j = 0; i < l; i++) { + if (!(c[j] & 0x80)){ + x->str[i] = c[j]; + j++; + } else { + unsigned char *p = (unsigned char *)&c[j]; + x->str[i] = (p[1] | (p[0]<<8)) | 0x8080; + x->str[i] = anthy_euc_to_ucs(x->str[i]); + j++; + j++; + } + } + return x; +} + +char * +anthy_xstr_to_cstr(xstr *s, int encoding) +{ + int i, j, l; + char *p; + + if (encoding == ANTHY_UTF8_ENCODING) { + return ucs4_xstr_to_utf8(s); + } + + l = s->len; + for (i = 0; i < s->len; i++) { + int ec = anthy_ucs_to_euc(s->str[i]); + if (ec > 255) { + l++; + } + } + p = (char *)malloc(l + 1); + p[l] = 0; + j = 0; + for (i = 0; i < s->len; i++) { + int ec = anthy_ucs_to_euc(s->str[i]); + if (ec < 256) { + p[j] = ec; + j++; + }else{ + p[j] = ec >> 8; + j++; + p[j] = ec & 255; + j++; + } + } + return p; +} + +xstr * +anthy_xstr_dup(xstr *s) +{ + int i; + xstr *x = (xstr *)malloc(sizeof(xstr)); + x->len = s->len; + if (s->len) { + x->str = malloc(sizeof(xchar)*s->len); + }else{ + x->str = NULL; + } + for (i = 0; i < x->len; i++) { + x->str[i] = s->str[i]; + } + return x; +} + +xchar * +anthy_xstr_dup_str(xstr *s) +{ + xchar *c; + int i; + if (s->len) { + c = malloc(sizeof(xchar)*s->len); + }else{ + c = 0; + } + for (i = 0; i < s->len; i++) { + c[i] = s->str[i]; + } + return c; +} + +void +anthy_free_xstr(xstr *x) +{ + if (!x) { + return ; + } + /**/ + free(x->str); + free(x); +} + +void +anthy_free_xstr_str(xstr *x) +{ + if (!x) { + return ; + } + free(x->str); +} + +int +anthy_sputxchar(char *buf, xchar x, int encoding) +{ + if (!xc_isprint(x)) { + sprintf(buf, "??"); + return 2; + } + if (encoding == ANTHY_UTF8_ENCODING) { + return put_xchar_to_utf8_str(x, buf); + } + x = anthy_ucs_to_euc(x); + if (x < 256) { + buf[0] = x; + buf[1] = 0; + return 1; + } + buf[2] = 0; + buf[1] = 0x80 | (x & 255); + buf[0] = 0x80 | ((x>>8) & 255); + return 2; +} + +int +anthy_sputxstr(char *buf, xstr *x, int encoding) +{ + char b[MAX_BYTES_PER_XCHAR]; + int i, l = 0; + for (i = 0; i < x->len; i++) { + anthy_sputxchar(b, x->str[i], encoding); + sprintf(&buf[l], "%s", b); + l += strlen(b); + } + return l; +} + +int +anthy_snputxstr(char *buf, int n, xstr *x, int encoding) +{ + char b[MAX_BYTES_PER_XCHAR]; + int i, l=0; + for (i = 0; i < x->len; i++) { + anthy_sputxchar(b, x->str[i], encoding); + if ((int)strlen(b) + l >= n) { + return l; + } + n -= sprintf(&buf[l], "%s", b); + l += strlen(b); + } + return l; +} + +void +anthy_putxchar(xchar x) +{ + char buf[MAX_BYTES_PER_XCHAR]; + if (!xc_isprint(x)) { + printf("\\%x", x); + return ; + } + anthy_sputxchar(buf, x, print_encoding); + printf("%s", buf); +} + +void +anthy_putxstr(xstr *x) +{ + int i; + for (i = 0; i < x->len; i++) { + anthy_putxchar(x->str[i]); + } +} + +void +anthy_putxstrln(xstr *x) +{ + anthy_putxstr(x); + printf("\n"); +} + +xstr* +anthy_xstrcpy(xstr *dest, xstr *src) +{ + int i; + /* 文字列をコピー */ + dest->len = src->len; + for (i = 0; i < src->len; i++) { + dest->str[i] = src->str[i]; + } + + return dest; +} +/* 返り値の符号はstrcmpと同じ */ +int +anthy_xstrcmp(xstr *x1, xstr *x2) +{ + int i, m; + if (x1->len < x2->len) { + m = x1->len; + }else{ + m = x2->len; + } + for (i = 0 ; i < m ; i++) { + if (x1->str[i] < x2->str[i]) { + return -1; + } + if (x1->str[i] > x2->str[i]) { + return 1; + } + } + if (x1->len < x2->len) { + return -1; + } + if (x1->len > x2->len) { + return 1; + } + return 0; +} + +/* 返り値の符号はstrncmpと同じ */ +int +anthy_xstrncmp(xstr *x1, xstr *x2, int n) +{ + int i, m; + if (x1->len < x2->len) { + m = x1->len; + }else{ + m = x2->len; + } + if (m > n) m = n; + for (i = 0 ; i < m ; i++) { + if (x1->str[i] < x2->str[i]) { + return -1; + } + if (x1->str[i] > x2->str[i]) { + return 1; + } + } + if (x2->len <= n && x1->len < x2->len) { + return -1; + } + if (x1->len <= n && x1->len > x2->len) { + return 1; + } + return 0; +} + + +xstr * +anthy_xstrcat(xstr *s, xstr *a) +{ + int i, l; + if (!s) { + s = malloc(sizeof(xstr)); + s->str = NULL; + s->len = 0; + } + l = s->len + a->len; + + if (l < 1) { /* 辞書もしくは学習データが壊れていた時の対策 */ + free(s->str); + s->str = NULL; + s->len = 0; + return s; + } + + s->str = realloc(s->str, sizeof(xchar)*l); + for (i = 0; i < a->len; i ++) { + s->str[s->len+i] = a->str[i]; + } + s->len = l; + return s; +} + +xstr * +anthy_xstrappend(xstr *xs, xchar xc) +{ + xstr p; + xchar q[1]; + p.len = 1; + p.str = q; + q[0] = xc; + return anthy_xstrcat(xs, &p); +} + +long long +anthy_xstrtoll(xstr *x) +{ + xchar c; + int i; + long long n = 0;/* 数 */ + if (!x->len || x->len > 16) { + return -1; + } + if (!anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM)) { + return -1; + } + for (i = 0; i < x->len; i++) { + c = x->str[i]; + n *= 10; + n += anthy_xchar_to_num(c); + } + return n; +} + +/** 全角の数字を半角にする + */ +xstr * +anthy_xstr_wide_num_to_num(xstr* src_xs) +{ + int i; + xstr *dst_xs; + dst_xs = anthy_xstr_dup(src_xs); + for (i = 0; i < src_xs->len; ++i) { + dst_xs->str[i] = anthy_xchar_wide_num_to_num(src_xs->str[i]); + } + return dst_xs; +} + +/** 平仮名をカタカナに変換する + */ +xstr * +anthy_xstr_hira_to_kata(xstr *src_xs) +{ + xstr *dst_xs; + int i, j; + dst_xs = anthy_xstr_dup(src_xs); + + for (i = 0 ,j = 0; i < dst_xs->len; i++, j++) { + /* 「う゛」のチェック */ + if (i < dst_xs->len - 1 && dst_xs->str[i] == HK_U + && dst_xs->str[i+1] == HK_DDOT) { + dst_xs->str[j] = KK_VU;/* ヴ */ + i++; + continue ; + } + /**/ + dst_xs->str[j] = dst_xs->str[i]; + if ((anthy_ucs_to_euc(dst_xs->str[j]) & 0xff00) == 0xa400) { + /* ひらがなだったら256足す */ + dst_xs->str[j] = anthy_ucs_to_euc(dst_xs->str[j]); + dst_xs->str[j] += 256; + dst_xs->str[j] = anthy_euc_to_ucs(dst_xs->str[j]); + } + } + dst_xs->len = j; + return dst_xs; +} + +xstr * +anthy_xstr_hira_to_half_kata(xstr *src_xs) +{ + int len = src_xs->len; + int i, j; + xstr *xs; + for (i = 0; i < src_xs->len; i++) { + const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]); + if (tab && tab->mod) { + len ++; + } + } + xs = malloc(sizeof(xstr)); + xs->len = len; + xs->str = malloc(sizeof(xchar) * len); + j = 0; + for (i = 0; i < src_xs->len; i++) { + const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]); + if (tab) { + xs->str[j] = anthy_euc_to_ucs(tab->dst); + if (tab->mod) { + j++; + xs->str[j] = anthy_euc_to_ucs(tab->mod); + } + } else { + xs->str[j] = src_xs->str[i]; + } + j++; + } + return xs; +} + +xstr * +anthy_conv_half_wide(xstr *xs) +{ + int i; + xstr *res; + for (i = 0; i < xs->len; i++) { + if (!anthy_lookup_half_wide(xs->str[i])) { + return NULL; + } + } + res = anthy_xstr_dup(xs); + for (i = 0; i < xs->len; i++) { + res->str[i] = anthy_lookup_half_wide(xs->str[i]); + } + return res; +} + +int +anthy_xstr_hash(xstr *xs) +{ + int h,i; + h = 0; + for (i = 0 ;i < xs->len ;i++) { + h *= 97; + h += xs->str[i]<<4; + h += xs->str[i]>>4; + } + if (h < 0) { + return -h; + } + return h; +} + +static char * +conv_cstr(const char *s, int from, int to) +{ + char *res; + xstr *xs = anthy_cstr_to_xstr(s, from); + if (!xs) { + return NULL; + } + res = anthy_xstr_to_cstr(xs, to); + anthy_free_xstr(xs); + return res; +} + +char * +anthy_conv_euc_to_utf8(const char *s) +{ + return conv_cstr(s, ANTHY_EUC_JP_ENCODING, ANTHY_UTF8_ENCODING); +} + +char * +anthy_conv_utf8_to_euc(const char *s) +{ + return conv_cstr(s, ANTHY_UTF8_ENCODING, ANTHY_EUC_JP_ENCODING); +} + +void +anthy_xstr_set_print_encoding(int encoding) +{ + print_encoding = encoding; +} + +int +anthy_init_xstr(void) +{ + return 0; +} + +void anthy_quit_xstr(void) +{ +} |