/* * Anthy内部で使う文字列の処理 * typedef struct xstr_ { * xstr *str; int len; * } xstr; * * malloc(0);の意味は考えないで0文字の文字列を扱えるような * コーディングをする。free(0)は良い。 * * デフォルトの設定では * cstrはCの普通のEUC文字列 * * Copyright (C) 2000-2007 TABATA Yusuke * */ /* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include "config.h" /* for ANTHY_*_ENCODING */ #include #include #include #include "diclib_inner.h" /* 画面に出力するときのエンコーディング */ static int print_encoding; #define MAX_BYTES_PER_XCHAR 10 static int xc_isprint(xchar xc) { return xc > 0; } /** Cの文字列に対応するxstrの長さを計算する */ static int xlengthofcstr(const char *c) { int ll = 0; int len = strlen(c); int i; for (i = 0; i < len; i++) { ll ++; if ((c[i] & 0x80)) { i++; } } return ll; } const char * anthy_utf8_to_ucs4_xchar(const char *s, xchar *res) { const unsigned char *str = (const unsigned char *)s; int i, len; xchar cur; cur = str[0]; if (str[0] < 0x80) { len = 1; } else if (str[0] < 0xe0) { cur &= 0x1f; len = 2; } else if (str[0] < 0xf0) { cur &= 0x0f; len = 3; } else if (str[0] < 0xf8) { cur &= 0x07; len = 4; } else if (str[0] < 0xfc) { cur &= 0x03; len = 5; } else { cur &= 0x01; len = 6; } str ++; for (i = 1; i < len; i++) { cur <<= 6; cur |= (str[0] & 0x3f); str++; } *res = cur; return (const char *)str; } static xstr * utf8_to_ucs4_xstr(const char *s) { const unsigned char *str = (const unsigned char *)s; xstr res; res.str = (xchar *)alloca(sizeof(xchar) * strlen(s)); res.len = 0; while (*str) { xchar cur; str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str, &cur); res.str[res.len] = cur; res.len ++; } return anthy_xstr_dup(&res); } static int put_xchar_to_utf8_str(xchar xc, char *buf_) { int i, len; unsigned char *buf = (unsigned char *)buf_; if (xc < 0x80) { buf[0] = 0; len = 1; } else if (xc < 0x800) { buf[0] = 0xc0; len = 2; } else if (xc < 0x10000) { buf[0] = 0xe0; len = 3; } else if (xc < 0x200000) { buf[0] = 0xf0; len = 4; } else if (xc < 0x400000) { buf[0] = 0xf8; len = 5; } else { buf[0] = 0xfc; len = 6; } for (i = len - 1; i > 0; i--) { buf[i] = (xc & 0x3f) | 0x80; xc >>= 6; } buf[0] += xc; buf[len] = 0; return len; } static char * ucs4_xstr_to_utf8(xstr *xs) { char *buf = alloca(xs->len * 6 + 1); int i, t = 0; buf[0] = 0; for (i = 0; i < xs->len; i++) { xchar xc = xs->str[i]; put_xchar_to_utf8_str(xc, &buf[t]); t = strlen(buf); } return strdup(buf); } /** Cの文字列をxstrに変更する */ xstr * anthy_cstr_to_xstr(const char *c, int encoding) { xstr *x; int i, j, l; if (encoding == ANTHY_UTF8_ENCODING) { return utf8_to_ucs4_xstr(c); } l = xlengthofcstr(c); x = (xstr *)malloc(sizeof(struct xstr_)); if (!x) { return NULL; } x->len = l; x->str = malloc(sizeof(xchar)*l); for (i = 0, j = 0; i < l; i++) { if (!(c[j] & 0x80)){ x->str[i] = c[j]; j++; } else { unsigned char *p = (unsigned char *)&c[j]; x->str[i] = (p[1] | (p[0]<<8)) | 0x8080; x->str[i] = anthy_euc_to_ucs(x->str[i]); j++; j++; } } return x; } char * anthy_xstr_to_cstr(xstr *s, int encoding) { int i, j, l; char *p; if (encoding == ANTHY_UTF8_ENCODING) { return ucs4_xstr_to_utf8(s); } l = s->len; for (i = 0; i < s->len; i++) { int ec = anthy_ucs_to_euc(s->str[i]); if (ec > 255) { l++; } } p = (char *)malloc(l + 1); p[l] = 0; j = 0; for (i = 0; i < s->len; i++) { int ec = anthy_ucs_to_euc(s->str[i]); if (ec < 256) { p[j] = ec; j++; }else{ p[j] = ec >> 8; j++; p[j] = ec & 255; j++; } } return p; } xstr * anthy_xstr_dup(xstr *s) { int i; xstr *x = (xstr *)malloc(sizeof(xstr)); x->len = s->len; if (s->len) { x->str = malloc(sizeof(xchar)*s->len); }else{ x->str = NULL; } for (i = 0; i < x->len; i++) { x->str[i] = s->str[i]; } return x; } xchar * anthy_xstr_dup_str(xstr *s) { xchar *c; int i; if (s->len) { c = malloc(sizeof(xchar)*s->len); }else{ c = 0; } for (i = 0; i < s->len; i++) { c[i] = s->str[i]; } return c; } void anthy_free_xstr(xstr *x) { if (!x) { return ; } /**/ free(x->str); free(x); } void anthy_free_xstr_str(xstr *x) { if (!x) { return ; } free(x->str); } int anthy_sputxchar(char *buf, xchar x, int encoding) { if (!xc_isprint(x)) { sprintf(buf, "??"); return 2; } if (encoding == ANTHY_UTF8_ENCODING) { return put_xchar_to_utf8_str(x, buf); } x = anthy_ucs_to_euc(x); if (x < 256) { buf[0] = x; buf[1] = 0; return 1; } buf[2] = 0; buf[1] = 0x80 | (x & 255); buf[0] = 0x80 | ((x>>8) & 255); return 2; } int anthy_sputxstr(char *buf, xstr *x, int encoding) { char b[MAX_BYTES_PER_XCHAR]; int i, l = 0; for (i = 0; i < x->len; i++) { anthy_sputxchar(b, x->str[i], encoding); sprintf(&buf[l], "%s", b); l += strlen(b); } return l; } int anthy_snputxstr(char *buf, int n, xstr *x, int encoding) { char b[MAX_BYTES_PER_XCHAR]; int i, l=0; for (i = 0; i < x->len; i++) { anthy_sputxchar(b, x->str[i], encoding); if ((int)strlen(b) + l >= n) { return l; } n -= sprintf(&buf[l], "%s", b); l += strlen(b); } return l; } void anthy_putxchar(xchar x) { char buf[MAX_BYTES_PER_XCHAR]; if (!xc_isprint(x)) { printf("\\%x", x); return ; } anthy_sputxchar(buf, x, print_encoding); printf("%s", buf); } void anthy_putxstr(xstr *x) { int i; for (i = 0; i < x->len; i++) { anthy_putxchar(x->str[i]); } } void anthy_putxstrln(xstr *x) { anthy_putxstr(x); printf("\n"); } xstr* anthy_xstrcpy(xstr *dest, xstr *src) { int i; /* 文字列をコピー */ dest->len = src->len; for (i = 0; i < src->len; i++) { dest->str[i] = src->str[i]; } return dest; } /* 返り値の符号はstrcmpと同じ */ int anthy_xstrcmp(xstr *x1, xstr *x2) { int i, m; if (x1->len < x2->len) { m = x1->len; }else{ m = x2->len; } for (i = 0 ; i < m ; i++) { if (x1->str[i] < x2->str[i]) { return -1; } if (x1->str[i] > x2->str[i]) { return 1; } } if (x1->len < x2->len) { return -1; } if (x1->len > x2->len) { return 1; } return 0; } /* 返り値の符号はstrncmpと同じ */ int anthy_xstrncmp(xstr *x1, xstr *x2, int n) { int i, m; if (x1->len < x2->len) { m = x1->len; }else{ m = x2->len; } if (m > n) m = n; for (i = 0 ; i < m ; i++) { if (x1->str[i] < x2->str[i]) { return -1; } if (x1->str[i] > x2->str[i]) { return 1; } } if (x2->len <= n && x1->len < x2->len) { return -1; } if (x1->len <= n && x1->len > x2->len) { return 1; } return 0; } xstr * anthy_xstrcat(xstr *s, xstr *a) { int i, l; if (!s) { s = malloc(sizeof(xstr)); s->str = NULL; s->len = 0; } l = s->len + a->len; if (l < 1) { /* 辞書もしくは学習データが壊れていた時の対策 */ free(s->str); s->str = NULL; s->len = 0; return s; } s->str = realloc(s->str, sizeof(xchar)*l); for (i = 0; i < a->len; i ++) { s->str[s->len+i] = a->str[i]; } s->len = l; return s; } xstr * anthy_xstrappend(xstr *xs, xchar xc) { xstr p; xchar q[1]; p.len = 1; p.str = q; q[0] = xc; return anthy_xstrcat(xs, &p); } long long anthy_xstrtoll(xstr *x) { xchar c; int i; long long n = 0;/* 数 */ if (!x->len || x->len > 16) { return -1; } if (!anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM)) { return -1; } for (i = 0; i < x->len; i++) { c = x->str[i]; n *= 10; n += anthy_xchar_to_num(c); } return n; } /** 全角の数字を半角にする */ xstr * anthy_xstr_wide_num_to_num(xstr* src_xs) { int i; xstr *dst_xs; dst_xs = anthy_xstr_dup(src_xs); for (i = 0; i < src_xs->len; ++i) { dst_xs->str[i] = anthy_xchar_wide_num_to_num(src_xs->str[i]); } return dst_xs; } /** 平仮名をカタカナに変換する */ xstr * anthy_xstr_hira_to_kata(xstr *src_xs) { xstr *dst_xs; int i, j; dst_xs = anthy_xstr_dup(src_xs); for (i = 0 ,j = 0; i < dst_xs->len; i++, j++) { /* 「う゛」のチェック */ if (i < dst_xs->len - 1 && dst_xs->str[i] == HK_U && dst_xs->str[i+1] == HK_DDOT) { dst_xs->str[j] = KK_VU;/* ヴ */ i++; continue ; } /**/ dst_xs->str[j] = dst_xs->str[i]; if ((anthy_ucs_to_euc(dst_xs->str[j]) & 0xff00) == 0xa400) { /* ひらがなだったら256足す */ dst_xs->str[j] = anthy_ucs_to_euc(dst_xs->str[j]); dst_xs->str[j] += 256; dst_xs->str[j] = anthy_euc_to_ucs(dst_xs->str[j]); } } dst_xs->len = j; return dst_xs; } xstr * anthy_xstr_hira_to_half_kata(xstr *src_xs) { int len = src_xs->len; int i, j; xstr *xs; for (i = 0; i < src_xs->len; i++) { const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]); if (tab && tab->mod) { len ++; } } xs = malloc(sizeof(xstr)); xs->len = len; xs->str = malloc(sizeof(xchar) * len); j = 0; for (i = 0; i < src_xs->len; i++) { const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]); if (tab) { xs->str[j] = anthy_euc_to_ucs(tab->dst); if (tab->mod) { j++; xs->str[j] = anthy_euc_to_ucs(tab->mod); } } else { xs->str[j] = src_xs->str[i]; } j++; } return xs; } xstr * anthy_conv_half_wide(xstr *xs) { int i; xstr *res; for (i = 0; i < xs->len; i++) { if (!anthy_lookup_half_wide(xs->str[i])) { return NULL; } } res = anthy_xstr_dup(xs); for (i = 0; i < xs->len; i++) { res->str[i] = anthy_lookup_half_wide(xs->str[i]); } return res; } int anthy_xstr_hash(xstr *xs) { int h,i; h = 0; for (i = 0 ;i < xs->len ;i++) { h *= 97; h += xs->str[i]<<4; h += xs->str[i]>>4; } if (h < 0) { return -h; } return h; } static char * conv_cstr(const char *s, int from, int to) { char *res; xstr *xs = anthy_cstr_to_xstr(s, from); if (!xs) { return NULL; } res = anthy_xstr_to_cstr(xs, to); anthy_free_xstr(xs); return res; } char * anthy_conv_euc_to_utf8(const char *s) { return conv_cstr(s, ANTHY_EUC_JP_ENCODING, ANTHY_UTF8_ENCODING); } char * anthy_conv_utf8_to_euc(const char *s) { return conv_cstr(s, ANTHY_UTF8_ENCODING, ANTHY_EUC_JP_ENCODING); } void anthy_xstr_set_print_encoding(int encoding) { print_encoding = encoding; } int anthy_init_xstr(void) { return 0; } void anthy_quit_xstr(void) { }