summaryrefslogtreecommitdiff
path: root/src-diclib/xstr.c
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2009-02-07 16:32:56 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2009-02-07 16:32:56 +0000
commita7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch)
treea966aeee62e69ae3ad13275d07ddb15049b14e0e /src-diclib/xstr.c
downloadanthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz
Diffstat (limited to 'src-diclib/xstr.c')
-rw-r--r--src-diclib/xstr.c643
1 files changed, 643 insertions, 0 deletions
diff --git a/src-diclib/xstr.c b/src-diclib/xstr.c
new file mode 100644
index 0000000..646a0cb
--- /dev/null
+++ b/src-diclib/xstr.c
@@ -0,0 +1,643 @@
+/*
+ * Anthy内部で使う文字列の処理
+ * typedef struct xstr_ {
+ * xstr *str; int len;
+ * } xstr;
+ *
+ * malloc(0);の意味は考えないで0文字の文字列を扱えるような
+ * コーディングをする。free(0)は良い。
+ *
+ * デフォルトの設定では
+ * cstrはCの普通のEUC文字列
+ *
+ * Copyright (C) 2000-2007 TABATA Yusuke
+ *
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+/* for ANTHY_*_ENCODING */
+#include <anthy/anthy.h>
+
+#include <anthy/xstr.h>
+#include <anthy/xchar.h>
+#include "diclib_inner.h"
+
+/* 画面に出力するときのエンコーディング */
+static int print_encoding;
+
+#define MAX_BYTES_PER_XCHAR 10
+
+static int
+xc_isprint(xchar xc)
+{
+ return xc > 0;
+}
+
+/** Cの文字列に対応するxstrの長さを計算する
+ */
+static int
+xlengthofcstr(const char *c)
+{
+ int ll = 0;
+ int len = strlen(c);
+ int i;
+ for (i = 0; i < len; i++) {
+ ll ++;
+ if ((c[i] & 0x80)) {
+ i++;
+ }
+ }
+ return ll;
+}
+
+const char *
+anthy_utf8_to_ucs4_xchar(const char *s, xchar *res)
+{
+ const unsigned char *str = (const unsigned char *)s;
+ int i, len;
+ xchar cur;
+ cur = str[0];
+ if (str[0] < 0x80) {
+ len = 1;
+ } else if (str[0] < 0xe0) {
+ cur &= 0x1f;
+ len = 2;
+ } else if (str[0] < 0xf0) {
+ cur &= 0x0f;
+ len = 3;
+ } else if (str[0] < 0xf8) {
+ cur &= 0x07;
+ len = 4;
+ } else if (str[0] < 0xfc) {
+ cur &= 0x03;
+ len = 5;
+ } else {
+ cur &= 0x01;
+ len = 6;
+ }
+ str ++;
+ for (i = 1; i < len; i++) {
+ cur <<= 6;
+ cur |= (str[0] & 0x3f);
+ str++;
+ }
+ *res = cur;
+ return (const char *)str;
+}
+
+static xstr *
+utf8_to_ucs4_xstr(const char *s)
+{
+ const unsigned char *str = (const unsigned char *)s;
+ xstr res;
+ res.str = (xchar *)alloca(sizeof(xchar) * strlen(s));
+ res.len = 0;
+
+ while (*str) {
+ xchar cur;
+ str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str,
+ &cur);
+ res.str[res.len] = cur;
+ res.len ++;
+ }
+ return anthy_xstr_dup(&res);
+}
+
+static int
+put_xchar_to_utf8_str(xchar xc, char *buf_)
+{
+ int i, len;
+ unsigned char *buf = (unsigned char *)buf_;
+ if (xc < 0x80) {
+ buf[0] = 0;
+ len = 1;
+ } else if (xc < 0x800) {
+ buf[0] = 0xc0;
+ len = 2;
+ } else if (xc < 0x10000) {
+ buf[0] = 0xe0;
+ len = 3;
+ } else if (xc < 0x200000) {
+ buf[0] = 0xf0;
+ len = 4;
+ } else if (xc < 0x400000) {
+ buf[0] = 0xf8;
+ len = 5;
+ } else {
+ buf[0] = 0xfc;
+ len = 6;
+ }
+ for (i = len - 1; i > 0; i--) {
+ buf[i] = (xc & 0x3f) | 0x80;
+ xc >>= 6;
+ }
+ buf[0] += xc;
+ buf[len] = 0;
+ return len;
+}
+
+static char *
+ucs4_xstr_to_utf8(xstr *xs)
+{
+ char *buf = alloca(xs->len * 6 + 1);
+ int i, t = 0;
+ buf[0] = 0;
+ for (i = 0; i < xs->len; i++) {
+ xchar xc = xs->str[i];
+ put_xchar_to_utf8_str(xc, &buf[t]);
+ t = strlen(buf);
+ }
+ return strdup(buf);
+}
+
+/** Cの文字列をxstrに変更する
+ */
+xstr *
+anthy_cstr_to_xstr(const char *c, int encoding)
+{
+ xstr *x;
+ int i, j, l;
+ if (encoding == ANTHY_UTF8_ENCODING) {
+ return utf8_to_ucs4_xstr(c);
+ }
+ l = xlengthofcstr(c);
+ x = (xstr *)malloc(sizeof(struct xstr_));
+ if (!x) {
+ return NULL;
+ }
+ x->len = l;
+ x->str = malloc(sizeof(xchar)*l);
+ for (i = 0, j = 0; i < l; i++) {
+ if (!(c[j] & 0x80)){
+ x->str[i] = c[j];
+ j++;
+ } else {
+ unsigned char *p = (unsigned char *)&c[j];
+ x->str[i] = (p[1] | (p[0]<<8)) | 0x8080;
+ x->str[i] = anthy_euc_to_ucs(x->str[i]);
+ j++;
+ j++;
+ }
+ }
+ return x;
+}
+
+char *
+anthy_xstr_to_cstr(xstr *s, int encoding)
+{
+ int i, j, l;
+ char *p;
+
+ if (encoding == ANTHY_UTF8_ENCODING) {
+ return ucs4_xstr_to_utf8(s);
+ }
+
+ l = s->len;
+ for (i = 0; i < s->len; i++) {
+ int ec = anthy_ucs_to_euc(s->str[i]);
+ if (ec > 255) {
+ l++;
+ }
+ }
+ p = (char *)malloc(l + 1);
+ p[l] = 0;
+ j = 0;
+ for (i = 0; i < s->len; i++) {
+ int ec = anthy_ucs_to_euc(s->str[i]);
+ if (ec < 256) {
+ p[j] = ec;
+ j++;
+ }else{
+ p[j] = ec >> 8;
+ j++;
+ p[j] = ec & 255;
+ j++;
+ }
+ }
+ return p;
+}
+
+xstr *
+anthy_xstr_dup(xstr *s)
+{
+ int i;
+ xstr *x = (xstr *)malloc(sizeof(xstr));
+ x->len = s->len;
+ if (s->len) {
+ x->str = malloc(sizeof(xchar)*s->len);
+ }else{
+ x->str = NULL;
+ }
+ for (i = 0; i < x->len; i++) {
+ x->str[i] = s->str[i];
+ }
+ return x;
+}
+
+xchar *
+anthy_xstr_dup_str(xstr *s)
+{
+ xchar *c;
+ int i;
+ if (s->len) {
+ c = malloc(sizeof(xchar)*s->len);
+ }else{
+ c = 0;
+ }
+ for (i = 0; i < s->len; i++) {
+ c[i] = s->str[i];
+ }
+ return c;
+}
+
+void
+anthy_free_xstr(xstr *x)
+{
+ if (!x) {
+ return ;
+ }
+ /**/
+ free(x->str);
+ free(x);
+}
+
+void
+anthy_free_xstr_str(xstr *x)
+{
+ if (!x) {
+ return ;
+ }
+ free(x->str);
+}
+
+int
+anthy_sputxchar(char *buf, xchar x, int encoding)
+{
+ if (!xc_isprint(x)) {
+ sprintf(buf, "??");
+ return 2;
+ }
+ if (encoding == ANTHY_UTF8_ENCODING) {
+ return put_xchar_to_utf8_str(x, buf);
+ }
+ x = anthy_ucs_to_euc(x);
+ if (x < 256) {
+ buf[0] = x;
+ buf[1] = 0;
+ return 1;
+ }
+ buf[2] = 0;
+ buf[1] = 0x80 | (x & 255);
+ buf[0] = 0x80 | ((x>>8) & 255);
+ return 2;
+}
+
+int
+anthy_sputxstr(char *buf, xstr *x, int encoding)
+{
+ char b[MAX_BYTES_PER_XCHAR];
+ int i, l = 0;
+ for (i = 0; i < x->len; i++) {
+ anthy_sputxchar(b, x->str[i], encoding);
+ sprintf(&buf[l], "%s", b);
+ l += strlen(b);
+ }
+ return l;
+}
+
+int
+anthy_snputxstr(char *buf, int n, xstr *x, int encoding)
+{
+ char b[MAX_BYTES_PER_XCHAR];
+ int i, l=0;
+ for (i = 0; i < x->len; i++) {
+ anthy_sputxchar(b, x->str[i], encoding);
+ if ((int)strlen(b) + l >= n) {
+ return l;
+ }
+ n -= sprintf(&buf[l], "%s", b);
+ l += strlen(b);
+ }
+ return l;
+}
+
+void
+anthy_putxchar(xchar x)
+{
+ char buf[MAX_BYTES_PER_XCHAR];
+ if (!xc_isprint(x)) {
+ printf("\\%x", x);
+ return ;
+ }
+ anthy_sputxchar(buf, x, print_encoding);
+ printf("%s", buf);
+}
+
+void
+anthy_putxstr(xstr *x)
+{
+ int i;
+ for (i = 0; i < x->len; i++) {
+ anthy_putxchar(x->str[i]);
+ }
+}
+
+void
+anthy_putxstrln(xstr *x)
+{
+ anthy_putxstr(x);
+ printf("\n");
+}
+
+xstr*
+anthy_xstrcpy(xstr *dest, xstr *src)
+{
+ int i;
+ /* 文字列をコピー */
+ dest->len = src->len;
+ for (i = 0; i < src->len; i++) {
+ dest->str[i] = src->str[i];
+ }
+
+ return dest;
+}
+/* 返り値の符号はstrcmpと同じ */
+int
+anthy_xstrcmp(xstr *x1, xstr *x2)
+{
+ int i, m;
+ if (x1->len < x2->len) {
+ m = x1->len;
+ }else{
+ m = x2->len;
+ }
+ for (i = 0 ; i < m ; i++) {
+ if (x1->str[i] < x2->str[i]) {
+ return -1;
+ }
+ if (x1->str[i] > x2->str[i]) {
+ return 1;
+ }
+ }
+ if (x1->len < x2->len) {
+ return -1;
+ }
+ if (x1->len > x2->len) {
+ return 1;
+ }
+ return 0;
+}
+
+/* 返り値の符号はstrncmpと同じ */
+int
+anthy_xstrncmp(xstr *x1, xstr *x2, int n)
+{
+ int i, m;
+ if (x1->len < x2->len) {
+ m = x1->len;
+ }else{
+ m = x2->len;
+ }
+ if (m > n) m = n;
+ for (i = 0 ; i < m ; i++) {
+ if (x1->str[i] < x2->str[i]) {
+ return -1;
+ }
+ if (x1->str[i] > x2->str[i]) {
+ return 1;
+ }
+ }
+ if (x2->len <= n && x1->len < x2->len) {
+ return -1;
+ }
+ if (x1->len <= n && x1->len > x2->len) {
+ return 1;
+ }
+ return 0;
+}
+
+
+xstr *
+anthy_xstrcat(xstr *s, xstr *a)
+{
+ int i, l;
+ if (!s) {
+ s = malloc(sizeof(xstr));
+ s->str = NULL;
+ s->len = 0;
+ }
+ l = s->len + a->len;
+
+ if (l < 1) { /* 辞書もしくは学習データが壊れていた時の対策 */
+ free(s->str);
+ s->str = NULL;
+ s->len = 0;
+ return s;
+ }
+
+ s->str = realloc(s->str, sizeof(xchar)*l);
+ for (i = 0; i < a->len; i ++) {
+ s->str[s->len+i] = a->str[i];
+ }
+ s->len = l;
+ return s;
+}
+
+xstr *
+anthy_xstrappend(xstr *xs, xchar xc)
+{
+ xstr p;
+ xchar q[1];
+ p.len = 1;
+ p.str = q;
+ q[0] = xc;
+ return anthy_xstrcat(xs, &p);
+}
+
+long long
+anthy_xstrtoll(xstr *x)
+{
+ xchar c;
+ int i;
+ long long n = 0;/* 数 */
+ if (!x->len || x->len > 16) {
+ return -1;
+ }
+ if (!anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM)) {
+ return -1;
+ }
+ for (i = 0; i < x->len; i++) {
+ c = x->str[i];
+ n *= 10;
+ n += anthy_xchar_to_num(c);
+ }
+ return n;
+}
+
+/** 全角の数字を半角にする
+ */
+xstr *
+anthy_xstr_wide_num_to_num(xstr* src_xs)
+{
+ int i;
+ xstr *dst_xs;
+ dst_xs = anthy_xstr_dup(src_xs);
+ for (i = 0; i < src_xs->len; ++i) {
+ dst_xs->str[i] = anthy_xchar_wide_num_to_num(src_xs->str[i]);
+ }
+ return dst_xs;
+}
+
+/** 平仮名をカタカナに変換する
+ */
+xstr *
+anthy_xstr_hira_to_kata(xstr *src_xs)
+{
+ xstr *dst_xs;
+ int i, j;
+ dst_xs = anthy_xstr_dup(src_xs);
+
+ for (i = 0 ,j = 0; i < dst_xs->len; i++, j++) {
+ /* 「う゛」のチェック */
+ if (i < dst_xs->len - 1 && dst_xs->str[i] == HK_U
+ && dst_xs->str[i+1] == HK_DDOT) {
+ dst_xs->str[j] = KK_VU;/* ヴ */
+ i++;
+ continue ;
+ }
+ /**/
+ dst_xs->str[j] = dst_xs->str[i];
+ if ((anthy_ucs_to_euc(dst_xs->str[j]) & 0xff00) == 0xa400) {
+ /* ひらがなだったら256足す */
+ dst_xs->str[j] = anthy_ucs_to_euc(dst_xs->str[j]);
+ dst_xs->str[j] += 256;
+ dst_xs->str[j] = anthy_euc_to_ucs(dst_xs->str[j]);
+ }
+ }
+ dst_xs->len = j;
+ return dst_xs;
+}
+
+xstr *
+anthy_xstr_hira_to_half_kata(xstr *src_xs)
+{
+ int len = src_xs->len;
+ int i, j;
+ xstr *xs;
+ for (i = 0; i < src_xs->len; i++) {
+ const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
+ if (tab && tab->mod) {
+ len ++;
+ }
+ }
+ xs = malloc(sizeof(xstr));
+ xs->len = len;
+ xs->str = malloc(sizeof(xchar) * len);
+ j = 0;
+ for (i = 0; i < src_xs->len; i++) {
+ const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
+ if (tab) {
+ xs->str[j] = anthy_euc_to_ucs(tab->dst);
+ if (tab->mod) {
+ j++;
+ xs->str[j] = anthy_euc_to_ucs(tab->mod);
+ }
+ } else {
+ xs->str[j] = src_xs->str[i];
+ }
+ j++;
+ }
+ return xs;
+}
+
+xstr *
+anthy_conv_half_wide(xstr *xs)
+{
+ int i;
+ xstr *res;
+ for (i = 0; i < xs->len; i++) {
+ if (!anthy_lookup_half_wide(xs->str[i])) {
+ return NULL;
+ }
+ }
+ res = anthy_xstr_dup(xs);
+ for (i = 0; i < xs->len; i++) {
+ res->str[i] = anthy_lookup_half_wide(xs->str[i]);
+ }
+ return res;
+}
+
+int
+anthy_xstr_hash(xstr *xs)
+{
+ int h,i;
+ h = 0;
+ for (i = 0 ;i < xs->len ;i++) {
+ h *= 97;
+ h += xs->str[i]<<4;
+ h += xs->str[i]>>4;
+ }
+ if (h < 0) {
+ return -h;
+ }
+ return h;
+}
+
+static char *
+conv_cstr(const char *s, int from, int to)
+{
+ char *res;
+ xstr *xs = anthy_cstr_to_xstr(s, from);
+ if (!xs) {
+ return NULL;
+ }
+ res = anthy_xstr_to_cstr(xs, to);
+ anthy_free_xstr(xs);
+ return res;
+}
+
+char *
+anthy_conv_euc_to_utf8(const char *s)
+{
+ return conv_cstr(s, ANTHY_EUC_JP_ENCODING, ANTHY_UTF8_ENCODING);
+}
+
+char *
+anthy_conv_utf8_to_euc(const char *s)
+{
+ return conv_cstr(s, ANTHY_UTF8_ENCODING, ANTHY_EUC_JP_ENCODING);
+}
+
+void
+anthy_xstr_set_print_encoding(int encoding)
+{
+ print_encoding = encoding;
+}
+
+int
+anthy_init_xstr(void)
+{
+ return 0;
+}
+
+void anthy_quit_xstr(void)
+{
+}