anthy-9100hHEAD anthy-9100h master

author: Lorry Tar Creator <lorry-tar-importer@lorry> 2009-02-07 16:32:56 +0000
committer: Lorry Tar Creator <lorry-tar-importer@lorry> 2009-02-07 16:32:56 +0000
commit: a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch)
tree: a966aeee62e69ae3ad13275d07ddb15049b14e0e /src-diclib/xstr.c
download: anthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz
1 files changed, 643 insertions, 0 deletions
diff --git a/src-diclib/xstr.c b/src-diclib/xstr.c
new file mode 100644
index 0000000..646a0cb
--- /dev/null
+++ b/src-diclib/xstr.c
@@ -0,0 +1,643 @@
+/*
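+ * String handling used internally by Anthy
+ *  typedef struct xstr_ {
+ *    xchar *str; int len;
+ *  } xstr;
+ *
+ * The code is written so that zero-length strings can be handled without
+ * relying on what malloc(0) means. free(0) is fine.
+ *
+ * With the default settings,
+ *  a cstr is an ordinary C string in EUC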
+ *
+ * Copyright (C) 2000-2007 TABATA Yusuke
+ *
+ */
+/*
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+/* for ANTHY_*_ENCODING */
+#include <anthy/anthy.h>
+
+#include <anthy/xstr.h>
+#include <anthy/xchar.h>
+#include "diclib_inner.h"
+
+/* Encoding used when printing to the screen */
+static int print_encoding;
+
+/* Size of the scratch buffers that receive one character from anthy_sputxchar() */
+#define MAX_BYTES_PER_XCHAR 10
+
+/* Returns 1 when xc is strictly positive (treated as printable), 0 otherwise. */
+static int
+xc_isprint(xchar xc)
+{
+  if (xc > 0) {
+    return 1;
+  }
+  return 0;
+}
+
+/** Compute the length of the xstr that corresponds to the C string c.
+ *  A byte with the high bit set is counted together with the byte after it.
+ */
+static int
+xlengthofcstr(const char *c)
+{
+  const int nbytes = strlen(c);
+  int pos = 0;
+  int count = 0;
+  while (pos < nbytes) {
+    /* a high-bit byte starts a two-byte character */
+    pos += (c[pos] & 0x80) ? 2 : 1;
+    count++;
+  }
+  return count;
+}
+
+/** Decode one UTF-8 sequence starting at s and store the value in *res.
+ *
+ * The lead byte selects a sequence length of 1 to 6 bytes. The payload bits
+ * of the lead byte and the low 6 bits of every following byte are
+ * concatenated, most significant first.
+ *
+ * A sequence that is cut short by the NUL terminator stops there: the
+ * returned pointer is left on the terminator instead of running past it,
+ * and *res holds the bits decoded so far.
+ *
+ * Returns a pointer to the first byte after the consumed bytes.
+ */
+const char *
+anthy_utf8_to_ucs4_xchar(const char *s, xchar *res)
+{
+  const unsigned char *str = (const unsigned char *)s;
+  int i, len;
+  xchar cur;
+  cur = str[0];
+  if (str[0] < 0x80) {
+    len = 1;
+  } else if (str[0] < 0xe0) {
+    cur &= 0x1f;
+    len = 2;
+  } else if (str[0] < 0xf0) {
+    cur &= 0x0f;
+    len = 3;
+  } else if (str[0] < 0xf8) {
+    cur &= 0x07;
+    len = 4;
+  } else if (str[0] < 0xfc) {
+    cur &= 0x03;
+    len = 5;
+  } else {
+    cur &= 0x01;
+    len = 6;
+  }
+  str ++;
+  for (i = 1; i < len; i++) {
+    if (str[0] == 0) {
+      /* truncated sequence: do not step over the string terminator */
+      break;
+    }
+    cur <<= 6;
+    cur |= (str[0] & 0x3f);
+    str++;
+  }
+  *res = cur;
+  return (const char *)str;
+}
+
+/** Convert the UTF-8 C string s into a newly allocated xstr.
+ *
+ * The characters are decoded into a temporary heap buffer (one slot per
+ * input byte is always enough, since every character consumes at least one
+ * byte) and the result is then copied with anthy_xstr_dup().
+ *
+ * Returns NULL when the temporary buffer cannot be allocated.
+ */
+static xstr *
+utf8_to_ucs4_xstr(const char *s)
+{
+  const unsigned char *str = (const unsigned char *)s;
+  size_t max_chars = strlen(s);
+  xstr tmp;
+  xstr *result;
+
+  tmp.str = NULL;
+  tmp.len = 0;
+  if (max_chars > 0) {
+    /* heap instead of alloca(): the input length is not bounded */
+    tmp.str = malloc(sizeof(xchar) * max_chars);
+    if (!tmp.str) {
+      return NULL;
+    }
+  }
+
+  /* the length bound keeps writes inside the buffer even for malformed input */
+  while (*str && (size_t)tmp.len < max_chars) {
+    xchar cur;
+    str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str,
+                                                          &cur);
+    tmp.str[tmp.len] = cur;
+    tmp.len ++;
+  }
+  result = anthy_xstr_dup(&tmp);
+  free(tmp.str);
+  return result;
+}
+
+/** Encode xc as a NUL-terminated UTF-8 sequence in buf_.
+ *
+ * Uses the 1- to 6-byte forms; buf_ must have room for the sequence plus
+ * the terminator (at most 7 bytes).
+ *
+ * Returns the number of bytes in the sequence, not counting the terminator.
+ */
+static int
+put_xchar_to_utf8_str(xchar xc, char *buf_)
+{
+  int i, len;
+  unsigned char *buf = (unsigned char *)buf_;
+  if (xc < 0x80) {
+    buf[0] = 0;
+    len = 1;
+  } else if (xc < 0x800) {
+    buf[0] = 0xc0;
+    len = 2;
+  } else if (xc < 0x10000) {
+    buf[0] = 0xe0;
+    len = 3;
+  } else if (xc < 0x200000) {
+    buf[0] = 0xf0;
+    len = 4;
+  } else if (xc < 0x4000000) {
+    /* a 5-byte sequence carries 26 bits, i.e. values up to 0x3ffffff */
+    buf[0] = 0xf8;
+    len = 5;
+  } else {
+    buf[0] = 0xfc;
+    len = 6;
+  }
+  /* fill the continuation bytes from the last one backwards, 6 bits each */
+  for (i = len - 1; i > 0; i--) {
+    buf[i] = (xc & 0x3f) | 0x80;
+    xc >>= 6;
+  }
+  /* what is left of xc goes into the lead byte next to the length marker */
+  buf[0] += xc;
+  buf[len] = 0;
+  return len;
+}
+
+/** Convert xs into a newly allocated, NUL-terminated UTF-8 C string.
+ *
+ * The work buffer lives on the heap (the length of xs is not bounded) and
+ * the write position is advanced incrementally instead of re-measuring the
+ * whole buffer after every character.
+ *
+ * Returns NULL when the buffer cannot be allocated.
+ */
+static char *
+ucs4_xstr_to_utf8(xstr *xs)
+{
+  size_t nchars = (xs->len > 0) ? (size_t)xs->len : 0;
+  size_t i, t = 0;
+  char *buf;
+  char *shrunk;
+
+  if (nchars > ((size_t)-1 - 1) / 6) {
+    /* nchars * 6 + 1 would overflow */
+    return NULL;
+  }
+  /* put_xchar_to_utf8_str() writes at most 6 bytes plus a terminator */
+  buf = malloc(nchars * 6 + 1);
+  if (!buf) {
+    return NULL;
+  }
+  buf[0] = 0;
+  for (i = 0; i < nchars; i++) {
+    put_xchar_to_utf8_str(xs->str[i], &buf[t]);
+    /* advance by the visible string length, so a character that encodes
+       to a zero byte is skipped exactly as before */
+    t += strlen(&buf[t]);
+  }
+  /* give back the unused tail; keep the larger buffer if that fails */
+  shrunk = realloc(buf, t + 1);
+  return shrunk ? shrunk : buf;
+}
+
+/** Convert the C string c into a newly allocated xstr.
+ *
+ * With ANTHY_UTF8_ENCODING the input is decoded as UTF-8. Otherwise a byte
+ * without the high bit is copied as is, and a byte with the high bit is
+ * combined with the following byte into a 16-bit code that is passed
+ * through anthy_euc_to_ucs().
+ *
+ * An empty input yields an xstr with len 0 and str NULL.
+ * Returns NULL when memory cannot be allocated.
+ */
+xstr *
+anthy_cstr_to_xstr(const char *c, int encoding)
+{
+  xstr *x;
+  int i, j, l;
+  if (encoding == ANTHY_UTF8_ENCODING) {
+    return utf8_to_ucs4_xstr(c);
+  }
+  l = xlengthofcstr(c);
+  x = malloc(sizeof(*x));
+  if (!x) {
+    return NULL;
+  }
+  x->len = l;
+  x->str = NULL;
+  if (l > 0) {
+    /* no malloc(0): an empty string simply has no buffer */
+    x->str = malloc(sizeof(xchar)*l);
+    if (!x->str) {
+      free(x);
+      return NULL;
+    }
+  }
+  for (i = 0, j = 0; i < l; i++) {
+    if (!(c[j] & 0x80)){
+      x->str[i] = c[j];
+      j++;
+    } else {
+      const unsigned char *p = (const unsigned char *)&c[j];
+      x->str[i] = (p[1] | (p[0]<<8)) | 0x8080;
+      x->str[i] = anthy_euc_to_ucs(x->str[i]);
+      j++;
+      j++;
+    }
+  }
+  return x;
+}
+
+/** Convert s into a newly allocated, NUL-terminated C string.
+ *
+ * With ANTHY_UTF8_ENCODING the result is UTF-8. Otherwise every character
+ * is passed through anthy_ucs_to_euc(); a code below 256 becomes one byte,
+ * anything larger becomes two bytes (high byte first).
+ *
+ * Returns NULL when the output buffer cannot be allocated.
+ */
+char *
+anthy_xstr_to_cstr(xstr *s, int encoding)
+{
+  int i, j, l;
+  char *p;
+
+  if (encoding == ANTHY_UTF8_ENCODING) {
+    return ucs4_xstr_to_utf8(s);
+  }
+
+  /* first pass: count the output bytes */
+  l = s->len;
+  for (i = 0; i < s->len; i++) {
+    int ec = anthy_ucs_to_euc(s->str[i]);
+    if (ec > 255) {
+      l++;
+    }
+  }
+  p = malloc(l + 1);
+  if (!p) {
+    return NULL;
+  }
+  p[l] = 0;
+  /* second pass: emit the bytes */
+  j = 0;
+  for (i =  0; i < s->len; i++) {
+    int ec = anthy_ucs_to_euc(s->str[i]);
+    if (ec < 256) {
+      p[j] = ec;
+      j++;
+    }else{
+      p[j] = ec >> 8;
+      j++;
+      p[j] = ec & 255;
+      j++;
+    }
+  }
+  return p;
+}
+
+/** Return a newly allocated copy of s (both the struct and its characters).
+ *
+ * A source without characters yields a copy whose str is NULL.
+ * Returns NULL when memory cannot be allocated.
+ */
+xstr *
+anthy_xstr_dup(xstr *s)
+{
+  xstr *x = malloc(sizeof(*x));
+  if (!x) {
+    return NULL;
+  }
+  x->len = s->len;
+  x->str = NULL;
+  if (s->len > 0) {
+    x->str = malloc(sizeof(xchar)*s->len);
+    if (!x->str) {
+      free(x);
+      return NULL;
+    }
+    memcpy(x->str, s->str, sizeof(xchar)*s->len);
+  }
+  return x;
+}
+
+/** Return a newly allocated copy of the characters of s.
+ *
+ * Returns NULL when s has no characters or when memory cannot be allocated.
+ */
+xchar *
+anthy_xstr_dup_str(xstr *s)
+{
+  xchar *c;
+  if (s->len <= 0) {
+    return NULL;
+  }
+  c = malloc(sizeof(xchar)*s->len);
+  if (!c) {
+    return NULL;
+  }
+  memcpy(c, s->str, sizeof(xchar)*s->len);
+  return c;
+}
+
+/* Release the character buffer of x and x itself; a NULL x is ignored. */
+void
+anthy_free_xstr(xstr *x)
+{
+  if (x) {
+    free(x->str);
+    free(x);
+  }
+}
+
+/* Release only the character buffer of x; x itself is not freed and
+ * its fields are left as they were. A NULL x is ignored. */
+void
+anthy_free_xstr_str(xstr *x)
+{
+  if (x) {
+    free(x->str);
+  }
+}
+
+/** Write x into buf as a NUL-terminated string in the given encoding.
+ *
+ * A character that xc_isprint() rejects is written as "??".
+ * Returns the number of bytes written, not counting the terminator.
+ */
+int
+anthy_sputxchar(char *buf, xchar x, int encoding)
+{
+  if (!xc_isprint(x)) {
+    strcpy(buf, "??");
+    return 2;
+  }
+  if (encoding == ANTHY_UTF8_ENCODING) {
+    return put_xchar_to_utf8_str(x, buf);
+  }
+  x = anthy_ucs_to_euc(x);
+  if (x >= 256) {
+    /* two bytes, high byte first, both with the high bit forced on */
+    buf[0] = 0x80 | ((x>>8) & 255);
+    buf[1] = 0x80 | (x & 255);
+    buf[2] = 0;
+    return 2;
+  }
+  buf[0] = x;
+  buf[1] = 0;
+  return 1;
+}
+
+/** Write x into buf as a NUL-terminated string in the given encoding.
+ *
+ * No size limit is applied; buf must be large enough for the whole string.
+ * buf is terminated even when x has no characters.
+ * Returns the number of bytes written, not counting the terminator.
+ */
+int
+anthy_sputxstr(char *buf, xstr *x, int encoding)
+{
+  char b[MAX_BYTES_PER_XCHAR];
+  int i, l = 0;
+  /* make the result a valid string even if the loop never runs */
+  buf[0] = 0;
+  for (i = 0; i < x->len; i++) {
+    size_t n;
+    anthy_sputxchar(b, x->str[i], encoding);
+    n = strlen(b);
+    memcpy(&buf[l], b, n + 1);
+    l += n;
+  }
+  return l;
+}
+
+/** Write as much of x into buf as fits in n bytes, terminator included.
+ *
+ * Characters are written whole; output stops before the first character
+ * that would not fit together with its terminator. When n is positive,
+ * buf is always NUL-terminated.
+ * Returns the number of bytes written, not counting the terminator.
+ */
+int
+anthy_snputxstr(char *buf, int n, xstr *x, int encoding)
+{
+  char b[MAX_BYTES_PER_XCHAR];
+  int i, l=0;
+  if (n > 0) {
+    buf[0] = 0;
+  }
+  for (i = 0; i < x->len; i++) {
+    int blen;
+    anthy_sputxchar(b, x->str[i], encoding);
+    blen = (int)strlen(b);
+    /* l already counts what was written, so n stays the full buffer
+       size; shrinking n as well would halve the usable space */
+    if (blen + l >= n) {
+      return l;
+    }
+    memcpy(&buf[l], b, blen + 1);
+    l += blen;
+  }
+  return l;
+}
+
+/* Print x to stdout in the current print encoding; a character that
+ * xc_isprint() rejects is printed as a backslash followed by its hex value. */
+void
+anthy_putxchar(xchar x)
+{
+  char buf[MAX_BYTES_PER_XCHAR];
+  if (xc_isprint(x)) {
+    anthy_sputxchar(buf, x, print_encoding);
+    printf("%s", buf);
+  } else {
+    printf("\\%x", x);
+  }
+}
+
+/* Print every character of x to stdout with anthy_putxchar(). */
+void
+anthy_putxstr(xstr *x)
+{
+  int pos = 0;
+  while (pos < x->len) {
+    anthy_putxchar(x->str[pos]);
+    pos++;
+  }
+}
+
+/* Print x to stdout followed by a newline. */
+void
+anthy_putxstrln(xstr *x)
+{
+  anthy_putxstr(x);
+  putchar('\n');
+}
+
+/* Copy the length and characters of src into dest and return dest.
+ * dest->str is written as is; no buffer is allocated or resized here. */
+xstr*
+anthy_xstrcpy(xstr *dest, xstr *src)
+{
+  int k = 0;
+  dest->len = src->len;
+  /* copy the characters one by one, front to back */
+  while (k < src->len) {
+    dest->str[k] = src->str[k];
+    k++;
+  }
+  return dest;
+}
+/* The sign of the return value follows strcmp: the first differing
+ * character decides; if one string is a prefix of the other, the shorter
+ * one sorts first. The result is always -1, 0 or 1. */
+int
+anthy_xstrcmp(xstr *x1, xstr *x2)
+{
+  int common = (x1->len < x2->len) ? x1->len : x2->len;
+  int k;
+  for (k = 0; k < common; k++) {
+    if (x1->str[k] != x2->str[k]) {
+      return (x1->str[k] < x2->str[k]) ? -1 : 1;
+    }
+  }
+  if (x1->len == x2->len) {
+    return 0;
+  }
+  return (x1->len < x2->len) ? -1 : 1;
+}
+
+/* The sign of the return value is meant to follow strncmp: at most n
+ * characters are compared and the result is -1, 0 or 1.
+ *
+ * NOTE(review): when the compared prefix is equal, a length difference is
+ * only reported if the LONGER string also fits within n. So a string that
+ * ends inside n compares equal to a longer one that extends past n, which
+ * strncmp would not do. Confirm whether callers rely on this before
+ * changing it. */
+int
+anthy_xstrncmp(xstr *x1, xstr *x2, int n)
+{
+  int limit = (x1->len < x2->len) ? x1->len : x2->len;
+  int k;
+  if (limit > n) {
+    limit = n;
+  }
+  for (k = 0; k < limit; k++) {
+    if (x1->str[k] != x2->str[k]) {
+      return (x1->str[k] < x2->str[k]) ? -1 : 1;
+    }
+  }
+  if (x1->len < x2->len && x2->len <= n) {
+    return -1;
+  }
+  if (x1->len > x2->len && x1->len <= n) {
+    return 1;
+  }
+  return 0;
+}
+
+
+/** Append the characters of a to s, growing the buffer of s.
+ *
+ * When s is NULL a new, empty xstr is allocated first and used as the
+ * destination. If the combined length is below 1 the buffer of s is freed
+ * and s becomes empty.
+ *
+ * Returns s (or the newly allocated xstr). Returns NULL when memory cannot
+ * be allocated; a caller-supplied s is left unchanged and still valid in
+ * that case.
+ */
+xstr *
+anthy_xstrcat(xstr *s, xstr *a)
+{
+  int i, l;
+  int allocated_here = 0;
+  xchar *grown;
+  if (!s) {
+    s = malloc(sizeof(xstr));
+    if (!s) {
+      return NULL;
+    }
+    s->str = NULL;
+    s->len = 0;
+    allocated_here = 1;
+  }
+  l = s->len + a->len;
+
+  if (l < 1) {              /* guard for a corrupted dictionary or learning data */
+    free(s->str);
+    s->str = NULL;
+    s->len = 0;
+    return s;
+  }
+
+  /* keep the old pointer until realloc is known to have succeeded */
+  grown = realloc(s->str, sizeof(xchar)*l);
+  if (!grown) {
+    if (allocated_here) {
+      free(s);
+    }
+    return NULL;
+  }
+  s->str = grown;
+  for (i = 0; i < a->len; i ++) {
+    s->str[s->len+i] = a->str[i];
+  }
+  s->len = l;
+  return s;
+}
+
+/* Append the single character xc to xs by wrapping it in a one-character
+ * xstr on the stack and handing it to anthy_xstrcat(). */
+xstr *
+anthy_xstrappend(xstr *xs, xchar xc)
+{
+  xchar one[1];
+  xstr tail;
+  one[0] = xc;
+  tail.str = one;
+  tail.len = 1;
+  return anthy_xstrcat(xs, &tail);
+}
+
+/** Convert a string of digits to a number.
+ *
+ * Returns -1 when x is empty, longer than 16 characters, or when
+ * anthy_get_xstr_type() reports neither XCT_NUM nor XCT_WIDENUM for it.
+ * Otherwise the characters are accumulated in base 10 using
+ * anthy_xchar_to_num().
+ */
+long long
+anthy_xstrtoll(xstr *x)
+{
+  xchar c;
+  int i;
+  long long n = 0;/* accumulated value */
+  if (!x->len || x->len > 16) {
+    return -1;
+  }
+  /* parenthesized: "!type & mask" would negate the type first and the
+     check would never reject anything */
+  if (!(anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM))) {
+    return -1;
+  }
+  for (i = 0; i < x->len; i++) {
+    c = x->str[i];
+    n *= 10;
+    n += anthy_xchar_to_num(c);
+  }
+  return n;
+}
+
+/** Convert full-width digits to half-width ones.
+ *  Returns a new xstr in which every character of src_xs has been passed
+ *  through anthy_xchar_wide_num_to_num().
+ */
+xstr *
+anthy_xstr_wide_num_to_num(xstr* src_xs)
+{
+  xstr *dst_xs = anthy_xstr_dup(src_xs);
+  int k = 0;
+  while (k < src_xs->len) {
+    dst_xs->str[k] = anthy_xchar_wide_num_to_num(src_xs->str[k]);
+    k++;
+  }
+  return dst_xs;
+}
+
+/** Convert hiragana to katakana.
+ *  Works in place on a copy of src_xs: the pair HK_U + HK_DDOT collapses
+ *  into the single character KK_VU, so the result can be shorter than the
+ *  input. Returns the new xstr.
+ */
+xstr *
+anthy_xstr_hira_to_kata(xstr *src_xs)
+{
+  xstr *dst_xs = anthy_xstr_dup(src_xs);
+  int rd = 0; /* read position */
+  int wr = 0; /* write position, never ahead of rd */
+  xchar ch;
+
+  while (rd < dst_xs->len) {
+    /* check for U followed by the voiced mark */
+    if (rd < dst_xs->len - 1 && dst_xs->str[rd] == HK_U
+        && dst_xs->str[rd+1] == HK_DDOT) {
+      dst_xs->str[wr] = KK_VU;/* VU */
+      wr++;
+      rd += 2;
+      continue ;
+    }
+    ch = dst_xs->str[rd];
+    if ((anthy_ucs_to_euc(ch) & 0xff00) == 0xa400) {
+      /* hiragana: add 256 to the EUC code */
+      ch = anthy_ucs_to_euc(ch);
+      ch += 256;
+      ch = anthy_euc_to_ucs(ch);
+    }
+    dst_xs->str[wr] = ch;
+    wr++;
+    rd++;
+  }
+  dst_xs->len = wr;
+  return dst_xs;
+}
+
+/** Build a new xstr from src_xs using the table behind
+ *  anthy_find_half_kana().
+ *
+ * A character with a table entry is replaced by the entry's dst code and,
+ * when the entry has a mod code, followed by that as a second character.
+ * Characters without an entry are copied unchanged.
+ *
+ * Returns NULL when memory cannot be allocated.
+ */
+xstr *
+anthy_xstr_hira_to_half_kata(xstr *src_xs)
+{
+  int len = src_xs->len;
+  int i, j;
+  xstr *xs;
+  /* first pass: every entry with a mod code needs one extra slot */
+  for (i = 0; i < src_xs->len; i++) {
+    const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
+    if (tab && tab->mod) {
+      len ++;
+    }
+  }
+  xs = malloc(sizeof(xstr));
+  if (!xs) {
+    return NULL;
+  }
+  xs->len = len;
+  xs->str = NULL;
+  if (len > 0) {
+    xs->str = malloc(sizeof(xchar) * len);
+    if (!xs->str) {
+      free(xs);
+      return NULL;
+    }
+  }
+  /* second pass: emit the converted characters */
+  j = 0;
+  for (i = 0; i < src_xs->len; i++) {
+    const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
+    if (tab) {
+      xs->str[j] = anthy_euc_to_ucs(tab->dst);
+      if (tab->mod) {
+        j++;
+        xs->str[j] = anthy_euc_to_ucs(tab->mod);
+      }
+    } else {
+      xs->str[j] = src_xs->str[i];
+    }
+    j++;
+  }
+  return xs;
+}
+
+/** Map every character of xs through anthy_lookup_half_wide().
+ *
+ * Returns NULL if the lookup yields 0 for any character; otherwise a new
+ * xstr holding the looked-up characters.
+ */
+xstr *
+anthy_conv_half_wide(xstr *xs)
+{
+  xstr *res;
+  int k;
+  /* refuse the whole string if a single character has no mapping */
+  k = 0;
+  while (k < xs->len) {
+    if (!anthy_lookup_half_wide(xs->str[k])) {
+      return NULL;
+    }
+    k++;
+  }
+  res = anthy_xstr_dup(xs);
+  k = 0;
+  while (k < xs->len) {
+    res->str[k] = anthy_lookup_half_wide(xs->str[k]);
+    k++;
+  }
+  return res;
+}
+
+/** Compute a non-negative hash of xs.
+ *
+ * For each character the running value is multiplied by 97 and the
+ * character shifted left by 4 and shifted right by 4 are added. The
+ * arithmetic is done on unsigned int so that wrap-around is well defined;
+ * wherever the original signed arithmetic did not overflow, the result is
+ * the same value.
+ */
+int
+anthy_xstr_hash(xstr *xs)
+{
+  /* the top bit of unsigned int, i.e. the sign bit of the int result */
+  const unsigned int sign_bit = ~(~0u >> 1);
+  unsigned int h = 0;
+  int i;
+  for (i = 0 ;i < xs->len ;i++) {
+    h *= 97;
+    h += (unsigned int)xs->str[i] << 4;
+    h += (unsigned int)(xs->str[i] >> 4);
+  }
+  if (h & sign_bit) {
+    /* same as negating the value the signed hash would have held */
+    h = 0u - h;
+  }
+  if (h & sign_bit) {
+    /* the one pattern whose negation is itself; it has no positive form */
+    h = 0;
+  }
+  return (int)h;
+}
+
+/* Re-encode the C string s from encoding `from` to encoding `to` by going
+ * through a temporary xstr. Returns the new C string, or NULL when the
+ * temporary xstr could not be created. */
+static char *
+conv_cstr(const char *s, int from, int to)
+{
+  char *out = NULL;
+  xstr *tmp = anthy_cstr_to_xstr(s, from);
+  if (tmp) {
+    out = anthy_xstr_to_cstr(tmp, to);
+    anthy_free_xstr(tmp);
+  }
+  return out;
+}
+
+/* Convert the EUC-JP C string s to UTF-8 via conv_cstr().
+ * Returns whatever conv_cstr() returns: a new C string, or NULL when the
+ * intermediate xstr could not be created. */
+char *
+anthy_conv_euc_to_utf8(const char *s)
+{
+  return conv_cstr(s, ANTHY_EUC_JP_ENCODING, ANTHY_UTF8_ENCODING);
+}
+
+/* Convert the UTF-8 C string s to EUC-JP via conv_cstr().
+ * Returns whatever conv_cstr() returns: a new C string, or NULL when the
+ * intermediate xstr could not be created. */
+char *
+anthy_conv_utf8_to_euc(const char *s)
+{
+  return conv_cstr(s, ANTHY_UTF8_ENCODING, ANTHY_EUC_JP_ENCODING);
+}
+
+/* Select the encoding that anthy_putxchar() uses when printing. */
+void
+anthy_xstr_set_print_encoding(int encoding)
+{
+  print_encoding = encoding;
+}
+
+/* Initialization hook for this module; there is nothing to set up, so it
+ * always returns 0. */
+int
+anthy_init_xstr(void)
+{
+  return 0;
+}
+
+/* Shutdown hook for this module; it currently does nothing. */
+void anthy_quit_xstr(void)
+{
+}
author	Lorry Tar Creator <lorry-tar-importer@lorry>	2009-02-07 16:32:56 +0000
committer	Lorry Tar Creator <lorry-tar-importer@lorry>	2009-02-07 16:32:56 +0000
commit	a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch)
tree	a966aeee62e69ae3ad13275d07ddb15049b14e0e /src-diclib/xstr.c
download	anthy-a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0.tar.gz