1 files changed, 329 insertions, 0 deletions
diff --git a/src-splitter/splitter.c b/src-splitter/splitter.c
new file mode 100644
index 0000000..75ace2b
--- /dev/null
+++ b/src-splitter/splitter.c
@@ -0,0 +1,329 @@
+/*
+ * 文を文節にsplitするsplitter
+ *
+ * 文節の境界を検出する
+ *  anthy_init_split_context() 分割用のコンテキストを作って
+ *  anthy_mark_border() 分割をして
+ *  anthy_release_split_context() コンテキストを解放する
+ *
+ *  anthy_commit_border() コミットされた内容に対して学習をする
+ *
+ * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
+ *
+ * Copyright (C) 2004 YOSHIDA Yuichi
+ * Copyright (C) 2000-2004 TABATA Yusuke
+ * Copyright (C) 2000-2001 UGAWA Tomoharu
+ *
+ * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
+ */
+/*
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include <anthy/alloc.h>
+#include <anthy/record.h>
+#include <anthy/splitter.h>
+#include <anthy/logger.h>
+#include "wordborder.h"
+
+#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000 /* passed to anthy_truncate_section() after an "EXPANDPAIR" update */
+
+static int splitter_debug_flags; /* SPLITTER_DEBUG_* bits, set in anthy_init_splitter() */
+
+/* word types referred to by the splitter; assigned in anthy_init_splitter() */
+wtype_t anthy_wtype_noun;
+wtype_t anthy_wtype_name_noun;
+wtype_t anthy_wtype_num_noun;
+wtype_t anthy_wtype_prefix;
+wtype_t anthy_wtype_num_prefix;
+wtype_t anthy_wtype_num_postfix;
+wtype_t anthy_wtype_name_postfix;
+wtype_t anthy_wtype_sv_postfix;
+wtype_t anthy_wtype_a_tail_of_v_renyou;
+wtype_t anthy_wtype_v_renyou;
+wtype_t anthy_wtype_noun_tail;/* e.g. the "tate" in "ire-tate" */
+wtype_t anthy_wtype_n1;
+wtype_t anthy_wtype_n10;
+
+
+/** Release the cache that alloc_info_cache() set up for this context */
+static void
+release_info_cache(struct splitter_context *sc)
+{
+  struct word_split_info_cache *cache = sc->word_split_info;
+
+  /* the two allocators first, then the plain per-character arrays */
+  anthy_free_allocator(cache->MwAllocator);
+  anthy_free_allocator(cache->WlAllocator);
+  free(cache->rev_seq_len);
+  free(cache->seq_len);
+  free(cache->cnode);
+  free(cache);
+}
+
+/* Destructor given to the meta_word allocator: frees cand_hint.str */
+static void
+metaword_dtor(void *p)
+{
+  struct meta_word *meta = p;
+  /* free(NULL) is a no-op, so the pointer needs no guard */
+  free(meta->cand_hint.str);
+}
+
+
+static void
+alloc_char_ent(xstr *xs, struct splitter_context *sc)
+{
+  /* Builds sc->ce: one entry per character of xs plus one for the end. */
+  const int nchars = xs->len;
+  int pos;
+  sc->char_count = nchars;
+  sc->ce = malloc(sizeof(struct char_ent) * (nchars + 1));
+  for (pos = 0; pos <= nchars; pos++) {
+    struct char_ent *ent = &sc->ce[pos];
+    ent->c = &xs->str[pos];
+    ent->seg_border = 0;
+    ent->initial_seg_len = 0;
+    ent->best_seg_class = SEG_HEAD;
+    ent->best_mw = NULL;
+  }
+  /* the left and right ends are segment borders */
+  sc->ce[0].seg_border = 1;
+  sc->ce[nchars].seg_border = 1;
+}
+
+/* Allocates sc->word_split_info together with its per-character arrays.
+ * What is allocated here gets released by release_info_cache(). */
+static void
+alloc_info_cache(struct splitter_context *sc)
+{
+  const int slots = sc->char_count + 1;
+  struct word_split_info_cache *cache;
+  int pos;
+
+  /* allocate the cache data */
+  cache = malloc(sizeof(struct word_split_info_cache));
+  sc->word_split_info = cache;
+  cache->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
+  cache->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
+  cache->cnode = malloc(sizeof(struct char_node) * slots);
+  cache->seq_len = malloc(sizeof(int) * slots);
+  cache->rev_seq_len = malloc(sizeof(int) * slots);
+
+  /* initialize the entry of every character index (end position included) */
+  for (pos = 0; pos < slots; pos++) {
+    struct char_node *node = &cache->cnode[pos];
+    node->wl = NULL;
+    node->mw = NULL;
+    node->max_len = 0;
+    cache->seq_len[pos] = 0;
+    cache->rev_seq_len[pos] = 0;
+  }
+}
+
+/** Top-level word splitter entry point: results for [from, to) go into sc->ce */
+void
+anthy_mark_border(struct splitter_context *sc, int from, int from2, int to)
+{
+  int i;
+  struct word_split_info_cache *info;
+
+  /* sanity check */
+  if ((to - from) <= 0) {
+    return ;
+  }
+  /* scratch arrays for border marks and lattice classes, on this stack frame */
+  info = sc->word_split_info;
+  info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
+  info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
+  info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
+  for (i = 0; i < sc->char_count + 1; ++i) {
+    info->seg_border[i] = sc->ce[i].seg_border;
+    info->best_seg_class[i] = sc->ce[i].best_seg_class;
+    info->best_mw[i] = sc->ce[i].best_mw;
+  }
+  /* decide the borders */
+  anthy_eval_border(sc, from, from2, to);
+  for (i = from; i < to; ++i) {
+    sc->ce[i].seg_border = info->seg_border[i];
+    sc->ce[i].best_seg_class = info->best_seg_class[i];
+    sc->ce[i].best_mw = info->best_mw[i];
+  }
+  /* do not leave pointers into this dying stack frame behind in info */
+  info->seg_border = NULL;
+  info->best_seg_class = NULL;
+  info->best_mw = NULL;
+}
+
+/* The segment was expanded, so learn that */
+static void
+proc_expanded_segment(struct splitter_context *sc,
+		      int from, int len)
+{
+  xstr before, after;
+  int idx, count;
+
+  /* both strings start at the same character; only the lengths differ */
+  before.str = sc->ce[from].c;
+  before.len = sc->ce[from].initial_seg_len;
+  after.str = sc->ce[from].c;
+  after.len = len;
+
+  if (anthy_select_section("EXPANDPAIR", 1) == -1 ||
+      anthy_select_row(&before, 1) == -1) {
+    return ;
+  }
+  count = anthy_get_nr_values();
+  for (idx = 0; idx < count; idx++) {
+    xstr *stored = anthy_get_nth_xstr(idx);
+    if (stored == NULL || anthy_xstrcmp(stored, &after) == 0) {
+      /* already there (or the value could not be read) */
+      return ;
+    }
+  }
+  /* not recorded yet: store it as the next value, then truncate the section */
+  anthy_set_nth_xstr(count, &after);
+  anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
+}
+
+/* Learn from the committed segmentation: a segment that was stretched over
+ * its right-hand neighbour is recorded through proc_expanded_segment().
+ *  mw[i]      metaword of committed segment i (may be NULL)
+ *  seg_len[i] length of committed segment i, counted in sc->ce entries
+ */
+void
+anthy_commit_border(struct splitter_context *sc, int nr_segments,
+		    struct meta_word **mw, int *seg_len)
+{
+  int seg, committed;
+  int start = 0;
+
+  /* visit every committed segment, looking for ones that were stretched;
+     start always holds the character index where segment seg begins */
+  for (seg = 0; seg < nr_segments; start += committed, seg++) {
+    int first, next, mw_len;
+
+    committed = seg_len[seg];
+    first = sc->ce[start].initial_seg_len;
+    if (first == 0 || start + first == sc->char_count) {
+      /* that is not a border */
+      continue;
+    }
+    next = sc->ce[start + first].initial_seg_len;
+    if (first + next > committed) {
+      /* not expanded far enough to take in the neighbouring segment */
+      continue;
+    }
+
+    mw_len = mw[seg] ? mw[seg]->len : 0;
+    if (mw_len <= first) {
+      continue;
+    }
+    /* a segment stretched over its right-hand neighbour was committed */
+    proc_expanded_segment(sc, start, mw_len);
+  }
+}
+
+int
+anthy_splitter_debug_flags(void)
+{
+  return splitter_debug_flags; /* the bits assembled by anthy_init_splitter() */
+}
+
+void
+anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
+{
+  sc->is_reverse = is_reverse;
+  alloc_char_ent(xs, sc);
+  alloc_info_cache(sc);
+  /* Check every substring and enumerate the segment candidates:
+     the word_lists are built first (while holding the dictionary
+     lock), and the metawords are built after that. */
+  anthy_lock_dic();
+  anthy_make_word_list_all(sc);
+  anthy_unlock_dic();
+  anthy_make_metaword_all(sc);
+}
+
+/* Frees what the context owns and clears the pointers it freed */
+void
+anthy_release_split_context(struct splitter_context *sc)
+{
+  if (sc->word_split_info != NULL) {
+    release_info_cache(sc);
+    sc->word_split_info = NULL;
+  }
+  /* free(NULL) is a no-op, so ce needs no guard */
+  free(sc->ce);
+  sc->ce = NULL;
+}
+
+/** Initialize the splitter as a whole.
+ *  Returns -1 if anthy_init_depword_tab() fails, 0 otherwise. */
+int
+anthy_init_splitter(void)
+{
+  /* debug print settings come from the environment */
+  const char *enable = getenv("ANTHY_ENABLE_DEBUG_PRINT");
+  const char *disable = getenv("ANTHY_DISABLE_DEBUG_PRINT");
+  const char *spec = NULL;
+  const char *p;
+
+  splitter_debug_flags = SPLITTER_DEBUG_NONE;
+  if (disable == NULL && enable != NULL && enable[0] != '\0') {
+    spec = getenv("ANTHY_SPLITTER_PRINT");
+  }
+  /* every known letter in the spec switches on one debug flag */
+  for (p = spec; p != NULL && *p != '\0'; p++) {
+    switch (*p) {
+    case 'w': splitter_debug_flags |= SPLITTER_DEBUG_WL; break;
+    case 'm': splitter_debug_flags |= SPLITTER_DEBUG_MW; break;
+    case 'l': splitter_debug_flags |= SPLITTER_DEBUG_LN; break;
+    case 'i': splitter_debug_flags |= SPLITTER_DEBUG_ID; break;
+    case 'c': splitter_debug_flags |= SPLITTER_DEBUG_CAND; break;
+    default: break;
+    }
+  }
+
+  /* initialize the dependent word graph */
+  if (anthy_init_depword_tab() != 0) {
+    anthy_log(0, "Failed to init dependent word table.\n");
+    return -1;
+  }
+
+  /* look up the word types the splitter refers to by name */
+  anthy_wtype_noun = anthy_init_wtype_by_name("名詞35");
+  anthy_wtype_name_noun = anthy_init_wtype_by_name("人名");
+  anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞");
+  anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語");
+  anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形");
+  anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語");
+  anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞");
+  anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞");
+  anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞");
+  anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞");
+  anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞");
+  anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1");
+  anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10");
+  return 0;
+}
+
+void
+anthy_quit_splitter(void)
+{
+  anthy_quit_depword_tab(); /* presumably undoes anthy_init_depword_tab(); verify */
+}