summaryrefslogtreecommitdiff
path: root/src-splitter/splitter.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-splitter/splitter.c')
-rw-r--r--src-splitter/splitter.c329
1 files changed, 329 insertions, 0 deletions
diff --git a/src-splitter/splitter.c b/src-splitter/splitter.c
new file mode 100644
index 0000000..75ace2b
--- /dev/null
+++ b/src-splitter/splitter.c
@@ -0,0 +1,329 @@
+/*
+ * 文を文節にsplitするsplitter
+ *
+ * 文節の境界を検出する
+ * anthy_init_split_context() 分割用のコンテキストを作って
+ * anthy_mark_border() 分割をして
+ * anthy_release_split_context() コンテキストを解放する
+ *
+ * anthy_commit_border() コミットされた内容に対して学習をする
+ *
+ * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
+ *
+ * Copyright (C) 2004 YOSHIDA Yuichi
+ * Copyright (C) 2000-2004 TABATA Yusuke
+ * Copyright (C) 2000-2001 UGAWA Tomoharu
+ *
+ * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include <anthy/alloc.h>
+#include <anthy/record.h>
+#include <anthy/splitter.h>
+#include <anthy/logger.h>
+#include "wordborder.h"
+
+#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000
+
+static int splitter_debug_flags;
+
+/**/
+wtype_t anthy_wtype_noun;
+wtype_t anthy_wtype_name_noun;
+wtype_t anthy_wtype_num_noun;
+wtype_t anthy_wtype_prefix;
+wtype_t anthy_wtype_num_prefix;
+wtype_t anthy_wtype_num_postfix;
+wtype_t anthy_wtype_name_postfix;
+wtype_t anthy_wtype_sv_postfix;
+wtype_t anthy_wtype_a_tail_of_v_renyou;
+wtype_t anthy_wtype_v_renyou;
+wtype_t anthy_wtype_noun_tail;/* いれ「たて」とか */
+wtype_t anthy_wtype_n1;
+wtype_t anthy_wtype_n10;
+
+
+/** make_word_cacheで作成した文節情報を解放する
+ */
+static void
+release_info_cache(struct splitter_context *sc)
+{
+ struct word_split_info_cache *info = sc->word_split_info;
+
+ anthy_free_allocator(info->MwAllocator);
+ anthy_free_allocator(info->WlAllocator);
+ free(info->cnode);
+ free(info->seq_len);
+ free(info->rev_seq_len);
+ free(info);
+}
+
+static void
+metaword_dtor(void *p)
+{
+ struct meta_word *mw = (struct meta_word*)p;
+ if (mw->cand_hint.str) {
+ free(mw->cand_hint.str);
+ }
+}
+
+
+static void
+alloc_char_ent(xstr *xs, struct splitter_context *sc)
+{
+ int i;
+
+ sc->char_count = xs->len;
+ sc->ce = (struct char_ent*)
+ malloc(sizeof(struct char_ent)*(xs->len + 1));
+ for (i = 0; i <= xs->len; i++) {
+ sc->ce[i].c = &xs->str[i];
+ sc->ce[i].seg_border = 0;
+ sc->ce[i].initial_seg_len = 0;
+ sc->ce[i].best_seg_class = SEG_HEAD;
+ sc->ce[i].best_mw = NULL;
+ }
+
+ /* 左右両端は文節の境界である */
+ sc->ce[0].seg_border = 1;
+ sc->ce[xs->len].seg_border = 1;
+}
+
+/* ここで確保した内容はrelease_info_cacheで解放される
+ */
+static void
+alloc_info_cache(struct splitter_context *sc)
+{
+ int i;
+ struct word_split_info_cache *info;
+
+ /* キャッシュのデータを確保 */
+ sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
+ info = sc->word_split_info;
+ info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
+ info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
+ info->cnode =
+ malloc(sizeof(struct char_node) * (sc->char_count + 1));
+
+ info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
+ info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));
+
+ /* 各文字インデックスに対して初期化を行う */
+ for (i = 0; i <= sc->char_count; i++) {
+ info->seq_len[i] = 0;
+ info->rev_seq_len[i] = 0;
+ info->cnode[i].wl = NULL;
+ info->cnode[i].mw = NULL;
+ info->cnode[i].max_len = 0;
+ }
+}
+
+/** 外から呼び出されるwordsplitterのトップレベルの関数 */
+void
+anthy_mark_border(struct splitter_context *sc,
+ int from, int from2, int to)
+{
+ int i;
+ struct word_split_info_cache *info;
+
+ /* sanity check */
+ if ((to - from) <= 0) {
+ return ;
+ }
+
+ /* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */
+ info = sc->word_split_info;
+ info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
+ info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
+ info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
+ for (i = 0; i < sc->char_count + 1; ++i) {
+ info->seg_border[i] = sc->ce[i].seg_border;
+ info->best_seg_class[i] = sc->ce[i].best_seg_class;
+ info->best_mw[i] = sc->ce[i].best_mw;
+ }
+
+ /* 境界を決定する */
+ anthy_eval_border(sc, from, from2, to);
+
+ for (i = from; i < to; ++i) {
+ sc->ce[i].seg_border = info->seg_border[i];
+ sc->ce[i].best_seg_class = info->best_seg_class[i];
+ sc->ce[i].best_mw = info->best_mw[i];
+ }
+}
+
+/* 文節が拡大されたので,それを学習する */
+static void
+proc_expanded_segment(struct splitter_context *sc,
+ int from, int len)
+{
+ int initial_len = sc->ce[from].initial_seg_len;
+ int i, nr;
+ xstr from_xs, to_xs, *xs;
+
+ from_xs.str = sc->ce[from].c;
+ from_xs.len = initial_len;
+ to_xs.str = sc->ce[from].c;
+ to_xs.len = len;
+ if (anthy_select_section("EXPANDPAIR", 1) == -1) {
+ return ;
+ }
+ if (anthy_select_row(&from_xs, 1) == -1) {
+ return ;
+ }
+ nr = anthy_get_nr_values();
+ for (i = 0; i < nr; i ++) {
+ xs = anthy_get_nth_xstr(i);
+ if (!xs || !anthy_xstrcmp(xs, &to_xs)) {
+ /* 既にある */
+ return ;
+ }
+ }
+ anthy_set_nth_xstr(nr, &to_xs);
+ anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
+}
+
+/* 文節のマージと語尾を学習する */
+void
+anthy_commit_border(struct splitter_context *sc, int nr_segments,
+ struct meta_word **mw, int *seg_len)
+{
+ int i, from = 0;
+
+ /* 伸ばした文節 */
+ for (i = 0; i < nr_segments; i++) {
+ /* それぞれの文節に対して */
+
+ int len = seg_len[i];
+ int initial_len = sc->ce[from].initial_seg_len;
+ int real_len = 0;
+ int l2;
+
+ if (!initial_len || from + initial_len == sc->char_count) {
+ /* そこは境界ではない */
+ goto tail;
+ }
+ l2 = sc->ce[from + initial_len].initial_seg_len;
+ if (initial_len + l2 > len) {
+ /* 隣の文節を含むほど拡大されたわけではない */
+ goto tail;
+ }
+ if (mw[i]) {
+ real_len = mw[i]->len;
+ }
+ if (real_len <= initial_len) {
+ goto tail;
+ }
+ /* 右の文節を含む長さに拡張された文節がコミットされた */
+ proc_expanded_segment(sc, from, real_len);
+ tail:
+ from += len;
+ }
+}
+
+int
+anthy_splitter_debug_flags(void)
+{
+ return splitter_debug_flags;
+}
+
+void
+anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
+{
+ alloc_char_ent(xs, sc);
+ alloc_info_cache(sc);
+ sc->is_reverse = is_reverse;
+ /* 全ての部分文字列をチェックして、文節の候補を列挙する
+ word_listを構成してからmetawordを構成する */
+ anthy_lock_dic();
+ anthy_make_word_list_all(sc);
+ anthy_unlock_dic();
+ anthy_make_metaword_all(sc);
+
+}
+
+void
+anthy_release_split_context(struct splitter_context *sc)
+{
+ if (sc->word_split_info) {
+ release_info_cache(sc);
+ sc->word_split_info = 0;
+ }
+ if (sc->ce) {
+ free(sc->ce);
+ sc->ce = 0;
+ }
+}
+
+/** splitter全体の初期化を行う */
+int
+anthy_init_splitter(void)
+{
+ /* デバッグプリントの設定 */
+ char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
+ char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
+ splitter_debug_flags = SPLITTER_DEBUG_NONE;
+ if (!dis && en && strlen(en)) {
+ char *fs = getenv("ANTHY_SPLITTER_PRINT");
+ if (fs) {
+ if (strchr(fs, 'w')) {
+ splitter_debug_flags |= SPLITTER_DEBUG_WL;
+ }
+ if (strchr(fs, 'm')) {
+ splitter_debug_flags |= SPLITTER_DEBUG_MW;
+ }
+ if (strchr(fs, 'l')) {
+ splitter_debug_flags |= SPLITTER_DEBUG_LN;
+ }
+ if (strchr(fs, 'i')) {
+ splitter_debug_flags |= SPLITTER_DEBUG_ID;
+ }
+ if (strchr(fs, 'c')) {
+ splitter_debug_flags |= SPLITTER_DEBUG_CAND;
+ }
+ }
+ }
+ /* 付属語グラフの初期化 */
+ if (anthy_init_depword_tab()) {
+ anthy_log(0, "Failed to init dependent word table.\n");
+ return -1;
+ }
+ /**/
+ anthy_wtype_noun = anthy_init_wtype_by_name("名詞35");
+ anthy_wtype_name_noun = anthy_init_wtype_by_name("人名");
+ anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞");
+ anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語");
+ anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形");
+ anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語");
+ anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞");
+ anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞");
+ anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞");
+ anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞");
+ anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞");
+ anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1");
+ anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10");
+ return 0;
+}
+
+void
+anthy_quit_splitter(void)
+{
+ anthy_quit_depword_tab();
+}