diff options
Diffstat (limited to 'src-splitter')
-rw-r--r-- | src-splitter/Makefile.am | 10 | ||||
-rw-r--r-- | src-splitter/Makefile.in | 441 | ||||
-rw-r--r-- | src-splitter/compose.c | 479 | ||||
-rw-r--r-- | src-splitter/depgraph.c | 319 | ||||
-rw-r--r-- | src-splitter/evalborder.c | 187 | ||||
-rw-r--r-- | src-splitter/lattice.c | 541 | ||||
-rw-r--r-- | src-splitter/metaword.c | 967 | ||||
-rw-r--r-- | src-splitter/segclass.c | 130 | ||||
-rw-r--r-- | src-splitter/splitter.c | 329 | ||||
-rw-r--r-- | src-splitter/wordborder.h | 210 | ||||
-rw-r--r-- | src-splitter/wordlist.c | 584 |
11 files changed, 4197 insertions, 0 deletions
diff --git a/src-splitter/Makefile.am b/src-splitter/Makefile.am new file mode 100644 index 0000000..a3e8262 --- /dev/null +++ b/src-splitter/Makefile.am @@ -0,0 +1,10 @@ +## $Id: Makefile.am,v 1.2 2001/12/30 15:06:36 tak Exp $ + +EXTRAD_DIST = + +INCLUDES = -I$(top_srcdir)/ + +noinst_LTLIBRARIES = libsplit.la +libsplit_la_SOURCES = wordlist.c metaword.c depgraph.c\ + splitter.c evalborder.c compose.c\ + wordborder.h lattice.c segclass.c diff --git a/src-splitter/Makefile.in b/src-splitter/Makefile.in new file mode 100644 index 0000000..81cc061 --- /dev/null +++ b/src-splitter/Makefile.in @@ -0,0 +1,441 @@ +# Makefile.in generated by automake 1.9.6 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. 
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src-splitter +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libsplit_la_LIBADD = +am_libsplit_la_OBJECTS = wordlist.lo metaword.lo depgraph.lo \ + splitter.lo evalborder.lo compose.lo lattice.lo segclass.lo +libsplit_la_OBJECTS = $(am_libsplit_la_OBJECTS) +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +SOURCES = $(libsplit_la_SOURCES) +DIST_SOURCES = $(libsplit_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ELISP_FALSE = @ELISP_FALSE@ +ELISP_TRUE = @ELISP_TRUE@ +EMACS = @EMACS@ +EMACSLOADPATH = @EMACSLOADPATH@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +GREP = @GREP@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +ac_ct_CC 
= @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +lispdir = @lispdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +EXTRAD_DIST = +INCLUDES = -I$(top_srcdir)/ +noinst_LTLIBRARIES = libsplit.la +libsplit_la_SOURCES = wordlist.c metaword.c depgraph.c\ + splitter.c evalborder.c compose.c\ + wordborder.h lattice.c segclass.c + +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src-splitter/Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --gnu src-splitter/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in 
$(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libsplit.la: $(libsplit_la_OBJECTS) $(libsplit_la_DEPENDENCIES) + $(LINK) $(libsplit_la_LDFLAGS) $(libsplit_la_OBJECTS) $(libsplit_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/compose.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/depgraph.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/evalborder.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lattice.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/metaword.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/segclass.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splitter.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wordlist.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" 
"$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ if $(LTCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Plo"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + 
if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkdir_p) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + 
+install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-info-am + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-exec \ + install-exec-am install-info install-info-am install-man \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am 
ps ps-am tags uninstall uninstall-am \ + uninstall-info-am + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src-splitter/compose.c b/src-splitter/compose.c new file mode 100644 index 0000000..ddc2d47 --- /dev/null +++ b/src-splitter/compose.c @@ -0,0 +1,479 @@ +/* + * 文節に対して候補のリストを生成する。 + * make_candidates()がcontext管理部から呼ばれる。 + * + * 候補の生成は次の方法で行う + * (1)splitterが割り当てた品詞に対してproc_splitter_info() + * から候補を生成する + * (2)ひらがなのみとカタカナのみの候補を生成する + * (3)最後の文字を助詞と解釈して無理矢理候補を生成する + */ +/* + * Funded by IPA未踏ソフトウェア創造事業 2001 9/30 + * Copyright (C) 2000-2005 TABATA Yusuke + * Copyright (C) 2004-2005 YOSHIDA Yuichi + * Copyright (C) 2002 UGAWA Tomoharu + * + * $Id: compose.c,v 1.25 2005/08/19 04:20:25 oxy Exp $ + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <anthy/dic.h> +#include <anthy/splitter.h> +#include <anthy/segment.h> +#include "wordborder.h" + + +static struct cand_ent * +alloc_cand_ent(void) +{ + struct cand_ent *ce; + ce = (struct cand_ent *)malloc(sizeof(struct cand_ent)); + ce->nr_words = 0; + ce->elm = NULL; + ce->mw = NULL; + ce->core_elm_index = -1; + ce->dep_word_hash = 0; + return ce; +} + +/* + * 候補を複製する + */ +static struct cand_ent * +dup_candidate(struct cand_ent *ce) +{ + struct cand_ent *ce_new; + int i; + ce_new = alloc_cand_ent(); + ce_new->nr_words = ce->nr_words; + ce_new->str.len = ce->str.len; + ce_new->str.str = anthy_xstr_dup_str(&ce->str); + ce_new->elm = malloc(sizeof(struct cand_elm)*ce->nr_words); + ce_new->flag = ce->flag; + ce_new->core_elm_index = ce->core_elm_index; + ce_new->mw = ce->mw; + ce_new->score = ce->score; + ce_new->dep_word_hash = ce->dep_word_hash; + + for (i = 0 ; i < ce->nr_words ; i++) { + ce_new->elm[i] = ce->elm[i]; + } + return ce_new; +} + +/** 文節に候補を追加する */ +static void +push_back_candidate(struct seg_ent *seg, struct cand_ent *ce) +{ + /* seg_entに候補ceを追加 */ + seg->nr_cands++; + seg->cands = (struct cand_ent **) + realloc(seg->cands, sizeof(struct cand_ent *) * seg->nr_cands); + seg->cands[seg->nr_cands - 1] = ce; + /**/ + if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) { + anthy_print_candidate(ce); + printf("\n"); + } +} + +static void +push_back_guessed_candidate(struct seg_ent *seg) +{ + xchar xc; + xstr *xs; + struct cand_ent *ce; + if (seg->str.len < 2) { + return ; + } + /* 最後の文字は助詞か? 
*/ + xc = seg->str.str[seg->str.len - 1]; + if (!(anthy_get_xchar_type(xc) & XCT_DEP)) { + return ; + } + /* 最後の文字以外をカタカナにしてみる */ + ce = alloc_cand_ent(); + xs = anthy_xstr_hira_to_kata(&seg->str); + xs->str[xs->len-1] = xc; + ce->str.str = anthy_xstr_dup_str(xs); + ce->str.len = xs->len; + ce->flag = CEF_GUESS; + anthy_free_xstr(xs); + push_back_candidate(seg, ce); +} + +/** 再帰で1単語ずつ候補を割当てていく */ +static int +enum_candidates(struct seg_ent *seg, + struct cand_ent *ce, + int from, int n) +{ + int i, p; + struct cand_ent *cand; + int nr_cands = 0; + int pos; + + if (n == ce->mw->nr_parts) { + /* 完成形 */ + /* 文節後部の解析しなかった部分を候補文字列に追加 */ + xstr tail; + tail.len = seg->len - from; + tail.str = &seg->str.str[from]; + anthy_xstrcat(&ce->str, &tail); + if (ce->str.str && (0 < ce->str.len)) { /* 辞書もしくは学習データが壊れていた時の対策 */ + push_back_candidate(seg, dup_candidate(ce)); + } + return 1; + } + + p = anthy_get_nr_dic_ents(ce->elm[n].se, &ce->elm[n].str); + + /* 品詞が割当てられているので、その品詞にマッチするものを割当てる */ + for (i = 0; i < p; i++) { + wtype_t wt; + if (anthy_get_nth_dic_ent_is_compound(ce->elm[n].se, i)) { + continue; + } + anthy_get_nth_dic_ent_wtype(ce->elm[n].se, &ce->elm[n].str, i, &wt); + + ce->elm[n].wt = anthy_get_wtype_with_ct(ce->elm[n].wt, CT_NONE); + if (anthy_wtype_include(ce->elm[n].wt, wt)) { + xstr word, yomi; + + yomi.len = ce->elm[n].str.len; + yomi.str = &seg->str.str[from]; + cand = dup_candidate(ce); + anthy_get_nth_dic_ent_str(cand->elm[n].se, + &yomi, i, &word); + cand->elm[n].nth = i; + cand->elm[n].id = anthy_xstr_hash(&word); + + /* 単語の本体 */ + anthy_xstrcat(&cand->str, &word); + free(word.str); + /* 自分を再帰呼び出しして続きを割り当てる */ + nr_cands += enum_candidates(seg, cand, + from + yomi.len, + n+1); + anthy_release_cand_ent(cand); + } + } + + /* 品詞不定の場合には未変換で次の単語へ行く */ + pos = anthy_wtype_get_pos(ce->elm[n].wt); + if (nr_cands == 0 || pos == POS_INVAL || pos == POS_NONE) { + xstr xs; + xs.len = ce->elm[n].str.len; + xs.str = &seg->str.str[from]; + cand = dup_candidate(ce); + 
cand->elm[n].nth = -1; + cand->elm[n].id = -1; + anthy_xstrcat(&cand->str, &xs); + nr_cands = enum_candidates(seg,cand, + from + xs.len, + n + 1); + anthy_release_cand_ent(cand); + return nr_cands; + } + + return nr_cands; +} + +/** + * 文節全体を含む一単語(単漢字を含む)の候補を生成する + */ +static void +push_back_singleword_candidate(struct seg_ent *seg, + int is_reverse) +{ + seq_ent_t se; + struct cand_ent *ce; + wtype_t wt; + int i, n; + xstr xs; + + se = anthy_get_seq_ent_from_xstr(&seg->str, is_reverse); + n = anthy_get_nr_dic_ents(se, &seg->str); + /* 辞書の各エントリに対して */ + for (i = 0; i < n; i++) { + int ct; + if (anthy_get_nth_dic_ent_is_compound(se, i)) { + continue; + } + /* 品詞を取り出して */ + anthy_get_nth_dic_ent_wtype(se, &seg->str, i, &wt); + ct = anthy_wtype_get_ct(wt); + /* 終止形か活用しないものの原形なら */ + if (ct == CT_SYUSI || ct == CT_NONE) { + ce = alloc_cand_ent(); + anthy_get_nth_dic_ent_str(se,&seg->str, i, &xs); + ce->str.str = xs.str; + ce->str.len = xs.len; + ce->flag = CEF_SINGLEWORD; + push_back_candidate(seg, ce); + } + } +} + +static void +push_back_noconv_candidate(struct seg_ent *seg) +{ + /* 無変換で片仮名になる候補と平仮名のみになる候補を追加 */ + struct cand_ent *ce; + xstr *xs; + + /* ひらがなのみ */ + ce = alloc_cand_ent(); + ce->str.str = anthy_xstr_dup_str(&seg->str); + ce->str.len = seg->str.len; + ce->flag = CEF_HIRAGANA; + push_back_candidate(seg, ce); + + /* 次にカタカナ */ + ce = alloc_cand_ent(); + xs = anthy_xstr_hira_to_kata(&seg->str); + ce->str.str = anthy_xstr_dup_str(xs); + ce->str.len = xs->len; + ce->flag = CEF_KATAKANA; + anthy_free_xstr(xs); + push_back_candidate(seg, ce); + + /* 記号のみの文節 */ + xs = anthy_conv_half_wide(&seg->str); + if (xs) { + ce = alloc_cand_ent(); + ce->str.str = anthy_xstr_dup_str(xs); + ce->str.len = xs->len; + ce->flag = CEF_NONE; + anthy_free_xstr(xs); + push_back_candidate(seg, ce); + } +} + +/* word_listの要素part_infoの配列からcand_elmの配列を作る */ +static void +make_cand_elem_from_word_list(struct seg_ent *se, + struct cand_ent *ce, + struct word_list *wl, + int index, + int 
is_reverse) +{ + int i; + int from = wl->from - se->from; + + for (i = 0; i < NR_PARTS; ++i) { + struct part_info *part = &wl->part[i]; + xstr core_xs; + if (part->len == 0) { + /* 長さの無いpartは無視する */ + continue; + } + if (i == PART_CORE) { + ce->core_elm_index = i + index; + } + core_xs.str = &se->str.str[from]; + core_xs.len = part->len; + if (i == PART_DEPWORD) { + ce->dep_word_hash = anthy_dep_word_hash(&core_xs); + } + ce->elm[i + index].se = anthy_get_seq_ent_from_xstr(&core_xs, is_reverse); + ce->elm[i + index].str.str = core_xs.str; + ce->elm[i + index].str.len = core_xs.len; + ce->elm[i + index].wt = part->wt; + ce->elm[i + index].ratio = RATIO_BASE * wl->len; + from += part->len; + } +} + + +/** まずwordlistを持つmetawordからmeta_wordを取り出す */ +static void +make_candidate_from_simple_metaword(struct seg_ent *se, + struct meta_word *mw, + struct meta_word *top_mw, + int is_reverse) +{ + /* + * 各単語の品詞が決定された状態でコミットされる。 + */ + struct cand_ent *ce; + + /* 複数(1も含む)の単語で構成される文節に単語を割当てていく */ + ce = alloc_cand_ent(); + ce->nr_words = mw->nr_parts; + ce->str.str = NULL; + ce->str.len = 0; + ce->elm = calloc(sizeof(struct cand_elm),ce->nr_words); + ce->mw = mw; + ce->score = 0; + + /* 接頭辞, 自立語部, 接尾辞, 付属語 */ + make_cand_elem_from_word_list(se, ce, mw->wl, 0, is_reverse); + + /* WRAPされていたらGUESSと同じ扱いにして点数を下げる */ + if (anthy_metaword_type_tab[top_mw->type].status != MW_STATUS_WRAPPED) { + ce->flag = (se->best_mw == mw) ? 
CEF_BEST : CEF_NONE; + } else { + ce->flag = CEF_GUESS; + } + + enum_candidates(se, ce, 0, 0); + anthy_release_cand_ent(ce); +} + +/** combinedなmetawordは二つの語を合体して一つの語として出す */ +static void +make_candidate_from_combined_metaword(struct seg_ent *se, + struct meta_word *mw, + struct meta_word *top_mw, + int is_reverse) +{ + /* + * 各単語の品詞が決定された状態でコミットされる。 + */ + struct cand_ent *ce; + + /* 複数(1も含む)の単語で構成される文節に単語を割当てていく */ + ce = alloc_cand_ent(); + ce->nr_words = mw->nr_parts; + ce->score = 0; + ce->str.str = NULL; + ce->str.len = 0; + ce->elm = calloc(sizeof(struct cand_elm),ce->nr_words); + ce->mw = top_mw; + + /* 接頭辞, 自立語部, 接尾辞, 付属語 */ + make_cand_elem_from_word_list(se, ce, mw->mw1->wl, 0, is_reverse); + if (mw->mw2) { + make_cand_elem_from_word_list(se, ce, mw->mw2->mw1->wl, NR_PARTS, is_reverse); + } + + /* WRAPされていたらGUESSと同じ扱いにして点数を下げる */ + if (anthy_metaword_type_tab[top_mw->type].status != MW_STATUS_WRAPPED) { + ce->flag = (se->best_mw == mw) ? CEF_BEST : CEF_NONE; + } else { + ce->flag = CEF_GUESS; + } + + enum_candidates(se, ce, 0, 0); + anthy_release_cand_ent(ce); +} + + +/** splitterの情報を利用して候補を生成する + */ +static void +proc_splitter_info(struct seg_ent *se, + struct meta_word *mw, + /* topとはtreeのトップ */ + struct meta_word *top_mw, + int is_reverse) +{ + enum mw_status st; + if (!mw) return; + + /* まずwordlistを持つmetawordの場合 */ + if (mw->wl && mw->wl->len) { + make_candidate_from_simple_metaword(se, mw, top_mw, is_reverse); + return; + } + + st = anthy_metaword_type_tab[mw->type].status; + switch (st) { + case MW_STATUS_WRAPPED: + /* wrapされたものの情報を取り出す */ + proc_splitter_info(se, mw->mw1, top_mw, is_reverse); + break; + case MW_STATUS_COMBINED: + make_candidate_from_combined_metaword(se, mw, top_mw, is_reverse); + break; + case MW_STATUS_COMPOUND: + /* 連文節の葉 */ + { + struct cand_ent *ce; + ce = alloc_cand_ent(); + ce->str.str = anthy_xstr_dup_str(&mw->cand_hint); + ce->str.len = mw->cand_hint.len; + ce->flag = CEF_COMPOUND; + ce->mw = top_mw; + 
push_back_candidate(se, ce); + } + break; + case MW_STATUS_COMPOUND_PART: + /* 連文節の個々の文節を結合して一つの文節としてみたもの */ + /* BREAK THROUGH */ + case MW_STATUS_OCHAIRE: + { + /* metawordを持たない候補文字列が + 直接に指定された */ + struct cand_ent *ce; + ce = alloc_cand_ent(); + ce->str.str = anthy_xstr_dup_str(&mw->cand_hint); + ce->str.len = mw->cand_hint.len; + ce->mw = top_mw; + ce->flag = (st == MW_STATUS_OCHAIRE) ? CEF_OCHAIRE : CEF_COMPOUND_PART; + + if (mw->len < se->len) { + /* metawordでカバーされてない領域の文字列を付ける */ + xstr xs; + xs.str = &se->str.str[mw->len]; + xs.len = se->len - mw->len; + anthy_xstrcat(&ce->str ,&xs); + } + push_back_candidate(se, ce); + } + break; + case MW_STATUS_NONE: + break; + default: + break; + } +} + +/** context.cから呼出されるもっとも大物 + * 一つ以上の候補を必ず生成する + */ +void +anthy_do_make_candidates(struct splitter_context *sc, + struct seg_ent *se, int is_reverse) +{ + int i; + + /* metawordから候補を生成する */ + for (i = 0; i < se->nr_metaword; i++) { + struct meta_word *mw = se->mw_array[i]; + if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) { + anthy_print_metaword(sc, mw); + } + proc_splitter_info(se, mw, mw, is_reverse); + } + if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) { + printf("#done\n"); + } + /* 単漢字などの候補 */ + push_back_singleword_candidate(se, is_reverse); + + /* ひらがな、カタカナの無変換エントリを作る */ + push_back_noconv_candidate(se); + + /* 候補が二つしか無いときは最後が助詞で残りが平仮名の候補を作れるか試す */ + push_back_guessed_candidate(se); +} diff --git a/src-splitter/depgraph.c b/src-splitter/depgraph.c new file mode 100644 index 0000000..004ceaf --- /dev/null +++ b/src-splitter/depgraph.c @@ -0,0 +1,319 @@ +/* + * 文節の自立語部(接頭辞、接尾辞含む)に続く + * 助詞、助動詞などの付属語のパターンをたどる。 + * パターンはグラフとして設定ファイルに用意する。 + * + * + * +------+ + * | | + * |branch+--cond--+--transition--> node + * | | +--transition--> node + * | NODE | + * | | + * |branch+--cond-----transition--> node + * | | + * |branch+--cond-----transition--> node + * | | + * +------+ + * + * Copyright (C) 2000-2007 TABATA Yusuke + * Copyright (C) 2006 YOSHIDA 
Yuichi + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "config.h" +#include <anthy/anthy.h> + +#include <anthy/conf.h> +#include <anthy/ruleparser.h> +#include <anthy/xstr.h> +#include <anthy/filemap.h> +#include <anthy/logger.h> +#include <anthy/segclass.h> +#include <anthy/splitter.h> +#include <anthy/wtype.h> +#include <anthy/diclib.h> +#include "wordborder.h" + +/* 遷移グラフ */ +static struct dep_dic ddic; + + +static void +match_branch(struct splitter_context *sc, + struct word_list *tmpl, + xstr *xs, struct dep_branch *db); +static void +match_nodes(struct splitter_context *sc, + struct word_list *wl, + xstr follow_str, int node); + + +static int +anthy_xstrcmp_with_ondisk(xstr *xs, + ondisk_xstr *dxs) +{ + int *d = (int *)dxs; + int len = anthy_dic_ntohl(d[0]); + int i; + xchar c; + if (len != xs->len) { + return 1; + } + d++; + for (i = 0; i < len; i++) { + c = anthy_dic_ntohl(d[i]); + if (xs->str[i] != c) { + return 1; + } + } + return 0; +} + +static ondisk_xstr * +anthy_next_ondisk_xstr(ondisk_xstr *dxs) +{ + int *d = (int *)dxs; + int len = anthy_dic_ntohl(d[0]); + return &d[len+1]; +} + +static int +anthy_ondisk_xstr_len(ondisk_xstr *dxs) +{ + int *d = (int *)dxs; + return anthy_dic_ntohl(d[0]); +} + +/* + * 各ノードにおける遷移条件をテストする + * + * wl 自立語部のword_list + * follow_str 自立語部以降の文字列 + * node ルールの番号 + */ +static void +match_nodes(struct splitter_context *sc, + struct word_list *wl, + xstr follow_str, int node) +{ + struct dep_node *dn = &ddic.nodes[node]; + struct dep_branch *db; + int i,j; + + /* 各ルールの */ + for (i = 0; i < dn->nr_branch; i++) { + ondisk_xstr *dep_xs; + db = &dn->branch[i]; + dep_xs = db->xstrs; + + /* 各遷移条件 */ + for (j = 0; j < db->nr_strs; + j++, dep_xs = anthy_next_ondisk_xstr(dep_xs)) { + xstr cond_xs; + /* 付属語の方が遷移条件より長いことが必要 */ + if (follow_str.len < anthy_ondisk_xstr_len(dep_xs)) { + continue; + } + /* 遷移条件の部分を切り出す */ + cond_xs.str = follow_str.str; + cond_xs.len = anthy_ondisk_xstr_len(dep_xs); + + /* 
遷移条件と比較する */ + if (!anthy_xstrcmp_with_ondisk(&cond_xs, dep_xs)) { + /* 遷移条件にmatchした */ + struct word_list new_wl = *wl; + struct part_info *part = &new_wl.part[PART_DEPWORD]; + xstr new_follow; + + part->len += cond_xs.len; + new_follow.str = &follow_str.str[cond_xs.len]; + new_follow.len = follow_str.len - cond_xs.len; + /* 遷移してみる */ + match_branch(sc, &new_wl, &new_follow, db); + } + } + } +} + +/* + * 各遷移を実行してみる + * + * tmpl ここまでに構成したword_list + * xs 残りの文字列 + * db 現在調査中のbranch + */ +static void +match_branch(struct splitter_context *sc, + struct word_list *tmpl, + xstr *xs, struct dep_branch *db) +{ + struct part_info *part = &tmpl->part[PART_DEPWORD]; + int i; + + /* 遷移先を順にトライする */ + for (i = 0; i < db->nr_transitions; i++) { + /**/ + int head_pos = tmpl->head_pos; /* 品詞の情報 */ + int features = tmpl->mw_features; + enum dep_class dc = part->dc; + /**/ + struct dep_transition *transition = &db->transition[i]; + + tmpl->tail_ct = anthy_dic_ntohl(transition->ct); + /* 遷移の活用形と品詞 */ + if (anthy_dic_ntohl(transition->dc) != DEP_NONE) { + part->dc = anthy_dic_ntohl(transition->dc); + } + /* 名詞化する動詞等で品詞名を上書き */ + if (anthy_dic_ntohl(transition->head_pos) != POS_NONE) { + tmpl->head_pos = anthy_dic_ntohl(transition->head_pos); + } + if (transition->weak) { + tmpl->mw_features |= MW_FEATURE_WEAK_CONN; + } + + /* 遷移か終端か */ + if (anthy_dic_ntohl(transition->next_node)) { + /* 遷移 */ + match_nodes(sc, tmpl, *xs, anthy_dic_ntohl(transition->next_node)); + } else { + struct word_list *wl; + + /* + * 終端ノードに到達したので、 + * それをword_listとしてコミット + */ + wl = anthy_alloc_word_list(sc); + *wl = *tmpl; + wl->len += part->len; + + /**/ + anthy_commit_word_list(sc, wl); + } + /* 書き戻し */ + part->dc = dc; + tmpl->head_pos = head_pos; + tmpl->mw_features = features; + } +} + +/** 検索開始 + */ +void +anthy_scan_node(struct splitter_context *sc, + struct word_list *tmpl, + xstr *follow, int node) +{ + /* 付属語の付いていない状態から検索を開始する */ + match_nodes(sc, tmpl, *follow, node); +} + + + + +static void 
+read_xstr(struct dep_dic* ddic, int* offset) +{ + int len = anthy_dic_ntohl(*(int*)&ddic->file_ptr[*offset]); + *offset += sizeof(int); + *offset += sizeof(xchar) * len; +} + +static void +read_branch(struct dep_dic* ddic, struct dep_branch* branch, int* offset) +{ + int i; + + /* 遷移条件の数を読む */ + branch->nr_strs = anthy_dic_ntohl(*(int*)&ddic->file_ptr[*offset]); + *offset += sizeof(int); + /* 遷移条件の文字列を読み取る */ + branch->xstrs = (ondisk_xstr *)&ddic->file_ptr[*offset]; + + for (i = 0; i < branch->nr_strs; ++i) { + read_xstr(ddic, offset); + } + + branch->nr_transitions = anthy_dic_ntohl(*(int*)&ddic->file_ptr[*offset]); + *offset += sizeof(int); + branch->transition = (struct dep_transition*)&ddic->file_ptr[*offset]; + *offset += sizeof(struct dep_transition) * branch->nr_transitions; +} + +static void +read_node(struct dep_dic* ddic, struct dep_node* node, int* offset) +{ + int i; + node->nr_branch = anthy_dic_ntohl(*(int*)&ddic->file_ptr[*offset]); + *offset += sizeof(int); + + node->branch = malloc(sizeof(struct dep_branch) * node->nr_branch); + for (i = 0; i < node->nr_branch; ++i) { + read_branch(ddic, &node->branch[i], offset); + } +} + +static void +read_file(void) +{ + int i; + + int offset = 0; + + ddic.file_ptr = anthy_file_dic_get_section("dep_dic"); + + /* 最初にルールの数 */ + ddic.nrRules = anthy_dic_ntohl(*(int*)&ddic.file_ptr[offset]); + offset += sizeof(int); + + /* 各ルールの定義 */ + ddic.rules = (struct ondisk_wordseq_rule*)&ddic.file_ptr[offset]; + offset += sizeof(struct ondisk_wordseq_rule) * ddic.nrRules; + /* ノードの数 */ + ddic.nrNodes = anthy_dic_ntohl(*(int*)&ddic.file_ptr[offset]); + offset += sizeof(int); + + /* 各ノードを読み込む */ + ddic.nodes = malloc(sizeof(struct dep_node) * ddic.nrNodes); + for (i = 0; i < ddic.nrNodes; ++i) { + read_node(&ddic, &ddic.nodes[i], &offset); + } +} + +int +anthy_get_nr_dep_rule() +{ + return ddic.nrRules; +} + +void +anthy_get_nth_dep_rule(int index, struct wordseq_rule *rule) +{ + /* ファイル上の情報からデータを取り出す */ + struct 
ondisk_wordseq_rule *r = &ddic.rules[index];
+  rule->wt = anthy_get_wtype(r->wt[0], r->wt[1], r->wt[2],
+                             r->wt[3], r->wt[4], r->wt[5]);
+  rule->node_id = anthy_dic_ntohl(r->node_id);
+}
+/* Load the dependent-word table through read_file(); always returns 0. */
+int
+anthy_init_depword_tab()
+{
+  read_file();
+  return 0;
+}
+/* Free the per-node branch arrays and the node array allocated while loading. */
+void
+anthy_quit_depword_tab(void)
+{
+  int i;
+  for (i = 0; i < ddic.nrNodes; i++) {
+    struct dep_node* node = &ddic.nodes[i];
+    free(node->branch);
+  }
+  free(ddic.nodes);
+}
+
diff --git a/src-splitter/evalborder.c b/src-splitter/evalborder.c
new file mode 100644
index 0000000..7c4a8c4
--- /dev/null
+++ b/src-splitter/evalborder.c
@@ -0,0 +1,187 @@
+/*
+ * Detects segment (bunsetsu) boundaries.
+ *
+ * The Viterbi algorithm is used to select metawords.
+ *
+ * anthy_eval_border() splits the specified region into segments.
+ *
+ * Funded by IPA Exploratory Software Project (未踏ソフトウェア創造事業) 2001 10/29
+ * Copyright (C) 2000-2003 TABATA Yusuke, UGAWA Tomoharu
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <anthy/alloc.h>
+#include <anthy/splitter.h>
+#include "wordborder.h"
+/*
+ * Return 1 when mw is compatible with a segment fixed to [from, border),
+ * 0 otherwise.  A metaword starting before border must cover exactly
+ * [from, border); one starting at or after border is always accepted.
+ */
+static int
+border_check(struct meta_word* mw,
+             int from,
+             int border)
+{
+  if (mw->from < border) {
+    /* an mw that starts inside the first segment must match the segment boundaries exactly */
+    if (mw->from == from && mw->from + mw->len == border) {
+      return 1;
+    } else {
+      return 0;
+    }
+  } else {
+    /* anything in the later segments is usable unconditionally */
+    return 1;
+  }
+}
+
+/*
+ * Recursively check whether a metaword is usable and store the verdict in
+ * mw->can_use.  A NULL mw, or one whose can_use is no longer `unchecked`,
+ * is left alone.  The kind of check is looked up in
+ * anthy_metaword_type_tab by mw->type.
+ */
+static void
+metaword_constraint_check(struct splitter_context *sc,
+                          struct meta_word *mw,
+                          int from,
+                          int border)
+{
+  if (!mw) return;
+  if (mw->can_use != unchecked) return;
+
+  switch(anthy_metaword_type_tab[mw->type].check){
+  case MW_CHECK_SINGLE:
+    mw->can_use = border_check(mw, from, border) ?
ok : ng;
+    break;
+  case MW_CHECK_BORDER:
+    {
+      struct meta_word* mw1 = mw->mw1;
+      struct meta_word* mw2 = mw->mw2;
+
+      if (mw1&&mw2&&mw1->from + mw1->len == border) {
+        /* the boundary mark falls exactly on the seam between mw1 and mw2 */
+        mw->can_use = ng;
+        break;
+      }
+      if (mw1)
+        metaword_constraint_check(sc, mw1, from, border);
+      if (mw2)
+        metaword_constraint_check(sc, mw2, mw2->from, border);
+
+      if ((!mw1 || mw1->can_use == ok) && (!mw2 || mw2->can_use == ok)) {
+        mw->can_use = ok;
+      } else {
+        mw->can_use = ng;
+      }
+    }
+    break;
+  case MW_CHECK_WRAP:
+    metaword_constraint_check(sc, mw->mw1, from, border);
+    mw->can_use = mw->mw1->can_use; /* a wrapper inherits the verdict of what it wraps; mw1 is dereferenced unconditionally */
+    break;
+  case MW_CHECK_NUMBER:
+    {
+      struct meta_word* itr = mw;
+      mw->can_use = ok;
+
+      /* if even one of the component segments straddles the segment boundary, this compound cannot be used */
+      for (; itr && itr->type == MW_NUMBER; itr = itr->mw2) {
+        struct meta_word* mw1 = itr->mw1;
+        if (!border_check(mw1, from, border)) {
+          mw->can_use = ng;
+          break;
+        }
+      }
+    }
+    break;
+  case MW_CHECK_COMPOUND:
+    {
+      struct meta_word* itr = mw;
+      mw->can_use = ok;
+
+      /* if even one of the component segments straddles the segment boundary, this compound cannot be used */
+      for (; itr && (itr->type == MW_COMPOUND_HEAD || itr->type == MW_COMPOUND); itr = itr->mw2) {
+        struct meta_word* mw1 = itr->mw1;
+        if (!border_check(mw1, from, border)) {
+          mw->can_use = ng;
+          break;
+        }
+      }
+    }
+    break;
+  case MW_CHECK_OCHAIRE:
+    {
+      struct meta_word* mw1; /* the verdict for the head is copied to the whole mw1 chain */
+      if (border_check(mw, from, border)) {
+        for (mw1 = mw; mw1; mw1 = mw1->mw1) {
+          mw1->can_use = ok;
+        }
+      } else {
+        for (mw1 = mw; mw1; mw1 = mw1->mw1) {
+          mw1->can_use = ng;
+        }
+      }
+    }
+    break;
+  case MW_CHECK_NONE:
+    break; /* can_use is deliberately left as it is */
+  default:
+    printf("try to check unknown type of metaword (%d).\n", mw->type);
+  }
+}
+
+/*
+ * Check, for every metaword starting in [from, to), whether it can be
+ * used when the first segment is fixed to [from, border).
+ */
+static void
+metaword_constraint_check_all(struct splitter_context *sc,
+                              int from, int to,
+                              int border)
+{
+  int i;
+  struct word_split_info_cache *info;
+  info = sc->word_split_info;
+
+  /* first mark everything unchecked */
+  for (i = from; i < to; i ++) {
+    struct meta_word *mw;
+    for (mw = info->cnode[i].mw;
+         mw; mw = mw->next) {
+      mw->can_use = unchecked;
+    }
+  }
+
+  /* then run the check, which also covers composed metawords */
+  for (i = from; i < to; i ++) {
+    struct meta_word *mw;
+    for (mw = info->cnode[i].mw; mw; mw = mw->next) {
+      metaword_constraint_check(sc, mw, from, border);
+    }
+  }
+}
+
+/*
+ * Mark the segment boundaries of [from, to).  from2 is passed to the
+ * constraint check as the boundary of the first segment.
+ */
+void
+anthy_eval_border(struct splitter_context *sc, int from, int from2, int to)
+{
+  struct meta_word *mw;
+  int nr;
+
+  /* select only the usable segment candidates */
+  metaword_constraint_check_all(sc, from, to, from2);
+
+  /* Look for a meta_word that covers the span between from and from2.
+   * If there is one, analyse starting at from; otherwise start at from2.
+   */
+  nr = 0;
+  for (mw = sc->word_split_info->cnode[from].mw; mw; mw = mw->next) {
+    if (mw->can_use == ok) {
+      nr ++;
+      break;
+    }
+  }
+  if (nr == 0) {
+    from = from2;
+  }
+
+  /* set the segment boundaries */
+  anthy_mark_borders(sc, from, to);
+}
diff --git a/src-splitter/lattice.c b/src-splitter/lattice.c
new file mode 100644
index 0000000..2a242e2
--- /dev/null
+++ b/src-splitter/lattice.c
@@ -0,0 +1,541 @@
+/*
+ * Evaluates probabilities and, using the Viterbi algorithm,
+ * decides and marks the segment boundaries.
+ *
+ *
+ * Function called from outside:
+ *  anthy_mark_borders()
+ *
+ * Copyright (C) 2006-2007 TABATA Yusuke
+ * Copyright (C) 2004-2006 YOSHIDA Yuichi
+ * Copyright (C) 2006 HANAOKA Toshiyuki
+ *
+ */
+/*
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * Builds a graph by connecting the meta_words present in the context.
+ * (This graph is called a lattice, or a trellis.)
+ * The connections between meta_words become the nodes of the graph and
+ * are represented as links between struct lattice_node objects.
+ *
+ * The processing here consists of two parts:
+ * (1) build the graph while computing the probability of reaching each node
+ * (2) walk the graph from the back (right) to find the best path
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <anthy/alloc.h>
+#include <anthy/xstr.h>
+#include <anthy/segclass.h>
+#include <anthy/splitter.h>
+#include <anthy/feature_set.h>
+#include <anthy/diclib.h>
+#include "wordborder.h"
+
+static float anthy_normal_length = 20.0; /* expected length of a segment; passed as lambda to get_poisson() */
+static void *trans_info_array; /* "trans_info" section, set by anthy_mark_borders() */
+
+#define NODE_MAX_SIZE 50
+
+/* a node of the graph (a transition state) */
+struct lattice_node {
+  int border; /* where in the string this state starts */
+  enum seg_class seg_class; /* the segment class of this state */
+
+
+  double real_probability;  /* probability of getting here (without segment-count correction) */
+  double adjusted_probability;  /* probability of getting here (with segment-count correction) */
+
+
+  struct lattice_node* before_node; /* the previous transition state */
+  struct meta_word* mw; /* the meta_word corresponding to this state */
+
+  struct lattice_node* next; /* pointer for the list structure */
+};
+
+struct node_list_head {
+  struct lattice_node *head;
+  int nr_nodes;
+};
+
+struct lattice_info {
+  /* array of lists of transition states */
+  struct node_list_head *lattice_node_list;
+  struct splitter_context *sc;
+  /* allocator for the nodes */
+  allocator node_allocator;
+};
+
+/*
+ * Debug output: print the node's real_probability and, when it has one,
+ * its meta_word.  A NULL node prints a "(null)" marker instead.
+ */
+static void
+print_lattice_node(struct lattice_info *info, struct lattice_node *node)
+{
+  if (!node) {
+    printf("**lattice_node (null)*\n");
+    return ;
+  }
+  printf("**lattice_node probability=%.128f\n", node->real_probability);
+  if (node->mw) {
+    anthy_print_metaword(info->sc, node->mw);
+  }
+}
+/* Return lambda^r * exp(-lambda) / r!. */
+static double
+get_poisson(double lambda, int r)
+{
+  int i;
+  double result;
+
+  /* in short, the Poisson distribution */
+  result = pow(lambda, r) * exp(-lambda);
+  for (i = 2; i
<= r; ++i) {
+    result /= i;
+  }
+
+  return result;
+}
+
+/*
+ * Adjust the score according to the form of the segment.
+ * Returns get_poisson(anthy_normal_length, r), where r is the length of
+ * the (unwrapped) metaword clamped to the range [2, 6] and raised to at
+ * least 3 for SEG_RENTAI_SHUSHOKU.
+ */
+static double
+get_form_bias(struct meta_word *mw)
+{
+  double bias;
+  int r;
+  /* when the metaword is wrapped, use the one inside */
+  while (mw->type == MW_WRAP) {
+    mw = mw->mw1;
+  }
+  /* adjustment by segment length */
+  r = mw->len;
+  if (r > 6) {
+    r = 6;
+  }
+  if (r < 2) {
+    r = 2;
+  }
+  if (mw->seg_class == SEG_RENTAI_SHUSHOKU &&
+      r < 3) {
+    /* demonstratives */
+    r = 3;
+  }
+  bias = get_poisson(anthy_normal_length, r);
+  return bias;
+}
+
+/*
+ * Fill `features` for the transition into `node`.
+ * A NULL node stands for the end of the sentence (class SEG_TAIL); a node
+ * without a predecessor is treated as following SEG_HEAD.  The features
+ * taken from the metaword are only added when the node has one.  The list
+ * is sorted before returning.
+ */
+static void
+build_feature_list(struct lattice_node *node,
+                   struct feature_list *features)
+{
+  int pc, cc;
+  if (node) {
+    cc = node->seg_class;
+  } else {
+    cc = SEG_TAIL;
+  }
+  anthy_feature_list_set_cur_class(features, cc);
+  if (node && node->before_node) {
+    pc = node->before_node->seg_class;
+  } else {
+    pc = SEG_HEAD;
+  }
+  anthy_feature_list_set_class_trans(features, pc, cc);
+
+  if (node && node->mw) {
+    struct meta_word *mw = node->mw;
+    anthy_feature_list_set_dep_class(features, mw->dep_class);
+    anthy_feature_list_set_dep_word(features,
+                                    mw->dep_word_hash);
+    anthy_feature_list_set_mw_features(features, mw->mw_features);
+    anthy_feature_list_set_noun_cos(features, mw->core_wt);
+
+  }
+  anthy_feature_list_sort(features);
+}
+
+/*
+ * Look the feature list up in trans_info_array and turn the counts found
+ * there into a probability, pos / (pos + neg).
+ *
+ * cc  the segment class, used only for the debug output
+ * fl  the feature list describing the transition
+ *
+ * Returns a probability in (0, 1].  When the pattern is unknown, or the
+ * counts do not give a positive probability, a fixed near-zero value is
+ * returned instead.
+ */
+static double
+calc_probability(int cc, struct feature_list *fl)
+{
+  struct feature_freq *res, arg;
+  double prob;
+
+  /* compute the probability */
+  res = anthy_find_feature_freq(trans_info_array,
+                                fl, &arg);
+  prob = 0;
+  if (res) {
+    double pos = res->f[15];
+    double neg = res->f[14];
+    double total = pos + neg;
+    /*
+     * Only divide when there is at least one observation.  With both
+     * counts at zero the division is 0/0 = NaN, and NaN is not <= 0, so
+     * it would slip past the fallback below and poison every probability
+     * multiplied with it.
+     */
+    if (total > 0) {
+      prob = 1 - neg / total;
+    }
+  }
+  if (prob <= 0) {
+    /* the pattern does not occur in the example sentences: score close to 0 */
+    prob = 1.0f / (double)(10000 * 100);
+  }
+
+  if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
+    anthy_feature_list_print(fl);
+    printf(" cc=%d(%s), P=%f\n", cc, anthy_seg_class_name(cc), prob);
+  }
+  return prob;
+}
+
+/*
+ * Probability of the transition into `node`: the probability looked up
+ * for its feature list multiplied by the form bias of its metaword.
+ */
+static double
+get_transition_probability(struct lattice_node *node)
+{
+  struct feature_list features;
+  double probability;
+
+  /**/
+  anthy_feature_list_init(&features);
+  build_feature_list(node, &features);
+  probability = calc_probability(node->seg_class, &features);
+  anthy_feature_list_free(&features);
+
+  /* evaluation of the form of the segment */
+  probability *= get_form_bias(node->mw);
+  return probability;
+}
+/*
+ * Allocate a lattice_info with size + 1 empty node lists (indices
+ * 0..size) and a node allocator.  Released by release_lattice_info().
+ * NOTE(review): neither malloc() result is checked before use.
+ */
+static struct lattice_info*
+alloc_lattice_info(struct splitter_context *sc, int size)
+{
+  int i;
+  struct lattice_info* info = (struct lattice_info*)malloc(sizeof(struct lattice_info));
+  info->sc = sc;
+  info->lattice_node_list = (struct node_list_head*)
+    malloc((size + 1) * sizeof(struct node_list_head));
+  for (i = 0; i < size + 1; i++) {
+    info->lattice_node_list[i].head = NULL;
+    info->lattice_node_list[i].nr_nodes = 0;
+  }
+  info->node_allocator = anthy_create_allocator(sizeof(struct lattice_node),
+                                                NULL);
+  return info;
+}
+/*
+ * Derive seg_class, real_probability and adjusted_probability of `node`
+ * from its mw and before_node, which must already be set.
+ */
+static void
+calc_node_parameters(struct lattice_node *node)
+{
+  /* without a corresponding metaword the node is taken to be the sentence head */
+  node->seg_class = node->mw ? node->mw->seg_class : SEG_HEAD;
+
+  if (node->before_node) {
+    /* there is a node adjacent on the left */
+    node->real_probability = node->before_node->real_probability *
+      get_transition_probability(node);
+    node->adjusted_probability = node->real_probability *
+      (node->mw ? node->mw->score : 1000);
+  } else {
+    /* there is no node adjacent on the left */
+    node->real_probability = 1.0;
+    node->adjusted_probability = node->real_probability;
+  }
+}
+/*
+ * Allocate a node for metaword mw reached from before_node, starting at
+ * `border`, and compute its probabilities.  next is set to NULL.
+ */
+static struct lattice_node*
+alloc_lattice_node(struct lattice_info *info,
+                   struct lattice_node* before_node,
+                   struct meta_word* mw, int border)
+{
+  struct lattice_node* node;
+  node = anthy_smalloc(info->node_allocator);
+  node->before_node = before_node;
+  node->border = border;
+  node->next = NULL;
+  node->mw = mw;
+
+  calc_node_parameters(node);
+
+  return node;
+}
+/* Hand a node back to the node allocator. */
+static void
+release_lattice_node(struct lattice_info *info, struct lattice_node* node)
+{
+  anthy_sfree(info->node_allocator, node);
+}
+/* Free the node allocator, the list array and info itself. */
+static void
+release_lattice_info(struct lattice_info* info)
+{
+  anthy_free_allocator(info->node_allocator);
+  free(info->lattice_node_list);
+  free(info);
+}
+/*
+ * Return 1 when only lhs has metaword type `type`, -1 when only rhs has
+ * it, 0 otherwise.  Both nodes must have a metaword.
+ */
+static int
+cmp_node_by_type(struct lattice_node *lhs, struct lattice_node *rhs,
+                 enum metaword_type type)
+{
+  if (lhs->mw->type == type && rhs->mw->type != type) {
+    return 1;
+  } else if (lhs->mw->type != type && rhs->mw->type == type) {
+    return -1;
+  } else {
+    return 0;
+  }
+}
+/*
+ * Return 1 when lhs is of type1 and rhs of type2, -1 for the mirrored
+ * case, 0 otherwise.  Both nodes must have a metaword.
+ */
+static int
+cmp_node_by_type_to_type(struct lattice_node *lhs, struct lattice_node *rhs,
+                         enum metaword_type type1, enum metaword_type type2)
+{
+  if (lhs->mw->type == type1 && rhs->mw->type == type2) {
+    return 1;
+  } else if (lhs->mw->type == type2 && rhs->mw->type == type1) {
+    return -1;
+  } else {
+    return 0;
+  }
+}
+
+/*
+ * Compare two nodes.  A non-NULL node beats a NULL one.
+ *
+ ** Return value
+ * 1: lhs is the more probable one
+ * 0: they are the same
+ * -1: rhs is the more probable one
+ */
+static int
+cmp_node(struct lattice_node *lhs, struct lattice_node *rhs)
+{
+  struct lattice_node *lhs_before = lhs;
+  struct lattice_node *rhs_before = rhs;
+  int ret;
+
+  if (lhs && !rhs) return 1;
+  if (!lhs && rhs) return -1;
+  if (!lhs && !rhs) return 0;
+
+  while (lhs_before && rhs_before) { /* walk both paths backwards while their metawords end at the same position */
+    if (lhs_before->mw && rhs_before->mw &&
+        lhs_before->mw->from + lhs_before->mw->len == rhs_before->mw->from + rhs_before->mw->len) {
+      /* see whether the node was created from learning */
+
ret = cmp_node_by_type(lhs_before, rhs_before, MW_OCHAIRE);
+      if (ret != 0) return ret;
+
+      /* prefer COMPOUND_HEAD over COMPOUND_PART */
+      ret = cmp_node_by_type_to_type(lhs_before, rhs_before, MW_COMPOUND_HEAD, MW_COMPOUND_PART);
+      if (ret != 0) return ret;
+    } else {
+      break;
+    }
+    lhs_before = lhs_before->before_node;
+    rhs_before = rhs_before->before_node;
+  }
+
+  /* finally look at the transition probability */
+  if (lhs->adjusted_probability > rhs->adjusted_probability) {
+    return 1;
+  } else if (lhs->adjusted_probability < rhs->adjusted_probability) {
+    return -1;
+  } else {
+    return 0;
+  }
+}
+
+/*
+ * Add a node to the lattice under construction, in the list for
+ * `position`.  When a node with the same seg_class and the same border is
+ * found, the better of the two (by cmp_node, ties going to the new node)
+ * is kept and the other one is released; nr_nodes is then unchanged.
+ * Otherwise the new node is appended and nr_nodes incremented.
+ * new_node may have been released when this returns.
+ */
+static void
+push_node(struct lattice_info* info, struct lattice_node* new_node,
+          int position)
+{
+  struct lattice_node* node;
+  struct lattice_node* previous_node = NULL;
+
+  if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
+    print_lattice_node(info, new_node);
+  }
+
+  /* when there is no head node, add unconditionally */
+  node = info->lattice_node_list[position].head;
+  if (!node) {
+    info->lattice_node_list[position].head = new_node;
+    info->lattice_node_list[position].nr_nodes ++;
+    return;
+  }
+
+  while (node->next) { /* NOTE(review): the loop stops on the tail node, so the tail is never compared with new_node -- confirm this is intended */
+    /* pruning, so that no redundant nodes are added */
+    if (new_node->seg_class == node->seg_class &&
+        new_node->border == node->border) {
+      /* the seg_class is the same and the start position is the same */
+      switch (cmp_node(new_node, node)) {
+      case 0:
+      case 1:
+        /* the new one is more probable, or comes from learning: replace the old one */
+        if (previous_node) {
+          previous_node->next = new_node;
+        } else {
+          info->lattice_node_list[position].head = new_node;
+        }
+        new_node->next = node->next;
+        release_lattice_node(info, node);
+        break;
+      case -1:
+        /* otherwise discard the new one */
+        release_lattice_node(info, new_node);
+        break;
+      }
+      return;
+    }
+    previous_node = node;
+    node = node->next;
+  }
+
+  /* append after the last node */
+  node->next = new_node;
+  info->lattice_node_list[position].nr_nodes ++;
+}
+
+/* Erase the least probable node of the list (the list must not be empty). */
+static void
+remove_min_node(struct lattice_info *info, struct node_list_head *node_list)
+{
+  struct lattice_node* node = node_list->head;
+  struct lattice_node*
previous_node = NULL;
+  struct lattice_node* min_node = node;
+  struct lattice_node* previous_min_node = NULL;
+
+  /* look for the least probable node */
+  while (node) {
+    if (cmp_node(node, min_node) < 0) {
+      previous_min_node = previous_node;
+      min_node = node;
+    }
+    previous_node = node;
+    node = node->next;
+  }
+
+  /* unlink and release the least probable node */
+  if (previous_min_node) {
+    previous_min_node->next = min_node->next;
+  } else {
+    node_list->head = min_node->next;
+  }
+  release_lattice_node(info, min_node);
+  node_list->nr_nodes --;
+}
+
+/*
+ * Choose the path using the so-called Viterbi algorithm: pick the best
+ * node of the right-most non-empty list at or before `to`, then follow
+ * before_node links back, recording best_seg_class and marking the
+ * border of every metaword on the way.
+ */
+static void
+choose_path(struct lattice_info* info, int to)
+{
+  /* among the transitions that reached the end, choose the most probable one */
+  struct lattice_node* node;
+  struct lattice_node* best_node = NULL;
+  int last = to;
+  while (!info->lattice_node_list[last].head) { /* no lower bound here: relies on build_graph() having pushed the start node */
+    /* nothing reached the last character, so step back */
+    --last;
+  }
+  for (node = info->lattice_node_list[last].head; node; node = node->next) {
+    if (cmp_node(node, best_node) > 0) {
+      best_node = node;
+    }
+  }
+  if (!best_node) {
+    return;
+  }
+
+  /* follow the transitions backwards while recording the segment breaks */
+  node = best_node;
+  if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
+    printf("choose_path()\n");
+  }
+  while (node->before_node) { /* the start node has no predecessor and ends the walk */
+    info->sc->word_split_info->best_seg_class[node->border] =
+      node->seg_class;
+    anthy_mark_border_by_metaword(info->sc, node->mw);
+    /**/
+    if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
+      print_lattice_node(info, node);
+    }
+    /**/
+    node = node->before_node;
+  }
+}
+/*
+ * Build the lattice for [from, to): starting from a metaword-less start
+ * node at `from`, extend every node reaching position i with every usable
+ * metaword that starts at i.
+ */
+static void
+build_graph(struct lattice_info* info, int from, int to)
+{
+  int i;
+  struct lattice_node* node;
+  struct lattice_node* left_node;
+
+  /* add the node that serves as the starting point */
+  node = alloc_lattice_node(info, NULL, NULL, from);
+  push_node(info, node, from);
+
+  /* info->lattice_node_list[index] holds the transitions that end at index,
+   * not the transitions that start from index
+   */
+
+  /* try all transitions from the left */
+  for (i = from; i < to; ++i) {
+    for (left_node = info->lattice_node_list[i].head; left_node;
+         left_node = left_node->next) {
+      struct meta_word *mw;
+      /*
i文字目に到達するlattice_nodeのループ */
+
+      for (mw = info->sc->word_split_info->cnode[i].mw; mw; mw = mw->next) {
+        int position;
+        struct lattice_node* new_node;
+        /* loop over the meta_words that start at character i */
+
+        if (mw->can_use != ok) {
+          continue; /* do not use metawords that straddle the fixed segment boundary */
+        }
+        position = i + mw->len;
+        new_node = alloc_lattice_node(info, left_node, mw, i);
+        push_node(info, new_node, position); /* new_node may already be released when this returns */
+
+        /* when there are too many candidates, trim from the least probable */
+        if (info->lattice_node_list[position].nr_nodes >= NODE_MAX_SIZE) {
+          remove_min_node(info, &info->lattice_node_list[position]);
+        }
+      }
+    }
+  }
+
+  /* end-of-sentence correction: only nodes that end exactly at `to` receive it */
+  for (node = info->lattice_node_list[to].head; node; node = node->next) {
+    struct feature_list features;
+    anthy_feature_list_init(&features);
+    build_feature_list(NULL, &features);
+    node->adjusted_probability = node->adjusted_probability *
+      calc_probability(SEG_TAIL, &features);
+    anthy_feature_list_free(&features);
+  }
+}
+/*
+ * Entry point: build the lattice for [from, to), choose the best path,
+ * mark its borders in sc, and release the lattice again.
+ */
+void
+anthy_mark_borders(struct splitter_context *sc, int from, int to)
+{
+  struct lattice_info* info = alloc_lattice_info(sc, to);
+  trans_info_array = anthy_file_dic_get_section("trans_info");
+  build_graph(info, from, to);
+  choose_path(info, to);
+  release_lattice_info(info);
+}
diff --git a/src-splitter/metaword.c b/src-splitter/metaword.c
new file mode 100644
index 0000000..0491035
--- /dev/null
+++ b/src-splitter/metaword.c
@@ -0,0 +1,967 @@
+/*
+ * One or more segments or words are treated as a set called a metaword.
+ * This file generates the various kinds of metaword.
+ *
+ * init_metaword_tab() sets up the information for metaword processing
+ * anthy_make_metaword_all() builds the metawords of a context
+ * anthy_print_metaword() prints the given metaword
+ *
+ * Funded by IPA Exploratory Software Project (未踏ソフトウェア創造事業) 2001 10/29
+ * Copyright (C) 2000-2006 TABATA Yusuke
+ * Copyright (C) 2004-2006 YOSHIDA Yuichi
+ * Copyright (C) 2000-2003 UGAWA Tomoharu
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <anthy/record.h>
+#include <anthy/splitter.h>
+#include <anthy/xchar.h>
+#include <anthy/xstr.h>
+#include <anthy/segment.h>
+#include <anthy/segclass.h>
+#include "wordborder.h"
+
+/* How each kind of meta_word is processed.  The table is indexed with
+ * mw->type elsewhere, so the entries presumably have to stay in the order
+ * of enum metaword_type -- TODO confirm against wordborder.h. */
+struct metaword_type_tab_ anthy_metaword_type_tab[] = {
+  {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE},
+  {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE},
+  {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP},
+  {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND},
+  {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE},
+  {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE},
+  {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE},
+  {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER},
+  {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER},
+  {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER},
+  {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE},
+  /**/
+  {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE}
+};
+
+static void
+combine_metaword(struct splitter_context *sc, struct meta_word *mw);
+
+/* Add a metaword to the context: it is pushed onto the front of the list
+ * kept for its start position, mw->from. */
+void
+anthy_commit_meta_word(struct splitter_context *sc,
+                       struct meta_word *mw)
+{
+  struct word_split_info_cache *info = sc->word_split_info;
+  /* the list of nodes that have the same start point */
+  mw->next = info->cnode[mw->from].mw;
+  info->cnode[mw->from].mw = mw;
+  /**/
+  if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) {
+    anthy_print_metaword(sc, mw);
+  }
+}
+/* Print a ":name" tag for every MW_FEATURE_* bit tested below. */
+static void
+print_metaword_features(int features)
+{
+  if (features & MW_FEATURE_SV) {
+    printf(":sv");
+  }
+  if (features & MW_FEATURE_WEAK_CONN) {
+    printf(":weak");
+  }
+  if (features & MW_FEATURE_SUFFIX) {
+    printf(":suffix");
+  }
+  if (features & MW_FEATURE_NUM) {
+    printf(":num");
+  }
+  if (features & MW_FEATURE_CORE1) {
+    printf(":c1");
+  }
+  if (features & MW_FEATURE_HIGH_FREQ) {
+    printf(":hf");
+  }
+}
+/* Print mw, indented by `indent` levels, followed by its mw1 and mw2
+ * children one level deeper. */
+static void
+anthy_do_print_metaword(struct splitter_context *sc,
+                        struct meta_word *mw,
+                        int indent)
+{
+  int i;
+  for (i = 0; i < indent; i++) {
+    printf(" ");
+  }
+  printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s",
+
anthy_metaword_type_tab[mw->type].name,
+         mw->from, mw->len, mw->score,
+         anthy_seg_class_name(mw->seg_class));
+  print_metaword_features(mw->mw_features);
+  printf(":can_use=%d*\n", mw->can_use);
+  if (mw->wl) {
+    anthy_print_word_list(sc, mw->wl);
+  }
+  if (mw->cand_hint.str) {
+    printf("(");
+    anthy_putxstr(&mw->cand_hint);
+    printf(")\n");
+  }
+  if (mw->mw1) {
+    anthy_do_print_metaword(sc, mw->mw1, indent + 1);
+  }
+  if (mw->mw2) {
+    anthy_do_print_metaword(sc, mw->mw2, indent + 1);
+  }
+}
+
+/* Print the given metaword and, recursively, its children. */
+void
+anthy_print_metaword(struct splitter_context *sc,
+                     struct meta_word *mw)
+{
+  anthy_do_print_metaword(sc, mw, 0);
+}
+
+/*
+ * Allocate a metaword from the context's allocator and give its fields
+ * their defaults (type MW_SINGLE, seg_class SEG_HEAD, can_use ok, the
+ * rest zero or NULL).  from, len and next are left for the caller and
+ * anthy_commit_meta_word() to fill in.
+ */
+static struct meta_word *
+alloc_metaword(struct splitter_context *sc)
+{
+  struct meta_word *mw;
+  mw = anthy_smalloc(sc->word_split_info->MwAllocator);
+  mw->type = MW_SINGLE;
+  mw->score = 0;
+  mw->struct_score = 0;
+  mw->dep_word_hash = 0;
+  mw->core_wt = anthy_wt_none;
+  mw->mw_features = 0;
+  mw->dep_class = DEP_NONE;
+  /*
+   * nr_parts needs a default as well: anthy_do_cons_metaword() and
+   * expand_meta_word() read it from metawords whose creators never
+   * assign it (compound leaves, ochaire and dummy metawords).
+   */
+  mw->nr_parts = 0;
+  mw->wl = NULL;
+  mw->mw1 = NULL;
+  mw->mw2 = NULL;
+  mw->cand_hint.str = NULL;
+  mw->cand_hint.len = 0;
+  mw->seg_class = SEG_HEAD;
+  mw->can_use = ok;
+  return mw;
+}
+
+
+/*
+ * Extract the prefix part and the suffix part of wl as strings.
+ * xs_pre and xs_post are set to point into sc->ce; nothing is copied.
+ * The suffix covers the dependent-word part plus the postfix part.
+ */
+static void
+get_surrounding_text(struct splitter_context* sc,
+                     struct word_list* wl,
+                     xstr* xs_pre, xstr* xs_post)
+{
+  int post_len = wl->part[PART_DEPWORD].len + wl->part[PART_POSTFIX].len;
+  int pre_len = wl->part[PART_PREFIX].len;
+
+  xs_pre->str = sc->ce[wl->from].c;
+  xs_pre->len = pre_len;
+  xs_post->str = sc->ce[wl->from + wl->len - post_len].c;
+  xs_post->len = post_len;
+}
+
+/* Count the KK_VU characters in xs. */
+static int
+count_vu(xstr *xs) {
+  int i, r = 0;
+  for (i = 0; i < xs->len; i++) {
+    if (xs->str[i] == KK_VU) {
+      r++;
+    }
+  }
+  return r;
+}
+
+/*
+ * Take the n-th part out of wl, which is a compound, and make an mw of it
+ */
+static struct meta_word*
+make_compound_nth_metaword(struct splitter_context* sc,
+                           compound_ent_t ce, int nth,
+                           struct word_list* wl,
+                           enum metaword_type type)
+{
+  int i;
+  int len = 0;
+  int from = wl->from;
+  int seg_num =
anthy_compound_get_nr_segments(ce);
+  struct meta_word* mw;
+  xstr xs_pre, xs_core, xs_post;
+
+  get_surrounding_text(sc, wl, &xs_pre, &xs_post);
+
+  for (i = 0; i <= nth; ++i) { /* walk the segments up to nth so that from/len end up describing segment nth */
+    xstr part;
+    from += len;
+    len = anthy_compound_get_nth_segment_len(ce, i);
+    part.str = sc->ce[from].c;
+    part.len = len;
+    len -= count_vu(&part); /* every KK_VU in the segment shortens it by one -- presumably an encoding-length difference, TODO confirm */
+    if (i == 0) {
+      len += xs_pre.len; /* the first segment also carries the prefix */
+    }
+    if (i == seg_num - 1) {
+      len += xs_post.len; /* the last segment also carries the suffix */
+    }
+  }
+
+  mw = alloc_metaword(sc);
+  mw->from = from;
+  mw->len = len;
+  mw->type = type;
+  mw->score = 1000;
+  mw->seg_class = wl->seg_class;
+
+  anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core);
+  if (nth == 0) {
+    anthy_xstrcat(&mw->cand_hint, &xs_pre);
+  }
+  anthy_xstrcat(&mw->cand_hint, &xs_core);
+  if (nth == seg_num - 1) {
+    anthy_xstrcat(&mw->cand_hint, &xs_post);
+  }
+  return mw; /* the caller decides whether to commit it */
+}
+
+
+/*
+ * Actually join metawords: create and commit a metaword of the given
+ * type with children mw and mw2.  mw2 may be NULL, in which case the new
+ * metaword simply wraps mw.  With mw2 present the length is the sum of
+ * both, the score is their geometric mean, and seg_class and
+ * dep_word_hash come from mw2.
+ */
+static struct meta_word *
+anthy_do_cons_metaword(struct splitter_context *sc,
+                       enum metaword_type type,
+                       struct meta_word *mw, struct meta_word *mw2)
+{
+  struct meta_word *n;
+
+  n = alloc_metaword(sc);
+  n->from = mw->from;
+  n->len = mw->len + (mw2 ? mw2->len : 0);
+
+  if (mw2) {
+    n->score = sqrt(mw->score) * sqrt(mw2->score);
+  } else {
+    n->score = mw->score;
+  }
+  n->type = type;
+  n->mw1 = mw;
+  n->mw2 = mw2;
+  if (mw2) {
+    n->seg_class = mw2->seg_class;
+    n->nr_parts = mw->nr_parts + mw2->nr_parts;
+    n->dep_word_hash = mw2->dep_word_hash;
+  } else {
+    n->seg_class = mw->seg_class;
+    n->nr_parts = mw->nr_parts;
+    n->dep_word_hash = mw->dep_word_hash;
+  }
+  anthy_commit_meta_word(sc, n);
+  return n;
+}
+
+/*
+ * Create the meta_words for a compound: for every compound dictionary
+ * entry of wl's core, commit one MW_COMPOUND_LEAF per segment and chain
+ * them from the last segment to the first; the link for segment 0 is of
+ * type MW_COMPOUND_HEAD, the others MW_COMPOUND.
+ */
+static void
+make_compound_metaword(struct splitter_context* sc, struct word_list* wl)
+{
+  int i, j;
+  seq_ent_t se = wl->part[PART_CORE].seq;
+  int ent_num = anthy_get_nr_dic_ents(se, NULL);
+
+  for (i = 0; i < ent_num; ++i) {
+    compound_ent_t ce;
+    int seg_num;
+    struct meta_word *mw = NULL;
+    struct meta_word *mw2 = NULL;
+    if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
+      continue;
+    }
+    ce = anthy_get_nth_compound_ent(se, i);
+    seg_num = anthy_compound_get_nr_segments(ce);
+
+    for (j = seg_num - 1; j >= 0; --j) {
+      enum metaword_type type;
+      mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_LEAF);
+      anthy_commit_meta_word(sc, mw);
+
+      type = j == 0 ?
MW_COMPOUND_HEAD : MW_COMPOUND;
+      mw2 = anthy_do_cons_metaword(sc, type, mw, mw2);
+    }
+  }
+}
+
+/*
+ * Create meta_words that join the individual segments inside a compound:
+ * for every end segment j, segments k = j-1 .. 0 are prepended one by
+ * one, and each merged MW_COMPOUND_PART metaword (segments k..j) is
+ * committed.  The metaword for segment j alone is built but not
+ * committed by this function.
+ */
+static void
+make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl)
+{
+  int i, j, k;
+  seq_ent_t se = wl->part[PART_CORE].seq;
+  int ent_num = anthy_get_nr_dic_ents(se, NULL);
+
+  for (i = 0; i < ent_num; ++i) {
+    compound_ent_t ce;
+    int seg_num;
+    struct meta_word *mw = NULL;
+    struct meta_word *mw2 = NULL;
+
+    if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
+      continue;
+    }
+
+    ce = anthy_get_nth_compound_ent(se, i);
+    seg_num = anthy_compound_get_nr_segments(ce);
+
+    /* from the back */
+    for (j = seg_num - 1; j >= 0; --j) {
+      mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_PART);
+      for (k = j - 1; k >= 0; --k) {
+        mw2 = make_compound_nth_metaword(sc, ce, k, wl, MW_COMPOUND_PART);
+        mw2->len += mw->len;
+        mw2->score += mw->score;
+        anthy_xstrcat(&mw2->cand_hint, &mw->cand_hint);
+
+        anthy_commit_meta_word(sc, mw2);
+        mw = mw2;
+      }
+    }
+  }
+}
+
+/*
+ * A single-segment word: commit one MW_SINGLE metaword that mirrors wl.
+ * core_wt is copied only when wl has a non-empty core part.
+ */
+static void
+make_simple_metaword(struct splitter_context *sc, struct word_list* wl)
+{
+  struct meta_word *mw = alloc_metaword(sc);
+  mw->wl = wl;
+  mw->from = wl->from;
+  mw->len = wl->len;
+  mw->score = 1000;
+  mw->type = MW_SINGLE;
+  mw->dep_class = wl->part[PART_DEPWORD].dc;
+  mw->seg_class = wl->seg_class;
+  if (wl->part[PART_CORE].len) {
+    mw->core_wt = wl->part[PART_CORE].wt;
+  }
+  mw->nr_parts = NR_PARTS;
+  mw->dep_word_hash = wl->dep_word_hash;
+  mw->mw_features = wl->mw_features;
+  anthy_commit_meta_word(sc, mw);
+}
+
+/*
+ * Create the metawords that consist of a single word_list, for every
+ * word_list at every character position of the context.
+ */
+static void
+make_metaword_from_word_list(struct splitter_context *sc)
+{
+  int i;
+  for (i = 0; i < sc->char_count; i++) {
+    struct word_list *wl;
+    for (wl = sc->word_split_info->cnode[i].wl;
+         wl; wl = wl->next) {
+      if (wl->is_compound) {
+        make_compound_part_metaword(sc, wl);
+        make_compound_metaword(sc, wl);
+      } else {
+        make_simple_metaword(sc, wl);
+      }
+    }
+  }
+}
+
+/*
+ * Join metawords list-style: mw2 is first wrapped in a node of the same
+ * type with a NULL tail, then mw is consed in front of it.  Both new
+ * nodes are committed by anthy_do_cons_metaword().  Note that
+ * mw_features of the result is assigned only after that commit.
+ */
+static struct meta_word *
+list_metaword(struct splitter_context *sc,
+              enum metaword_type type,
+              struct meta_word *mw, struct meta_word *mw2)
+{
+  struct meta_word *wrapped_mw = anthy_do_cons_metaword(sc, type, mw2, NULL);
+  struct meta_word *n = anthy_do_cons_metaword(sc, type, mw, wrapped_mw);
+
+  n->mw_features = mw->mw_features | mw2->mw_features;
+
+  return n;
+}
+
+/*
+ * Verb in renyou (continuative) form + adjectivizing suffix,
+ * e.g. "...shi-yasui" (〜しやすい).  Both metawords need a word_list.
+ */
+static void
+try_combine_v_renyou_a(struct splitter_context *sc,
+                       struct meta_word *mw, struct meta_word *mw2)
+{
+  wtype_t w2;
+  if (!mw->wl || !mw2->wl) return;
+
+  w2 = mw2->wl->part[PART_CORE].wt;
+
+  if (mw->wl->head_pos == POS_V &&
+      mw->wl->tail_ct == CT_RENYOU &&
+      anthy_wtype_get_pos(w2) == POS_D2KY) {
+    /* it is an adjective, so go on to the next check */
+    if (anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq,
+                                     anthy_wtype_a_tail_of_v_renyou)) {
+      list_metaword(sc, MW_V_RENYOU_A, mw, mw2);
+    }
+  }
+}
+
+/*
+ * Verb in renyou form + nominalizing suffix (#D2T35),
+ * e.g. "ire-tate (no ocha)" (入れ たて(のお茶)).
+ * NOTE(review): the tag above says T35 while the code tests SCOS_T40 --
+ * confirm which one is intended.
+ */
+static void
+try_combine_v_renyou_noun(struct splitter_context *sc,
+                          struct meta_word *mw, struct meta_word *mw2)
+{
+  wtype_t w2;
+  if (!mw->wl || !mw2->wl) return;
+
+  w2 = mw2->wl->part[PART_CORE].wt;
+  if (mw->wl->head_pos == POS_V &&
+      mw->wl->tail_ct == CT_RENYOU &&
+      anthy_wtype_get_pos(w2) == POS_NOUN &&
+      anthy_wtype_get_scos(w2) == SCOS_T40) {
+    list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2);
+  }
+}
+
+/*
+ * Combine numerals.  mw1 is dereferenced through its word_list, so the
+ * caller must make sure mw1->wl is set (try_combine_metaword() does).
+ */
+static void
+try_combine_number(struct splitter_context *sc,
+                   struct meta_word *mw1, struct meta_word *mw2)
+{
+  struct word_list *wl1 = mw1->wl;
+  struct word_list *wl2 = mw2->wl;
+  struct meta_word *combined_mw;
+  int recursive = wl2 ?
0 : 1; /* 1 when the right-hand mw is itself a combined mw (it has no word_list) */
+
+  /* the left mw must be a numeral */
+
+  if (anthy_wtype_get_pos(wl1->part[PART_CORE].wt) != POS_NUMBER) return;
+  if (recursive) {
+    /* the right mw is an mw made by combining numerals */
+    if (mw2->type != MW_NUMBER) return;
+    wl2 = mw2->mw1->wl; /* judge by the first numeral of the chain */
+  } else {
+    /* the right mw is a numeral */
+    if (anthy_wtype_get_pos(wl2->part[PART_CORE].wt) != POS_NUMBER) return;
+  }
+  /* only when nothing is attached after the left mw */
+  if (wl1->part[PART_POSTFIX].len == 0 &&
+      wl1->part[PART_DEPWORD].len == 0) {
+    int scos1 = anthy_wtype_get_scos(wl1->part[PART_CORE].wt);
+    int scos2 = anthy_wtype_get_scos(wl2->part[PART_CORE].wt);
+
+    /* #NN is out of scope */
+    if (scos2 == SCOS_NONE) return;
+    /*
+       Which kinds of right mw may follow depends on the kind of the left mw.
+       For example, after 1..9 only the 10^4 group (万..九万) and the
+       10^8 group (億..九億) may follow, whereas after 10..90 the
+       units 1..9 may follow as well.
+     */
+    switch (scos1) {
+    case SCOS_N1:
+      if (scos2 == SCOS_N1) return; /* 1..9 must not follow */ /* fallthrough */
+    case SCOS_N10:
+      if (scos2 == SCOS_N10) return; /* 10..90 must not follow */ /* fallthrough */
+    case SCOS_N100:
+      if (scos2 == SCOS_N100) return; /* 100..900 must not follow */ /* fallthrough */
+    case SCOS_N1000:
+      if (scos2 == SCOS_N1000) return; /* 1000..9000 must not follow */ /* fallthrough */
+    case SCOS_N10000:
+      /* the 10^4 group, the 10^8 group and so on
+         may always follow */
+      break;
+    default:
+      return;
+    }
+
+    if (recursive) {
+      combined_mw = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2);
+    } else {
+      /* combining for the first time: append a NULL tail and make it a list */
+      combined_mw = list_metaword(sc, MW_NUMBER, mw1, mw2);
+    }
+    combine_metaword(sc, combined_mw); /* keep extending the number to the left */
+  }
+}
+
+/* Check whether mw1 can be combined with mw2, the metaword immediately
+ * to its right, and create the combined metawords when it can. */
+static void
+try_combine_metaword(struct splitter_context *sc,
+                     struct meta_word *mw1, struct meta_word *mw2)
+{
+  if (!mw1->wl) return;
+
+  /* for metawords to be combined, the following
+     metaword must not have a prefix */
+  if (mw2->wl && mw2->wl->part[PART_PREFIX].len > 0) {
+    return;
+  }
+
+  try_combine_v_renyou_a(sc, mw1, mw2);
+  try_combine_v_renyou_noun(sc, mw1, mw2);
+  try_combine_number(sc, mw1, mw2);
+}
+/* Try to combine mw with every metaword that ends exactly where mw starts. */
+static void
+combine_metaword(struct splitter_context *sc, struct meta_word *mw)
+{
+  struct word_split_info_cache *info = sc->word_split_info;
+  int i;
+
+  if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
+
/* 付属語だけの文節とは結合しない */ + return; + } + + for (i = mw->from - 1; i >= 0; i--) { + struct meta_word *mw_left; + for (mw_left = info->cnode[i].mw; mw_left; mw_left = mw_left->next) { + if (mw_left->from + mw_left->len == mw->from) { + /* 結合できるかチェック */ + try_combine_metaword(sc, mw_left, mw); + } + } + } +} + +static void +combine_metaword_all(struct splitter_context *sc) +{ + int i; + + struct word_split_info_cache *info = sc->word_split_info; + /* metawordの左端によるループ */ + for (i = sc->char_count - 1; i >= 0; i--){ + struct meta_word *mw; + /* 各metawordのループ */ + for (mw = info->cnode[i].mw; + mw; mw = mw->next) { + combine_metaword(sc, mw); + } + } +} + +static void +make_dummy_metaword(struct splitter_context *sc, int from, + int len, int orig_len) +{ + int score = 0; + struct meta_word *mw, *n; + + for (mw = sc->word_split_info->cnode[from].mw; mw; mw = mw->next) { + if (mw->len != orig_len) continue; + if (mw->score > score) { + score = mw->score; + } + } + + n = alloc_metaword(sc); + n->type = MW_DUMMY; + n->from = from; + n->len = len; + n->score = 3 * score * len / orig_len; + if (mw) { + mw->nr_parts = 0; + } + anthy_commit_meta_word(sc, n); +} + +/* + * 文節を伸ばしたらそれを覚えておく + */ +static void +make_expanded_metaword_all(struct splitter_context *sc) +{ + int i, j; + if (anthy_select_section("EXPANDPAIR", 0) == -1) { + return ; + } + for (i = 0; i < sc->char_count; i++) { + for (j = 1; j < sc->char_count - i; j++) { + /* 全ての部分文字列に対して */ + xstr xs; + xs.len = j; + xs.str = sc->ce[i].c; + if (anthy_select_row(&xs, 0) == 0) { + /* この部分文字列は過去に拡大の対象となった */ + int k; + int nr = anthy_get_nr_values(); + for (k = 0; k < nr; k++) { + xstr *exs; + exs = anthy_get_nth_xstr(k); + if (exs && exs->len <= sc->char_count - i) { + xstr txs; + txs.str = sc->ce[i].c; + txs.len = exs->len; + if (!anthy_xstrcmp(&txs, exs)) { + make_dummy_metaword(sc, i, txs.len, j); + } + } + } + } + } + } +} + +/* お茶入れ学習のmetawordを作る */ +static void +make_ochaire_metaword(struct splitter_context *sc, + int 
from, int len) +{ + struct meta_word *mw; + int count; + int s; + int j; + int seg_len; + int mw_len = 0; /* accumulated below but never read in this function */ + xstr* xs; + + (void)len; + + /* Get the number of segments */ + count = anthy_get_nth_value(0); + /* Sum the character counts of all segments except the rightmost one */ + for (s = 0, j = 0; j < count - 1; j++) { + s += anthy_get_nth_value(j * 2 + 1); + } + /* Build the metaword of the rightmost segment */ + xs = anthy_get_nth_xstr((count - 1) * 2 + 2); + if (!xs) { + return ; + } + seg_len = anthy_get_nth_value((count - 1) * 2 + 1); + mw = alloc_metaword(sc); + mw->type = MW_OCHAIRE; + mw->from = from + s; + mw->len = seg_len; + mw->score = OCHAIRE_SCORE; + mw->cand_hint.str = malloc(sizeof(xchar)*xs->len); /* NOTE(review): malloc result is not checked before the copy below */ + anthy_xstrcpy(&mw->cand_hint, xs); + anthy_commit_meta_word(sc, mw); + mw_len += seg_len; + /* Build metawords for the remaining segments */ + for (j-- ; j >= 0; j--) { + struct meta_word *n; + seg_len = anthy_get_nth_value(j * 2 + 1); + s -= seg_len; + xs = anthy_get_nth_xstr(j * 2 + 2); + if (!xs) { + return ; + } + n = alloc_metaword(sc); + n->type = MW_OCHAIRE; + /* Link the metaword on the right */ + n->mw1 = mw; + n->from = from + s; + n->len = seg_len; + n->score = OCHAIRE_SCORE; + n->cand_hint.str = malloc(sizeof(xchar)*xs->len); /* NOTE(review): malloc result is not checked before the copy below */ + anthy_xstrcpy(&n->cand_hint, xs); + anthy_commit_meta_word(sc, n); + mw = n; + mw_len += seg_len; + } +} + +/* + * Search the history ("OCHAIRE" section) for a recorded group of + * segments; metawords are built for the first start position that has a + * matching row, then the search stops. + */ +static void +make_ochaire_metaword_all(struct splitter_context *sc) +{ + int i; + if (anthy_select_section("OCHAIRE", 0) == -1) { + return ; + } + + for (i = 0; i < sc->char_count; i++) { + xstr xs; + xs.len = sc->char_count - i; + xs.str = sc->ce[i].c; + if (anthy_select_longest_row(&xs) == 0) { + xstr* key; + int len; + anthy_mark_row_used(); + key = anthy_get_index_xstr(); + len = key->len; + + make_ochaire_metaword(sc, i, len); + /* Start from the character after the meta_word found this time. NOTE(review): the break below ends the loop, so this adjustment of i has no effect */ + i += len - 1; + break; + } + } +} + /* Commit a one-character MW_SINGLE meta_word at `from` with score 1 and class SEG_BUNSETSU. */ +static void +add_dummy_metaword(struct splitter_context *sc, + int from) +{ + struct meta_word *n; + n = alloc_metaword(sc); + n->from = from; + n->len = 1; + n->type = MW_SINGLE; + n->score = 1; +
n->seg_class = SEG_BUNSETSU; + anthy_commit_meta_word(sc, n); +} + +/* Wrap the given metaword and create a meta_word that is j characters longer. With mw == NULL an MW_SINGLE meta_word with score 1 is made instead. When destroy_seg_class is set the class becomes SEG_BUNSETSU and the score is divided by 10. */ +static void +expand_meta_word(struct splitter_context *sc, + struct meta_word *mw, int from, int len, + int destroy_seg_class, int j) +{ + struct meta_word *n; + n = alloc_metaword(sc); + n->from = from; + n->len = len + j; + if (mw) { + n->type = MW_WRAP; + n->mw1 = mw; + n->score = mw->score; + n->nr_parts = mw->nr_parts; + if (destroy_seg_class) { + n->seg_class = SEG_BUNSETSU; + n->score /= 10; + } else { + n->seg_class = mw->seg_class; + } + } else { + n->type = MW_SINGLE; + n->score = 1; + n->seg_class = SEG_BUNSETSU; + } + anthy_commit_meta_word(sc, n); +} + +/* + * Build a metaword that has the miscellaneous characters following the + * metaword attached to it. mw may be NULL, in which case the run of + * characters at the very start of the sentence is handled. + */ +static void +make_metaword_with_depchar(struct splitter_context *sc, + struct meta_word *mw) +{ + int j; + int destroy_seg_class = 0; + int from = mw ? mw->from : 0; + int len = mw ? mw->len : 0; + + /* Examine the type of the character right after the metaword */ + int type; + if (sc->char_count <= from + len) { + return ; + } + type = anthy_get_xchar_type(*sc->ce[from + len].c); + if (!(type & XCT_SYMBOL) && + !(type & XCT_PART)) { + return; + } + if (type & XCT_PUNCTUATION) { + /* Punctuation is made a separate segment */ + return ; + } + + /* Stop attaching as soon as a character of a different type appears */ + for (j = 0; from + len + j < sc->char_count; j++) { + int p = from + len + j; + if ((anthy_get_xchar_type(*sc->ce[p].c) != type)) { + break; + } + if (!(p + 1 < sc->char_count) || + *sc->ce[p].c != *sc->ce[p + 1].c) { + destroy_seg_class = 1; + } + } + + /* On leaving the loop above, j holds the number of characters that cannot stand alone */ + + /* There are such characters, so make a metaword with them attached */ + if (j > 0) { + expand_meta_word(sc, mw, from, len, destroy_seg_class, j); + } +} + /* Apply make_metaword_with_depchar to every meta_word, add a one-character dummy meta_word at positions that have none, and finally handle the start of the sentence. */ +static void +make_metaword_with_depchar_all(struct splitter_context *sc) +{ + int i; + struct word_split_info_cache *info = sc->word_split_info; + + /* For all metawords */ + for (i = 0; i < sc->char_count; i++) { + struct meta_word *mw; + for (mw = info->cnode[i].mw; + mw; mw = mw->next) { + make_metaword_with_depchar(sc,
mw); + } + if (!info->cnode[i].mw) { + /* No meta_word starts here: add a one-character one */ + add_dummy_metaword(sc, i); + } + } + /* The run that starts at the left end of the sentence */ + make_metaword_with_depchar(sc, NULL); +} + /* Return 1 when every character of xs except the first has XCT_PART set (always 1 for a length of 0 or 1), otherwise 0. */ +static int +is_single(xstr* xs) +{ + int i; + int xct; + for (i = xs->len - 1; i >= 1; --i) { + xct = anthy_get_xchar_type(xs->str[i]); + if (!(xct & XCT_PART)) { + return 0; + } + } + return 1; +} + /* Divide by 10 the score of every meta_word for which is_single holds, except meta_words starting at a bracket character and meta_words made only of dependent words. */ +static void +bias_to_single_char_metaword(struct splitter_context *sc) +{ + int i; + + for (i = sc->char_count - 1; i >= 0; --i) { + struct meta_word *mw; + xstr xs; + int xct; + + struct char_node *cnode = &sc->word_split_info->cnode[i]; + + /* A bracket may form a segment with a single character */ + xct = anthy_get_xchar_type(*sc->ce[i].c); + if (xct & (XCT_OPEN|XCT_CLOSE)) { + continue; + } + + xs.str = sc->ce[i].c; + for (mw = cnode->mw; mw; mw = mw->next) { + /* Segments made only of dependent words are not penalised */ + if (mw->mw_features & MW_FEATURE_DEP_ONLY) { + continue; + } + /* Lower the score of a single character (plus repeated characters that attach to the preceding one) */ + xs.len = mw->len; + if (is_single(&xs)) { + mw->score /= 10; + } + } + } +} + /* Record in info->seg_border / info->best_mw the borders implied by mw, recursing into mw1 (and mw2) for composite types. Does nothing when mw is NULL. */ +void +anthy_mark_border_by_metaword(struct splitter_context* sc, + struct meta_word* mw) +{ + struct word_split_info_cache* info = sc->word_split_info; + if (!mw) return; + + switch (mw->type) { + case MW_DUMMY: + /* FALL THROUGH */ + case MW_SINGLE: + /* FALL THROUGH */ + case MW_COMPOUND_PART: + info->seg_border[mw->from] = 1; + break; + case MW_COMPOUND_LEAF: + info->seg_border[mw->from] = 1; + info->best_mw[mw->from] = mw; + mw->can_use = ok; + break; + case MW_COMPOUND_HEAD: + /* FALL THROUGH */ + case MW_COMPOUND: + /* FALL THROUGH */ + case MW_NUMBER: /* NOTE(review): mw->mw1 is dereferenced below without a NULL check - confirm it is always set for these types */ + info->best_mw[mw->mw1->from] = mw->mw1; + anthy_mark_border_by_metaword(sc, mw->mw1); + anthy_mark_border_by_metaword(sc, mw->mw2); + break; + case MW_V_RENYOU_A: + /* FALL THROUGH */ + case MW_V_RENYOU_NOUN: + info->seg_border[mw->from] = 1; + break; + case MW_WRAP: + anthy_mark_border_by_metaword(sc, mw->mw1); + break; + case MW_OCHAIRE: + info->seg_border[mw->from] = 1; + anthy_mark_border_by_metaword(sc, mw->mw1); + break; +
default: + break; + } +} + /* Build all meta_words for the context by running each construction pass in a fixed order. */ +void +anthy_make_metaword_all(struct splitter_context *sc) +{ + /* First, make metawords from single word_lists */ + make_metaword_from_word_list(sc); + + /* Combine metawords */ + combine_metaword_all(sc); + + /* Process segments that were expanded */ + make_expanded_metaword_all(sc); + + /* Handle marks such as voiced-sound and long-vowel marks, and other symbols */ + make_metaword_with_depchar_all(sc); + + /* OCHAIRE learning */ + make_ochaire_metaword_all(sc); + + /* Penalise single-character segments */ + bias_to_single_char_metaword(sc); +} + +/* + * Count the usable metawords (can_use == ok) that start at `from` and + * have exactly length `len`. + */ +int +anthy_get_nr_metaword(struct splitter_context *sc, + int from, int len) +{ + struct meta_word *mw; + int n; + + for (n = 0, mw = sc->word_split_info->cnode[from].mw; + mw; mw = mw->next) { + if (mw->len == len && mw->can_use == ok) { + n++; + } + } + return n; +} + /* Return the nth (0-based) usable metaword that starts at `from` with length `len`, in list order, or NULL when there are not that many. */ +struct meta_word * +anthy_get_nth_metaword(struct splitter_context *sc, + int from, int len, int nth) +{ + struct meta_word *mw; + int n; + for (n = 0, mw = sc->word_split_info->cnode[from].mw; + mw; mw = mw->next) { + if (mw->len == len && mw->can_use == ok) { + if (n == nth) { + return mw; + } + n++; + } + } + return NULL; +} diff --git a/src-splitter/segclass.c b/src-splitter/segclass.c new file mode 100644 index 0000000..6b440a4 --- /dev/null +++ b/src-splitter/segclass.c @@ -0,0 +1,130 @@ +#include <string.h> + +#include <anthy/splitter.h> +#include <anthy/wtype.h> +#include <anthy/segclass.h> +#include "wordborder.h" + /* Name and short symbol of each segment class; the table is indexed by enum seg_class in the accessors below and ends with a {NULL, NULL} sentinel. */ +static struct { + const char *name; + const char *sym; +} seg_class_tab[] = { + {"文頭", "H"}, {"文末", "T"}, {"文節", "B"}, + {"接続語", "C"}, {"名詞+格助詞", "Nk"}, {"名詞+終端", "Ne"}, + {"動詞+付属語", "Vf"}, {"動詞+終端", "Ve"}, {"形容詞", "A"}, + {"形容動詞", "AJV"}, + {"連用修飾", "YM"}, {"連体修飾", "TM"}, + {"名詞", "N"}, {"名詞+付属語", "Nf"}, {"名詞+連用", "Ny"}, + {"動詞+連用", "Vy"}, + {"動詞+連体", "Vt"}, + {NULL, NULL} +}; + /* Set wl->seg_class from the head part of speech (wl->head_pos) and the dependent word class of the word_list. Does nothing when wl is NULL. */ +void +anthy_set_seg_class(struct word_list* wl) +{ + int head_pos; + enum dep_class dc; + enum seg_class seg_class; + + if (!wl) return; + + head_pos = wl->head_pos; + dc = wl->part[PART_DEPWORD].dc; + seg_class =
SEG_HEAD; + + if (wl->part[PART_CORE].len == 0) { /* no core part: plain segment */ + seg_class = SEG_BUNSETSU; + } else { + switch (head_pos) { + case POS_NOUN: + case POS_NUMBER: + /* FALL THROUGH */ + case POS_N2T: + if (dc == DEP_RAW) { + seg_class = SEG_MEISHI; + } else if (dc == DEP_END) { + seg_class = SEG_MEISHI_SHUTAN; + } else if (dc == DEP_RENYOU) { + seg_class = SEG_MEISHI_RENYOU; + } else if (dc == DEP_KAKUJOSHI) { + seg_class = SEG_MEISHI_KAKUJOSHI; + } else { + seg_class = SEG_MEISHI_FUZOKUGO; + } + break; + case POS_V: + if (dc == DEP_RAW) { + seg_class = SEG_BUNSETSU; + } else if (dc == DEP_END) { + seg_class = SEG_DOUSHI_SHUTAN; + } else if (dc == DEP_RENYOU) { + seg_class = SEG_DOUSHI_RENYOU; + } else if (dc == DEP_RENTAI) { + seg_class = SEG_DOUSHI_RENTAI; + } else { + seg_class = SEG_DOUSHI_FUZOKUGO; + } + break; + case POS_D2KY: + /* FALL THROUGH */ + case POS_A: + seg_class = SEG_KEIYOUSHI; + if (dc == DEP_RENYOU) { + seg_class = SEG_RENYOU_SHUSHOKU; + } else if (dc == DEP_RENTAI) { + seg_class = SEG_RENTAI_SHUSHOKU; + } + break; + case POS_AJV: + seg_class = SEG_KEIYOUDOUSHI; + if (dc == DEP_RENYOU) { + seg_class = SEG_RENYOU_SHUSHOKU; + } else if (dc == DEP_RENTAI) { + seg_class = SEG_RENTAI_SHUSHOKU; + } + break; + case POS_AV: + seg_class = SEG_RENYOU_SHUSHOKU; + break; + case POS_ME: + seg_class = SEG_RENTAI_SHUSHOKU; + break; + case POS_CONJ: + seg_class = SEG_SETSUZOKUGO; + break; + case POS_OPEN: + seg_class = SEG_BUNSETSU; + break; + case POS_CLOSE: + seg_class = SEG_BUNSETSU; + break; + default: + seg_class = SEG_MEISHI; + break; + } + } + wl->seg_class = seg_class; +} + /* Return the name stored in seg_class_tab for sc. NOTE(review): sc indexes the table without a range check. */ +const char* anthy_seg_class_name(enum seg_class sc) +{ + return seg_class_tab[sc].name; +} + /* Return the short symbol stored in seg_class_tab for sc. NOTE(review): sc indexes the table without a range check. */ +const char* anthy_seg_class_sym(enum seg_class sc) +{ + return seg_class_tab[sc].sym; +} + /* Return the index of the seg_class_tab entry whose name equals `name`, or SEG_BUNSETSU when no entry matches. */ +enum seg_class +anthy_seg_class_by_name(const char *name) +{ + int i; + for (i = 0; seg_class_tab[i].name; i++) { + if (!strcmp(seg_class_tab[i].name, name)) { + return i; + } + } + return SEG_BUNSETSU; +} diff
--git a/src-splitter/splitter.c b/src-splitter/splitter.c new file mode 100644 index 0000000..75ace2b --- /dev/null +++ b/src-splitter/splitter.c @@ -0,0 +1,329 @@ +/* + * 文を文節にsplitするsplitter + * + * 文節の境界を検出する + * anthy_init_split_context() 分割用のコンテキストを作って + * anthy_mark_border() 分割をして + * anthy_release_split_context() コンテキストを解放する + * + * anthy_commit_border() コミットされた内容に対して学習をする + * + * Funded by IPA未踏ソフトウェア創造事業 2001 9/22 + * + * Copyright (C) 2004 YOSHIDA Yuichi + * Copyright (C) 2000-2004 TABATA Yusuke + * Copyright (C) 2000-2001 UGAWA Tomoharu + * + * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $ + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <string.h> + +#include <anthy/alloc.h> +#include <anthy/record.h> +#include <anthy/splitter.h> +#include <anthy/logger.h> +#include "wordborder.h" + +#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000 + +static int splitter_debug_flags; + +/* Word types shared by the splitter; assigned in anthy_init_splitter */ +wtype_t anthy_wtype_noun; +wtype_t anthy_wtype_name_noun; +wtype_t anthy_wtype_num_noun; +wtype_t anthy_wtype_prefix; +wtype_t anthy_wtype_num_prefix; +wtype_t anthy_wtype_num_postfix; +wtype_t anthy_wtype_name_postfix; +wtype_t anthy_wtype_sv_postfix; +wtype_t anthy_wtype_a_tail_of_v_renyou; +wtype_t anthy_wtype_v_renyou; +wtype_t anthy_wtype_noun_tail;/* e.g. the "tate" of "ire-tate" */ +wtype_t anthy_wtype_n1; +wtype_t anthy_wtype_n10; + + +/** Release the segment information cache: both allocators, the cnode, + seq_len and rev_seq_len arrays, and the cache structure itself */ +static void +release_info_cache(struct splitter_context *sc) +{ + struct word_split_info_cache *info = sc->word_split_info; + + anthy_free_allocator(info->MwAllocator); + anthy_free_allocator(info->WlAllocator); + free(info->cnode); + free(info->seq_len); + free(info->rev_seq_len); + free(info); +} + /* Destructor for meta_word objects: frees cand_hint.str when it is set. */ +static void +metaword_dtor(void *p) +{ + struct meta_word *mw = (struct meta_word*)p; + if (mw->cand_hint.str) { + free(mw->cand_hint.str); + } +} + /* Allocate and initialise sc->ce with xs->len + 1 entries, each pointing at the corresponding character of xs. NOTE(review): the malloc result is not checked. */ + +static void +alloc_char_ent(xstr *xs, struct splitter_context *sc) +{ + int i; + + sc->char_count = xs->len; + sc->ce = (struct char_ent*) + malloc(sizeof(struct char_ent)*(xs->len + 1)); + for (i = 0; i <= xs->len; i++) { + sc->ce[i].c = &xs->str[i]; + sc->ce[i].seg_border = 0; + sc->ce[i].initial_seg_len = 0; + sc->ce[i].best_seg_class = SEG_HEAD; + sc->ce[i].best_mw = NULL; + } + + /* Both the left and the right end are segment borders */ + sc->ce[0].seg_border = 1; + sc->ce[xs->len].seg_border = 1; +} + +/* What is allocated here is released by release_info_cache + */ +static void +alloc_info_cache(struct splitter_context *sc) +{ + int i; +
struct word_split_info_cache *info; + + /* Allocate the cache data. NOTE(review): none of the malloc results below are checked. */ + sc->word_split_info = malloc(sizeof(struct word_split_info_cache)); + info = sc->word_split_info; + info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor); + info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0); + info->cnode = + malloc(sizeof(struct char_node) * (sc->char_count + 1)); + + info->seq_len = malloc(sizeof(int) * (sc->char_count + 1)); + info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1)); + + /* Initialise the entry for each character index */ + for (i = 0; i <= sc->char_count; i++) { + info->seq_len[i] = 0; + info->rev_seq_len[i] = 0; + info->cnode[i].wl = NULL; + info->cnode[i].mw = NULL; + info->cnode[i].max_len = 0; + } +} + +/** Top-level function of the word splitter, called from outside. + Copies the per-character border state into temporary arrays, lets + anthy_eval_border decide the borders, then copies the entries in + [from, to) back into sc->ce. Does nothing when to <= from. */ +void +anthy_mark_border(struct splitter_context *sc, + int from, int from2, int to) +{ + int i; + struct word_split_info_cache *info; + + /* sanity check */ + if ((to - from) <= 0) { + return ; + } + + /* Allocate areas for the border marks and for the classes used by the lattice search. NOTE(review): these come from alloca, so the pointers stored in info stop being valid when this function returns and nothing here resets them. */ + info = sc->word_split_info; + info->seg_border = alloca(sizeof(int)*(sc->char_count + 1)); + info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1)); + info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1)); + for (i = 0; i < sc->char_count + 1; ++i) { + info->seg_border[i] = sc->ce[i].seg_border; + info->best_seg_class[i] = sc->ce[i].best_seg_class; + info->best_mw[i] = sc->ce[i].best_mw; + } + + /* Decide the borders */ + anthy_eval_border(sc, from, from2, to); + + for (i = from; i < to; ++i) { + sc->ce[i].seg_border = info->seg_border[i]; + sc->ce[i].best_seg_class = info->best_seg_class[i]; + sc->ce[i].best_mw = info->best_mw[i]; + } +} + +/* A segment was expanded, so learn it: under the row keyed by the segment's initial text in the "EXPANDPAIR" section, append the expanded text unless it is already stored. */ +static void +proc_expanded_segment(struct splitter_context *sc, + int from, int len) +{ + int initial_len = sc->ce[from].initial_seg_len; + int i, nr; + xstr from_xs, to_xs, *xs; + + from_xs.str = sc->ce[from].c; + from_xs.len = initial_len; + to_xs.str =
sc->ce[from].c; + to_xs.len = len; + if (anthy_select_section("EXPANDPAIR", 1) == -1) { + return ; + } + if (anthy_select_row(&from_xs, 1) == -1) { + return ; + } + nr = anthy_get_nr_values(); + for (i = 0; i < nr; i ++) { + xs = anthy_get_nth_xstr(i); + if (!xs || !anthy_xstrcmp(xs, &to_xs)) { + /* Already present (or the stored value could not be read) */ + return ; + } + } + anthy_set_nth_xstr(nr, &to_xs); + anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT); +} + +/* Learn segment merges and word endings: walk the committed segments and call proc_expanded_segment for each one whose metaword is longer than the segment's initial length and whose committed length covers the neighbouring initial segment. */ +void +anthy_commit_border(struct splitter_context *sc, int nr_segments, + struct meta_word **mw, int *seg_len) +{ + int i, from = 0; + + /* Expanded segments */ + for (i = 0; i < nr_segments; i++) { + /* For each segment */ + + int len = seg_len[i]; + int initial_len = sc->ce[from].initial_seg_len; + int real_len = 0; + int l2; + + if (!initial_len || from + initial_len == sc->char_count) { + /* That is not a border */ + goto tail; + } + l2 = sc->ce[from + initial_len].initial_seg_len; + if (initial_len + l2 > len) { + /* Not expanded far enough to include the neighbouring segment */ + goto tail; + } + if (mw[i]) { + real_len = mw[i]->len; + } + if (real_len <= initial_len) { + goto tail; + } + /* A segment expanded to a length that includes the segment on the right was committed */ + proc_expanded_segment(sc, from, real_len); + tail: + from += len; + } +} + /* Return the debug flags computed by anthy_init_splitter. */ +int +anthy_splitter_debug_flags(void) +{ + return splitter_debug_flags; +} + /* Set up sc for the string xs: allocate the per-character entries and the info cache, then build all word_lists (with the dictionary locked) and all metawords. */ +void +anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse) +{ + alloc_char_ent(xs, sc); + alloc_info_cache(sc); + sc->is_reverse = is_reverse; + /* Check every substring and enumerate the segment candidates; + the word_lists are built first, then the metawords */ + anthy_lock_dic(); + anthy_make_word_list_all(sc); + anthy_unlock_dic(); + anthy_make_metaword_all(sc); + +} + /* Free the info cache and the per-character entries of sc and clear both pointers; each is skipped when already null. */ +void +anthy_release_split_context(struct splitter_context *sc) +{ + if (sc->word_split_info) { + release_info_cache(sc); + sc->word_split_info = 0; + } + if (sc->ce) { + free(sc->ce); + sc->ce = 0; + } +} + +/** Initialise the whole splitter. Returns 0 on success, or -1 when the + dependent word table cannot be initialised. */ +int +anthy_init_splitter(void) +{ + /* Debug print settings */ + char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT"); + char *dis =
getenv("ANTHY_DISABLE_DEBUG_PRINT"); + splitter_debug_flags = SPLITTER_DEBUG_NONE; + if (!dis && en && strlen(en)) { /* only when printing is enabled, non-empty, and not disabled */ + char *fs = getenv("ANTHY_SPLITTER_PRINT"); + if (fs) { /* each letter present in the value switches on one flag */ + if (strchr(fs, 'w')) { + splitter_debug_flags |= SPLITTER_DEBUG_WL; + } + if (strchr(fs, 'm')) { + splitter_debug_flags |= SPLITTER_DEBUG_MW; + } + if (strchr(fs, 'l')) { + splitter_debug_flags |= SPLITTER_DEBUG_LN; + } + if (strchr(fs, 'i')) { + splitter_debug_flags |= SPLITTER_DEBUG_ID; + } + if (strchr(fs, 'c')) { + splitter_debug_flags |= SPLITTER_DEBUG_CAND; + } + } + } + /* Initialise the dependent word graph */ + if (anthy_init_depword_tab()) { + anthy_log(0, "Failed to init dependent word table.\n"); + return -1; + } + /* Look up by name the word types used by the splitter */ + anthy_wtype_noun = anthy_init_wtype_by_name("名詞35"); + anthy_wtype_name_noun = anthy_init_wtype_by_name("人名"); + anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞"); + anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語"); + anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形"); + anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語"); + anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞"); + anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞"); + anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞"); + anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞"); + anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞"); + anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1"); + anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10"); + return 0; +} + /* Shut the splitter down by releasing the dependent word table. */ +void +anthy_quit_splitter(void) +{ + anthy_quit_depword_tab(); +} diff --git a/src-splitter/wordborder.h b/src-splitter/wordborder.h new file mode 100644 index 0000000..1b93727 --- /dev/null +++ b/src-splitter/wordborder.h @@ -0,0 +1,210 @@ +/* Data used for detecting segment borders */ +#ifndef _wordborder_h_included_ +#define _wordborder_h_included_ + + +#include <anthy/dic.h> +#include <anthy/alloc.h> +#include <anthy/segclass.h> +#include <anthy/depgraph.h> + +struct splitter_context; + +/* + * How to check whether a meta_word can be used + */ +enum
mw_check { + /* Do nothing */ + MW_CHECK_NONE, + /* When mw->wl is absent or the wl is usable */ + MW_CHECK_SINGLE, + MW_CHECK_BORDER, + MW_CHECK_WRAP, + MW_CHECK_OCHAIRE, + MW_CHECK_NUMBER, + MW_CHECK_COMPOUND +}; + +/* + * Represents one position in the string and + * holds the sets of meta_words and word_lists that start there + */ +struct char_node { + int max_len; + struct meta_word *mw; + struct word_list *wl; +}; + +/* + * Information about the independent words etc. in the context; it is + * built when the conversion key is first pressed + */ +struct word_split_info_cache { + struct char_node *cnode; + + /* Information used while building the cache */ + /* Used to look for postfixes */ + int *seq_len;/* length of the longest word starting there */ + /* Used to look for prefixes */ + int *rev_seq_len;/* length of the longest word ending there */ + /* Copy of the segment borders from the context */ + int *seg_border; + /* Class that scored best in the search */ + enum seg_class* best_seg_class; + /* Best meta_word per position; filled by anthy_mark_border and anthy_mark_border_by_metaword */ + struct meta_word **best_mw; + /* Allocators */ + allocator MwAllocator, WlAllocator; +}; + +/* + * State of a meta_word + */ +enum mw_status { + MW_STATUS_NONE, + /* The content is held in mw->mw1 */ + MW_STATUS_WRAPPED, + /* Joined from mw->mw1 and mw->mw2 */ + MW_STATUS_COMBINED, + /* For compound words */ + MW_STATUS_COMPOUND, + /* The individual segments of a compound word joined and seen as one segment */ + MW_STATUS_COMPOUND_PART, + /* Taken from OCHAIRE learning */ + MW_STATUS_OCHAIRE +}; + + + +/* Differences in processing by metaword type (metaword.c) */ +extern struct metaword_type_tab_ { + enum metaword_type type; + const char *name; + enum mw_status status; + enum mw_check check; +} anthy_metaword_type_tab[]; + +/* + * 0: prefix + * 1: independent word (core) + * 2: postfix + * 3: dependent words + */ +#define NR_PARTS 4 +#define PART_PREFIX 0 +#define PART_CORE 1 +#define PART_POSTFIX 2 +#define PART_DEPWORD 3 + +struct part_info { + /* Start and length of this part */ + int from, len; + /* Part of speech */ + wtype_t wt; + seq_ent_t seq; + /* Frequency */ + int freq; + /* Dependent word class */ + enum dep_class dc; +}; + +/* + * word_list: what forms a segment + * Contains a prefix, an independent word, a postfix and dependent words + */ +struct word_list { + /**/ + int from, len; /* the whole segment */ + int is_compound; /* whether this is a compound word */ + + /**/ + int dep_word_hash; + int mw_features; + /**/ + enum seg_class seg_class; + enum constraint_stat can_use; /* does not straddle a segment border */ + + /* Information wanted for miscellaneous processing, not for obtaining kanji */ + int head_pos; /* part of speech for the lattice search */ + int tail_ct; /* 
conjugation form used when combining meta_words */ + + /**/ + int last_part; + struct part_info part[NR_PARTS]; + + /* Information from when this word_list was made */ + int node_id; /* id of the node where the dependent word graph search starts */ + + /* List of the word_lists that have the same from */ + struct word_list *next; +}; + + +/* splitter.c */ +#define SPLITTER_DEBUG_NONE 0 +/* Print word lists */ +#define SPLITTER_DEBUG_WL 1 +/* Print metawords */ +#define SPLITTER_DEBUG_MW 2 +/* Print lattice nodes */ +#define SPLITTER_DEBUG_LN 4 +/* Matched part of speech of the independent word */ +#define SPLITTER_DEBUG_ID 8 +/* Selected by the letter 'c' in ANTHY_SPLITTER_PRINT */ +#define SPLITTER_DEBUG_CAND 16 + +int anthy_splitter_debug_flags(void); + + +/* defined in wordseq.c */ +/* Handle the connections that follow the independent word */ +void anthy_scan_node(struct splitter_context *sc, + struct word_list *wl, + xstr *follow, int node); +int anthy_get_node_id_by_name(const char *name); +int anthy_init_depword_tab(void); +void anthy_quit_depword_tab(void); + +/* depgraph.c */ +int anthy_get_nr_dep_rule(void); +void anthy_get_nth_dep_rule(int, struct wordseq_rule *); + +/* defined in wordlist.c */ +void anthy_commit_word_list(struct splitter_context *, struct word_list *wl); +struct word_list *anthy_alloc_word_list(struct splitter_context *); +void anthy_print_word_list(struct splitter_context *, struct word_list *); +void anthy_make_word_list_all(struct splitter_context *); + +/* defined in metaword.c */ +void anthy_commit_meta_word(struct splitter_context *, struct meta_word *mw); +void anthy_make_metaword_all(struct splitter_context *); +void anthy_print_metaword(struct splitter_context *, struct meta_word *); + +void anthy_mark_border_by_metaword(struct splitter_context* sc, + struct meta_word* mw); + + +/* defined in evalborder.c */ +void anthy_eval_border(struct splitter_context *, int, int, int); + +/* defined in lattice.c */ +void anthy_mark_borders(struct splitter_context *sc, int from, int to); + +/* defined in segclass.c */ +void anthy_set_seg_class(struct word_list* wl); + +/* Parts of speech (initialised by anthy_init_splitter) */ +extern wtype_t anthy_wtype_noun; +extern wtype_t anthy_wtype_name_noun; +extern wtype_t
anthy_wtype_num_noun; +extern wtype_t anthy_wtype_prefix; +extern wtype_t anthy_wtype_num_prefix; +extern wtype_t anthy_wtype_num_postfix; +extern wtype_t anthy_wtype_name_postfix; +extern wtype_t anthy_wtype_sv_postfix; +extern wtype_t anthy_wtype_a_tail_of_v_renyou; +extern wtype_t anthy_wtype_v_renyou; +extern wtype_t anthy_wtype_noun_tail;/* e.g. the "tate" of "ire-tate" */ +extern wtype_t anthy_wtype_n1; +extern wtype_t anthy_wtype_n10; + +#endif diff --git a/src-splitter/wordlist.c b/src-splitter/wordlist.c new file mode 100644 index 0000000..245e0ee --- /dev/null +++ b/src-splitter/wordlist.c @@ -0,0 +1,584 @@ +/* + * Builds wordlists, the smallest units of a segment + * + * anthy_make_word_list_all() + * enumerates the substrings that satisfy the form of a segment + * word_lists enumerated through several paths are + * added to the splitter_context by anthy_commit_word_list + * + * Funded by IPA未踏ソフトウェア創造事業 2002 2/27 + * Copyright (C) 2000-2006 TABATA Yusuke + * Copyright (C) 2004-2006 YOSHIDA Yuichi + * Copyright (C) 2000-2003 UGAWA Tomoharu + * + * $Id: wordlist.c,v 1.50 2002/11/17 14:45:47 yusuke Exp $ + * + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <arpa/inet.h> + +#include <anthy/alloc.h> +#include <anthy/record.h> +#include <anthy/xstr.h> +#include <anthy/diclib.h> +#include <anthy/wtype.h> +#include <anthy/ruleparser.h> +#include <anthy/dic.h> +#include <anthy/splitter.h> +#include <anthy/feature_set.h> +#include "wordborder.h" + +#define HF_THRESH 784 + +static void *weak_word_array; + +/* For debugging: print a word_list as prefix.core.postfix-depword, followed by the core word type, the segment class name and ",compound" for compound words. Prints "--" when wl is NULL. */ +void +anthy_print_word_list(struct splitter_context *sc, + struct word_list *wl) +{ + xstr xs; + if (!wl) { + printf("--\n"); + return ; + } + /* Prefix */ + xs.len = wl->part[PART_CORE].from - wl->from; + xs.str = sc->ce[wl->from].c; + anthy_putxstr(&xs); + printf("."); + /* Independent word */ + xs.len = wl->part[PART_CORE].len; + xs.str = sc->ce[wl->part[PART_CORE].from].c; + anthy_putxstr(&xs); + printf("."); + /* Postfix */ + xs.len = wl->part[PART_POSTFIX].len; + xs.str = sc->ce[wl->part[PART_CORE].from + wl->part[PART_CORE].len].c; + anthy_putxstr(&xs);
+ printf("-"); + /* Dependent words */ + xs.len = wl->part[PART_DEPWORD].len; + xs.str = sc->ce[wl->part[PART_CORE].from + + wl->part[PART_CORE].len + + wl->part[PART_POSTFIX].len].c; + anthy_putxstr(&xs); + anthy_print_wtype(wl->part[PART_CORE].wt); + printf(" %s%s\n", anthy_seg_class_name(wl->seg_class), + (wl->is_compound ? ",compound" : "")); +} + /* Hash of a dependent word string: anthy_xstr_hash reduced modulo WORD_HASH_MAX. */ +int +anthy_dep_word_hash(xstr *xs) +{ + return anthy_xstr_hash(xs) % WORD_HASH_MAX; +} + +/** Compare two word_lists. This is used for pruning, so + the comparison does not need to be strict. + Returns 1 when they are judged the same, otherwise 0. */ +static int +word_list_same(struct word_list *wl1, struct word_list *wl2) +{ + if (wl1->node_id != wl2->node_id || + wl1->from != wl2->from || + wl1->len != wl2->len || + wl1->mw_features != wl2->mw_features || + wl1->tail_ct != wl2->tail_ct || + wl1->part[PART_CORE].len != wl2->part[PART_CORE].len || + wl1->is_compound != wl2->is_compound || + !anthy_wtype_equal(wl1->part[PART_CORE].wt, wl2->part[PART_CORE].wt) || + wl1->head_pos != wl2->head_pos) { + return 0; + } + if (wl1->part[PART_DEPWORD].dc != wl2->part[PART_DEPWORD].dc) { + return 0; + } + /* Judged to be the same */ + return 1; +} + /* OR into wl->mw_features the MW_FEATURE_* bits derived from the word_list's parts. */ +static void +set_features(struct word_list *wl) +{ + if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NOUN && + anthy_wtype_get_sv(wl->part[PART_CORE].wt)) { + wl->mw_features |= MW_FEATURE_SV; + } + if (wl->part[PART_POSTFIX].len || wl->part[PART_PREFIX].len) { /* set for a prefix as well as a postfix */ + wl->mw_features |= MW_FEATURE_SUFFIX; + } + if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NUMBER) { + wl->mw_features |= MW_FEATURE_NUM; + } + if (wl->part[PART_CORE].len == 1) { + wl->mw_features |= MW_FEATURE_CORE1; + } + if (wl->part[PART_CORE].len == 0) { + wl->mw_features |= MW_FEATURE_DEP_ONLY; + } + if (wl->part[PART_CORE].freq > HF_THRESH) { + wl->mw_features |= MW_FEATURE_HIGH_FREQ; + } +} + +/** Fill in the derived fields of a built word_list (features, segment + class, dependent word hash) and add it to the list at wl->from, + unless an equivalent word_list is already there */ +void +anthy_commit_word_list(struct splitter_context *sc, + struct word_list *wl) +{ + struct word_list *tmp; + xstr xs; + + /* word_lists made only of dependent words can arrive with length 0 */ + if (wl->len == 0) return; + /**/ +
wl->last_part = PART_DEPWORD; + + /**/ + set_features(wl); + /* Set the class used in the segment border search */ + anthy_set_seg_class(wl); + /* Hash the dependent word string, which starts right after the postfix */ + xs.len = wl->part[PART_DEPWORD].len; + xs.str = sc->ce[wl->part[PART_POSTFIX].from + wl->part[PART_POSTFIX].len].c; + wl->dep_word_hash = anthy_dep_word_hash(&xs); + if (wl->part[PART_POSTFIX].len) { /* NOTE(review): xs is set here but not read again in this function */ + xs.len = wl->part[PART_POSTFIX].len; + xs.str = sc->ce[wl->part[PART_POSTFIX].from].c; + } + + /* Check whether a word_list with the same content already exists */ + for (tmp = sc->word_split_info->cnode[wl->from].wl; tmp; tmp = tmp->next) { + if (word_list_same(tmp, wl)) { + return ; + } + } + /* Add to the list of word lists */ + wl->next = sc->word_split_info->cnode[wl->from].wl; + sc->word_split_info->cnode[wl->from].wl = wl; + + /* Debug print */ + if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_WL) { + anthy_print_word_list(sc, wl); + } +} + /* Allocate a word_list from the context's word_list allocator. */ +struct word_list * +anthy_alloc_word_list(struct splitter_context *sc) +{ + return anthy_smalloc(sc->word_split_info->WlAllocator); +} + +/* Attach the conjugation endings, particles and auxiliary verbs that follow, by scanning the dependent word graph from the template's node (or, when node_id is negative, from every matching rule). */ +static void +make_following_word_list(struct splitter_context *sc, + struct word_list *tmpl) +{ + /* This xs is the string that follows the independent word part */ + xstr xs; + xs.str = sc->ce[tmpl->from+tmpl->len].c; + xs.len = sc->char_count - tmpl->from - tmpl->len; + tmpl->part[PART_DEPWORD].from = + tmpl->part[PART_POSTFIX].from + tmpl->part[PART_POSTFIX].len; + + if (tmpl->node_id >= 0) { + /* Ordinary word_list */ + anthy_scan_node(sc, tmpl, &xs, tmpl->node_id); + } else { + /* word_list without an independent word */ + struct wordseq_rule rule; + struct word_list new_tmpl; + int i; + int nr_rule = anthy_get_nr_dep_rule(); + new_tmpl = *tmpl; + /* For the rules whose word type is a noun with SCOS_T35 */ + for (i = 0; i < nr_rule; ++i) { + anthy_get_nth_dep_rule(i, &rule); + if (anthy_wtype_get_pos(rule.wt) == POS_NOUN + && anthy_wtype_get_scos(rule.wt) == SCOS_T35) { + new_tmpl.part[PART_CORE].wt = rule.wt; + new_tmpl.node_id = rule.node_id; + new_tmpl.head_pos = anthy_wtype_get_pos(new_tmpl.part[PART_CORE].wt); + anthy_scan_node(sc, &new_tmpl, &xs, new_tmpl.node_id); + } + } + } +} + /* push_part_back: extend the template by len characters at the end as its postfix part. */ +static
void +push_part_back(struct word_list *tmpl, int len, + seq_ent_t se, wtype_t wt) +{ + tmpl->len += len; + tmpl->part[PART_POSTFIX].len += len; + tmpl->part[PART_POSTFIX].wt = wt; + tmpl->part[PART_POSTFIX].seq = se; + tmpl->last_part = PART_POSTFIX; +} + +/* 接尾辞をくっつける */ +static void +make_suc_words(struct splitter_context *sc, + struct word_list *tmpl) +{ + int i, right; + + wtype_t core_wt = tmpl->part[PART_CORE].wt; + /* 数詞、名前、サ変名詞のいずれかに付属語は付く */ + int core_is_num = 0; + int core_is_name = 0; + int core_is_sv_noun = 0; + + /* まず、接尾辞が付く自立語かチェックする */ + if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) { + core_is_num = 1; + } + if (anthy_wtype_include(anthy_wtype_name_noun, core_wt)) { + core_is_name = 1; + } + if (anthy_wtype_get_sv(core_wt)) { + core_is_sv_noun = 1; + } + if (!core_is_num && !core_is_name && !core_is_sv_noun) { + return ; + } + + right = tmpl->part[PART_CORE].from + tmpl->part[PART_CORE].len; + /* 自立語の右側の文字列に対して */ + for (i = 1; + i <= sc->word_split_info->seq_len[right]; + i++){ + xstr xs; + seq_ent_t suc; + xs.str = sc->ce[right].c; + xs.len = i; + suc = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse); + if (anthy_get_seq_ent_pos(suc, POS_SUC)) { + /* 右側の文字列は付属語なので、自立語の品詞にあわせてチェック */ + struct word_list new_tmpl; + if (core_is_num && + anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_num_postfix)) { + new_tmpl = *tmpl; + push_part_back(&new_tmpl, i, suc, anthy_wtype_num_postfix); + make_following_word_list(sc, &new_tmpl); + } + if (core_is_name && + anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_name_postfix)) { + new_tmpl = *tmpl; + push_part_back(&new_tmpl, i, suc, anthy_wtype_name_postfix); + make_following_word_list(sc, &new_tmpl); + } + if (core_is_sv_noun && + anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_sv_postfix)) { + new_tmpl = *tmpl; + push_part_back(&new_tmpl, i, suc, anthy_wtype_sv_postfix); + make_following_word_list(sc, &new_tmpl); + } + } + } +} + +static void +push_part_front(struct word_list *tmpl, int len, + seq_ent_t 
se, wtype_t wt) +{ + tmpl->from = tmpl->from - len; + tmpl->len = tmpl->len + len; + tmpl->part[PART_PREFIX].from = tmpl->from; + tmpl->part[PART_PREFIX].len += len; + tmpl->part[PART_PREFIX].wt = wt; + tmpl->part[PART_PREFIX].seq = se; +} + +/* 接頭辞をくっつけてから接尾辞をくっつける */ +static void +make_pre_words(struct splitter_context *sc, + struct word_list *tmpl) +{ + int i; + wtype_t core_wt = tmpl->part[PART_CORE].wt; + int core_is_num = 0; + /* 自立語は数詞か? */ + if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) { + core_is_num = 1; + } + /* 接頭辞を列挙する */ + for (i = 1; + i <= sc->word_split_info->rev_seq_len[tmpl->part[PART_CORE].from]; + i++) { + seq_ent_t pre; + /* このxsは自立語部の前の文字列 */ + xstr xs; + xs.str = sc->ce[tmpl->part[PART_CORE].from - i].c; + xs.len = i; + pre = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse); + if (anthy_get_seq_ent_pos(pre, POS_PRE)) { + struct word_list new_tmpl; + if (core_is_num && + anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_num_prefix)) { + new_tmpl = *tmpl; + push_part_front(&new_tmpl, i, pre, anthy_wtype_num_prefix); + make_following_word_list(sc, &new_tmpl); + /* 数の場合は接尾辞もくっつける */ + make_suc_words(sc, &new_tmpl); + }/* else if (anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_prefix)) { + new_tmpl = *tmpl; + push_part_front(&new_tmpl, i, pre, anthy_wtype_prefix); + make_following_word_list(sc, &new_tmpl); + }*/ + } + } +} + +/* wordlistを初期化する */ +static void +setup_word_list(struct word_list *wl, int from, int len, + int is_compound, int is_weak) +{ + int i; + wl->from = from; + wl->len = len; + wl->is_compound = is_compound; + /* partの配列を初期化する */ + for (i = 0; i < NR_PARTS; i++) { + wl->part[i].from = 0; + wl->part[i].len = 0; + wl->part[i].wt = anthy_wt_none; + wl->part[i].seq = 0; + wl->part[i].freq = 1;/* 頻度の低い単語としておく */ + wl->part[i].dc = DEP_NONE; + } + /* 自立語のパートを設定 */ + wl->part[PART_CORE].from = from; + wl->part[PART_CORE].len = len; + /**/ + wl->mw_features = MW_FEATURE_NONE; + wl->node_id = -1; + wl->last_part = PART_CORE; + 
wl->head_pos = POS_NONE; + wl->tail_ct = CT_NONE; + if (is_weak) { + wl->mw_features |= MW_FEATURE_WEAK_SEQ; + } +} + +/* + * ある独立語に対して、接頭辞、接尾辞、付属語を付けたものを + * 文節の候補(=word_list)としてcacheに追加する + */ +static void +make_word_list(struct splitter_context *sc, + seq_ent_t se, + int from, int len, + int is_compound, + int is_weak) +{ + struct word_list tmpl; + struct wordseq_rule rule; + int nr_rule = anthy_get_nr_dep_rule(); + int i; + + /* テンプレートの初期化 */ + setup_word_list(&tmpl, from, len, is_compound, is_weak); + tmpl.part[PART_CORE].seq = se; + + /* 各ルールにマッチするか比較 */ + for (i = 0; i < nr_rule; ++i) { + int freq; + anthy_get_nth_dep_rule(i, &rule); + if (!is_compound) { + freq = anthy_get_seq_ent_wtype_freq(se, rule.wt); + } else { + freq = anthy_get_seq_ent_wtype_compound_freq(se, rule.wt); + } + + if (freq) { + /* 自立語の品詞はそのルールにあっている */ + if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_ID) { + /* 品詞表のデバッグ用*/ + xstr xs; + xs.str = sc->ce[tmpl.part[PART_CORE].from].c; + xs.len = tmpl.part[PART_CORE].len; + anthy_putxstr(&xs); + printf(" freq=%d rule_id=%d node_id=%d\n", + freq, i, rule.node_id); + } + /* 遷移したルールの情報を転記する */ + tmpl.part[PART_CORE].wt = rule.wt; + tmpl.part[PART_CORE].freq = freq; + tmpl.node_id = rule.node_id; + tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt); + + /**/ + tmpl.part[PART_POSTFIX].from = + tmpl.part[PART_CORE].from + + tmpl.part[PART_CORE].len; + /**/ + if (anthy_wtype_get_pos(rule.wt) == POS_NOUN || + anthy_wtype_get_pos(rule.wt) == POS_NUMBER) { + /* 接頭辞、接尾辞は名詞、数詞にしか付かないことにしている */ + make_pre_words(sc, &tmpl); + make_suc_words(sc, &tmpl); + } + /* 接頭辞、接尾辞無しで助詞助動詞をつける */ + make_following_word_list(sc, &tmpl); + } + } +} + +static void +make_dummy_head(struct splitter_context *sc) +{ + struct word_list tmpl; + setup_word_list(&tmpl, 0, 0, 0, 0); + tmpl.part[PART_CORE].seq = 0; + tmpl.part[PART_CORE].wt = anthy_wtype_noun; + + tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt); + make_suc_words(sc, &tmpl); +} + +static int 
+compare_hash(const void *kp, const void *cp) +{ + const int *h = kp; + const int *c = cp; + return (*h) - ntohl(*c); +} + +static int +check_weak(xstr *xs) +{ + const int *array = (int *)weak_word_array; + int nr; + int h; + if (!array) { + return 0; + } + nr = ntohl(array[1]); + h = anthy_xstr_hash(xs); + if (bsearch(&h, &array[16], nr, + sizeof(int), compare_hash)) { + return 1; + } + return 0; +} + +/* コンテキストに設定された文字列の部分文字列から全てのword_listを列挙する */ +void +anthy_make_word_list_all(struct splitter_context *sc) +{ + int i, j; + xstr xs; + seq_ent_t se; + struct depword_ent { + struct depword_ent *next; + int from, len; + int is_compound; + int is_weak; + seq_ent_t se; + } *head, *de; + struct word_split_info_cache *info; + allocator de_ator; + + weak_word_array = anthy_file_dic_get_section("weak_words"); + + info = sc->word_split_info; + head = NULL; + de_ator = anthy_create_allocator(sizeof(struct depword_ent), 0); + + xs.str = sc->ce[0].c; + xs.len = sc->char_count; + anthy_gang_load_dic(&xs, sc->is_reverse); + + /* 全ての自立語を列挙 */ + /* 開始地点のループ */ + for (i = 0; i < sc->char_count ; i++) { + int search_len = sc->char_count - i; + int search_from = 0; + if (search_len > 30) { + search_len = 30; + } + + /* 文字列長のループ(長い方から) */ + for (j = search_len; j > search_from; j--) { + /* seq_entを取得する */ + xs.len = j; + xs.str = sc->ce[i].c; + se = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse); + + /* 単語として認識できない */ + if (!se) { + continue; + } + + /* 各、部分文字列が単語ならば接頭辞、接尾辞の + 最大長を調べてマークする */ + if (j > info->seq_len[i] && + anthy_get_seq_ent_pos(se, POS_SUC)) { + info->seq_len[i] = j; + } + if (j > info->rev_seq_len[i + j] && + anthy_get_seq_ent_pos(se, POS_PRE)) { + info->rev_seq_len[i + j] = j; + } + + /* 発見した自立語をリストに追加 */ + if (anthy_get_seq_ent_indep(se) && + /* 複合語で無い候補があることを確認 */ + anthy_has_non_compound_ents(se)) { + de = (struct depword_ent *)anthy_smalloc(de_ator); + de->from = i; + de->len = j; + de->se = se; + de->is_compound = 0; + de->is_weak = check_weak(&xs); + + 
de->next = head; + head = de; + } + /* 発見した複合語をリストに追加 */ + if (anthy_has_compound_ents(se)) { + de = (struct depword_ent *)anthy_smalloc(de_ator); + de->from = i; + de->len = j; + de->se = se; + de->is_compound = 1; + de->is_weak = 0; + + de->next = head; + head = de; + } + } + } + + /* 発見した自立語全てに対して付属語パターンの検索 */ + for (de = head; de; de = de->next) { + make_word_list(sc, de->se, de->from, de->len, + de->is_compound, de->is_weak); + } + + /* 自立語の無いword_list */ + for (i = 0; i < sc->char_count; i++) { + struct word_list tmpl; + setup_word_list(&tmpl, i, 0, 0, 0); + if (i == 0) { + make_following_word_list(sc, &tmpl); + } else { + int type = anthy_get_xchar_type(*sc->ce[i - 1].c); + if ((type & (XCT_CLOSE | XCT_SYMBOL)) && + !(type & XCT_PUNCTUATION)) { + /* 句読点以外の記号 */ + make_following_word_list(sc, &tmpl); + } + } + } + + /* 先頭に0文字の自立語を付ける */ + make_dummy_head(sc); + + anthy_free_allocator(de_ator); +} |