diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2009-02-07 16:32:56 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2009-02-07 16:32:56 +0000 |
commit | a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 (patch) | |
tree | a966aeee62e69ae3ad13275d07ddb15049b14e0e /src-ordering | |
download | anthy-master.tar.gz |
anthy-9100hHEADanthy-9100hmaster
Diffstat (limited to 'src-ordering')
-rw-r--r-- | src-ordering/Makefile.am | 8 | ||||
-rw-r--r-- | src-ordering/Makefile.in | 438 | ||||
-rw-r--r-- | src-ordering/candhistory.c | 207 | ||||
-rw-r--r-- | src-ordering/candsort.c | 278 | ||||
-rw-r--r-- | src-ordering/candswap.c | 216 | ||||
-rw-r--r-- | src-ordering/commit.c | 275 | ||||
-rw-r--r-- | src-ordering/infosort.c | 148 | ||||
-rw-r--r-- | src-ordering/relation.c | 447 | ||||
-rw-r--r-- | src-ordering/sorter.h | 18 |
9 files changed, 2035 insertions, 0 deletions
diff --git a/src-ordering/Makefile.am b/src-ordering/Makefile.am new file mode 100644 index 0000000..3b983ff --- /dev/null +++ b/src-ordering/Makefile.am @@ -0,0 +1,8 @@ +## + +EXTRA_DIST = +INCLUDES = -I$(top_srcdir)/ + +noinst_LTLIBRARIES = libordering.la +libordering_la_SOURCES = candswap.c candsort.c commit.c\ + relation.c infosort.c candhistory.c sorter.h diff --git a/src-ordering/Makefile.in b/src-ordering/Makefile.in new file mode 100644 index 0000000..3dfeb13 --- /dev/null +++ b/src-ordering/Makefile.in @@ -0,0 +1,438 @@ +# Makefile.in generated by automake 1.9.6 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. 
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src-ordering +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libordering_la_LIBADD = +am_libordering_la_OBJECTS = candswap.lo candsort.lo commit.lo \ + relation.lo infosort.lo candhistory.lo +libordering_la_OBJECTS = $(am_libordering_la_OBJECTS) +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +SOURCES = $(libordering_la_SOURCES) +DIST_SOURCES = $(libordering_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ELISP_FALSE = @ELISP_FALSE@ +ELISP_TRUE = @ELISP_TRUE@ +EMACS = @EMACS@ +EMACSLOADPATH = @EMACSLOADPATH@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +GREP = @GREP@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ 
+ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +lispdir = @lispdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +EXTRA_DIST = +INCLUDES = -I$(top_srcdir)/ +noinst_LTLIBRARIES = libordering.la +libordering_la_SOURCES = candswap.c candsort.c commit.c\ + relation.c infosort.c candhistory.c sorter.h + +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src-ordering/Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --gnu src-ordering/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + 
@case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libordering.la: $(libordering_la_OBJECTS) $(libordering_la_DEPENDENCIES) + $(LINK) $(libordering_la_LDFLAGS) $(libordering_la_OBJECTS) $(libordering_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/candhistory.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/candsort.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/candswap.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/commit.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/infosort.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/relation.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ if $(LTCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Plo"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) 
$(ETAGS_ARGS) \ + $$tags $$unique; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkdir_p) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) 
INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-info-am + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-exec \ + install-exec-am install-info install-info-am install-man \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am tags uninstall uninstall-am \ + uninstall-info-am + +# Tell versions [3.59,3.63) of GNU make to not export all variables. 
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/src-ordering/candhistory.c b/src-ordering/candhistory.c
new file mode 100644
index 0000000..4b3e1cb
--- /dev/null
+++ b/src-ordering/candhistory.c
@@ -0,0 +1,207 @@
+/*
+ * Remembers the history of committed candidates.
+ *
+ *
+ * Candidate scores are raised based on information such as
+ * "the history for this reading was: candidate A, candidate B,
+ * candidate A, candidate A, candidate A".
+ *
+ * Copyright (C) 2006-2007 TABATA Yusuke
+ *
+ */
+#include <stdlib.h>
+
+#include <anthy/segment.h>
+#include <anthy/record.h>
+#include "sorter.h"
+
+#define HISTORY_DEPTH 8
+#define MAX_HISTORY_ENTRY 200
+
+/** Add the commit of one segment to the CAND_HISTORY section.
+ * The row is keyed by the segment's source string; existing values are
+ * shifted one slot towards the back (at most HISTORY_DEPTH are kept) and
+ * the committed candidate string is stored in slot 0. */
+static void
+learn_cand_history(struct seg_ent *seg)
+{
+  int nr, i;
+
+  if (anthy_select_section("CAND_HISTORY", 1)) {
+    return ;
+  }
+  if (anthy_select_row(&seg->str, 1)) {
+    return ;
+  }
+  /* shift the existing values */
+  nr = anthy_get_nr_values();
+  nr ++;
+  if (nr > HISTORY_DEPTH) {
+    nr = HISTORY_DEPTH;
+  }
+  for (i = nr - 1; i > 0; i--) {
+    xstr *xs = anthy_get_nth_xstr(i - 1);
+    anthy_set_nth_xstr(i, xs);
+  }
+  /* store the newest one in slot 0 */
+  anthy_set_nth_xstr(0, &seg->cands[seg->committed]->str);
+  anthy_mark_row_used();
+}
+
+/* For every POS_SUC word of the committed candidate, record
+ * "source string -> converted string" in the SUFFIX_HISTORY section. */
+static void
+learn_suffix_history(struct seg_ent *seg)
+{
+  int i;
+  struct cand_ent *cand = seg->cands[seg->committed];
+  if (anthy_select_section("SUFFIX_HISTORY", 1)) {
+    return ;
+  }
+  for (i = 0; i < cand->nr_words; i++) {
+    struct cand_elm *elm = &cand->elm[i];
+    xstr xs;
+    if (elm->nth == -1) {
+      continue;
+    }
+    if (anthy_wtype_get_pos(elm->wt) != POS_SUC) {
+      continue;
+    }
+    if (anthy_select_row(&elm->str, 1)) {
+      continue;
+    }
+    if (anthy_get_nth_dic_ent_str(elm->se, &elm->str, elm->nth, &xs)) {
+      continue;
+    }
+    anthy_set_nth_xstr(0, &xs);
+    free(xs.str);
+  }
+}
+
+/** Function called from outside this file.
+ * Adds every committed segment of sl to the history sections and then
+ * truncates both sections to MAX_HISTORY_ENTRY entries. */
+void
+anthy_learn_cand_history(struct segment_list *sl)
+{
+  int i, nr = 0;
+  for (i = 0; i < sl->nr_segments; i++) {
+    struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+    xstr *xs = &seg->str;
+    if (seg->committed < 0) {
+      continue;
+    }
+    /* The lookup must be made in CAND_HISTORY.  Without selecting it here
+     * the row lookup runs against whatever section was selected last
+     * (SUFFIX_HISTORY after the first learned segment). */
+    if (anthy_select_section("CAND_HISTORY", 0) ||
+        anthy_select_row(xs, 0)) {
+      if (seg->committed == 0) {
+        /* skip when there is no entry for this segment yet and the
+         * committed candidate is the first one anyway */
+        continue;
+      }
+    }
+    /**/
+    learn_cand_history(seg);
+    learn_suffix_history(seg);
+    nr ++;
+  }
+  if (nr > 0) {
+    if (!anthy_select_section("CAND_HISTORY", 1)) {
+      anthy_truncate_section(MAX_HISTORY_ENTRY);
+    }
+    if (!anthy_select_section("SUFFIX_HISTORY", 1)) {
+      anthy_truncate_section(MAX_HISTORY_ENTRY);
+    }
+  }
+}
+
+/* Compute the weight of a candidate string from the currently selected
+ * history row: +1 for every slot equal to xs, plus HISTORY_DEPTH / 2
+ * more when slot 0 matches. */
+static int
+get_history_weight(xstr *xs)
+{
+  int i, nr = anthy_get_nr_values();
+  int w = 0;
+  for (i = 0; i < nr; i++) {
+    xstr *h = anthy_get_nth_xstr(i);
+    if (!h) {
+      continue;
+    }
+    if (!anthy_xstrcmp(xs, h)) {
+      w++;
+      if (i == 0) {
+        /* the most recently committed one gets a high score */
+        w += (HISTORY_DEPTH / 2);
+      }
+    }
+  }
+  return w;
+}
+
+/* Raise the score of each candidate of se according to how often it
+ * appears in the CAND_HISTORY row of the segment's source string. */
+static void
+reorder_by_candidate(struct seg_ent *se)
+{
+  int i, primary_score;
+  /* same guard the sibling files use against broken dictionary or
+   * learning data; cands[0] is dereferenced below */
+  if (NULL == se->cands || se->nr_cands < 1) {
+    return ;
+  }
+  if (anthy_select_section("CAND_HISTORY", 1)) {
+    return ;
+  }
+  if (anthy_select_row(&se->str, 0)) {
+    return ;
+  }
+  /* score of the currently highest-rated candidate */
+  primary_score = se->cands[0]->score;
+  /**/
+  for (i = 0; i < se->nr_cands; i++) {
+    struct cand_ent *ce = se->cands[i];
+    int weight = get_history_weight(&ce->str);
+    ce->score += primary_score / (HISTORY_DEPTH /2) * weight;
+  }
+  anthy_mark_row_used();
+}
+
+/* Apply the suffix learning: candidates containing a POS_SUC word whose
+ * converted string equals the one stored in SUFFIX_HISTORY get a bonus. */
+static void
+reorder_by_suffix(struct seg_ent *se)
+{
+  int i, j;
+  int delta = 0;
+  int top_cand = -1;
+  if (anthy_select_section("SUFFIX_HISTORY", 0)) {
+    return ;
+  }
+  /* each candidate */
+  for (i = 0; i < se->nr_cands; i++) {
+    struct cand_ent *ce = se->cands[i];
+    /* each word making up the candidate */
+    for (j = 0; j < ce->nr_words; j++) {
+      struct cand_elm *elm = &ce->elm[j];
+      xstr xs;
+      xstr *hist;
+      if (elm->nth == -1) {
+        continue;
+      }
+      if (anthy_wtype_get_pos(elm->wt) != POS_SUC) {
+        continue;
+      }
+      /* look up the row keyed by the source string */
+      if (anthy_select_row(&elm->str, 0)) {
+        continue;
+      }
+      /* get the converted string */
+      if (anthy_get_nth_dic_ent_str(elm->se, &elm->str, elm->nth, &xs)) {
+        continue;
+      }
+      /* compare with the string stored in the history; a missing value
+       * is treated as "no match" (get_history_weight checks the same) */
+      hist = anthy_get_nth_xstr(0);
+      if (!hist || anthy_xstrcmp(&xs, hist)) {
+        free(xs.str);
+        continue;
+      }
+      /* NOTE(review): top_cand is set to the first matching candidate, so
+       * the delta computed right below is always 1; if the intent was to
+       * lift the match above cands[0] this needs revisiting -- confirm. */
+      if (top_cand < 0) {
+        top_cand = i;
+      }
+      if (delta == 0) {
+        delta = (se->cands[top_cand]->score - ce->score) + 1;
+      }
+      ce->score += delta;
+      free(xs.str);
+    }
+  }
+}
+
+/* Add history-based bonuses to the candidates of se. */
+void
+anthy_reorder_candidates_by_history(struct seg_ent *se)
+{
+  reorder_by_candidate(se);
+  reorder_by_suffix(se);
+}
diff --git a/src-ordering/candsort.c b/src-ordering/candsort.c
new file mode 100644
index 0000000..913a26b
--- /dev/null
+++ b/src-ordering/candsort.c
@@ -0,0 +1,278 @@
+/*
+ * Sorts the candidates of a segment.
+ * In the future neighbouring segments should be examined as well, to
+ * evaluate how words combine.
+ * Duplicated candidates are removed too.
+ *
+ * Funded by IPA Exploratory Software Project (Mitou) 2001 9/22
+ * Copyright (C) 2000-2006 TABATA Yusuke
+ * Copyright (C) 2001 UGAWA Tomoharu
+ *
+ * $Id: candsort.c,v 1.27 2002/11/17 14:45:47 yusuke Exp $
+ *
+ */
+/*
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <anthy/segment.h>
+#include <anthy/splitter.h>
+#include <anthy/ordering.h>
+#include "sorter.h"
+
+/* candidate produced by "ochaire" learning */
+#define OCHAIRE_BASE OCHAIRE_SCORE
+/* score of hiragana/katakana candidates when the metaword looks
+ * sufficiently forced */
+#define NOCONV_WITH_BIAS 900000
+/* ordinary candidate */
+#define NORMAL_BASE 100
+/* single kanji */
+#define SINGLEWORD_BASE 10
+/* compound word */
+#define COMPOUND_BASE (OCHAIRE_SCORE / 2)
+/* part of a compound word turned into one segment */
+#define COMPOUND_PART_BASE 2
+/* dependent words only */
+#define DEPWORD_BASE (OCHAIRE_SCORE / 2)
+/* default score of hiragana/katakana candidates */
+#define NOCONV_BASE 1
+
+/* Decide whether the candidate assignment of se looks forced.
+ * Returns 1 when it does, 0 otherwise (also when there is no metaword). */
+static int
+uncertain_segment_p(struct seg_ent *se)
+{
+  struct meta_word *mw;
+  if (se->nr_metaword == 0) {
+    return 0;
+  }
+
+  mw = se->mw_array[0];
+
+  /* 60% of the length */
+  if (se->len * 3 >= mw->len * 5) {
+    return 1;
+  }
+  return 0;
+}
+
+/* Release every candidate from the first zero-score one onwards and
+ * shrink nr_cands accordingly. */
+static void
+release_redundant_candidate(struct seg_ent *se)
+{
+  int i, j;
+  /* the array is sorted, so candidates with score 0 are at the end */
+  for (i = 0; i < se->nr_cands && se->cands[i]->score; i++);
+  /* release the candidates from i onwards */
+  if (i < se->nr_cands) {
+    for (j = i; j < se->nr_cands; j++) {
+      anthy_release_cand_ent(se->cands[j]);
+    }
+    se->nr_cands = i;
+  }
+}
+
+/* Candidate comparison function for qsort: higher score sorts first.
+ * Written as a three-way comparison instead of a subtraction so that
+ * widely separated scores cannot overflow int. */
+static int
+candidate_compare_func(const void *p1, const void *p2)
+{
+  const struct cand_ent *const *c1 = p1, *const *c2 = p2;
+  int s1 = (*c1)->score;
+  int s2 = (*c2)->score;
+  return (s2 > s1) - (s2 < s1);
+}
+
+/* Sort the candidates of se by descending score. */
+static void
+sort_segment(struct seg_ent *se)
+{
+  qsort(se->cands, se->nr_cands,
+        sizeof(struct cand_ent *),
+        candidate_compare_func);
+}
+
+/* Unless the top candidate is katakana, push every katakana candidate
+ * down to the minimum score. */
+static void
+trim_kana_candidate(struct seg_ent *se)
+{
+  int i;
+  /* guard against broken dictionary or learning data; cands[0] is
+   * dereferenced below, so an empty list must be rejected as well */
+  if (NULL == se->cands || se->nr_cands < 1) {
+    return;
+  }
+  if (se->cands[0]->flag & CEF_KATAKANA) {
+    return ;
+  }
+  for (i = 1; i < se->nr_cands; i++) {
+    if (se->cands[i]->flag & CEF_KATAKANA) {
+      /* lower it to the minimum score */
+      se->cands[i]->score = NOCONV_BASE;
+    }
+  }
+}
+
+/* Give score 0 to the later one of two candidates with the same string
+ * and merge its flags into the earlier one. */
+static void
+check_dupl_candidate(struct seg_ent *se)
+{
+  int i,j;
+  for (i = 0; i < se->nr_cands - 1; i++) {
+    for (j = i + 1; j < se->nr_cands; j++) {
+      if (!anthy_xstrcmp(&se->cands[i]->str, &se->cands[j]->str)) {
+        /* ideally the one matching the rules better should be kept */
+        se->cands[j]->score = 0;
+        se->cands[i]->flag |= se->cands[j]->flag;
+      }
+    }
+  }
+}
+
+/* Evaluate a candidate that was generated by part-of-speech assignment. */
+static void
+eval_candidate_by_metaword(struct cand_ent *ce)
+{
+  int i;
+  int score = 1;
+
+  /* first, add the score derived from word frequency */
+  for (i = 0; i < ce->nr_words; i++) {
+    struct cand_elm *elm = &ce->elm[i];
+    int pos, div = 1;
+    int freq;
+
+    if (elm->nth < 0) {
+      /* not subject to candidate assignment, skip it */
+      continue;
+    }
+    pos = anthy_wtype_get_pos(elm->wt);
+    if (pos == POS_PRE || pos == POS_SUC) {
+      div = 4;
+    }
+
+    freq = anthy_get_nth_dic_ent_freq(elm->se, elm->nth);
+    score += freq / div;
+  }
+
+  if (ce->mw) {
+    score *= ce->mw->struct_score;
+    score /= RATIO_BASE;
+  }
+  ce->score = score;
+}
+
+/* Evaluate one candidate.  uncertain is the result of
+ * uncertain_segment_p() for the segment the candidate belongs to. */
+static void
+eval_candidate(struct cand_ent *ce, int uncertain)
+{
+  if ((ce->flag &
+       (CEF_OCHAIRE | CEF_SINGLEWORD | CEF_HIRAGANA |
+        CEF_KATAKANA | CEF_GUESS | CEF_COMPOUND | CEF_COMPOUND_PART |
+        CEF_BEST)) == 0) {
+    /* candidate generated from splitter information (metaword) */
+    eval_candidate_by_metaword(ce);
+  } else if (ce->flag & CEF_OCHAIRE) {
+    ce->score = OCHAIRE_BASE;
+  } else if (ce->flag & CEF_SINGLEWORD) {
+    ce->score = SINGLEWORD_BASE;
+  } else if (ce->flag & CEF_COMPOUND) {
+    ce->score = COMPOUND_BASE;
+  } else if (ce->flag & CEF_COMPOUND_PART) {
+    ce->score = COMPOUND_PART_BASE;
+  } else if (ce->flag & CEF_BEST) {
+    ce->score = OCHAIRE_BASE;
+  } else if (ce->flag & (CEF_HIRAGANA | CEF_KATAKANA |
+                         CEF_GUESS)) {
+    if (uncertain) {
+      /*
+       * This segment seems to be a loanword or the like, so it is better
+       * to offer the hiragana/katakana candidates than generated ones.
+       */
+      ce->score = NOCONV_WITH_BIAS;
+      if (CEF_KATAKANA & ce->flag) {
+        ce->score ++;
+      }
+      if (CEF_GUESS & ce->flag) {
+        ce->score += 2;
+      }
+    } else {
+      ce->score = NOCONV_BASE;
+    }
+  }
+  ce->score += 1;
+}
+
+/* Evaluate every candidate of se. */
+static void
+eval_segment(struct seg_ent *se)
+{
+  int i;
+  int uncertain = uncertain_segment_p(se);
+  for (i = 0; i < se->nr_cands; i++) {
+    eval_candidate(se->cands[i], uncertain);
+  }
+}
+
+/* Adjust the ranking using the learned history, for segments nth.. */
+static void
+apply_learning(struct segment_list *sl, int nth)
+{
+  int i;
+
+  /*
+   * applied in order, starting with the lowest priority
+   */
+
+  /* reordering by the usage-example dictionary */
+  anthy_reorder_candidates_by_relation(sl, nth);
+  /* candidate swapping */
+  for (i = nth; i < sl->nr_segments; i++) {
+    struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+    /* candidate swapping */
+    anthy_proc_swap_candidate(seg);
+    /* reordering by history */
+    anthy_reorder_candidates_by_history(anthy_get_nth_segment(sl, i));
+  }
+}
+
+/** Entry point called from outside this file.
+ * Works on the segments from index nth onwards.
+ */
+void
+anthy_sort_candidate(struct segment_list *sl, int nth)
+{
+  int i;
+  for (i = nth; i < sl->nr_segments; i++) {
+    struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+    /* evaluate first */
+    eval_segment(seg);
+    /* then sort */
+    sort_segment(seg);
+    /* give score 0 to the lower-ranked one of duplicated entries */
+    check_dupl_candidate(seg);
+    /* sort once more */
+    sort_segment(seg);
+    /* release the candidates whose score is 0 */
+    release_redundant_candidate(seg);
+  }
+
+  /* apply the learned history */
+  apply_learning(sl, nth);
+
+  /* sort again */
+  for ( i = nth ; i < sl->nr_segments ; i++){
+    sort_segment(anthy_get_nth_segment(sl, i));
+  }
+  /* move katakana candidates to the end unless one of them is on top */
+  for (i = nth; i < sl->nr_segments; i++) {
+    trim_kana_candidate(anthy_get_nth_segment(sl, i));
+  }
+  /* sort again */
+  for ( i = nth ; i < sl->nr_segments ; i++){
+    sort_segment(anthy_get_nth_segment(sl, i));
+  }
+}
diff --git a/src-ordering/candswap.c b/src-ordering/candswap.c
new file mode 100644
index 0000000..79e2ac8
--- /dev/null
+++ b/src-ordering/candswap.c
@@ -0,0 +1,216 @@
+/*
+ * Manages the history of candidate swaps.
+ *
+ * anthy_swap_cand_ent() does the learning
+ * anthy_proc_swap_candidate() applies what was learned
+ *
+ * When the candidate 「田端が」 was offered on top but 「田畑が」 was
+ * committed, the entry
+ *    independent-word part: 「田端」->「田畑」
+ * is added
+ *
+ */
+#include <stdlib.h>
+
+#include <anthy/record.h>
+#include <anthy/segment.h>
+/* for OCHAIRE_SCORE */
+#include <anthy/splitter.h>
+#include "sorter.h"
+
+#define MAX_INDEP_PAIR_ENTRY 100
+
+/* Learn the independent-word part of a swap: stores
+ * "core word of o -> core word of n" in the INDEPPAIR section. */
+static void
+learn_swap_cand_indep(struct cand_ent *o, struct cand_ent *n)
+{
+  xstr os, ns;
+  int res;
+  int o_idx = o->core_elm_index;
+  int n_idx = n->core_elm_index;
+
+  /* only segments that contain an independent word are learned */
+  if (o_idx < 0 || n_idx < 0) {
+    return ;
+  }
+  if (o->elm[o_idx].str.len != n->elm[n_idx].str.len) {
+    return ;
+  }
+  if (o->elm[o_idx].nth == -1 || n->elm[n_idx].nth == -1) {
+    return ;
+  }
+  res = anthy_get_nth_dic_ent_str(o->elm[o_idx].se, &o->elm[o_idx].str,
+                                  o->elm[o_idx].nth, &os);
+  if (res) {
+    return ;
+  }
+  res = anthy_get_nth_dic_ent_str(n->elm[n_idx].se, &n->elm[n_idx].str,
+                                  n->elm[n_idx].nth, &ns);
+  if (res) {
+    free(os.str);
+    return ;
+  }
+  if (anthy_select_section("INDEPPAIR", 1) == 0) {
+    if (anthy_select_row(&os, 1) == 0) {
+      anthy_set_nth_xstr(0, &ns);
+    }
+  }
+  free(os.str);
+  free(ns.str);
+}
+
+/*
+ * Candidate o was offered but n was committed,
+ * so record o -> n.
+ */
+void
+anthy_swap_cand_ent(struct cand_ent *o, struct cand_ent *n)
+{
+  if (o == n) {
+    /* same candidate */
+    return ;
+  }
+  if (n->flag & CEF_USEDICT) {
+    /* candidate that came from the usage-example dictionary */
+    return ;
+  }
+  /* independent-word part */
+  learn_swap_cand_indep(o, n);
+}
+
+
+/*
+ * Given the string of the first candidate, decide which string should
+ * take priority according to the currently selected section.
+ * Also removes loops.  Returns NULL when no swap is needed.
+ */
+static xstr *
+prepare_swap_candidate(xstr *target)
+{
+  xstr *xs, *n;
+  /* any non-zero result is a failure, as at the other call sites */
+  if (anthy_select_row(target, 0) != 0) {
+    return NULL;
+  }
+  xs = anthy_get_nth_xstr(0);
+  if (!xs) {
+    return NULL;
+  }
+  /* found: first candidate -> xs */
+  anthy_mark_row_used();
+  if (anthy_select_row(xs, 0) != 0){
+    /* xs -> (nothing) */
+    return xs;
+  }
+  /* xs -> n */
+  n = anthy_get_nth_xstr(0);
+  if (!n) {
+    return NULL;
+  }
+
+  if (!anthy_xstrcmp(target, n)) {
+    /* first candidate -> xs -> n with n == first candidate: a loop.
+     * The row of xs is released first, while xs is still usable as a
+     * key: xs was read from the row of target, so it presumably does
+     * not outlive that row -- NOTE(review): confirm against record.c */
+    anthy_select_row(xs, 0);
+    anthy_release_row();
+    anthy_select_row(target, 0);
+    anthy_release_row();
+    /* first candidate -> xs has been deleted, no swap is needed */
+    return NULL;
+  }
+  /* first candidate -> xs -> n with n != first candidate, so
+   * set first candidate -> n
+   */
+  if (anthy_select_row(target, 0) == 0){
+    anthy_set_nth_xstr(0, n);
+  }
+  return n;
+}
+
+#include <src-worddic/dic_ent.h>
+
+/*
+ * Independent words only: if a swap "core word of cands[0] -> xs" was
+ * learned, lift the candidate whose core word is xs above cands[0].
+ */
+static void
+proc_swap_candidate_indep(struct seg_ent *se)
+{
+  xstr *xs;
+  xstr key;
+  int i;
+  int core_elm_idx;
+  int res;
+  struct cand_elm *core_elm;
+
+  core_elm_idx = se->cands[0]->core_elm_index;
+  if (core_elm_idx < 0) {
+    return ;
+  }
+
+  /* fetch the string of the 0th candidate */
+  core_elm = &se->cands[0]->elm[core_elm_idx];
+  if (core_elm->nth < 0) {
+    return ;
+  }
+  res = anthy_get_nth_dic_ent_str(core_elm->se,
+                                  &core_elm->str,
+                                  core_elm->nth,
+                                  &key);
+  if (res) {
+    return ;
+  }
+
+  /* without the section the row lookups below would hit whatever
+   * section happened to be selected before */
+  if (anthy_select_section("INDEPPAIR", 1)) {
+    free(key.str);
+    return ;
+  }
+  xs = prepare_swap_candidate(&key);
+  free(key.str);
+  if (!xs) {
+    return ;
+  }
+
+  /* first candidate -> xs, so look for the candidate that yields xs */
+  for (i = 1; i < se->nr_cands; i++) {
+    if (se->cands[i]->nr_words == se->cands[0]->nr_words &&
+        se->cands[i]->core_elm_index == core_elm_idx &&
+        se->cands[i]->elm[core_elm_idx].nth >= 0) {
+      xstr cand;
+      res = anthy_get_nth_dic_ent_str(se->cands[i]->elm[core_elm_idx].se,
+                                      &se->cands[i]->elm[core_elm_idx].str,
+                                      se->cands[i]->elm[core_elm_idx].nth,
+                                      &cand);
+      if (res) {
+        /* cand was not filled in, so there is nothing to free */
+        continue;
+      }
+      if (!anthy_xstrcmp(&cand, xs)) {
+        free(cand.str);
+        /* found it, raise the score of that candidate */
+        se->cands[i]->score = se->cands[0]->score + 1;
+        return ;
+      }
+      free(cand.str);
+    }
+  }
+}
+
+/*
+ * With the candidates generated at conversion time lined up, decide
+ * which one takes top priority.
+ */
+void
+anthy_proc_swap_candidate(struct seg_ent *seg)
+{
+  if (NULL == seg->cands) { /* guard against broken dictionary or learning data */
+    return;
+  }
+
+  if (seg->cands[0]->score >= OCHAIRE_SCORE) {
+    /* cands[0] carries a special score */
+    return ;
+  }
+  if (seg->cands[0]->flag & CEF_USEDICT) {
+    return ;
+  }
+  /**/
+  proc_swap_candidate_indep(seg);
+}
+
+/* Drop old candidate-swap entries. */
+void
+anthy_cand_swap_ageup(void)
+{
+  if (anthy_select_section("INDEPPAIR", 0) == 0) {
+    anthy_truncate_section(MAX_INDEP_PAIR_ENTRY);
+  }
+}
diff --git a/src-ordering/commit.c b/src-ordering/commit.c
new file mode 100644
index 0000000..3a43a4f
--- /dev/null
+++ b/src-ordering/commit.c
@@
-0,0 +1,275 @@
+/*
+ * Processing done after a commit.
+ * Calls the various learning routines.
+ *
+ * anthy_proc_commit() is the function called from outside
+ */
+#include <stdlib.h>
+#include <time.h>
+
+#include <anthy/ordering.h>
+#include <anthy/record.h>
+#include <anthy/splitter.h>
+#include <anthy/segment.h>
+#include "sorter.h"
+
+#define MAX_OCHAIRE_ENTRY_COUNT 100
+#define MAX_OCHAIRE_LEN 32
+#define MAX_PREDICTION_ENTRY 100
+
+#define MAX_UNKNOWN_WORD 100
+
+/* Look for segments where a candidate other than the first was chosen
+ * and learn the swap. */
+static void
+learn_swapped_candidates(struct segment_list *sl)
+{
+  int i;
+  struct seg_ent *seg;
+  for (i = 0; i < sl->nr_segments; i++) {
+    seg = anthy_get_nth_segment(sl, i);
+    /* "> 0" rather than "!= 0": a negative value means nothing was
+     * committed and must not be used as an index */
+    if (seg->committed > 0) {
+      /* a candidate other than the first (0th) one, namely the
+       * seg->committed-th, was committed */
+      anthy_swap_cand_ent(seg->cands[0],
+                          seg->cands[seg->committed]);
+    }
+  }
+  anthy_cand_swap_ageup();
+}
+
+/* For resized segments: handles the state after the change. */
+static void
+learn_resized_segment(struct splitter_context *sc,
+                      struct segment_list *sl)
+
+{
+  int i;
+  struct meta_word **mw
+    = alloca(sizeof(struct meta_word*) * sl->nr_segments);
+  int *len_array
+    = alloca(sizeof(int) * sl->nr_segments);
+
+  /* prepare the array of segment lengths and the array of meta_words */
+  for (i = 0; i < sl->nr_segments; i++) {
+    struct seg_ent *se = anthy_get_nth_segment(sl, i);
+    /* no committed candidate: pass NULL, which is also what a candidate
+     * without a metaword contributes */
+    mw[i] = (se->committed < 0) ? NULL : se->cands[se->committed]->mw;
+    len_array[i] = se->str.len;
+  }
+
+  anthy_commit_border(sc, sl->nr_segments, mw, len_array);
+}
+
+/* For resized segments: handles the state before the change. */
+static void
+clear_resized_segment(struct splitter_context *sc,
+                      struct segment_list *sl)
+{
+  int *mark, i, from;
+  struct seg_ent *seg;
+  mark = alloca(sizeof(int)*sc->char_count);
+  for (i = 0; i < sc->char_count; i++) {
+    mark[i] = 0;
+  }
+  /* mark the lengths of the segments that were actually committed */
+  from = 0;
+  for (i = 0; i < sl->nr_segments; i++) {
+    seg = anthy_get_nth_segment(sl, i);
+    mark[from] = seg->len;
+    from = from + seg->len;
+  }
+  for (i = 0; i < sc->char_count; i++) {
+    int len = sc->ce[i].initial_seg_len;
+    /* if the initial length differs from the committed length, this
+       may be an unknown word that ended up unused */
+    if (len && len != mark[i]) {
+      xstr xs;
+      xs.str = sc->ce[i].c;
+      xs.len = len;
+      anthy_forget_unused_unknown_word(&xs);
+    }
+  }
+  if (!anthy_select_section("UNKNOWN_WORD", 0)) {
+    anthy_truncate_section(MAX_UNKNOWN_WORD);
+  }
+}
+
+/* Write one result of ochaire learning into the record: the row keyed
+ * by xs gets the segment count followed by (length, committed string)
+ * pairs for count segments starting at seg. */
+static void
+commit_ochaire(struct seg_ent *seg, int count, xstr* xs)
+{
+  int i;
+  if (xs->len >= MAX_OCHAIRE_LEN) {
+    return ;
+  }
+  if (anthy_select_row(xs, 1)) {
+    return ;
+  }
+  anthy_set_nth_value(0, count);
+  for (i = 0; i < count; i++, seg = seg->next) {
+    anthy_set_nth_value(i * 2 + 1, seg->len);
+    anthy_set_nth_xstr(i * 2 + 2, &seg->cands[seg->committed]->str);
+  }
+}
+
+/* To save record space, delete the negative entries of ochaire
+   learning */
+static void
+release_negative_ochaire(struct splitter_context *sc,
+                         struct segment_list *sl)
+{
+  int start, len;
+  xstr xs;
+  (void)sl;
+  /* the hiragana string before conversion */
+  xs.len = sc->char_count;
+  xs.str = sc->ce[0].c;
+
+  /* for every substring of xs */
+  for (start = 0; start < xs.len; start ++) {
+    for (len = 1; len <= xs.len - start && len < MAX_OCHAIRE_LEN; len ++) {
+      xstr part;
+      part.str = &xs.str[start];
+      part.len = len;
+      if (anthy_select_row(&part, 0) == 0) {
+        anthy_release_row();
+      }
+    }
+  }
+}
+
+/* Perform ochaire learning. */
+static void
+learn_ochaire(struct splitter_context *sc,
+              struct segment_list *sl)
+{
+  int i;
+  int count;
+
+  if (anthy_select_section("OCHAIRE", 1)) {
+    return ;
+  }
+
+  /* delete the negative entries of ochaire learning */
+  release_negative_ochaire(sc, sl);
+
+  /* do the ochaire learning */
+  for (count = 2; count <= sl->nr_segments && count < 5; count++) {
+    /* for runs of two or more segments */
+
+    for (i = 0; i <= sl->nr_segments - count; i++) {
+      struct seg_ent *head = anthy_get_nth_segment(sl, i);
+      struct seg_ent *s;
+      xstr xs;
+      int j;
+      xs = head->str;
+      if (xs.len < 2 && count < 3) {
+        /* a rough heuristic to avoid learning
+         * finely chopped segments */
+        continue;
+      }
+      /* build the string covering the run of segments */
+      for (j = 1, s = head->next; j < count; j++, s = s->next) {
+        xs.len += s->str.len;
+      }
+      /**/
+      commit_ochaire(head, count, &xs);
+    }
+  }
+  if (anthy_select_section("OCHAIRE", 1)) {
+    return ;
+  }
+  anthy_truncate_section(MAX_OCHAIRE_ENTRY_COUNT);
+}
+
+/* Record "idx -> xs" in the currently selected section as
+ * (timestamp, string) pairs.  Returns 1 when a new pair was appended,
+ * 0 when an existing one was refreshed or the row could not be
+ * selected. */
+static int
+learn_prediction_str(xstr *idx, xstr *xs)
+{
+  int nr_predictions;
+  int i;
+  time_t t = time(NULL);
+  if (anthy_select_row(idx, 1)) {
+    return 0;
+  }
+  nr_predictions = anthy_get_nr_values();
+
+  /* if it is already in the history, only update the timestamp */
+  for (i = 0; i < nr_predictions; i += 2) {
+    xstr *log = anthy_get_nth_xstr(i + 1);
+    if (!log) {
+      continue;
+    }
+    if (anthy_xstrcmp(log, xs) == 0) {
+      anthy_set_nth_value(i, t);
+      break;
+    }
+  }
+
+  /* otherwise append it at the end */
+  if (i == nr_predictions) {
+    anthy_set_nth_value(nr_predictions, t);
+    anthy_set_nth_xstr(nr_predictions + 1, xs);
+    anthy_mark_row_used();
+    return 1;
+  }
+  anthy_mark_row_used();
+  return 0;
+}
+
+/* Learn "source string -> committed string" for every committed segment
+ * in the PREDICTION section. */
+static void
+learn_prediction(struct segment_list *sl)
+{
+  int i;
+  int added = 0;
+  if (anthy_select_section("PREDICTION", 1)) {
+    return ;
+  }
+  for (i = 0; i < sl->nr_segments; i++) {
+    struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+    xstr *xs;
+
+    if (seg->committed < 0) {
+      continue;
+    }
+    /* index cands[] only after committed is known to be non-negative */
+    xs = &seg->cands[seg->committed]->str;
+    if (learn_prediction_str(&seg->str, xs)) {
+      added = 1;
+    }
+  }
+  if (added) {
+    anthy_truncate_section(MAX_PREDICTION_ENTRY);
+  }
+}
+
+/* Register committed candidates that consist of no words as unknown
+ * words. */
+static void
+learn_unknown(struct segment_list *sl)
+{
+  int i;
+  for (i = 0; i < sl->nr_segments; i++) {
+    struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+    struct cand_ent *ce;
+    if (seg->committed < 0) {
+      /* nothing was committed for this segment */
+      continue;
+    }
+    ce = seg->cands[seg->committed];
+    if (ce->nr_words == 0) {
+      anthy_add_unknown_word(&seg->str, &ce->str);
+    }
+  }
+}
+
+/* Learn a single "src -> xs" pair in the PREDICTION section. */
+void
+anthy_do_commit_prediction(xstr *src, xstr *xs)
+{
+  if (anthy_select_section("PREDICTION", 1)) {
+    return ;
+  }
+  learn_prediction_str(src, xs);
+}
+
+/* Entry point: run every learning step for the committed segments. */
+void
+anthy_proc_commit(struct segment_list *sl,
+                  struct splitter_context *sc)
+{
+  /* perform the various kinds of learning */
+  learn_swapped_candidates(sl);
+  learn_resized_segment(sc, sl);
+  clear_resized_segment(sc, sl);
+  learn_ochaire(sc, sl);
+  learn_prediction(sl);
+  learn_unknown(sl);
+  anthy_learn_cand_history(sl);
+}
diff --git a/src-ordering/infosort.c
b/src-ordering/infosort.c new file mode 100644 index 0000000..b20f731 --- /dev/null +++ b/src-ordering/infosort.c @@ -0,0 +1,148 @@ +/* + * 文節の構造metawordをソートする + * + * 文節に対する複数の構造の候補をソートする + * + * Copyright (C) 2000-2007 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <math.h> + +#include <anthy/segment.h> +#include <anthy/ordering.h> +#include <anthy/feature_set.h> +#include <anthy/splitter.h> +#include <anthy/diclib.h> +#include "sorter.h" + +static void *cand_info_array; + +static double +calc_probability(struct feature_list *fl) +{ + struct feature_freq *res, arg; + res = anthy_find_feature_freq(cand_info_array, + fl, &arg); + if (res) { + double pos = (double)res->f[15]; + double neg = (double)res->f[14]; + double prob = pos / (pos + neg); + prob = prob * prob; + /**/ + return prob; + } + return 0; +} + +static void +mw_eval(struct seg_ent *prev_seg, struct seg_ent *seg, + struct meta_word *mw) +{ + int pc; + struct feature_list fl; + double prob; + (void)seg; + anthy_feature_list_init(&fl); + /**/ + anthy_feature_list_set_cur_class(&fl, mw->seg_class); + anthy_feature_list_set_dep_word(&fl, mw->dep_word_hash); + anthy_feature_list_set_dep_class(&fl, mw->dep_class); + anthy_feature_list_set_mw_features(&fl, mw->mw_features); + /* 前の文節の素性 */ 
+ if (prev_seg) { + pc = prev_seg->best_seg_class; + } else { + pc = SEG_HEAD; + } + anthy_feature_list_set_class_trans(&fl, pc, mw->seg_class); + anthy_feature_list_sort(&fl); + /* 計算する */ + prob = 0.1 + calc_probability(&fl); + if (prob < 0) { + prob = (double)1 / (double)1000; + } + anthy_feature_list_free(&fl); + mw->struct_score = RATIO_BASE * RATIO_BASE; + mw->struct_score *= prob; + /* + anthy_feature_list_print(&fl); + printf(" prob=%f, struct_score=%d\n", prob, mw->struct_score); + */ + + /**/ + if (mw->mw_features & MW_FEATURE_SUFFIX) { + mw->struct_score /= 2; + } + if (mw->mw_features & MW_FEATURE_WEAK_CONN) { + mw->struct_score /= 10; + } +} + +static void +seg_eval(struct seg_ent *prev_seg, + struct seg_ent *seg) +{ + int i; + for (i = 0; i < seg->nr_metaword; i++) { + mw_eval(prev_seg, seg, seg->mw_array[i]); + } +} + +static void +sl_eval(struct segment_list *seg_list) +{ + int i; + struct seg_ent *prev_seg = NULL; + for (i = 0; i < seg_list->nr_segments; i++) { + struct seg_ent *seg; + seg = anthy_get_nth_segment(seg_list, i); + seg_eval(prev_seg, seg); + prev_seg = seg; + } +} + +static int +metaword_compare_func(const void *p1, const void *p2) +{ + const struct meta_word * const *s1 = p1; + const struct meta_word * const *s2 = p2; + return (*s2)->struct_score - (*s1)->struct_score; +} + +void +anthy_sort_metaword(struct segment_list *seg_list) +{ + int i; + /**/ + sl_eval(seg_list); + /**/ + for (i = 0; i < seg_list->nr_segments; i++) { + struct seg_ent *seg = anthy_get_nth_segment(seg_list, i); + if (seg->mw_array) { /* 不正なメモリアクセスを行うバグの修正 */ + qsort(seg->mw_array, seg->nr_metaword, sizeof(struct meta_word *), + metaword_compare_func); + } + } +} + +void +anthy_infosort_init(void) +{ + cand_info_array = anthy_file_dic_get_section("cand_info"); +} diff --git a/src-ordering/relation.c b/src-ordering/relation.c new file mode 100644 index 0000000..3c1ae74 --- /dev/null +++ b/src-ordering/relation.c @@ -0,0 +1,447 @@ +/* + * 文節の関係を処理する + * Copyright 
(C) 2006 Higashiyama Masahiko (thanks google summer of code program) + * Copyright (C) 2002-2007 TABATA Yusuke + * + * anthy_reorder_candidates_by_relation() + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <arpa/inet.h> +#include <stdlib.h> + +#include <anthy/segclass.h> +#include <anthy/segment.h> +#include <anthy/ordering.h> +#include <anthy/dic.h> +#include <anthy/diclib.h> +#include <anthy/feature_set.h> +#include <anthy/corpus.h> +#include "sorter.h" + +#define MAX_COLLISION 4 +#define SEARCH_LIMIT 100 +#define MAX_NEIGHBOR 10 + + +/* 全文検索用のコーパス */ +static struct corpus_ { + /* header */ + void *corpus_bucket; + void *corpus_array; + /**/ + int *bucket; + int *array; + /**/ + int bucket_size; + int array_size; +} corpus_info; + +/* 検索用のiterator */ +struct iterator { + /* 検索のキーと現在の場所 */ + int key; + int idx; + /* 検索回数の上限 */ + int limit; +}; + +struct neighbor { + int nr; + int id[MAX_NEIGHBOR]; +}; + +/** 文節@segの中に@from_word_idの単語と共起関係にある + * 候補があるかどうかを探し、あればスコアを上げる。 + */ +static void +reorder_candidate(int from_word_id, struct seg_ent *seg) +{ + int i, pos; + struct cand_ent *ce; + if (NULL == seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */ + return; + } + ce = seg->cands[0]; + if (ce->core_elm_index == -1) { + return ; + } + /* 0番目の候補の品詞 */ + pos = 
anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt); + + for (i = 0; i < seg->nr_cands; i++) { + int word_id; + ce = seg->cands[i]; + if (ce->core_elm_index == -1) { + continue; + } + word_id = ce->elm[ce->core_elm_index].id; + if (anthy_dic_check_word_relation(from_word_id, word_id) && + anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt) == pos) { + /* 用例にマッチしたので、候補のスコアを更新 */ + ce->flag |= CEF_USEDICT; + ce->score *= 10; + } + } +} + +static int +get_indep_word_id(struct seg_ent *seg, int nth) +{ + struct cand_ent *ce; + if (NULL == seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */ + return -1; + } + if (seg->cands[nth]->core_elm_index == -1) { + /* 一番目の候補がseq_entから作られた候補ではない */ + return -1; + } + ce = seg->cands[nth]; + /* 自立語のidを取り出す */ + return ce->elm[ce->core_elm_index].id; +} + +/* 用例辞書を使って並び替えをする */ +static void +reorder_by_use_dict(struct segment_list *sl, int nth) +{ + int i; + struct seg_ent *cur_seg; + int word_id; + + cur_seg = anthy_get_nth_segment(sl, nth); + word_id = get_indep_word_id(cur_seg, 0); + if (word_id == -1) { + /**/ + return ; + } + /* 近所の文節を順に見ていく */ + for (i = nth - 2; i < nth + 2 && i < sl->nr_segments; i++) { + struct seg_ent *target_seg; + if (i < 0 || i == nth) { + continue ; + } + /* i番目の文節と前後のj番目の文節に対して */ + target_seg = anthy_get_nth_segment(sl, i); + reorder_candidate(word_id, target_seg); + } +} + +static int +find_border_of_this_word(int idx) +{ + int val; + if (idx < 0) { + return 0; + } + val = ntohl(corpus_info.array[idx * 2]); + while (!(val & ELM_WORD_BORDER) && + idx > -1) { + idx --; + } + return idx; +} + +static int +find_left_word_border(int idx) +{ + int val; + if (idx == -1) { + return -1; + } + val = ntohl(corpus_info.array[idx * 2]); + if (val & ELM_BOS) { + return -1; + } + idx --; + return find_border_of_this_word(idx); +} + +static int +find_right_word_border(int idx) +{ + if (idx == -1) { + return -1; + } + while (idx < corpus_info.array_size - 2) { + int val; + idx ++; + val = ntohl(corpus_info.array[idx * 2]); + 
if (val & ELM_BOS) { + return -1; + } + if (val & ELM_WORD_BORDER) { + return idx; + } + } + return -1; +} + +static void +push_id(struct neighbor *ctx, + int id) +{ + if (ctx->nr < MAX_NEIGHBOR - 1) { + ctx->id[ctx->nr] = id; + ctx->nr++; + } +} + +static void +collect_word_context(struct neighbor *ctx, int idx) +{ + int id = ntohl(corpus_info.array[idx * 2]) & CORPUS_KEY_MASK; + /*printf(" id=%d\n", id);*/ + push_id(ctx, id); +} + +/* 例文中で周辺の情報を取得する */ +static void +collect_corpus_context(struct neighbor *ctx, + struct iterator *it) +{ + int i; + int this_idx, idx; + + this_idx = find_border_of_this_word(it->idx); + + /*printf(" key=%d\n", it->key);*/ + /* 左へスキャン */ + idx = this_idx; + for (i = 0; i < 2; i++) { + idx = find_left_word_border(idx); + if (idx == -1) { + break; + } + collect_word_context(ctx, idx); + } + /* 右へスキャン */ + idx = this_idx; + for (i = 0; i < 2; i++) { + idx = find_right_word_border(idx); + if (idx == -1) { + break; + } + collect_word_context(ctx, idx); + } +} + +/* 変換対象の文字列の周辺の情報を取得する */ +static void +collect_user_context(struct neighbor *ctx, + struct segment_list *sl, int nth) +{ + int i; + ctx->nr = 0; + for (i = nth - 2; i <= nth + 2 && i < sl->nr_segments; i++) { + int id; + if ((i < 0) || (i == nth)) { + continue; + } + id = get_indep_word_id(anthy_get_nth_segment(sl, i), 0); + if (id > -1) { + id &= CORPUS_KEY_MASK; + /*printf("user_ctx=%d\n", id);*/ + push_id(ctx, id); + } + } +} + +/* 隣接文節の情報を比較する */ +static int +do_compare_context(struct neighbor *n1, + struct neighbor *n2) +{ + int i, j; + int m = 0; + for (i = 0; i < n1->nr; i++) { + for (j = 0; j < n2->nr; j++) { + if (n1->id[i] == n2->id[j]) { + m++; + } + } + } + return m; +} + +/* 隣接文節の情報を取得して比較する */ +static int +compare_context(struct neighbor *user, + struct iterator *it) +{ + struct neighbor sample; + int nr; + /**/ + sample.nr = 0; + /* 例文中の周辺情報を集める */ + collect_corpus_context(&sample, it); + if (sample.nr == 0) { + return 0; + } + /* 比較する */ + nr = 
do_compare_context(user, &sample); + if (nr >= sample.nr / 2) { + return nr; + } + return 0; +} + +/* keyの最初の出現場所を見つける + * 見つからなかったら-1を返す + */ +static int +find_first_pos(int key) +{ + int i; + for (i = 0; i < MAX_COLLISION; i++) { + int bkt = (key + i) % corpus_info.bucket_size; + if ((int)ntohl(corpus_info.bucket[bkt * 2]) == key) { + return ntohl(corpus_info.bucket[bkt * 2 + 1]); + } + } + return -1; +} + +/* keyの最初の出現場所でiteratorを初期化する + * 見つからなかったら-1を返す + */ +static int +find_first_from_corpus(int key, struct iterator *it, int limit) +{ + key &= CORPUS_KEY_MASK; + it->idx = find_first_pos(key); + it->key = key; + it->limit = limit; + return it->idx; +} + +/* keyの次の出現場所のiteratorを設定する + */ +static int +find_next_from_corpus(struct iterator *it) +{ + int idx = it->idx; + it->limit--; + if (it->limit < 1) { + it->idx = -1; + return -1; + } + it->idx = ntohl(corpus_info.array[it->idx * 2 + 1]); + if (it->idx < 0 || it->idx >= corpus_info.array_size || + it->idx < idx) { + it->idx = -1; + } + return it->idx; +} + +static void +check_candidate_context(struct seg_ent *cur_seg, + int i, + struct neighbor *user) +{ + struct iterator it; + int nr = 0; + int word_id; + word_id = get_indep_word_id(cur_seg, i); + if (word_id == -1) { + return ; + } + /* 各出現場所をスキャンする */ + find_first_from_corpus(word_id, &it, SEARCH_LIMIT); + /*printf("word_id=%d %d\n", word_id, it.idx);*/ + while (it.idx > -1) { + nr += compare_context(user, &it); + /**/ + find_next_from_corpus(&it); + } + /**/ + if (nr > 0) { + cur_seg->cands[i]->flag |= CEF_CONTEXT; + } +} + +/* 全文検索で候補を並び替える */ +static void +reorder_by_corpus(struct segment_list *sl, int nth) +{ + struct seg_ent *cur_seg; + struct neighbor user; + int i; + /* 文節の周辺情報を集める */ + collect_user_context(&user, sl, nth); + if (user.nr == 0) { + return ; + } + cur_seg = anthy_get_nth_segment(sl, nth); + if (NULL == cur_seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */ + return; + } + /* 各候補について */ + for (i = 0; i < cur_seg->nr_cands; i++) { + 
check_candidate_context(cur_seg, i, &user); + } + /* トップの候補に用例があれば、他の候補は見ない */ + if (cur_seg->cands[0]->flag & CEF_CONTEXT) { + cur_seg->cands[0]->flag &= ~CEF_CONTEXT; + return ; + } + /* 用例によるスコア加算 */ + for (i = 1; i < cur_seg->nr_cands; i++) { + if (cur_seg->cands[i]->flag & CEF_CONTEXT) { + cur_seg->cands[i]->score *= 2; + } + } +} + +/* + * 用例を用いて候補を並び替える + * @nth番目以降の文節を対象とする + */ +void +anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth) +{ + int i; + for (i = nth; i < sl->nr_segments; i++) { + reorder_by_use_dict(sl, i); + reorder_by_corpus(sl, i); + } +} + +void +anthy_relation_init(void) +{ + corpus_info.corpus_array = anthy_file_dic_get_section("corpus_array"); + corpus_info.corpus_bucket = anthy_file_dic_get_section("corpus_bucket"); + if (!corpus_info.corpus_array || + !corpus_info.corpus_array) { + return ; + } + corpus_info.array_size = ntohl(((int *)corpus_info.corpus_array)[1]); + corpus_info.bucket_size = ntohl(((int *)corpus_info.corpus_bucket)[1]); + corpus_info.array = &(((int *)corpus_info.corpus_array)[16]); + corpus_info.bucket = &(((int *)corpus_info.corpus_bucket)[16]); + /* + { + int i; + for (i = 0; i < corpus_info.array_size; i++) { + int v = ntohl(corpus_info.array[i * 2]); + printf("%d: %d %d\n", i, v, v & CORPUS_KEY_MASK); + } + } + */ +} diff --git a/src-ordering/sorter.h b/src-ordering/sorter.h new file mode 100644 index 0000000..d8e6648 --- /dev/null +++ b/src-ordering/sorter.h @@ -0,0 +1,18 @@ +#ifndef _sorter_h_included_ +#define _sorter_h_included_ + +/* candswap.c */ +/* 一位として出した候補ではない候補がコミットされた */ +void anthy_swap_cand_ent(struct cand_ent *old_one, struct cand_ent *new_one); +/**/ +void anthy_proc_swap_candidate(struct seg_ent *se); +/* コミット時にcandswapの記録をagingする */ +void anthy_cand_swap_ageup(void); + +/**/ +void anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth); + +void anthy_learn_cand_history(struct segment_list *sl); +void anthy_reorder_candidates_by_history(struct seg_ent *se); + 
+#endif |