diff options
Diffstat (limited to 'src-worddic')
-rw-r--r-- | src-worddic/Makefile.am | 19 | ||||
-rw-r--r-- | src-worddic/Makefile.in | 486 | ||||
-rw-r--r-- | src-worddic/dic_ent.h | 59 | ||||
-rw-r--r-- | src-worddic/dic_main.h | 93 | ||||
-rw-r--r-- | src-worddic/dic_personality.h | 20 | ||||
-rw-r--r-- | src-worddic/dic_util.c | 594 | ||||
-rw-r--r-- | src-worddic/ext_ent.c | 564 | ||||
-rw-r--r-- | src-worddic/feature_set.c | 248 | ||||
-rw-r--r-- | src-worddic/matrix.c | 575 | ||||
-rw-r--r-- | src-worddic/mem_dic.c | 250 | ||||
-rw-r--r-- | src-worddic/mem_dic.h | 17 | ||||
-rw-r--r-- | src-worddic/priv_dic.c | 425 | ||||
-rw-r--r-- | src-worddic/ptab.h | 153 | ||||
-rw-r--r-- | src-worddic/record.c | 2074 | ||||
-rw-r--r-- | src-worddic/textdict.c | 202 | ||||
-rw-r--r-- | src-worddic/texttrie.c | 1516 | ||||
-rw-r--r-- | src-worddic/use_dic.c | 21 | ||||
-rw-r--r-- | src-worddic/word_dic.c | 782 | ||||
-rw-r--r-- | src-worddic/word_lookup.c | 673 | ||||
-rw-r--r-- | src-worddic/wtab.h | 160 | ||||
-rw-r--r-- | src-worddic/wtype.c | 292 |
21 files changed, 9223 insertions, 0 deletions
diff --git a/src-worddic/Makefile.am b/src-worddic/Makefile.am new file mode 100644 index 0000000..17cb4a7 --- /dev/null +++ b/src-worddic/Makefile.am @@ -0,0 +1,19 @@ +## $Id: Makefile.am,v 1.4 2002/02/25 07:25:03 yusuke Exp $ + +INCLUDES = -I$(top_srcdir)/ -DCONF_DIR=\"$(sysconfdir)\" + +libanthydic_la_SOURCES = \ + word_dic.c dic_util.c \ + wtype.c\ + texttrie.c textdict.c record.c\ + word_lookup.c use_dic.c \ + priv_dic.c mem_dic.c \ + ext_ent.c matrix.c\ + feature_set.c\ + dic_main.h\ + ptab.h wtab.h dic_ent.h \ + mem_dic.h dic_personality.h + +libanthydic_la_LIBADD = ../src-diclib/libdiclib.la +libanthydic_la_LDFLAGS = -version-info 1:0:1 +lib_LTLIBRARIES = libanthydic.la diff --git a/src-worddic/Makefile.in b/src-worddic/Makefile.in new file mode 100644 index 0000000..093622a --- /dev/null +++ b/src-worddic/Makefile.in @@ -0,0 +1,486 @@ +# Makefile.in generated by automake 1.9.6 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src-worddic +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = `echo $$p | sed -e 's|^.*/||'`; +am__installdirs = "$(DESTDIR)$(libdir)" +libLTLIBRARIES_INSTALL = $(INSTALL) +LTLIBRARIES = $(lib_LTLIBRARIES) +libanthydic_la_DEPENDENCIES = ../src-diclib/libdiclib.la +am_libanthydic_la_OBJECTS = word_dic.lo dic_util.lo wtype.lo \ + texttrie.lo textdict.lo record.lo word_lookup.lo use_dic.lo \ + priv_dic.lo mem_dic.lo ext_ent.lo matrix.lo feature_set.lo +libanthydic_la_OBJECTS = $(am_libanthydic_la_OBJECTS) +DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +SOURCES = $(libanthydic_la_SOURCES) +DIST_SOURCES = $(libanthydic_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ELISP_FALSE = @ELISP_FALSE@ +ELISP_TRUE = @ELISP_TRUE@ +EMACS = @EMACS@ +EMACSLOADPATH = @EMACSLOADPATH@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +GREP = @GREP@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +lispdir = @lispdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +INCLUDES = -I$(top_srcdir)/ -DCONF_DIR=\"$(sysconfdir)\" +libanthydic_la_SOURCES = \ + word_dic.c dic_util.c \ + wtype.c\ + texttrie.c textdict.c record.c\ + word_lookup.c use_dic.c \ + priv_dic.c mem_dic.c \ + ext_ent.c matrix.c\ + feature_set.c\ + dic_main.h\ + ptab.h wtab.h dic_ent.h \ + mem_dic.h dic_personality.h + +libanthydic_la_LIBADD = ../src-diclib/libdiclib.la +libanthydic_la_LDFLAGS = -version-info 1:0:1 +lib_LTLIBRARIES = libanthydic.la +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src-worddic/Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --gnu src-worddic/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +install-libLTLIBRARIES: $(lib_LTLIBRARIES) + @$(NORMAL_INSTALL) + test -z "$(libdir)" || $(mkdir_p) "$(DESTDIR)$(libdir)" + @list='$(lib_LTLIBRARIES)'; for p in $$list; do \ + if test -f $$p; then \ + f=$(am__strip_dir) \ + echo " $(LIBTOOL) --mode=install $(libLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) '$$p' '$(DESTDIR)$(libdir)/$$f'"; \ + $(LIBTOOL) --mode=install $(libLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) "$$p" "$(DESTDIR)$(libdir)/$$f"; \ + else :; fi; \ + done + +uninstall-libLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @set -x; list='$(lib_LTLIBRARIES)'; for p in $$list; do \ + p=$(am__strip_dir) \ + echo " $(LIBTOOL) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$p'"; \ + $(LIBTOOL) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$p"; \ + done + +clean-libLTLIBRARIES: + -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) + @list='$(lib_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +libanthydic.la: $(libanthydic_la_OBJECTS) $(libanthydic_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libanthydic_la_LDFLAGS) $(libanthydic_la_OBJECTS) $(libanthydic_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dic_util.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ext_ent.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/feature_set.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matrix.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mem_dic.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priv_dic.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/record.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textdict.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/texttrie.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/use_dic.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/word_dic.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/word_lookup.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wtype.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ if $(LTCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Plo"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkdir_p) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: + for dir in "$(DESTDIR)$(libdir)"; do \ + test -z "$$dir" || $(mkdir_p) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-libLTLIBRARIES + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-info-am uninstall-libLTLIBRARIES + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libLTLIBRARIES clean-libtool ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-exec \ + install-exec-am install-info install-info-am \ + install-libLTLIBRARIES install-man install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-info-am \ + uninstall-libLTLIBRARIES + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src-worddic/dic_ent.h b/src-worddic/dic_ent.h new file mode 100644 index 0000000..062ec5e --- /dev/null +++ b/src-worddic/dic_ent.h @@ -0,0 +1,59 @@ +#ifndef _dic_ent_h_included_ +#define _dic_ent_h_included_ + +#include <anthy/wtype.h> +#include <anthy/dic.h> + +/* 文字列のタイプ (seq_ent->seq_type) */ +#define ST_NONE 0 +/**/ +#define ST_REVERSE 8 + +/** ある単語 */ +struct dic_ent { + wtype_t type; /** 品詞 */ + int freq; /** 頻度 */ + int feature; + const char *wt_name; + int is_compound; + xstr str; /** 変換結果の文字列 */ + /** 同一品詞の場合の辞書中の順番(anthy_get_seq_ent_wtype_freqから + anthy_wtype_includeが呼ばれる回数を減らすのに用いる */ + int order; +}; + +/**ある文字列と同音異義語の配列 + * seq_ent_t として参照される + */ +struct seq_ent { + xstr str;/* 読み */ + + int seq_type; /** ST_(type) */ + + /** dic_entの配列 */ + int nr_dic_ents; + struct dic_ent **dic_ents; + /** compound_entの配列 */ + int nr_compound_ents; + + /* 属するメモリ辞書 */ + struct mem_dic *md; + /* メモリ辞書中のhash chain */ + struct seq_ent *next; +}; + +/* ext_ent.c */ +void anthy_init_ext_ent(void); +/**/ +int anthy_get_nr_dic_ents_of_ext_ent(struct seq_ent *se,xstr *xs); +int anthy_get_nth_dic_ent_str_of_ext_ent(seq_ent_t ,xstr *,int ,xstr *); +int anthy_get_nth_dic_ent_wtype_of_ext_ent(xstr *,int ,wtype_t *); +int anthy_get_nth_dic_ent_freq_of_ext_ent(struct seq_ent *se, int nth); +int anthy_get_ext_seq_ent_wtype(struct seq_ent *, wtype_t ); +seq_ent_t anthy_get_ext_seq_ent_from_xstr(xstr *x, int is_reverse); +int anthy_get_ext_seq_ent_pos(struct seq_ent *, int); +int anthy_get_ext_seq_ent_indep(struct seq_ent *); +int anthy_get_ext_seq_ent_ct(struct seq_ent *, int, int); +int anthy_get_ext_seq_ent_wtype(struct seq_ent *se, wtype_t w); + +#endif diff --git a/src-worddic/dic_main.h b/src-worddic/dic_main.h new file mode 100644 index 0000000..a497ed8 --- /dev/null +++ b/src-worddic/dic_main.h @@ -0,0 +1,93 @@ +#ifndef _dic_main_h_included_ +#define _dic_main_h_included_ + +#include <anthy/dic.h> +#include <anthy/word_dic.h> +#include <anthy/wtype.h> +#include <anthy/xstr.h> + + +/* 辞書中の頻度に対して内部の頻度の倍率 */ +#define FREQ_RATIO 8 + + +/* dic_main.c */ +int anthy_init_dic_cache(void); +struct seq_ent *anthy_cache_get_seq_ent(xstr *x, int is_reverse); + + +/* word_dic.c */ +/* 辞書検索のキーに使用する部分文字列 */ +struct gang_elm { + char *key; + xstr xs; + union { + /* 省メモリのためにunionにしている */ + int idx; + struct gang_elm *next; + } tmp; +}; +struct seq_ent *anthy_cache_get_seq_ent(xstr *xs, int is_reverse); +struct seq_ent *anthy_validate_seq_ent(struct seq_ent *seq, xstr *xs, + int is_reverse); + + +/* word_lookup.c */ +void anthy_init_word_dic(void); +struct word_dic* anthy_create_word_dic(void); +void anthy_release_word_dic(struct word_dic *); +void anthy_gang_fill_seq_ent(struct word_dic *wd, + struct gang_elm **array, int nr, + int is_reverse); + + +/* use_dic.c */ +void anthy_init_use_dic(void); +void anthy_quit_use_dic(void); +int anthy_word_dic_check_word_relation(struct word_dic *, + int from, int to); + +struct dic_session *anthy_create_session(void); +void anthy_activate_session(struct dic_session *); +void anthy_release_session(struct dic_session *); +int anthy_get_current_session_mask(void); + +/* mem_dic.c */ +void anthy_init_mem_dic(void); +void anthy_quit_mem_dic(void); +struct mem_dic * anthy_create_mem_dic(void); +void anthy_release_mem_dic(struct mem_dic * ); +/* node がなければ作る */ +struct seq_ent *anthy_mem_dic_alloc_seq_ent_by_xstr(struct mem_dic * d, + xstr *, int is_reverse); +/* node がなければ作らない */ +struct seq_ent *anthy_mem_dic_find_seq_ent_by_xstr(struct mem_dic * d, + xstr *, int is_reverse); +/**/ +void anthy_mem_dic_push_back_dic_ent(struct seq_ent *se, int is_compound, + xstr *xs, wtype_t wt, + const char *wt_name, int freq, + int feature); +void anthy_mem_dic_release_seq_ent(struct mem_dic * d, xstr *, int is_reverse); + + +/* priv_dic.c */ +void anthy_init_private_dic(const char *id); +void anthy_copy_words_from_private_dic(struct seq_ent *seq, xstr *xs, + int is_reverse); +void anthy_release_private_dic(void); +void anthy_check_user_dir(void); +void anthy_priv_dic_lock(void); +void anthy_priv_dic_unlock(void); +void anthy_priv_dic_update(void); +struct word_line { + char wt[10]; + int freq; + const char *word; +}; +int anthy_parse_word_line(const char *line, struct word_line *res); +struct textdict; +void anthy_ask_scan(void (*request_scan)(struct textdict *, void *), + void *arg); + +#endif diff --git a/src-worddic/dic_personality.h b/src-worddic/dic_personality.h new file mode 100644 index 0000000..00d6fc1 --- /dev/null +++ b/src-worddic/dic_personality.h @@ -0,0 +1,20 @@ +/* + * 辞書側でのパーソナリティの管理 + * リファレンスカウントなどはフロントエンドがやる。 + */ +#ifndef _dic_personality_h_included_ +#define _dic_personality_h_included_ + +extern struct mem_dic *anthy_current_personal_dic_cache; +extern struct record_stat *anthy_current_record; + +/* record */ +void anthy_init_record(void); +struct record_stat *anthy_create_record(const char *id); +void anthy_release_record(struct record_stat *); + +/* dic_cache */ +struct dic_cache *anthy_create_dic_cache(const char *id); +void anthy_release_dic_cache(struct dic_cache *); + +#endif diff --git a/src-worddic/dic_util.c b/src-worddic/dic_util.c new file mode 100644 index 0000000..e03a801 --- /dev/null +++ b/src-worddic/dic_util.c @@ -0,0 +1,594 @@ +/* + * 個人辞書管理用の関数群 + * + * 互換性の都合で + * utf8の辞書はtextdict + * eucjpの辞書はtexttrie + * およびrecordを使ってて混乱しまくり + * textdictへ移行する + * + * 開発予定 + * + * 新規登録はtextdictに対して行うようにする <- todo + * texttrieの単語は移行するようにする + * record関係は消す + * + * + * Funded by IPA未踏ソフトウェア創造事業 2001 10/24 + * + * Copyright (C) 2001-2007 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <anthy/anthy.h> +#include <anthy/conf.h> +#include <anthy/dic.h> +#include <anthy/texttrie.h> +#include <anthy/textdict.h> +#include <anthy/dicutil.h> + +#include "dic_main.h" +#include "dic_personality.h" + +/* + * 個人辞書はtexttrie中に格納されるとき + * 「 見出し 数字」 -> 「#品詞*頻度 単語」という形式をとる + * (UTF8の場合は「 p見出し 数字」 -> 「#品詞*頻度 単語」) + * 最初の2文字の空白は単語情報のセクションであることを意味し、 + * 数字の部分は同音語を区別するために用いられる。 + * + */ + +/* UTF8で32文字 x 3bytes */ +#define MAX_KEY_LEN 96 + +static int gIsInit; +static int dic_util_encoding; + +extern struct text_trie *anthy_private_tt_dic; +extern struct textdict *anthy_private_text_dic; +/* 現在選択されている読み */ +static struct iterate_contex { + /**/ + int in_tt; + /* texttrie */ + char key_buf[MAX_KEY_LEN+32]; + /* textdictの検索用 */ + int dicfile_offset; + char *current_index; + char *current_line; +} word_iterator; +/**/ +struct scan_context { + const char *yomi; + const char *word; + const char *wt_name; + int offset; + int found_word; +}; + +static void +set_current_line(const char *index, const char *line) +{ + if (word_iterator.current_line) { + free(word_iterator.current_line); + word_iterator.current_line = NULL; + } + if (line) { + word_iterator.current_line = strdup(line); + } + if (word_iterator.current_index) { + free(word_iterator.current_index); + word_iterator.current_index = NULL; + } + if (index) { + word_iterator.current_index = strdup(index); + } +} + +/** 個人辞書ライブラリを初期化する */ +void +anthy_dic_util_init(void) +{ + if (gIsInit) { + return ; + } + if (anthy_init_dic() == -1) { + return ; + } + anthy_dic_set_personality("default"); + gIsInit = 1; + dic_util_encoding = ANTHY_EUC_JP_ENCODING; + /**/ + word_iterator.key_buf[0] = 0; + word_iterator.in_tt = 1; +} + +/** 辞書ライブラリを解放する */ +void +anthy_dic_util_quit(void) +{ + if (gIsInit) { + anthy_quit_dic(); + } + set_current_line(NULL, NULL); + gIsInit = 0; +} + +/** 辞書ユーティリティAPIのエンコーディングを設定する */ +int +anthy_dic_util_set_encoding(int enc) +{ + if (enc == ANTHY_UTF8_ENCODING || + enc == ANTHY_EUC_JP_ENCODING) { + dic_util_encoding = enc; + } + return dic_util_encoding; +} + +void +anthy_dic_util_set_personality(const char *id) +{ + anthy_dic_set_personality(id); +} + +static char * +find_next_key(const char *prefix) +{ + char *v; + v = anthy_trie_find_next_key(anthy_private_tt_dic, + word_iterator.key_buf, MAX_KEY_LEN+32); + + if (v && v[0] == prefix[0] && v[1] == prefix[1]) { + /* 次のkeyも指定されたprefixを持っている */ + return v; + } + /**/ + sprintf(word_iterator.key_buf, "%s", prefix); + return NULL; +} + +static void +delete_prefix(const char *prefix) +{ + sprintf(word_iterator.key_buf, "%s", prefix); + anthy_priv_dic_lock(); + /* word_iterator.key_bufがprefixの文字列であれば、find_next_key()は + 最初の単語を返す */ + while (find_next_key(prefix)) { + anthy_trie_delete(anthy_private_tt_dic, word_iterator.key_buf); + sprintf(word_iterator.key_buf, "%s", prefix); + } + anthy_priv_dic_unlock(); +} + +static const char * +encoding_prefix(int encoding) +{ + if (encoding == ANTHY_UTF8_ENCODING) { + return " p"; + } + /* EUC-JP */ + return " "; +} + +/** (API) 個人辞書を全部消す */ +void +anthy_priv_dic_delete(void) +{ + delete_prefix(encoding_prefix(ANTHY_EUC_JP_ENCODING)); + /**/ + while (!anthy_textdict_delete_line(anthy_private_text_dic, 0)) { + /**/ + } +} + +static int +scan_one_word_cb(void *p, int next_offset, const char *key, const char *n) +{ + (void)p; + set_current_line(key, n); + word_iterator.dicfile_offset = next_offset; + return -1; +} + +static int +select_first_entry_in_textdict(void) +{ + word_iterator.dicfile_offset = 0; + set_current_line(NULL, NULL); + anthy_textdict_scan(anthy_private_text_dic, + word_iterator.dicfile_offset, NULL, + scan_one_word_cb); + if (word_iterator.current_line) { + word_iterator.in_tt = 0; + return 0; + } + /* 単語が無い */ + return ANTHY_DIC_UTIL_ERROR; +} + +/** (API) 最初の単語を選択する */ +int +anthy_priv_dic_select_first_entry(void) +{ + if (dic_util_encoding == ANTHY_UTF8_ENCODING) { + return select_first_entry_in_textdict(); + } + if (anthy_private_tt_dic) { + sprintf(word_iterator.key_buf, "%s", encoding_prefix(dic_util_encoding)); + /* prefixの次のエントリが最初のエントリ */ + if (find_next_key(encoding_prefix(dic_util_encoding))) { + word_iterator.in_tt = 1; + return 0; + } + } + /* 単語が無いのでtextdictに移動を試みる */ + return select_first_entry_in_textdict(); +} + +/** (API) 現在選択されている単語の次の単語を選択する */ +int +anthy_priv_dic_select_next_entry(void) +{ + if (!word_iterator.in_tt) { + set_current_line(NULL, NULL); + anthy_textdict_scan(anthy_private_text_dic, word_iterator.dicfile_offset, + NULL, + scan_one_word_cb); + if (word_iterator.current_line) { + return 0; + } + return ANTHY_DIC_UTIL_ERROR; + } + if (find_next_key(encoding_prefix(dic_util_encoding))) { + return 0; + } + /* 単語が無いのでtextdictに移動を試みる */ + return select_first_entry_in_textdict(); +} + +/** 未実装 */ +int +anthy_priv_dic_select_entry(const char *index) +{ + (void)index; + return 0; +} + +/** 現在選択されている単語の読みをを取得する */ +char * +anthy_priv_dic_get_index(char *buf, int len) +{ + int i; + char *src_buf; + if (word_iterator.in_tt) { + src_buf = &word_iterator.key_buf[2]; + } else { + src_buf = word_iterator.current_index; + } + if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) { + /**/ + src_buf = anthy_conv_utf8_to_euc(src_buf); + } else { + src_buf = strdup(src_buf); + } + /* 最初の空白か\0までをコピーする */ + for (i = 0; src_buf[i] && src_buf[i] != ' '; i++) { + if (i >= len - 1) { + free(src_buf); + return NULL; + } + buf[i] = src_buf[i]; + } + buf[i] = 0; + free(src_buf); + return buf; +} + +/** 現在選択されている単語の頻度を取得する */ +int +anthy_priv_dic_get_freq(void) +{ + struct word_line res; + char *v; + if (word_iterator.in_tt) { + v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf); + anthy_parse_word_line(v, &res); + free(v); + } else { + anthy_parse_word_line(word_iterator.current_line, &res); + } + return res.freq; +} + +/** 現在選択されている単語の品詞を取得する */ +char * +anthy_priv_dic_get_wtype(char *buf, int len) +{ + struct word_line res; + char *v; + if (word_iterator.in_tt) { + v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf); + anthy_parse_word_line(v, &res); + free(v); + } else { + anthy_parse_word_line(word_iterator.current_line, &res); + } + if (len - 1 < (int)strlen(res.wt)) { + return NULL; + } + sprintf(buf, "%s", res.wt); + return buf; +} + +/** 現在選択されている単語を取得する */ +char * +anthy_priv_dic_get_word(char *buf, int len) +{ + char *v; + char *s; + if (word_iterator.in_tt) { + v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf); + } else { + v = word_iterator.current_line; + } + if (!v) { + return NULL; + } + /* 品詞の後ろにある単語を取り出す */ + s = strchr(v, ' '); + s++; + if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) { + s = anthy_conv_utf8_to_euc(s); + snprintf(buf, len, "%s", s); + free(s); + } else { + snprintf(buf, len, "%s", s); + } + if (word_iterator.in_tt) { + free(v); + } + return buf; +} + +static int +find_cb(void *p, int next_offset, const char *key, const char *n) +{ + struct scan_context *sc = p; + struct word_line res; + if (strcmp(key, sc->yomi)) { + sc->offset = next_offset; + return 0; + } + anthy_parse_word_line(n, &res); + if (!strcmp(res.wt, sc->wt_name) && + !strcmp(res.word, sc->word)) { + sc->found_word = 1; + return -1; + } + sc->offset = next_offset; + return 0; +} + +static int +order_cb(void *p, int next_offset, const char *key, const char *n) +{ + struct scan_context *sc = p; + (void)n; + if (strcmp(key, sc->yomi) >= 0) { + sc->found_word = 1; + return -1; + } + sc->offset = next_offset; + return 0; +} + +/* 引数はutf8 */ +static int +do_add_word_to_textdict(struct textdict *td, int offset, + const char *yomi, const char *word, + const char *wt_name, int freq) +{ + char *buf = malloc(strlen(yomi) + strlen(word) + strlen(wt_name) + 20); + int rv; + if (!buf) { + return -1; + } + sprintf(buf, "%s %s*%d %s\n", yomi, wt_name, freq, word); + rv = anthy_textdict_insert_line(td, offset, buf); + free(buf); + return rv; +} + +static int +dup_word_check(const char *v, const char *word, const char *wt) +{ + struct word_line res; + + if (anthy_parse_word_line(v, &res)) { + return 0; + } + + /* 読みと単語を比較する */ + if (!strcmp(res.wt, wt) && + !strcmp(res.word, word)) { + return 1; + } + return 0; +} + +static int +find_same_word(char *idx_buf, const char *yomi, + const char *word, const char *wt_name, int yomi_len) +{ + int found = 0; + sprintf(idx_buf, "%s%s ", + encoding_prefix(dic_util_encoding), + yomi); + anthy_trie_find_next_key(anthy_private_tt_dic, + idx_buf, yomi_len + 12); + + /* trieのインデックスを探す */ + do { + char *v; + if (strncmp(&idx_buf[2], yomi, yomi_len) || + idx_buf[yomi_len+2] != ' ') { + /* 見出語が異なるのでループ終了 */ + break; + } + /* texttrieにアクセスして、見出語以外も一致しているかをチェック */ + v = anthy_trie_find(anthy_private_tt_dic, idx_buf); + if (v) { + found = dup_word_check(v, word, wt_name); + free(v); + if (found) { + break; + } + } + } while (anthy_trie_find_next_key(anthy_private_tt_dic, + idx_buf, yomi_len + 12)); + + return found; +} + +static int +add_word_to_textdict(const char *yomi, const char *word, + const char *wt_name, int freq) +{ + struct scan_context sc; + int rv; + int yomi_len = strlen(yomi); + + if (yomi_len > MAX_KEY_LEN || yomi_len == 0) { + return ANTHY_DIC_UTIL_ERROR; + } + + if (wt_name[0] != '#') { + return ANTHY_DIC_UTIL_ERROR; + } + + /* texttrieにあれば消す */ + if (anthy_private_tt_dic) { + char *idx_buf = malloc(yomi_len + 12); + if (find_same_word(idx_buf, yomi, word, wt_name, yomi_len)) { + anthy_trie_delete(anthy_private_tt_dic, idx_buf); + } + free(idx_buf); + } + + /* 同じ物があったら消す */ + sc.yomi = yomi; + sc.word = word; + sc.wt_name = wt_name; + /**/ + sc.offset = 0; + sc.found_word = 0; + anthy_textdict_scan(anthy_private_text_dic, 0, &sc, + find_cb); + if (sc.found_word == 1) { + anthy_textdict_delete_line(anthy_private_text_dic, sc.offset); + } + if (freq == 0) { + return ANTHY_DIC_UTIL_OK; + } + /* 追加する場所を探す */ + sc.offset = 0; + sc.found_word = 0; + anthy_textdict_scan(anthy_private_text_dic, 0, &sc, + order_cb); + /* 追加する */ + rv = do_add_word_to_textdict(anthy_private_text_dic, sc.offset, + yomi, word, wt_name, freq); + if (!rv) { + return ANTHY_DIC_UTIL_OK; + } + return ANTHY_DIC_UTIL_ERROR; +} + +/** 単語を登録する + * 頻度が0の場合は削除 + */ +int +anthy_priv_dic_add_entry(const char *yomi, const char *word, + const char *wt_name, int freq) +{ + if (dic_util_encoding == ANTHY_UTF8_ENCODING) { + return add_word_to_textdict(yomi, word, wt_name, freq); + } else { + int rv; + char *yomi_utf8 = anthy_conv_euc_to_utf8(yomi); + char *word_utf8 = anthy_conv_euc_to_utf8(word); + rv = add_word_to_textdict(yomi_utf8, word_utf8, wt_name, freq); + free(yomi_utf8); + free(word_utf8); + return rv; + } +} + +const char * +anthy_dic_util_get_anthydir(void) +{ + return anthy_conf_get_str("ANTHYDIR"); +} + +/* lookコマンドの辞書を検索するための関数 */ +static char * +do_search(FILE *fp, const char *word) +{ + char buf[32]; + char *res = NULL; + int word_len = strlen(word); + while (fgets(buf, 32, fp)) { + int len = strlen(buf); + buf[len - 1] = 0; + len --; + if (len > word_len) { + continue; + } + if (!strncasecmp(buf, word, len)) { + if (res) { + free(res); + } + res = strdup(buf); + } + } + return res; +} + +/* lookコマンドの辞書を検索するAPI */ +char * +anthy_dic_search_words_file(const char *word) +{ + FILE *fp; + char *res; + const char *words_dict_fn = anthy_conf_get_str("WORDS_FILE"); + if (!words_dict_fn) { + return NULL; + } + fp = fopen(words_dict_fn, "r"); + if (!fp) { + return NULL; + } + res = do_search(fp, word); + fclose(fp); + return res; +} diff --git a/src-worddic/ext_ent.c b/src-worddic/ext_ent.c new file mode 100644 index 0000000..db3edf4 --- /dev/null +++ b/src-worddic/ext_ent.c @@ -0,0 +1,564 @@ +/* + * "123" "ABC" のような辞書にのってない + * 文字列に対する問合せの場合は全ての候補をここで生成する + * 上記の他に郵便番号へのアクセスも行なう + * + * Copyright (C) 2001-2005 TABATA Yusuke + * Copyright (C) 2004-2005 YOSHIDA Yuichi + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <anthy/anthy.h> /* for ANTHY_*_ENCODING */ +#include <anthy/conf.h> +#include <anthy/xstr.h> +#include <anthy/xchar.h> +#include "dic_main.h" +#include "dic_ent.h" + +/* ext entry */ +static struct seq_ent unkseq_ent;/*未知文字列たとえば英文字列とか*/ +static struct seq_ent num_ent;/*数字など*/ +static struct seq_ent sep_ent;/*セパレータなど。*/ +/* ext entryのwtype*/ +static wtype_t wt_num; + +static xchar narrow_wide_tab[]= {WIDE_0, WIDE_1, WIDE_2, + WIDE_3, WIDE_4, WIDE_5, + WIDE_6, WIDE_7, WIDE_8, WIDE_9}; +static int kj_num_tab[]={KJ_0, KJ_1, KJ_2, KJ_3, KJ_4, + KJ_5, KJ_6, KJ_7, KJ_8, KJ_9}; + +struct zipcode_line { + int nr; + xstr **strs; +}; + +/* 地名を追加する */ +static void +pushback_place_name(struct zipcode_line *zl, char *pn) +{ + if (pn[0] == '#') { + return ; + } + zl->strs = realloc(zl->strs, sizeof(xstr *) * (zl->nr + 1)); + zl->strs[zl->nr] = anthy_cstr_to_xstr(pn, ANTHY_EUC_JP_ENCODING); + zl->nr++; +} + +/* 郵便番号辞書をパースしてスペース区切りを検出する */ +static void +parse_zipcode_line(struct zipcode_line *zl, char *ln) +{ + char buf[1000]; + int i = 0; + while (*ln) { + buf[i] = *ln; + if (*ln == '\\') { + buf[i] = ln[1]; + i ++; + if (ln[1]) { + ln ++; + } + } else if (*ln == ' ') { + buf[i] = 0; + i = 0; + pushback_place_name(zl, buf); + } else { + i ++; + } + /**/ + ln ++; + } + buf[i] = 0; + pushback_place_name(zl, buf); +} + +/* 郵便番号辞書から探す */ +static void +search_zipcode_dict(struct zipcode_line *zl, xstr* xs) +{ + FILE *fp; + char buf[1000]; + int len; + xstr *temp; + char *index; + + zl->nr = 0; + zl->strs = NULL; + fp = fopen(anthy_conf_get_str("ZIPDICT_EUC"), "r"); + if (!fp) { + return ; + } + + /* 半角、全角を吸収する */ + temp = anthy_xstr_wide_num_to_num(xs); + index = anthy_xstr_to_cstr(temp, 0); + len = strlen(index); + + /* 全部grepする */ + while (fgets(buf, 1000, fp)) { + /* 3文字の郵便番号が7文字の郵便番号の頭にマッチしないように */ + if (!strncmp(buf, index, len) && buf[len] == ' ') { + /* 改行を消す */ + buf[strlen(buf) - 1] = 0; + parse_zipcode_line(zl, &buf[len + 1]); + } + } + anthy_free_xstr(temp); /* メモリリークの修正 */ + free(index); + fclose(fp); +} + +/* 郵便番号辞書の情報を解放する */ +static void +free_zipcode_line(struct zipcode_line *zl) +{ + int i; + for (i = 0; i < zl->nr; i++) { + anthy_free_xstr(zl->strs[i]); + } + free(zl->strs); +} + +static int +gen_zipcode(xstr* xs, xstr *dest, int nth) +{ + struct zipcode_line zl; + + /* 郵便番号辞書から地名を読み取る */ + search_zipcode_dict(&zl, xs); + + /* 候補を取得する */ + if (zl.nr > nth) { + dest->len = zl.strs[nth]->len; + dest->str = anthy_xstr_dup_str(zl.strs[nth]); + free_zipcode_line(&zl); + return 0; + } else { + free_zipcode_line(&zl); + return -1; + } +} + + + +/* 半角の数字から全角の数字を求める */ +static xchar +narrow_num_to_wide_num(xchar xc) +{ + if (xc > '9' || xc < '0') { + return WIDE_0; + } + return narrow_wide_tab[(int)(xc - '0')]; +} + +/* 全角の数字から半角の数字を求める */ +static xchar +wide_num_to_narrow_num(xchar xc) +{ + int i; + for (i = 0; i < 10; i++) { + if (xc == narrow_wide_tab[i]) { + return i + '0'; + } + } + return '0'; +} +/* + * 一桁の整数を漢数字に変換する + */ +static xchar +get_kj_num(int n) +{ + if (n > 9 || n < 1) { + return KJ_10; + } + return kj_num_tab[n]; +} + +/* + * 4桁分の整数を漢字文字列としてを生成する + */ +static void +compose_num_component(xstr *xs, long long num) +{ + int n[4],i; + int a[4] = { 0 , KJ_10, KJ_100, KJ_1000}; + for (i = 0; i < 4; i++) { + n[i] = num-(num/10)*10; + num /= 10; + } + /* 10,100,1000の位 */ + for (i = 3; i > 0; i--) { + if (n[i] > 0) { + if (n[i] > 1) { + anthy_xstrappend(xs, get_kj_num(n[i])); + } + anthy_xstrappend(xs, a[i]); + } + } + /* 1の位 */ + if (n[0]) { + anthy_xstrappend(xs, get_kj_num(n[0])); + } +} + +/** 漢数字の文字列を作る */ +static int +gen_kanji_num(long long num, xstr *dest) +{ + int i; + int n[10]; + if (num < 1 || num >= 10000000000000000LL) { + return -1; + } + /* 4桁ずつ配列nにつめる */ + for (i = 0; i < 10; i ++) { + n[i] = num-(num/10000)*10000; + num = num/10000; + } + /**/ + dest->len = 0; + dest->str = 0; + /* 京の位をつくる */ + if (n[3]) { + compose_num_component(dest, n[3]); + anthy_xstrappend(dest, KJ_1000000000000); + } + /* 億の位をつくる */ + if (n[2]) { + compose_num_component(dest, n[2]); + anthy_xstrappend(dest, KJ_100000000); + } + /* 万の位をつくる */ + if (n[1]) { + compose_num_component(dest, n[1]); + anthy_xstrappend(dest, KJ_10000); + } + /**/ + compose_num_component(dest, n[0]); + return 0; +} + +static int +get_nr_zipcode(xstr* xs) +{ + struct zipcode_line zl; + int nr = 0; + if (xs->len != 3 && xs->len != 7) { + return 0; + } + /* 郵便番号辞書から地名を読み取る */ + search_zipcode_dict(&zl, xs); + + nr = zl.nr; + free_zipcode_line(&zl); + return nr; +} + +static int +get_nr_num_ents(long long num) +{ + if (num > 0 && num < 10000000000000000LL) { + if (num > 999) { + /* アラビア数字(そのまま)、アラビア数字(全角半角切替え)、 + 漢数字、3桁区切り(全角、半角) */ + return 5; + } else { + /* アラビア数字(そのまま)、全角半角切替え、漢数字 */ + return 3; + } + } else { + /* アラビア数字(そのまま)、全角半角切替え */ + return 2; + } +} + + +/* + * いくつの合成のエントリーがあるか + */ +int +anthy_get_nr_dic_ents_of_ext_ent(seq_ent_t se, xstr *xs) +{ + if (se == &unkseq_ent) { + return 1; + } + if (anthy_get_xstr_type(xs) & (XCT_NUM|XCT_WIDENUM)) { + long long num = anthy_xstrtoll(xs); + return get_nr_num_ents(num) + get_nr_zipcode(xs); + } + return 0; +} + +/* 文字列の全角半角を交換する */ +static void +toggle_wide_narrow(xstr *dest, xstr *src) +{ + int f, i; + dest->len = src->len; + dest->str = anthy_xstr_dup_str(src); + f = anthy_get_xstr_type(src) & XCT_WIDENUM; + for (i = 0; i < dest->len; i++) { + if (f) { + dest->str[i] = wide_num_to_narrow_num(src->str[i]); + } else { + dest->str[i] = narrow_num_to_wide_num(src->str[i]); + } + } +} + +/* 3桁に区切った数字を生成する */ +static int +gen_separated_num(long long num, xstr *dest, int full) +{ + int width = 0, dot_count; + long long tmp; + int i, pos; + + if (num < 1000) { + return -1; + } + + /* 桁数を数える */ + for (tmp = num; tmp != 0; tmp /= 10) { + width ++; + } + /* 点の数 */ + dot_count = (width - 1) / 3; + /* 格納するのに必要な文字列を用意する */ + dest->len = dot_count + width; + dest->str = malloc(sizeof(xchar)*dest->len); + + /* 右の桁から順に決めていく */ + for (i = 0, pos = dest->len - 1; i < width; i++, pos --) { + int n = num % 10; + /* カンマを追加 */ + if (i > 0 && (i % 3) == 0) { + if (full) { + dest->str[pos] = WIDE_COMMA; + } else { + dest->str[pos] = ','; + } + pos --; + } + if (full) { + /* 全角数字 */ + dest->str[pos] = narrow_wide_tab[n]; + } else { + /* ASCII数字 */ + dest->str[pos] = 48 + n; + } + num /= 10; + } + return 0; +} + +/* + * nth個めの候補を取り出す + */ +int +anthy_get_nth_dic_ent_str_of_ext_ent(seq_ent_t se, xstr *xs, + int nth, xstr *dest) +{ + dest->str = NULL; /* 不正なメモリアクセスやメモリの多重解放をするバグの修正 */ + dest->len = 0; + if (nth == 0) { + /* 無変換文字列 */ + dest->len = xs->len; + dest->str = anthy_xstr_dup_str(xs); + return 0; + } + if (se == &unkseq_ent) { + switch(nth) { + case 1: + /* 全角、半角のトグル */ + return 0; + } + } + if (anthy_get_xstr_type(xs) & (XCT_NUM|XCT_WIDENUM)) { + long long num = anthy_xstrtoll(xs); + const int base_ents = get_nr_num_ents(num); /* 3桁郵便番号への対応 */ + /* 漢数字、アラビア数字、全角半角切替え */ + switch(nth) { + case 1: + /* 全角半角を入れ換えたもの */ + toggle_wide_narrow(dest, xs); + return 0; + case 2: + /* 漢数字 */ + if (!gen_kanji_num(num, dest)) { + return 0; + } + /* break無し */ + case 3: + /* 3桁で区切った数字 */ + if (!gen_separated_num(num, dest, 0)) { + return 0; + } + /* break無し */ + case 4: + /* 3桁で区切った数字(全角) */ + if (!gen_separated_num(num, dest, 1)) { + return 0; + } + /* break無し */ + default: + /* 郵便番号 */ + if (base_ents <= nth) { /* 3桁郵便番号への対応 */ + if (xs->len == 3 || xs->len == 7) { + if (!gen_zipcode(xs, dest, nth - base_ents)) { /* 3桁郵便番号への対応 */ + return 0; + } + } + } + break; + } + return -1; + } + return 0; +} + +int +anthy_get_ext_seq_ent_indep(struct seq_ent *se) +{ + if (se == &num_ent || se == &unkseq_ent) { + return 1; + } + return 0; +} + +/* 活用形を得る */ +int +anthy_get_ext_seq_ent_ct(struct seq_ent *se, int pos, int ct) +{ + if (anthy_get_ext_seq_ent_pos(se, pos) && ct == CT_NONE) { + /* 品詞が合っていてかつ無活用の場合 + (ext_entは活用しない) */ + return 10; + } + return 0; +} + +/* 品詞を取得する */ +int +anthy_get_ext_seq_ent_pos(struct seq_ent *se, int pos) +{ + /* ext_entは名詞のみ */ + if (se == &num_ent && pos == POS_NOUN) { + return 10; + } + if ((se == &unkseq_ent) && pos == POS_NOUN) { + return 10; + } + return 0; +} + +/* + * 辞書にのっていないシーケンスを解析 + */ +seq_ent_t +anthy_get_ext_seq_ent_from_xstr(xstr *x, int is_reverse) +{ + int t = anthy_get_xstr_type(x); + + /* 数字のみで構成されていれば num_ent */ + if (t & (XCT_NUM | XCT_WIDENUM)) { + return &num_ent; + } + /* 英数ならunkseq */ + if (t & XCT_ASCII) { + return &unkseq_ent; + } + if (t & XCT_KATA) { + return &unkseq_ent; + } + if (!is_reverse) { + /* 逆変換中は漢字候補は作らない */ + if (t & XCT_KANJI) { + return &unkseq_ent; + } + } + if (x->len == 1) { + /* 辞書にのってなくて1文字ならセパレータ */ + return &sep_ent; + } + return 0; +} + +int +anthy_get_nth_dic_ent_wtype_of_ext_ent(xstr *xs, int nth, + wtype_t *wt) +{ + int type; + (void)nth; + type = anthy_get_xstr_type(xs); + if (type & (XCT_NUM | XCT_WIDENUM)) { + *wt = wt_num; + return 0; + } + if (type & XCT_KATA) { + *wt = anthy_get_wtype(POS_NOUN, COS_NONE, SCOS_NONE, CC_NONE, + CT_NONE, WF_INDEP); + return 0; + } + return -1; +} + +int +anthy_get_nth_dic_ent_freq_of_ext_ent(struct seq_ent *se, int nth) +{ + (void)se; + (void)nth; + return 100; +} + +int +anthy_get_ext_seq_ent_wtype(struct seq_ent *se, wtype_t w) +{ + if (se == &num_ent) { + if (anthy_wtype_include(w, wt_num)) { + /* 数字の場合 */ + return 10; + } + return 0; + } + if (anthy_wtype_get_pos(w) == POS_NOUN && + anthy_wtype_get_cos(w) == COS_NONE && + anthy_wtype_get_scos(w) == SCOS_NONE) { + /* 名詞、副品詞なし、副々品詞無しにマッチ */ + return 10; + } + return 0; +} + +void +anthy_init_ext_ent(void) +{ + /**/ + unkseq_ent.seq_type = 0; + unkseq_ent.nr_dic_ents = 0; + num_ent.seq_type = 0; + num_ent.nr_dic_ents = 0; + sep_ent.seq_type = 0; + sep_ent.nr_dic_ents = 0; + /**/ + wt_num = anthy_init_wtype_by_name("数詞"); +} diff --git a/src-worddic/feature_set.c b/src-worddic/feature_set.c new file mode 100644 index 0000000..fcf7aba --- /dev/null +++ b/src-worddic/feature_set.c @@ -0,0 +1,248 @@ +/* features + * + * 素性の番号と意味を隠蔽して管理する + * + * Copyright (C) 2006-2007 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <arpa/inet.h> +#include <anthy/segclass.h> +#include <anthy/feature_set.h> +/* for MW_FEATURE* constants */ +#include <anthy/splitter.h> + +/* 素性の番号 + * + * 0-19 クラス素性 + * 30-319(30+SEG_SIZE^2) クラス遷移属性 + * 540-579 その他 + * 580- (1024個) 付属語の種類 + */ + +#define CUR_CLASS_BASE 0 +#define DEP_TYPE_FEATURE_BASE 20 +#define CLASS_TRANS_BASE 30 +#define FEATURE_SV 542 +#define FEATURE_WEAK 543 +#define FEATURE_SUFFIX 544 +#define FEATURE_NUM 546 +#define FEATURE_CORE1 547 +#define FEATURE_HIGH_FREQ 548 +#define FEATURE_WEAK_SEQ 549 +#define COS_BASE 573 +#define DEP_FEATURE_BASE 580 + +void +anthy_feature_list_init(struct feature_list *fl) +{ + fl->nr = 0; + fl->size = NR_EM_FEATURES; +} + +void +anthy_feature_list_free(struct feature_list *fl) +{ + (void)fl; +} + +void +anthy_feature_list_add(struct feature_list *fl, int f) +{ + if (fl->nr < NR_EM_FEATURES) { + fl->u.index[fl->nr] = f; + fl->nr++; + } +} + +int +anthy_feature_list_nr(const struct feature_list *fl) +{ + return fl->nr; +} + +int +anthy_feature_list_nth(const struct feature_list *fl, int nth) +{ + return fl->u.index[nth]; +} + +static int +cmp_short(const void *p1, const void *p2) +{ + return *((short *)p1) - *((short *)p2); +} + +void +anthy_feature_list_sort(struct feature_list *fl) +{ + qsort(fl->u.index, fl->nr, sizeof(fl->u.index[0]), + cmp_short); +} + + +void +anthy_feature_list_set_cur_class(struct feature_list *fl, int cl) +{ + anthy_feature_list_add(fl, CUR_CLASS_BASE + cl); +} + +void +anthy_feature_list_set_class_trans(struct feature_list *fl, int pc, int cc) +{ + anthy_feature_list_add(fl, CLASS_TRANS_BASE + pc * SEG_SIZE + cc); +} + +void +anthy_feature_list_set_dep_word(struct feature_list *fl, int h) +{ + anthy_feature_list_add(fl, h + DEP_FEATURE_BASE); +} + +void +anthy_feature_list_set_dep_class(struct feature_list *fl, int c) +{ + anthy_feature_list_add(fl, c + DEP_TYPE_FEATURE_BASE); +} + +void +anthy_feature_list_set_noun_cos(struct feature_list *fl, wtype_t wt) +{ + int c; + if (anthy_wtype_get_pos(wt) != POS_NOUN) { + return ; + } + c = anthy_wtype_get_cos(wt); + if (c == COS_SUFFIX) { + anthy_feature_list_add(fl, COS_BASE + c); + } +} + +void +anthy_feature_list_set_mw_features(struct feature_list *fl, int mask) +{ + if (mask & MW_FEATURE_WEAK_CONN) { + anthy_feature_list_add(fl, FEATURE_WEAK); + } + if (mask & MW_FEATURE_SUFFIX) { + anthy_feature_list_add(fl, FEATURE_SUFFIX); + } + if (mask & MW_FEATURE_SV) { + anthy_feature_list_add(fl, FEATURE_SV); + } + if (mask & MW_FEATURE_NUM) { + anthy_feature_list_add(fl, FEATURE_NUM); + } + if (mask & MW_FEATURE_CORE1) { + anthy_feature_list_add(fl, FEATURE_CORE1); + } + if (mask & MW_FEATURE_HIGH_FREQ) { + anthy_feature_list_add(fl, FEATURE_HIGH_FREQ); + } + if (mask & MW_FEATURE_WEAK_SEQ) { + anthy_feature_list_add(fl, FEATURE_WEAK_SEQ); + } +} + +void +anthy_feature_list_print(struct feature_list *fl) +{ + int i; + printf("features="); + for (i = 0; i < fl->nr; i++) { + if (i) { + printf(","); + } + printf("%d", fl->u.index[i]); + } + printf("\n"); +} + +static int +compare_line(const void *kp, const void *cp) +{ + const int *f = kp; + const struct feature_freq *c = cp; + int i; + for (i = 0; i < NR_EM_FEATURES; i++) { + if (f[i] != (int)ntohl(c->f[i])) { + return f[i] - ntohl(c->f[i]); + } + } + return 0; +} + +struct feature_freq * +anthy_find_array_freq(const void *image, int *f, int nr, + struct feature_freq *arg) +{ + struct feature_freq *res; + int nr_lines, i; + const int *array = (int *)image; + int n[NR_EM_FEATURES]; + if (!image) { + return NULL; + } + /* コピーする */ + for (i = 0; i < NR_EM_FEATURES; i++) { + if (i < nr) { + n[i] = f[i]; + } else { + n[i] = 0; + } + } + /**/ + nr_lines = ntohl(array[1]); + res = bsearch(n, &array[16], nr_lines, + sizeof(struct feature_freq), + compare_line); + if (!res) { + return NULL; + } + for (i = 0; i < NR_EM_FEATURES + 2; i++) { + arg->f[i] = ntohl(res->f[i]); + } + return arg; +} + +struct feature_freq * +anthy_find_feature_freq(const void *image, + const struct feature_list *fl, + struct feature_freq *arg) +{ + int i, nr; + int f[NR_EM_FEATURES + 2]; + + /* 配列にコピーする */ + nr = anthy_feature_list_nr(fl); + for (i = 0; i < NR_EM_FEATURES + 2; i++) { + if (i < nr) { + f[i] = anthy_feature_list_nth(fl, i); + } else { + f[i] = 0; + } + } + return anthy_find_array_freq(image, f, NR_EM_FEATURES, arg); +} + +void +anthy_init_features(void) +{ +} diff --git a/src-worddic/matrix.c b/src-worddic/matrix.c new file mode 100644 index 0000000..edc12cd --- /dev/null +++ b/src-worddic/matrix.c @@ -0,0 +1,575 @@ +/* + * 疎行列を扱うためのコード + * + * (1) 行列(sparse_matrix)のインスタンスを作成し行列の要素を設定する + * (2) 行列から行列イメージ(matrix_image)を作成する + * * 行列イメージをnetwork byteorderでファイルに書き出す + * (3) 行列イメージを読み込み(or mmapする)要素にアクセスする + * + */ +/* + * sparse matrix crammer + * + * sparse matrix storage uses following 2 sparse arrays + * *array of row + * *array of cells in a row + * + *(1/2) + * sparse row crammed row + * 0:0 1:1 + * 1:1 ---->> 3:1 + * 2:0 hash(h)%m 7:1 + * 3:1 / + * 4:0 / + * 5:0 / + * 6:0 + * 7:1 + * 8:0 + * (?:1 means non-all 0 row) + *(2/2) + * crammed row cram shift count + * 1:1 . . -> .. shift 0 + * 3:1 . . -> .. shift 2 + * 7:1 . . . -> ... shift 4 + * + * contents of | + * matrix \|/ + * + * ....... unified array of (value.column) pair + * + * matrix image + * image[0] : length of hashed row array + * image[1] : length of crammed cell array + * image[2 ~ 2+image[0]-1] : hashed row array + * image[2+image[0] ~ 2+image[0]+image[1]-1] : hashed row array + * + * Copyright (C) 2005 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <stdlib.h> + +#include <anthy/diclib.h> +/* public APIs */ +#include <anthy/matrix.h> + +/* maximum length allowed for hash chain */ +#define MAX_FAILURE 50 + +struct list_elm { + int index; + int value; + void *ptr; + struct list_elm *next; + /* bypass to mitigate O(n) insertion cost */ + struct list_elm *orig_next; +}; + +struct array_elm { + int index; + int value; + void *ptr; +}; + +/* + * sparse array has two representation + * + * (1) list and (2) hashed array + * build list first and sparse_array_make_array() to build hashed array + * this stores one value and one pointer + * + */ +struct sparse_array { + /* list representation */ + int elm_count; + /* sorted */ + struct list_elm head; + + /* array representation */ + int array_len; + struct array_elm *array; +}; + +static struct sparse_array * +sparse_array_new(void) +{ + struct sparse_array *a = malloc(sizeof(struct sparse_array)); + /**/ + a->elm_count = 0; + a->head.next = NULL; + a->head.orig_next = NULL; + a->head.index = -1; + /**/ + a->array_len = 0; + a->array = NULL; + return a; +} + +static void +insert_elm_after(struct list_elm *elm, int idx, int val, void *ptr) +{ + struct list_elm *new_elm = malloc(sizeof(struct list_elm)); + new_elm->value = val; + new_elm->index = idx; + new_elm->ptr = ptr; + /**/ + new_elm->next = elm->next; + new_elm->orig_next = elm->next; + elm->next = new_elm; +} + +static void +sparse_array_set(struct sparse_array *sa, int idx, int val, void *ptr) +{ + struct list_elm *e; + e = &sa->head; + while (e) { + if (e->index == idx) { + /* find same index and update */ + e->value = val; + e->ptr = ptr; + return ; + } + /* search */ + if (e->index < idx && (!e->next || idx < e->next->index)) { + insert_elm_after(e, idx, val, ptr); + /**/ + sa->elm_count ++; + return ; + } + /* go next */ + if (e->orig_next && e->orig_next->index < idx) { + /* leap ahead */ + e = e->orig_next; + } else { + e = e->next; + } + } +} + +static int +hash(int val, int max, int nth) +{ + val += nth * 113; + if (val < 0) { + val = -val; + } + if (max == 0) { + return 0; + } + return val % max; +} + +static int +sparse_array_try_make_array(struct sparse_array *s) +{ + int i; + struct list_elm *e; + /* initialize */ + free(s->array); + s->array = malloc(sizeof(struct array_elm) * s->array_len); + for (i = 0; i < s->array_len; i++) { + s->array[i].index = -1; + } + + /* push */ + for (e = s->head.next; e; e = e->next) { + int ok = 0; + int n = 0; + do { + int h = hash(e->index, s->array_len, n); + if (s->array[h].index == -1) { + /* find unused element in this array */ + ok = 1; + s->array[h].index = e->index; + s->array[h].value = e->value; + s->array[h].ptr = e->ptr; + } else { + /* collision */ + n ++; + if (n > MAX_FAILURE) { + /* too much collision */ + return 1; + } + } + } while (!ok); + } + return 0; +} + +static void +sparse_array_make_array(struct sparse_array *s) +{ + /* estimate length */ + if (s->elm_count == 1) { + s->array_len = 1; + } else { + s->array_len = s->elm_count; + } + while (sparse_array_try_make_array(s)) { + /* expand a little */ + s->array_len ++; + s->array_len *= 9; + s->array_len /= 8; + } +} + +static struct array_elm * +sparse_array_get(struct sparse_array *s, int index, struct array_elm *arg) +{ + if (s->array) { + int n = 0; + while (1) { + int h = hash(index, s->array_len, n); + if (s->array[h].index == index) { + *arg = s->array[h]; + return arg; + } + n ++; + if (n == MAX_FAILURE) { + return NULL; + } + } + } else { + struct list_elm *e = e = s->head.next; + while (e) { + if (e->index == index) { + arg->value = e->value; + arg->ptr = e->ptr; + return arg; + } + /* go next */ + if (e->orig_next && e->orig_next->index < index) { + /* leap ahead */ + e = e->orig_next; + } else { + e = e->next; + } + } + return NULL; + } +} + +static int +sparse_array_get_int(struct sparse_array *s, int index) +{ + struct array_elm elm; + if (sparse_array_get(s, index, &elm)) { + return elm.value; + } + return 0; +} + +static void * +sparse_array_get_ptr(struct sparse_array *s, int index) +{ + struct array_elm elm; + if (sparse_array_get(s, index, &elm)) { + return elm.ptr; + } + return NULL; +} + +/**/ +struct sparse_matrix { + /**/ + struct sparse_array *row_array; + /* image information */ + int nr_rows; + int array_length; +}; + +/* API */ +struct sparse_matrix * +anthy_sparse_matrix_new() +{ + struct sparse_matrix *m = malloc(sizeof(struct sparse_matrix)); + m->row_array = sparse_array_new(); + m->nr_rows = 0; + return m; +} + +static struct sparse_array * +find_row(struct sparse_matrix *m, int row, int create) +{ + struct sparse_array *a; + a = sparse_array_get_ptr(m->row_array, row); + if (a) { + return a; + } + if (!create) { + return NULL; + } + /* allocate a new row */ + a = sparse_array_new(); + sparse_array_set(m->row_array, row, 0, a); + m->nr_rows ++; + return a; +} + +/* API */ +void +anthy_sparse_matrix_set(struct sparse_matrix *m, int row, int column, + int value, void *ptr) +{ + struct sparse_array *a; + a = find_row(m, row, 1); + sparse_array_set(a, column, value, ptr); +} + +/* API */ +int +anthy_sparse_matrix_get_int(struct sparse_matrix *m, int row, int column) +{ + struct sparse_array *a; + struct list_elm *e; + a = find_row(m, row, 1); + if (!a) { + return 0; + } + for (e = &a->head; e; e = e->next) { + if (e->index == column) { + return e->value; + } + } + return 0; +} + +/* API */ +void +anthy_sparse_matrix_make_matrix(struct sparse_matrix *m) +{ + struct array_elm *ae; + int i; + int offset = 0; + /**/ + sparse_array_make_array(m->row_array); + /**/ + for (i = 0; i < m->row_array->array_len; i++) { + struct sparse_array *row; + ae = &m->row_array->array[i]; + /**/ + ae->value = offset; + if (ae->index == -1) { + continue; + } + /**/ + row = ae->ptr; + sparse_array_make_array(row); + offset += row->array_len; + } + m->array_length = offset; +} + +/* API */ +struct matrix_image * +anthy_matrix_image_new(struct sparse_matrix *s) +{ + struct matrix_image *mi; + int i; + int offset; + /**/ + mi = malloc(sizeof(struct matrix_image)); + mi->size = 2 + s->row_array->array_len * 2 + s->array_length * 2; + mi->image = malloc(sizeof(int) * mi->size); + mi->image[0] = s->row_array->array_len; + mi->image[1] = s->array_length; + /* row index */ + offset = 2; + for (i = 0; i < s->row_array->array_len; i++) { + struct array_elm *ae; + ae = &s->row_array->array[i]; + mi->image[offset + i*2] = ae->index; + mi->image[offset + i*2 + 1] = ae->value; + } + /* cells */ + offset = 2 + s->row_array->array_len * 2; + for (i = 0; i < s->row_array->array_len; i++) { + struct array_elm *ae; + struct sparse_array *sa; + int j; + ae = &s->row_array->array[i]; + if (ae->index == -1) { + continue; + } + sa = ae->ptr; + if (!sa) { + continue; + } + for (j = 0; j < sa->array_len; j++) { + struct array_elm *cell = &sa->array[j]; + mi->image[offset] = cell->index; + if (cell->index == -1) { + mi->image[offset + 1] = -1; + } else { + mi->image[offset + 1] = cell->value; + } + offset += 2; + } + } + /**/ + return mi; +} + +static int +read_int(int *image, int idx, int en) +{ + if (en) { + return anthy_dic_ntohl(image[idx]); + } + return image[idx]; +} + +static int +do_matrix_peek(int *image, int row, int col, int en) +{ + int n, h, shift, next_shift; + int row_array_len = read_int(image, 0, en); + int column_array_len; + int cell_offset; + + /* find row */ + if (row_array_len == 0) { + return 0; + } + for (n = 0; ; n++) { + h = hash(row, row_array_len, n); + if (read_int(image, 2+ h * 2, en) == row) { + shift = read_int(image, 2+h*2+1, en); + break; + } + if (read_int(image, 2+ h * 2, en) == -1) { + return 0; + } + if (n > MAX_FAILURE) { + return 0; + } + } + + /* find shift count of next row */ + if (h == row_array_len - 1) { + /* last one */ + next_shift = read_int(image, 1, en); + } else { + /* not last one */ + next_shift = read_int(image, 2+h*2+2+1, en); + } + + /* crammed width of this row */ + column_array_len = next_shift - shift; + + /* cells in this image */ + cell_offset = 2 + row_array_len * 2; + for (n = 0; ; n++) { + h = hash(col, column_array_len, n); + if (read_int(image, cell_offset + shift * 2+ h * 2, en) == col) { + return read_int(image, cell_offset + shift * 2 + h*2+1, en); + } + if (read_int(image, cell_offset + shift * 2+ h * 2, en) == -1) { + /* not exist */ + return 0; + } + if (n > MAX_FAILURE) { + return 0; + } + } + return 0; +} + +/* API */ +int +anthy_matrix_image_peek(int *image, int row, int col) +{ + if (!image) { + return 0; + } + return do_matrix_peek(image, row, col, 1); +} + +#ifdef DEBUG +/* for debug purpose */ +static void +sparse_array_dump(struct sparse_array *s) +{ + struct list_elm *e; + int i; + printf("list(%d):", s->elm_count); + for (e = s->head.next; e; e = e->next) { + printf(" %d:%d(%x)", e->index, e->value, (unsigned long)e->ptr); + } + printf("\n"); + if (!s->array) { + return ; + } + printf("array(%d):", s->array_len); + for (i = 0; i < s->array_len; i ++) { + struct array_elm *ae = &s->array[i]; + if (ae->index != -1) { + printf(" %d:%d,%d(%x)", i, ae->index, ae->value, (unsigned long)ae->ptr); + } + } + printf("\n"); + return ; + /**/ +} + +/* for debug purpose */ +void +sparse_matrix_dump(struct sparse_matrix *m) +{ + struct list_elm *e; + struct array_elm *ae; + int i, offset; + if (!m->row_array) { + for (e = m->row_array->head.next; e; e = e->next) { + sparse_array_dump(e->ptr); + } + return ; + } + printf("\nnumber of row=%d, row array size=%d, cell array size=%d\n\n", + m->nr_rows, m->row_array->array_len, m->array_length); + /* row part */ + for (i = 0; i < m->row_array->array_len; i++) { + struct array_elm *ae; + ae = &m->row_array->array[i]; + if (ae->index != -1) { + printf(" [%d] row=%d, shift=%d\n", i, ae->index, ae->value); + } + } + printf("\n"); + offset = 0; + for (i = 0; i < m->row_array->array_len; i++) { + struct array_elm *ae; + struct sparse_array *sa; + int j; + ae = &m->row_array->array[i]; + sa = ae->ptr; + if (!sa) { + continue; + } + for (j = 0; j < sa->array_len; j++) { + struct array_elm *cell = &sa->array[j]; + if (cell->index != -1) { + printf(" [%d] column=%d, value=%d\n", offset, cell->index, cell->value); + } + offset ++; + } + } + printf("\n"); +} +#endif /* DEBUG */ diff --git a/src-worddic/mem_dic.c b/src-worddic/mem_dic.c new file mode 100644 index 0000000..9a26b71 --- /dev/null +++ b/src-worddic/mem_dic.c @@ -0,0 +1,250 @@ +/* + * mem_dic 辞書のキャッシュを行う + * + * キャッシュは読みの文字列と逆変換用かのフラグ(is_reverse)の + * 二つをキーとして操作される。 + * + * Copyright (C) 2000-2007 TABATA Yusuke + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> + +#include <anthy/alloc.h> +#include "dic_main.h" +#include "mem_dic.h" + +static allocator mem_dic_ator; + +static void +dic_ent_dtor(void *p) +{ + struct dic_ent *de = p; + if (de->str.str) { + free(de->str.str); + } +} + +static void +seq_ent_dtor(void *p) +{ + struct seq_ent *seq = p; + int i; + /**/ + for (i = 0; i < seq->nr_dic_ents; i++) { + anthy_sfree(seq->md->dic_ent_allocator, seq->dic_ents[i]); + } + if (seq->nr_dic_ents) { + free(seq->dic_ents); + } + /**/ + free(seq->str.str); +} + +static void +mem_dic_dtor(void *p) +{ + struct mem_dic * md = p; + anthy_free_allocator(md->seq_ent_allocator); + anthy_free_allocator(md->dic_ent_allocator); +} + +/** xstrに対応するseq_entを確保する */ +static struct seq_ent * +alloc_seq_ent_by_xstr(struct mem_dic * md, xstr *x, int is_reverse) +{ + struct seq_ent *se; + se = (struct seq_ent *)anthy_smalloc(md->seq_ent_allocator); + if (is_reverse) { + se->seq_type = ST_REVERSE; + } else { + se->seq_type = ST_NONE; + } + se->md = md; + se->str.len = x->len; + /**/ + se->nr_dic_ents = 0; + se->dic_ents = NULL; + /**/ + se->nr_compound_ents = 0; + + se->str.str = anthy_xstr_dup_str(x); + return se; +} + +/* ハッシュ関数。とりあえずてきとー */ +static int +hash_function(xstr *xs) +{ + if (xs->len) { + return xs->str[0]% HASH_SIZE; + } + return 0; +} + +/** xstrに対応するseq_entを返す */ +struct seq_ent * +anthy_mem_dic_alloc_seq_ent_by_xstr(struct mem_dic * md, xstr *xs, + int is_reverse) +{ + struct seq_ent *se; + int h; + /* キャッシュにあればそれを返す */ + se = anthy_mem_dic_find_seq_ent_by_xstr(md, xs, is_reverse); + if (se) { + return se; + } + /* キャッシュには無いので作る */ + se = alloc_seq_ent_by_xstr(md, xs, is_reverse); + + /* mem_dic中につなぐ */ + h = hash_function(xs); + se->next = md->seq_ent_hash[h]; + md->seq_ent_hash[h] = se; + + return se; +} + +static int +compare_seq_ent(struct seq_ent *seq, xstr *xs, int is_reverse) +{ + /* まず、どちらかが逆変換用のエントリかをチェック */ + if (seq->seq_type & ST_REVERSE) { + if (!is_reverse) { + return 1; + } + } else { + if (is_reverse) { + return 1; + } + } + /* 次に文字列の比較 */ + return anthy_xstrcmp(&seq->str, xs); +} + +/*** mem_dicの中から文字列に対応するseq_ent*を取得する + * */ +struct seq_ent * +anthy_mem_dic_find_seq_ent_by_xstr(struct mem_dic * md, xstr *xs, + int is_reverse) +{ + struct seq_ent *sn; + int h; + h = hash_function(xs); + for (sn = md->seq_ent_hash[h]; sn; sn = sn->next) { + if (!compare_seq_ent(sn, xs, is_reverse)){ + return sn; + } + } + return 0; +} + +void +anthy_mem_dic_release_seq_ent(struct mem_dic * md, xstr *xs, int is_reverse) +{ + struct seq_ent *sn; + struct seq_ent **sn_prev_p; + int h; + + h = hash_function(xs); + sn_prev_p = &md->seq_ent_hash[h]; + for (sn = md->seq_ent_hash[h]; sn; sn = sn->next) { + if (!compare_seq_ent(sn, xs, is_reverse)){ + *sn_prev_p = sn->next; + anthy_sfree(md->seq_ent_allocator, sn); + return; + } else { + sn_prev_p = &sn->next; + } + } +} + +/** seq_entにdic_entを追加する */ +void +anthy_mem_dic_push_back_dic_ent(struct seq_ent *se, int is_compound, + xstr *xs, wtype_t wt, + const char *wt_name, int freq, int feature) +{ + struct dic_ent *de; + de = anthy_smalloc(se->md->dic_ent_allocator); + de->type = wt; + de->wt_name = wt_name; + de->freq = freq; + de->feature = feature; + de->order = 0; + de->is_compound = is_compound; + de->str.len = xs->len; + de->str.str = anthy_xstr_dup_str(xs); + + if (is_compound) { + se->nr_compound_ents ++; + } + + /* orderを計算する */ + if (se->nr_dic_ents > 0) { + struct dic_ent *prev_de = se->dic_ents[se->nr_dic_ents-1]; + if (anthy_wtype_equal(prev_de->type, de->type) && + prev_de->freq > de->freq) { + de->order = prev_de->order + 1; + } + } + + /* 配列に追加する */ + se->nr_dic_ents ++; + se->dic_ents = realloc(se->dic_ents, + sizeof(struct dic_ent *)*se->nr_dic_ents); + se->dic_ents[se->nr_dic_ents-1] = de; +} + +struct mem_dic * +anthy_create_mem_dic(void) +{ + int i; + struct mem_dic *md; + + md = anthy_smalloc(mem_dic_ator); + for (i = 0; i < HASH_SIZE; i++) { + md->seq_ent_hash[i] = NULL; + } + + md->seq_ent_allocator = + anthy_create_allocator(sizeof(struct seq_ent), + seq_ent_dtor); + md->dic_ent_allocator = + anthy_create_allocator(sizeof(struct dic_ent), + dic_ent_dtor); + + return md; +} + +void +anthy_release_mem_dic(struct mem_dic * d) +{ + anthy_sfree(mem_dic_ator, d); +} + +void +anthy_init_mem_dic(void) +{ + mem_dic_ator = anthy_create_allocator(sizeof(struct mem_dic), + mem_dic_dtor); +} + +void +anthy_quit_mem_dic(void) +{ + anthy_free_allocator(mem_dic_ator); +} diff --git a/src-worddic/mem_dic.h b/src-worddic/mem_dic.h new file mode 100644 index 0000000..7ba7cb6 --- /dev/null +++ b/src-worddic/mem_dic.h @@ -0,0 +1,17 @@ +#ifndef _mem_dic_h_included_ +#define _mem_dic_h_included_ + +#include <anthy/alloc.h> +#include "dic_ent.h" + + +#define HASH_SIZE 64 /*ハッシュテーブルのサイズ…64(なんとなく)*/ + +/** メモリ辞書 */ +struct mem_dic { + struct seq_ent *seq_ent_hash[HASH_SIZE]; + allocator seq_ent_allocator; + allocator dic_ent_allocator; +}; + +#endif diff --git a/src-worddic/priv_dic.c b/src-worddic/priv_dic.c new file mode 100644 index 0000000..04e1f19 --- /dev/null +++ b/src-worddic/priv_dic.c @@ -0,0 +1,425 @@ +/* + * 個人辞書を扱うためのコード + * + * ユーザが明示的に登録した単語だけでなく、 + * 未知語を自動的に学習して管理するAPIも持つ。 + * + * Copyright (C) 2000-2007 TABATA Yusuke + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <sys/types.h> +#include <sys/stat.h> +#include <dirent.h> +#include <fcntl.h> +#include <unistd.h> + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include <anthy/anthy.h> +#include <anthy/alloc.h> +#include <anthy/dic.h> +#include <anthy/record.h> +#include <anthy/dicutil.h> +#include <anthy/conf.h> +#include <anthy/logger.h> +#include <anthy/texttrie.h> +#include <anthy/textdict.h> +#include <anthy/word_dic.h> +#include "dic_main.h" +#include "dic_ent.h" + +/* 個人辞書 */ +struct text_trie *anthy_private_tt_dic; +struct textdict *anthy_private_text_dic; +static struct textdict *anthy_imported_text_dic; +static char *imported_dic_dir; +/* ロック用の変数 */ +static char *lock_fn; +static int lock_depth; +static int lock_fd; + +#define MAX_DICT_SIZE 100000000 + +/* 個人辞書のディレクトリの有無を確認する */ +void +anthy_check_user_dir(void) +{ + const char *hd; + char *dn; + struct stat st; + hd = anthy_conf_get_str("HOME"); + dn = alloca(strlen(hd) + 10); + sprintf(dn, "%s/.anthy", hd); + if (stat(dn, &st) || !S_ISDIR(st.st_mode)) { + int r; + /*fprintf(stderr, "Anthy: Failed to open anthy directory(%s).\n", dn);*/ + r = mkdir(dn, S_IRWXU); + if (r == -1){ + anthy_log(0, "Failed to create profile directory\n"); + return ; + } + /*fprintf(stderr, "Anthy: Created\n");*/ + r = chmod(dn, S_IRUSR | S_IWUSR | S_IXUSR); + if (r == -1) { + anthy_log(0, "But failed to change permission.\n"); + } + } +} + +static void +init_lock_fn(const char *home, const char *id) +{ + lock_fn = malloc(strlen(home) + strlen(id) + 40); + sprintf(lock_fn, "%s/.anthy/lock-file_%s", home, id); +} + +static struct text_trie * +open_tt_dic(const char *home, const char *id) +{ + struct text_trie *tt; + char *buf = malloc(strlen(home) + strlen(id) + 40); + sprintf(buf, "%s/.anthy/private_dict_%s.tt", home, id); + tt = anthy_trie_open(buf, 0); + free(buf); + return tt; +} + +static struct textdict * +open_textdic(const char *home, const char *name, const char *id) +{ + char *fn = malloc(strlen(home) + strlen(name) + strlen(id) + 10); + struct textdict *td; + sprintf(fn, "%s/.anthy/%s%s", home, name, id); + td = anthy_textdict_open(fn, 0); + free(fn); + return td; +} + +void +anthy_priv_dic_lock(void) +{ + struct flock lck; + lock_depth ++; + if (lock_depth > 1) { + return ; + } + if (!lock_fn) { + /* 初期化をミスってる */ + lock_fd = -1; + return ; + } + + /* ファイルロックの方法は多数あるが、この方法はcygwinでも動くので採用した */ + lock_fd = open(lock_fn, O_CREAT|O_RDWR, S_IREAD|S_IWRITE); + if (lock_fd == -1) { + return ; + } + + lck.l_type = F_WRLCK; + lck.l_whence = (short) 0; + lck.l_start = (off_t) 0; + lck.l_len = (off_t) 1; + if (fcntl(lock_fd, F_SETLKW, &lck) == -1) { + close(lock_fd); + lock_fd = -1; + } +} + +void +anthy_priv_dic_unlock(void) +{ + lock_depth --; + if (lock_depth > 0) { + return ; + } + + if (lock_fd != -1) { + close(lock_fd); + lock_fd = -1; + } +} + +void +anthy_priv_dic_update(void) +{ + if (!anthy_private_tt_dic) { + return ; + } + + anthy_trie_update_mapping(anthy_private_tt_dic); +} + +/* seq_entに追加する */ +static void +add_to_seq_ent(const char *line, int encoding, struct seq_ent *seq) +{ + struct word_line wl; + wtype_t wt; + xstr *xs; + /* */ + if (anthy_parse_word_line(line, &wl)) { + return ; + } + xs = anthy_cstr_to_xstr(wl.word, encoding); + anthy_type_to_wtype(wl.wt, &wt); + anthy_mem_dic_push_back_dic_ent(seq, 0, xs, wt, + NULL, wl.freq, 0); + anthy_free_xstr(xs); +} + +/* texttrieに登録されているかをチェックし、 + * 登録されていればseq_entに追加する + */ +static void +copy_words_from_tt(struct seq_ent *seq, xstr *xs, + int encoding, const char *prefix) +{ + char *key, *v; + int key_len; + char *key_buf; + int prefix_len = strlen(prefix); + /**/ + if (!anthy_private_tt_dic) { + return ; + } + key = anthy_xstr_to_cstr(xs, encoding); + key_len = strlen(key); + key_buf = malloc(key_len + 12); + /* 辞書中には各単語が「見出し XXXX」(XXXXはランダムな文字列)を + * キーとして保存されているので列挙する + */ + sprintf(key_buf, "%s%s ", prefix, key); + do { + if (strncmp(&key_buf[2], key, key_len) || + strncmp(&key_buf[0], prefix, prefix_len) || + key_buf[key_len+2] != ' ') { + /* 「見出し 」で始まっていないので対象外 */ + break; + } + /* 単語を読み出して登録 */ + v = anthy_trie_find(anthy_private_tt_dic, key_buf); + if (v) { + add_to_seq_ent(v, encoding, seq); + } + free(v); + /**/ + } while (anthy_trie_find_next_key(anthy_private_tt_dic, + key_buf, key_len + 8)); + free(key); + free(key_buf); +} + +void +anthy_copy_words_from_private_dic(struct seq_ent *seq, + xstr *xs, int is_reverse) +{ + if (is_reverse) { + return ; + } + /* 個人辞書から取ってくる */ + copy_words_from_tt(seq, xs, ANTHY_EUC_JP_ENCODING, " "); + copy_words_from_tt(seq, xs, ANTHY_UTF8_ENCODING, " p"); + /**/ + if (!anthy_select_section("UNKNOWN_WORD", 0) && + !anthy_select_row(xs, 0)) { + wtype_t wt; + xstr *word_xs; + anthy_type_to_wtype("#T35", &wt); + word_xs = anthy_get_nth_xstr(0); + anthy_mem_dic_push_back_dic_ent(seq, 0, word_xs, wt, NULL, 10, 0); + } +} + +int +anthy_parse_word_line(const char *line, struct word_line *res) +{ + int i; + const char *buf = line; + /* default values */ + res->wt[0] = 0; + res->freq = 1; + res->word = NULL; + /* 品詞と頻度をparse */ + for (i = 0; i < 9 && *buf && *buf != '*' && *buf != ' '; buf++, i++) { + res->wt[i] = *buf; + } + res->wt[i] = 0; + if (*buf == '*') { + buf ++; + sscanf(buf, "%d", &res->freq); + buf = strchr(buf, ' '); + } else { + res->freq = 1; + } + if (!buf || !(*buf)) { + res->word = ""; + return -1; + } + buf++; + /* 単語 */ + res->word = buf; + return 0; +} + +void +anthy_ask_scan(void (*request_scan)(struct textdict *, void *), + void *arg) +{ + DIR *dir; + struct dirent *de; + int size = 0; + request_scan(anthy_private_text_dic, arg); + request_scan(anthy_imported_text_dic, arg); + dir = opendir(imported_dic_dir); + if (!dir) { + return ; + } + while ((de = readdir(dir))) { + struct stat st_buf; + struct textdict *td; + char *fn = malloc(strlen(imported_dic_dir) + + strlen(de->d_name) + 3); + if (!fn) { + break; + } + sprintf(fn, "%s/%s", imported_dic_dir, de->d_name); + if (stat(fn, &st_buf)) { + free(fn); + continue; + } + if (!S_ISREG(st_buf.st_mode)) { + free(fn); + continue; + } + size += st_buf.st_size; + if (size > MAX_DICT_SIZE) { + free(fn); + break; + } + td = anthy_textdict_open(fn, 0); + request_scan(td, arg); + anthy_textdict_close(td); + free(fn); + } + closedir(dir); +} + +static void +add_unknown_word(xstr *yomi, xstr *word) +{ + /* recordに追加 */ + if (anthy_select_section("UNKNOWN_WORD", 1)) { + return ; + } + if (!anthy_select_row(yomi, 0)) { + anthy_mark_row_used(); + } + if (anthy_select_row(yomi, 1)) { + return ; + } + anthy_set_nth_xstr(0, word); +} + +void +anthy_add_unknown_word(xstr *yomi, xstr *word) +{ + if (!(anthy_get_xstr_type(word) & XCT_KATA) && + !(anthy_get_xstr_type(word) & XCT_HIRA)) { + return ; + } + if (yomi->len < 4 || yomi->len > 30) { + return ; + } + /**/ + add_unknown_word(yomi, word); +} + +void +anthy_forget_unused_unknown_word(xstr *xs) +{ + char key_buf[128]; + char *v; + + if (!anthy_private_tt_dic) { + return ; + } + + v = anthy_xstr_to_cstr(xs, ANTHY_UTF8_ENCODING); + sprintf(key_buf, " U%s 0", v); + free(v); + anthy_trie_delete(anthy_private_tt_dic, key_buf); + + /* recordに記録された物を消す */ + if (anthy_select_section("UNKNOWN_WORD", 0)) { + return ; + } + if (!anthy_select_row(xs, 0)) { + anthy_release_row(); + } +} + +void +anthy_init_private_dic(const char *id) +{ + const char *home = anthy_conf_get_str("HOME"); + if (anthy_private_tt_dic) { + anthy_trie_close(anthy_private_tt_dic); + } + /**/ + anthy_textdict_close(anthy_private_text_dic); + anthy_textdict_close(anthy_imported_text_dic); + /**/ + if (lock_fn) { + free(lock_fn); + } + init_lock_fn(home, id); + anthy_private_tt_dic = open_tt_dic(home, id); + /**/ + anthy_private_text_dic = open_textdic(home, "private_words_", id); + anthy_imported_text_dic = open_textdic(home, "imported_words_", id); + imported_dic_dir = malloc(strlen(home) + strlen(id) + 30); + sprintf(imported_dic_dir, "%s/.anthy/imported_words_%s.d/", home, id); +} + +void +anthy_release_private_dic(void) +{ + if (anthy_private_tt_dic) { + anthy_trie_close(anthy_private_tt_dic); + anthy_private_tt_dic = NULL; + } + /**/ + anthy_textdict_close(anthy_private_text_dic); + anthy_textdict_close(anthy_imported_text_dic); + free(imported_dic_dir); + anthy_private_text_dic = NULL; + anthy_imported_text_dic = NULL; + imported_dic_dir = NULL; + /**/ + if (lock_depth > 0) { + /* not sane situation */ + lock_depth = 0; + if (lock_fn) { + unlink(lock_fn); + } + } + /**/ + free(lock_fn); + lock_fn = NULL; +} diff --git a/src-worddic/ptab.h b/src-worddic/ptab.h new file mode 100644 index 0000000..552e653 --- /dev/null +++ b/src-worddic/ptab.h @@ -0,0 +1,153 @@ +/* POS(Part of Speech) table */ +{"名詞",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞0",POS_NOUN,COS_NONE,SCOS_T0,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞2",POS_NOUN,COS_NONE,SCOS_T2,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞3",POS_NOUN,COS_NONE,SCOS_T3,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞4",POS_NOUN,COS_NONE,SCOS_T4,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞5",POS_NOUN,COS_NONE,SCOS_T5,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞7",POS_NOUN,COS_NONE,SCOS_T7,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞8",POS_NOUN,COS_NONE,SCOS_T8,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞9",POS_NOUN,COS_NONE,SCOS_T9,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞10",POS_NOUN,COS_NONE,SCOS_T10,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞12",POS_NOUN,COS_NONE,SCOS_T12,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞13",POS_NOUN,COS_NONE,SCOS_T13,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞14",POS_NOUN,COS_NONE,SCOS_T14,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞15",POS_NOUN,COS_NONE,SCOS_T15,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞17",POS_NOUN,COS_NONE,SCOS_T17,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞18",POS_NOUN,COS_NONE,SCOS_T18,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞19",POS_NOUN,COS_NONE,SCOS_T19,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞20",POS_NOUN,COS_NONE,SCOS_T20,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞22",POS_NOUN,COS_NONE,SCOS_T22,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞23",POS_NOUN,COS_NONE,SCOS_T23,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞24",POS_NOUN,COS_NONE,SCOS_T24,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞25",POS_NOUN,COS_NONE,SCOS_T25,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞27",POS_NOUN,COS_NONE,SCOS_T27,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞28",POS_NOUN,COS_NONE,SCOS_T28,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞29",POS_NOUN,COS_NONE,SCOS_T29,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞30",POS_NOUN,COS_NONE,SCOS_T30,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞32",POS_NOUN,COS_NONE,SCOS_T32,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞33",POS_NOUN,COS_NONE,SCOS_T33,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞34",POS_NOUN,COS_NONE,SCOS_T34,CC_NONE,CT_NONE,WF_INDEP|WF_SV}, +{"名詞35",POS_NOUN,COS_NONE,SCOS_T35,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞37",POS_NOUN,COS_NONE,SCOS_T37,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞38",POS_NOUN,COS_NONE,SCOS_T38,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞39",POS_NOUN,COS_NONE,SCOS_T39,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞化接尾語",POS_NOUN,COS_NONE,SCOS_T40,CC_NONE,CT_NONE,WF_INDEP}, +{"数詞",POS_NUMBER,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"数詞1",POS_NUMBER,COS_NN,SCOS_N1,CC_NONE,CT_NONE,WF_INDEP}, +{"数詞10",POS_NUMBER,COS_NN,SCOS_N10,CC_NONE,CT_NONE,WF_INDEP}, +{"数詞100",POS_NUMBER,COS_NN,SCOS_N100,CC_NONE,CT_NONE,WF_INDEP}, +{"数詞1000",POS_NUMBER,COS_NN,SCOS_N1000,CC_NONE,CT_NONE,WF_INDEP}, +{"数詞10000",POS_NUMBER,COS_NN,SCOS_N10000,CC_NONE,CT_NONE,WF_INDEP}, +{"人名",POS_NOUN,COS_JN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"地名",POS_NOUN,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"団体名",POS_NOUN,COS_KK,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"開き括弧",POS_OPEN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"閉じ括弧",POS_CLOSE,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"名詞接頭辞",POS_PRE,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"人名接尾辞",POS_SUC,COS_JN,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"名詞接尾辞",POS_NOUN,COS_SUFFIX,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"数接頭辞",POS_PRE,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"数接尾辞",POS_SUC,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"地名接頭辞",POS_PRE,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"地名接尾辞",POS_SUC,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"サ変接尾辞",POS_SUC,COS_SVSUFFIX,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"助詞",POS_PRT,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"副詞",POS_AV,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞0",POS_AV,COS_NONE,SCOS_F0,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞1",POS_AV,COS_NONE,SCOS_F1,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞2",POS_AV,COS_NONE,SCOS_F2,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞3",POS_AV,COS_NONE,SCOS_F3,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞4",POS_AV,COS_NONE,SCOS_F4,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞5",POS_AV,COS_NONE,SCOS_F5,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞6",POS_AV,COS_NONE,SCOS_F6,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞7",POS_AV,COS_NONE,SCOS_F7,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞8",POS_AV,COS_NONE,SCOS_F8,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞9",POS_AV,COS_NONE,SCOS_F9,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞10",POS_AV,COS_NONE,SCOS_F10,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞11",POS_AV,COS_NONE,SCOS_F11,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞12",POS_AV,COS_NONE,SCOS_F12,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞13",POS_AV,COS_NONE,SCOS_F13,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞14",POS_AV,COS_NONE,SCOS_F14,CC_NONE,CT_NONE,WF_INDEP}, +{"副詞語幹",POS_AV,COS_NONE,SCOS_NONE,CC_NONE,CT_HEAD,WF_NONE}, +{"動詞",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP}, +{"動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_HEAD,WF_INDEP}, +{"接続詞",POS_CONJ,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, + /**/ +{"動詞未然形",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_MIZEN,WF_INDEP}, + /**/ +{"動詞連用形",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_RENYOU,WF_INDEP}, + /**/ +{"動詞終止形",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_SYUSI,WF_INDEP}, + /**/ +{"動詞連体形",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_RENTAI,WF_INDEP}, + /**/ +{"動詞仮定形",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_KATEI,WF_INDEP}, + /**/ +{"動詞命令形",POS_V,COS_NONE,SCOS_NONE,CC_NONE,CT_MEIREI,WF_INDEP}, + /**/ +{"カ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_K5,CT_HEAD,WF_INDEP}, +{"カ行C5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_C5,CT_HEAD,WF_INDEP}, +{"ガ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_G5,CT_HEAD,WF_INDEP}, +{"サ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_S5,CT_HEAD,WF_INDEP}, +{"タ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_T5,CT_HEAD,WF_INDEP}, +{"ナ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_N5,CT_HEAD,WF_INDEP}, +{"マ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_M5,CT_HEAD,WF_INDEP}, +{"バ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_B5,CT_HEAD,WF_INDEP}, +{"ラ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_R5,CT_HEAD,WF_INDEP}, +{"ラ行L5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_L5,CT_HEAD,WF_INDEP}, +{"ワ行5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_W5,CT_HEAD,WF_INDEP}, +{"ワ行U5段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_U5,CT_HEAD,WF_INDEP}, +{"上下一段活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_KS1,CT_HEAD,WF_INDEP}, +{"カ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_K5,CT_MEISIKA,WF_INDEP}, +{"カ行C5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_C5,CT_MEISIKA,WF_INDEP}, +{"ガ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_G5,CT_MEISIKA,WF_INDEP}, +{"サ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_S5,CT_MEISIKA,WF_INDEP}, +{"タ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_T5,CT_MEISIKA,WF_INDEP}, +{"ナ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_N5,CT_MEISIKA,WF_INDEP}, +{"マ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_M5,CT_MEISIKA,WF_INDEP}, +{"バ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_B5,CT_MEISIKA,WF_INDEP}, +{"ラ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_R5,CT_MEISIKA,WF_INDEP}, +{"ラ行L5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_L5,CT_MEISIKA,WF_INDEP}, +{"ワ行5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_W5,CT_MEISIKA,WF_INDEP}, +{"ワ行U5段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_U5,CT_MEISIKA,WF_INDEP}, +{"上下一段活用動詞名詞化語幹",POS_V,COS_NONE,SCOS_NONE,CC_KS1,CT_MEISIKA,WF_INDEP}, + /**/ +{"ラ変活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_RV,CT_HEAD,WF_INDEP}, +{"カ変活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_HEAD,WF_INDEP}, +{"サ変活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_SV,CT_HEAD,WF_INDEP}, +{"ザ変活用動詞語幹",POS_V,COS_NONE,SCOS_NONE,CC_ZV,CT_HEAD,WF_INDEP}, + /**/ +{"カ変活用動詞未然形",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_MIZEN,WF_INDEP}, +{"カ変活用動詞連用形",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_RENYOU,WF_INDEP}, +{"カ変活用動詞終止形",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_SYUSI,WF_INDEP}, +{"カ変活用動詞仮定形",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_KATEI,WF_INDEP}, +{"カ変活用動詞命令形",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_MEIREI,WF_INDEP}, + /**/ +{"する未然形「し」",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MIZEN,WF_INDEP}, +{"する未然形「せ」",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MIZEN,WF_INDEP}, +{"する連用形「し」",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_RENYOU,WF_INDEP}, +{"する命令形「しろ」",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MEIREI,WF_INDEP}, +{"する命令形「せよ」",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MEIREI,WF_INDEP}, + /**/ +{"形容詞",POS_A,COS_NONE,SCOS_A0,CC_NONE,CT_NONE,WF_NONE}, +{"形容詞化接尾語",POS_D2KY,COS_NONE,SCOS_A1,CC_NONE,CT_NONE,WF_NONE}, +{"形容詞未然形",POS_A,COS_NONE,SCOS_NONE,CC_NONE,CT_MIZEN,WF_NONE}, +{"形容詞連用形",POS_A,COS_NONE,SCOS_NONE,CC_NONE,CT_RENYOU,WF_NONE}, +{"形容詞終止形",POS_A,COS_NONE,SCOS_NONE,CC_NONE,CT_SYUSI,WF_NONE}, +{"形容詞連体形",POS_A,COS_NONE,SCOS_NONE,CC_NONE,CT_RENTAI,WF_NONE}, +{"形容詞仮定形",POS_A,COS_NONE,SCOS_NONE,CC_NONE,CT_KATEI,WF_NONE}, +{"形容詞語幹",POS_A,COS_NONE,SCOS_NONE,CC_NONE,CT_HEAD,WF_NONE}, + /**/ +{"形容動詞未然形",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_MIZEN,WF_NONE}, +{"形容動詞連用形",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_RENYOU,WF_NONE}, +{"形容動詞終止形",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_SYUSI,WF_NONE}, +{"形容動詞連体形",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_RENTAI,WF_NONE}, +{"形容動詞仮定形",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_KATEI,WF_NONE}, +{"形容動詞原形",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE},/*形容動詞の原形*/ +{"形容動詞語幹",POS_AJV,COS_NONE,SCOS_NONE,CC_NONE,CT_HEAD,WF_NONE}, + /**/ +{"単漢字",POS_TANKANJI,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"連体詞",POS_ME,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"動詞丁寧表現語幹",POS_V,COS_NONE,SCOS_NONE,CC_SV,CT_NONE,WF_INDEP}, +{0,0,0,0,0,0,0}, diff --git a/src-worddic/record.c b/src-worddic/record.c new file mode 100644 index 0000000..66e85f3 --- /dev/null +++ b/src-worddic/record.c @@ -0,0 +1,2074 @@ +/* + * 学習の履歴などを管理するためのデータベース + * 文字列(xstr)をキーにして高速に行(row)を検索することができる. + * 複数のセクションをもつことができ,学習の違うフェーズなどに対応する + * (セクション * 文字列 -> 行) + * 各行は文字列か数を持つ配列になっている + * + * 「パトリシア・トライ」というデータ構造を使用している。 + * 自然言語の検索などを扱っている教科書を参照のこと + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * Funded by IPA未踏ソフトウェア創造事業 2002 1/18 + * Funded by IPA未踏ソフトウェア創造事業 2005 + * Copyright (C) 2005 YOSHIDA Yuichi + * Copyright (C) 2000-2006 TABATA Yusuke + * Copyright (C) 2000-2003 UGAWA Tomoharu + * Copyright (C) 2001-2002 TAKAI Kosuke + */ +/* + * パーソナリティ""は匿名パーソナリティであり, + * ファイルへの読み書きは行わない. + */ +#include <sys/types.h> +#include <sys/stat.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include "config.h" +#include <anthy/anthy.h> +#include <anthy/dic.h> +#include <anthy/alloc.h> +#include <anthy/conf.h> +#include <anthy/ruleparser.h> +#include <anthy/record.h> +#include <anthy/logger.h> +#include <anthy/prediction.h> + +#include "dic_main.h" +#include "dic_personality.h" + +/* 個人辞書をセーブするファイル名のsuffix */ +#define ENCODING_SUFFIX ".utf8" + + +enum val_type { + RT_EMPTY, RT_VAL, RT_XSTR, RT_XSTRP +}; + +/* 値 */ +struct record_val { + enum val_type type; + union { + xstr str; + int val; + xstr* strp; + } u; +}; + +/* 行 */ +struct record_row { + xstr key; + int nr_vals; + struct record_val *vals; +}; + +/* trie node管理用 */ +struct trie_node { + struct trie_node *l; + struct trie_node *r; + int bit; + struct record_row row; + struct trie_node *lru_prev, *lru_next; /* 両端ループ */ + int dirty; /* LRU のための used, sused ビット */ +}; + +/* trie treeのroot */ +struct trie_root { + struct trie_node root; + allocator node_ator; +}; + +#define LRU_USED 0x01 +#define LRU_SUSED 0x02 +#define PROTECT 0x04 /* 差分書き出し時に使う(LRUとは関係ない) + * 差分書き出しでは、ファイルに書き出す前に + * ファイル上に他のプロセスが記録した更新を + * 読み込む。それによって、これから追加しよ + * うとするノードが消されるのを防ぐ + */ +/* + * LRU: + * USED: メモリ上で使われた + * SUSED: 保存された used ビット + * + * LRUリスト上では、 USED は必ずリスト先頭に並んでいるが、 SUSED は + * フラグなしのノードと混在している可能性がある。 + * + * n個を残すように指定された時の動作 + * 1. used > n + * LRU リストの先頭から n 番目以降を消す + * 2. used + sused > n + * used -> 残す + * sused -> sused フラグを落す + * それ以外 -> 消す + * 3. それ以外 + * 全て残す + * ファイルに書き出す時に、 used || sused -> sused として書き出す + */ + +/** セクション */ +struct record_section { + const char *name; + struct trie_root cols; + struct record_section *next; + int lru_nr_used, lru_nr_sused; /* LRU 用 */ +}; + +/** データベース */ +struct record_stat { + struct record_section section_list; /* sectionのリスト*/ + struct record_section *cur_section; + struct trie_root xstrs; /* xstr を intern するための trie */ + struct trie_node *cur_row; + int row_dirty; /* cur_row が保存の必要があるか */ + int encoding; + /**/ + int is_anon; + const char *id; /* パーソナリティのid */ + char *base_fn; /* 基本ファイル 絶対パス */ + char *journal_fn; /* 差分ファイル 絶対パス */ + /**/ + time_t base_timestamp; /* 基本ファイルのタイムスタンプ */ + int last_update; /* 差分ファイルの最後に読んだ位置 */ + time_t journal_timestamp; /* 差分ファイルのタイムスタンプ */ +}; + +/* 差分が100KB越えたら基本ファイルへマージ */ +#define FILE2_LIMIT 102400 + + +/* + * xstr の intern: + * 個人ごと( record_stat ごと)に文字列を intern する。これは、 + * メモリの節約の他に、データベースの flush 時にデータベースに + * 由来する xstr が無効になるのを防ぐ目的がある。 + * したがって、データベースの flush 時でも xstr の intern 用 + * のデータベース xstrs はそのまま保存する。 + * + * xstrs: xstr の intern 用のデータベース + * row の key を intern された xstr として使う。 + * row に value は持たない。 + * (将来的には参照カウンタをつけてもいいかも) + * 参照: intern_xstr() + */ + +/* + * 差分書き出し: + * データベースの保存、複数の anthy ライブラリをリンクした + * プロセスの学習履歴の同期のために、学習履歴の更新情報を + * 逐一ファイルに書き出す。 + * + * ・基本ファイル 古い anthy の学習履歴と同じ形式。 + * 差分情報を適用する元となるファイル。 + * 基本的には起動時だけに読み込む。 + * このプログラム中でファイル1,baseと呼ぶことがある。 + * ・差分ファイル 基本ファイルに対する更新情報。 + * データベースに対する更新がコミットされるたびに + * 読み書きされる。 + * このプログラム中でファイル2,journalと呼ぶことがある。 + * 基本方針: + * データベースに対する更新がコミットされると、まず差分ファイル + * に他のプロセスが追加した更新情報を読み込み、その後に自分の + * コミットした更新を差分ファイルに書き出す。 + * これらはロックファイルを用いてアトミックに行われる。また、 + * 基本ファイル、差分ファイルとも、ロックを取っている間しか + * オープンしていてはいけない。 + * 追加と削除: + * 追加はすでにメモリ上で更新された row をコミットによって + * メモリに書き出すため、 + * 1. コミット対象 row 以外を差分ファイルの情報で + * 2. コミット対象 row を差分ファイルに書き出し + * とする。削除はまだメモリ上に row が残っている状態でコミット + * が行われる(削除要求をコミットとして扱う)ため、 + * 1. 削除の情報を差分ファイルに書き出し + * 2. 差分ファイルの読み込みにより削除要求も実行する + * とする。 + * 基本ファイルの更新: + * 差分ファイルがある程度肥大化すると、差分ファイルの情報を + * 基本ファイルに反映して差分ファイルを空にする。 + * 更新するプロセス: + * 差分ファイルに書き出しを行った後、差分ファイルの大きさを調べ、 + * 肥大化していれば、そのときのメモリ上のデータベース(これには + * 全ての差分ファイルの更新が適用されている)を基本ファイルに + * 書き出す。 + * それ以外のプロセス: + * 差分ファイルを読む前に、基本ファイルが更新されているかを + * ファイルのタイムスタンプで調べ、更新されていれば、コミット + * された更新情報を直ちに更新ファイルに追加し、メモリ上の + * データベースを flush した後基本ファイル、差分ファイルを + * 読み込み直す。 + * データベースの flush により、 + * ・cur_row が無効になる (NULL になる) + * ・cur_section の有効性は保存される(sectionは解放しない) + * ・xstr は intern していれば保存される + * (すべての xstr は intern されているはず) + * 結局、次の様になる: + * if (基本ファイルが更新されている) { + * 差分ファイルへコミットされた更新を書き出す; + * データベースのフラッシュ; + * 基本ファイルの読込と差分ファイルの最終読込位置クリア; + * 差分ファイルの読込と差分ファイルの最終読込位置更新; + * } else { + * if (追加) { + * 差分ファイルの読込と差分ファイルの最終読込位置更新; + * 差分ファイルへの書き出し; + * } else { + * 差分ファイルへの書き出し; + * 差分ファイルの読込と差分ファイルの最終読込位置更新; + * } + * } + * if (差分ファイルが大きい) { + * 基本ファイルへの書き出し; + * 差分ファイルのクリア; + * } + */ + +static allocator record_ator; + +/* trie操作用 */ +static void init_trie_root(struct trie_root *n); +static int trie_key_nth_bit(xstr* key, int n); +static int trie_key_first_diff_bit_1byte(xchar c1, xchar c2); +static int trie_key_first_diff_bit(xstr *k1, xstr *k2); +static int trie_key_cmp(xstr *k1, xstr *k2); +static void trie_key_dup(xstr *dst, xstr *src); +static void trie_row_init(struct record_row *rc); +static void trie_row_free(struct record_row *rc); +static struct trie_node *trie_find(struct trie_root *root, xstr *key); +static struct trie_node *trie_insert(struct trie_root *root, xstr *key, + int dirty, int *nr_used, int *nr_sused); +static void trie_remove(struct trie_root *root, xstr *key, + int *nr_used, int *nr_sused); +static struct trie_node *trie_first(struct trie_root *root); +static struct trie_node *trie_next(struct trie_root *root, + struct trie_node *cur); +static void trie_remove_all(struct trie_root *root, + int *nr_used, int *nr_sused); +static void trie_remove_old(struct trie_root *root, int count, + int* nr_used, int* nr_sused); +static void trie_mark_used(struct trie_root *root, struct trie_node *n, + int *nr_used, int *nr_sused); + + +/* + * トライの実装 + * struct trie_nodeのうちrow以外の部分とrow.keyを使用 + * 削除の時はtrie_row_freeを使ってrowの内容を解放 + */ + +#define PUTNODE(x) ((x) == &root->root ? printf("root\n") : anthy_putxstrln(&(x)->row.key)) +static int +debug_trie_dump(FILE* fp, struct trie_node* n, int encoding) +{ + int cnt = 0; + char buf[1024]; + + if (n->l->bit > n->bit) { + cnt = debug_trie_dump(fp, n->l, encoding); + } else { + if (n->l->row.key.len == -1) { + if (fp) { + fprintf(fp, "root\n"); + } + } else { + if (fp) { + anthy_sputxstr(buf, &n->l->row.key, encoding); + fprintf(fp, "%s\n", buf); + } + cnt = 1; + } + } + + if (n->r->bit > n->bit) { + return cnt + debug_trie_dump(fp, n->r, encoding); + } else { + if (n->r->row.key.len == -1) { + if(fp) { + fprintf(fp, "root\n"); + } + } else { + if(fp) { + anthy_sputxstr(buf, &n->r->row.key, encoding); + fprintf(fp, "%s\n", buf); + } + return cnt + 1; + } + } + + return cnt; +} + +static void +init_trie_root(struct trie_root *root) +{ + struct trie_node* n; + root->node_ator = anthy_create_allocator(sizeof(struct trie_node), NULL); + n = &root->root; + n->l = n; + n->r = n; + n->bit = 0; + n->lru_next = n; + n->lru_prev = n; + n->dirty = 0; + trie_row_init(&n->row); + n->row.key.len = -1; +} + +/* + * bit0: 0 + * bit1: headのキーだけ0 + * bit2: 文字列のビット0 + * bit3: 文字列のビット1 + * ... + * 文字列長を越えると0 + */ +static int +trie_key_nth_bit(xstr* key, int n) +{ + switch (n) { + case 0: + return 0; + case 1: + return key->len + 1; /* key->len == -1 ? 0 : non-zero */ + default: + { + int pos; + n -= 2; + pos = n / (sizeof(xchar) << 3); + if (pos >= key->len) { + return 0; + } + return key->str[pos] & (1 << (n % (sizeof(xchar) << 3))); + } + } +} + +/* c1 == c2 では呼んではいけない */ +static int +trie_key_first_diff_bit_1byte(xchar c1, xchar c2) +{ + int i; + int ptn; + for (i = 0, ptn = c1 ^ c2; !(ptn & 1); i++, ptn >>= 1 ) + ; + return i; +} + +/* + * k1 == k2 では呼んではいけない + * ki->str[0 .. (ki->len - 1)]に0はないと仮定 + */ +#define MIN(a,b) ((a)<(b)?(a):(b)) +static int +trie_key_first_diff_bit(xstr *k1, xstr *k2) +{ + int len; + int i; + + len = MIN(k1->len, k2->len); + if (len == -1) { + return 1; + } + for ( i = 0 ; i < len ; i++ ){ + if (k1->str[i] != k2->str[i]) { + return (2 + (i * (sizeof(xchar) << 3)) + + trie_key_first_diff_bit_1byte(k1->str[i], k2->str[i])); + } + } + if (k1->len < k2->len) { + return (2 + (i * (sizeof(xchar) << 3)) + + trie_key_first_diff_bit_1byte(0, k2->str[i])); + } else { + return (2 + (i * (sizeof(xchar) << 3)) + + trie_key_first_diff_bit_1byte(k1->str[i], 0)); + } +} +#undef MIN + +static int +trie_key_cmp(xstr *k1, xstr *k2) +{ + if (k1->len == -1 || k2->len == -1) { + return k1->len - k2->len; + } + return anthy_xstrcmp(k1, k2); +} + +static void +trie_key_dup(xstr *dst, xstr *src) +{ + dst->str = anthy_xstr_dup_str(src); + dst->len = src->len; +} + +/* + * 見つからなければ 0 + */ +static struct trie_node * +trie_find(struct trie_root *root, xstr *key) +{ + struct trie_node *p; + struct trie_node *q; + + p = &root->root; + q = p->l; + while (p->bit < q->bit) { + p = q; + q = trie_key_nth_bit(key, p->bit) ? p->r : p->l; + } + return trie_key_cmp(&q->row.key,key) ? NULL : q; +} + +/* + * 最長マッチのための補助関数 + * key で探索して、始めて一致しなくなったノードを返す。 + */ +static struct trie_node * +trie_find_longest (struct trie_root* root, xstr *key) +{ + struct trie_node *p; + struct trie_node *q; + + p = &root->root; + q = p->l; + while (p->bit < q->bit) { + p = q; + q = trie_key_nth_bit(key, p->bit) ? p->r : p->l; + } + + return q; +} + +/* + * 追加したノードを返す + * すでに同じキーをもつノードがあるときは、追加せずに0を返す + */ +static struct trie_node * +trie_insert(struct trie_root *root, xstr *key, + int dirty, int *nr_used, int *nr_sused) +{ + struct trie_node *n; + struct trie_node *p; + struct trie_node *q; + int i; + + p = &root->root; + q = p->l; + while (p->bit < q->bit) { + p = q; + q = trie_key_nth_bit(key, p->bit) ? p->r : p->l; + } + if (trie_key_cmp(&q->row.key,key) == 0) { + /* USED > SUSED > 0 で強い方を残す */ + if (dirty == LRU_USED) { + trie_mark_used(root, q, nr_used, nr_sused); + } else if (q->dirty == 0) { + q->dirty = dirty; + } + return 0; + } + i = trie_key_first_diff_bit(&q->row.key, key); + p = &root->root; + q = p->l; + while (p->bit < q->bit && i > q->bit) { + p = q; + q = trie_key_nth_bit(key, p->bit) ? p->r : p->l; + } + n = anthy_smalloc(root->node_ator); + trie_row_init(&n->row); + trie_key_dup(&n->row.key, key); + n->bit = i; + if (trie_key_nth_bit(key, i)) { + n->l = q; + n->r = n; + } else { + n->l = n; + n->r = q; + } + if (p->l == q) { + p->l = n; + } else { + p->r = n; + } + + /* LRU の処理 */ + if (dirty == LRU_USED) { + root->root.lru_next->lru_prev = n; + n->lru_prev = &root->root; + n->lru_next = root->root.lru_next; + root->root.lru_next = n; + (*nr_used)++; + } else { + root->root.lru_prev->lru_next = n; + n->lru_next = &root->root; + n->lru_prev = root->root.lru_prev; + root->root.lru_prev = n; + if (dirty == LRU_SUSED) { + (*nr_sused)++; + } + } + n->dirty = dirty; + return n; +} + +/* + * ノードを見つけると削除する + * 内部でtrie_row_freeを呼び、キーを含むデータ部分をfreeする + * + * データとノードを削除する。 + * 削除対象のデータは削除対象のノードに格納されているとは + * 限らないことに注意。 + * 1. 削除対象の葉を持つノードに削除対象の葉が含まれているとき + * 削除対象のノードは、子への枝のうち、生きのこる枝を親に渡して死ぬ + * 2. 削除対象の葉を持つノードの祖先に削除対象の葉が含まれているとき + * 1. に加えて、削除対象の葉をもつノードを殺して、代わりに削除 + * 対象のノードを削除対象の葉をもつノードの位置に移動させ生かす + */ +static void +trie_remove(struct trie_root *root, xstr *key, + int *nr_used, int *nr_sused) +{ + struct trie_node *p; + struct trie_node *q; + struct trie_node **pp = NULL; /* gcc の warning 回避 */ + struct trie_node **qq; + p = &root->root; + qq = &p->l; + q = *qq; + while (p->bit < q->bit) { + pp = qq; + p = q; + qq = trie_key_nth_bit(key,p->bit) ? &p->r : &p->l; + q = *qq; + } + if (trie_key_cmp(&q->row.key, key) != 0) { + return ; + } + if (p != q) { + /* case 2. */ + struct trie_node *r; + struct trie_node *s; + r = &root->root; + s = r->l; + while (s != q) { + r = s; + s = trie_key_nth_bit(key, r->bit) ? r->r : r->l; + } + *pp = (p->r == q) ? p->l : p->r; + p->l = q->l; + p->r = q->r; + p->bit = q->bit; + if (trie_key_nth_bit(key, r->bit)) { + r->r = p; + } else { + r->l = p; + } + p = q; + } else { + *pp = (p->r == q) ? p->l : p->r; + } + p->lru_prev->lru_next = p->lru_next; + p->lru_next->lru_prev = p->lru_prev; + if (p->dirty == LRU_USED) { + (*nr_used)--; + } else if (p->dirty == LRU_SUSED) { + (*nr_sused)--; + } + trie_row_free(&p->row); + anthy_sfree(root->node_ator, p); +} + +/* head以外のノードがなければ 0 を返す */ +static struct trie_node * +trie_first (struct trie_root *root) +{ + return root->root.lru_next == &root->root ? + NULL : root->root.lru_next; +} + +/* 次のノードがなければ 0 を返す */ +static struct trie_node * +trie_next (struct trie_root *root, + struct trie_node *cur) +{ + return cur->lru_next == &root->root ? 0 : cur->lru_next; +} + +/* + * head以外全てのノードを削除する + * 内部でtrie_row_freeを呼び、キーを含むデータ部分をfreeする + */ +static void +trie_remove_all (struct trie_root *root, + int *nr_used, int *nr_sused) +{ + struct trie_node* p; + for (p = root->root.lru_next; p != &root->root; p = p->lru_next) { + trie_row_free(&p->row); + } + anthy_free_allocator(root->node_ator); + init_trie_root(root); + *nr_used = 0; + *nr_sused = 0; +} + +/* + * LRU リストの先頭から count 番目までを残して残りを解放する + */ +static void +trie_remove_old (struct trie_root *root, int count, + int *nr_used, int *nr_sused) +{ + struct trie_node *p; + struct trie_node *q; + + if (*nr_used > count) { + for (p = root->root.lru_next; count; count--, p = p->lru_next) + ; + /* p から head までを消す */ + for ( ; p != &root->root; p = q) { + q = p->lru_next; + trie_remove(root, &p->row.key, nr_used, nr_sused); + } + } else if (*nr_used + *nr_sused > count) { + for (p = root->root.lru_next; p->dirty == LRU_USED; p = p->lru_next) + ; + /* + * p から root まで sused -> dirty := 0 + * それ以外 -> 消す + */ + for ( ; p != &root->root; p = q) { + q = p->lru_next; + if (p->dirty == LRU_SUSED) { + p->dirty = 0; + } else { + trie_remove(root, &p->row.key, nr_used, nr_sused); + } + } + *nr_sused = 0; + } +} + +static void +trie_mark_used (struct trie_root *root, struct trie_node *n, + int *nr_used, int *nr_sused) +{ + switch(n->dirty) { + case LRU_USED: + break; + case LRU_SUSED: + (*nr_sused)--; + /* fall through */ + default: + n->dirty = LRU_USED; + (*nr_used)++; + break; + } + n->lru_prev->lru_next = n->lru_next; + n->lru_next->lru_prev = n->lru_prev; + root->root.lru_next->lru_prev = n; + n->lru_next = root->root.lru_next; + root->root.lru_next = n; + n->lru_prev = &root->root; +} + +/* + * トライの実装はここまで + */ + +static xstr * +do_get_index_xstr(struct record_stat *rec) +{ + if (!rec->cur_row) { + return 0; + } + return &rec->cur_row->row.key; +} + +static struct record_section* +do_select_section(struct record_stat *rst, const char *name, int flag) +{ + struct record_section *rsc; + + for (rsc = rst->section_list.next; rsc; rsc = rsc->next) { + if (!strcmp(name, rsc->name)) { + return rsc; + } + } + + if (flag) { + rsc = malloc(sizeof(struct record_section)); + rsc->name = strdup(name); + rsc->next = rst->section_list.next; + rst->section_list.next = rsc; + rsc->lru_nr_used = 0; + rsc->lru_nr_sused = 0; + init_trie_root(&rsc->cols); + return rsc; + } + + return NULL; +} + +static struct trie_node* +do_select_longest_row(struct record_section *rsc, xstr *name) +{ + struct trie_node *mark, *found; + xstr xs; + int i; + + if ((NULL == name) || (NULL == name->str) || (name->len < 1) || (0 == name->str[0])) { + /* 辞書もしくは学習データが壊れていた時の対策 */ + return NULL; + } + + mark = trie_find_longest(&rsc->cols, name); + xs.str = name->str; + for (i = (mark->row.key.len <= name->len) ? mark->row.key.len : name->len; i > 1; i--) { /* 不正なメモリアクセスの修正 */ + /* ルートノードは i == 1 でマッチするので除外 + * trie_key_nth_bit 参照 + */ + xs.len = i; + found = trie_find(&rsc->cols, &xs); + if (found) { + return found; + } + } + return NULL; +} + +static struct trie_node* +do_select_row(struct record_section* rsc, xstr *name, + int flag, int dirty) +{ + struct trie_node *node; + + if (flag) { + node = trie_insert(&rsc->cols, name, dirty, + &rsc->lru_nr_used, &rsc->lru_nr_sused); + if (node) { + node->row.nr_vals = 0; + node->row.vals = 0; + } else { + node = trie_find(&rsc->cols, name); + } + } else { + node = trie_find(&rsc->cols, name); + } + return node; +} + +static void +do_mark_row_used(struct record_section* rsc, struct trie_node* node) +{ + trie_mark_used(&rsc->cols, node, &rsc->lru_nr_used, &rsc->lru_nr_sused); +} + +static void +do_truncate_section(struct record_stat *s, int count) +{ + if (!s->cur_section) { + return; + } + + trie_remove_old(&s->cur_section->cols, count, + &s->cur_section->lru_nr_used, + &s->cur_section->lru_nr_sused); +} + + +static struct trie_node* +do_select_first_row(struct record_section *rsc) +{ + return trie_first(&rsc->cols); +} + +static struct trie_node* +do_select_next_row(struct record_section *rsc, + struct trie_node* node) +{ + return trie_next(&rsc->cols, node); +} + + +static int +do_get_nr_values(struct trie_node *node) +{ + if (!node) + return 0; + return node->row.nr_vals; +} + +static struct record_val * +get_nth_val_ent(struct trie_node *node, int n, int f) +{ + struct record_row *col; + col = &node->row; + if (n < 0) { + return NULL; + } + if (n < do_get_nr_values(node)) { + return &col->vals[n]; + } + if (f) { + int i; + col->vals = realloc(col->vals, sizeof(struct record_val)*(n + 1)); + for (i = col->nr_vals; i < n+1; i++) { + col->vals[i].type = RT_EMPTY; + } + col->nr_vals = n + 1; + return &col->vals[n]; + } + return NULL; +} + +static void +free_val_contents(struct record_val* v) +{ + switch (v->type) { + case RT_XSTR: + anthy_free_xstr_str(&v->u.str); + break; + case RT_XSTRP: + case RT_VAL: + case RT_EMPTY: + default: + break; + } +} + +static void +do_set_nth_value(struct trie_node *node, int nth, int val) +{ + struct record_val *v = get_nth_val_ent(node, nth, 1); + if (!v) { + return ; + } + free_val_contents(v); + v->type = RT_VAL; + v->u.val = val; +} + +static int +do_get_nth_value(struct trie_node *node, int n) +{ + struct record_val *v = get_nth_val_ent(node, n, 0); + if (v && v->type == RT_VAL) { + return v->u.val; + } + return 0; +} + +static xstr* +intern_xstr (struct trie_root* xstrs, xstr* xs) +{ + struct trie_node* node; + int dummy; + + if ((NULL == xs) || (NULL == xs->str) || (xs->len < 1) || (0 == xs->str[0])) { + /* 辞書もしくは学習データが壊れていた時の対策 */ + return NULL; + } + node = trie_find(xstrs, xs); + if (!node) + node = trie_insert(xstrs, xs, 0, &dummy, &dummy); + return &node->row.key; +} + +static void +do_set_nth_xstr (struct trie_node *node, int nth, xstr *xs, + struct trie_root* xstrs) +{ + struct record_val *v = get_nth_val_ent(node, nth, 1); + if (!v){ + return ; + } + free_val_contents(v); + v->type = RT_XSTRP; + v->u.strp = intern_xstr(xstrs, xs); +} + +static void +do_truncate_row (struct trie_node* node, int n) +{ + int i; + if (n < node->row.nr_vals) { + for (i = n; i < node->row.nr_vals; i++) { + free_val_contents(node->row.vals + i); + } + node->row.vals = realloc(node->row.vals, + sizeof(struct record_val)* n); + node->row.nr_vals = n; + } +} + +static void +do_remove_row (struct record_section* rsc, + struct trie_node* node) +{ + xstr* xs; + xs = anthy_xstr_dup(&node->row.key); + trie_remove(&rsc->cols, &node->row.key, + &rsc->lru_nr_used, &rsc->lru_nr_sused); + + anthy_free_xstr(xs); +} + +static xstr * +do_get_nth_xstr(struct trie_node *node, int n) +{ + struct record_val *v = get_nth_val_ent(node, n, 0); + if (v) { + if (v->type == RT_XSTR) { + return &v->u.str; + } else if (v->type == RT_XSTRP) { + return v->u.strp; + } + } + return 0; +} + +static void +lock_record (struct record_stat* rs) +{ + if (rs->is_anon) { + return ; + } + anthy_priv_dic_lock(); +} + +static void +unlock_record (struct record_stat* rs) +{ + if (rs->is_anon) { + return ; + } + anthy_priv_dic_unlock(); +} + +/* 再読み込みの必要があるかをチェックする + * 必要があれば返り値が1になる */ +static int +check_base_record_uptodate(struct record_stat *rst) +{ + struct stat st; + if (rst->is_anon) { + return 1; + } + anthy_check_user_dir(); + if (stat(rst->base_fn, &st) < 0) { + return 0; + } else if (st.st_mtime != rst->base_timestamp) { + return 0; + } + return 1; +} + + +/* + * row format: + * ROW := OPERATION SECTION KEY VALUE* + * OPERATION := "ADD" (追加またはLRU更新) + * "DEL" (削除) + * SECTION := (文字列) + * KEY := TD + * VALUE := TD + * TD := TYPE DATA (空白をあけずに書く) + * TYPE := "S" (xstr) + * "N" (number) + * DATA := (型ごとにシリアライズしたもの) + */ + +static char* +read_1_token (FILE* fp, int* eol) +{ + int c; + char* s; + int in_quote; + int len; + + in_quote = 0; + s = NULL; + len = 0; + while (1) { + c = fgetc(fp); + switch (c) { + case EOF: case '\n': + goto out; + case '\\': + c = fgetc(fp); + if (c == EOF || c == '\n') { + goto out; + } + break; + case '\"': + in_quote = !in_quote; + continue; + case ' ': case '\t': case '\r': + if (in_quote) { + break; + } + if (s != NULL) { + goto out; + } + break; + default: + break; + } + + s = (char*) realloc(s, len + 2); + s[len] = c; + len ++; + } +out: + if (s) { + s[len] = '\0'; + } + *eol = (c == '\n'); + return s; +} + +/* journalからADDの行を読む */ +static void +read_add_row(FILE *fp, struct record_stat* rst, + struct record_section* rsc) +{ + int n; + xstr* xs; + char *token; + int eol; + struct trie_node* node; + + token = read_1_token(fp, &eol); + if (!token) { + return ; + } + + xs = anthy_cstr_to_xstr(/* xstr 型を表す S を読み捨てる */ + token + 1, + rst->encoding); + node = do_select_row(rsc, xs, 1, LRU_USED); + anthy_free_xstr(xs); + free(token); + + if (node->dirty & PROTECT) { + /* 保存すべき row なので、差分ファイルを読み捨てる */ + while (!eol) { + free(read_1_token(fp, &eol)); + } + return ; + } + + n = 0; + while (!eol) { + token = read_1_token(fp, &eol); + if (token) { + switch(*token) { + /* String 文字列 */ + case 'S': + { + xstr* xs; + xs = anthy_cstr_to_xstr(token + 1, rst->encoding); + do_set_nth_xstr(node, n, xs, &rst->xstrs); + anthy_free_xstr(xs); + } + break; + /* Number 数値 */ + case 'N': + do_set_nth_value(node, n, atoi(token + 1)); + break; + } + free(token); + n++; + } + } + do_truncate_row(node, n); +} + +/* journalからDELの行を読む */ +static void +read_del_row(FILE *fp, struct record_stat* rst, + struct record_section* rsc) +{ + struct trie_node* node; + char* token; + xstr* xs; + int eol; + + token = read_1_token(fp, &eol); + if (!token) { + return ; + } + + xs = anthy_cstr_to_xstr(/* xstr 型を表す S を読み飛ばす */ + token + 1, + rst->encoding); + if ((node = do_select_row(rsc, xs, 0, 0)) != NULL) { + do_remove_row(rsc, node); + } + anthy_free_xstr(xs); + free(token); +} + +/** 差分ファイルから1行読み込む */ +static void +read_1_row(struct record_stat* rst, FILE* fp, char *op) +{ + char* sec_name; + struct record_section* rsc; + int eol; + + sec_name = read_1_token(fp, &eol); + if (!sec_name || eol) { + free(sec_name); + return ; + } + rsc = do_select_section(rst, sec_name, 1); + free(sec_name); + if (!rsc) { + return ; + } + + if (strcmp(op, "ADD") == 0) { + read_add_row(fp, rst, rsc); + } else if (strcmp(op, "DEL") == 0) { + read_del_row(fp, rst, rsc); + } +} + +/* + * journal(差分)ファイルを読む + */ +static void +read_journal_record(struct record_stat* rs) +{ + FILE* fp; + struct stat st; + + if (rs->is_anon) { + return ; + } + fp = fopen(rs->journal_fn, "r"); + if (fp == NULL) { + return; + } + if (fstat(fileno(fp), &st) == -1) { + fclose(fp); + return ; + } + if (st.st_size < rs->last_update) { + /* ファイルサイズが小さくなっているので、 + * 最初から読み込む */ + fseek(fp, 0, SEEK_SET); + } else { + fseek(fp, rs->last_update, SEEK_SET); + } + rs->journal_timestamp = st.st_mtime; + while (!feof(fp)) { + char *op; + int eol; + op = read_1_token(fp, &eol); + if (op && !eol) { + read_1_row(rs, fp, op); + } + free(op); + } + rs->last_update = ftell(fp); + fclose(fp); +} + +static void +write_string(FILE* fp, const char* str) +{ + fprintf(fp, "%s", str); +} + +/* ダブルクオートもしくはバックスラッシュにバックスラッシュを付ける */ +static void +write_quote_string(FILE* fp, const char* str) +{ + const char* p; + + for (p = str; *p; p++) { + if (*p == '\"' || *p == '\\') { + fputc('\\', fp); + } + fputc(*p, fp); + } +} + +static void +write_quote_xstr(FILE* fp, xstr* xs, int encoding) +{ + char* buf; + + if ((NULL == xs) || (NULL == xs->str) || (xs->len < 1) || (0 == xs->str[0])) { + /* 辞書もしくは学習データが壊れていた時の対策 */ + return; + } + + buf = (char*) alloca(xs->len * 6 + 2); /* EUC またはUTF8を仮定 */ + anthy_sputxstr(buf, xs, encoding); + write_quote_string(fp, buf); +} + +static void +write_number(FILE* fp, int x) +{ + fprintf(fp, "%d", x); +} + +/* journalに1行追記する */ +static void +commit_add_row(struct record_stat* rst, + const char* sname, struct trie_node* node) +{ + FILE* fp; + int i; + + fp = fopen(rst->journal_fn, "a"); + if (fp == NULL) { + return; + } + + write_string(fp, "ADD \""); + write_quote_string(fp, sname); + write_string(fp, "\" S\""); + write_quote_xstr(fp, &node->row.key, rst->encoding); + write_string(fp, "\""); + + for (i = 0; i < node->row.nr_vals; i++) { + switch (node->row.vals[i].type) { + case RT_EMPTY: + write_string(fp, " E"); + break; + case RT_VAL: + write_string(fp, " N"); + write_number(fp, node->row.vals[i].u.val); + break; + case RT_XSTR: + write_string(fp, " S\""); + write_quote_xstr(fp, &node->row.vals[i].u.str, rst->encoding); + write_string(fp, "\""); + break; + case RT_XSTRP: + write_string(fp, " S\""); + write_quote_xstr(fp, node->row.vals[i].u.strp, rst->encoding); + write_string(fp, "\""); + break; + } + } + write_string(fp, "\n"); + rst->last_update = ftell(fp); + fclose(fp); +} + +/* 全ての row を解放する */ +static void +clear_record(struct record_stat* rst) +{ + struct record_section *rsc; + for (rsc = rst->section_list.next; rsc; rsc = rsc->next) { + trie_remove_all(&rsc->cols, &rsc->lru_nr_used, &rsc->lru_nr_sused); + } + rst->cur_row = NULL; +} + +/* 基本ファイルを読む */ +static void +read_session(struct record_stat *rst) +{ + char **tokens; + int nr; + int in_section = 0; + while (!anthy_read_line(&tokens, &nr)) { + xstr *xs; + int i; + int dirty = 0; + struct trie_node* node; + + if (!strcmp(tokens[0], "---") && nr > 1) { + /* セクションの切れ目 */ + in_section = 1; + rst->cur_section = do_select_section(rst, tokens[1], 1); + goto end; + } + if (!in_section || nr < 2) { + /* セクションが始まってない or 行が不完全 */ + goto end; + } + /* 行頭のLRUのマークを読む */ + if (tokens[0][0] == '-') { + dirty = 0; + } else if (tokens[0][0] == '+') { + dirty = LRU_SUSED; + } + /* 次にindex */ + xs = anthy_cstr_to_xstr(&tokens[0][1], rst->encoding); + node = do_select_row(rst->cur_section, xs, 1, dirty); + anthy_free_xstr(xs); + if (!node) { + goto end; + } + rst->cur_row = node; + /**/ + for (i = 1; i < nr; i++) { + if (tokens[i][0] == '"') { + char *str; + str = strdup(&tokens[i][1]); + str[strlen(str) - 1] = 0; + xs = anthy_cstr_to_xstr(str, rst->encoding); + free(str); + do_set_nth_xstr(rst->cur_row, i-1, xs, &rst->xstrs); + anthy_free_xstr(xs); + }else if (tokens[i][0] == '*') { + /* EMPTY entry */ + get_nth_val_ent(rst->cur_row, i-1, 1); + } else { + do_set_nth_value(rst->cur_row, i-1, atoi(tokens[i])); + } + } + end: + anthy_free_line(); + } +} + +/* いまのデータベースを解放した後にファイルから読み込む */ +static void +read_base_record(struct record_stat *rst) +{ + struct stat st; + if (rst->is_anon) { + clear_record(rst); + return ; + } + anthy_check_user_dir(); + + if (anthy_open_file(rst->base_fn) == -1) { + return ; + } + + clear_record(rst); + read_session(rst); + anthy_close_file(); + if (stat(rst->base_fn, &st) == 0) { + rst->base_timestamp = st.st_mtime; + } + rst->last_update = 0; +} + +static FILE * +open_tmp_in_recorddir(void) +{ + char *pn; + const char *hd; + const char *sid; + sid = anthy_conf_get_str("SESSION-ID"); + hd = anthy_conf_get_str("HOME"); + pn = alloca(strlen(hd)+strlen(sid) + 10); + sprintf(pn, "%s/.anthy/%s", hd, sid); + return fopen(pn, "w"); +} + +/* + * 一時ファイルからbaseファイルへrenameする + */ +static void +update_file(const char *fn) +{ + const char *hd; + char *tmp_fn; + const char *sid; + hd = anthy_conf_get_str("HOME"); + sid = anthy_conf_get_str("SESSION-ID"); + tmp_fn = alloca(strlen(hd)+strlen(sid) + 10); + + sprintf(tmp_fn, "%s/.anthy/%s", hd, sid); + if (rename(tmp_fn, fn)){ + anthy_log(0, "Failed to update record file %s -> %s.\n", tmp_fn, fn); + } +} + +/* カラムを保存する */ +static void +save_a_row(FILE *fp, struct record_stat* rst, + struct record_row *c, int dirty) +{ + int i; + char *buf = alloca(c->key.len * 6 + 2); + /* LRUのマークを出力 */ + if (dirty == 0) { + fputc('-', fp); + } else { + fputc('+', fp); + } + anthy_sputxstr(buf, &c->key, rst->encoding); + /* index を出力 */ + fprintf(fp, "%s ", buf); + /**/ + for (i = 0; i < c->nr_vals; i++) { + struct record_val *val = &c->vals[i]; + switch (val->type) { + case RT_EMPTY: + fprintf(fp, "* "); + break; + case RT_XSTR: + /* should not happen */ + fprintf(fp, "\""); + write_quote_xstr(fp, &val->u.str, rst->encoding); + fprintf(fp, "\" "); + abort(); + break; + case RT_XSTRP: + fprintf(fp, "\""); + write_quote_xstr(fp, val->u.strp, rst->encoding); + fprintf(fp, "\" "); + break; + case RT_VAL: + fprintf(fp, "%d ", val->u.val); + break; + default: + anthy_log(0, "Faild to save an unkonwn record. (in record.c)\n"); + break; + } + } + fprintf(fp, "\n"); +} + +static void +update_base_record(struct record_stat* rst) +{ + struct record_section *sec; + struct trie_node *col; + FILE *fp; + struct stat st; + + /* 一時ファイルを作ってrecordを書き出す */ + anthy_check_user_dir(); + fp = open_tmp_in_recorddir(); + if (!fp) { + anthy_log(0, "Failed to open temporaly session file.\n"); + return ; + } + /* 各セクションに対して */ + for (sec = rst->section_list.next; + sec; sec = sec->next) { + if (!trie_first(&sec->cols)) { + /*このセクションは空*/ + continue; + } + /* セクション境界の文字列 */ + fprintf(fp, "--- %s\n", sec->name); + /* 各カラムを保存する */ + for (col = trie_first(&sec->cols); col; + col = trie_next(&sec->cols, col)) { + save_a_row(fp, rst, &col->row, col->dirty); + } + } + fclose(fp); + + /* 本来の名前にrenameする */ + update_file(rst->base_fn); + + if (stat(rst->base_fn, &st) == 0) { + rst->base_timestamp = st.st_mtime; + } + /* journalファイルを消す */ + unlink(rst->journal_fn); + rst->last_update = 0; +} + +static void +commit_del_row(struct record_stat* rst, + const char* sname, struct trie_node* node) +{ + FILE* fp; + + fp = fopen(rst->journal_fn, "a"); + if (fp == NULL) { + return; + } + write_string(fp, "DEL \""); + write_quote_string(fp, sname); + write_string(fp, "\" S\""); + write_quote_xstr(fp, &node->row.key, rst->encoding); + write_string(fp, "\""); + write_string(fp, "\n"); + fclose(fp); +} + +/* + * sync_add: ADD の書き込み + * sync_del_and_del: DEL の書き込みと削除 + * どちらも書き込みの前に、他のプロセスによってディスク上に保存された + * 更新をメモリ上に読み込む。 + * このとき、データベースをフラッシュする可能性もある。データベースの + * フラッシュがあると、 cur_row と全ての xstr は無効になる。 + * ただし、 cur_section の有効性は保存される。 + */ +static void +sync_add(struct record_stat* rst, struct record_section* rsc, + struct trie_node* node) +{ + lock_record(rst); + if (check_base_record_uptodate(rst)) { + node->dirty |= PROTECT; + /* 差分ファイルだけ読む */ + read_journal_record(rst); + node->dirty &= ~PROTECT; + commit_add_row(rst, rsc->name, node); + } else { + /* 再読み込み */ + commit_add_row(rst, rsc->name, node); + read_base_record(rst); + read_journal_record(rst); + } + if (rst->last_update > FILE2_LIMIT) { + update_base_record(rst); + } + unlock_record(rst); +} + +static void +sync_del_and_del(struct record_stat* rst, struct record_section* rsc, + struct trie_node* node) +{ + lock_record(rst); + commit_del_row(rst, rsc->name, node); + if (!check_base_record_uptodate(rst)) { + read_base_record(rst); + } + read_journal_record(rst); + if (rst->last_update > FILE2_LIMIT) { + update_base_record(rst); + } + unlock_record(rst); +} + + +/* + * prediction関係 + */ + +static int +read_prediction_node(struct trie_node *n, struct prediction_t* predictions, int index) +{ + int i; + int nr_predictions = do_get_nr_values(n); + for (i = 0; i < nr_predictions; i += 2) { + time_t t = do_get_nth_value(n, i); + xstr* xs = do_get_nth_xstr(n, i + 1); + if (t && xs) { + if (predictions) { + predictions[index].timestamp = t; + predictions[index].src_str = anthy_xstr_dup(&n->row.key); + predictions[index].str = anthy_xstr_dup(xs); + } + ++index; + } + } + return index; +} + + +/* + * trie中をたどり、prefixがマッチしたらread_prediction_nodeを + * 呼んでpredictionsの配列に結果を追加する。 + */ +static int +traverse_record_for_prediction(xstr* key, struct trie_node *n, + struct prediction_t* predictions, int index) +{ + if (n->l->bit > n->bit) { + index = traverse_record_for_prediction(key, n->l, predictions, index); + } else { + if (n->l->row.key.len != -1) { + if (anthy_xstrncmp(&n->l->row.key, key, key->len) == 0) { + index = read_prediction_node(n->l, predictions, index); + } + } + } + if (n->r->bit > n->bit) { + index = traverse_record_for_prediction(key, n->r, predictions, index); + } else { + if (n->r->row.key.len != -1) { + if (anthy_xstrncmp(&n->r->row.key, key, key->len) == 0) { + index = read_prediction_node(n->r, predictions, index); + } + } + } + return index; +} + +/* + * key で探索 + * key の文字列長を越えるか、ノードが無くなったら探索打ち切り + * trieのkeyが格納されているところでななくて葉を返す + */ +static struct trie_node * +trie_find_for_prediction (struct trie_root* root, xstr *key) +{ + struct trie_node *p; + struct trie_node *q; + + p = &root->root; + q = p->l; + + while (p->bit < q->bit) { + if (q->bit >= 2) { + if ((q->bit - 2) / (int)(sizeof(xchar) << 3) >= key->len) { + break; + } + } + p = q; + q = trie_key_nth_bit(key, p->bit) ? p->r : p->l; + } + return p; +} + +static int +prediction_cmp(const void* lhs, const void* rhs) +{ + struct prediction_t *lpre = (struct prediction_t*)lhs; + struct prediction_t *rpre = (struct prediction_t*)rhs; + return rpre->timestamp - lpre->timestamp; +} + +int +anthy_traverse_record_for_prediction(xstr* key, struct prediction_t* predictions) +{ + struct trie_node* mark; + int nr_predictions; + if (anthy_select_section("PREDICTION", 0)) { + return 0; + } + + /* 指定された文字列をprefixに持つnodeを探す */ + mark = trie_find_for_prediction(&anthy_current_record->cur_section->cols, key); + if (!mark) { + return 0; + } + nr_predictions = traverse_record_for_prediction(key, mark, predictions, 0); + if (predictions) { + /* タイムスタンプで予測候補をソートする */ + qsort(predictions, nr_predictions, sizeof(struct prediction_t), prediction_cmp); + } + return nr_predictions; +} + +/* Wrappers begin.. */ +int +anthy_select_section(const char *name, int flag) +{ + struct record_stat* rst; + struct record_section* rsc; + + rst = anthy_current_record; + if (rst->row_dirty && rst->cur_section && rst->cur_row) { + sync_add(rst, rst->cur_section, rst->cur_row); + } + rst->cur_row = NULL; + rst->row_dirty = 0; + rsc = do_select_section(rst, name, flag); + if (!rsc) { + return -1; + } + rst->cur_section = rsc; + return 0; +} + +int +anthy_select_row(xstr *name, int flag) +{ + struct record_stat* rst; + struct trie_node* node; + + rst = anthy_current_record; + if (!rst->cur_section) { + return -1; + } + if (rst->row_dirty && rst->cur_row) { + sync_add(rst, rst->cur_section, rst->cur_row); + rst->row_dirty = 0; + } + node = do_select_row(rst->cur_section, name, flag, LRU_USED); + if (!node) { + return -1; + } + rst->cur_row = node; + rst->row_dirty = flag; + return 0; +} + +int +anthy_select_longest_row(xstr *name) +{ + struct record_stat* rst; + struct trie_node* node; + + rst = anthy_current_record; + if (!rst->cur_section) + return -1; + + if (rst->row_dirty && rst->cur_row) { + sync_add(rst, rst->cur_section, rst->cur_row); + rst->row_dirty = 0; + } + node = do_select_longest_row(rst->cur_section, name); + if (!node) { + return -1; + } + + rst->cur_row = node; + rst->row_dirty = 0; + return 0; +} + +void +anthy_truncate_section(int count) +{ + do_truncate_section(anthy_current_record, count); +} + +void +anthy_truncate_row(int nth) +{ + struct trie_node *cur_row = anthy_current_record->cur_row; + if (!cur_row) { + return ; + } + do_truncate_row(cur_row, nth); + +} + +int +anthy_mark_row_used(void) +{ + struct record_stat* rst = anthy_current_record; + if (!rst->cur_row) { + return -1; + } + + do_mark_row_used(rst->cur_section, rst->cur_row); + sync_add(rst, rst->cur_section, rst->cur_row); + rst->row_dirty = 0; + return 0; +} + +void +anthy_set_nth_value(int nth, int val) +{ + struct record_stat* rst; + + rst = anthy_current_record; + if (!rst->cur_row) { + return; + } + do_set_nth_value(rst->cur_row, nth, val); + rst->row_dirty = 1; +} + +void +anthy_set_nth_xstr(int nth, xstr *xs) +{ + struct record_stat* rst = anthy_current_record; + if (!rst->cur_row) { + return; + } + do_set_nth_xstr(rst->cur_row, nth, xs, &rst->xstrs); + rst->row_dirty = 1; +} + +int +anthy_get_nr_values(void) +{ + return do_get_nr_values(anthy_current_record->cur_row); +} + +int +anthy_get_nth_value(int n) +{ + return do_get_nth_value(anthy_current_record->cur_row, n); +} + +xstr * +anthy_get_nth_xstr(int n) +{ + return do_get_nth_xstr(anthy_current_record->cur_row, n); +} + +int +anthy_select_first_row(void) +{ + struct record_stat* rst; + struct trie_node* node; + + rst = anthy_current_record; + if (!rst->cur_section) + return -1; + + if (rst->row_dirty && rst->cur_row) { + sync_add(rst, rst->cur_section, rst->cur_row); + rst->row_dirty = 0; + } + node = do_select_first_row(rst->cur_section); + if (!node) { + return -1; + } + rst->cur_row = node; + rst->row_dirty = 0; + return 0; +} + +int +anthy_select_next_row(void) +{ + struct record_stat* rst; + struct trie_node* node; + + rst = anthy_current_record; + if (!rst->cur_section || !rst->cur_row) + return -1; + + /* sync_add() で cur_row が無効になることがあるので、 + * たとえ row_dirty でも sync_add() しない + */ + rst->row_dirty = 0; + node = do_select_next_row(rst->cur_section, rst->cur_row); + if (!node) + return -1; + rst->cur_row = node; + rst->row_dirty = 0; + return 0; +} + +xstr * +anthy_get_index_xstr(void) +{ + return do_get_index_xstr(anthy_current_record); +} +/*..Wrappers end*/ + +/* + * trie_row_init は何回よんでもいい + */ +static void +trie_row_init(struct record_row* rc) +{ + rc->nr_vals = 0; + rc->vals = NULL; +} + +static void +trie_row_free(struct record_row *rc) +{ + int i; + for (i = 0; i < rc->nr_vals; i++) + free_val_contents(rc->vals + i); + free(rc->vals); + free(rc->key.str); +} + +/* あるセクションのデータを全て解放する */ +static void +free_section(struct record_stat *r, struct record_section *rs) +{ + struct record_section *s; + trie_remove_all(&rs->cols, &rs->lru_nr_used, &rs->lru_nr_sused); + if (r->cur_section == rs) { + r->cur_row = 0; + r->cur_section = 0; + } + for (s = &r->section_list; s && s->next; s = s->next) { + if (s->next == rs) { + s->next = s->next->next; + } + } + if (rs->name){ + free((void *)rs->name); + } + free(rs); +} + +/* すべてのデータを解放する */ +static void +free_record(struct record_stat *rst) +{ + struct record_section *rsc; + for (rsc = rst->section_list.next; rsc; ){ + struct record_section *tmp; + tmp = rsc; + rsc = rsc->next; + free_section(rst, tmp); + } + rst->section_list.next = NULL; +} + +void +anthy_release_section(void) +{ + struct record_stat* rst; + + rst = anthy_current_record; + if (!rst->cur_section) { + return ; + } + free_section(rst, rst->cur_section); + rst->cur_section = 0; +} + +void +anthy_release_row(void) +{ + struct record_stat* rst; + + rst = anthy_current_record; + if (!rst->cur_section || !rst->cur_row) { + return; + } + + rst->row_dirty = 0; + /* sync_del_and_del で削除もする */ + sync_del_and_del(rst, rst->cur_section, rst->cur_row); + rst->cur_row = NULL; +} + +static void +check_record_encoding(struct record_stat *rst) +{ + FILE *fp; + if (anthy_open_file(rst->base_fn) == 0) { + /* EUCの履歴ファイルがあった */ + anthy_close_file(); + return ; + } + fp = fopen(rst->journal_fn, "r"); + if (fp) { + /* EUCの差分ファイルがあった */ + fclose(fp); + return ; + } + rst->encoding = ANTHY_UTF8_ENCODING; + strcat(rst->base_fn, ENCODING_SUFFIX); + strcat(rst->journal_fn, ENCODING_SUFFIX); +} + +static void +record_dtor(void *p) +{ + int dummy; + struct record_stat *rst = (struct record_stat*) p; + free_record(rst); + if (rst->id) { + free(rst->base_fn); + free(rst->journal_fn); + } + trie_remove_all(&rst->xstrs, &dummy, &dummy); +} + +void +anthy_reload_record(void) +{ + struct stat st; + struct record_stat *rst = anthy_current_record; + + if (stat(rst->journal_fn, &st) == 0 && + rst->journal_timestamp == st.st_mtime) { + return ; + } + + lock_record(rst); + read_base_record(rst); + read_journal_record(rst); + unlock_record(rst); +} + +void +anthy_init_record(void) +{ + record_ator = anthy_create_allocator(sizeof(struct record_stat), + record_dtor); +} + +static void +setup_filenames(const char *id, struct record_stat *rst) +{ + const char *home = anthy_conf_get_str("HOME"); + int base_len = strlen(home) + strlen(id) + 10; + + /* 基本ファイル */ + rst->base_fn = (char*) malloc(base_len + + strlen("/.anthy/last-record1_")); + sprintf(rst->base_fn, "%s/.anthy/last-record1_%s", + home, id); + /* 差分ファイル */ + rst->journal_fn = (char*) malloc(base_len + + strlen("/.anthy/last-record2_")); + sprintf(rst->journal_fn, "%s/.anthy/last-record2_%s", + home, id); +} + +struct record_stat * +anthy_create_record(const char *id) +{ + struct record_stat *rst; + + if (!id) { + return NULL; + } + + rst = anthy_smalloc(record_ator); + rst->id = id; + rst->section_list.next = 0; + init_trie_root(&rst->xstrs); + rst->cur_section = 0; + rst->cur_row = 0; + rst->row_dirty = 0; + rst->encoding = 0; + + /* ファイル名の文字列を作る */ + setup_filenames(id, rst); + + rst->last_update = 0; + + if (!strcmp(id, ANON_ID)) { + rst->is_anon = 1; + } else { + rst->is_anon = 0; + anthy_check_user_dir(); + } + + /* ファイルから読み込む */ + lock_record(rst); + check_record_encoding(rst); + read_base_record(rst); + read_journal_record(rst); + unlock_record(rst); + + return rst; +} + +void +anthy_release_record(struct record_stat *rs) +{ + anthy_sfree(record_ator, rs); +} diff --git a/src-worddic/textdict.c b/src-worddic/textdict.c new file mode 100644 index 0000000..4d3d04b --- /dev/null +++ b/src-worddic/textdict.c @@ -0,0 +1,202 @@ +/* + * ソートされたテキストから検索を行う + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <anthy/filemap.h> +#include <anthy/textdict.h> +#include "dic_main.h" + +struct textdict { + char *fn; + char *ptr; + struct filemapping *mapping; +}; + +struct textdict * +anthy_textdict_open(const char *fn, int create) +{ + struct textdict *td = malloc(sizeof(struct textdict)); + if (!td) { + return NULL; + } + td->fn = strdup(fn); + if (!td->fn) { + free(td); + return NULL; + } + td->mapping = NULL; + return td; +} + + +static void +unmap(struct textdict *td) +{ + if (td->mapping) { + anthy_munmap(td->mapping); + td->mapping = NULL; + } +} + +void +anthy_textdict_close(struct textdict *td) +{ + if (!td) { + return ; + } + unmap(td); + free(td->fn); + free(td); +} + +static int +update_mapping(struct textdict *td) +{ + if (td->mapping) { + anthy_munmap(td->mapping); + } + td->mapping = anthy_mmap(td->fn, 1); + if (!td->mapping) { + td->ptr = NULL; + return 1; + } + td->ptr = anthy_mmap_address(td->mapping); + return 0; +} + +static int +expand_file(struct textdict *td, int len) +{ + FILE *fp; + char buf[256]; + int c; + fp = fopen(td->fn, "a+"); + if (!fp) { + return -1; + } + memset(buf, '\n', 256); + c = 1; + if (len > 256) { + c *= fwrite(buf, 256, len / 256, fp); + } + if (len % 256) { + c *= fwrite(buf, len % 256, 1, fp); + } + fclose(fp); + if (c == 0) { + return -1; + } + return 0; +} + +void +anthy_textdict_scan(struct textdict *td, int offset, void *ptr, + int (*fun)(void *, int, const char *, const char *)) +{ + FILE *fp; + char buf[1024]; + if (!td) { + return ; + } + fp = fopen(td->fn, "r"); + if (!fp) { + return ; + } + if (fseek(fp, offset, SEEK_SET)) { + fclose(fp); + return ; + } + while (fgets(buf, 1024, fp)) { + char *p = strchr(buf, ' '); + int len, r; + len = strlen(buf); + offset += len; + buf[len - 1] = 0; + if (!p) { + continue; + } + *p = 0; + p++; + while (*p == ' ') { + p++; + } + /* call it */ + r = fun(ptr, offset, buf, p); + if (r) { + break; + } + } + fclose(fp); +} + +int +anthy_textdict_delete_line(struct textdict *td, int offset) +{ + FILE *fp; + char buf[1024]; + int len, size; + fp = fopen(td->fn, "r"); + if (!fp) { + return -1; + } + if (fseek(fp, offset, SEEK_SET)) { + fclose(fp); + return -1; + } + if (!fgets(buf, 1024, fp)) { + fclose(fp); + return -1; + } + len = strlen(buf); + fclose(fp); + update_mapping(td); + if (!td->mapping) { + return -1; + } + size = anthy_mmap_size(td->mapping); + memmove(&td->ptr[offset], &td->ptr[offset+len], size - offset - len); + unmap(td); + if (size - len == 0) { + unlink(td->fn); + return 0; + } + truncate(td->fn, size - len); + return 0; +} + +int +anthy_textdict_insert_line(struct textdict *td, int offset, + const char *line) +{ + int len = strlen(line); + int size; + if (!td) { + return -1; + } + if (expand_file(td, len)) { + return -1; + } + update_mapping(td); + size = anthy_mmap_size(td->mapping); + memmove(&td->ptr[offset+len], &td->ptr[offset], size - offset - len); + memcpy(&td->ptr[offset], line, len); + return 0; +} diff --git a/src-worddic/texttrie.c b/src-worddic/texttrie.c new file mode 100644 index 0000000..9497a02 --- /dev/null +++ b/src-worddic/texttrie.c @@ -0,0 +1,1516 @@ +/* + * DEPRECATED, it is too hard to debug. + * you may use textdict instead + * + * Trie in Text + * + * *issues + * +correct API + * -iterator vs callback + * +robustness + * -error detection + * -auto correction + * -concurrent access + * +efficiency + * -lower memory consumption + * -disk space? + * + * on some file system like jffs2 on linux, writable mmap + * is not allowed, though you can write it. + * + */ +/* + * API + * anthy_trie_open() + * anthy_trie_close() + * anthy_trie_add() + * anthy_trie_find() + * anthy_trie_delete() + * anthy_trie_find_next_key() + * anthy_trie_find_prefix() + * anthy_trie_print_array() + * + * Copyright (C) 2005-2006 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* open & mmap */ +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +/**/ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <anthy/texttrie.h> +#include <anthy/filemap.h> +#include "dic_main.h" + +/* configs */ +#define OBJ_LEN 20 +#define LINE_LEN 32 +#define EXPAND_COUNT 16 + +/* cell type */ +#define TT_SUPER 0 +#define TT_UNUSED 1 +#define TT_ALLOCED 2 +#define TT_NODE 3 +#define TT_BODY 4 +#define TT_TAIL 5 + +/* cell structure */ +struct cell { + /* (common) type */ + int type; + /* union */ + union { + /* unused */ + int next_unused; + /* super */ + struct { + int first_unused; + int root_cell; + int size; + int serial; + } super; + /* node */ + struct { + int key; + int next; + int child; + int body; + int parent; + } node; + /* body */ + struct { + int owner; + char *obj; + } body; + /* tail */ + struct { + char *obj; + int prev; + } tail; + } u; + /* body & tail */ + int next_tail; +}; + +struct text_trie { + /**/ + int fatal; + /**/ + char *fn; + FILE *wfp; + struct filemapping *mapping; + char *ptr; + /**/ + struct cell super; + int valid_super; +}; + +struct path { + /**/ + const char *key_str; + /**/ + int max_len; + int *path; + int len; + int cur; +}; + +static void +print_super_cell(struct cell *c) +{ + printf("super first_unused=%d, root_cell=%d, size=%d, serial=%d\n", + c->u.super.first_unused, c->u.super.root_cell, + c->u.super.size, c->u.super.serial); +} + +static void +print_alloced_cell(void) +{ + printf("alloc-ed\n"); +} + +static void +print_node_cell(struct cell *c) +{ + printf("node key=%d", c->u.node.key); + if (c->u.node.key > 0 && isprint(c->u.node.key)) { + printf("(%c)", c->u.node.key); + } + printf(" parent=%d next=%d child=%d body=%d\n", + c->u.node.parent, c->u.node.next, c->u.node.child, c->u.node.body); +} + +static void +print_unused_cell(struct cell *c) +{ + printf("unused next_unused=%d\n", + c->u.next_unused); +} + +static void +print_body_cell(struct cell *c) +{ + printf("body object=(%s), owner=%d, next_tail=%d\n", + c->u.body.obj, c->u.body.owner, c->next_tail); +} + +static void +print_tail_cell(struct cell *c) +{ + printf("tail object=(%s), prev=%d, next_tail=%d\n", + c->u.tail.obj, c->u.tail.prev, c->next_tail); +} + +static void +print_cell(int idx, struct cell *c) +{ + if (!c) { + printf("idx =%d(null cell):\n", idx); + return ; + } + printf("idx=%d:", idx); + switch (c->type) { + case TT_SUPER: + print_super_cell(c); + break; + case TT_ALLOCED: + print_alloced_cell(); + break; + case TT_NODE: + print_node_cell(c); + break; + case TT_UNUSED: + print_unused_cell(c); + break; + case TT_BODY: + print_body_cell(c); + break; + case TT_TAIL: + print_tail_cell(c); + break; + default: + printf("unknown\n"); + } +} + +static void +path_setup(struct path *path, const char *key, int len, int *buf) +{ + unsigned char *p = (unsigned char *)key; + path->key_str = key; + path->max_len = len; + path->path = buf; + path->len = 0; + path->cur = 0; + /**/ + while (*p) { + path->path[path->len] = p[0] * 256 + p[1]; + path->len ++; + p++; + if (p[0]) { + p++; + } + } +} + +static void +path_copy_to_str(struct path *path, char *str, int buf_len) +{ + unsigned char *p = (unsigned char *)str; + int i, o; + for (i = 0, o = 0; i < path->cur && o < buf_len - 2; i++) { + p[o] = (path->path[i]>>8)&255; + p[o+1] = path->path[i]&255; + o += 2; + } + p[o] = 0; +} + +static int +sput_int(char *buf, int num) +{ + unsigned char *tmp = (unsigned char *)buf; + tmp[0] = (num>>24)&255; + tmp[1] = (num>>16)&255; + tmp[2] = (num>>8)&255; + tmp[3] = num&255; + return 4; +} + +static char * +sget_int(char *buf, int *num) +{ + unsigned int res; + unsigned char *tmp = (unsigned char *)buf; + res = 0; + res += tmp[0] << 24; + res += tmp[1] << 16; + res += tmp[2] << 8; + res += tmp[3]; + *num = res; + buf += 4; + return buf; +} + +static char * +pass_str(char *buf, const char *str) +{ + buf += strlen(str); + return buf; +} + +static void +encode_super(struct cell *c, char *buf) +{ + buf += sprintf(buf, "S "); + buf += sput_int(buf, c->u.super.size); + buf += sput_int(buf, c->u.super.root_cell); + buf += sput_int(buf, c->u.super.first_unused); + buf += sput_int(buf, c->u.super.serial); + buf += sput_int(buf, LINE_LEN); +} + +static void +encode_node(struct cell *c, char *buf) +{ + buf += sprintf(buf, "N "); + buf += sput_int(buf, c->u.node.key); + buf += sput_int(buf, c->u.node.parent); + buf += sput_int(buf, c->u.node.next); + buf += sput_int(buf, c->u.node.child); + buf += sput_int(buf, c->u.node.body); +} + +static void +encode_body(struct cell *c, char *buf) +{ + buf += sprintf(buf, "B"); + buf += sput_int(buf, c->next_tail); + buf += sput_int(buf, c->u.body.owner); + sprintf(buf, "\"%s\"", + c->u.body.obj); +} + +static void +encode_unused(struct cell *c, char *buf) +{ + buf += sprintf(buf, "-next="); + buf += sput_int(buf, c->u.next_unused); +} + +static void +encode_tail(struct cell *c, char *buf) +{ + buf += sprintf(buf, "T"); + buf += sput_int(buf, c->u.tail.prev); + buf += sput_int(buf, c->next_tail); + sprintf(buf, "\"%s\"", + c->u.tail.obj); +} + +static void +encode_unknown(char *buf) +{ + sprintf(buf, "?"); +} + +static void +encode_cell(struct cell *c, char *buf) +{ + switch (c->type) { + case TT_SUPER: + encode_super(c, buf); + break; + case TT_NODE: + encode_node(c, buf); + break; + case TT_UNUSED: + encode_unused(c, buf); + break; + case TT_BODY: + encode_body(c, buf); + break; + case TT_TAIL: + encode_tail(c, buf); + break; + default: + encode_unknown(buf); + break; + } +} + +static void +write_back_cell(struct text_trie *tt, struct cell *c, int idx) +{ + int i; + char buf[LINE_LEN+1]; + /* sanity check */ + if (((anthy_mmap_size(tt->mapping) / LINE_LEN) < (idx + 1)) || + idx < 0) { + return ; + } + for (i = 0; i < LINE_LEN; i++) { + buf[i] = ' '; + } + encode_cell(c, buf); + buf[LINE_LEN-1] = '\n'; + if (anthy_mmap_is_writable(tt->mapping)) { + memcpy(&tt->ptr[idx*LINE_LEN], buf, LINE_LEN); + } else { + fseek(tt->wfp, idx*LINE_LEN, SEEK_SET); + fwrite(buf, LINE_LEN, 1, tt->wfp); + fflush(tt->wfp); + } +} + +static char * +decode_str(char *raw_buf, int off) +{ + char *head; + char copy_buf[LINE_LEN + 1]; + char *buf; + int i; + /* from off to before last '\n' */ + for (i = 0; i < LINE_LEN - off - 1; i++) { + copy_buf[i] = raw_buf[i]; + } + copy_buf[i] = 0; + buf = copy_buf; + /* find first double quote */ + while (*buf && *buf != '\"') { + buf ++; + } + if (!*buf) { + /* cant find double quote */ + return strdup(""); + } + buf ++; + head = buf; + /* go to the end of string */ + while (*buf) { + buf ++; + } + /* find last double quote */ + while (*buf != '\"') { + buf--; + } + *buf = 0; + /**/ + return strdup(head); +} + +static void +release_cell_str(struct cell *c) +{ + if (!c) { + return ; + } + if (c->type == TT_BODY) { + free(c->u.body.obj); + } + if (c->type == TT_TAIL) { + free(c->u.tail.obj); + } +} + +static int +decode_super(struct cell *c, char *buf) +{ + c->type = TT_SUPER; + buf = pass_str(buf, "S "); + buf = sget_int(buf, &c->u.super.size); + buf = sget_int(buf, &c->u.super.root_cell); + buf = sget_int(buf, &c->u.super.first_unused); + buf = sget_int(buf, &c->u.super.serial); + return 0; +} + +static int +decode_unuse(struct cell *c, char *buf) +{ + c->type = TT_UNUSED; + buf = pass_str(buf, "-next="); + buf = sget_int(buf, &c->u.next_unused); + return 0; +} + +static int +decode_node(struct cell *c, char *buf) +{ + c->type = TT_NODE; + buf = pass_str(buf, "N "); + buf = sget_int(buf, &c->u.node.key); + buf = sget_int(buf, &c->u.node.parent); + buf = sget_int(buf, &c->u.node.next); + buf = sget_int(buf, &c->u.node.child); + buf = sget_int(buf, &c->u.node.body); + return 0; +} + +static int +decode_body(struct cell *c, char *buf) +{ + c->type = TT_BODY; + buf = pass_str(buf, "B"); + buf = sget_int(buf, &c->next_tail); + buf = sget_int(buf, &c->u.body.owner); + c->u.body.obj = decode_str(buf, 9); + return 0; +} + +static int +decode_tail(struct cell *c, char *buf) +{ + c->type = TT_TAIL; + buf = pass_str(buf, "T"); + buf = sget_int(buf, &c->u.tail.prev); + buf = sget_int(buf, &c->next_tail); + c->u.tail.obj = decode_str(buf, 9); + return 0; +} + +static int +decode_alloced(struct cell *c) +{ + c->type = TT_ALLOCED; + return 0; +} + +static struct cell * +decode_nth_cell(struct text_trie *tt, struct cell *c, int nth) +{ + int res; + char *buf; + if (nth < 0 || + (anthy_mmap_size(tt->mapping) / LINE_LEN) < + (nth + 1)) { + return NULL; + } + buf = &tt->ptr[nth*LINE_LEN]; + + res = -1; + switch (buf[0]) { + case 'S': + res = decode_super(c, buf); + break; + case '-': + res = decode_unuse(c, buf); + break; + case 'N': + res = decode_node(c, buf); + break; + case 'B': + res = decode_body(c, buf); + break; + case 'T': + res = decode_tail(c, buf); + break; + case '?': + res = decode_alloced(c); + break; + default: + /*printf("decode fail (nth=%d::%s).\n", nth, buf);*/ + ; + } + if (res) { + c->type = TT_UNUSED; + } + return c; +} + +static struct cell * +decode_nth_node(struct text_trie *tt, struct cell *c, int nth) +{ + if (!decode_nth_cell(tt, c, nth)) { + return NULL; + } + if (c->type != TT_NODE) { + return NULL; + } + return c; +} + +static int +update_mapping(struct text_trie *tt) +{ + if (tt->mapping) { + anthy_munmap(tt->mapping); + } + tt->mapping = anthy_mmap(tt->fn, 1); + if (!tt->mapping) { + /* try to fall back read-only mmap */ + tt->mapping = anthy_mmap(tt->fn, 0); + } + if (!tt->mapping) { + tt->ptr = NULL; + return 1; + } + tt->ptr = anthy_mmap_address(tt->mapping); + return 0; +} + +static int +expand_file(struct text_trie *tt, int count) +{ + char buf[LINE_LEN+1]; + int i; + for (i = 0; i < LINE_LEN; i++) { + buf[i] = ' '; + } + buf[LINE_LEN-1] = '\n'; + /**/ + for (i = 0; i < count; i++) { + int res; + res = fwrite(buf, LINE_LEN, 1, tt->wfp); + if (res != 1) { + return 1; + } + if (fflush(tt->wfp)) { + return 1; + } + } + return 0; +} + +static int +set_file_size(struct text_trie *tt, int len) +{ + int size = LINE_LEN * len; + int cur_size; + int err = 0; + + fseek(tt->wfp, 0, SEEK_END); + cur_size = ftell(tt->wfp); + if (cur_size == size) { + return 0; + } + if (cur_size > size) { + truncate(tt->fn, size); + } else { + err = expand_file(tt, (size - cur_size) / LINE_LEN); + if (!err) { + update_mapping(tt); + } else { + tt->fatal = 1; + } + } + + return err; +} + +static struct cell * +get_super_cell(struct text_trie *tt) +{ + /* cached? */ + if (tt->valid_super) { + return &tt->super; + } + /* read */ + if (decode_nth_cell(tt, &tt->super, 0)) { + tt->valid_super = 1; + return &tt->super; + } + /* create now */ + tt->super.type = TT_SUPER; + tt->super.u.super.first_unused = 0; + tt->super.u.super.root_cell = 0; + tt->super.u.super.size = 1; + tt->super.u.super.serial = 1; + if (set_file_size(tt, 1) != 0) { + return NULL; + } + write_back_cell(tt, &tt->super, 0); + tt->valid_super = 1; + return &tt->super; +} + +/* convenience function */ +static int +get_array_size(struct text_trie *a) +{ + struct cell *super = get_super_cell(a); + int size = super->u.super.size; + return size; +} + +/* convenience function */ +static int +get_root_idx(struct text_trie *tt) +{ + struct cell *super = get_super_cell(tt); + if (!super) { + return 0; + } + return super->u.super.root_cell; +} + +static int +expand_array(struct text_trie *tt, int len) +{ + int i; + struct cell *super; + int res; + int size = get_array_size(tt); + if (size >= len) { + return 0; + } + /* expand file */ + res = set_file_size(tt, len); + if (res) { + return 1; + } + /* fill unused */ + super = get_super_cell(tt); + for (i = super->u.super.size; i < len; i++) { + struct cell ex_cell; + ex_cell.type = TT_UNUSED; + ex_cell.u.next_unused = super->u.super.first_unused; + write_back_cell(tt, &ex_cell, i); + super->u.super.first_unused = i; + } + super->u.super.size = len; + write_back_cell(tt, super, 0); + return 0; +} + +void +anthy_trie_print_array(struct text_trie *tt) +{ + int i; + int size = get_array_size(tt); + print_cell(0, get_super_cell(tt)); + for (i = 1; i < size; i++) { + struct cell c; + decode_nth_cell(tt, &c, i); + print_cell(i, &c); + release_cell_str(&c); + } +} + +/* get unused cell */ +static int +get_unused_index(struct text_trie *tt) +{ + struct cell *super; + int unuse; + struct cell new_cell; + + super = get_super_cell(tt); + unuse = super->u.super.first_unused; + if (!unuse) { + /* expand array */ + expand_array(tt, super->u.super.size + EXPAND_COUNT); + unuse = super->u.super.first_unused; + if (!unuse) { + return 0; + } + } + if (!decode_nth_cell(tt, &new_cell, unuse)) { + tt->fatal = 1; + return 0; + } + super->u.super.first_unused = new_cell.u.next_unused; + new_cell.type = TT_ALLOCED; + write_back_cell(tt, &new_cell, unuse); + write_back_cell(tt, super, 0); + return unuse; +} + +static void +free_cell(struct text_trie *tt, int idx) +{ + struct cell *super = get_super_cell(tt); + struct cell c; + if (!decode_nth_cell(tt, &c, idx)) { + tt->fatal = 1; + } else { + c.type = TT_UNUSED; + c.u.next_unused = super->u.super.first_unused; + write_back_cell(tt, &c, idx); + } + super->u.super.first_unused = idx; + write_back_cell(tt, super, 0); +} + +static void +load_super(struct text_trie *tt) +{ + struct cell root, *super; + int root_idx; + super = get_super_cell(tt); + if (!super) { + tt->fatal = 1; + return ; + } + /**/ + if (super->u.super.root_cell) { + return ; + } + /**/ + root_idx = get_unused_index(tt); + if (root_idx == 0) { + tt->fatal = 1; + return ; + } + root.u.node.key = 0; + root.type = TT_NODE; + root.u.node.parent = 0; + root.u.node.next = 0; + root.u.node.child = 0; + root.u.node.body = 0; + write_back_cell(tt, &root, root_idx); + /**/ + tt->super.u.super.root_cell = root_idx; + write_back_cell(tt, super, 0); +} + +static void +purge_cache(struct text_trie *tt) +{ + if (tt) { + tt->valid_super = 0; + } +} + +static FILE * +do_fopen(const char *fn, int create) +{ + int fd; + if (!create) { + /* check file existance */ + FILE *fp; + fp = fopen(fn, "r"); + if (!fp) { + return NULL; + } + fclose(fp); + } + fd = open(fn, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (fd == -1) { + return NULL; + } + return fdopen(fd, "w"); +} + +static struct text_trie * +alloc_tt(const char *fn, FILE *wfp) +{ + struct text_trie *tt; + tt = malloc(sizeof(struct text_trie)); + tt->fatal = 0; + tt->wfp = wfp; + tt->valid_super = 0; + tt->fn = strdup(fn); + tt->mapping = NULL; + return tt; +} + +static void +clear_file(const char *fn) +{ + FILE *fp = fopen(fn, "w"); + if (fp) { + fclose(fp); + } +} + +static struct text_trie * +trie_open(const char *fn, int create, int do_retry) +{ + struct text_trie *tt; + FILE *fp; + /**/ + fp = do_fopen(fn, create); + if (!fp) { + return NULL; + } + /**/ + tt = alloc_tt(fn, fp); + if (!tt) { + fclose(fp); + return NULL; + } + /**/ + update_mapping(tt); + load_super(tt); + /**/ + if (tt->fatal) { + anthy_trie_close(tt); + if (!do_retry) { + return NULL; + } + clear_file(fn); + return trie_open(fn, create, 0); + } + /**/ + return tt; +} + + +/* API */ +struct text_trie * +anthy_trie_open(const char *fn, int create) +{ + struct text_trie *tt; + anthy_priv_dic_lock(); + tt = trie_open(fn, create, 1); + anthy_priv_dic_unlock(); + purge_cache(tt); + return tt; +} + +/* API */ +void +anthy_trie_close(struct text_trie *tt) +{ + if (!tt) { + return ; + } + fclose(tt->wfp); + anthy_munmap(tt->mapping); + free(tt->fn); + free(tt); +} + +/* API */ +void +anthy_trie_update_mapping(struct text_trie *tt) +{ + if (!tt) { + return ; + } + anthy_priv_dic_lock(); + update_mapping(tt); + anthy_priv_dic_unlock(); +} + +static void +graft_child(struct text_trie *tt, int parent_idx, int new_idx) +{ + struct cell parent_cell; + struct cell new_cell; + struct cell cur_youngest_cell; + int cur_idx; + /**/ + if (!decode_nth_node(tt, &parent_cell, parent_idx)) { + return ; + } + /**/ + if (parent_cell.u.node.child == 0) { + /* 1st child */ + parent_cell.u.node.child = new_idx; + write_back_cell(tt, &parent_cell, parent_idx); + return ; + } + + if (!decode_nth_node(tt, &cur_youngest_cell, parent_cell.u.node.child)) { + return ; + } + if (!decode_nth_node(tt, &new_cell, new_idx)) { + return ; + } + if (new_cell.u.node.key < cur_youngest_cell.u.node.key) { + /* newly added younger child */ + new_cell.u.node.next = parent_cell.u.node.child; + parent_cell.u.node.child = new_idx; + write_back_cell(tt, &new_cell, new_idx); + write_back_cell(tt, &parent_cell, parent_idx); + return; + } + + /* insert some order */ + cur_idx = parent_cell.u.node.child; + while (cur_idx) { + int next_idx; + struct cell cur_cell, tmp_cell; + struct cell *next_cell = NULL; + if (!decode_nth_node(tt, &cur_cell, cur_idx)) { + return ; + } + next_idx = cur_cell.u.node.next; + /**/ + if (next_idx) { + next_cell = decode_nth_node(tt, &tmp_cell, next_idx); + } + if (!next_cell) { + /* append */ + new_cell.u.node.next = 0; + cur_cell.u.node.next = new_idx; + write_back_cell(tt, &cur_cell, cur_idx); + break; + } else { + if (cur_cell.u.node.key < new_cell.u.node.key && + new_cell.u.node.key < next_cell->u.node.key) { + cur_cell.u.node.next = new_idx; + new_cell.u.node.next = next_idx; + write_back_cell(tt, &cur_cell, cur_idx); + break; + } + } + cur_idx = next_idx; + } + write_back_cell(tt, &new_cell, new_idx); +} + +static int +find_child(struct text_trie *tt, int parent_idx, int key, int exact) +{ + int child_idx; + int prev_key; + struct cell parent_cell; + + if (!decode_nth_node(tt, &parent_cell, parent_idx)) { + return 0; + } + + /**/ + prev_key = 0; + child_idx = parent_cell.u.node.child; + + /**/ + while (child_idx) { + struct cell child_cell; + int this_key; + /**/ + if (!decode_nth_node(tt, &child_cell, child_idx)) { + return 0; + } + this_key = child_cell.u.node.key; + if (this_key <= prev_key) { + return 0; + } + /**/ + if (exact && this_key == key) { + return child_idx; + } + if (!exact && (this_key & 0xff00) == (key & 0xff00)) { + return child_idx; + } + child_idx = child_cell.u.node.next; + prev_key = this_key; + } + return 0; +} + +static int +trie_search_rec(struct text_trie *tt, struct path *p, + int parent_idx, int create) +{ + int child_idx; + int key = p->path[p->cur]; + /* special case */ + if (p->cur == p->len) { + return parent_idx; + } + /* scan child */ + child_idx = find_child(tt, parent_idx, key, 1); + if (!child_idx) { + struct cell child_cell; + if (!create) { + return 0; + } + /* add child */ + child_idx = get_unused_index(tt); + if (!child_idx) { + return 0; + } + if (!decode_nth_cell(tt, &child_cell, child_idx)) { + return 0; + } + child_cell.type = TT_NODE; + child_cell.u.node.parent = parent_idx; + child_cell.u.node.key = key; + child_cell.u.node.next = 0; + child_cell.u.node.child = 0; + child_cell.u.node.body = 0; + write_back_cell(tt, &child_cell, child_idx); + /* graft */ + graft_child(tt, parent_idx, child_idx); + } + p->cur ++; + key ++; + if (!key) { + return child_idx; + } + return trie_search_rec(tt, p, child_idx, create); +} + +static char * +get_str_part(const char *str, int from) +{ + char buf[OBJ_LEN+1]; + int i; + for (i = 0; i < OBJ_LEN; i++) { + buf[i] = str[from+i]; + } + buf[i] = 0; + return strdup(buf); +} + +static void +release_body(struct text_trie *tt, int idx) +{ + struct cell c; + int tail_idx; + if (!decode_nth_cell(tt, &c, idx) || + c.type != TT_BODY) { + return ; + } + tail_idx = c.next_tail; + while (tail_idx) { + struct cell tail_cell; + int tmp; + if (!decode_nth_cell(tt, &tail_cell, tail_idx)) { + break; + } + tmp = tail_cell.next_tail; + free_cell(tt, tail_idx); + tail_idx = tmp; + } + free_cell(tt, idx); +} + +static void +set_body(struct text_trie *tt, int idx, const char *body_str) +{ + int body_idx = get_unused_index(tt); + int len; + int i; + struct cell node_cell; + struct cell body_cell; + struct cell prev_cell; + struct cell tail_cell; + int prev_idx; + /**/ + if (!decode_nth_cell(tt, &node_cell, idx)) { + return ; + } + if (node_cell.u.node.body) { + release_body(tt, node_cell.u.node.body); + } + len = strlen(body_str); + /**/ + node_cell.u.node.body = body_idx; + write_back_cell(tt, &node_cell, idx); + /**/ + if (!decode_nth_cell(tt, &body_cell, body_idx)) { + return ; + } + body_cell.type = TT_BODY; + body_cell.u.body.obj = get_str_part(body_str, 0); + body_cell.u.body.owner = idx; + body_cell.next_tail = 0; + write_back_cell(tt, &body_cell, body_idx); + release_cell_str(&body_cell); + /**/ + if (!decode_nth_cell(tt, &body_cell, body_idx)) { + return ; + } + /**/ + prev_idx = body_idx; + prev_cell = body_cell; + for (i = OBJ_LEN; i < len; i += OBJ_LEN) { + int tail_idx = get_unused_index(tt); + if (!decode_nth_cell(tt, &tail_cell, tail_idx)) { + return ; + } + tail_cell.type = TT_TAIL; + tail_cell.u.tail.obj = get_str_part(body_str, i); + tail_cell.u.tail.prev = prev_idx; + tail_cell.next_tail = 0; + prev_cell.next_tail = tail_idx; + write_back_cell(tt, &tail_cell, tail_idx); + write_back_cell(tt, &prev_cell, prev_idx); + release_cell_str(&prev_cell); + /**/ + prev_idx = tail_idx; + prev_cell = tail_cell; + } + if (i != OBJ_LEN) { + release_cell_str(&tail_cell); + } +} + +static int +trie_add(struct text_trie *tt, struct path *p, const char *body) +{ + int root_idx = get_root_idx(tt); + int target_idx; + /**/ + if (root_idx == 0) { + return -1; + } + target_idx = trie_search_rec(tt, p, root_idx, 1); + if (target_idx) { + set_body(tt, target_idx, body); + } + return 0; +} + +/* API */ +int +anthy_trie_add(struct text_trie *tt, const char *key, const char *body) +{ + int res; + int len; + struct path p; + if (!tt || tt->fatal) { + return -1; + } + len = strlen(key); + path_setup(&p, key, len, alloca(sizeof(int)*len)); + anthy_priv_dic_lock(); + res = trie_add(tt, &p, body); + anthy_priv_dic_unlock(); + purge_cache(tt); + return res; +} + +static int +get_object_length(struct text_trie *tt, int body_idx) +{ + int len = 0; + int idx = body_idx; + while (idx) { + struct cell c; + if (!decode_nth_cell(tt, &c, idx)) { + return 0; + } + idx = c.next_tail; + len += OBJ_LEN; + release_cell_str(&c); + } + return len; +} + +static char * +gather_str(struct text_trie *tt, int body_idx) +{ + int idx; + char *buf; + int len; + /* count length */ + len = get_object_length(tt, body_idx); + if (len == 0) { + return NULL; + } + /**/ + buf = malloc(len + 1); + idx = body_idx; + len = 0; + while (idx) { + struct cell c; + if (!decode_nth_cell(tt, &c, idx)) { + free(buf); + return NULL; + } + if (len == 0) { + sprintf(&buf[len], "%s", c.u.body.obj); + } else { + sprintf(&buf[len], "%s", c.u.tail.obj); + } + idx = c.next_tail; + len += OBJ_LEN; + release_cell_str(&c); + } + return buf; +} + +static char * +trie_find(struct text_trie *tt, struct path *p) +{ + int root_idx; + int target_idx; + root_idx = get_root_idx(tt); + if (!root_idx) { + return NULL; + } + target_idx = trie_search_rec(tt, p, root_idx, 0); + if (target_idx) { + struct cell target_cell; + int body_idx; + if (!decode_nth_node(tt, &target_cell, target_idx)) { + return NULL; + } + body_idx = target_cell.u.node.body; + if (body_idx) { + return gather_str(tt, body_idx); + } + } + return NULL; +} + +/* API */ +char * +anthy_trie_find(struct text_trie *tt, char *key) +{ + char *res; + struct path p; + int len; + if (!tt || tt->fatal) { + return NULL; + } + len = strlen(key); + path_setup(&p, key, len, alloca(sizeof(int)*len)); + anthy_priv_dic_lock(); + res = trie_find(tt, &p); + anthy_priv_dic_unlock(); + purge_cache(tt); + return res; +} + +static int +do_find_next_key(struct text_trie *tt, struct path *p, + int root_idx, int target_idx) +{ + struct cell *target_cell, tmp_cell; + int prev_is_up = 0; + target_cell = decode_nth_node(tt, &tmp_cell, target_idx); + /**/ + do { + /* one step */ + if (!target_cell) { + return -1; + } + if (!prev_is_up && target_cell->u.node.child) { + prev_is_up = 0; + target_idx = target_cell->u.node.child; + p->cur++; + } else if (target_cell->u.node.next) { + prev_is_up = 0; + target_idx = target_cell->u.node.next; + } else if (target_cell->u.node.parent) { + prev_is_up = 1; + target_idx = target_cell->u.node.parent; + p->cur--; + } else { + return -1; + } + target_cell = decode_nth_node(tt, &tmp_cell, target_idx); + if (!target_cell) { + return -1; + } + if (p->cur >= p->max_len) { + continue; + } + if (p->cur == 0) { + return -1; + } + p->path[p->cur-1] = target_cell->u.node.key; + if (!prev_is_up && target_cell->u.node.body) { + return 0; + } + } while (target_idx != root_idx); + return -1; +} + +static int +find_partial_key(struct text_trie *tt, struct path *p, int idx) +{ + struct cell c; + if (!decode_nth_node(tt, &c, idx)) { + return -1; + } + if (c.type != TT_NODE) { + return -1; + } + p->len ++; + p->path[p->cur] = c.u.node.key; + p->cur ++; + return 0; +} + +static int +trie_find_next_key(struct text_trie *tt, struct path *p) +{ + int root_idx = get_root_idx(tt); + int target_idx; + int tmp_idx; + /**/ + target_idx = trie_search_rec(tt, p, root_idx, 0); + if (target_idx > 0) { + /* easy case */ + return do_find_next_key(tt, p, root_idx, target_idx); + } + if ((p->path[p->len-1] & 0xff) != 0) { + /* simply not exist in tree */ + return -1; + } + /* special case */ + p->len --; + p->cur = 0; + target_idx = trie_search_rec(tt, p, root_idx, 0); + tmp_idx = find_child(tt, target_idx, p->path[p->len], 0); + if (tmp_idx) { + return find_partial_key(tt, p, tmp_idx); + } + return do_find_next_key(tt, p, root_idx, target_idx); +} + + +/* API */ +char * +anthy_trie_find_next_key(struct text_trie *tt, char *buf, int buf_len) +{ + int res; + struct path p; + if (!tt || tt->fatal) { + return NULL; + } + path_setup(&p, buf, buf_len, alloca(sizeof(int)*buf_len)); + anthy_priv_dic_lock(); + res = trie_find_next_key(tt, &p); + anthy_priv_dic_unlock(); + purge_cache(tt); + if (res) { + return NULL; + } + path_copy_to_str(&p, buf, buf_len); + return buf; +} + +static void +trie_find_prefix(struct text_trie *tt, const char *str, + char *buf, int buf_len, + int (*cb)(const char *key, const char *str)) +{ + int idx = get_root_idx(tt); + int i, len = strlen(str); + for (i = 0; i < len && i < buf_len; i++) { + struct cell c; + idx = find_child(tt, idx, str[i], 1); + if (!idx) { + return ; + } + if (!decode_nth_node(tt, &c, idx)) { + return ; + } + buf[i] = idx; + buf[i+1] = 0; + if (c.u.node.body) { + char *s = gather_str(tt, c.u.node.body); + if (cb) { + cb(buf, s); + } + free(s); + } + } +} + +void +anthy_trie_find_prefix(struct text_trie *tt, const char *str, + char *buf, int buf_len, + int (*cb)(const char *key, const char *str)) +{ + if (!tt || tt->fatal) { + return ; + } + anthy_priv_dic_lock(); + trie_find_prefix(tt, str, buf, buf_len, cb); + anthy_priv_dic_unlock(); + purge_cache(tt); +} + +static void +disconnect(struct text_trie *tt, int parent_idx, int target_idx) +{ + struct cell parent_cell; + struct cell target_cell; + + if (!decode_nth_node(tt, &parent_cell, parent_idx) || + !decode_nth_node(tt, &target_cell, target_idx)) { + return ; + } + + if (parent_cell.u.node.child == target_idx) { + /* 1st child */ + parent_cell.u.node.child = target_cell.u.node.next; + write_back_cell(tt, &parent_cell, parent_idx); + if (!target_cell.u.node.next && + !parent_cell.u.node.body) { + /* only child and parent does not have body, so traverse upward */ + disconnect(tt, parent_cell.u.node.parent, parent_idx); + free_cell(tt, target_idx); + return ; + } + free_cell(tt, target_idx); + } else { + /* not 1st child */ + int child_idx = parent_cell.u.node.child; + while (child_idx) { + struct cell cur; + if (!decode_nth_cell(tt, &cur, child_idx)) { + return ; + } + if (cur.u.node.next == target_idx) { + /**/ + cur.u.node.next = target_cell.u.node.next; + write_back_cell(tt, &cur, child_idx); + free_cell(tt, target_idx); + return ; + } + child_idx = cur.u.node.next; + } + } +} + +static void +trie_delete(struct text_trie *tt, struct path *p) +{ + struct cell target_cell; + int root_idx = get_root_idx(tt); + int target_idx, parent_idx; + target_idx = trie_search_rec(tt, p, root_idx, 0); + if (!target_idx) { + return ; + } + if (!decode_nth_node(tt, &target_cell, target_idx)) { + return ; + } + release_body(tt, target_cell.u.node.body); + target_cell.u.node.body = 0; + write_back_cell(tt, &target_cell, target_idx); + if (target_cell.u.node.child) { + return ; + } + parent_idx = target_cell.u.node.parent; + disconnect(tt, parent_idx, target_idx); +} + +/* API */ +void +anthy_trie_delete(struct text_trie *tt, const char *key) +{ + struct path p; + int len; + if (!tt || tt->fatal) { + return ; + } + len = strlen(key); + path_setup(&p, key, len, alloca(sizeof(int)*len)); + anthy_priv_dic_lock(); + trie_delete(tt, &p); + anthy_priv_dic_unlock(); + purge_cache(tt); +} diff --git a/src-worddic/use_dic.c b/src-worddic/use_dic.c new file mode 100644 index 0000000..8a555f7 --- /dev/null +++ b/src-worddic/use_dic.c @@ -0,0 +1,21 @@ +/* + * 用例辞書を扱う + * Copyright (C) 2003 TABATA Yusuke + */ +#include <string.h> +#include <stdlib.h> + +#include <anthy/dic.h> +#include <anthy/xstr.h> +#include <anthy/matrix.h> +#include <anthy/word_dic.h> +#include "dic_main.h" +#include "dic_ent.h" + +/**/ +int anthy_word_dic_check_word_relation(struct word_dic *wdic, + int from, int to) +{ + /* 共有辞書 */ + return anthy_matrix_image_peek((int *)wdic->uc_section, from, to); +} diff --git a/src-worddic/word_dic.c b/src-worddic/word_dic.c new file mode 100644 index 0000000..1b1df35 --- /dev/null +++ b/src-worddic/word_dic.c @@ -0,0 +1,782 @@ +/* + * Anthyの辞書ライブラリの中心 + * + * anthy_get_seq_ent_from_xstr()で辞書をひく + * + * Copyright (C) 2000-2007 TABATA Yusuke + * Copyright (C) 2005-2006 YOSHIDA Yuichi + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <string.h> + +#include <anthy/anthy.h> +#include <anthy/dic.h> +#include <anthy/conf.h> +#include <anthy/record.h> +#include <anthy/alloc.h> +#include <anthy/logger.h> +#include <anthy/xchar.h> +#include <anthy/feature_set.h> +#include <anthy/textdict.h> + +#include <anthy/diclib.h> + +#include "dic_ent.h" +#include "dic_personality.h" +#include "dic_main.h" + +/**/ +static int dic_init_count; + +/* 辞書 */ +/* 全personalityで共有されるファイル辞書 */ +static struct word_dic *master_dic_file; + +/* 各パーソナリティごとの辞書 */ +struct mem_dic *anthy_current_personal_dic_cache;/* キャッシュ */ +/**/ +struct record_stat *anthy_current_record; + +struct seq_ent * +anthy_validate_seq_ent(struct seq_ent *seq, xstr *xs, int is_reverse) +{ + if (!seq) { + return NULL; + } + if (seq->nr_dic_ents == 0 && seq->nr_compound_ents == 0) { + /* 無効なエントリを作成したのでcacheから削除 */ + anthy_mem_dic_release_seq_ent(anthy_current_personal_dic_cache, + xs, is_reverse); + return NULL; + } + + return seq; +} + +struct seq_ent * +anthy_cache_get_seq_ent(xstr *xs, int is_reverse) +{ + struct seq_ent *seq; + + /* キャッシュ中に既にあればそれを返す */ + seq = anthy_mem_dic_find_seq_ent_by_xstr(anthy_current_personal_dic_cache, + xs, is_reverse); + if (seq) { + return seq; + } + + /* キャッシュ中に無いので確保 */ + return anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache, + xs, is_reverse); +} + +int +anthy_dic_check_word_relation(int from, int to) +{ + return anthy_word_dic_check_word_relation(master_dic_file, from, to); +} + +static seq_ent_t +do_get_seq_ent_from_xstr(xstr *xs, int is_reverse) +{ + struct seq_ent *seq; + /* キャッシュから取り出す */ + seq = anthy_cache_get_seq_ent(xs, is_reverse); + seq = anthy_validate_seq_ent(seq, xs, is_reverse); + if (!seq) { + /* 数字などの辞書に無い文字列を検索する */ + return anthy_get_ext_seq_ent_from_xstr(xs, is_reverse); + } + return seq; +} + +static xstr * +convert_vu(xstr *xs) +{ + int i, v = 0; + int j; + + /* 「ヴ」の出現を数える */ + for (i = 0; i < xs->len; i++) { + if (xs->str[i] == KK_VU) { + v++; + } + } + if (v > 0) { + xstr *nx = malloc(sizeof(xstr)); + nx->len = xs->len + v; + nx->str = malloc(sizeof(xchar)*nx->len); + j = 0; + /* 「ヴ」を「う゛」に変換しつつコピーする */ + for (i = 0; i < xs->len; i++) { + if (xs->str[i] == KK_VU) { + nx->str[j] = HK_U; + j++; + nx->str[j] = HK_DDOT; + j++; + } else { + nx->str[j] = xs->str[i]; + j++; + } + } + return nx; + } + return NULL; +} + +seq_ent_t +anthy_get_seq_ent_from_xstr(xstr *xs, int is_reverse) +{ + struct seq_ent *se; + + if (!xs) { + return NULL; + } + if (!is_reverse) { + xstr *nx = convert_vu(xs); + /* 「ヴ」の混ざった順変換の場合、「う゛」に直して検索する + * 上位のレイヤーではユーザの与えた文字列をそのまま保持することが + * 期待されるので、変換はここで行なう。 + */ + if (nx) { + se = do_get_seq_ent_from_xstr(nx, 0); + anthy_free_xstr(nx); + return se; + } + } + /* 「ヴ」が出現しない、もしくは逆変換の場合 */ + return do_get_seq_ent_from_xstr(xs, is_reverse); +} + +static void +gang_elm_dtor(void *p) +{ + struct gang_elm *ge = p; + free(ge->key); +} + +static int +find_gang_elm(allocator ator, struct gang_elm *head, xstr *xs) +{ + char *str = anthy_xstr_to_cstr(xs, ANTHY_UTF8_ENCODING); + struct gang_elm *ge; + for (ge = head->tmp.next; ge; ge = ge->tmp.next) { + if (!strcmp(ge->key, str)) { + free(str); + return 0; + } + } + ge = anthy_smalloc(ator); + ge->xs = *xs; + ge->key = str; + ge->tmp.next = head->tmp.next; + head->tmp.next = ge; + return 1; +} + +static int +gang_elm_compare_func(const void *p1, const void *p2) +{ + const struct gang_elm * const *s1 = p1; + const struct gang_elm * const *s2 = p2; + return strcmp((*s1)->key, (*s2)->key); +} + +struct gang_scan_context { + /**/ + int nr; + struct gang_elm **array; + /**/ + int nth; +}; + +static int +is_ext_ent(struct seq_ent *seq) +{ + if (!seq->md) { + return 1; + } + return 0; +} + +static void +scan_misc_dic(struct gang_elm **array, int nr, int is_reverse) +{ + int i; + for (i = 0; i < nr; i++) { + xstr *xs = &array[i]->xs; + struct seq_ent *seq; + seq = anthy_cache_get_seq_ent(xs, is_reverse); + /* 個人辞書からの取得(texttrie(旧形式)と未知語辞書) */ + if (seq) { + anthy_copy_words_from_private_dic(seq, xs, is_reverse); + anthy_validate_seq_ent(seq, xs, is_reverse); + } + } +} + +static void +load_word(xstr *xs, const char *n, int is_reverse) +{ + struct seq_ent *seq = anthy_get_seq_ent_from_xstr(xs, 0); + xstr *word_xs; + wtype_t wt; + struct word_line wl; + if (!seq || is_ext_ent(seq)) { + seq = anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache, + xs, is_reverse); + } + if (anthy_parse_word_line(n, &wl)) { + return ; + } + word_xs = anthy_cstr_to_xstr(wl.word, ANTHY_UTF8_ENCODING); + if (anthy_type_to_wtype(wl.wt, &wt)) { + anthy_mem_dic_push_back_dic_ent(seq, 0, word_xs, wt, + NULL, wl.freq, 0); + } + + anthy_free_xstr(word_xs); +} + +static int +gang_scan(void *p, int offset, const char *key, const char *n) +{ + struct gang_scan_context *gsc = p; + struct gang_elm *elm; + int r; + (void)offset; + while (1) { + if (gsc->nth >= gsc->nr) { + return 0; + } + elm = gsc->array[gsc->nth]; + r = strcmp(elm->key, key); + if (r == 0) { + /* find it */ + load_word(&elm->xs, n, 0); + /* go next in dictionary */ + return 0; + } else if (r > 0) { + /* go next in dictionary */ + return 0; + } else { + /* go next in lookup */ + gsc->nth ++; + } + } + return 0; +} + +static void +scan_dict(struct textdict *td, int nr, struct gang_elm **array) +{ + struct gang_scan_context gsc; + gsc.nr = nr; + gsc.array = array; + gsc.nth = 0; + anthy_textdict_scan(td, 0, &gsc, gang_scan); +} + +struct scan_arg { + struct gang_elm **array; + int nr; +}; + +static void +request_scan(struct textdict *td, void *arg) +{ + struct scan_arg *sarg = (struct scan_arg *)arg; + scan_dict(td, sarg->nr, sarg->array); +} + +static void +do_gang_load_dic(xstr *sentence, int is_reverse) +{ + allocator ator = anthy_create_allocator(sizeof(struct gang_elm), + gang_elm_dtor); + int from, len; + xstr xs; + int i, nr; + struct gang_elm head; + struct gang_elm **array, *cur; + struct scan_arg sarg; + head.tmp.next = NULL; + nr = 0; + for (from = 0; from < sentence->len ; from ++) { + for (len = 1; len < 32 && from + len <= sentence->len; len ++) { + xs.str = &sentence->str[from]; + xs.len = len; + nr += find_gang_elm(ator, &head, &xs); + } + } + array = malloc(sizeof(struct gang_elm *) * nr); + cur = head.tmp.next; + for (i = 0; i < nr; i++) { + array[i] = cur; + cur = cur->tmp.next; + } + qsort(array, nr, sizeof(struct gang_elm *), gang_elm_compare_func); + /**/ + anthy_gang_fill_seq_ent(master_dic_file, array, nr, is_reverse); + /**/ + scan_misc_dic(array, nr, is_reverse); + /* 個人辞書から読む */ + sarg.nr = nr; + sarg.array = array; + anthy_ask_scan(request_scan, (void *)&sarg); + /**/ + free(array); + anthy_free_allocator(ator); +} + +void +anthy_gang_load_dic(xstr *sentence, int is_reverse) +{ + xstr *nx; + if (!is_reverse && (nx = convert_vu(sentence))) { + do_gang_load_dic(nx, is_reverse); + anthy_free_xstr(nx); + } else { + do_gang_load_dic(sentence, is_reverse); + } +} + +/* + * seq_entの取得 + ************************ + * seq_entの各種情報の取得 + */ +int +anthy_get_nr_dic_ents(seq_ent_t se, xstr *xs) +{ + struct seq_ent *s = se; + if (!s) { + return 0; + } + if (!xs) { + return s->nr_dic_ents; + } + return s->nr_dic_ents + anthy_get_nr_dic_ents_of_ext_ent(se, xs); +} + +int +anthy_get_nth_dic_ent_str(seq_ent_t se, xstr *orig, + int n, xstr *x) +{ + if (!se || (n < 0)) { /* INDEPPAIR学習による交換先が見つからなかった時に不正なメモリアクセスをするバグの修正(通称「いちおく」の件) */ + x->str = NULL; /* 不正なメモリアクセスやメモリの多重解放をするバグの修正 */ + x->len = 0; + return -1; + } + if (n >= se->nr_dic_ents) { + return anthy_get_nth_dic_ent_str_of_ext_ent(se, orig, + n - se->nr_dic_ents, x); + } + x->len = se->dic_ents[n]->str.len; + x->str = anthy_xstr_dup_str(&se->dic_ents[n]->str); + return 0; +} + +int +anthy_get_nth_dic_ent_is_compound(seq_ent_t se, int nth) +{ + if (!se) { + return 0; + } + if (nth >= se->nr_dic_ents) { + return 0; + } + return se->dic_ents[nth]->is_compound; +} + +int +anthy_get_nth_dic_ent_freq(seq_ent_t se, int nth) +{ + struct seq_ent *s = se; + if (!s) { + return 0; + } + if (!s->dic_ents) { + return anthy_get_nth_dic_ent_freq_of_ext_ent(se, nth); + } + if (s->nr_dic_ents <= nth) { + return anthy_get_nth_dic_ent_freq_of_ext_ent(se, nth - se->nr_dic_ents); + } + return s->dic_ents[nth]->freq; +} + +int +anthy_get_nth_dic_ent_wtype(seq_ent_t se, xstr *xs, + int n, wtype_t *w) +{ + struct seq_ent *s = se; + if (!s) { + *w = anthy_wt_none; + return -1; + } + if (s->nr_dic_ents <= n) { + int r; + r = anthy_get_nth_dic_ent_wtype_of_ext_ent(xs, n - s->nr_dic_ents, w); + if (r == -1) { + *w = anthy_wt_none; + } + return r; + } + *w = s->dic_ents[n]->type; + return 0; +} + +int +anthy_get_seq_ent_pos(seq_ent_t se, int pos) +{ + int i, v=0; + struct seq_ent *s = se; + if (!s) { + return 0; + } + if (s->nr_dic_ents == 0) { + return anthy_get_ext_seq_ent_pos(se, pos); + } + for (i = 0; i < s->nr_dic_ents; i++) { + if (anthy_wtype_get_pos(s->dic_ents[i]->type) == pos) { + v += s->dic_ents[i]->freq; + if (v == 0) { + v = 1; + } + } + } + return v; +} + +int +anthy_get_seq_ent_ct(seq_ent_t se, int pos, int ct) +{ + int i, v=0; + struct seq_ent *s = se; + if (!s) { + return 0; + } + if (s->nr_dic_ents == 0) { + return anthy_get_ext_seq_ent_ct(s, pos, ct); + } + for (i = 0; i < s->nr_dic_ents; i++) { + if (anthy_wtype_get_pos(s->dic_ents[i]->type)== pos && + anthy_wtype_get_ct(s->dic_ents[i]->type)==ct) { + v += s->dic_ents[i]->freq; + if (v == 0) { + v = 1; + } + } + } + return v; +} + +/* + * wtの品詞を持つ単語の中で最大の頻度を持つものを返す + */ +int +anthy_get_seq_ent_wtype_freq(seq_ent_t seq, wtype_t wt) +{ + int i, f; + + if (!seq) { + return 0; + } + /**/ + if (seq->nr_dic_ents == 0) { + return anthy_get_ext_seq_ent_wtype(seq, wt); + } + + f = 0; + /* 単語 */ + for (i = 0; i < seq->nr_dic_ents; i++) { + if (seq->dic_ents[i]->order == 0 && + anthy_wtype_include(wt, seq->dic_ents[i]->type)) { + if (f < seq->dic_ents[i]->freq) { + f = seq->dic_ents[i]->freq; + } + } + } + return f; +} + +/* + * wtの品詞を持つ複合語の中で最大の頻度を持つものを返す + */ +int +anthy_get_seq_ent_wtype_compound_freq(seq_ent_t se, wtype_t wt) +{ + int i,f; + struct seq_ent *s = se; + if (!s) { + return 0; + } + /**/ + f = 0; + for (i = 0; i < s->nr_dic_ents; i++) { + if (!anthy_get_nth_dic_ent_is_compound(se, i)) { + continue; + } + if (anthy_wtype_include(wt, s->dic_ents[i]->type)) { + if (f < s->dic_ents[i]->freq) { + f = s->dic_ents[i]->freq; + } + } + } + return f; +} + +int +anthy_get_seq_ent_indep(seq_ent_t se) +{ + int i; + struct seq_ent *s = se; + if (!s) { + return 0; + } + if (s->nr_dic_ents == 0) { + return anthy_get_ext_seq_ent_indep(s); + } + for (i = 0; i < s->nr_dic_ents; i++) { + if (anthy_wtype_get_indep(s->dic_ents[i]->type)) { + return 1; + } + } + return 0; +} + +int +anthy_has_compound_ents(seq_ent_t se) +{ + if (!se) { + return 0; + } + return se->nr_compound_ents; +} + +/* compundでない候補を持っているか */ +int +anthy_has_non_compound_ents(seq_ent_t se) +{ + if (!se) { + return 0; + } + if (se->nr_dic_ents == 0) { + return 1; + } + return se->nr_dic_ents - se->nr_compound_ents; +} + +compound_ent_t +anthy_get_nth_compound_ent(seq_ent_t se, int nth) +{ + if (!se) { + return NULL; + } + if (nth >= 0 && nth < se->nr_dic_ents) { + return se->dic_ents[nth]; + } + return NULL; +} + +struct elm_compound { + int len; + xstr str; +}; + +/* 要素に対応する読みの長さを返す */ +static int +get_element_len(xchar xc) +{ + if (xc > '0' && xc <= '9') { + return xc - '0'; + } + if (xc >= 'a' && xc <= 'z') { + return xc - 'a' + 10; + } + return 0; +} + +static struct elm_compound * +get_nth_elm_compound(compound_ent_t ce, struct elm_compound *elm, int nth) +{ + int off = 0; + int i, j; + for (i = 0; i <= nth; i++) { + /* nth番目の要素の先頭へ移動する */ + while (!(ce->str.str[off] == '_' && + get_element_len(ce->str.str[off+1]) > 0)) { + off ++; + if (off + 1 >= ce->str.len) { + return NULL; + } + } + /* 構造体へ情報を取り込む */ + elm->len = get_element_len(ce->str.str[off+1]); + elm->str.str = &ce->str.str[off+2]; + elm->str.len = ce->str.len - off - 2; + for (j = 0; j < elm->str.len; j++) { + if (elm->str.str[j] == '_') { + elm->str.len = j; + break; + } + } + off ++; + } + return elm; +} + +int +anthy_compound_get_nr_segments(compound_ent_t ce) +{ + struct elm_compound elm; + int i; + if (!ce) { + return 0; + } + for (i = 0; get_nth_elm_compound(ce, &elm, i); i++); + return i; +} + +int +anthy_compound_get_nth_segment_len(compound_ent_t ce, int nth) +{ + struct elm_compound elm; + if (get_nth_elm_compound(ce, &elm, nth)) { + return elm.len; + } + return 0; +} + +int +anthy_compound_get_nth_segment_xstr(compound_ent_t ce, int nth, xstr *xs) +{ + struct elm_compound elm; + if (get_nth_elm_compound(ce, &elm, nth)) { + if (xs) { + *xs = elm.str; + return 0; + } + } + return -1; +} + +int +anthy_compound_get_wtype(compound_ent_t ce, wtype_t *w) +{ + *w = ce->type; + return 0; +} + +int +anthy_compound_get_freq(compound_ent_t ce) +{ + return ce->freq; +} + +/* フロントエンドから呼ばれる */ +void +anthy_lock_dic(void) +{ + anthy_priv_dic_lock(); + anthy_priv_dic_update(); +} + +/* フロントエンドから呼ばれる */ +void +anthy_unlock_dic(void) +{ + anthy_priv_dic_unlock(); +} + + +dic_session_t +anthy_dic_create_session(void) +{ + return anthy_create_mem_dic(); +} + +void +anthy_dic_activate_session(dic_session_t d) +{ + anthy_current_personal_dic_cache = d; +} + +void +anthy_dic_release_session(dic_session_t d) +{ + anthy_release_mem_dic(d); +} + +void +anthy_dic_set_personality(const char *id) +{ + anthy_current_record = anthy_create_record(id); + anthy_current_personal_dic_cache = anthy_create_mem_dic(); + anthy_init_private_dic(id); +} + + +/** 辞書サブシステムを初期化 + */ +int +anthy_init_dic(void) +{ + if (dic_init_count) { + dic_init_count ++; + return 0; + } + if (anthy_init_diclib() == -1) { + return -1; + } + + anthy_init_wtypes(); + anthy_init_mem_dic(); + anthy_init_record(); + anthy_init_ext_ent(); + anthy_init_features(); + + anthy_init_word_dic(); + master_dic_file = anthy_create_word_dic(); + if (!master_dic_file) { + anthy_log(0, "Failed to create file dic.\n"); + return -1; + } + dic_init_count ++; + return 0; +} + +/** 辞書サブシステムをすべて解放 + */ +void +anthy_quit_dic(void) +{ + dic_init_count --; + if (dic_init_count) { + return; + } + if (anthy_current_record) { + anthy_release_record(anthy_current_record); + } + anthy_release_private_dic(); + anthy_current_record = NULL; + anthy_quit_mem_dic(); + anthy_quit_diclib(); +} + diff --git a/src-worddic/word_lookup.c b/src-worddic/word_lookup.c new file mode 100644 index 0000000..6575b9b --- /dev/null +++ b/src-worddic/word_lookup.c @@ -0,0 +1,673 @@ +/* + * Word Dictionary + * ファイルの辞書のインターフェース、存在するデータは + * キャッシュされるのでここでは存在しない単語の + * サーチを高速にする必要がある。 + * + * anthy_gang_fill_seq_ent()が中心となる関数である + * 指定したword_dicから指定した文字列をインデックスとしてもつエントリに + * 語尾を付加してseq_entに追加する + * + * a)辞書の形式とb)辞書アクセスの高速化c)辞書ファイルのエンコーディング + * このソース中で扱ってるのでかなり複雑化してます. + * + * Copyright (C) 2000-2007 TABATA Yusuke + * Copyright (C) 2005-2006 YOSHIDA Yuichi + * Copyright (C) 2001-2002 TAKAI Kosuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#include "config.h" +#include <anthy/anthy.h> +#include <anthy/alloc.h> +#include <anthy/dic.h> +#include <anthy/word_dic.h> +#include <anthy/logger.h> +#include <anthy/xstr.h> +#include <anthy/diclib.h> + +#include "dic_main.h" +#include "dic_ent.h" + +#define NO_WORD -1 + +static allocator word_dic_ator; + +struct lookup_context { + struct gang_elm **array; + int nr; + int nth; + int is_reverse; +}; + +/* 1バイト目を見て、文字が何バイトあるかを返す */ +static int +mb_fragment_len(const char *str) +{ + unsigned char c = *((const unsigned char *)str); + if (c < 0x80) { + return 1; + } + if (c < 0xe0) { + return 2; + } + if (c < 0xf0) { + return 3; + } + if (c < 0xf8) { + return 4; + } + if (c < 0xfc) { + return 5; + } + return 6; +} + +static int +is_printable(char *str) +{ + unsigned char *tmp = (unsigned char *)str; + if (*tmp > 31 && *tmp < 127) { + return 1; + } + if (mb_fragment_len(str) > 1) { + return 1; + } + return 0; +} + +/* 辞書のエンコーディングからxcharを作る */ +static xchar +form_mb_char(const char *str) +{ + xchar xc; + anthy_utf8_to_ucs4_xchar(str, &xc); + return xc; +} + +static int +hash(xstr *x) +{ + return anthy_xstr_hash(x)& + (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1); +} + +static int +check_hash_ent(struct word_dic *wdic, xstr *xs) +{ + int val = hash(xs); + int idx = (val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1); + int bit = val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1); + return wdic->hash_ent[idx] & (1<<bit); +} + +static int +wtype_str_len(const char *str) +{ + int i; + for (i = 0; str[i] && str[i]!= ' '; i++); + return i; +} + +/* 辞書の行中をスキャンするための状態保持 */ +struct wt_stat { + wtype_t wt; + const char *wt_name; + int feature; + int freq; + int order_bonus;/* 辞書中の順序による頻度のボーナス */ + int offset;/* 文字列中のオフセット */ + const char *line; + int encoding; +}; +/* + * #XX*123 というCannadicの形式をパーズする + * #XX + * #XX*123 + * #XX,x*123 + */ +static const char * +parse_wtype_str(struct wt_stat *ws) +{ + int len; + char *buf; + char *freq_part; + char *feature_part; + const char *wt_name; + /* バッファへコピーする */ + len = wtype_str_len(&ws->line[ws->offset]); + buf = alloca(len + 1); + strncpy(buf, &ws->line[ws->offset], len); + buf[len] = 0; + + /* 素性(未使用) */ + feature_part = strchr(buf, ','); + if (feature_part) { + ws->feature = 1; + } else { + ws->feature = 0; + } + + /* 頻度をparseする */ + freq_part = strchr(buf, '*'); + if (freq_part) { + *freq_part = 0; + freq_part ++; + ws->freq = atoi(freq_part) * FREQ_RATIO; + } else { + ws->freq = FREQ_RATIO - 2; + } + + /**/ + wt_name = anthy_type_to_wtype(buf, &ws->wt); + if (!wt_name) { + ws->wt = anthy_wt_none; + } + ws->offset += len; + return wt_name; +} + + +static int +normalize_freq(struct wt_stat* ws) +{ + if (ws->freq < 0) { + ws->freq *= -1; + } + return ws->freq + ws->order_bonus; +} + +/* '\\'によるエスケープに対応したコピー */ +static void +copy_to_buf(char *buf, const char *src, int char_count) +{ + int pos; + int i; + pos = 0; + for (i = 0; i < char_count; i++){ + if (src[i] == '\\') { + if (src[i + 1] == ' ') { + i ++; + } else if (src[i + 1] == '\\') { + i ++; + } + } + buf[pos] = src[i]; + pos ++; + } + buf[pos] = 0; +} + +/** seq_entにdic_entを追加する */ +static int +add_dic_ent(struct seq_ent *seq, struct wt_stat *ws, + xstr* yomi, int is_reverse) +{ + int i; + /* 辞書ファイル中のバイト数 */ + int char_count; + char *buf; + xstr *xs; + int freq; + wtype_t w = ws->wt; + const char *s = &ws->line[ws->offset]; + + /* 単語の文字数を計算 */ + for (i = 0, char_count = 0; + s[i] && (s[i] != ' ') && (s[i] != '#'); i++) { + char_count ++; + if (s[i] == '\\') { + char_count++; + i++; + } + } + + /* 品詞が定義されていないので無視 */ + if (!ws->wt_name) { + return char_count; + } + + /* freqが負なのは逆変換用 */ + if (!is_reverse && ws->freq < 0) { + return char_count; + } + + /* bufに単語をコピー */ + buf = alloca(char_count+1); + copy_to_buf(buf, s, char_count); + + xs = anthy_cstr_to_xstr(buf, ws->encoding); + + /* freqが正なのは順変換用 */ + if (is_reverse && ws->freq > 0) { + /* 再変換の際に、変換済みの部分と未変換の部分が混じっていた場合に対応する為に、 + 平仮名のみからなる部分は順辞書にその読みを持つ単語があればdic_entを生成する。 + */ + if (anthy_get_xstr_type(yomi) & XCT_HIRA) { + freq = normalize_freq(ws); + anthy_mem_dic_push_back_dic_ent(seq, 0, yomi, w, + ws->wt_name, freq, 0); + } + anthy_free_xstr(xs); + return char_count; + } + + freq = normalize_freq(ws); + + anthy_mem_dic_push_back_dic_ent(seq, 0, xs, w, ws->wt_name, freq, 0); + if (anthy_wtype_get_meisi(w)) { + /* 連用形が名詞化するやつは名詞化したものも追加 */ + w = anthy_get_wtype_with_ct(w, CT_MEISIKA); + anthy_mem_dic_push_back_dic_ent(seq, 0, xs, w, ws->wt_name, freq, 0); + } + anthy_free_xstr(xs); + return char_count; +} + +static int +add_compound_ent(struct seq_ent *seq, struct wt_stat *ws, + xstr* yomi, + int is_reverse) +{ + int len = wtype_str_len(&ws->line[ws->offset]); + char *buf = alloca(len); + xstr *xs; + int freq; + + (void)yomi; + + /* freqが負なのは逆変換用 */ + if (!is_reverse && ws->freq < 0) { + /* 普段の変換では要らない */ + return len; + } + + /* freqが正なのは順変換用 */ + if (is_reverse && ws->freq > 0) { + + /* 再変換の際に、変換済みの部分と未変換の部分が混じっていた場合に対応する為に、 + 平仮名のみからなる部分は順辞書にその読みを持つ単語があればdic_entを生成する。 + */ + /* + yomiに#_等を付加した文字列を作る必要がある + if (anthy_get_xstr_type(yomi) & (XCT_HIRA | XCT_KATA)) { + freq = normalize_freq(ws); + anthy_mem_dic_push_back_compound_ent(seq, xs, ws->wt, freq); + } + */ + return len; + } + + strncpy(buf, &ws->line[ws->offset + 1], len - 1); + buf[len - 1] = 0; + xs = anthy_cstr_to_xstr(buf, ws->encoding); + + freq = normalize_freq(ws); + anthy_mem_dic_push_back_dic_ent(seq, 1, xs, ws->wt, + ws->wt_name, freq, 0); + anthy_free_xstr(xs); + + return len; +} + +static void +init_wt_stat(struct wt_stat *ws, char *line) +{ + ws->wt_name = NULL; + ws->freq = 0; + ws->feature = 0; + ws->order_bonus = 0; + ws->offset = 0; + ws->line = line; + ws->encoding = ANTHY_EUC_JP_ENCODING; + if (*(ws->line) == 'u') { + ws->encoding = ANTHY_UTF8_ENCODING; + ws->line ++; + } +} + +/** 辞書のエントリの情報を元にseq_entをうめる */ +static void +fill_dic_ent(char *line, struct seq_ent *seq, + xstr* yomi, int is_reverse) +{ + struct wt_stat ws; + init_wt_stat(&ws, line); + + while (ws.line[ws.offset]) { + if (ws.line[ws.offset] == '#') { + if (isalpha(ws.line[ws.offset + 1])) { + /* 品詞*頻度 */ + ws.wt_name = parse_wtype_str(&ws); + /**/ + ws.order_bonus = FREQ_RATIO - 1; + } else { + /* 複合語候補 */ + ws.offset += add_compound_ent(seq, &ws, + yomi, + is_reverse); + } + } else { + /* 単語 */ + ws.offset += add_dic_ent(seq, &ws, yomi, + is_reverse); + if (ws.order_bonus > 0) { + ws.order_bonus --; + } + } + if (ws.line[ws.offset] == ' ') { + ws.offset++; + } + } +} + +/* + * sに書かれた文字列によってxを変更する + * 返り値は読み進めたバイト数 + */ +static int +mkxstr(char *s, xstr *x) +{ + int i, len; + /* s[0]には巻き戻しの文字数 */ + x->len -= (s[0] - 1); + for (i = 1; is_printable(&s[i]); i ++) { + len = mb_fragment_len(&s[i]); + if (len > 1) { + /* マルチバイト */ + x->str[x->len] = form_mb_char(&s[i]); + x->len ++; + i += (len - 1); + } else { + /* 1バイト文字 */ + x->str[x->len] = s[i]; + x->len ++; + } + } + return i; +} + +static int +set_next_idx(struct lookup_context *lc) +{ + lc->nth ++; + while (lc->nth < lc->nr) { + if (lc->array[lc->nth]->tmp.idx != NO_WORD) { + return 1; + } + lc->nth ++; + } + return 0; +} + +/** ページ中の単語の場所を調べる */ +static void +search_words_in_page(struct lookup_context *lc, int page, char *s) +{ + int o = 0; + xchar *buf; + xstr xs; + int nr = 0; + /* このページ中にあるもっとも長い単語を格納しうる長さ */ + buf = alloca(sizeof(xchar)*strlen(s)/2); + xs.str = buf; + xs.len = 0; + + while (*s) { + int r; + s += mkxstr(s, &xs); + r = anthy_xstrcmp(&xs, &lc->array[lc->nth]->xs); + if (!r) { + lc->array[lc->nth]->tmp.idx = o + page * WORDS_PER_PAGE; + nr ++; + if (!set_next_idx(lc)) { + return ; + } + /* 同じページ内で次の単語を探す */ + } + o ++; + } + if (nr == 0) { + /* このページで1語も見つからなかったら、この単語は無い */ + lc->array[lc->nth]->tmp.idx = NO_WORD; + set_next_idx(lc); + } + /* 現在の単語は次の呼び出しで探す */ +} + +/**/ +static int +compare_page_index(struct word_dic *wdic, const char *key, int page) +{ + char buf[100]; + char *s = &wdic->page[anthy_dic_ntohl(wdic->page_index[page])]; + int i; + s++; + for (i = 0; is_printable(&s[i]);) { + int j, l = mb_fragment_len(&s[i]); + for (j = 0; j < l; j++) { + buf[i+j] = s[i+j]; + } + i += l; + } + buf[i] = 0; + return strcmp(key ,buf); +} + +/* 再帰的にバイナリサーチをする */ +static int +get_page_index_search(struct word_dic *wdic, const char *key, int f, int t) +{ + /* anthy_xstrcmpが-1で無くなったところを探す */ + int c,p; + c = (f+t)/2; + if (f+1==t) { + return c; + } else { + p = compare_page_index(wdic, key, c); + if (p < 0) { + return get_page_index_search(wdic, key, f, c); + } else { + /* c<= <t */ + return get_page_index_search(wdic, key, c, t); + } + } +} + +/** keyを含む可能性のあるページの番号を得る、 + * 範囲チェックをしてバイナリサーチを行うget_page_index_searchを呼ぶ + */ +static int +get_page_index(struct word_dic *wdic, struct lookup_context *lc) +{ + int page; + const char *key = lc->array[lc->nth]->key; + /* 最初のページの読みよりも小さい */ + if (compare_page_index(wdic, key, 0) < 0) { + return -1; + } + /* 最後のページの読みよりも大きいので、最後のページに含まれる可能性がある */ + if (compare_page_index(wdic, key, wdic->nr_pages-1) >= 0) { + return wdic->nr_pages-1; + } + /* 検索する */ + page = get_page_index_search(wdic, key, 0, wdic->nr_pages); + return page; +} + +static int +get_nr_page(struct word_dic *h) +{ + int i; + for (i = 1; anthy_dic_ntohl(h->page_index[i]); i++); + return i; +} + +static char * +get_section(struct word_dic *wdic, int section) +{ + int *p = (int *)wdic->dic_file; + int offset = anthy_dic_ntohl(p[section]); + return &wdic->dic_file[offset]; +} + +/** 辞書ファイルをmmapして、word_dic中の各セクションのポインタを取得する */ +static int +get_word_dic_sections(struct word_dic *wdic) +{ + wdic->entry_index = (int *)get_section(wdic, 2); + wdic->entry = (char *)get_section(wdic, 3); + wdic->page = (char *)get_section(wdic, 4); + wdic->page_index = (int *)get_section(wdic, 5); + wdic->uc_section = (char *)get_section(wdic, 6); + wdic->hash_ent = (unsigned char *)get_section(wdic, 7); + + return 0; +} + +/** 指定された単語の辞書中のインデックスを調べる */ +static void +search_yomi_index(struct word_dic *wdic, struct lookup_context *lc) +{ + int p; + int page_number; + + /* すでに無いことが分かっている */ + if (lc->array[lc->nth]->tmp.idx == NO_WORD) { + set_next_idx(lc); + return ; + } + + p = get_page_index(wdic, lc); + if (p == -1) { + lc->array[lc->nth]->tmp.idx = NO_WORD; + set_next_idx(lc); + return ; + } + + page_number = anthy_dic_ntohl(wdic->page_index[p]); + search_words_in_page(lc, p, &wdic->page[page_number]); +} + +static void +find_words(struct word_dic *wdic, struct lookup_context *lc) +{ + int i; + /* 検索前に除去 */ + for (i = 0; i < lc->nr; i++) { + lc->array[i]->tmp.idx = NO_WORD; + if (lc->array[i]->xs.len > 31) { + /* 32文字以上単語には未対応 */ + continue; + } + /* hashにないなら除去 */ + if (!check_hash_ent(wdic, &lc->array[i]->xs)) { + continue; + } + /* NO_WORDでない値を設定することで検索対象とする */ + lc->array[i]->tmp.idx = 0; + } + /* 検索する */ + lc->nth = 0; + while (lc->nth < lc->nr) { + search_yomi_index(wdic, lc); + } +} + +static void +load_words(struct word_dic *wdic, struct lookup_context *lc) +{ + int i; + for (i = 0; i < lc->nr; i++) { + int yomi_index; + yomi_index = lc->array[i]->tmp.idx; + if (yomi_index != NO_WORD) { + int entry_index; + struct seq_ent *seq; + seq = anthy_cache_get_seq_ent(&lc->array[i]->xs, + lc->is_reverse); + entry_index = anthy_dic_ntohl(wdic->entry_index[yomi_index]); + fill_dic_ent(&wdic->entry[entry_index], + seq, + &lc->array[i]->xs, + lc->is_reverse); + anthy_validate_seq_ent(seq, &lc->array[i]->xs, lc->is_reverse); + } + } +} + +/** word_dicから単語を検索する + * 辞書キャッシュから呼ばれる + * (gang lookupにすることを検討する) + */ +void +anthy_gang_fill_seq_ent(struct word_dic *wdic, + struct gang_elm **array, int nr, + int is_reverse) +{ + struct lookup_context lc; + lc.array = array; + lc.nr = nr; + lc.is_reverse = is_reverse; + + /* 各単語の場所を探す */ + find_words(wdic, &lc); + /* 単語の情報を読み込む */ + load_words(wdic, &lc); +} + +struct word_dic * +anthy_create_word_dic(void) +{ + struct word_dic *wdic; + char *p; + + wdic = anthy_smalloc(word_dic_ator); + memset(wdic, 0, sizeof(*wdic)); + + /* 辞書ファイルをマップする */ + wdic->dic_file = anthy_file_dic_get_section("word_dic"); + + /* 各セクションのポインタを取得する */ + if (get_word_dic_sections(wdic) == -1) { + anthy_sfree(word_dic_ator, wdic); + return 0; + } + wdic->nr_pages = get_nr_page(wdic); + + /* 用例辞書をマップする */ + p = wdic->uc_section; + return wdic; +} + +void +anthy_release_word_dic(struct word_dic *wdic) +{ + anthy_sfree(word_dic_ator, wdic); +} + +void +anthy_init_word_dic(void) +{ + word_dic_ator = anthy_create_allocator(sizeof(struct word_dic), NULL); +} diff --git a/src-worddic/wtab.h b/src-worddic/wtab.h new file mode 100644 index 0000000..d4bd41c --- /dev/null +++ b/src-worddic/wtab.h @@ -0,0 +1,160 @@ +/* + * 前半は cannadic-0.90からコピーしてきた。 + * 後の方にAnthy独自の品詞が追加してある。 + */ +/* リニアサーチで速く見付かるようによく使う品詞を先頭で定義する */ +{"#T",POS_NOUN,COS_NONE,SCOS_T35,CC_NONE,CT_NONE,WF_INDEP /* "名詞(語幹,格助接続)"*/}, +/**/ +{"#B5",POS_V,COS_NONE,SCOS_NONE,CC_B5,CT_HEAD,WF_INDEP /*"バ行五段"*/}, +{"#B5r",POS_V,COS_NONE,SCOS_NONE,CC_B5,CT_HEAD,WF_INDEP|WF_MEISI /*"バ行五段(連用形名詞)"*/}, +{"#C5r",POS_V,COS_NONE,SCOS_NONE,CC_C5,CT_HEAD,WF_INDEP|WF_MEISI /*"行く五段(連用形名詞)"*/}, +{"#CJ",POS_CONJ,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "接続詞・感動詞"*/}, +{"#CN",POS_NOUN,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "地名"*/}, +{"#CNPRE",POS_PRE,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "地名接頭語"*/}, +{"#CNS",POS_NOUN,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "地名(接尾語つき)"*/}, +{"#CNSUC1",POS_SUC,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "地名接尾語1"*/}, +{"#CNSUC2",POS_SUC,COS_CN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP/* "地名接尾語2"(N/A)*/}, +{"#D2KY",POS_D2KY,COS_NONE,SCOS_A1,CC_A,CT_HEAD,WF_INDEP /* "形容詞化接尾語(しづらい,がたい)"*/}, +{"#yasui",POS_D2KY,COS_NONE,SCOS_A1,CC_A,CT_HEAD,WF_INDEP /* #D2KYと同じ*/}, +{"#D2T16",POS_INVAL,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "形容動詞化接尾語(がち)"*/}, +{"#D2T35",POS_NOUN,COS_SUFFIX,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(っぱなし)"*/}, +{"#F00",POS_AV,COS_NONE,SCOS_F0,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,たる,する,語幹)"*/}, +{"#F01",POS_AV,COS_NONE,SCOS_F1,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,たる,する)"*/}, +{"#F02",POS_AV,COS_NONE,SCOS_F2,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,たる,語幹)"*/}, +{"#F03",POS_AV,COS_NONE,SCOS_F3,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,たる,)"*/}, +{"#F04",POS_AV,COS_NONE,SCOS_F4,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,する,語幹)"*/}, +{"#F05",POS_AV,COS_NONE,SCOS_F5,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,する)"*/}, +{"#F06",POS_AV,COS_NONE,SCOS_F6,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と,語幹)"*/}, +{"#F07",POS_AV,COS_NONE,SCOS_F7,CC_NONE,CT_NONE,WF_INDEP /* "副詞(と)"(N/A)*/}, +{"#F08",POS_AV,COS_NONE,SCOS_F8,CC_NONE,CT_NONE,WF_INDEP /* "副詞(たる,する,語幹)"(N/A)*/}, +{"#F09",POS_AV,COS_NONE,SCOS_F9,CC_NONE,CT_NONE,WF_INDEP /* "副詞(たる,する)"*/}, +{"#F10",POS_AV,COS_NONE,SCOS_F10,CC_NONE,CT_NONE,WF_INDEP /* "副詞(たる,語幹)"*/}, +{"#F11",POS_AV,COS_NONE,SCOS_F11,CC_NONE,CT_NONE,WF_INDEP /* "副詞(たる)"*/}, +{"#F12",POS_AV,COS_NONE,SCOS_F12,CC_NONE,CT_NONE,WF_INDEP /* "副詞(する,語幹)"*/}, +{"#F13",POS_AV,COS_NONE,SCOS_F13,CC_NONE,CT_NONE,WF_INDEP /* "副詞(する)"*/}, +{"#F14",POS_AV,COS_NONE,SCOS_F14,CC_NONE,CT_NONE,WF_INDEP /* "副詞(語幹)"*/}, +{"#F15",POS_AV,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "副詞(未定義)"(N/A)*/}, +{"#G5",POS_V,COS_NONE,SCOS_NONE,CC_G5,CT_HEAD,WF_INDEP /* "ガ行五段"*/}, +{"#G5r",POS_V,COS_NONE,SCOS_NONE,CC_G5,CT_HEAD,WF_INDEP|WF_MEISI /* "ガ行五段(連用形名詞)"*/}, +{"#JCN",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "人・地名"*/}, +{"#JN",POS_NOUN,COS_JN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "人名"*/}, +{"#JNM",POS_NOUN,COS_JN,SCOS_FSTNAME,CC_NONE,CT_NONE,WF_INDEP /* "人名(名)"*/}, +{"#JNMUC",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "人名(名)接尾語"(N/A)*/}, +{"#JNS",POS_NOUN,COS_JN,SCOS_FAMNAME,CC_NONE,CT_NONE,WF_INDEP /* "人名(姓)"*/}, +{"#JNSSUC",POS_SUC,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP/* "人名(姓)接尾語"(N/A)*/}, +{"#JNSUC",POS_SUC,COS_JN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "人名接尾語"*/}, +{"#JS",POS_SUC,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "助数詞"*/}, +{"#JSSUC",POS_SUC,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "助数詞接続語"*/}, +{"#K2T15",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(め)"(N/A)*/}, +{"#K2T16",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(げ)"(N/A)*/}, +{"#K2T35",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(さ)"(N/A)*/}, +{"#K5",POS_V,COS_NONE,SCOS_NONE,CC_K5,CT_HEAD,WF_INDEP /* "カ行五段"*/}, +{"#K5r",POS_V,COS_NONE,SCOS_NONE,CC_K5,CT_HEAD,WF_INDEP|WF_MEISI /* "カ行五段(連用形名詞)"*/}, +{"#KJ",POS_TANKANJI,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "単漢字"*/}, +{"#KK",POS_NOUN,COS_KK,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "団体名"*/}, +{"#KN",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "形式名詞(あと,うち,etc)"(N/A)*/}, +{"#KS",POS_V,COS_NONE,SCOS_NONE,CC_KS1,CT_HEAD,WF_INDEP /* "上下一段"*/}, +{"#KSr",POS_V,COS_NONE,SCOS_NONE,CC_KS1,CT_HEAD,WF_INDEP|WF_MEISI /* "上下一段(連用形名詞)"*/}, +{"#KY",POS_A,COS_NONE,SCOS_A0,CC_A,CT_HEAD,WF_INDEP /* "形容詞"*/}, +{"#KYT",POS_A,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP /* "形容詞(名詞)"(N/A)*/}, +{"#KYU",POS_A,COS_NONE,SCOS_NONE,CC_A_U,CT_HEAD,WF_INDEP /* "形容詞(ウ音便)"*/}, +{"#KYme",POS_A,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP /* "形容詞(め終了形容動詞化)"(N/A)*/}, +{"#KYmi",POS_A,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP /* "形容詞(み終了名詞化)"*/}, +{"#KYmime",POS_A,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP/* "形容詞(み・め終了形容動詞化)"(N/A)*/}, +{"#KYna",POS_A,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP /* "形容詞(な終了連体詞化)"(N/A)*/}, +{"#L5",POS_V,COS_NONE,SCOS_NONE,CC_L5,CT_HEAD,WF_INDEP /* "ラ行5段(命令形がイ)"*/}, +{"#M5",POS_V,COS_NONE,SCOS_NONE,CC_M5,CT_HEAD,WF_INDEP /* "マ行五段"*/}, +{"#M5r",POS_V,COS_NONE,SCOS_NONE,CC_M5,CT_HEAD,WF_INDEP|WF_MEISI /* "マ行五段(連用形名詞)"*/}, +{"#N00",POS_NUMBER,COS_NN,SCOS_N10000,CC_NONE,CT_NONE,WF_INDEP /* "数詞(x万,x億,x兆)"*/}, +{"#N01",POS_NUMBER,COS_NN,SCOS_N1000,CC_NONE,CT_NONE,WF_INDEP /* "数詞(千,二千,…,数千)"(N/A)*/}, +{"#N02",POS_NUMBER,COS_NN,SCOS_N100,CC_NONE,CT_NONE,WF_INDEP /* "数詞(百,二百,…,数百)"*/}, +{"#N03",POS_NUMBER,COS_NN,SCOS_N10,CC_NONE,CT_NONE,WF_INDEP /* "数詞(十,二十,…,数十)"*/}, +{"#N2KYT",POS_NOUN,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP /* "形容詞化接尾語(だかい,ばやい)"(N/A)*/}, +{"#N2T10",POS_INVAL,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(な,する,語幹,格助接続)"*/}, +{"#N2T16",POS_INVAL,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "副詞化接尾語(な,語幹,格助接続△)"*/}, +{"#N2T17",POS_INVAL,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "副詞化接尾語(な,語幹)"*/}, +{"#N2T30",POS_N2T,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(する,語幹,格助接続)"*/}, +{"#N2T35",POS_NOUN,COS_SUFFIX,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "名詞化接尾語(語幹,格助接続)"*/}, +{"#N5",POS_V,COS_NONE,SCOS_NONE,CC_N5,CT_HEAD,WF_INDEP /* "ナ行五段"*/}, +{"#N5r",POS_V,COS_NONE,SCOS_NONE,CC_N5,CT_HEAD,WF_INDEP|WF_MEISI /* "ナ行五段(連用形名詞)"*/}, +{"#ND2KY",POS_INVAL,COS_NONE,SCOS_NONE,CC_A,CT_HEAD,WF_INDEP /* "形容詞化接尾語(がましい,づよい)"*/}, +{"#NN",POS_NUMBER,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "数詞(数,幾)"*/}, +{"#NNPRE",POS_PRE,COS_NN,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE /* "数詞接頭語"*/}, +{"#OKX",POS_V,COS_NONE,SCOS_NONE,CC_SV,CT_NONE,WF_INDEP /* "動詞丁寧表現語幹"*/}, +{"#PRE",POS_PRE,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "一般名詞接頭語"*/}, +{"#R5",POS_V,COS_NONE,SCOS_NONE,CC_R5,CT_HEAD,WF_INDEP /* "ラ行五段"*/}, +{"#R5r",POS_V,COS_NONE,SCOS_NONE,CC_R5,CT_HEAD,WF_INDEP|WF_MEISI /* "ラ行五段(連用形名詞)"*/}, +{"#RT",POS_ME,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "連体詞"*/}, +{"#S5",POS_V,COS_NONE,SCOS_NONE,CC_S5,CT_HEAD,WF_INDEP /* "サ行五段"*/}, +{"#S5r",POS_V,COS_NONE,SCOS_NONE,CC_S5,CT_HEAD,WF_INDEP|WF_MEISI /* "サ行五段(連用形名詞)"*/}, +{"#SUC",POS_NOUN,COS_SUFFIX,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* "一般名詞接尾語"*/}, +{"#SX",POS_V,COS_NONE,SCOS_NONE,CC_SV,CT_HEAD,WF_INDEP /* "さ変動詞"*/}, +{"#T00",POS_NOUN,COS_NONE,SCOS_T0,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,さ,する,語幹,格助接続)"*/}, +{"#T01",POS_NOUN,COS_NONE,SCOS_T0,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,さ,する,語幹,格助接続△)(N/A)"*/}, +{"#T02",POS_NOUN,COS_NONE,SCOS_T2,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,さ,する,語幹)"*/}, +{"#T03",POS_NOUN,COS_NONE,SCOS_T3,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,さ,する,格助接続)"*/}, +{"#T04",POS_NOUN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,さ,する)"(N/A)*/}, +{"#T05",POS_NOUN,COS_NONE,SCOS_T5,CC_AJV,CT_HEAD,WF_INDEP|WF_AJV /* "名詞(な,さ,語幹,格助接続)"*/}, +{"#T06",POS_NOUN,COS_NONE,SCOS_T5,CC_NONE,CT_NONE,WF_INDEP /* "名詞(な,さ,語幹,格助接続△)"*/}, +{"#T07",POS_NOUN,COS_NONE,SCOS_T7,CC_NONE,CT_NONE,WF_INDEP /* "副詞(な,さ,語幹)"*/}, +{"#T08",POS_NOUN,COS_NONE,SCOS_T8,CC_NONE,CT_NONE,WF_INDEP /* "名詞(な,さ,格助接続△)"*/}, +{"#T09",POS_NOUN,COS_NONE,SCOS_T9,CC_NONE,CT_NONE,WF_INDEP /* "名詞(な,さ)"(N/A)*/}, +{"#T10",POS_NOUN,COS_NONE,SCOS_T10,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,する,語幹,格助接続)"*/}, +{"#T11",POS_NOUN,COS_NONE,SCOS_T10,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,する,語幹,格助接続△)"*/}, +{"#T12",POS_NOUN,COS_NONE,SCOS_T12,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,する,語幹)"*/}, +{"#T13",POS_NOUN,COS_NONE,SCOS_T13,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,する,格助接続△)"*/}, +{"#T14",POS_NOUN,COS_NONE,SCOS_T14,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(な,する)"(N/A)*/}, +{"#T15",POS_NOUN,COS_NONE,SCOS_T15,CC_NONE,CT_NONE,WF_INDEP /* "名詞・副詞(な,語幹,格助接続)"*/}, +{"#T16",POS_NOUN,COS_NONE,SCOS_T15,CC_NONE,CT_NONE,WF_INDEP /* "副詞(な,語幹,格助接続△)"*/}, +{"#T17",POS_NOUN,COS_NONE,SCOS_T17,CC_NONE,CT_NONE,WF_INDEP /* "副詞(な,語幹)"*/}, +{"#T18",POS_NOUN,COS_NONE,SCOS_T18,CC_NONE,CT_NONE,WF_INDEP /* "名詞(な,格助接続△)"*/}, +{"#T19",POS_NOUN,COS_NONE,SCOS_T19,CC_NONE,CT_NONE,WF_INDEP /* "名詞(な)"*/}, +{"#T20",POS_NOUN,COS_NONE,SCOS_T20,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(さ,する,語幹,格助接続)"(N/A)*/}, +{"#T21",POS_NOUN,COS_NONE,SCOS_T20,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(さ,する,語幹,格助接続△)"(N/A)*/}, +{"#T22",POS_NOUN,COS_NONE,SCOS_T22,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(さ,する,語幹)"(N/A)*/}, +{"#T23",POS_NOUN,COS_NONE,SCOS_T23,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(さ,する,格助接続△)"(N/A)*/}, +{"#T24",POS_NOUN,COS_NONE,SCOS_T24,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(さ,する)"(N/A)*/}, +{"#T25",POS_NOUN,COS_NONE,SCOS_T25,CC_NONE,CT_NONE,WF_INDEP /* "名詞(さ,語幹,格助接続)"(N/A)*/}, +{"#T26",POS_NOUN,COS_NONE,SCOS_T25,CC_NONE,CT_NONE,WF_INDEP /* "名詞(さ,語幹,格助接続△)"(N/A)*/}, +{"#T27",POS_NOUN,COS_NONE,SCOS_T27,CC_NONE,CT_NONE,WF_INDEP /* "名詞(さ,語幹)"(N/A)*/}, +{"#T28",POS_NOUN,COS_NONE,SCOS_T28,CC_NONE,CT_NONE,WF_INDEP /* "名詞(さ,格助接続△)"(N/A)*/}, +{"#T29",POS_NOUN,COS_NONE,SCOS_T29,CC_NONE,CT_NONE,WF_INDEP /* "名詞(さ)"(N/A)*/}, +{"#T30",POS_NOUN,COS_NONE,SCOS_T30,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(する,語幹,格助接続)"*/}, +{"#T31",POS_NOUN,COS_NONE,SCOS_T30,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "副詞(する,語幹,格助接続△)"*/}, +{"#T32",POS_NOUN,COS_NONE,SCOS_T32,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(する,語幹)"(N/A)*/}, +{"#T33",POS_NOUN,COS_NONE,SCOS_T33,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(する,格助接続△)"(N/A)*/}, +{"#T34",POS_NOUN,COS_NONE,SCOS_T34,CC_NONE,CT_NONE,WF_INDEP|WF_SV /* "名詞(する)"(N/A)*/}, +{"#T35",POS_NOUN,COS_NONE,SCOS_T35,CC_NONE,CT_NONE,WF_INDEP /* "名詞(語幹,格助接続)"*/}, +{"#T36",POS_NOUN,COS_NONE,SCOS_T35,CC_NONE,CT_NONE,WF_INDEP /* "副詞(語幹,格助接続△)"*/}, +{"#T37",POS_NOUN,COS_NONE,SCOS_T37,CC_NONE,CT_NONE,WF_INDEP /* "副詞(語幹)"(N/A)*/}, +{"#T38",POS_NOUN,COS_NONE,SCOS_T38,CC_NONE,CT_NONE,WF_INDEP /* "副詞(語幹)"(N/A)*/}, +{"#T39",POS_NOUN,COS_NONE,SCOS_T39,CC_NONE,CT_NONE,WF_INDEP /* "名詞副詞(語幹)"(N/A)*/}, +{"#T5",POS_V,COS_NONE,SCOS_NONE,CC_T5,CT_HEAD,WF_INDEP /* "タ行五段"*/}, +{"#T5r",POS_V,COS_NONE,SCOS_NONE,CC_T5,CT_HEAD,WF_INDEP|WF_MEISI /* "タ行五段(連用形名詞)"*/}, +{"#U5",POS_V,COS_NONE,SCOS_NONE,CC_U5,CT_HEAD,WF_INDEP /* "乞う五段"*/}, +{"#U5r",POS_V,COS_NONE,SCOS_NONE,CC_U5,CT_HEAD,WF_INDEP|WF_MEISI /* "乞う五段(連用形名詞)"*/}, +{"#W5",POS_V,COS_NONE,SCOS_NONE,CC_W5,CT_HEAD,WF_INDEP /* "ワ行五段"*/}, +{"#W5r",POS_V,COS_NONE,SCOS_NONE,CC_W5,CT_HEAD,WF_INDEP|WF_MEISI /* "ワ行五段(連用形名詞)"*/}, +{"#ZX",POS_V,COS_NONE,SCOS_NONE,CC_ZV,CT_HEAD,WF_INDEP /* "さ変動詞"*/}, +{"#aru",POS_V,COS_NONE,SCOS_NONE,CC_RV,CT_HEAD,WF_INDEP|WF_MEISI /* "ある"*/}, + /* カ変は#KXを使う */ +{"#kxi",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_RENYOU,WF_INDEP /* "カ変動詞(き)"*/}, +{"#kxo",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_MIZEN,WF_INDEP /* "カ変動詞(こ)"*/}, +{"#kxoi",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_MEIREI,WF_INDEP /* "カ変動詞(こい)"*/}, +{"#kxure",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_KATEI,WF_INDEP /* "カ変動詞(くれ)"*/}, +{"#kxuru",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_SYUSI,WF_INDEP /* "カ変動詞(くる)"*/}, +{"#kxya",POS_V,COS_NONE,SCOS_NONE,CC_KV,CT_NONE,WF_INDEP /* "カ変動詞(くりゃ)"*/}, + /* サ変「する」 */ +{"#sxi",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MIZEN,WF_INDEP /* "する未然形(し)"*/}, +{"#sxe",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MIZEN,WF_INDEP /* "する未然形(せ)"*/}, +{"#sxi2",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_RENYOU,WF_INDEP /* "する連用形(し)"*/}, +{"#sxiro",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MEIREI,WF_INDEP /*"する命令形(しろ)"*/}, +{"#sxeyo",POS_V,COS_NONE,SCOS_NONE,CC_SRV,CT_MEIREI,WF_INDEP /*"する命令形(せよ)"*/}, + /* 独自拡張 */ +{"#NONE",POS_INVAL,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_NONE}, +{"#N04",POS_NUMBER,COS_NN,SCOS_N1,CC_NONE,CT_NONE,WF_NONE},/*いち、に、さん…*/ +{"#N05",POS_NUMBER,COS_NN,SCOS_N100,CC_NONE,CT_NONE,WF_NONE},/*ひゃく、にひゃく*/ +{"#SVSUC",POS_SUC,COS_SVSUFFIX,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* サ変接尾語*/}, +{"#OPEN",POS_OPEN,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* 開き括弧 */}, +{"#CLOSE",POS_CLOSE,COS_NONE,SCOS_NONE,CC_NONE,CT_NONE,WF_INDEP /* 閉じ括弧 */}, +{0,0,0,0,0,0,0}, diff --git a/src-worddic/wtype.c b/src-worddic/wtype.c new file mode 100644 index 0000000..df29de3 --- /dev/null +++ b/src-worddic/wtype.c @@ -0,0 +1,292 @@ +/* + * 品詞型を管理する + * 中身はwtype_tの内部のレイアウトに強く依存する。 + * + * Copyright (C) 2000-2007 TABATA Yusuke + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <string.h> + +#include <anthy/wtype.h> +#include "dic_main.h" + +wtype_t anthy_wt_none, anthy_wt_all; + +struct wttable { + const char *name; + int pos; + int cos; + int scos; + int cc; + int ct;/*カ変など*/ + int flags; +}; + +/* 品詞の日本語の名前を品詞に変換するテーブル */ +static struct wttable pos_name_tab[]= { +#include "ptab.h" +}; + +/* 辞書中の品詞の名前を品詞に変換するテーブル */ +static struct wttable wt_name_tab[]= { +#include "wtab.h" +}; + +static struct wttable * +find_wttab(struct wttable *array, const char *name) +{ + struct wttable *w; + for (w = array; w->name; w++) { + if (!strcmp(w->name, name)) { + return w; + } + } + return NULL; +} + +void +anthy_init_wtypes(void) +{ + anthy_wt_all.pos = POS_NONE; + anthy_wt_all.cc = CC_NONE; + anthy_wt_all.ct = CT_NONE; + anthy_wt_all.cos = COS_NONE; + anthy_wt_all.scos = SCOS_NONE; + anthy_wt_all.wf = WF_NONE; + + anthy_wt_none = anthy_wt_all; + anthy_wt_none.pos = POS_INVAL; +} + +/* + * 返り値には品詞の名前 + * tには品詞が返される + */ +const char * +anthy_type_to_wtype(const char *s, wtype_t *t) +{ + struct wttable *w; + if (s[0] != '#') { + *t = anthy_wt_none; + return NULL; + } + w = find_wttab(wt_name_tab, s); + if (!w) { + *t = anthy_wt_all; + return NULL; + } + *t = anthy_get_wtype(w->pos, w->cos, w->scos, w->cc, w->ct, w->flags); + return w->name; +} + +wtype_t +anthy_init_wtype_by_name(const char *name) +{ + struct wttable *p; + p = find_wttab(pos_name_tab, name); + + if (p) { + return anthy_get_wtype(p->pos, p->cos, p->scos, p->cc, p->ct, p->flags); + } + + printf("Failed to find wtype(%s).\n", name); + return anthy_wt_all; +} + +/* 二つの品詞が完全に一致しているかどうか */ +int +anthy_wtype_equal(wtype_t lhs, wtype_t rhs) +{ + if (lhs.pos == rhs.pos && + lhs.cos == rhs.cos && + lhs.scos == rhs.scos && + lhs.cc == rhs.cc && + lhs.ct == rhs.ct && + lhs.wf == rhs.wf) { + return 1; + } else { + return 0; + } +} + + +/* n は hs の一部かどうか? */ +int +anthy_wtype_include(wtype_t hs, wtype_t n) +{ + /*printf("POS %d,%d\n", hs.type[WT_POS], n.type[WT_POS]);*/ + if (hs.pos != POS_NONE && + hs.pos != n.pos) { + return 0; + } + if (hs.cc != CC_NONE && + hs.cc != n.cc) { + return 0; + } + if (hs.ct != CT_NONE && + hs.ct != n.ct) { + return 0; + } + if (hs.cos != COS_NONE && + hs.cos != n.cos) { + return 0; + } + if (hs.scos != SCOS_NONE && + hs.scos != n.scos) { + return 0; + } + return 1; +} + +int +anthy_wtype_get_cc(wtype_t t) +{ + return t.cc; +} + +int +anthy_wtype_get_ct(wtype_t t) +{ + return t.ct; +} + +int +anthy_wtype_get_pos(wtype_t t) +{ + return t.pos; +} + +int +anthy_wtype_get_cos(wtype_t t) +{ + return t.cos; +} + +int +anthy_wtype_get_scos(wtype_t t) +{ + return t.scos; +} + +int +anthy_wtype_get_wf(wtype_t t) +{ + return t.wf; +} + +int +anthy_wtype_get_indep(wtype_t t) +{ + return t.wf & WF_INDEP; +} + +int +anthy_wtype_get_meisi(wtype_t w) +{ + return w.wf & WF_MEISI; +} + +int +anthy_wtype_get_sv(wtype_t w) +{ + return w.wf & WF_SV; +} + +int +anthy_wtype_get_ajv(wtype_t w) +{ + return w.wf & WF_AJV; +} + +void +anthy_wtype_set_cc(wtype_t *w, int cc) +{ + w->cc = cc; +} + +void +anthy_wtype_set_ct(wtype_t *w, int ct) +{ + w->ct = ct; +} + +void +anthy_wtype_set_pos(wtype_t *w, int pos) +{ + w->pos = pos; +} + +void +anthy_wtype_set_cos(wtype_t *w, int cs) +{ + w->cos = cs; +} + +void +anthy_wtype_set_scos(wtype_t *w, int sc) +{ + w->scos = sc; +} + +void +anthy_wtype_set_dep(wtype_t *w, int isDep) +{ + if (isDep) { + w->wf &= (~WF_INDEP); + }else{ + w->wf |= WF_INDEP; + } +} + +void +anthy_print_wtype(wtype_t w) +{ + printf("(POS=%d,COS=%d,SCOS=%d,CC=%d,CT=%d,flags=%d)\n", + anthy_wtype_get_pos(w), + anthy_wtype_get_cos(w), + anthy_wtype_get_scos(w), + anthy_wtype_get_cc(w), + anthy_wtype_get_ct(w), + anthy_wtype_get_wf(w)); +} + +wtype_t +anthy_get_wtype_with_ct(wtype_t base, int ct) +{ + wtype_t w; + + w = base; + w.ct = ct; + + return w; +} + +wtype_t +anthy_get_wtype(int pos, int cos, int scos, int cc, int ct, int wf) +{ + wtype_t w; + + w.pos = pos; + w.cos = cos; + w.scos = scos; + w.cc = cc; + w.ct = ct; + w.wf = wf; + + return w; +} |