diff options
author | Peng Wu <alexepico@gmail.com> | 2012-05-18 11:42:05 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2012-05-18 11:52:20 +0800 |
commit | ad922c4b6f8a6f2062a980e7b15b4fa17428b182 (patch) | |
tree | 76efbc71f979886ca010340eb5506efd4139d563 | |
parent | 46d7e4c0e4a1a0a8b8e8b8346e88d222597dacb7 (diff) | |
download | libpinyin-ad922c4b6f8a6f2062a980e7b15b4fa17428b182.tar.gz |
refine utils
-rw-r--r-- | utils/Makefile.am | 4 | ||||
-rw-r--r-- | utils/segment/Makefile.am | 1 | ||||
-rw-r--r-- | utils/segment/ngseg.cpp | 17 | ||||
-rw-r--r-- | utils/storage/Makefile.am | 1 | ||||
-rw-r--r-- | utils/storage/export_interpolation.cpp | 20 | ||||
-rw-r--r-- | utils/storage/import_interpolation.cpp | 17 | ||||
-rw-r--r-- | utils/training/Makefile.am | 1 | ||||
-rw-r--r-- | utils/training/estimate_interpolation.cpp | 19 | ||||
-rw-r--r-- | utils/training/eval_correction_rate.cpp | 17 | ||||
-rw-r--r-- | utils/training/export_k_mixture_model.cpp | 18 | ||||
-rw-r--r-- | utils/training/gen_ngram.cpp | 17 | ||||
-rw-r--r-- | utils/training/gen_unigram.cpp | 18 | ||||
-rw-r--r-- | utils/utils_helper.h | 46 |
13 files changed, 77 insertions, 119 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am index c315d10..8342f61 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -20,6 +20,8 @@ SUBDIRS = storage segment training MAINTAINERCLEANFILES = Makefile.in -CLEANFILES = *.bak +CLEANFILES = *.bak ACLOCAL = aclocal -I $(ac_aux_dir) + +noinst_HEADERS = utils_helper.h diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am index b04f64f..4561ad1 100644 --- a/utils/segment/Makefile.am +++ b/utils/segment/Makefile.am @@ -21,6 +21,7 @@ INCLUDES = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ @GLIB2_CPPFLAGS@ noinst_PROGRAMS = spseg ngseg diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp index 25b9361..b0ec850 100644 --- a/utils/segment/ngseg.cpp +++ b/utils/segment/ngseg.cpp @@ -23,6 +23,7 @@ #include <stdlib.h> #include <locale.h> #include "pinyin_internal.h" +#include "utils_helper.h" /* n-gram based sentence segment. */ @@ -109,20 +110,8 @@ int main(int argc, char * argv[]){ /* init phrase index */ FacadePhraseIndex phrase_index; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); /* init bi-gram */ Bigram system_bigram; diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am index f8f03a5..cb54626 100644 --- a/utils/storage/Makefile.am +++ b/utils/storage/Makefile.am @@ -19,6 +19,7 @@ INCLUDES = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ @GLIB2_CPPFLAGS@ bin_PROGRAMS = gen_binary_files \ diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp index 60e27d6..10dcbdd 100644 --- a/utils/storage/export_interpolation.cpp +++ b/utils/storage/export_interpolation.cpp @@ -24,7 +24,7 @@ #include <assert.h> #include <glib.h> #include "pinyin_internal.h" - +#include "utils_helper.h" /* export interpolation model as textual format */ @@ -47,20 +47,8 @@ int main(int argc, char * argv[]){ MemoryChunk * chunk = NULL; FacadePhraseIndex phrase_index; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_READONLY); @@ -80,7 +68,7 @@ bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) { PhraseIndexRange range; int result = phrase_index->get_range(i, range); - if ( result ) + if (ERROR_OK != result ) continue; PhraseItem item; diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp index 0bc45ba..89e2276 100644 --- a/utils/storage/import_interpolation.cpp +++ b/utils/storage/import_interpolation.cpp @@ -22,6 +22,7 @@ #include <stdio.h> #include <glib.h> #include "pinyin_internal.h" +#include "utils_helper.h" enum LINE_TYPE{ BEGIN_LINE = 1, @@ -207,20 +208,8 @@ int main(int argc, char * argv[]){ phrases.load(chunk); FacadePhraseIndex phrase_index; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); Bigram bigram; retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am index 32ad26d..8503c63 100644 --- a/utils/training/Makefile.am +++ b/utils/training/Makefile.am @@ -21,6 +21,7 @@ INCLUDES = -I$(top_srcdir)/src \ -I$(top_srcdir)/src/include \ -I$(top_srcdir)/src/storage \ -I$(top_srcdir)/src/lookup \ + -I$(top_srcdir)/utils \ @GLIB2_CPPFLAGS@ noinst_HEADERS = k_mixture_model.h diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp index cd2ef96..a7ba100 100644 --- a/utils/training/estimate_interpolation.cpp +++ b/utils/training/estimate_interpolation.cpp @@ -26,6 +26,7 @@ #include <math.h> #include <glib.h> #include "pinyin_internal.h" +#include "utils_helper.h" parameter_t compute_interpolation(SingleGram * deleted_bigram, FacadePhraseIndex * unigram, @@ -63,7 +64,6 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram, } { - guint32 freq = 0; parameter_t elem_poss = 0; PhraseItem item; if (!unigram->get_phrase_item(token, item)){ @@ -90,21 +90,8 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram, int main(int argc, char * argv[]){ FacadePhraseIndex phrase_index; - MemoryChunk * chunk = NULL; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); Bigram bigram; bigram.attach("bigram.db", ATTACH_READONLY); diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp index ada160f..1fa82b6 100644 --- a/utils/training/eval_correction_rate.cpp +++ b/utils/training/eval_correction_rate.cpp @@ -21,6 +21,7 @@ #include "pinyin_internal.h" +#include "utils_helper.h" void print_help(){ @@ -123,20 +124,8 @@ int main(int argc, char * argv[]){ largetable.load(options, chunk, NULL); FacadePhraseIndex phrase_index; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); FacadePhraseTable phrases; chunk = new MemoryChunk; diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp index df09d24..a52eb46 100644 --- a/utils/training/export_k_mixture_model.cpp +++ b/utils/training/export_k_mixture_model.cpp @@ -21,6 +21,7 @@ #include "pinyin_internal.h" #include "k_mixture_model.h" +#include "utils_helper.h" void print_help(){ printf("Usage: export_k_mixture_model [--k-mixture-model-file <FILENAME>]\n"); @@ -125,21 +126,8 @@ int main(int argc, char * argv[]){ } FacadePhraseIndex phrase_index; - MemoryChunk * chunk = NULL; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER); bigram.attach(k_mixture_model_filename, ATTACH_READONLY); diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp index 6de442b..93fae14 100644 --- a/utils/training/gen_ngram.cpp +++ b/utils/training/gen_ngram.cpp @@ -25,6 +25,7 @@ #include <locale.h> #include <glib.h> #include "pinyin_internal.h" +#include "utils_helper.h" void print_help(){ printf("Usage: gen_ngram [--skip-pi-gram-training]\n"); @@ -63,20 +64,8 @@ int main(int argc, char * argv[]){ phrases.load(chunk); FacadePhraseIndex phrase_index; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp index 29a1bc6..b03235e 100644 --- a/utils/training/gen_unigram.cpp +++ b/utils/training/gen_unigram.cpp @@ -21,7 +21,7 @@ #include <stdio.h> #include "pinyin_internal.h" - +#include "utils_helper.h" /* increase all unigram frequency by a constant. */ @@ -29,20 +29,8 @@ int main(int argc, char * argv[]){ MemoryChunk * chunk = NULL; FacadePhraseIndex phrase_index; - for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { - const char * bin_file = pinyin_phrase_files[i]; - if (NULL == bin_file) - continue; - - chunk = new MemoryChunk; - bool retval = chunk->load(bin_file); - if (!retval) { - fprintf(stderr, "open %s failed!\n", bin_file); - exit(ENOENT); - } - - phrase_index.load(i, chunk); - } + if (!init_phrase_index(&phrase_index)) + exit(ENOENT); /* Note: please increase the value when corpus size becomes larger. * To avoid zero value when computing unigram frequency in float format. diff --git a/utils/utils_helper.h b/utils/utils_helper.h index e69de29..19ece80 100644 --- a/utils/utils_helper.h +++ b/utils/utils_helper.h @@ -0,0 +1,46 @@ +/* + * libpinyin + * Library to deal with pinyin. + * + * Copyright (C) 2012 Peng Wu <alexepico@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + + +#ifndef UTILS_HELPER_H +#define UTILS_HELPER_H + +static bool init_phrase_index(FacadePhraseIndex * phrase_index) { + MemoryChunk * chunk = NULL; + for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { + const char * bin_file = pinyin_phrase_files[i]; + if (NULL == bin_file) + continue; + + chunk = new MemoryChunk; + bool retval = chunk->load(bin_file); + if (!retval) { + fprintf(stderr, "open %s failed!\n", bin_file); + return false; + } + + phrase_index->load(i, chunk); + } + return true; +} + + +#endif |