summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2012-05-18 11:42:05 +0800
committerPeng Wu <alexepico@gmail.com>2012-05-18 11:52:20 +0800
commitad922c4b6f8a6f2062a980e7b15b4fa17428b182 (patch)
tree76efbc71f979886ca010340eb5506efd4139d563
parent46d7e4c0e4a1a0a8b8e8b8346e88d222597dacb7 (diff)
downloadlibpinyin-ad922c4b6f8a6f2062a980e7b15b4fa17428b182.tar.gz
refine utils
-rw-r--r--utils/Makefile.am4
-rw-r--r--utils/segment/Makefile.am1
-rw-r--r--utils/segment/ngseg.cpp17
-rw-r--r--utils/storage/Makefile.am1
-rw-r--r--utils/storage/export_interpolation.cpp20
-rw-r--r--utils/storage/import_interpolation.cpp17
-rw-r--r--utils/training/Makefile.am1
-rw-r--r--utils/training/estimate_interpolation.cpp19
-rw-r--r--utils/training/eval_correction_rate.cpp17
-rw-r--r--utils/training/export_k_mixture_model.cpp18
-rw-r--r--utils/training/gen_ngram.cpp17
-rw-r--r--utils/training/gen_unigram.cpp18
-rw-r--r--utils/utils_helper.h46
13 files changed, 77 insertions, 119 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index c315d10..8342f61 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -20,6 +20,8 @@ SUBDIRS = storage segment training
MAINTAINERCLEANFILES = Makefile.in
-CLEANFILES = *.bak
+CLEANFILES = *.bak
ACLOCAL = aclocal -I $(ac_aux_dir)
+
+noinst_HEADERS = utils_helper.h
diff --git a/utils/segment/Makefile.am b/utils/segment/Makefile.am
index b04f64f..4561ad1 100644
--- a/utils/segment/Makefile.am
+++ b/utils/segment/Makefile.am
@@ -21,6 +21,7 @@ INCLUDES = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
@GLIB2_CPPFLAGS@
noinst_PROGRAMS = spseg ngseg
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index 25b9361..b0ec850 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -23,6 +23,7 @@
#include <stdlib.h>
#include <locale.h>
#include "pinyin_internal.h"
+#include "utils_helper.h"
/* n-gram based sentence segment. */
@@ -109,20 +110,8 @@ int main(int argc, char * argv[]){
/* init phrase index */
FacadePhraseIndex phrase_index;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
/* init bi-gram */
Bigram system_bigram;
diff --git a/utils/storage/Makefile.am b/utils/storage/Makefile.am
index f8f03a5..cb54626 100644
--- a/utils/storage/Makefile.am
+++ b/utils/storage/Makefile.am
@@ -19,6 +19,7 @@ INCLUDES = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
@GLIB2_CPPFLAGS@
bin_PROGRAMS = gen_binary_files \
diff --git a/utils/storage/export_interpolation.cpp b/utils/storage/export_interpolation.cpp
index 60e27d6..10dcbdd 100644
--- a/utils/storage/export_interpolation.cpp
+++ b/utils/storage/export_interpolation.cpp
@@ -24,7 +24,7 @@
#include <assert.h>
#include <glib.h>
#include "pinyin_internal.h"
-
+#include "utils_helper.h"
/* export interpolation model as textual format */
@@ -47,20 +47,8 @@ int main(int argc, char * argv[]){
MemoryChunk * chunk = NULL;
FacadePhraseIndex phrase_index;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
Bigram bigram;
bigram.attach(bigram_filename, ATTACH_READONLY);
@@ -80,7 +68,7 @@ bool gen_unigram(FILE * output, FacadePhraseIndex * phrase_index) {
PhraseIndexRange range;
int result = phrase_index->get_range(i, range);
- if ( result )
+ if (ERROR_OK != result )
continue;
PhraseItem item;
diff --git a/utils/storage/import_interpolation.cpp b/utils/storage/import_interpolation.cpp
index 0bc45ba..89e2276 100644
--- a/utils/storage/import_interpolation.cpp
+++ b/utils/storage/import_interpolation.cpp
@@ -22,6 +22,7 @@
#include <stdio.h>
#include <glib.h>
#include "pinyin_internal.h"
+#include "utils_helper.h"
enum LINE_TYPE{
BEGIN_LINE = 1,
@@ -207,20 +208,8 @@ int main(int argc, char * argv[]){
phrases.load(chunk);
FacadePhraseIndex phrase_index;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
Bigram bigram;
retval = bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
diff --git a/utils/training/Makefile.am b/utils/training/Makefile.am
index 32ad26d..8503c63 100644
--- a/utils/training/Makefile.am
+++ b/utils/training/Makefile.am
@@ -21,6 +21,7 @@ INCLUDES = -I$(top_srcdir)/src \
-I$(top_srcdir)/src/include \
-I$(top_srcdir)/src/storage \
-I$(top_srcdir)/src/lookup \
+ -I$(top_srcdir)/utils \
@GLIB2_CPPFLAGS@
noinst_HEADERS = k_mixture_model.h
diff --git a/utils/training/estimate_interpolation.cpp b/utils/training/estimate_interpolation.cpp
index cd2ef96..a7ba100 100644
--- a/utils/training/estimate_interpolation.cpp
+++ b/utils/training/estimate_interpolation.cpp
@@ -26,6 +26,7 @@
#include <math.h>
#include <glib.h>
#include "pinyin_internal.h"
+#include "utils_helper.h"
parameter_t compute_interpolation(SingleGram * deleted_bigram,
FacadePhraseIndex * unigram,
@@ -63,7 +64,6 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram,
}
{
- guint32 freq = 0;
parameter_t elem_poss = 0;
PhraseItem item;
if (!unigram->get_phrase_item(token, item)){
@@ -90,21 +90,8 @@ parameter_t compute_interpolation(SingleGram * deleted_bigram,
int main(int argc, char * argv[]){
FacadePhraseIndex phrase_index;
- MemoryChunk * chunk = NULL;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
Bigram bigram;
bigram.attach("bigram.db", ATTACH_READONLY);
diff --git a/utils/training/eval_correction_rate.cpp b/utils/training/eval_correction_rate.cpp
index ada160f..1fa82b6 100644
--- a/utils/training/eval_correction_rate.cpp
+++ b/utils/training/eval_correction_rate.cpp
@@ -21,6 +21,7 @@
#include "pinyin_internal.h"
+#include "utils_helper.h"
void print_help(){
@@ -123,20 +124,8 @@ int main(int argc, char * argv[]){
largetable.load(options, chunk, NULL);
FacadePhraseIndex phrase_index;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
FacadePhraseTable phrases;
chunk = new MemoryChunk;
diff --git a/utils/training/export_k_mixture_model.cpp b/utils/training/export_k_mixture_model.cpp
index df09d24..a52eb46 100644
--- a/utils/training/export_k_mixture_model.cpp
+++ b/utils/training/export_k_mixture_model.cpp
@@ -21,6 +21,7 @@
#include "pinyin_internal.h"
#include "k_mixture_model.h"
+#include "utils_helper.h"
void print_help(){
printf("Usage: export_k_mixture_model [--k-mixture-model-file <FILENAME>]\n");
@@ -125,21 +126,8 @@ int main(int argc, char * argv[]){
}
FacadePhraseIndex phrase_index;
- MemoryChunk * chunk = NULL;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
diff --git a/utils/training/gen_ngram.cpp b/utils/training/gen_ngram.cpp
index 6de442b..93fae14 100644
--- a/utils/training/gen_ngram.cpp
+++ b/utils/training/gen_ngram.cpp
@@ -25,6 +25,7 @@
#include <locale.h>
#include <glib.h>
#include "pinyin_internal.h"
+#include "utils_helper.h"
void print_help(){
printf("Usage: gen_ngram [--skip-pi-gram-training]\n");
@@ -63,20 +64,8 @@ int main(int argc, char * argv[]){
phrases.load(chunk);
FacadePhraseIndex phrase_index;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
Bigram bigram;
bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);
diff --git a/utils/training/gen_unigram.cpp b/utils/training/gen_unigram.cpp
index 29a1bc6..b03235e 100644
--- a/utils/training/gen_unigram.cpp
+++ b/utils/training/gen_unigram.cpp
@@ -21,7 +21,7 @@
#include <stdio.h>
#include "pinyin_internal.h"
-
+#include "utils_helper.h"
/* increase all unigram frequency by a constant. */
@@ -29,20 +29,8 @@ int main(int argc, char * argv[]){
MemoryChunk * chunk = NULL;
FacadePhraseIndex phrase_index;
- for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
- const char * bin_file = pinyin_phrase_files[i];
- if (NULL == bin_file)
- continue;
-
- chunk = new MemoryChunk;
- bool retval = chunk->load(bin_file);
- if (!retval) {
- fprintf(stderr, "open %s failed!\n", bin_file);
- exit(ENOENT);
- }
-
- phrase_index.load(i, chunk);
- }
+ if (!init_phrase_index(&phrase_index))
+ exit(ENOENT);
/* Note: please increase the value when corpus size becomes larger.
* To avoid zero value when computing unigram frequency in float format.
diff --git a/utils/utils_helper.h b/utils/utils_helper.h
index e69de29..19ece80 100644
--- a/utils/utils_helper.h
+++ b/utils/utils_helper.h
@@ -0,0 +1,46 @@
+/*
+ * libpinyin
+ * Library to deal with pinyin.
+ *
+ * Copyright (C) 2012 Peng Wu <alexepico@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+
+#ifndef UTILS_HELPER_H
+#define UTILS_HELPER_H
+
+/*
+ * Load the phrase libraries listed in pinyin_phrase_files into phrase_index.
+ *
+ * For every index i below PHRASE_INDEX_LIBRARY_COUNT whose file name is
+ * non-NULL, the file is loaded into a new MemoryChunk and passed to
+ * phrase_index->load(i, chunk).
+ *
+ * Returns true when every listed file was loaded.  Returns false as soon
+ * as one file fails to load, after printing a message to stderr; libraries
+ * loaded before the failure are left in phrase_index.
+ */
+static bool init_phrase_index(FacadePhraseIndex * phrase_index) {
+    for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
+        const char * bin_file = pinyin_phrase_files[i];
+        /* skip slots that have no file name. */
+        if (NULL == bin_file)
+            continue;
+
+        MemoryChunk * chunk = new MemoryChunk;
+        if (!chunk->load(bin_file)) {
+            fprintf(stderr, "open %s failed!\n", bin_file);
+            /* the chunk was never handed to phrase_index, so release it
+               here instead of leaking it on the error path. */
+            delete chunk;
+            return false;
+        }
+
+        /* NOTE(review): phrase_index presumably takes ownership of chunk
+           from here on -- confirm against FacadePhraseIndex::load. */
+        phrase_index->load(i, chunk);
+    }
+    return true;
+}
+
+
+#endif