summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeng Wu <alexepico@gmail.com>2022-09-26 14:27:40 +0800
committerPeng Wu <alexepico@gmail.com>2022-09-26 14:27:40 +0800
commit9a0b828affd447aed97b571261de8409d1143d18 (patch)
tree6287b2cdae2c3c5144a658d1884cc0c3b1d2f8c9
parentdc17f530f01b0376343beaea4513ae1855620d21 (diff)
downloadlibpinyin-9a0b828affd447aed97b571261de8409d1143d18.tar.gz
Update pinyin_choose_candidate function
-rw-r--r--src/pinyin.cpp27
1 files changed, 24 insertions, 3 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 61cdb5f..9dd784b 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -86,6 +86,9 @@ struct _pinyin_instance_t{
NBestMatchResults m_nbest_results;
TokenVector m_phrase_result;
CandidateVector m_candidates;
+
+ /* cache the sort option here. */
+ guint m_sort_option;
};
struct _lookup_candidate_t{
@@ -1134,6 +1137,9 @@ pinyin_instance_t * pinyin_alloc_instance(pinyin_context_t * context){
instance->m_candidates =
g_array_new(TRUE, TRUE, sizeof(lookup_candidate_t));
+ instance->m_sort_option =
+ SORT_BY_PHRASE_LENGTH | SORT_BY_PINYIN_LENGTH | SORT_BY_FREQUENCY;
+
return instance;
}
@@ -1993,6 +1999,8 @@ bool pinyin_guess_candidates(pinyin_instance_t * instance,
if (0 == matrix.size())
return false;
+ instance->m_sort_option = sort_option;
+
/* lookup the previous token here. */
phrase_token_t prev_token = null_token;
@@ -2230,6 +2238,9 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
int pinyin_choose_candidate(pinyin_instance_t * instance,
size_t offset,
lookup_candidate_t * candidate){
+ const guint32 initial_seed = 23 * 3;
+ const guint32 unigram_factor = 7;
+
assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type &&
PREDICTED_PREFIX_CANDIDATE != candidate->m_candidate_type);
@@ -2248,9 +2259,6 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
if (LONGER_CANDIDATE == candidate->m_candidate_type) {
/* only train uni-gram for longer candidate. */
- const guint32 initial_seed = 23 * 3;
- const guint32 unigram_factor = 7;
-
phrase_token_t token = candidate->m_token;
int error = context->m_phrase_index->add_unigram_frequency
(token, initial_seed * unigram_factor);
@@ -2291,6 +2299,19 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
candidate->m_token = token;
}
+ if (instance->m_sort_option & SORT_WITHOUT_SENTENCE_CANDIDATE) {
+ assert(0 == offset);
+
+ /* only train uni-gram. */
+ phrase_token_t token = candidate->m_token;
+ int error = context->m_phrase_index->add_unigram_frequency
+ (token, initial_seed * unigram_factor);
+ if (ERROR_INTEGER_OVERFLOW == error)
+ return false;
+
+ return true;
+ }
+
/* sync m_constraints to the length of m_pinyin_keys. */
bool retval = constraints->validate_constraint(&matrix);