summary refs log tree commit diff
diff options
context:
space:
mode:
author Peng Wu <alexepico@gmail.com> 2022-08-29 14:17:27 +0800
committer Peng Wu <alexepico@gmail.com> 2022-08-29 14:20:46 +0800
commit 008794423b00488ee45fb204bf5c6fc5205475cb (patch)
tree bdc0453434c847c76d2d607f7daead22ccb850b1
parent 3e857c0a978133ad1a59eec1788f5fd6f1e7f187 (diff)
download libpinyin-008794423b00488ee45fb204bf5c6fc5205475cb.tar.gz
Fix pinyin.cpp
-rw-r--r-- src/pinyin.cpp 65
-rw-r--r-- src/storage/phrase_large_table3_bdb.cpp 2
2 files changed, 36 insertions, 31 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 1446977..76ab61a 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -1439,9 +1439,9 @@ static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
item.get_phrase_string(buffer);
guint length = item.get_phrase_length();
if (len)
- *len = length;
+ *len = length - begin;
if (utf8_str)
- *utf8_str = g_ucs4_to_utf8(buffer + begin, length, NULL, NULL, NULL);
+ *utf8_str = g_ucs4_to_utf8(buffer + begin, length - begin, NULL, NULL, NULL);
return true;
}
@@ -2047,39 +2047,38 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
break;
}
- if (0 == merged_gram.get_length())
- return false;
+ if (0 != merged_gram.get_length()) {
- /* retrieve all items. */
- BigramPhraseWithCountArray tokens = g_array_new
- (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
- merged_gram.retrieve_all(tokens);
+ /* retrieve all items. */
+ BigramPhraseWithCountArray tokens = g_array_new
+ (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+ merged_gram.retrieve_all(tokens);
- /* sort the longer word first. */
- PhraseItem cached_item;
- for (ssize_t len = length; len > 0; --len) {
- /* append items. */
- for (size_t k = 0; k < tokens->len; ++k){
- BigramPhraseItemWithCount * phrase_item = &g_array_index
- (tokens, BigramPhraseItemWithCount, k);
+ /* sort the longer word first. */
+ PhraseItem cached_item;
+ for (ssize_t len = length; len > 0; --len) {
+ /* append items. */
+ for (size_t k = 0; k < tokens->len; ++k){
+ BigramPhraseItemWithCount * phrase_item = &g_array_index
+ (tokens, BigramPhraseItemWithCount, k);
- if (phrase_item->m_count < filter)
- continue;
+ if (phrase_item->m_count < filter)
+ continue;
- int result = phrase_index->get_phrase_item
- (phrase_item->m_token, cached_item);
- if (ERROR_NO_SUB_PHRASE_INDEX == result)
- continue;
+ int result = phrase_index->get_phrase_item
+ (phrase_item->m_token, cached_item);
+ if (ERROR_NO_SUB_PHRASE_INDEX == result)
+ continue;
- if (len != cached_item.get_phrase_length())
- continue;
+ if (len != cached_item.get_phrase_length())
+ continue;
- lookup_candidate_t item;
- item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE;
- item.m_token = phrase_item->m_token;
- g_array_append_val(candidates, item);
+ lookup_candidate_t item;
+ item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE;
+ item.m_token = phrase_item->m_token;
+ g_array_append_val(candidates, item);
+ }
}
-
}
/* search prefix candidate. */
@@ -2129,7 +2128,8 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
int pinyin_choose_candidate(pinyin_instance_t * instance,
size_t offset,
lookup_candidate_t * candidate){
- assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type);
+ assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type &&
+ PREDICTED_PREFIX_CANDIDATE != candidate->m_candidate_type);
pinyin_context_t * context = instance->m_context;
PhoneticKeyMatrix & matrix = instance->m_matrix;
@@ -2190,7 +2190,8 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
lookup_candidate_t * candidate){
- assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type);
+ assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type ||
+ PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type);
const guint32 initial_seed = 23 * 3;
const guint32 unigram_factor = 7;
@@ -2205,6 +2206,10 @@ bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
if (ERROR_INTEGER_OVERFLOW == error)
return false;
+ /* The prefix candidate only trains uni-gram frequency. */
+ if (PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type)
+ return true;
+
phrase_token_t prev_token = _get_previous_token(instance, 0);
if (null_token == prev_token)
return false;
diff --git a/src/storage/phrase_large_table3_bdb.cpp b/src/storage/phrase_large_table3_bdb.cpp
index da2e6f2..9074170 100644
--- a/src/storage/phrase_large_table3_bdb.cpp
+++ b/src/storage/phrase_large_table3_bdb.cpp
@@ -230,7 +230,7 @@ int PhraseLargeTable3::search_suggestion(int phrase_length,
DBT db_data;
memset(&db_data, 0, sizeof(DBT));
/* Get the prefix entry */
- ret = cursorp->c_get(cursorp, &db_key1, &db_data, 0);
+ ret = cursorp->c_get(cursorp, &db_key1, &db_data, DB_SET);
if (ret != 0) {
cursorp->c_close(cursorp);
return result;