Fix pinyin.cpp

author: Peng Wu <alexepico@gmail.com> 2022-08-29 14:17:27 +0800
committer: Peng Wu <alexepico@gmail.com> 2022-08-29 14:20:46 +0800
commit: 008794423b00488ee45fb204bf5c6fc5205475cb (patch)
tree: bdc0453434c847c76d2d607f7daead22ccb850b1
parent: 3e857c0a978133ad1a59eec1788f5fd6f1e7f187 (diff)
download: libpinyin-008794423b00488ee45fb204bf5c6fc5205475cb.tar.gz
2 files changed, 36 insertions, 31 deletions
diff --git a/src/pinyin.cpp b/src/pinyin.cpp
index 1446977..76ab61a 100644
--- a/src/pinyin.cpp
+++ b/src/pinyin.cpp
@@ -1439,9 +1439,9 @@ static bool _token_get_phrase(FacadePhraseIndex * phrase_index,
     item.get_phrase_string(buffer);
     guint length = item.get_phrase_length();
     if (len)
-        *len = length;
+        *len = length - begin;
     if (utf8_str)
-        *utf8_str = g_ucs4_to_utf8(buffer + begin, length, NULL, NULL, NULL);
+        *utf8_str = g_ucs4_to_utf8(buffer + begin, length - begin, NULL, NULL, NULL);
     return true;
 }
 
@@ -2047,39 +2047,38 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
             break;
     }
 
-    if (0 == merged_gram.get_length())
-        return false;
+    if (0 != merged_gram.get_length()) {
 
-    /* retrieve all items. */
-    BigramPhraseWithCountArray tokens = g_array_new
-        (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
-    merged_gram.retrieve_all(tokens);
+        /* retrieve all items. */
+        BigramPhraseWithCountArray tokens = g_array_new
+            (FALSE, FALSE, sizeof(BigramPhraseItemWithCount));
+        merged_gram.retrieve_all(tokens);
 
-    /* sort the longer word first. */
-    PhraseItem cached_item;
-    for (ssize_t len = length; len > 0; --len) {
-        /* append items. */
-        for (size_t k = 0; k < tokens->len; ++k){
-            BigramPhraseItemWithCount * phrase_item = &g_array_index
-                (tokens, BigramPhraseItemWithCount, k);
+        /* sort the longer word first. */
+        PhraseItem cached_item;
+        for (ssize_t len = length; len > 0; --len) {
+            /* append items. */
+            for (size_t k = 0; k < tokens->len; ++k){
+                BigramPhraseItemWithCount * phrase_item = &g_array_index
+                    (tokens, BigramPhraseItemWithCount, k);
 
-            if (phrase_item->m_count < filter)
-                continue;
+                if (phrase_item->m_count < filter)
+                    continue;
 
-            int result = phrase_index->get_phrase_item
-                (phrase_item->m_token, cached_item);
-            if (ERROR_NO_SUB_PHRASE_INDEX == result)
-                continue;
+                int result = phrase_index->get_phrase_item
+                    (phrase_item->m_token, cached_item);
+                if (ERROR_NO_SUB_PHRASE_INDEX == result)
+                    continue;
 
-            if (len != cached_item.get_phrase_length())
-                continue;
+                if (len != cached_item.get_phrase_length())
+                    continue;
 
-            lookup_candidate_t item;
-            item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE;
-            item.m_token = phrase_item->m_token;
-            g_array_append_val(candidates, item);
+                lookup_candidate_t item;
+                item.m_candidate_type = PREDICTED_BIGRAM_CANDIDATE;
+                item.m_token = phrase_item->m_token;
+                g_array_append_val(candidates, item);
+            }
         }
-
     }
 
     /* search prefix candidate. */
@@ -2129,7 +2128,8 @@ bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance,
 int pinyin_choose_candidate(pinyin_instance_t * instance,
                             size_t offset,
                             lookup_candidate_t * candidate){
-    assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type);
+    assert(PREDICTED_BIGRAM_CANDIDATE != candidate->m_candidate_type &&
+           PREDICTED_PREFIX_CANDIDATE != candidate->m_candidate_type);
 
     pinyin_context_t * context = instance->m_context;
     PhoneticKeyMatrix & matrix = instance->m_matrix;
@@ -2190,7 +2190,8 @@ int pinyin_choose_candidate(pinyin_instance_t * instance,
 
 bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
                                        lookup_candidate_t * candidate){
-    assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type);
+    assert(PREDICTED_BIGRAM_CANDIDATE == candidate->m_candidate_type ||
+           PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type);
 
     const guint32 initial_seed = 23 * 3;
     const guint32 unigram_factor = 7;
@@ -2205,6 +2206,10 @@ bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance,
     if (ERROR_INTEGER_OVERFLOW == error)
         return false;
 
+    /* The prefix candidate only trains uni-gram frequency. */
+    if (PREDICTED_PREFIX_CANDIDATE == candidate->m_candidate_type)
+        return true;
+
     phrase_token_t prev_token = _get_previous_token(instance, 0);
     if (null_token == prev_token)
         return false;
diff --git a/src/storage/phrase_large_table3_bdb.cpp b/src/storage/phrase_large_table3_bdb.cpp
index da2e6f2..9074170 100644
--- a/src/storage/phrase_large_table3_bdb.cpp
+++ b/src/storage/phrase_large_table3_bdb.cpp
@@ -230,7 +230,7 @@ int PhraseLargeTable3::search_suggestion(int phrase_length,
     DBT db_data;
     memset(&db_data, 0, sizeof(DBT));
     /* Get the prefix entry */
-    ret = cursorp->c_get(cursorp, &db_key1, &db_data, 0);
+    ret = cursorp->c_get(cursorp, &db_key1, &db_data, DB_SET);
     if (ret != 0) {
         cursorp->c_close(cursorp);
         return result;
author	Peng Wu <alexepico@gmail.com>	2022-08-29 14:17:27 +0800
committer	Peng Wu <alexepico@gmail.com>	2022-08-29 14:20:46 +0800
commit	008794423b00488ee45fb204bf5c6fc5205475cb (patch)
tree	bdc0453434c847c76d2d607f7daead22ccb850b1
parent	3e857c0a978133ad1a59eec1788f5fd6f1e7f187 (diff)
download	libpinyin-008794423b00488ee45fb204bf5c6fc5205475cb.tar.gz