diff options
Diffstat (limited to 'utils/segment')
-rw-r--r-- | utils/segment/ngseg.cpp | 6 | ||||
-rw-r--r-- | utils/segment/spseg.cpp | 8 |
2 files changed, 7 insertions, 7 deletions
diff --git a/utils/segment/ngseg.cpp b/utils/segment/ngseg.cpp
index 6acde10..d5a825b 100644
--- a/utils/segment/ngseg.cpp
+++ b/utils/segment/ngseg.cpp
@@ -27,9 +27,9 @@
 /* n-gram based sentence segment. */
 
 /* Note:
- * Currently libpinyin only supports ucs2 characters, as this is a
+ * Currently libpinyin only supports ucs4 characters, as this is a
  * pre-processor tool for raw corpus, it will skip all sentences
- * which contains non-ucs2 characters.
+ * which contains non-ucs4 characters.
  */
 
 /* TODO:
@@ -146,7 +146,7 @@ int main(int argc, char * argv[]){
             linebuf[strlen(linebuf) - 1] = '\0';
         }
 
-        //check non-ucs2 characters
+        //check non-ucs4 characters
         const glong num_of_chars = g_utf8_strlen(linebuf, -1);
         glong len = 0;
         ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
diff --git a/utils/segment/spseg.cpp b/utils/segment/spseg.cpp
index 448ce33..4a03287 100644
--- a/utils/segment/spseg.cpp
+++ b/utils/segment/spseg.cpp
@@ -28,9 +28,9 @@
 /* graph shortest path sentence segment. */
 
 /* Note:
- * Currently libpinyin only supports ucs2 characters, as this is a
+ * Currently libpinyin only supports ucs4 characters, as this is a
  * pre-processor tool for raw corpus, it will skip all sentences
- * which contains non-ucs2 characters.
+ * which contains non-ucs4 characters.
  */
 
 struct SegmentStep{
@@ -162,12 +162,12 @@ int main(int argc, char * argv[]){
             linebuf[strlen(linebuf) - 1] = '\0';
         }
 
-        //check non-ucs2 characters
+        //check non-ucs4 characters
         const glong num_of_chars = g_utf8_strlen(linebuf, -1);
         glong len = 0;
         ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
         if ( len != num_of_chars ) {
-            fprintf(stderr, "non-ucs2 characters encountered:%s.\n", linebuf);
+            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
             printf("\n");
             continue;
         }