diff options
author | Reuben Thomas <rrt@sc3d.org> | 2017-07-23 14:25:19 +0100 |
---|---|---|
committer | Reuben Thomas <rrt@sc3d.org> | 2017-07-25 09:36:41 +0100 |
commit | cf998b0c4348518e1bff93bf6e3ef58d92162e9b (patch) | |
tree | c23d48b83d56d770c1a7e1be8375921f8a1485bb /src | |
parent | f33386baeb71b5ce3802465c7981fc92a394492e (diff) | |
download | enchant-cf998b0c4348518e1bff93bf6e3ef58d92162e9b.tar.gz |
Fix issue #17: add new APIs for per-dictionary character classes

Add enchant_dict_get_extra_word_characters, which returns a string of
non-letter characters that may occur in words, and
enchant_dict_is_word_character, which checks whether the given character is
valid as the first, last, or internal character in a word.
Diffstat (limited to 'src')
-rw-r--r-- | src/enchant-provider.h | 11 | ||||
-rw-r--r-- | src/enchant.c | 81 | ||||
-rw-r--r-- | src/enchant.h | 45 | ||||
-rw-r--r-- | src/lib.c | 68 |
4 files changed, 131 insertions, 74 deletions
diff --git a/src/enchant-provider.h b/src/enchant-provider.h index c6ae27e..ae3457d 100644 --- a/src/enchant-provider.h +++ b/src/enchant-provider.h @@ -112,7 +112,12 @@ struct str_enchant_dict const char *const cor, size_t cor_len); void (*add_to_exclude) (struct str_enchant_dict * me, - const char *const word, size_t len); + const char *const word, size_t len); + + const char * (*get_extra_word_characters) (struct str_enchant_dict * me); + + int (*is_word_character) (struct str_enchant_dict * me, + uint32_t uc_in, size_t n); }; struct str_enchant_provider @@ -124,7 +129,7 @@ struct str_enchant_provider void (*dispose) (struct str_enchant_provider * me); EnchantDict *(*request_dict) (struct str_enchant_provider * me, - const char *const tag); + const char *const tag); void (*dispose_dict) (struct str_enchant_provider * me, EnchantDict * dict); @@ -138,7 +143,7 @@ struct str_enchant_provider const char * (*describe) (struct str_enchant_provider * me); char ** (*list_dicts) (struct str_enchant_provider * me, - size_t * out_n_dicts); + size_t * out_n_dicts); }; #ifdef __cplusplus diff --git a/src/enchant.c b/src/enchant.c index f798b50..7581b6d 100644 --- a/src/enchant.c +++ b/src/enchant.c @@ -192,64 +192,9 @@ do_mode_l (FILE * out, EnchantDict * dict, GString * word, size_t lineCount) } -static int -is_word_char (gunichar uc, size_t n) -{ - GUnicodeType type; - - if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) { - return 1; - } - - type = g_unichar_type(uc); - - switch (type) { - case G_UNICODE_MODIFIER_LETTER: - case G_UNICODE_LOWERCASE_LETTER: - case G_UNICODE_TITLECASE_LETTER: - case G_UNICODE_UPPERCASE_LETTER: - case G_UNICODE_OTHER_LETTER: - case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */ - case G_UNICODE_ENCLOSING_MARK: - case G_UNICODE_NON_SPACING_MARK: - case G_UNICODE_DECIMAL_NUMBER: - case G_UNICODE_LETTER_NUMBER: - case G_UNICODE_OTHER_NUMBER: - case G_UNICODE_CONNECT_PUNCTUATION: - 
return 1; /* Enchant 1.3.0 defines word chars like this. */ - - case G_UNICODE_DASH_PUNCTUATION: - if ((n > 0) && (type == G_UNICODE_DASH_PUNCTUATION)) { - return 1; /* hyphens only accepted within a word. */ - } - /* Fallthrough */ - - case G_UNICODE_CONTROL: - case G_UNICODE_FORMAT: - case G_UNICODE_UNASSIGNED: - case G_UNICODE_PRIVATE_USE: - case G_UNICODE_SURROGATE: - case G_UNICODE_CLOSE_PUNCTUATION: - case G_UNICODE_FINAL_PUNCTUATION: - case G_UNICODE_INITIAL_PUNCTUATION: - case G_UNICODE_OTHER_PUNCTUATION: - case G_UNICODE_OPEN_PUNCTUATION: - case G_UNICODE_CURRENCY_SYMBOL: - case G_UNICODE_MODIFIER_SYMBOL: - case G_UNICODE_MATH_SYMBOL: - case G_UNICODE_OTHER_SYMBOL: - case G_UNICODE_LINE_SEPARATOR: - case G_UNICODE_PARAGRAPH_SEPARATOR: - case G_UNICODE_SPACE_SEPARATOR: - default: - return 0; - } -} - - /* Splits a line into a set of (word,word_position) tuples. */ static GSList * -tokenize_line (GString * line) +tokenize_line (EnchantDict * dict, GString * line) { GSList * tokens = NULL; char *utf = (char *) line->str; @@ -267,24 +212,24 @@ tokenize_line (GString * line) /* Skip non-word characters. */ cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf); uc = g_utf8_get_char (utf); - while (cur_pos < line->len && *utf && !is_word_char(uc,0)) { + while (cur_pos < line->len && *utf && !enchant_dict_is_word_character (dict, uc, 0)) { utf = g_utf8_next_char (utf); uc = g_utf8_get_char (utf); cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf); } start_pos = cur_pos; - /* Skip over word. */ - while (cur_pos < line->len && *utf && is_word_char(uc,1)) { + /* Skip over word characters. */ + while (cur_pos < line->len && *utf && enchant_dict_is_word_character (dict, uc, 1)) { g_string_append_unichar (word, uc); utf = g_utf8_next_char (utf); uc = g_utf8_get_char (utf); cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf); } - /* Do not accept one or more ' at the end of the word. 
*/ + /* Do not accept one or more ' at the end of the word. */ i = word->len-1; - while ((i >= 0) && (word->str[i] == '\'')) { + while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) { g_string_truncate (word, i); i--; } @@ -388,16 +333,18 @@ parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dic case '`': /* Enter verbose-correction mode */ break; - case '$': /* Save correction for rest of session [aspell extension] */ - { /* Syntax: $$ra <MISSPELLED>,<REPLACEMENT> */ - const gchar *prefix = "$$ra "; - if (g_str_has_prefix(str->str, prefix)) { + case '$': /* Miscellaneous commands */ + { + const gchar *prefix = "$$ra "; /* Save correction for rest of session [aspell extension] */ + if (g_str_has_prefix(str->str, prefix)) { /* Syntax: $$ra <MISSPELLED>,<REPLACEMENT> */ gchar *comma = g_utf8_strchr(str->str, -1, (gunichar)','); char *mis = str->str + strlen(prefix); char *cor = comma + 1; ssize_t mis_len = comma - mis; ssize_t cor_len = strlen(str->str) - (cor - str->str); enchant_dict_store_replacement(dict, mis, mis_len, cor, cor_len); + } else if (g_str_has_prefix(str->str, "$$wc")) { /* Return the extra word chars list */ + fprintf(out, "%s\n", enchant_dict_get_extra_word_characters(dict)); } } break; @@ -415,7 +362,7 @@ parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dic } if (mode != MODE_A || mode_A_no_command) { - token_ptr = tokens = tokenize_line (str); + token_ptr = tokens = tokenize_line (dict, str); if (tokens == NULL) putc('\n', out); while (tokens != NULL) { @@ -463,7 +410,7 @@ int main (int argc, char ** argv) FILE * fp = stdin; int countLines = 0; - gchar *dictionary = 0; /* -d dictionary */ + gchar *dictionary = NULL; /* -d dictionary */ /* Initialize system locale */ setlocale(LC_ALL, ""); diff --git a/src/enchant.h b/src/enchant.h index 704412f..9f00255 100644 --- a/src/enchant.h +++ b/src/enchant.h @@ -30,6 +30,7 @@ #ifndef ENCHANT_H #define ENCHANT_H +#include 
<stdint.h> /* for uint32_t */ #include <sys/types.h> /* for size_t, ssize_t */ @@ -259,17 +260,53 @@ void enchant_dict_free_string_list (EnchantDict * dict, char **string_list); /** * enchant_dict_get_error - * @dict: A non-null dictionary + * @dict: A non-null #EnchantDict * * Returns a const char string or NULL describing the last exception in UTF8 encoding. * WARNING: error is transient. It will likely be cleared as soon as - * the next dictionary operation is called - * - * Returns: an error message + * the next dictionary operation is called. */ const char *enchant_dict_get_error (EnchantDict * dict); /** + * enchant_dict_get_extra_word_characters + * @dict: A non-null #EnchantDict + * + * Returns a const char UTF-8-encoded string containing the non-letter characters + * allowed in a word, e.g. "01234567890’-". If dash occurs, it will be last, so that + * the string can be appended to a character class used to match word characters. + * + * Words containing non-letters not in this string will automatically be rejected + * by Enchant. + * + * Note that for some back-ends the result may be a guess, in which case it + * may include characters not actually allowed in the given dictionary. + */ +const char *enchant_dict_get_extra_word_characters (EnchantDict * dict); + +/** + * enchant_dict_is_word_character + * @dict: An #EnchantDict, or %null + * @uc: A unicode code-point + * @n: An integer: 0 if the character is at the start of a word, 1 if it is + * in the middle, or 2 if at the end. + * + * Returns a flag specifying whether the given character is valid at the + * given position. + * + * One way to match a complete word is to check that the first character matches + * with n == 0, then proceed matching characters with n == 1 until failure, then + * proceed backwards until a character matches with n == 2. + * + * Note that for some back-ends the result may be a guess, in which case it + * may allow characters not actually allowed in the given dictionary. 
+ * + * If @dict is %null, a built-in implementation is used (FIXME: We should document + * behavior for this). If @n is not 0, 1 or 2, then a false flag is returned. + */ +int enchant_dict_is_word_character (EnchantDict * dict, uint32_t uc, size_t n); + +/** * EnchantDictDescribeFn * @lang_tag: The dictionary's language tag (eg: en_US, de_AT, ...) * @provider_name: The provider's name (eg: Aspell) in UTF8 encoding @@ -1323,6 +1323,74 @@ enchant_broker_dict_exists (EnchantBroker * broker, return exists; } +_GL_ATTRIBUTE_PURE const char * +enchant_dict_get_extra_word_characters (EnchantDict *dict) +{ + g_return_val_if_fail (dict, NULL); + + return dict->get_extra_word_characters ? (*dict->get_extra_word_characters) (dict) : ""; +} + +_GL_ATTRIBUTE_PURE int +enchant_dict_is_word_character (EnchantDict * dict, uint32_t uc_in, size_t n) +{ + g_return_val_if_fail (n <= 2, 0); + + if (dict && dict->is_word_character) + return (*dict->is_word_character) (dict, uc_in, n); + + gunichar uc = (gunichar)uc_in; + + /* Accept quote marks anywhere except at the end of a word */ + if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) { + return n < 2; + } + + GUnicodeType type = g_unichar_type(uc); + + switch (type) { + case G_UNICODE_MODIFIER_LETTER: + case G_UNICODE_LOWERCASE_LETTER: + case G_UNICODE_TITLECASE_LETTER: + case G_UNICODE_UPPERCASE_LETTER: + case G_UNICODE_OTHER_LETTER: + case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */ + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_NON_SPACING_MARK: + case G_UNICODE_DECIMAL_NUMBER: + case G_UNICODE_LETTER_NUMBER: + case G_UNICODE_OTHER_NUMBER: + case G_UNICODE_CONNECT_PUNCTUATION: + return 1; /* Enchant 1.3.0 defines word chars like this. */ + + case G_UNICODE_DASH_PUNCTUATION: + if ((n == 1) && (type == G_UNICODE_DASH_PUNCTUATION)) { + return 1; /* hyphens only accepted within a word. 
*/ + } + /* Fallthrough */ + + case G_UNICODE_CONTROL: + case G_UNICODE_FORMAT: + case G_UNICODE_UNASSIGNED: + case G_UNICODE_PRIVATE_USE: + case G_UNICODE_SURROGATE: + case G_UNICODE_CLOSE_PUNCTUATION: + case G_UNICODE_FINAL_PUNCTUATION: + case G_UNICODE_INITIAL_PUNCTUATION: + case G_UNICODE_OTHER_PUNCTUATION: + case G_UNICODE_OPEN_PUNCTUATION: + case G_UNICODE_CURRENCY_SYMBOL: + case G_UNICODE_MODIFIER_SYMBOL: + case G_UNICODE_MATH_SYMBOL: + case G_UNICODE_OTHER_SYMBOL: + case G_UNICODE_LINE_SEPARATOR: + case G_UNICODE_PARAGRAPH_SEPARATOR: + case G_UNICODE_SPACE_SEPARATOR: + default: + return 0; + } +} + void enchant_broker_set_ordering (EnchantBroker * broker, const char * const tag, |