summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorReuben Thomas <rrt@sc3d.org>2017-07-23 14:25:19 +0100
committerReuben Thomas <rrt@sc3d.org>2017-07-25 09:36:41 +0100
commitcf998b0c4348518e1bff93bf6e3ef58d92162e9b (patch)
treec23d48b83d56d770c1a7e1be8375921f8a1485bb /src
parentf33386baeb71b5ce3802465c7981fc92a394492e (diff)
downloadenchant-cf998b0c4348518e1bff93bf6e3ef58d92162e9b.tar.gz
Fix issue #17: add new APIs for per-dictionary character classes
Add enchant_dict_get_extra_word_characters, which returns a string of non-letter characters that may occur in words, and enchant_dict_is_word_character, which checks whether the given character is valid as the first, last, or internal character in a word.
Diffstat (limited to 'src')
-rw-r--r--src/enchant-provider.h11
-rw-r--r--src/enchant.c81
-rw-r--r--src/enchant.h45
-rw-r--r--src/lib.c68
4 files changed, 131 insertions, 74 deletions
diff --git a/src/enchant-provider.h b/src/enchant-provider.h
index c6ae27e..ae3457d 100644
--- a/src/enchant-provider.h
+++ b/src/enchant-provider.h
@@ -112,7 +112,12 @@ struct str_enchant_dict
const char *const cor, size_t cor_len);
void (*add_to_exclude) (struct str_enchant_dict * me,
- const char *const word, size_t len);
+ const char *const word, size_t len);
+
+ const char * (*get_extra_word_characters) (struct str_enchant_dict * me);
+
+ int (*is_word_character) (struct str_enchant_dict * me,
+ uint32_t uc_in, size_t n);
};
struct str_enchant_provider
@@ -124,7 +129,7 @@ struct str_enchant_provider
void (*dispose) (struct str_enchant_provider * me);
EnchantDict *(*request_dict) (struct str_enchant_provider * me,
- const char *const tag);
+ const char *const tag);
void (*dispose_dict) (struct str_enchant_provider * me,
EnchantDict * dict);
@@ -138,7 +143,7 @@ struct str_enchant_provider
const char * (*describe) (struct str_enchant_provider * me);
char ** (*list_dicts) (struct str_enchant_provider * me,
- size_t * out_n_dicts);
+ size_t * out_n_dicts);
};
#ifdef __cplusplus
diff --git a/src/enchant.c b/src/enchant.c
index f798b50..7581b6d 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -192,64 +192,9 @@ do_mode_l (FILE * out, EnchantDict * dict, GString * word, size_t lineCount)
}
-static int
-is_word_char (gunichar uc, size_t n)
-{
- GUnicodeType type;
-
- if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) {
- return 1;
- }
-
- type = g_unichar_type(uc);
-
- switch (type) {
- case G_UNICODE_MODIFIER_LETTER:
- case G_UNICODE_LOWERCASE_LETTER:
- case G_UNICODE_TITLECASE_LETTER:
- case G_UNICODE_UPPERCASE_LETTER:
- case G_UNICODE_OTHER_LETTER:
- case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */
- case G_UNICODE_ENCLOSING_MARK:
- case G_UNICODE_NON_SPACING_MARK:
- case G_UNICODE_DECIMAL_NUMBER:
- case G_UNICODE_LETTER_NUMBER:
- case G_UNICODE_OTHER_NUMBER:
- case G_UNICODE_CONNECT_PUNCTUATION:
- return 1; /* Enchant 1.3.0 defines word chars like this. */
-
- case G_UNICODE_DASH_PUNCTUATION:
- if ((n > 0) && (type == G_UNICODE_DASH_PUNCTUATION)) {
- return 1; /* hyphens only accepted within a word. */
- }
- /* Fallthrough */
-
- case G_UNICODE_CONTROL:
- case G_UNICODE_FORMAT:
- case G_UNICODE_UNASSIGNED:
- case G_UNICODE_PRIVATE_USE:
- case G_UNICODE_SURROGATE:
- case G_UNICODE_CLOSE_PUNCTUATION:
- case G_UNICODE_FINAL_PUNCTUATION:
- case G_UNICODE_INITIAL_PUNCTUATION:
- case G_UNICODE_OTHER_PUNCTUATION:
- case G_UNICODE_OPEN_PUNCTUATION:
- case G_UNICODE_CURRENCY_SYMBOL:
- case G_UNICODE_MODIFIER_SYMBOL:
- case G_UNICODE_MATH_SYMBOL:
- case G_UNICODE_OTHER_SYMBOL:
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- case G_UNICODE_SPACE_SEPARATOR:
- default:
- return 0;
- }
-}
-
-
/* Splits a line into a set of (word,word_position) tuples. */
static GSList *
-tokenize_line (GString * line)
+tokenize_line (EnchantDict * dict, GString * line)
{
GSList * tokens = NULL;
char *utf = (char *) line->str;
@@ -267,24 +212,24 @@ tokenize_line (GString * line)
/* Skip non-word characters. */
cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
uc = g_utf8_get_char (utf);
- while (cur_pos < line->len && *utf && !is_word_char(uc,0)) {
+ while (cur_pos < line->len && *utf && !enchant_dict_is_word_character (dict, uc, 0)) {
utf = g_utf8_next_char (utf);
uc = g_utf8_get_char (utf);
cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
}
start_pos = cur_pos;
- /* Skip over word. */
- while (cur_pos < line->len && *utf && is_word_char(uc,1)) {
+ /* Skip over word characters. */
+ while (cur_pos < line->len && *utf && enchant_dict_is_word_character (dict, uc, 1)) {
g_string_append_unichar (word, uc);
utf = g_utf8_next_char (utf);
uc = g_utf8_get_char (utf);
cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
}
- /* Do not accept one or more ' at the end of the word. */
+ /* Strip trailing characters that are not valid at the end of a word (e.g. ' or a dash). */
i = word->len-1;
- while ((i >= 0) && (word->str[i] == '\'')) {
+ while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) {
g_string_truncate (word, i);
i--;
}
@@ -388,16 +333,18 @@ parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dic
case '`': /* Enter verbose-correction mode */
break;
- case '$': /* Save correction for rest of session [aspell extension] */
- { /* Syntax: $$ra <MISSPELLED>,<REPLACEMENT> */
- const gchar *prefix = "$$ra ";
- if (g_str_has_prefix(str->str, prefix)) {
+ case '$': /* Miscellaneous commands */
+ {
+ const gchar *prefix = "$$ra "; /* Save correction for rest of session [aspell extension] */
+ if (g_str_has_prefix(str->str, prefix)) { /* Syntax: $$ra <MISSPELLED>,<REPLACEMENT> */
gchar *comma = g_utf8_strchr(str->str, -1, (gunichar)',');
char *mis = str->str + strlen(prefix);
char *cor = comma + 1;
ssize_t mis_len = comma - mis;
ssize_t cor_len = strlen(str->str) - (cor - str->str);
enchant_dict_store_replacement(dict, mis, mis_len, cor, cor_len);
+ } else if (g_str_has_prefix(str->str, "$$wc")) { /* Return the extra word chars list */
+ fprintf(out, "%s\n", enchant_dict_get_extra_word_characters(dict));
}
}
break;
@@ -415,7 +362,7 @@ parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dic
}
if (mode != MODE_A || mode_A_no_command) {
- token_ptr = tokens = tokenize_line (str);
+ token_ptr = tokens = tokenize_line (dict, str);
if (tokens == NULL)
putc('\n', out);
while (tokens != NULL) {
@@ -463,7 +410,7 @@ int main (int argc, char ** argv)
FILE * fp = stdin;
int countLines = 0;
- gchar *dictionary = 0; /* -d dictionary */
+ gchar *dictionary = NULL; /* -d dictionary */
/* Initialize system locale */
setlocale(LC_ALL, "");
diff --git a/src/enchant.h b/src/enchant.h
index 704412f..9f00255 100644
--- a/src/enchant.h
+++ b/src/enchant.h
@@ -30,6 +30,7 @@
#ifndef ENCHANT_H
#define ENCHANT_H
+#include <stdint.h> /* for uint32_t */
#include <sys/types.h> /* for size_t, ssize_t */
@@ -259,17 +260,53 @@ void enchant_dict_free_string_list (EnchantDict * dict, char **string_list);
/**
* enchant_dict_get_error
- * @dict: A non-null dictionary
+ * @dict: A non-null #EnchantDict
*
* Returns a const char string or NULL describing the last exception in UTF8 encoding.
* WARNING: error is transient. It will likely be cleared as soon as
- * the next dictionary operation is called
- *
- * Returns: an error message
+ * the next dictionary operation is called.
*/
const char *enchant_dict_get_error (EnchantDict * dict);
/**
+ * enchant_dict_get_extra_word_characters
+ * @dict: A non-null #EnchantDict
+ *
+ * Returns a const char UTF-8-encoded string containing the non-letter characters
+ * allowed in a word, e.g. "0123456789’-". If dash occurs, it will be last, so that
+ * the string can be appended to a character class used to match word characters.
+ *
+ * Words containing non-letters not in this string will automatically be rejected
+ * by Enchant.
+ *
+ * Note that for some back-ends the result may be a guess, in which case it
+ * may include characters not actually allowed in the given dictionary.
+ */
+const char *enchant_dict_get_extra_word_characters (EnchantDict * dict);
+
+/**
+ * enchant_dict_is_word_character
+ * @dict: An #EnchantDict, or %NULL
+ * @uc: A unicode code-point
+ * @n: An integer: 0 if the character is at the start of a word, 1 if it is
+ * in the middle, or 2 if at the end.
+ *
+ * Returns a flag specifying whether the given character is valid at the
+ * given position.
+ *
+ * One way to match a complete word is to check that the first character matches
+ * with n == 0, then proceed matching characters with n == 1 until failure, then
+ * proceed backwards until a character matches with n == 2.
+ *
+ * Note that for some back-ends the result may be a guess, in which case it
+ * may allow characters not actually allowed in the given dictionary.
+ *
+ * If @dict is %NULL, a built-in implementation is used (FIXME: We should document
+ * behavior for this). If @n is not 0, 1 or 2, then a false flag is returned.
+ */
+int enchant_dict_is_word_character (EnchantDict * dict, uint32_t uc, size_t n);
+
+/**
* EnchantDictDescribeFn
* @lang_tag: The dictionary's language tag (eg: en_US, de_AT, ...)
* @provider_name: The provider's name (eg: Aspell) in UTF8 encoding
diff --git a/src/lib.c b/src/lib.c
index d521ae8..395d4ba 100644
--- a/src/lib.c
+++ b/src/lib.c
@@ -1323,6 +1323,74 @@ enchant_broker_dict_exists (EnchantBroker * broker,
return exists;
}
+_GL_ATTRIBUTE_PURE const char *
+enchant_dict_get_extra_word_characters (EnchantDict *dict)
+{ /* Returns the dictionary's extra word characters as reported by its get_extra_word_characters hook. */
+ g_return_val_if_fail (dict, NULL); /* a NULL dict yields NULL, not "" */
+ /* A dictionary whose get_extra_word_characters hook is unset reports no extra characters (""). */
+ return dict->get_extra_word_characters ? (*dict->get_extra_word_characters) (dict) : "";
+}
+
+_GL_ATTRIBUTE_PURE int
+enchant_dict_is_word_character (EnchantDict * dict, uint32_t uc_in, size_t n)
+{ /* Decides whether code point uc_in may appear at word position n; yields the hook's result, or 1/0 from the built-in rules. */
+ g_return_val_if_fail (n <= 2, 0); /* n is a position code: 0 = start, 1 = middle, 2 = end; larger values return 0 */
+ /* A dictionary that supplies its own is_word_character hook takes precedence over the built-in rules. */
+ if (dict && dict->is_word_character)
+ return (*dict->is_word_character) (dict, uc_in, n);
+ /* NULL dict, or no hook set: fall back to the Unicode-category rules below. */
+ gunichar uc = (gunichar)uc_in;
+
+ /* Accept quote marks anywhere except at the end of a word */
+ if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) {
+ return n < 2;
+ }
+ /* Every other character is classified by its Unicode type as reported by g_unichar_type. */
+ GUnicodeType type = g_unichar_type(uc);
+
+ switch (type) {
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ case G_UNICODE_CONNECT_PUNCTUATION:
+ return 1; /* Enchant 1.3.0 defines word chars like this. */
+ /* Dashes count only mid-word (n == 1); the type re-test below is redundant inside this case label. */
+ case G_UNICODE_DASH_PUNCTUATION:
+ if ((n == 1) && (type == G_UNICODE_DASH_PUNCTUATION)) {
+ return 1; /* hyphens only accepted within a word. */
+ }
+ /* Fallthrough */
+ /* The remaining types, and any type not listed, are never accepted as word characters. */
+ case G_UNICODE_CONTROL:
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_UNASSIGNED:
+ case G_UNICODE_PRIVATE_USE:
+ case G_UNICODE_SURROGATE:
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ case G_UNICODE_FINAL_PUNCTUATION:
+ case G_UNICODE_INITIAL_PUNCTUATION:
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_OPEN_PUNCTUATION:
+ case G_UNICODE_CURRENCY_SYMBOL:
+ case G_UNICODE_MODIFIER_SYMBOL:
+ case G_UNICODE_MATH_SYMBOL:
+ case G_UNICODE_OTHER_SYMBOL:
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_SPACE_SEPARATOR:
+ default:
+ return 0;
+ }
+}
+
void
enchant_broker_set_ordering (EnchantBroker * broker,
const char * const tag,