Fix issue #17: add new APIs for per-dictionary character classes

Add enchant_dict_get_extra_word_characters, which returns a string of non-letter characters that may occur in words, and enchant_dict_is_word_character, which checks whether the given character is valid as the first, last, or internal character in a word.
author: Reuben Thomas <rrt@sc3d.org> 2017-07-23 14:25:19 +0100
committer: Reuben Thomas <rrt@sc3d.org> 2017-07-25 09:36:41 +0100
commit: cf998b0c4348518e1bff93bf6e3ef58d92162e9b (patch)
tree: c23d48b83d56d770c1a7e1be8375921f8a1485bb /src
parent: f33386baeb71b5ce3802465c7981fc92a394492e (diff)
download: enchant-cf998b0c4348518e1bff93bf6e3ef58d92162e9b.tar.gz
4 files changed, 131 insertions, 74 deletions
diff --git a/src/enchant-provider.h b/src/enchant-provider.h
index c6ae27e..ae3457d 100644
--- a/src/enchant-provider.h
+++ b/src/enchant-provider.h
@@ -112,7 +112,12 @@ struct str_enchant_dict
 				   const char *const cor, size_t cor_len);
 	
 	void (*add_to_exclude) (struct str_enchant_dict * me,
-				 const char *const word, size_t len);
+				const char *const word, size_t len);
+
+	const char * (*get_extra_word_characters) (struct str_enchant_dict * me);
+
+	int (*is_word_character) (struct str_enchant_dict * me,
+				  uint32_t uc_in, size_t n);
 };
 	
 struct str_enchant_provider
@@ -124,7 +129,7 @@ struct str_enchant_provider
 	void (*dispose) (struct str_enchant_provider * me);
 	
 	EnchantDict *(*request_dict) (struct str_enchant_provider * me,
-					  const char *const tag);
+				      const char *const tag);
 	
 	void (*dispose_dict) (struct str_enchant_provider * me,
 				  EnchantDict * dict);
@@ -138,7 +143,7 @@ struct str_enchant_provider
 	const  char * (*describe) (struct str_enchant_provider * me);
 
 	char ** (*list_dicts) (struct str_enchant_provider * me,
-							   size_t * out_n_dicts);
+			       size_t * out_n_dicts);
 };
 
 #ifdef __cplusplus
diff --git a/src/enchant.c b/src/enchant.c
index f798b50..7581b6d 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -192,64 +192,9 @@ do_mode_l (FILE * out, EnchantDict * dict, GString * word, size_t lineCount)
 }
 
 
-static int
-is_word_char (gunichar uc, size_t n)
-{
-	GUnicodeType type;
-
-	if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) {
-		return 1;
-	}
-
-	type = g_unichar_type(uc);
-
-	switch (type) {
-	case G_UNICODE_MODIFIER_LETTER:
-	case G_UNICODE_LOWERCASE_LETTER:
-	case G_UNICODE_TITLECASE_LETTER:
-	case G_UNICODE_UPPERCASE_LETTER:
-	case G_UNICODE_OTHER_LETTER:
-	case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */
-	case G_UNICODE_ENCLOSING_MARK:
-	case G_UNICODE_NON_SPACING_MARK:
-	case G_UNICODE_DECIMAL_NUMBER:
-	case G_UNICODE_LETTER_NUMBER:
-	case G_UNICODE_OTHER_NUMBER:
-	case G_UNICODE_CONNECT_PUNCTUATION:
-                return 1;     /* Enchant 1.3.0 defines word chars like this. */
-
-	case G_UNICODE_DASH_PUNCTUATION:
-		if ((n > 0) && (type == G_UNICODE_DASH_PUNCTUATION)) {
-			return 1; /* hyphens only accepted within a word. */
-		}
-		/* Fallthrough */
-
-	case G_UNICODE_CONTROL:
-	case G_UNICODE_FORMAT:
-	case G_UNICODE_UNASSIGNED:
-	case G_UNICODE_PRIVATE_USE:
-	case G_UNICODE_SURROGATE:
-	case G_UNICODE_CLOSE_PUNCTUATION:
-	case G_UNICODE_FINAL_PUNCTUATION:
-	case G_UNICODE_INITIAL_PUNCTUATION:
-	case G_UNICODE_OTHER_PUNCTUATION:
-	case G_UNICODE_OPEN_PUNCTUATION:
-	case G_UNICODE_CURRENCY_SYMBOL:
-	case G_UNICODE_MODIFIER_SYMBOL:
-	case G_UNICODE_MATH_SYMBOL:
-	case G_UNICODE_OTHER_SYMBOL:
-	case G_UNICODE_LINE_SEPARATOR:
-	case G_UNICODE_PARAGRAPH_SEPARATOR:
-	case G_UNICODE_SPACE_SEPARATOR:
-	default:
-		return 0;
-	}
-}
-
-
 /* Splits a line into a set of (word,word_position) tuples. */
 static GSList *
-tokenize_line (GString * line)
+tokenize_line (EnchantDict * dict, GString * line)
 {
 	GSList * tokens = NULL;
 	char *utf = (char *) line->str;
@@ -267,24 +212,24 @@ tokenize_line (GString * line)
 	        /* Skip non-word characters. */
 		cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
 		uc = g_utf8_get_char (utf);
-		while (cur_pos < line->len && *utf && !is_word_char(uc,0)) {
+		while (cur_pos < line->len && *utf && !enchant_dict_is_word_character (dict, uc, 0)) {
 		        utf = g_utf8_next_char (utf);
 			uc = g_utf8_get_char (utf);
 			cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
 		}
 		start_pos = cur_pos;
 
-		/* Skip over word. */
-		while (cur_pos < line->len && *utf && is_word_char(uc,1)) {
+		/* Skip over word characters. */
+		while (cur_pos < line->len && *utf && enchant_dict_is_word_character (dict, uc, 1)) {
 			g_string_append_unichar (word, uc);
 		        utf = g_utf8_next_char (utf);
 			uc = g_utf8_get_char (utf);
 			cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
 		}
 
-	        /* Do not accept one or more  ' at the end of the word. */
+	        /* Strip trailing characters that are not valid at the end of a word (e.g. '). */
 		i = word->len-1;
-	        while ((i >= 0) && (word->str[i] == '\'')) {
+	        while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) {
 	                g_string_truncate (word, i);
 			i--;
 		}
@@ -388,16 +333,18 @@ parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dic
 				case '`': /* Enter verbose-correction mode */
 					break;
 
-				case '$': /* Save correction for rest of session [aspell extension] */
-					{ /* Syntax: $$ra <MISSPELLED>,<REPLACEMENT> */
-						const gchar *prefix = "$$ra ";
-						if (g_str_has_prefix(str->str, prefix)) {
+				case '$': /* Miscellaneous commands */
+					{
+						const gchar *prefix = "$$ra "; /* Save correction for rest of session [aspell extension] */
+						if (g_str_has_prefix(str->str, prefix)) { /* Syntax: $$ra <MISSPELLED>,<REPLACEMENT> */
 							gchar *comma = g_utf8_strchr(str->str, -1, (gunichar)',');
 							char *mis = str->str + strlen(prefix);
 							char *cor = comma + 1;
 							ssize_t mis_len = comma - mis;
 							ssize_t cor_len = strlen(str->str) - (cor - str->str);
 							enchant_dict_store_replacement(dict, mis, mis_len, cor, cor_len);
+						} else if (g_str_has_prefix(str->str, "$$wc")) { /* Return the extra word chars list */
+							fprintf(out, "%s\n", enchant_dict_get_extra_word_characters(dict));
 						}
 					}
 					break;
@@ -415,7 +362,7 @@ parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dic
 			}
 
 			if (mode != MODE_A || mode_A_no_command) {
-				token_ptr = tokens = tokenize_line (str);
+				token_ptr = tokens = tokenize_line (dict, str);
 				if (tokens == NULL)
 					putc('\n', out);
 				while (tokens != NULL) {
@@ -463,7 +410,7 @@ int main (int argc, char ** argv)
 	FILE * fp = stdin;
 
 	int countLines = 0;
-	gchar *dictionary = 0;  /* -d dictionary */
+	gchar *dictionary = NULL;  /* -d dictionary */
 
 	/* Initialize system locale */
 	setlocale(LC_ALL, "");
diff --git a/src/enchant.h b/src/enchant.h
index 704412f..9f00255 100644
--- a/src/enchant.h
+++ b/src/enchant.h
@@ -30,6 +30,7 @@
 #ifndef ENCHANT_H
 #define ENCHANT_H
 
+#include <stdint.h> /* for uint32_t */
 #include <sys/types.h> /* for size_t, ssize_t */
 
 
@@ -259,17 +260,53 @@ void enchant_dict_free_string_list (EnchantDict * dict, char **string_list);
 
 /**
  * enchant_dict_get_error
- * @dict: A non-null dictionary
+ * @dict: A non-null #EnchantDict
  *
  * Returns a const char string or NULL describing the last exception in UTF8 encoding.
  * WARNING: error is transient. It will likely be cleared as soon as
- * the next dictionary operation is called
- *
- * Returns: an error message
+ * the next dictionary operation is called.
  */
 const char *enchant_dict_get_error (EnchantDict * dict);
 
 /**
+ * enchant_dict_get_extra_word_characters
+ * @dict: A non-null #EnchantDict
+ *
+ * Returns a const char UTF-8-encoded string containing the non-letter characters
+ * allowed in a word, e.g. "0123456789’-". If dash occurs, it will be last, so that
+ * the string can be appended to a character class used to match word characters.
+ *
+ * Words containing non-letters not in this string will automatically be rejected
+ * by Enchant.
+ *
+ * Note that for some back-ends the result may be a guess, in which case it
+ * may include characters not actually allowed in the given dictionary.
+ */
+const char *enchant_dict_get_extra_word_characters (EnchantDict * dict);
+
+/**
+ * enchant_dict_is_word_character
+ * @dict: An #EnchantDict, or %NULL
+ * @uc: A unicode code-point
+ * @n: An integer: 0 if the character is at the start of a word, 1 if it is
+ * in the middle, or 2 if at the end.
+ *
+ * Returns a flag specifying whether the given character is valid at the
+ * given position.
+ *
+ * One way to match a complete word is to check that the first character matches
+ * with n == 0, then proceed matching characters with n == 1 until failure, then
+ * proceed backwards until a character matches with n == 2.
+ *
+ * Note that for some back-ends the result may be a guess, in which case it
+ * may allow characters not actually allowed in the given dictionary.
+ *
+ * If @dict is %NULL, a built-in implementation is used (FIXME: We should document
+ * behavior for this). If @n is not 0, 1 or 2, then 0 (false) is returned.
+ */
+int enchant_dict_is_word_character (EnchantDict * dict, uint32_t uc, size_t n);
+
+/**
  * EnchantDictDescribeFn
  * @lang_tag: The dictionary's language tag (eg: en_US, de_AT, ...)
  * @provider_name: The provider's name (eg: Aspell) in UTF8 encoding
diff --git a/src/lib.c b/src/lib.c
index d521ae8..395d4ba 100644
--- a/src/lib.c
+++ b/src/lib.c
@@ -1323,6 +1323,74 @@ enchant_broker_dict_exists (EnchantBroker * broker,
 	return exists;
 }
 
+_GL_ATTRIBUTE_PURE const char *
+enchant_dict_get_extra_word_characters (EnchantDict *dict)
+{
+	g_return_val_if_fail (dict, NULL); /* a NULL dict is rejected: the result is NULL, not "" */
+	/* Return the dictionary's own list of extra word characters, or "" if it provides no hook. */
+	return dict->get_extra_word_characters ? (*dict->get_extra_word_characters) (dict) : "";
+}
+
+_GL_ATTRIBUTE_PURE int
+enchant_dict_is_word_character (EnchantDict * dict, uint32_t uc_in, size_t n)
+{
+	g_return_val_if_fail (n <= 2, 0); /* n: 0 = start of word, 1 = middle, 2 = end; anything else yields 0 */
+	/* A hook supplied by the dictionary takes precedence over the built-in rules below. */
+	if (dict && dict->is_word_character)
+		return (*dict->is_word_character) (dict, uc_in, n);
+	/* NOTE(review): declared pure, but the hook above is opaque here -- confirm it has no side effects. */
+	gunichar uc = (gunichar)uc_in;
+
+	/* Accept quote marks anywhere except at the end of a word */
+	if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) {
+		return n < 2;
+	}
+	/* Built-in fallback (also used when dict is NULL): decide by the code point's Unicode type. */
+	GUnicodeType type = g_unichar_type(uc);
+
+	switch (type) {
+	case G_UNICODE_MODIFIER_LETTER:
+	case G_UNICODE_LOWERCASE_LETTER:
+	case G_UNICODE_TITLECASE_LETTER:
+	case G_UNICODE_UPPERCASE_LETTER:
+	case G_UNICODE_OTHER_LETTER:
+	case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */
+	case G_UNICODE_ENCLOSING_MARK:
+	case G_UNICODE_NON_SPACING_MARK:
+	case G_UNICODE_DECIMAL_NUMBER:
+	case G_UNICODE_LETTER_NUMBER:
+	case G_UNICODE_OTHER_NUMBER:
+	case G_UNICODE_CONNECT_PUNCTUATION:
+		return 1;     /* Enchant 1.3.0 defines word chars like this. */
+
+	case G_UNICODE_DASH_PUNCTUATION:
+		if ((n == 1) && (type == G_UNICODE_DASH_PUNCTUATION)) { /* the type test is always true inside this case */
+			return 1; /* hyphens only accepted within a word. */
+		}
+		/* Fallthrough */
+
+	case G_UNICODE_CONTROL:
+	case G_UNICODE_FORMAT:
+	case G_UNICODE_UNASSIGNED:
+	case G_UNICODE_PRIVATE_USE:
+	case G_UNICODE_SURROGATE:
+	case G_UNICODE_CLOSE_PUNCTUATION:
+	case G_UNICODE_FINAL_PUNCTUATION:
+	case G_UNICODE_INITIAL_PUNCTUATION:
+	case G_UNICODE_OTHER_PUNCTUATION:
+	case G_UNICODE_OPEN_PUNCTUATION:
+	case G_UNICODE_CURRENCY_SYMBOL:
+	case G_UNICODE_MODIFIER_SYMBOL:
+	case G_UNICODE_MATH_SYMBOL:
+	case G_UNICODE_OTHER_SYMBOL:
+	case G_UNICODE_LINE_SEPARATOR:
+	case G_UNICODE_PARAGRAPH_SEPARATOR:
+	case G_UNICODE_SPACE_SEPARATOR:
+	default:
+		return 0;
+	}
+}
+
 void
 enchant_broker_set_ordering (EnchantBroker * broker,
 				 const char * const tag,
author	Reuben Thomas <rrt@sc3d.org>	2017-07-23 14:25:19 +0100
committer	Reuben Thomas <rrt@sc3d.org>	2017-07-25 09:36:41 +0100
commit	cf998b0c4348518e1bff93bf6e3ef58d92162e9b (patch)
tree	c23d48b83d56d770c1a7e1be8375921f8a1485bb /src
parent	f33386baeb71b5ce3802465c7981fc92a394492e (diff)
download	enchant-cf998b0c4348518e1bff93bf6e3ef58d92162e9b.tar.gz