summaryrefslogtreecommitdiff
path: root/tests/unicode-data.c
diff options
context:
space:
mode:
authorMatthias Clasen <mclasen@redhat.com>2019-07-08 13:38:36 -0400
committerMatthias Clasen <mclasen@redhat.com>2019-07-08 13:38:36 -0400
commit5988509d25a99836e904988be57541997e237f23 (patch)
treee53f3367ba90bd038253f98a0df58fd454367826 /tests/unicode-data.c
parent18cfe50caa1dc6e3295ad6bf5151b14ffbcfbc37 (diff)
downloadglib-unicode-test.tar.gz
Add a unicode data testunicode-test
This just dumps out our Unicode data for given input, and can compare the results to expected values. This has been useful to me for some quick inspection of Unicode data.
Diffstat (limited to 'tests/unicode-data.c')
-rw-r--r--tests/unicode-data.c270
1 files changed, 270 insertions, 0 deletions
diff --git a/tests/unicode-data.c b/tests/unicode-data.c
new file mode 100644
index 000000000..1c2eef1d2
--- /dev/null
+++ b/tests/unicode-data.c
@@ -0,0 +1,270 @@
+/* GLib
+ * unicode-data.c: Test Unicode character data
+ *
+ * Copyright (C) 2019 Red Hat, Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+#include <glib.h>
+#include <string.h>
+#include <locale.h>
+
+#ifndef G_OS_WIN32
+#include <unistd.h>
+#endif
+
+#include "test-common.h"
+
+static const char *
+char_type (GUnicodeType t)
+{
+ const char *names[] = {
+ "Cc", "Cf", "Cn", "Co", "Cs", "Ll", "Lm", "Lo", "Lt",
+ "Lu", "Mc", "Me", "Mn", "Nd", "Nl", "No", "Pc", "Pd",
+ "Pe", "Pf", "Pi", "Po", "Ps", "Sc", "Sk", "Sm", "So",
+ "Zl", "Zp", "Zs"
+ };
+ return names[t];
+}
+
+static const char *
+break_type (GUnicodeBreakType t)
+{
+ const char *names[] = {
+ "BK", "CR", "LF", "CM", "SG", "ZW", "IN", "GL", "CB",
+ "SP", "BA", "BB", "B2", "HY", "NS", "OP", "CL", "QU",
+ "EX", "ID", "NU", "IS", "SY", "AL", "PR", "PO", "SA",
+ "AI", "XX", "NL", "WJ", "JL", "JV", "JT", "H2", "H3",
+ "CP", "CJ", "HL", "RI", "EB", "EM", "ZWJ"
+ };
+ return names[t];
+}
+
+static const char *
+script_name (GUnicodeScript s)
+{
+ const char *names[] = {
+ "Zyyy", "Zinh", "Arab", "Armn", "Beng", "Bopo", "Cher",
+ "Copt", "Cyrl", "Dsrt", "Deva", "Ethi", "Geor", "Goth",
+ "Grek", "Gujr", "Guru", "Hani", "Hang", "Hebr", "Hira",
+ "Knda", "Kana", "Khmr", "Laoo", "Latn", "Mlym", "Mong",
+ "Mymr", "Ogam", "Ital", "Orya", "Runr", "Sinh", "Syrc",
+ "Taml", "Telu", "Thaa", "Thai", "Tibt", "Cans", "Yiii",
+ "Tglg", "Hano", "Buhd", "Tagb", "Brai", "Cprt", "Limb",
+ "Osma", "Shaw", "Linb", "Tale", "Ugar", "Talu", "Bugi",
+ "Glag", "Tfng", "Sylo", "Xpeo", "Khar", "Zzzz", "Bali",
+ "Xsux", "Phnx", "Phag", "Nkoo", "Kali", "Lepc", "Rjng",
+ "Sund", "Saur", "Cham", "Olck", "Vaii", "Cari", "Lyci",
+ "Lydi", "Avst", "Bamu", "Egyp", "Armi", "Phli", "Prti",
+ "Java", "Kthi", "Lisu", "Mtei", "Sarb", "Orkh", "Samr",
+ "Lana", "Tavt", "Batk", "Brah", "Mand", "Cakm", "Merc",
+ "Mero", "Plrd", "Shrd", "Sora", "Takr", "Bass", "Aghb",
+ "Dupl", "Elba", "Gran", "Khoj", "Sind", "Lina", "Mahj",
+ "Mani", "Mend", "Modi", "Mroo", "Nbat", "Narb", "Perm",
+ "Hmng", "Palm", "Pauc", "Phlp", "Sidd", "Tirh", "Wara",
+ "Ahom", "Hluw", "Hatr", "Mult", "Hung", "Sgnw", "Adlm",
+ "Bhks", "Marc", "Newa", "Osge", "Tang", "Gonm", "Nshu",
+ "Soyo", "Zanb", "Dogr", "Gong", "Rohg", "Maka", "Medf",
+ "Sogo", "Sogd", "Elym", "Nand", "Rohg", "Wcho"
+ };
+ return names[s];
+}
+
+static void
+test_file (const char *filename, GString *string)
+{
+ char *contents;
+ gsize length;
+ GError *error = NULL;
+ char *p;
+ GString *s1, *s2, *s3;
+ GUnicodeScript prev_script = -1;
+ int m;
+
+ if (!g_file_get_contents (filename, &contents, &length, &error))
+ {
+ g_error ("%s", error->message);
+ g_error_free (error);
+ return;
+ }
+
+ g_string_append (string, "Text: ");
+ s1 = g_string_new ("Char type: ");
+ s2 = g_string_new ("Break type: ");
+ s3 = g_string_new ("Script: ");
+
+ m = MAX (MAX (s1->len, s2->len), s3->len);
+
+ g_string_append_printf (s1, "%*s", (int)(m - s1->len), "");
+ g_string_append_printf (s2, "%*s", (int)(m - s2->len), "");
+ g_string_append_printf (s3, "%*s", (int)(m - s3->len), "");
+ g_string_append_printf (string, "%*s", (int)(m - strlen ("Text: ")), "");
+
+ for (p = contents; *p; p = g_utf8_next_char (p))
+ {
+ gunichar ch = g_utf8_get_char (p);
+ const char *ctype = char_type (g_unichar_type (ch));
+ const char *btype = break_type (g_unichar_break_type (ch));
+ GUnicodeScript script = g_unichar_get_script (ch);
+ int c = strlen (ctype);
+ int b = strlen (btype);
+ int s = 0;
+ int t = 0;
+
+ g_string_append_printf (s1, "%s", ctype);
+ g_string_append_printf (s2, "%s", btype);
+
+ if (prev_script != script)
+ {
+ const char *str = script_name (script);
+ prev_script = script;
+ g_string_append (s3, str);
+ s = strlen (str);
+ }
+
+ if (ch == 0x20)
+ {
+ g_string_append (string, "[ ]");
+ t = 3;
+ }
+ else if (g_unichar_isgraph (ch) &&
+ (ch != 0x2028) &&
+ (ch != 0x2029))
+ {
+ g_string_append_unichar (string, ch);
+ t = 1;
+ }
+ else
+ {
+ char *str = g_strdup_printf ("[%#04x]", ch);
+ g_string_append (string, str);
+ t = strlen (str);
+ g_free (str);
+ }
+
+ m = MAX (t, MAX (MAX (c + 1, b + 1), s + 1));
+
+ g_string_append_printf (string, "%*s", m - t, "");
+ g_string_append_printf (s1, "%*s", m - c, "");
+ g_string_append_printf (s2, "%*s", m - b, "");
+ g_string_append_printf (s3, "%*s", m - s, "");
+ }
+
+ g_string_append (string, "\n");
+ g_string_append_len (string, s1->str, s1->len);
+ g_string_append (string, "\n");
+ g_string_append_len (string, s2->str, s2->len);
+ g_string_append (string, "\n");
+ g_string_append_len (string, s3->str, s3->len);
+ g_string_append (string, "\n");
+
+ g_string_free (s1, TRUE);
+ g_string_free (s2, TRUE);
+ g_string_free (s3, TRUE);
+
+ g_free (contents);
+}
+
+static gchar *
+get_expected_filename (const gchar *filename)
+{
+ gchar *f, *p, *expected;
+
+ f = g_strdup (filename);
+ p = strstr (f, ".chars");
+ if (p)
+ *p = 0;
+ expected = g_strconcat (f, ".expected", NULL);
+
+ g_free (f);
+
+ return expected;
+}
+
+static void
+test_break (gconstpointer d)
+{
+ const char *filename = d;
+ char *expected_file;
+ GError *error = NULL;
+ GString *dump;
+ char *diff = NULL;
+
+ expected_file = get_expected_filename (filename);
+
+ dump = g_string_sized_new (0);
+
+ test_file (filename, dump);
+
+ diff = diff_with_file (expected_file, dump->str, dump->len, &error);
+ g_assert_no_error (error);
+
+ if (diff && diff[0])
+ {
+ g_printerr ("Contents don't match expected contents:\n%s", diff);
+ g_test_fail ();
+ g_free (diff);
+ }
+
+ g_string_free (dump, TRUE);
+ g_free (expected_file);
+}
+
+int
+main (int argc, char *argv[])
+{
+ GDir *dir;
+ GError *error = NULL;
+ const gchar *name;
+ gchar *path;
+
+ g_setenv ("LC_ALL", "en_US.UTF-8", TRUE);
+ setlocale (LC_ALL, "");
+
+ g_test_init (&argc, &argv, NULL);
+
+ /* allow to easily generate expected output for new test cases */
+ if (argc > 1)
+ {
+ GString *string;
+
+ string = g_string_sized_new (0);
+ test_file (argv[1], string);
+ g_print ("%s", string->str);
+
+ return 0;
+ }
+
+ path = g_test_build_filename (G_TEST_DIST, "chars", NULL);
+ dir = g_dir_open (path, 0, &error);
+ g_free (path);
+ g_assert_no_error (error);
+ while ((name = g_dir_read_name (dir)) != NULL)
+ {
+ if (!strstr (name, "chars"))
+ continue;
+
+ path = g_strdup_printf ("/chars/%s", name);
+ g_test_add_data_func_full (path, g_test_build_filename (G_TEST_DIST, "chars", name, NULL),
+ test_break, g_free);
+ g_free (path);
+ }
+ g_dir_close (dir);
+
+ return g_test_run ();
+}