Add a Unicode data test (unicode-test)

This just dumps out our Unicode data for a given input, and can compare the results to expected values. This has been useful to me for some quick inspection of Unicode data.
author: Matthias Clasen <mclasen@redhat.com> 2019-07-08 13:38:36 -0400
committer: Matthias Clasen <mclasen@redhat.com> 2019-07-08 13:38:36 -0400
commit: 5988509d25a99836e904988be57541997e237f23 (patch)
tree: e53f3367ba90bd038253f98a0df58fd454367826 /tests/unicode-data.c
parent: 18cfe50caa1dc6e3295ad6bf5151b14ffbcfbc37 (diff)
download: glib-unicode-test.tar.gz
1 files changed, 270 insertions, 0 deletions
diff --git a/tests/unicode-data.c b/tests/unicode-data.c
new file mode 100644
index 000000000..1c2eef1d2
--- /dev/null
+++ b/tests/unicode-data.c
@@ -0,0 +1,270 @@
+/* GLib
+ * unicode-data.c: Test Unicode character data
+ *
+ * Copyright (C) 2019 Red Hat, Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+#include <glib.h>
+#include <string.h>
+#include <locale.h>
+
+#ifndef G_OS_WIN32
+#include <unistd.h>
+#endif
+
+#include "test-common.h"
+
+/*
+ * char_type:
+ * @t: a Unicode general category value
+ *
+ * Looks up the two-letter abbreviation for @t. The table is indexed
+ * directly by the enum value, so its order must match GUnicodeType.
+ *
+ * Returns: a static string; "??" if @t lies outside the table
+ */
+static const char *
+char_type (GUnicodeType t)
+{
+  static const char * const names[] = {
+    "Cc", "Cf", "Cn", "Co", "Cs", "Ll", "Lm", "Lo", "Lt",
+    "Lu", "Mc", "Me", "Mn", "Nd", "Nl", "No", "Pc", "Pd",
+    "Pe", "Pf", "Pi", "Po", "Ps", "Sc", "Sk", "Sm", "So",
+    "Zl", "Zp", "Zs"
+  };
+
+  /* The cast turns a negative value into a huge index, so a single
+   * comparison rejects both ends of the range. */
+  if ((gsize) t >= G_N_ELEMENTS (names))
+    return "??";
+
+  return names[t];
+}
+
+/*
+ * break_type:
+ * @t: a Unicode line-break class value
+ *
+ * Looks up the short abbreviation for @t. The table is indexed
+ * directly by the enum value, so its order must match
+ * GUnicodeBreakType.
+ *
+ * Returns: a static string; "??" if @t lies outside the table,
+ * for instance a break class this table does not know about yet
+ */
+static const char *
+break_type (GUnicodeBreakType t)
+{
+  static const char * const names[] = {
+    "BK", "CR", "LF", "CM", "SG", "ZW", "IN", "GL", "CB",
+    "SP", "BA", "BB", "B2", "HY", "NS", "OP", "CL", "QU",
+    "EX", "ID", "NU", "IS", "SY", "AL", "PR", "PO", "SA",
+    "AI", "XX", "NL", "WJ", "JL", "JV", "JT", "H2", "H3",
+    "CP", "CJ", "HL", "RI", "EB", "EM", "ZWJ"
+  };
+
+  /* The cast turns a negative value into a huge index, so a single
+   * comparison rejects both ends of the range. */
+  if ((gsize) t >= G_N_ELEMENTS (names))
+    return "??";
+
+  return names[t];
+}
+
+/*
+ * script_name:
+ * @s: a Unicode script value
+ *
+ * Looks up the four-letter ISO 15924 code for @s. The table is
+ * indexed directly by the enum value, so its order must match
+ * GUnicodeScript.
+ *
+ * Returns: a static string; "????" if @s lies outside the table
+ * (a negative value, or a script this table does not know about yet)
+ */
+static const char *
+script_name (GUnicodeScript s)
+{
+  static const char * const names[] = {
+    "Zyyy", "Zinh", "Arab", "Armn", "Beng", "Bopo", "Cher",
+    "Copt", "Cyrl", "Dsrt", "Deva", "Ethi", "Geor", "Goth",
+    "Grek", "Gujr", "Guru", "Hani", "Hang", "Hebr", "Hira",
+    "Knda", "Kana", "Khmr", "Laoo", "Latn", "Mlym", "Mong",
+    "Mymr", "Ogam", "Ital", "Orya", "Runr", "Sinh", "Syrc",
+    "Taml", "Telu", "Thaa", "Thai", "Tibt", "Cans", "Yiii",
+    "Tglg", "Hano", "Buhd", "Tagb", "Brai", "Cprt", "Limb",
+    "Osma", "Shaw", "Linb", "Tale", "Ugar", "Talu", "Bugi",
+    "Glag", "Tfng", "Sylo", "Xpeo", "Khar", "Zzzz", "Bali",
+    "Xsux", "Phnx", "Phag", "Nkoo", "Kali", "Lepc", "Rjng",
+    "Sund", "Saur", "Cham", "Olck", "Vaii", "Cari", "Lyci",
+    "Lydi", "Avst", "Bamu", "Egyp", "Armi", "Phli", "Prti",
+    "Java", "Kthi", "Lisu", "Mtei", "Sarb", "Orkh", "Samr",
+    "Lana", "Tavt", "Batk", "Brah", "Mand", "Cakm", "Merc",
+    "Mero", "Plrd", "Shrd", "Sora", "Takr", "Bass", "Aghb",
+    "Dupl", "Elba", "Gran", "Khoj", "Sind", "Lina", "Mahj",
+    "Mani", "Mend", "Modi", "Mroo", "Nbat", "Narb", "Perm",
+    "Hmng", "Palm", "Pauc", "Phlp", "Sidd", "Tirh", "Wara",
+    "Ahom", "Hluw", "Hatr", "Mult", "Hung", "Sgnw", "Adlm",
+    "Bhks", "Marc", "Newa", "Osge", "Tang", "Gonm", "Nshu",
+    "Soyo", "Zanb", "Dogr", "Gong", "Rohg", "Maka", "Medf",
+    /* "Hmnp" is Nyiakeng Puachue Hmong; "Rohg" belongs only to
+     * Hanifi Rohingya on the line above. */
+    "Sogo", "Sogd", "Elym", "Nand", "Hmnp", "Wcho"
+  };
+
+  /* The cast turns a negative value into a huge index, so a single
+   * comparison rejects both ends of the range. */
+  if ((gsize) s >= G_N_ELEMENTS (names))
+    return "????";
+
+  return names[s];
+}
+
+/*
+ * test_file:
+ * @filename: path of the file to read
+ * @string: buffer that the dump is appended to
+ *
+ * Reads @filename and appends four column-aligned rows to @string:
+ * the text itself, followed by the general category, the line-break
+ * class and the script of each character. A script name is written
+ * only where the script differs from that of the preceding character.
+ * Processing stops at the first NUL byte in the file.
+ *
+ * Aborts through g_error() if the file cannot be read or if its
+ * contents are not valid UTF-8.
+ */
+static void
+test_file (const char *filename, GString *string)
+{
+  char *contents = NULL;
+  GError *error = NULL;
+  char *p;
+  GString *s1, *s2, *s3;
+  GUnicodeScript prev_script = -1;
+  int m;
+
+  /* g_error() logs the message and aborts, so nothing after it runs
+   * and no cleanup is needed on these paths. */
+  if (!g_file_get_contents (filename, &contents, NULL, &error))
+    g_error ("%s", error->message);
+
+  /* g_utf8_get_char() and g_utf8_next_char() assume valid input; a
+   * truncated multi-byte sequence at the end of the file would make
+   * the loop below step past the terminating NUL. Validate up to the
+   * first NUL, which is exactly the range the loop walks. */
+  if (!g_utf8_validate (contents, -1, NULL))
+    g_error ("%s: invalid UTF-8", filename);
+
+  g_string_append (string, "Text: ");
+  s1 = g_string_new ("Char type: ");
+  s2 = g_string_new ("Break type: ");
+  s3 = g_string_new ("Script: ");
+
+  /* Pad all four row labels to the width of the longest one. */
+  m = MAX (MAX (s1->len, s2->len), s3->len);
+
+  g_string_append_printf (s1, "%*s", (int)(m - s1->len), "");
+  g_string_append_printf (s2, "%*s", (int)(m - s2->len), "");
+  g_string_append_printf (s3, "%*s", (int)(m - s3->len), "");
+  g_string_append_printf (string, "%*s", (int)(m - strlen ("Text: ")), "");
+
+  for (p = contents; *p; p = g_utf8_next_char (p))
+    {
+      gunichar ch = g_utf8_get_char (p);
+      const char *ctype = char_type (g_unichar_type (ch));
+      const char *btype = break_type (g_unichar_break_type (ch));
+      GUnicodeScript script = g_unichar_get_script (ch);
+      int c = (int) strlen (ctype);
+      int b = (int) strlen (btype);
+      int s = 0;   /* width written to the script row for this column */
+      int t = 0;   /* width written to the text row for this column */
+
+      g_string_append_printf (s1, "%s", ctype);
+      g_string_append_printf (s2, "%s", btype);
+
+      if (prev_script != script)
+        {
+          const char *str = script_name (script);
+          prev_script = script;
+          g_string_append (s3, str);
+          s = (int) strlen (str);
+        }
+
+      if (ch == 0x20)
+        {
+          /* Make a plain space visible. */
+          g_string_append (string, "[ ]");
+          t = 3;
+        }
+      else if (g_unichar_isgraph (ch) &&
+               (ch != 0x2028) &&
+               (ch != 0x2029))
+        {
+          /* Counted as one column regardless of the UTF-8 byte length. */
+          g_string_append_unichar (string, ch);
+          t = 1;
+        }
+      else
+        {
+          /* Everything else is shown as its code point in hex. */
+          char *str = g_strdup_printf ("[%#04x]", ch);
+          g_string_append (string, str);
+          t = (int) strlen (str);
+          g_free (str);
+        }
+
+      /* Column width: wide enough for the text cell, and for each of
+       * the other cells plus one separating space. */
+      m = MAX (t, MAX (MAX (c + 1, b + 1), s + 1));
+
+      g_string_append_printf (string, "%*s", m - t, "");
+      g_string_append_printf (s1, "%*s", m - c, "");
+      g_string_append_printf (s2, "%*s", m - b, "");
+      g_string_append_printf (s3, "%*s", m - s, "");
+    }
+
+  g_string_append (string, "\n");
+  g_string_append_len (string, s1->str, s1->len);
+  g_string_append (string, "\n");
+  g_string_append_len (string, s2->str, s2->len);
+  g_string_append (string, "\n");
+  g_string_append_len (string, s3->str, s3->len);
+  g_string_append (string, "\n");
+
+  g_string_free (s1, TRUE);
+  g_string_free (s2, TRUE);
+  g_string_free (s3, TRUE);
+
+  g_free (contents);
+}
+
+/*
+ * get_expected_filename:
+ * @filename: path of a test input file
+ *
+ * Builds the path of the matching expected-output file: everything
+ * from the last occurrence of ".chars" onwards is dropped and
+ * ".expected" is appended. If @filename contains no ".chars",
+ * ".expected" is appended to the whole name.
+ *
+ * Returns: a newly allocated string; free with g_free()
+ */
+static gchar *
+get_expected_filename (const gchar *filename)
+{
+  gchar *stem, *ext, *expected;
+
+  stem = g_strdup (filename);
+
+  /* Search from the end so that a ".chars" appearing earlier in the
+   * path (e.g. in a directory name) does not truncate the path. */
+  ext = g_strrstr (stem, ".chars");
+  if (ext)
+    *ext = '\0';
+
+  expected = g_strconcat (stem, ".expected", NULL);
+
+  g_free (stem);
+
+  return expected;
+}
+
+/*
+ * test_break:
+ * @d: path of the input file, passed as the test's user data
+ *
+ * Test body: dumps the Unicode data for the input file and compares
+ * it against the matching ".expected" file using diff_with_file().
+ * A non-empty diff is printed to stderr and the test is marked as
+ * failed.
+ */
+static void
+test_break (gconstpointer d)
+{
+  const char *filename = d;
+  char *expected_file;
+  GError *error = NULL;
+  GString *dump;
+  char *diff = NULL;
+
+  expected_file = get_expected_filename (filename);
+
+  dump = g_string_sized_new (0);
+
+  test_file (filename, dump);
+
+  diff = diff_with_file (expected_file, dump->str, dump->len, &error);
+  g_assert_no_error (error);
+
+  if (diff && diff[0])
+    {
+      g_printerr ("Contents don't match expected contents:\n%s", diff);
+      g_test_fail ();
+    }
+
+  /* Free unconditionally: an empty (but non-NULL) diff must not leak,
+   * and g_free (NULL) is a no-op. */
+  g_free (diff);
+
+  g_string_free (dump, TRUE);
+  g_free (expected_file);
+}
+
+/*
+ * main:
+ *
+ * Forces the en_US.UTF-8 locale and initializes the GLib test
+ * framework. If an argument remains after g_test_init(), it is taken
+ * as a file name: its dump is printed to stdout and the program
+ * exits. Otherwise one test named /chars/<name> is registered for
+ * every entry of the "chars" directory in the test dist dir whose
+ * name ends in ".chars", and the tests are run.
+ */
+int
+main (int argc, char *argv[])
+{
+  GDir *dir;
+  GError *error = NULL;
+  const gchar *name;
+  gchar *path;
+
+  g_setenv ("LC_ALL", "en_US.UTF-8", TRUE);
+  setlocale (LC_ALL, "");
+
+  g_test_init (&argc, &argv, NULL);
+
+  /* allow to easily generate expected output for new test cases */
+  if (argc > 1)
+    {
+      GString *string;
+
+      string = g_string_sized_new (0);
+      test_file (argv[1], string);
+      g_print ("%s", string->str);
+      g_string_free (string, TRUE);
+
+      return 0;
+    }
+
+  path = g_test_build_filename (G_TEST_DIST, "chars", NULL);
+  dir = g_dir_open (path, 0, &error);
+  g_free (path);
+  g_assert_no_error (error);
+  while ((name = g_dir_read_name (dir)) != NULL)
+    {
+      /* Only real test inputs: skip ".expected" files, editor backups
+       * and anything else that merely contains "chars" in its name. */
+      if (!g_str_has_suffix (name, ".chars"))
+        continue;
+
+      path = g_strdup_printf ("/chars/%s", name);
+      /* The file path is handed over as test data and released with
+       * g_free, the destroy function passed below. */
+      g_test_add_data_func_full (path, g_test_build_filename (G_TEST_DIST, "chars", name, NULL),
+                                 test_break, g_free);
+      g_free (path);
+    }
+  g_dir_close (dir);
+
+  return g_test_run ();
+}
author	Matthias Clasen <mclasen@redhat.com>	2019-07-08 13:38:36 -0400
committer	Matthias Clasen <mclasen@redhat.com>	2019-07-08 13:38:36 -0400
commit	5988509d25a99836e904988be57541997e237f23 (patch)
tree	e53f3367ba90bd038253f98a0df58fd454367826 /tests/unicode-data.c
parent	18cfe50caa1dc6e3295ad6bf5151b14ffbcfbc37 (diff)
download	glib-unicode-test.tar.gz