Add a unicode data test (unicode-test)

This just dumps out our Unicode data for a given input, and can compare the results to expected values. This has been useful to me for some quick inspection of Unicode data.
author: Matthias Clasen <mclasen@redhat.com> 2019-07-08 13:38:36 -0400
committer: Matthias Clasen <mclasen@redhat.com> 2019-07-08 13:38:36 -0400
commit: 5988509d25a99836e904988be57541997e237f23 (patch)
tree: e53f3367ba90bd038253f98a0df58fd454367826
parent: 18cfe50caa1dc6e3295ad6bf5151b14ffbcfbc37 (diff)
download: glib-unicode-test.tar.gz
6 files changed, 369 insertions, 0 deletions
diff --git a/tests/chars/one.chars b/tests/chars/one.chars
new file mode 100644
index 000000000..c26db8011
--- /dev/null
+++ b/tests/chars/one.chars
@@ -0,0 +1 @@
+a b c d e f g h i j k lm n o　p	q r s
diff --git a/tests/chars/one.expected b/tests/chars/one.expected
new file mode 100644
index 000000000..bfb819ed4
--- /dev/null
+++ b/tests/chars/one.expected
@@ -0,0 +1,4 @@
+Text:       a    [ ]  b    [0xa0]c    [0x2002]d    [0x2003]e    [0x2004]f    [0x2005]g    [0x2006]h    [0x2007]i    [0x2008]j    [0x2009]k    [0x200a]l    [0x200b]m    [0x202f]n    [0x205f]o    [0x3000]p    [0x09]q    [0x2028]r    [0x2029]s    [0x0a]
+Char type:  Ll   Zs   Ll   Zs    Ll   Zs      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Cf      Ll   Zs      Ll   Zs      Ll   Zs      Ll   Cc    Ll   Zl      Ll   Zp      Ll   Cc    
+Break type: AL   SP   AL   GL    AL   BA      AL   BA      AL   BA      AL   BA      AL   BA      AL   GL      AL   BA      AL   BA      AL   BA      AL   ZW      AL   GL      AL   BA      AL   BA      AL   BA    AL   BK      AL   BK      AL   LF    
+Script:     Latn Zyyy Latn Zyyy  Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy    Latn Zyyy  Latn Zyyy    Latn Zyyy    Latn Zyyy  
diff --git a/tests/meson.build b/tests/meson.build
index ce3044258..0b33ae90b 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -43,6 +43,9 @@ tests = {
   'threadpool-test' : {'suite' : ['slow']},
   'type-test' : {},
   'unicode-caseconv' : {},
+  'unicode-data' : {
+    'extra_sources' : 'test-common.c',
+  }, 
   'unicode-encoding' : {},
   'module-test' : {
     'dependencies' : [libgmodule_dep],
diff --git a/tests/test-common.c b/tests/test-common.c
new file mode 100644
index 000000000..c317b3b1c
--- /dev/null
+++ b/tests/test-common.c
@@ -0,0 +1,81 @@
+/* GLib
+ * test-common.c: Common test code
+ *
+ * Copyright (C) 2014 Red Hat, Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <glib.h>
+#include <string.h>
+
+#include <locale.h>
+
+#ifdef G_OS_WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "test-common.h"
+
+/*
+ * diff_with_file:
+ * @file: path of the file holding the expected contents
+ * @text: buffer holding the actual contents
+ * @len: number of bytes of @text to use, or a negative value to use
+ *   strlen (@text)
+ * @error: return location for an error
+ *
+ * Writes @text to a temporary file and runs `diff -u -i FILE TMPFILE`,
+ * capturing what the command prints on standard output.  The temporary
+ * file is unlinked before returning.
+ *
+ * Only standard output is captured; the exit status of the command is
+ * not inspected.
+ *
+ * Returns: the captured output (free with g_free()), or %NULL if the
+ *   temporary file could not be created or completely written, or if
+ *   nothing was captured
+ */
+char *
+diff_with_file (const char  *file,
+                char        *text,
+                gssize       len,
+                GError     **error)
+{
+  const char *command[] = { "diff", "-u", "-i", file, NULL, NULL };
+  char *diff = NULL;
+  char *tmpfile = NULL;
+  gssize written;
+  int fd;
+
+  if (len < 0)
+    len = strlen (text);
+
+  /* write the text buffer to a temporary file */
+  fd = g_file_open_tmp (NULL, &tmpfile, error);
+  if (fd < 0)
+    return NULL;
+
+  /* Compare as gssize so that a large @len is not truncated to int */
+  written = write (fd, text, len);
+  close (fd);
+
+  if (written != len)
+    {
+      g_set_error (error,
+                   G_FILE_ERROR, G_FILE_ERROR_FAILED,
+                   "Could not write data to temporary file '%s'", tmpfile);
+      goto done;
+    }
+
+  /* the temporary file is the second operand of diff */
+  command[4] = tmpfile;
+
+  /* run diff command */
+  g_spawn_sync (NULL,
+                (char **) command,
+                NULL,
+                G_SPAWN_SEARCH_PATH,
+                NULL, NULL,
+                &diff,
+                NULL, NULL,
+                error);
+
+done:
+  unlink (tmpfile);
+  g_free (tmpfile);
+
+  return diff;
+}
diff --git a/tests/test-common.h b/tests/test-common.h
new file mode 100644
index 000000000..2b4de821e
--- /dev/null
+++ b/tests/test-common.h
@@ -0,0 +1,10 @@
+#ifndef __TEST_COMMON_H__
+#define __TEST_COMMON_H__
+
+/* gssize and GError are used in the prototype below */
+#include <glib.h>
+
+/*
+ * Compares @len bytes of @text (or strlen (@text) bytes if @len is
+ * negative) with the contents of @file by running the diff command.
+ * Returns the output of diff, to be freed with g_free().
+ */
+char * diff_with_file (const char  *file,
+                       char        *text,
+                       gssize       len,
+                       GError     **error);
+
+
+#endif /* __TEST_COMMON_H__ */
diff --git a/tests/unicode-data.c b/tests/unicode-data.c
new file mode 100644
index 000000000..1c2eef1d2
--- /dev/null
+++ b/tests/unicode-data.c
@@ -0,0 +1,270 @@
+/* GLib
+ * unicode-data.c: Test Unicode character data
+ *
+ * Copyright (C) 2019 Red Hat, Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+#include <glib.h>
+#include <string.h>
+#include <locale.h>
+
+#ifndef G_OS_WIN32
+#include <unistd.h>
+#endif
+
+#include "test-common.h"
+
+/*
+ * char_type:
+ * @t: a character type value, as returned by g_unichar_type()
+ *
+ * Looks up the two-letter abbreviation for @t.  The table is indexed
+ * directly by the numeric value of @t.
+ *
+ * Returns: a static string, or "??" if @t lies outside the table
+ */
+static const char *
+char_type (GUnicodeType t)
+{
+  static const char * const names[] = {
+    "Cc", "Cf", "Cn", "Co", "Cs", "Ll", "Lm", "Lo", "Lt",
+    "Lu", "Mc", "Me", "Mn", "Nd", "Nl", "No", "Pc", "Pd",
+    "Pe", "Pf", "Pi", "Po", "Ps", "Sc", "Sk", "Sm", "So",
+    "Zl", "Zp", "Zs"
+  };
+
+  /* Guard against values the table does not know about */
+  if ((gsize) t >= G_N_ELEMENTS (names))
+    return "??";
+
+  return names[t];
+}
+
+/*
+ * break_type:
+ * @t: a break type value, as returned by g_unichar_break_type()
+ *
+ * Looks up the short abbreviation for @t.  The table is indexed
+ * directly by the numeric value of @t.
+ *
+ * Returns: a static string, or "??" if @t lies outside the table
+ */
+static const char *
+break_type (GUnicodeBreakType t)
+{
+  static const char * const names[] = {
+    "BK", "CR", "LF", "CM", "SG", "ZW", "IN", "GL", "CB",
+    "SP", "BA", "BB", "B2", "HY", "NS", "OP", "CL", "QU",
+    "EX", "ID", "NU", "IS", "SY", "AL", "PR", "PO", "SA",
+    "AI", "XX", "NL", "WJ", "JL", "JV", "JT", "H2", "H3",
+    "CP", "CJ", "HL", "RI", "EB", "EM", "ZWJ"
+  };
+
+  /* Guard against values the table does not know about */
+  if ((gsize) t >= G_N_ELEMENTS (names))
+    return "??";
+
+  return names[t];
+}
+
+/*
+ * script_name:
+ * @s: a script value, as returned by g_unichar_get_script()
+ *
+ * Looks up the four-letter script code for @s.  The table is indexed
+ * directly by the numeric value of @s.
+ *
+ * Returns: a static string, or "??" if @s is negative or lies beyond
+ *   the end of the table
+ */
+static const char *
+script_name (GUnicodeScript s)
+{
+  static const char * const names[] = {
+    "Zyyy", "Zinh", "Arab", "Armn", "Beng", "Bopo", "Cher",
+    "Copt", "Cyrl", "Dsrt", "Deva", "Ethi", "Geor", "Goth",
+    "Grek", "Gujr", "Guru", "Hani", "Hang", "Hebr", "Hira",
+    "Knda", "Kana", "Khmr", "Laoo", "Latn", "Mlym", "Mong",
+    "Mymr", "Ogam", "Ital", "Orya", "Runr", "Sinh", "Syrc",
+    "Taml", "Telu", "Thaa", "Thai", "Tibt", "Cans", "Yiii",
+    "Tglg", "Hano", "Buhd", "Tagb", "Brai", "Cprt", "Limb",
+    "Osma", "Shaw", "Linb", "Tale", "Ugar", "Talu", "Bugi",
+    "Glag", "Tfng", "Sylo", "Xpeo", "Khar", "Zzzz", "Bali",
+    "Xsux", "Phnx", "Phag", "Nkoo", "Kali", "Lepc", "Rjng",
+    "Sund", "Saur", "Cham", "Olck", "Vaii", "Cari", "Lyci",
+    "Lydi", "Avst", "Bamu", "Egyp", "Armi", "Phli", "Prti",
+    "Java", "Kthi", "Lisu", "Mtei", "Sarb", "Orkh", "Samr",
+    "Lana", "Tavt", "Batk", "Brah", "Mand", "Cakm", "Merc",
+    "Mero", "Plrd", "Shrd", "Sora", "Takr", "Bass", "Aghb",
+    "Dupl", "Elba", "Gran", "Khoj", "Sind", "Lina", "Mahj",
+    "Mani", "Mend", "Modi", "Mroo", "Nbat", "Narb", "Perm",
+    "Hmng", "Palm", "Pauc", "Phlp", "Sidd", "Tirh", "Wara",
+    "Ahom", "Hluw", "Hatr", "Mult", "Hung", "Sgnw", "Adlm",
+    "Bhks", "Marc", "Newa", "Osge", "Tang", "Gonm", "Nshu",
+    "Soyo", "Zanb", "Dogr", "Gong", "Rohg", "Maka", "Medf",
+    /* "Hmnp" is Nyiakeng Puachue Hmong; "Rohg" is already used above */
+    "Sogo", "Sogd", "Elym", "Nand", "Hmnp", "Wcho"
+  };
+
+  /* The cast also rejects negative values */
+  if ((gsize) s >= G_N_ELEMENTS (names))
+    return "??";
+
+  return names[s];
+}
+
+/*
+ * test_file:
+ * @filename: path of the file to read
+ * @string: string to which the dump is appended
+ *
+ * Reads @filename and appends four lines to @string: "Text:",
+ * "Char type:", "Break type:" and "Script:".  Each character of the
+ * input contributes one column, and all four lines are padded with
+ * spaces so that the columns line up.
+ */
+static void
+test_file (const char *filename, GString *string)
+{
+  char *contents;
+  gsize length;
+  GError *error = NULL;
+  char *p;
+  GString *s1, *s2, *s3;
+  GUnicodeScript prev_script = -1;
+  int m;
+
+  if (!g_file_get_contents (filename, &contents, &length, &error))
+    {
+      g_error ("%s", error->message);
+      g_error_free (error);
+      return;
+    }
+
+  /* The "Text" row is built directly in @string; the other three rows
+   * are collected separately and appended once the loop is done.
+   */
+  g_string_append (string, "Text: ");
+  s1 = g_string_new ("Char type: ");
+  s2 = g_string_new ("Break type: ");
+  s3 = g_string_new ("Script: ");
+
+  /* Pad all four labels to the width of the longest one */
+  m = MAX (MAX (s1->len, s2->len), s3->len);
+
+  g_string_append_printf (s1, "%*s", (int)(m - s1->len), "");
+  g_string_append_printf (s2, "%*s", (int)(m - s2->len), "");
+  g_string_append_printf (s3, "%*s", (int)(m - s3->len), "");
+  g_string_append_printf (string, "%*s", (int)(m - strlen ("Text: ")), "");
+
+  /* The loop stops at the first NUL byte; length is not consulted */
+  for (p = contents; *p; p = g_utf8_next_char (p))
+    {
+      gunichar ch = g_utf8_get_char (p);
+      const char *ctype = char_type (g_unichar_type (ch));
+      const char *btype = break_type (g_unichar_break_type (ch));
+      GUnicodeScript script = g_unichar_get_script (ch);
+      /* c, b, s and t are the widths written into this column of the
+       * char type, break type, script and text rows, respectively
+       */
+      int c = strlen (ctype);
+      int b = strlen (btype);
+      int s = 0;
+      int t = 0;
+
+      g_string_append_printf (s1, "%s", ctype);
+      g_string_append_printf (s2, "%s", btype);
+
+      /* The script is only printed when it differs from the script
+       * of the previous character
+       */
+      if (prev_script != script)
+        {
+          const char *str = script_name (script);
+          prev_script = script;
+          g_string_append (s3, str);
+          s = strlen (str);
+        }
+
+      if (ch == 0x20)
+        {
+          /* make a plain space visible */
+          g_string_append (string, "[ ]");
+          t = 3;
+        }
+      else if (g_unichar_isgraph (ch) &&
+               (ch != 0x2028) &&
+               (ch != 0x2029))
+        {
+          /* shown as is, and counted as a single column */
+          g_string_append_unichar (string, ch);
+          t = 1;
+        }
+      else
+        {
+          /* everything else is shown as its code point in hex */
+          char *str = g_strdup_printf ("[%#04x]", ch);
+          g_string_append (string, str); 
+          t = strlen (str);
+          g_free (str);
+        }
+
+      /* Column width: the text cell as is, the other cells followed
+       * by at least one space
+       */
+      m = MAX (t, MAX (MAX (c + 1, b + 1), s + 1));
+
+      g_string_append_printf (string, "%*s", m - t, "");
+      g_string_append_printf (s1, "%*s", m - c, "");
+      g_string_append_printf (s2, "%*s", m - b, "");
+      g_string_append_printf (s3, "%*s", m - s, "");
+    }
+
+  /* Terminate the text row, then append the three other rows */
+  g_string_append (string, "\n");
+  g_string_append_len (string, s1->str, s1->len);
+  g_string_append (string, "\n");
+  g_string_append_len (string, s2->str, s2->len);
+  g_string_append (string, "\n");
+  g_string_append_len (string, s3->str, s3->len);
+  g_string_append (string, "\n");
+
+  g_string_free (s1, TRUE);
+  g_string_free (s2, TRUE);
+  g_string_free (s3, TRUE);
+
+  g_free (contents);
+}
+
+/*
+ * get_expected_filename:
+ * @filename: path of a test input file
+ *
+ * Derives the path of the file holding the expected output: a trailing
+ * ".chars" is replaced by ".expected"; if @filename does not end in
+ * ".chars", ".expected" is simply appended.
+ *
+ * Only the end of @filename is examined, so a ".chars" that occurs
+ * earlier in the path (e.g. in a directory name) is left alone.
+ *
+ * Returns: a newly allocated string, to be freed with g_free()
+ */
+static gchar *
+get_expected_filename (const gchar *filename)
+{
+  const gchar *suffix = ".chars";
+  gsize stem_len = strlen (filename);
+  gchar *stem, *expected;
+
+  if (g_str_has_suffix (filename, suffix))
+    stem_len -= strlen (suffix);
+
+  stem = g_strndup (filename, stem_len);
+  expected = g_strconcat (stem, ".expected", NULL);
+
+  g_free (stem);
+
+  return expected;
+}
+
+/*
+ * test_break:
+ * @d: path of the input file, passed as test data
+ *
+ * Test function: dumps the data for the input file with test_file()
+ * and compares the dump against the matching expected file.  The test
+ * is marked as failed if diff_with_file() returns non-empty output.
+ */
+static void
+test_break (gconstpointer d)
+{
+  const char *filename = d;
+  char *expected_file;
+  GError *error = NULL;
+  GString *dump;
+  char *diff = NULL;
+
+  expected_file = get_expected_filename (filename);
+
+  dump = g_string_sized_new (0);
+
+  test_file (filename, dump);
+
+  diff = diff_with_file (expected_file, dump->str, dump->len, &error);
+  g_assert_no_error (error);
+
+  if (diff && diff[0])
+    {
+      g_printerr ("Contents don't match expected contents:\n%s", diff);
+      g_test_fail ();
+    }
+
+  /* Free unconditionally: on success diff is an empty, but still
+   * allocated, string
+   */
+  g_free (diff);
+  g_string_free (dump, TRUE);
+  g_free (expected_file);
+}
+
+/*
+ * With a file name as argument, dumps the data for that file to
+ * standard output and exits; this is meant for generating the
+ * expected output of new test cases.
+ *
+ * Without arguments, registers one test per "*.chars" file found in
+ * the "chars" directory of the test data and runs them.
+ */
+int
+main (int argc, char *argv[])
+{
+  GDir *dir;
+  GError *error = NULL;
+  const gchar *name;
+  gchar *path;
+
+  g_setenv ("LC_ALL", "en_US.UTF-8", TRUE);
+  setlocale (LC_ALL, "");
+
+  g_test_init (&argc, &argv, NULL);
+
+  /* allow to easily generate expected output for new test cases */
+  if (argc > 1)
+    {
+      GString *string;
+
+      string = g_string_sized_new (0);
+      test_file (argv[1], string);
+      g_print ("%s", string->str);
+      g_string_free (string, TRUE);
+
+      return 0;
+    }
+
+  path = g_test_build_filename (G_TEST_DIST, "chars", NULL);
+  dir = g_dir_open (path, 0, &error);
+  g_free (path);
+  g_assert_no_error (error);
+  while ((name = g_dir_read_name (dir)) != NULL)
+    {
+      /* Only input files are test cases; skip the .expected files and
+       * anything else that happens to live in the directory
+       */
+      if (!g_str_has_suffix (name, ".chars"))
+        continue;
+
+      path = g_strdup_printf ("/chars/%s", name);
+      /* the file name is owned by the test and released with g_free */
+      g_test_add_data_func_full (path, g_test_build_filename (G_TEST_DIST, "chars", name, NULL),
+                                 test_break, g_free);
+      g_free (path);
+    }
+  g_dir_close (dir);
+
+  return g_test_run ();
+}
author	Matthias Clasen <mclasen@redhat.com>	2019-07-08 13:38:36 -0400
committer	Matthias Clasen <mclasen@redhat.com>	2019-07-08 13:38:36 -0400
commit	5988509d25a99836e904988be57541997e237f23 (patch)
tree	e53f3367ba90bd038253f98a0df58fd454367826
parent	18cfe50caa1dc6e3295ad6bf5151b14ffbcfbc37 (diff)
download	glib-unicode-test.tar.gz