#undef G_DISABLE_ASSERT #undef G_LOG_DOMAIN #include #include #include #include static char * decode (const gchar *input) { unsigned ch; int offset = 0; GString *result = g_string_new (NULL); do { g_assert_cmpint (sscanf (input + offset, "%x", &ch), ==, 1); g_string_append_unichar (result, ch); while (input[offset] && input[offset] != ' ') offset++; while (input[offset] && input[offset] == ' ') offset++; } while (input[offset]); return g_string_free (result, FALSE); } const char *names[4] = { "NFD", "NFC", "NFKD", "NFKC" }; static void test_form (int line, GNormalizeMode mode, gboolean do_compat, int expected, char **c, char **raw) { int i; gboolean mode_is_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD); if (mode_is_compat || !do_compat) { for (i = 0; i < 3; i++) { char *result = g_utf8_normalize (c[i], -1, mode); g_assert_cmpstr (result, ==, c[expected]); g_free (result); } } if (mode_is_compat || do_compat) { for (i = 3; i < 5; i++) { char *result = g_utf8_normalize (c[i], -1, mode); g_assert_cmpstr (result, ==, c[expected]); g_free (result); } } } static void process_one (int line, gchar **columns) { char *c[5]; int i; for (i = 0; i < 5; i++) { c[i] = decode (columns[i]); g_assert_nonnull (c[i]); } test_form (line, G_NORMALIZE_NFD, FALSE, 2, c, columns); test_form (line, G_NORMALIZE_NFD, TRUE, 4, c, columns); test_form (line, G_NORMALIZE_NFC, FALSE, 1, c, columns); test_form (line, G_NORMALIZE_NFC, TRUE, 3, c, columns); test_form (line, G_NORMALIZE_NFKD, TRUE, 4, c, columns); test_form (line, G_NORMALIZE_NFKC, TRUE, 3, c, columns); for (i = 0; i < 5; i++) g_free (c[i]); } static void test_unicode_normalize (void) { GIOChannel *in; GError *error = NULL; gchar *filename = NULL; GString *buffer = g_string_new (NULL); int line = 1; filename = g_test_build_filename (G_TEST_DIST, "NormalizationTest.txt", NULL); g_assert_nonnull (filename); in = g_io_channel_new_file (filename, "r", &error); g_assert_no_error (error); g_assert_nonnull (in); g_free (filename); while (TRUE) { gsize term_pos; gchar **columns; if (g_io_channel_read_line_string (in, buffer, &term_pos, &error) != G_IO_STATUS_NORMAL) break; buffer->str[term_pos] = '\0'; if (buffer->str[0] == '#') /* Comment */ goto next; if (buffer->str[0] == '@') /* Part */ { g_test_message ("Processing %s", buffer->str + 1); goto next; } columns = g_strsplit (buffer->str, ";", -1); if (!columns[0]) { g_strfreev (columns); goto next; } process_one (line, columns); g_strfreev (columns); next: g_string_truncate (buffer, 0); line++; } g_assert_no_error (error); g_io_channel_unref (in); g_string_free (buffer, TRUE); } static void test_unicode_normalize_invalid (void) { /* g_utf8_normalize() should return NULL for all of these invalid inputs */ const struct { gssize max_len; const gchar *str; } test_vectors[] = { /* input ending with truncated multibyte encoding */ { -1, "\xC0" }, { 1, "\xC0\x80" }, { -1, "\xE0\x80" }, { 2, "\xE0\x80\x80" }, { -1, "\xF0\x80\x80" }, { 3, "\xF0\x80\x80\x80" }, { -1, "\xF8\x80\x80\x80" }, { 4, "\xF8\x80\x80\x80\x80" }, { 3, "\x20\xE2\x84\xAA" }, { -1, "\x20\xE2\x00\xAA" }, { -1, "\xC0\x80\xE0\x80" }, { 4, "\xC0\x80\xE0\x80\x80" }, /* input containing invalid multibyte encoding */ { -1, "\xED\x85\x9C\xED\x15\x9C\xED\x85\x9C" }, }; gsize i; for (i = 0; i < G_N_ELEMENTS (test_vectors); i++) { g_test_message ("Invalid UTF-8 vector %" G_GSIZE_FORMAT, i); g_assert_null (g_utf8_normalize (test_vectors[i].str, test_vectors[i].max_len, G_NORMALIZE_ALL)); } } int main (int argc, char **argv) { g_test_init (&argc, &argv, NULL); g_test_add_func ("/unicode/normalize", test_unicode_normalize); g_test_add_func ("/unicode/normalize-invalid", test_unicode_normalize_invalid); return g_test_run (); }