summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGabriel Ivascu <gabrielivascu@gnome.org>2017-12-17 17:28:02 +0200
committerGabriel Ivascu <gabrielivascu@gnome.org>2017-12-17 17:34:22 +0200
commitc0ff0b543ef9cb4e26cbc96b6d2857d2e0ac2dce (patch)
treec0ce4b586eee1287279fdf1f6b23d63135fe41d6
parentdb7a7389b33dcd7715694e25fbfd2ff274add14e (diff)
downloadepiphany-wip/idn-display.tar.gz
uri-helpers: Implement Mozilla's IDN display algorithmwip/idn-display
https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm https://bugzilla.gnome.org/show_bug.cgi?id=791168
-rw-r--r--lib/ephy-uri-helpers.c159
-rw-r--r--lib/ephy-uri-helpers.h9
-rw-r--r--tests/ephy-uri-helpers-test.c69
3 files changed, 232 insertions, 5 deletions
diff --git a/lib/ephy-uri-helpers.c b/lib/ephy-uri-helpers.c
index 7b0059331..ea5157713 100644
--- a/lib/ephy-uri-helpers.c
+++ b/lib/ephy-uri-helpers.c
@@ -252,6 +252,36 @@ ephy_remove_tracking_from_uri (const char *uri_string)
return ret;
}
+/* Builds the host name to show to the user, one label at a time.
+ *
+ * Both hosts are split on "."; every label of @unicode_host that
+ * ephy_uri_validate_label() rejects is replaced by the label found at the
+ * same position in @original_host, and the labels are joined back with ".".
+ *
+ * Returns: a newly allocated string, to be released with g_free().
+ */
+static char *
+evaluate_host_for_display (const char *original_host,
+                           const char *unicode_host)
+{
+  char **original_labels;
+  char **unicode_labels;
+  guint n_labels;
+  char *retval;
+
+  g_assert (original_host);
+  g_assert (unicode_host);
+
+  original_labels = g_strsplit (original_host, ".", -1);
+  unicode_labels = g_strsplit (unicode_host, ".", -1);
+  n_labels = g_strv_length (unicode_labels);
+
+  /* The substitution below indexes both arrays with the same position, so it
+   * is only valid when the two hosts split into the same number of labels.
+   * NOTE(review): the counts can presumably differ when IDNA processing maps
+   * another separator character to "." - confirm against the caller. In that
+   * case keep the original host untouched instead of reading past the end of
+   * original_labels. */
+  if (g_strv_length (original_labels) == n_labels) {
+    for (guint i = 0; i < n_labels; i++) {
+      if (!ephy_uri_validate_label (unicode_labels[i])) {
+        g_free (unicode_labels[i]);
+        unicode_labels[i] = g_strdup (original_labels[i]);
+      }
+    }
+
+    retval = g_strjoinv (".", unicode_labels);
+  } else {
+    retval = g_strdup (original_host);
+  }
+
+  g_strfreev (original_labels);
+  g_strfreev (unicode_labels);
+
+  return retval;
+}
+
+
/* Use this function to format a URI for display. The URIs used
* internally by WebKit may contain percent-encoded characters or
* punycode, which we do not want the user to see.
@@ -266,6 +296,7 @@ ephy_uri_decode (const char *uri_string)
static GMutex idna_creation_mutex;
SoupURI *uri;
char *percent_encoded_uri;
+ char *percent_decoded_host;
char *idna_decoded_name;
char *fully_decoded_uri;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;
@@ -307,8 +338,10 @@ ephy_uri_decode (const char *uri_string)
return g_strdup (uri_string);
}
+ percent_decoded_host = soup_uri_decode (uri->host);
g_free (uri->host);
- uri->host = idna_decoded_name;
+ uri->host = evaluate_host_for_display (percent_decoded_host, idna_decoded_name);
+ g_free (percent_decoded_host);
}
/* Note: this also strips passwords from the display URI. */
@@ -357,3 +390,127 @@ ephy_uri_to_security_origin (const char *uri_string)
return result != NULL ? g_strdup (result) : NULL;
}
+
+/* Adds one to the occurrence count stored for @script in @table. A script
+ * that has no entry yet is looked up as NULL, i.e. a count of zero, so it
+ * ends up with a count of one. */
+static inline void
+script_table_update (GHashTable     *table,
+                     GUnicodeScript  script)
+{
+  gpointer key = GINT_TO_POINTER (script);
+  int count = GPOINTER_TO_INT (g_hash_table_lookup (table, key));
+
+  g_hash_table_replace (table, key, GINT_TO_POINTER (count + 1));
+}
+
+/* Returns the occurrence count stored for @script in @table. The looked-up
+ * pointer is converted straight to an int, so a missing entry (NULL) reads
+ * as zero. */
+static inline int
+script_table_get (GHashTable     *table,
+                  GUnicodeScript  script)
+{
+  return GPOINTER_TO_INT (g_hash_table_lookup (table, GINT_TO_POINTER (script)));
+}
+
+/**
+ * ephy_uri_validate_label:
+ * @label: a domain label, UTF-8 encoded
+ *
+ * Decides whether @label may be shown to the user as Unicode text, following
+ * https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm. A label is
+ * rejected when it is not valid UTF-8, when its digits belong to more than
+ * one numbering system, or when it mixes scripts in a combination other than
+ * the ones accepted below. A rejected label should be displayed as Punycode.
+ *
+ * Return value: %TRUE if @label is considered safe, %FALSE otherwise
+ **/
+gboolean
+ephy_uri_validate_label (const char *label)
+{
+  GHashTable *script_counts;
+  gunichar *codepoints;
+  gunichar first_zero = 0;
+  gboolean is_safe = FALSE;
+  guint n_scripts;
+  int latin_han;
+  long n_scripted;
+
+  g_assert (label);
+
+  if (!g_utf8_validate (label, -1, NULL))
+    return FALSE;
+
+  /* script_counts maps every script seen in the label to its number of
+   * occurrences; Common and Inherited characters are never recorded.
+   * n_scripted starts as the total number of characters and is decremented
+   * for each Common/Inherited one, so after the loop it equals the number
+   * of characters that were recorded in the table. */
+  script_counts = g_hash_table_new (g_direct_hash, g_direct_equal);
+  codepoints = g_utf8_to_ucs4_fast (label, -1, &n_scripted);
+
+  for (gsize i = 0; codepoints && codepoints[i] != 0; i++) {
+    gunichar c = codepoints[i];
+    GUnicodeScript script = g_unichar_get_script (c);
+
+    if (script == G_UNICODE_SCRIPT_COMMON || script == G_UNICODE_SCRIPT_INHERITED)
+      n_scripted--;
+    else
+      script_table_update (script_counts, script);
+
+    /* Mixed numbering systems: a digit's code point minus its value is used
+     * to identify the numbering system it belongs to; every digit in the
+     * label has to agree with the first one. */
+    if (g_unichar_isdigit (c)) {
+      gunichar zero = c - g_unichar_digit_value (c);
+
+      if (first_zero == 0)
+        first_zero = zero;
+      else if (first_zero != zero)
+        goto out;
+    }
+  }
+
+  n_scripts = g_hash_table_size (script_counts);
+  latin_han = script_table_get (script_counts, G_UNICODE_SCRIPT_LATIN) +
+              script_table_get (script_counts, G_UNICODE_SCRIPT_HAN);
+
+  if (n_scripts < 2) {
+    /* At most one script: allow. */
+    is_safe = TRUE;
+  } else if (latin_han +
+             script_table_get (script_counts, G_UNICODE_SCRIPT_BOPOMOFO) == n_scripted) {
+    /* Chinese: only Latin, Han and Bopomofo. */
+    is_safe = TRUE;
+  } else if (latin_han +
+             script_table_get (script_counts, G_UNICODE_SCRIPT_HANGUL) == n_scripted) {
+    /* Korean: only Latin, Han and Hangul. */
+    is_safe = TRUE;
+  } else if (latin_han +
+             script_table_get (script_counts, G_UNICODE_SCRIPT_HIRAGANA) +
+             script_table_get (script_counts, G_UNICODE_SCRIPT_KATAKANA) == n_scripted) {
+    /* Japanese: only Latin, Han, Hiragana and Katakana. */
+    is_safe = TRUE;
+  } else if (n_scripts == 2 &&
+             script_table_get (script_counts, G_UNICODE_SCRIPT_LATIN) > 0) {
+    /* Exactly two scripts, one of them Latin: allowed unless the other one
+     * is Cyrillic or Greek. Any other mix (more than two scripts, or two
+     * scripts without Latin) stays rejected. */
+    is_safe = script_table_get (script_counts, G_UNICODE_SCRIPT_CYRILLIC) == 0 &&
+              script_table_get (script_counts, G_UNICODE_SCRIPT_GREEK) == 0;
+  }
+
+out:
+  g_hash_table_unref (script_counts);
+  g_free (codepoints);
+
+  return is_safe;
+}
diff --git a/lib/ephy-uri-helpers.h b/lib/ephy-uri-helpers.h
index cf46a6a5f..9416b0028 100644
--- a/lib/ephy-uri-helpers.h
+++ b/lib/ephy-uri-helpers.h
@@ -25,9 +25,10 @@
G_BEGIN_DECLS
-char *ephy_remove_tracking_from_uri (const char *uri);
-char *ephy_uri_decode (const char *uri);
-char *ephy_uri_normalize (const char *uri);
-char *ephy_uri_to_security_origin (const char *uri);
+char *ephy_remove_tracking_from_uri (const char *uri);
+char *ephy_uri_decode (const char *uri);
+char *ephy_uri_normalize (const char *uri);
+char *ephy_uri_to_security_origin (const char *uri);
+gboolean ephy_uri_validate_label (const char *label);
G_END_DECLS
diff --git a/tests/ephy-uri-helpers-test.c b/tests/ephy-uri-helpers-test.c
index f536ab9eb..b21032a15 100644
--- a/tests/ephy-uri-helpers-test.c
+++ b/tests/ephy-uri-helpers-test.c
@@ -65,6 +65,71 @@ test_ephy_uri_helpers_remove_tracking (void)
}
}
+/* Checks ephy_uri_validate_label() against a table of labels, each paired
+ * with the verdict expected for its script combination. */
+static void
+test_ephy_uri_helpers_validate_label (void)
+{
+  struct {
+    const char *label;
+    gboolean is_ok;
+  } const items[] = {
+    { "abcdef", TRUE }, /* Latin-only, OK. */
+    { "αβγχψω", TRUE }, /* Greek-only, OK. */
+    { "αaβbγcχxψyωz", FALSE }, /* Latin + Greek, NOT OK. */
+    { "абгдеж", TRUE }, /* Cyrillic-only, OK. */
+    { "аaбbгcдdеeжf", FALSE }, /* Latin + Cyrillic, NOT OK. */
+    { "αаβбγгχдψеωж", FALSE }, /* Greek + Cyrillic, NOT OK. */
+    { "おかがキギク", TRUE }, /* Japanese mix, OK. */
+    { "おaかbがcキdギeクf", TRUE }, /* Latin + Japanese mix, OK. */
+    { "ㄈㄉㄊ⻕⻒⼣", TRUE }, /* Chinese mix, OK. */
+    { "ㄈaㄉbㄊc⻕d⻒e⼣f", TRUE }, /* Latin + Chinese mix, OK. */
+    { "ᄊᄋᄌᄍᄎᄏ", TRUE }, /* Korean mix, OK. */
+    { "ᄊaᄋbᄌcᄍdᄎeᄏf", TRUE }, /* Latin + Korean mix, OK. */
+    { "ㄈㄉㄊおかが", FALSE }, /* Chinese + Japanese, NOT OK. */
+    { "ㄈㄉㄊᄊᄋᄌ", FALSE }, /* Chinese + Korean, NOT OK. */
+    { "おかがᄊᄋᄌ", FALSE }, /* Japanese + Korean, NOT OK. */
+    { "abꓚꓛᎪᎫ", FALSE }, /* Latin + Lisu + Cherokee (3 scripts), NOT OK. */
+    { "ꓚꓛꓜᎪᎫᎬ", FALSE }, /* Lisu + Cherokee, NOT OK. */
+    { "abc𐒊𐒋𐒌", TRUE }, /* Latin + single script except Greek or Cyrillic, OK. */
+  };
+
+  /* g_assert_true() rather than g_assert(): the latter expands to nothing
+   * when G_DISABLE_ASSERT is defined, which would make this test pass
+   * without checking anything. */
+  for (guint i = 0; i < G_N_ELEMENTS (items); i++)
+    g_assert_true (ephy_uri_validate_label (items[i].label) == items[i].is_ok);
+}
+
+/* Runs ephy_uri_decode() over a table of URIs with Punycode hosts and
+ * compares each result with the expected display string: either the host
+ * rendered as Unicode or the input returned unchanged. */
+static void
+test_ephy_uri_helpers_uri_decode (void)
+{
+  struct {
+    const char *uri;
+    const char *expected;
+  } const cases[] = {
+    { "http://abcdef.com/", "http://abcdef.com/" },
+    { "http://xn--mxacd4ffg.com/", "http://αβγχψω.com/" },
+    { "http://xn--abcxyz-oxejk5rman.com/", "http://xn--abcxyz-oxejk5rman.com/" },
+    { "http://xn--80acgefg.com/", "http://абгдеж.com/" },
+    { "http://xn--abcdef-2nfjtlmn.com/", "http://xn--abcdef-2nfjtlmn.com/" },
+    { "http://xn--mxacd4ffg03ejatlmn.com/", "http://xn--mxacd4ffg03ejatlmn.com/" },
+    { "http://xn--t8jcd20bfag.com/", "http://おかがキギク.com/" },
+    { "http://xn--abcdef-253ejak58gman.com/", "http://おaかbがcキdギeクf.com/" },
+    { "http://xn--2xjf20oeaf2262d.com/", "http://ㄈㄉㄊ⻕⻒夕.com/" },
+    { "http://xn--abcdef-z01ewa771clam1314j.com/", "http://ㄈaㄉbㄊc⻕d⻒e夕f.com/" },
+    { "http://xn--8pdcdefg.com/", "http://ᄊᄋᄌᄍᄎᄏ.com/" },
+    { "http://xn--abcdef-2gyjklmn.com/", "http://ᄊaᄋbᄌcᄍdᄎeᄏf.com/" },
+    { "http://xn--t8jcd36efag.com/", "http://xn--t8jcd36efag.com/" },
+    { "http://xn--8pdcd3683afag.com/", "http://xn--8pdcd3683afag.com/" },
+    { "http://xn--8pdcd3013afag.com/", "http://xn--8pdcd3013afag.com/" },
+    { "http://xn--ab-u9le7496pga.com/", "http://xn--ab-u9le7496pga.com/" },
+    { "http://xn--g9dcd5779lfag.com/", "http://xn--g9dcd5779lfag.com/" },
+    { "http://xn--abc-dn1tfag.com/", "http://abc𐒊𐒋𐒌.com/" },
+  };
+  guint idx = 0;
+
+  while (idx < G_N_ELEMENTS (cases)) {
+    char *display_uri = ephy_uri_decode (cases[idx].uri);
+
+    g_assert_cmpstr (display_uri, ==, cases[idx].expected);
+    g_free (display_uri);
+    idx++;
+  }
+}
+
int
main (int argc, char *argv[])
{
@@ -80,6 +145,10 @@ main (int argc, char *argv[])
g_test_add_func ("/lib/ephy-uri-helpers/remove_tracking",
test_ephy_uri_helpers_remove_tracking);
+ g_test_add_func ("/lib/ephy-uri-helpers/validate_label",
+ test_ephy_uri_helpers_validate_label);
+ g_test_add_func ("/lib/ephy-uri-helpers/uri_decode",
+ test_ephy_uri_helpers_uri_decode);
ret = g_test_run ();