diff options
author | Gabriel Ivascu <gabrielivascu@gnome.org> | 2017-12-17 17:28:02 +0200 |
---|---|---|
committer | Gabriel Ivascu <gabrielivascu@gnome.org> | 2017-12-17 17:34:22 +0200 |
commit | c0ff0b543ef9cb4e26cbc96b6d2857d2e0ac2dce (patch) | |
tree | c0ce4b586eee1287279fdf1f6b23d63135fe41d6 | |
parent | db7a7389b33dcd7715694e25fbfd2ff274add14e (diff) | |
download | epiphany-wip/idn-display.tar.gz |
uri-helpers: Implement Mozilla's IDN display algorithm (wip/idn-display)
https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm
https://bugzilla.gnome.org/show_bug.cgi?id=791168
-rw-r--r-- | lib/ephy-uri-helpers.c | 159 | ||||
-rw-r--r-- | lib/ephy-uri-helpers.h | 9 | ||||
-rw-r--r-- | tests/ephy-uri-helpers-test.c | 69 |
3 files changed, 232 insertions, 5 deletions
diff --git a/lib/ephy-uri-helpers.c b/lib/ephy-uri-helpers.c index 7b0059331..ea5157713 100644 --- a/lib/ephy-uri-helpers.c +++ b/lib/ephy-uri-helpers.c @@ -252,6 +252,36 @@ ephy_remove_tracking_from_uri (const char *uri_string) return ret; } +static char * +evaluate_host_for_display (const char *original_host, + const char *unicode_host) +{ + char **original_labels; + char **unicode_labels; + char *retval; + + g_assert (original_host); + g_assert (unicode_host); + + /* These arrays are expected to have the same length. NOTE(review): this is + * not checked here, and original_labels[i] is read for every index of + * unicode_labels; confirm the caller guarantees matching label counts. */ + original_labels = g_strsplit (original_host, ".", -1); + unicode_labels = g_strsplit (unicode_host, ".", -1); + + for (guint i = 0; i < g_strv_length (unicode_labels); i++) { + if (!ephy_uri_validate_label (unicode_labels[i])) { + g_free (unicode_labels[i]); + unicode_labels[i] = g_strdup (original_labels[i]); + } + } + + retval = g_strjoinv (".", unicode_labels); + g_strfreev (original_labels); + g_strfreev (unicode_labels); + + return retval; +} + + /* Use this function to format a URI for display. The URIs used * internally by WebKit may contain percent-encoded characters or * punycode, which we do not want the user to see. @@ -266,6 +296,7 @@ ephy_uri_decode (const char *uri_string) static GMutex idna_creation_mutex; SoupURI *uri; char *percent_encoded_uri; + char *percent_decoded_host; char *idna_decoded_name; char *fully_decoded_uri; UIDNAInfo info = UIDNA_INFO_INITIALIZER; @@ -307,8 +338,10 @@ ephy_uri_decode (const char *uri_string) return g_strdup (uri_string); } + percent_decoded_host = soup_uri_decode (uri->host); g_free (uri->host); - uri->host = idna_decoded_name; + uri->host = evaluate_host_for_display (percent_decoded_host, idna_decoded_name); + g_free (percent_decoded_host); } /* Note: this also strips passwords from the display URI. */ @@ -357,3 +390,127 @@ ephy_uri_to_security_origin (const char *uri_string) return result != NULL ? 
g_strdup (result) : NULL; } + +static inline void +script_table_update (GHashTable *table, + GUnicodeScript script) +{ + gpointer value; + gpointer new_value; + + value = g_hash_table_lookup (table, GINT_TO_POINTER (script)); + new_value = GINT_TO_POINTER (GPOINTER_TO_INT (value) + 1); + g_hash_table_replace (table, GINT_TO_POINTER (script), new_value); +} + +static inline int +script_table_get (GHashTable *table, + GUnicodeScript script) +{ + gpointer value; + + value = g_hash_table_lookup (table, GINT_TO_POINTER (script)); + return GPOINTER_TO_INT (value); +} + +/** + * ephy_uri_validate_label: + * @label: a domain label, UTF-8 encoded + * + * Verifies whether @label is safe to be displayed as Unicode characters, as per + * this algorithm: https://wiki.mozilla.org/IDN_Display_Algorithm#Algorithm. If + * %FALSE is returned, then @label should be displayed as Punycode text. + * + * A @label that is not valid UTF-8, or that contains digits taken from more + * than one numbering system, is reported as unsafe. + * + * Return value: %TRUE if @label is considered safe, %FALSE otherwise + **/ +gboolean +ephy_uri_validate_label (const char *label) +{ + GHashTable *table; + GUnicodeScript script; + gunichar *unichars; + gunichar saved_zero_char = 0; + gboolean retval = FALSE; + long num; + + g_assert (label); + + if (!g_utf8_validate (label, -1, NULL)) + return FALSE; + + /* Use a hash table to count the occurrences of every script, + * except Common and Inherited. num starts as the character count of the + * label and is decremented for each Common or Inherited character, so after + * the loop it equals the sum of all counts stored in the table. */ + table = g_hash_table_new (g_direct_hash, g_direct_equal); + unichars = g_utf8_to_ucs4_fast (label, -1, &num); + + for (gunichar *u = unichars; u && *u; u++) { + script = g_unichar_get_script (*u); + + if (script != G_UNICODE_SCRIPT_COMMON && script != G_UNICODE_SCRIPT_INHERITED) + script_table_update (table, script); + else + num--; + + /* Check for mixed numbering systems: the zero character of each digit + * (the digit minus its numeric value) must match the first one seen. */ + if (g_unichar_isdigit (*u)) { + gunichar zero_char = *u - g_unichar_digit_value (*u); + if (saved_zero_char == 0) + saved_zero_char = zero_char; + else if (zero_char != saved_zero_char) + goto out; + } + } + + /* Single script, allow. 
*/ + if (g_hash_table_size (table) < 2) { + retval = TRUE; + goto out; + } + + /* Chinese scripts: allow when every counted character is Latin, Han or + * Bopomofo. */ + if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) + + script_table_get (table, G_UNICODE_SCRIPT_HAN) + + script_table_get (table, G_UNICODE_SCRIPT_BOPOMOFO) == num) { + retval = TRUE; + goto out; + } + + /* Korean scripts: allow when every counted character is Latin, Han or + * Hangul. */ + if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) + + script_table_get (table, G_UNICODE_SCRIPT_HAN) + + script_table_get (table, G_UNICODE_SCRIPT_HANGUL) == num) { + retval = TRUE; + goto out; + } + + /* Japanese scripts: allow when every counted character is Latin, Han, + * Hiragana or Katakana. */ + if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) + + script_table_get (table, G_UNICODE_SCRIPT_HAN) + + script_table_get (table, G_UNICODE_SCRIPT_HIRAGANA) + + script_table_get (table, G_UNICODE_SCRIPT_KATAKANA) == num) { + retval = TRUE; + goto out; + } + + /* Ban mixes of more than two scripts. */ + if (g_hash_table_size (table) > 2) + goto out; + + /* Ban any mix of two scripts that does not contain Latin. */ + if (script_table_get (table, G_UNICODE_SCRIPT_LATIN) == 0) + goto out; + + /* Ban Latin + Cyrillic or Latin + Greek. */ + if (script_table_get (table, G_UNICODE_SCRIPT_CYRILLIC) > 0 || + script_table_get (table, G_UNICODE_SCRIPT_GREEK) > 0) + goto out; + + /* Allow Latin + any other single script. 
*/ + retval = TRUE; + +out: + g_hash_table_unref (table); + g_free (unichars); + + return retval; +} diff --git a/lib/ephy-uri-helpers.h b/lib/ephy-uri-helpers.h index cf46a6a5f..9416b0028 100644 --- a/lib/ephy-uri-helpers.h +++ b/lib/ephy-uri-helpers.h @@ -25,9 +25,10 @@ G_BEGIN_DECLS -char *ephy_remove_tracking_from_uri (const char *uri); -char *ephy_uri_decode (const char *uri); -char *ephy_uri_normalize (const char *uri); -char *ephy_uri_to_security_origin (const char *uri); +char *ephy_remove_tracking_from_uri (const char *uri); +char *ephy_uri_decode (const char *uri); +char *ephy_uri_normalize (const char *uri); +char *ephy_uri_to_security_origin (const char *uri); +gboolean ephy_uri_validate_label (const char *label); G_END_DECLS diff --git a/tests/ephy-uri-helpers-test.c b/tests/ephy-uri-helpers-test.c index f536ab9eb..b21032a15 100644 --- a/tests/ephy-uri-helpers-test.c +++ b/tests/ephy-uri-helpers-test.c @@ -65,6 +65,71 @@ test_ephy_uri_helpers_remove_tracking (void) } } +static void +test_ephy_uri_helpers_validate_label (void) +{ + struct { + const char *label; + gboolean is_ok; + } const items[] = { + { "abcdef", TRUE }, /* Latin-only, OK. */ + { "αβγχψω", TRUE }, /* Greek-only, OK. */ + { "αaβbγcχxψyωz", FALSE }, /* Latin + Greek, NOT OK. */ + { "абгдеж", TRUE }, /* Cyrillic-only, OK. */ + { "аaбbгcдdеeжf", FALSE }, /* Latin + Cyrillic, NOT OK. */ + { "αаβбγгχдψеωж", FALSE }, /* Greek + Cyrillic, NOT OK. */ + { "おかがキギク", TRUE }, /* Japanese mix, OK. */ + { "おaかbがcキdギeクf", TRUE }, /* Latin + Japanese mix, OK. */ + { "ㄈㄉㄊ⻕⻒⼣", TRUE }, /* Chinese mix, OK. */ + { "ㄈaㄉbㄊc⻕d⻒e⼣f", TRUE }, /* Latin + Chinese mix, OK. */ + { "ᄊᄋᄌᄍᄎᄏ", TRUE }, /* Korean mix, OK. */ + { "ᄊaᄋbᄌcᄍdᄎeᄏf", TRUE }, /* Latin + Korean mix, OK. */ + { "ㄈㄉㄊおかが", FALSE }, /* Chinese + Japanese, NOT OK. */ + { "ㄈㄉㄊᄊᄋᄌ", FALSE }, /* Chinese + Korean, NOT OK. */ + { "おかがᄊᄋᄌ", FALSE }, /* Japanese + Korean, NOT OK. */ + { "abꓚꓛᎪᎫ", FALSE }, /* Latin + Lisu + Cherokee (3 scripts), NOT OK. 
*/ + { "ꓚꓛꓜᎪᎫᎬ", FALSE }, /* Lisu + Cherokee, NOT OK. */ + { "abc𐒊𐒋𐒌", TRUE }, /* Latin + single script except Greek or Cyrillic, OK. */ + }; + + for (guint i = 0; i < G_N_ELEMENTS (items); i++) + g_assert (ephy_uri_validate_label (items[i].label) == items[i].is_ok); +} + +static void +test_ephy_uri_helpers_uri_decode (void) +{ + struct { + const char *input; + const char *output; + } const items[] = { + { "http://abcdef.com/", "http://abcdef.com/" }, + { "http://xn--mxacd4ffg.com/", "http://αβγχψω.com/" }, + { "http://xn--abcxyz-oxejk5rman.com/", "http://xn--abcxyz-oxejk5rman.com/" }, + { "http://xn--80acgefg.com/", "http://абгдеж.com/" }, + { "http://xn--abcdef-2nfjtlmn.com/", "http://xn--abcdef-2nfjtlmn.com/" }, + { "http://xn--mxacd4ffg03ejatlmn.com/", "http://xn--mxacd4ffg03ejatlmn.com/" }, + { "http://xn--t8jcd20bfag.com/", "http://おかがキギク.com/" }, + { "http://xn--abcdef-253ejak58gman.com/", "http://おaかbがcキdギeクf.com/" }, + { "http://xn--2xjf20oeaf2262d.com/", "http://ㄈㄉㄊ⻕⻒夕.com/" }, + { "http://xn--abcdef-z01ewa771clam1314j.com/", "http://ㄈaㄉbㄊc⻕d⻒e夕f.com/" }, + { "http://xn--8pdcdefg.com/", "http://ᄊᄋᄌᄍᄎᄏ.com/" }, + { "http://xn--abcdef-2gyjklmn.com/", "http://ᄊaᄋbᄌcᄍdᄎeᄏf.com/" }, + { "http://xn--t8jcd36efag.com/", "http://xn--t8jcd36efag.com/" }, + { "http://xn--8pdcd3683afag.com/", "http://xn--8pdcd3683afag.com/" }, + { "http://xn--8pdcd3013afag.com/", "http://xn--8pdcd3013afag.com/" }, + { "http://xn--ab-u9le7496pga.com/", "http://xn--ab-u9le7496pga.com/" }, + { "http://xn--g9dcd5779lfag.com/", "http://xn--g9dcd5779lfag.com/" }, + { "http://xn--abc-dn1tfag.com/", "http://abc𐒊𐒋𐒌.com/" }, + }; + + for (guint i = 0; i < G_N_ELEMENTS (items); i++) { + char *decoded = ephy_uri_decode (items[i].input); + g_assert_cmpstr (decoded, ==, items[i].output); + g_free (decoded); + } +} + int main (int argc, char *argv[]) { @@ -80,6 +145,10 @@ main (int argc, char *argv[]) g_test_add_func ("/lib/ephy-uri-helpers/remove_tracking", test_ephy_uri_helpers_remove_tracking); + 
g_test_add_func ("/lib/ephy-uri-helpers/validate_label", + test_ephy_uri_helpers_validate_label); + g_test_add_func ("/lib/ephy-uri-helpers/uri_decode", + test_ephy_uri_helpers_uri_decode); ret = g_test_run (); |