From 4c2d1ac3daed3940217da2477546db31fbdb1b43 Mon Sep 17 00:00:00 2001 From: Dan Winship Date: Wed, 18 Dec 2013 17:06:16 -0500 Subject: soup-tld: accept ASCII-encoded hostnames too It's inconvenient for callers to have to ensure that the hostname they pass in is UTF-8 (since they themselves may not need to care). So accept ASCII-encoded hostnames too, and add the corresponding punycode tests from the publicsuffix.org test list that we were ignoring before. --- libsoup/soup-tld.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++-------- tests/tld-test.c | 26 ++++++++++++++++++++- 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/libsoup/soup-tld.c b/libsoup/soup-tld.c index c6faed13..2e3da62d 100644 --- a/libsoup/soup-tld.c +++ b/libsoup/soup-tld.c @@ -57,7 +57,7 @@ soup_tld_ensure_rules_hash_table (void) /** * soup_tld_get_base_domain: - * @hostname: a UTF-8 hostname in its canonical representation form + * @hostname: a hostname * @error: return location for a #GError, or %NULL to ignore * errors. See #SoupTLDError for the available error codes * @@ -70,9 +70,10 @@ soup_tld_ensure_rules_hash_table (void) * with any well known TLD) because choosing a base domain for them * would be totally arbitrary. * - * This method only works for valid UTF-8 hostnames in their canonical - * representation form, so you should use g_hostname_to_unicode() to - * get the canonical representation if that is not the case. + * Prior to libsoup 2.46, this function required that @hostname be in + * UTF-8 if it was an IDN. From 2.46 on, the name can be in either + * UTF-8 or ASCII format (and the return value will be in the same + * format). * * Returns: a pointer to the start of the base domain in @hostname. If * an error occurs, %NULL will be returned and @error set. 
@@ -83,21 +84,21 @@ const char * soup_tld_get_base_domain (const char *hostname, GError **error) { g_return_val_if_fail (hostname, NULL); - g_return_val_if_fail (!g_hostname_is_ascii_encoded (hostname), FALSE); return soup_tld_get_base_domain_internal (hostname, 1, error); } /** * soup_tld_domain_is_public_suffix: - * @domain: a UTF-8 domain in its canonical representation form + * @domain: a domain name * * Looks whether the @domain passed as argument is a public domain * suffix (.org, .com, .co.uk, etc) or not. * - * This method only works for valid UTF-8 domains in their canonical - * representation form, so you should use g_hostname_to_unicode() to - * get the canonical representation if that is not the case. + * Prior to libsoup 2.46, this function required that @domain be in + * UTF-8 if it was an IDN. From 2.46 on, the name can be in either + * UTF-8 or ASCII format. (The return value is a boolean, so the + * input format has no bearing on its form.) * * Returns: %TRUE if it is a public domain, %FALSE otherwise. 
* @@ -174,8 +175,10 @@ soup_tld_error_quark (void) static const char * soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error) { - char *prev_domain, *cur_domain, *tld, *next_dot; + char *prev_domain, *cur_domain, *next_dot; gint add_domains; + const char *orig_hostname = NULL, *tld; + char *utf8_hostname = NULL; soup_tld_ensure_rules_hash_table (); @@ -186,6 +189,17 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain return NULL; } + if (g_hostname_is_ascii_encoded (hostname)) { + orig_hostname = hostname; + hostname = utf8_hostname = g_hostname_to_unicode (hostname); + if (!hostname) { + g_set_error_literal (error, SOUP_TLD_ERROR, + SOUP_TLD_ERROR_INVALID_HOSTNAME, + _("Invalid hostname")); + return NULL; + } + } + cur_domain = (char *) hostname; tld = cur_domain; prev_domain = NULL; @@ -204,6 +218,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME, _("Invalid hostname")); + g_free (utf8_hostname); return NULL; } @@ -233,6 +248,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN, _("Hostname has no base domain")); + g_free (utf8_hostname); return NULL; } @@ -240,6 +256,37 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain cur_domain = next_dot + 1; } + if (orig_hostname) { + int dots; + const char *p; + + /* Count the number of dots that appear after tld in + * utf8_hostname, and then find the corresponding spot + * in orig_hostname. + */ + for (p = tld, dots = 0; *p; p++) { + if (*p == '.') + dots++; + } + + for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) { + if (*(p - 1) == '.') { + if (dots) + dots--; + else + break; + } + } + /* It's not possible for utf8_hostname to have had + * more dots than orig_hostname. 
+ */ + g_assert (dots == 0); + + tld = p; + g_free (utf8_hostname); + hostname = orig_hostname; + } + /* Include the additional number of domains requested. */ add_domains = additional_domains; while (tld != hostname) { diff --git a/tests/tld-test.c b/tests/tld-test.c index 2b6b5dd9..d1f1de1f 100644 --- a/tests/tld-test.c +++ b/tests/tld-test.c @@ -83,8 +83,30 @@ static struct { { "www.食狮.中国", "食狮.中国" }, { "shishi.中国", "shishi.中国" }, { "中国", NULL }, - /* This is not in http://publicsuffix.org/list/test.txt but we want to check it anyway. */ + /* Same as above, but punycoded. */ + { "xn--85x722f.com.cn", "xn--85x722f.com.cn" }, + { "xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn" }, + { "www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn" }, + { "shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn" }, + { "xn--55qx5d.cn", NULL }, + { "xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s" }, + { "www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s" }, + { "shishi.xn--fiqs8s", "shishi.xn--fiqs8s" }, + { "xn--fiqs8s", NULL }, + /* End of publicsuffix.org tests */ + + /* Let's just double-check this one... 
*/ { "co.uk", NULL }, + { "test.co.uk", "test.co.uk" }, + { "www.test.co.uk", "test.co.uk" }, + + /* Two levels of non-ASCII */ + { "våler.østfold.no", NULL }, + { "test.våler.østfold.no", "test.våler.østfold.no" }, + { "www.test.våler.østfold.no", "test.våler.østfold.no" }, + { "xn--vler-qoa.xn--stfold-9xa.no", NULL }, + { "test.xn--vler-qoa.xn--stfold-9xa.no", "test.xn--vler-qoa.xn--stfold-9xa.no" }, + { "www.test.xn--vler-qoa.xn--stfold-9xa.no", "test.xn--vler-qoa.xn--stfold-9xa.no" }, }, /* Non Internet TLDs have NULL as expected result */ @@ -135,6 +157,8 @@ main (int argc, char **argv) g_clear_error(&error); } + debug_printf (1, "\n"); + for (i = 0; i < G_N_ELEMENTS (non_inet_tld_tests); ++i) { gboolean is_public = soup_tld_domain_is_public_suffix (non_inet_tld_tests[i].hostname); const char *base_domain = soup_tld_get_base_domain (non_inet_tld_tests[i].hostname, NULL); -- cgit v1.2.1