diff options
Diffstat (limited to 'libsoup/soup-tld.c')
-rw-r--r-- | libsoup/soup-tld.c | 126 |
1 files changed, 102 insertions, 24 deletions
diff --git a/libsoup/soup-tld.c b/libsoup/soup-tld.c index 0c40b675..2e3da62d 100644 --- a/libsoup/soup-tld.c +++ b/libsoup/soup-tld.c @@ -17,6 +17,15 @@ #include "soup.h" #include "soup-tld-private.h" +/** + * SECTION:soup-tld + * @short_description: Top-Level Domain Utilities + * + * These functions can be used to parse hostnames to attempt to determine + * what part of the name belongs to the domain owner, and what part is + * simply a "public suffix" such as ".com". + */ + static void soup_tld_ensure_rules_hash_table (void); static const char *soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, @@ -27,8 +36,7 @@ static SoupTLDEntry tld_entries[] = { #include "tld_data.inc" }; -/** - * Stores the entries data in a hash table to ease and speed up +/* Stores the entries data in a hash table to ease and speed up * searches. */ static void @@ -49,8 +57,7 @@ soup_tld_ensure_rules_hash_table (void) /** * soup_tld_get_base_domain: - * @tld: a #SoupTLD - * @hostname: a UTF-8 hostname in its canonical representation form + * @hostname: a hostname * @error: return location for a #GError, or %NULL to ignore * errors. See #SoupTLDError for the available error codes * @@ -59,9 +66,14 @@ soup_tld_ensure_rules_hash_table (void) * plus the second level domain, for example for myhost.mydomain.com * it will return mydomain.com. * - * This method only works for valid UTF-8 hostnames in their canonical - * representation form, so you should use g_hostname_to_unicode() to - * get the canonical representation if that is not the case. + * Note that %NULL will be returned for private URLs (those not ending + * with any well known TLD) because choosing a base domain for them + * would be totally arbitrary. + * + * Prior to libsoup 2.46, this function required that @hostname be in + * UTF-8 if it was an IDN. From 2.46 on, the name can be in either + * UTF-8 or ASCII format (and the return value will be in the same + * format). 
 * * Returns: a pointer to the start of the base domain in @hostname. If * an error occurs, %NULL will be returned and @error set. @@ -72,22 +84,21 @@ const char * soup_tld_get_base_domain (const char *hostname, GError **error) { g_return_val_if_fail (hostname, NULL); - g_return_val_if_fail (!g_hostname_is_ascii_encoded (hostname), FALSE); return soup_tld_get_base_domain_internal (hostname, 1, error); } /** * soup_tld_domain_is_public_suffix: - * @tld: a #SoupTLD - * @domain: a UTF-8 domain in its canonical representation form + * @domain: a domain name * * Looks whether the @domain passed as argument is a public domain * suffix (.org, .com, .co.uk, etc) or not. * - * This method only works for valid UTF-8 domains in their canonical - * representation form, so you should use g_hostname_to_unicode() to - * get the canonical representation if that is not the case. + * Prior to libsoup 2.46, this function required that @domain be in + * UTF-8 if it was an IDN. From 2.46 on, the name can be in either + * UTF-8 or ASCII format, so callers no longer need to convert it + * with g_hostname_to_unicode() first. * * Returns: %TRUE if it is a public domain, %FALSE otherwise. * @@ -102,12 +113,14 @@ soup_tld_domain_is_public_suffix (const char *domain) g_return_val_if_fail (domain, FALSE); /* Skip the leading '.' if present */ - if (*domain == '.' && !(++domain)) + if (*domain == '.' && !*(++domain)) g_return_val_if_reached (FALSE); base_domain = soup_tld_get_base_domain_internal (domain, 0, &error); - if (base_domain) + if (g_strcmp0 (domain, base_domain)) { + g_clear_error (&error); return FALSE; + } if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) { g_error_free (error); @@ -125,6 +138,31 @@ soup_tld_domain_is_public_suffix (const char *domain) return TRUE; } +/** + * SOUP_TLD_ERROR: + * + * The #GError domain for soup-tld-related errors. + * + * Since: 2.40 + */ +/** + * SoupTLDError: + * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically + * invalid. 
+ * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was + * actually an IP address (and thus has no base domain or + * public suffix). + * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname + * did not have enough components. Eg, calling + * soup_tld_get_base_domain() on <literal>"co.uk"</literal>. + * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has + * no recognized public suffix. + * + * Error codes for %SOUP_TLD_ERROR. + * + * Since: 2.40 + */ + GQuark soup_tld_error_quark (void) { @@ -137,8 +175,10 @@ soup_tld_error_quark (void) static const char * soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error) { - char *prev_domain, *cur_domain, *tld, *next_dot; + char *prev_domain, *cur_domain, *next_dot; gint add_domains; + const char *orig_hostname = NULL, *tld; + char *utf8_hostname = NULL; soup_tld_ensure_rules_hash_table (); @@ -149,6 +189,17 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain return NULL; } + if (g_hostname_is_ascii_encoded (hostname)) { + orig_hostname = hostname; + hostname = utf8_hostname = g_hostname_to_unicode (hostname); + if (!hostname) { + g_set_error_literal (error, SOUP_TLD_ERROR, + SOUP_TLD_ERROR_INVALID_HOSTNAME, + _("Invalid hostname")); + return NULL; + } + } + cur_domain = (char *) hostname; tld = cur_domain; prev_domain = NULL; @@ -167,6 +218,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME, _("Invalid hostname")); + g_free (utf8_hostname); return NULL; } @@ -178,15 +230,9 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain /* If we match a *. rule and there were no previous exceptions * nor previous domains then treat it as an exact match. 
*/ - if (!prev_domain) { - g_set_error_literal (error, SOUP_TLD_ERROR, - SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS, - _("Not enough domains")); - return NULL; - } - tld = prev_domain; + tld = prev_domain ? prev_domain : cur_domain; break; - } else if (*flags == SOUP_TLD_RULE_NORMAL || !next_dot) { + } else if (*flags == SOUP_TLD_RULE_NORMAL) { tld = cur_domain; break; } else if (*flags & SOUP_TLD_RULE_EXCEPTION) { @@ -202,6 +248,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN, _("Hostname has no base domain")); + g_free (utf8_hostname); return NULL; } @@ -209,6 +256,37 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain cur_domain = next_dot + 1; } + if (orig_hostname) { + int dots; + const char *p; + + /* Count the number of dots that appear after tld in + * utf8_hostname, and then find the corresponding spot + * in orig_hostname; + */ + for (p = tld, dots = 0; *p; p++) { + if (*p == '.') + dots++; + } + + for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) { + if (*(p - 1) == '.') { + if (dots) + dots--; + else + break; + } + } + /* It's not possible for utf8_hostname to have had + * more dots than orig_hostname. + */ + g_assert (dots == 0); + + tld = p; + g_free (utf8_hostname); + hostname = orig_hostname; + } + /* Include the additional number of domains requested. */ add_domains = additional_domains; while (tld != hostname) { |