1 files changed, 102 insertions, 24 deletions
diff --git a/libsoup/soup-tld.c b/libsoup/soup-tld.c
index 0c40b675..2e3da62d 100644
--- a/libsoup/soup-tld.c
+++ b/libsoup/soup-tld.c
@@ -17,6 +17,15 @@
 #include "soup.h"
 #include "soup-tld-private.h"
 
+/**
+ * SECTION:soup-tld
+ * @short_description: Top-Level Domain Utilities
+ *
+ * These functions can be used to parse hostnames in an attempt to
+ * determine which part of the name belongs to the domain owner, and
+ * which part is simply a "public suffix" such as ".com".
+ */
+
 static void soup_tld_ensure_rules_hash_table (void);
 static const char *soup_tld_get_base_domain_internal (const char *hostname,
 						      guint       additional_domains,
@@ -27,8 +36,7 @@ static SoupTLDEntry tld_entries[] = {
 #include "tld_data.inc"
 };
 
-/**
- * Stores the entries data in a hash table to ease and speed up
+/* Stores the entries data in a hash table to ease and speed up
  * searches.
  */
 static void
@@ -49,8 +57,7 @@ soup_tld_ensure_rules_hash_table (void)
 
 /**
  * soup_tld_get_base_domain:
- * @tld: a #SoupTLD
- * @hostname: a UTF-8 hostname in its canonical representation form
+ * @hostname: a hostname
  * @error: return location for a #GError, or %NULL to ignore
  *   errors. See #SoupTLDError for the available error codes
  *
@@ -59,9 +66,14 @@ soup_tld_ensure_rules_hash_table (void)
  * plus the second level domain, for example for myhost.mydomain.com
  * it will return mydomain.com.
  *
- * This method only works for valid UTF-8 hostnames in their canonical
- * representation form, so you should use g_hostname_to_unicode() to
- * get the canonical representation if that is not the case.
+ * Note that %NULL will be returned for private hostnames (those not
+ * ending with any well-known TLD) because choosing a base domain for
+ * them would be totally arbitrary.
+ *
+ * Prior to libsoup 2.46, this function required that @hostname be in
+ * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
+ * UTF-8 or ASCII format (and the return value will be in the same
+ * format).
  *
  * Returns: a pointer to the start of the base domain in @hostname. If
  * an error occurs, %NULL will be returned and @error set.
@@ -72,22 +84,21 @@ const char *
 soup_tld_get_base_domain (const char *hostname, GError **error)
 {
 	g_return_val_if_fail (hostname, NULL);
-	g_return_val_if_fail (!g_hostname_is_ascii_encoded (hostname), FALSE);
 
 	return soup_tld_get_base_domain_internal (hostname, 1, error);
 }
 
 /**
  * soup_tld_domain_is_public_suffix:
- * @tld: a #SoupTLD
- * @domain: a UTF-8 domain in its canonical representation form
+ * @domain: a domain name
  *
  * Looks whether the @domain passed as argument is a public domain
  * suffix (.org, .com, .co.uk, etc) or not.
  *
- * This method only works for valid UTF-8 domains in their canonical
- * representation form, so you should use g_hostname_to_unicode() to
- * get the canonical representation if that is not the case.
+ * Prior to libsoup 2.46, this function required that @domain be in
+ * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
+ * UTF-8 or ASCII format; ASCII-encoded names are converted to UTF-8
+ * internally before being checked.
  *
  * Returns: %TRUE if it is a public domain, %FALSE otherwise.
  *
@@ -102,12 +113,14 @@ soup_tld_domain_is_public_suffix (const char *domain)
 	g_return_val_if_fail (domain, FALSE);
 
 	/* Skip the leading '.' if present */
-	if (*domain == '.' && !(++domain))
+	if (*domain == '.' && !*(++domain))
 		g_return_val_if_reached (FALSE);
 
 	base_domain = soup_tld_get_base_domain_internal (domain, 0, &error);
-	if (base_domain)
+	if (g_strcmp0 (domain, base_domain)) {
+		g_clear_error (&error);
 		return FALSE;
+	}
 
 	if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) {
 		g_error_free (error);
@@ -125,6 +138,31 @@ soup_tld_domain_is_public_suffix (const char *domain)
 	return TRUE;
 }
 
+/**
+ * SOUP_TLD_ERROR:
+ *
+ * The #GError domain for soup-tld-related errors.
+ *
+ * Since: 2.40
+ */
+/**
+ * SoupTLDError:
+ * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically
+ *   invalid.
+ * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was
+ *   actually an IP address (and thus has no base domain or
+ *   public suffix).
+ * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname
+ *   did not have enough components. For example, calling
+ *   soup_tld_get_base_domain() on <literal>"co.uk"</literal>.
+ * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has
+ *   no recognized public suffix.
+ *
+ * Error codes for %SOUP_TLD_ERROR.
+ *
+ * Since: 2.40
+ */
+
 GQuark
 soup_tld_error_quark (void)
 {
@@ -137,8 +175,10 @@ soup_tld_error_quark (void)
 static const char *
 soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
 {
-	char *prev_domain, *cur_domain, *tld, *next_dot;
+	char *prev_domain, *cur_domain, *next_dot;
 	gint add_domains;
+	const char *orig_hostname = NULL, *tld;
+	char *utf8_hostname = NULL;
 
 	soup_tld_ensure_rules_hash_table ();
 
@@ -149,6 +189,17 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
 		return NULL;
 	}
 
+	if (g_hostname_is_ascii_encoded (hostname)) {
+		orig_hostname = hostname;
+		hostname = utf8_hostname = g_hostname_to_unicode (hostname);
+		if (!hostname) {
+			g_set_error_literal (error, SOUP_TLD_ERROR,
+					     SOUP_TLD_ERROR_INVALID_HOSTNAME,
+					     _("Invalid hostname"));
+			return NULL;
+		}
+	}
+
 	cur_domain = (char *) hostname;
 	tld = cur_domain;
 	prev_domain = NULL;
@@ -167,6 +218,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
 			g_set_error_literal (error, SOUP_TLD_ERROR,
 					     SOUP_TLD_ERROR_INVALID_HOSTNAME,
 					     _("Invalid hostname"));
+			g_free (utf8_hostname);
 			return NULL;
 		}
 
@@ -178,15 +230,9 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
 				/* If we match a *. rule and there were no previous exceptions
 				 * nor previous domains then treat it as an exact match.
 				 */
-				if (!prev_domain) {
-					g_set_error_literal (error, SOUP_TLD_ERROR,
-							     SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
-							     _("Not enough domains"));
-					return NULL;
-				}
-				tld = prev_domain;
+				tld = prev_domain ? prev_domain : cur_domain;
 				break;
-			} else if (*flags == SOUP_TLD_RULE_NORMAL || !next_dot) {
+			} else if (*flags == SOUP_TLD_RULE_NORMAL) {
 				tld = cur_domain;
 				break;
 			} else if (*flags & SOUP_TLD_RULE_EXCEPTION) {
@@ -202,6 +248,7 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
 			g_set_error_literal (error, SOUP_TLD_ERROR,
 					     SOUP_TLD_ERROR_NO_BASE_DOMAIN,
 					     _("Hostname has no base domain"));
+			g_free (utf8_hostname);
 			return NULL;
 		}
 
@@ -209,6 +256,37 @@ soup_tld_get_base_domain_internal (const char *hostname, guint additional_domain
 		cur_domain = next_dot + 1;
 	}
 
+	if (orig_hostname) {
+		int dots;
+		const char *p;
+
+		/* Count the number of dots that appear after tld in
+		 * utf8_hostname, and then find the corresponding spot in
+		 * orig_hostname by scanning back from its end over that many dots.
+		 */
+		for (p = tld, dots = 0; *p; p++) {
+			if (*p == '.')
+				dots++;
+		}
+
+		for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
+			if (*(p - 1) == '.') {
+				if (dots)
+					dots--;
+				else
+					break;
+			}
+		}
+		/* It's not possible for utf8_hostname to have had
+		 * more dots than orig_hostname.
+		 */
+		g_assert (dots == 0);
+
+		tld = p;
+		g_free (utf8_hostname);
+		hostname = orig_hostname;
+	}
+
 	/* Include the additional number of domains requested. */
 	add_domains = additional_domains;
 	while (tld != hostname) {