3 files changed, 80 insertions, 28 deletions
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
index d0c22538cd4..f394f73cd3f 100644
--- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
+++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
@@ -256,7 +256,8 @@ IDNSpoofChecker::~IDNSpoofChecker() {
 
 bool IDNSpoofChecker::SafeToDisplayAsUnicode(
     base::StringPiece16 label,
-    base::StringPiece top_level_domain) {
+    base::StringPiece top_level_domain,
+    base::StringPiece16 top_level_domain_unicode) {
   UErrorCode status = U_ZERO_ERROR;
   int32_t result =
       uspoof_check(checker_, label.data(),
@@ -266,7 +267,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
   if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS))
     return false;
 
-  icu::UnicodeString label_string(FALSE, label.data(),
+  icu::UnicodeString label_string(FALSE /* isTerminated */, label.data(),
                                   base::checked_cast<int32_t>(label.size()));
 
   // A punycode label with 'xn--' prefix is not subject to the URL
@@ -284,7 +285,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
     return false;
 
   // Disallow Icelandic confusables for domains outside Iceland's ccTLD (.is).
-  if (label_string.length() > 1 && top_level_domain != ".is" &&
+  if (label_string.length() > 1 && top_level_domain != "is" &&
       icelandic_characters_.containsSome(label_string))
 
   // Disallow Latin Schwa (U+0259) for domains outside Azerbaijan's ccTLD (.az).
@@ -309,9 +310,11 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
   if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
       kana_letters_exceptions_.containsNone(label_string) &&
       combining_diacritics_exceptions_.containsNone(label_string)) {
-    bool is_tld_ascii = !top_level_domain.starts_with(".xn--");
-    // Check Cyrillic confusable only for ASCII TLDs.
-    return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
+    // Check Cyrillic confusable only for TLDs where Cyrillic characters are
+    // uncommon.
+    return IsCyrillicTopLevelDomain(top_level_domain,
+                                    top_level_domain_unicode) ||
+           !IsMadeOfLatinAlikeCyrillic(label_string);
   }
 
   // Additional checks for |label| with multiple scripts, one of which is Latin.
@@ -592,6 +595,21 @@ bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
          cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);
 }
 
+bool IDNSpoofChecker::IsCyrillicTopLevelDomain(
+    base::StringPiece tld,
+    base::StringPiece16 tld_unicode) const {
+  icu::UnicodeString tld_string(
+      FALSE /* isTerminated */, tld_unicode.data(),
+      base::checked_cast<int32_t>(tld_unicode.size()));
+  if (cyrillic_letters_.containsSome(tld_string)) {
+    return true;
+  }
+  // No cyrillic_letters_ match in |tld_unicode|; fall back to the ASCII TLDs
+  // that contain a large number of domains with Cyrillic characters.
+  return tld == "bg" || tld == "by" || tld == "kz" || tld == "pyc" ||
+         tld == "ru" || tld == "su" || tld == "ua" || tld == "uz";
+}
+
 // static
 void IDNSpoofChecker::SetTrieParamsForTesting(
     const HuffmanTrieParams& trie_params) {
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
index b981c403e3d..2d899cae84e 100644
--- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
+++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
@@ -54,8 +54,11 @@ class IDNSpoofChecker {
   // Returns true if |label| is safe to display as Unicode. In the event of
   // library failure, all IDN inputs will be treated as unsafe.
   // See the function body for details on the specific safety checks performed.
+  // |top_level_domain_unicode| can be empty if |top_level_domain| is not well
+  // formed punycode.
   bool SafeToDisplayAsUnicode(base::StringPiece16 label,
-                              base::StringPiece top_level_domain);
+                              base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode);
 
   // Returns the matching top domain if |hostname| or the last few components of
   // |hostname| looks similar to one of top domains listed i
@@ -82,6 +85,11 @@ class IDNSpoofChecker {
   // Returns true if all the Cyrillic letters in |label| belong to a set of
   // Cyrillic letters that look like ASCII Latin letters.
   bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label);
+  // Returns true if |tld| is a top level domain most likely to contain a large
+  // number of Cyrillic domains. |tld_unicode| can be empty if |tld| is not well
+  // formed punycode.
+  bool IsCyrillicTopLevelDomain(base::StringPiece tld,
+                                base::StringPiece16 tld_unicode) const;
 
   // Used for unit tests.
   static void SetTrieParamsForTesting(const HuffmanTrieParams& trie_params);
diff --git a/chromium/components/url_formatter/url_formatter.cc b/chromium/components/url_formatter/url_formatter.cc
index e7d8f37ce8e..4aada4a7ac4 100644
--- a/chromium/components/url_formatter/url_formatter.cc
+++ b/chromium/components/url_formatter/url_formatter.cc
@@ -35,6 +35,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments(
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                               size_t comp_len,
                               base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode,
                               bool enable_spoof_checks,
                               base::string16* out,
                               bool* has_idn_component);
@@ -234,6 +235,28 @@ base::string16 FormatViewSourceUrl(
 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
     LAZY_INSTANCE_INITIALIZER;
 
+// Sets |top_level_domain| to the part of |host| after its last '.' (dot not
+// included) and stores its unicode version in |top_level_domain_unicode|,
+// which can stay empty for malformed punycode. No-op if |host| has no '.'.
+void GetTopLevelDomain(base::StringPiece host,
+                       base::StringPiece* top_level_domain,
+                       base::string16* top_level_domain_unicode) {
+  size_t last_dot = host.rfind('.');
+  if (last_dot == base::StringPiece::npos)
+    return;
+
+  *top_level_domain = host.substr(last_dot + 1);
+  base::string16 tld16;
+  tld16.reserve(top_level_domain->length());
+  tld16.insert(tld16.end(), top_level_domain->begin(), top_level_domain->end());
+
+  // Convert the TLD to unicode with spoof checks disabled and empty TLD args.
+  bool tld_has_idn_component = false;
+  IDNToUnicodeOneComponent(tld16.data(), tld16.size(), std::string(),
+                           base::string16(), false /* enable_spoof_checks */,
+                           top_level_domain_unicode, &tld_has_idn_component);
+}
+
 IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
     base::StringPiece host,
     base::OffsetAdjuster::Adjustments* adjustments,
@@ -241,27 +264,25 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
   if (adjustments)
     adjustments->clear();
   // Convert the ASCII input to a base::string16 for ICU.
-  base::string16 input16;
-  input16.reserve(host.length());
-  input16.insert(input16.end(), host.begin(), host.end());
+  base::string16 host16;
+  host16.reserve(host.length());
+  host16.insert(host16.end(), host.begin(), host.end());
 
+  // Compute the top level domain to be used in spoof checks later.
   base::StringPiece top_level_domain;
-  size_t last_dot = host.rfind('.');
-  if (last_dot != base::StringPiece::npos) {
-    top_level_domain = host.substr(last_dot);
-  }
+  base::string16 top_level_domain_unicode;
+  GetTopLevelDomain(host, &top_level_domain, &top_level_domain_unicode);
 
   IDNConversionResult result;
   // Do each component of the host separately, since we enforce script matching
   // on a per-component basis.
   base::string16 out16;
   for (size_t component_start = 0, component_end;
-       component_start < input16.length();
-       component_start = component_end + 1) {
+       component_start < host16.length(); component_start = component_end + 1) {
     // Find the end of the component.
-    component_end = input16.find('.', component_start);
+    component_end = host16.find('.', component_start);
     if (component_end == base::string16::npos)
-      component_end = input16.length();  // For getting the last component.
+      component_end = host16.length();  // For getting the last component.
     size_t component_length = component_end - component_start;
     size_t new_component_start = out16.length();
     bool converted_idn = false;
@@ -269,8 +290,9 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
       // Add the substring that we just found.
       bool has_idn_component = false;
       converted_idn = IDNToUnicodeOneComponent(
-          input16.data() + component_start, component_length, top_level_domain,
-          enable_spoof_checks, &out16, &has_idn_component);
+          host16.data() + component_start, component_length, top_level_domain,
+          top_level_domain_unicode, enable_spoof_checks, &out16,
+          &has_idn_component);
       result.has_idn_component |= has_idn_component;
     }
     size_t new_component_length = out16.length() - new_component_start;
@@ -281,7 +303,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
     }
 
     // Need to add the dot we just found (if we found one).
-    if (component_end < input16.length())
+    if (component_end < host16.length())
       out16.push_back('.');
   }
 
@@ -294,7 +316,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
     if (enable_spoof_checks && !result.matching_top_domain.empty()) {
       if (adjustments)
         adjustments->clear();
-      result.result = input16;
+      result.result = host16;
     }
   }
 
@@ -320,9 +342,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments(
 // all even though it's possible to make up look-alike labels with ASCII
 // characters alone.
 bool IsIDNComponentSafe(base::StringPiece16 label,
-                        base::StringPiece top_level_domain) {
-  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label,
-                                                          top_level_domain);
+                        base::StringPiece top_level_domain,
+                        base::StringPiece16 top_level_domain_unicode) {
+  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(
+      label, top_level_domain, top_level_domain_unicode);
 }
 
 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
@@ -375,6 +398,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                               size_t comp_len,
                               base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode,
                               bool enable_spoof_checks,
                               base::string16* out,
                               bool* has_idn_component) {
@@ -411,8 +435,10 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
 
   if (U_SUCCESS(status) && info.errors == 0) {
     *has_idn_component = true;
-    // Converted successfully. Ensure that the converted component
-    // can be safely displayed to the user.
+    // Converted successfully. At this point the length of the output string
+    // is original_length + output_length which may be shorter than the current
+    // length of |out|. Trim |out| and ensure that the converted component can
+    // be safely displayed to the user.
     out->resize(original_length + output_length);
     if (!enable_spoof_checks) {
       return true;
@@ -420,7 +446,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
     if (IsIDNComponentSafe(
             base::StringPiece16(out->data() + original_length,
                                 base::checked_cast<size_t>(output_length)),
-            top_level_domain)) {
+            top_level_domain, top_level_domain_unicode)) {
       return true;
     }
   }