[Backport] CVE-2020-6412 - Insufficient validation of untrusted input in Omnibox

Manual backport of patch originally reviewed on https://chromium-review.googlesource.com/c/chromium/src/+/1881887: Allow whole-script confusable Cyrillic domains only on Cyrillic TLDs A whole-script confusable Cyrillic domain consists of entirely Cyrillic characters that look identical to Latin characters (e.g. xn--80ak6aa92e[.]com decodes to аррӏе[.]com where аррӏе is in fact '\x0430\x0440\x0440\x04cf\x0435'). A previous change allowed whole-script confusable Cyrillic characters on non-ASCII top level domains only. This means that xn--80ak6aa92e[.]com remains punycode (TLD is .com) but xn--80ak6aa92e[.]xn--p1ai is decoded as аррӏе[.]рф (TLD is Cyrillic). However, this also allows spoofs in other non-ASCII TLDs such as аррӏе[.]中国 so it's not a sufficient measure. This change further limits allowable whole-script confusable Cyrillic domains to Cyrillic TLDs (instead of non-ASCII) and a small list of additional TLDs containing a large number of Cyrillic domains (bg, by, kz, pyc, ru, su, ua, uz). The idea is that users familiar with Cyrillic are more likely to encounter these TLDs and notice any discrepancies in the displayed domain name. Bug: 968505 Change-Id: I83dbb215c5177f3faa80e0e0c157aeb483fe7138 Reviewed-by: Jüri Valdmann <juri.valdmann@qt.io>
author: meacer <meacer@chromium.org> 2019-11-05 21:46:26 +0000
committer: Michael Brüning <michael.bruning@qt.io> 2020-03-10 15:48:15 +0000
commit: f938fe1765e30dea5789ddb9113d218abe2ec532 (patch)
tree: 13f016f15105c29974179112a97cedb648cc5e7e
parent: 2643eee04e099c1b649167ec7e646d7832d19000 (diff)
download: qtwebengine-chromium-f938fe1765e30dea5789ddb9113d218abe2ec532.tar.gz
3 files changed, 80 insertions, 28 deletions
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
index d0c22538cd4..f394f73cd3f 100644
--- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
+++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
@@ -256,7 +256,8 @@ IDNSpoofChecker::~IDNSpoofChecker() {
 
 bool IDNSpoofChecker::SafeToDisplayAsUnicode(
     base::StringPiece16 label,
-    base::StringPiece top_level_domain) {
+    base::StringPiece top_level_domain,
+    base::StringPiece16 top_level_domain_unicode) {
   UErrorCode status = U_ZERO_ERROR;
   int32_t result =
       uspoof_check(checker_, label.data(),
@@ -266,7 +267,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
   if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS))
     return false;
 
-  icu::UnicodeString label_string(FALSE, label.data(),
+  icu::UnicodeString label_string(FALSE /* isTerminated */, label.data(),
                                   base::checked_cast<int32_t>(label.size()));
 
   // A punycode label with 'xn--' prefix is not subject to the URL
@@ -284,7 +285,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
     return false;
 
   // Disallow Icelandic confusables for domains outside Iceland's ccTLD (.is).
-  if (label_string.length() > 1 && top_level_domain != ".is" &&
+  if (label_string.length() > 1 && top_level_domain != "is" &&
       icelandic_characters_.containsSome(label_string))
 
   // Disallow Latin Schwa (U+0259) for domains outside Azerbaijan's ccTLD (.az).
@@ -309,9 +310,11 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
   if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
       kana_letters_exceptions_.containsNone(label_string) &&
       combining_diacritics_exceptions_.containsNone(label_string)) {
-    bool is_tld_ascii = !top_level_domain.starts_with(".xn--");
-    // Check Cyrillic confusable only for ASCII TLDs.
-    return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
+    // Check Cyrillic confusable only for TLDs where Cyrillic characters are
+    // uncommon.
+    return IsCyrillicTopLevelDomain(top_level_domain,
+                                    top_level_domain_unicode) ||
+           !IsMadeOfLatinAlikeCyrillic(label_string);
   }
 
   // Additional checks for |label| with multiple scripts, one of which is Latin.
@@ -592,6 +595,21 @@ bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
          cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);
 }
 
+bool IDNSpoofChecker::IsCyrillicTopLevelDomain(
+    base::StringPiece tld,
+    base::StringPiece16 tld_unicode) const {
+  icu::UnicodeString tld_string(
+      FALSE /* isTerminated */, tld_unicode.data(),
+      base::checked_cast<int32_t>(tld_unicode.size()));
+  if (cyrillic_letters_.containsSome(tld_string)) {
+    return true;
+  }
+  // These ASCII TLDs contain a large number of domains with Cyrillic
+  // characters.
+  return tld == "bg" || tld == "by" || tld == "kz" || tld == "pyc" ||
+         tld == "ru" || tld == "su" || tld == "ua" || tld == "uz";
+}
+
 // static
 void IDNSpoofChecker::SetTrieParamsForTesting(
     const HuffmanTrieParams& trie_params) {
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
index b981c403e3d..2d899cae84e 100644
--- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
+++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
@@ -54,8 +54,11 @@ class IDNSpoofChecker {
   // Returns true if |label| is safe to display as Unicode. In the event of
   // library failure, all IDN inputs will be treated as unsafe.
   // See the function body for details on the specific safety checks performed.
+  // top_level_domain_unicode can be empty if top_level_domain is not well
+  // formed punycode.
   bool SafeToDisplayAsUnicode(base::StringPiece16 label,
-                              base::StringPiece top_level_domain);
+                              base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode);
 
   // Returns the matching top domain if |hostname| or the last few components of
   // |hostname| looks similar to one of top domains listed i
@@ -82,6 +85,11 @@ class IDNSpoofChecker {
   // Returns true if all the Cyrillic letters in |label| belong to a set of
   // Cyrillic letters that look like ASCII Latin letters.
   bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label);
+  // Returns true if |tld| is a top level domain most likely to contain a large
+  // number of Cyrillic domains. |tld_unicode| can be empty if |tld| is not well
+  // formed punycode.
+  bool IsCyrillicTopLevelDomain(base::StringPiece tld,
+                                base::StringPiece16 tld_unicode) const;
 
   // Used for unit tests.
   static void SetTrieParamsForTesting(const HuffmanTrieParams& trie_params);
diff --git a/chromium/components/url_formatter/url_formatter.cc b/chromium/components/url_formatter/url_formatter.cc
index e7d8f37ce8e..4aada4a7ac4 100644
--- a/chromium/components/url_formatter/url_formatter.cc
+++ b/chromium/components/url_formatter/url_formatter.cc
@@ -35,6 +35,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments(
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                               size_t comp_len,
                               base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode,
                               bool enable_spoof_checks,
                               base::string16* out,
                               bool* has_idn_component);
@@ -234,6 +235,28 @@ base::string16 FormatViewSourceUrl(
 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
     LAZY_INSTANCE_INITIALIZER;
 
+// Computes the top level domain from |host|. top_level_domain_unicode will
+// contain the unicode version of top_level_domain. top_level_domain_unicode can
+// remain empty if the TLD is not well formed punycode.
+void GetTopLevelDomain(base::StringPiece host,
+                       base::StringPiece* top_level_domain,
+                       base::string16* top_level_domain_unicode) {
+  size_t last_dot = host.rfind('.');
+  if (last_dot == base::StringPiece::npos)
+    return;
+
+  *top_level_domain = host.substr(last_dot + 1);
+  base::string16 tld16;
+  tld16.reserve(top_level_domain->length());
+  tld16.insert(tld16.end(), top_level_domain->begin(), top_level_domain->end());
+
+  // Convert the TLD to unicode with the spoof checks disabled.
+  bool tld_has_idn_component = false;
+  IDNToUnicodeOneComponent(tld16.data(), tld16.size(), std::string(),
+                           base::string16(), false /* enable_spoof_checks */,
+                           top_level_domain_unicode, &tld_has_idn_component);
+}
+
 IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
     base::StringPiece host,
     base::OffsetAdjuster::Adjustments* adjustments,
@@ -241,27 +264,25 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
   if (adjustments)
     adjustments->clear();
   // Convert the ASCII input to a base::string16 for ICU.
-  base::string16 input16;
-  input16.reserve(host.length());
-  input16.insert(input16.end(), host.begin(), host.end());
+  base::string16 host16;
+  host16.reserve(host.length());
+  host16.insert(host16.end(), host.begin(), host.end());
 
+  // Compute the top level domain to be used in spoof checks later.
   base::StringPiece top_level_domain;
-  size_t last_dot = host.rfind('.');
-  if (last_dot != base::StringPiece::npos) {
-    top_level_domain = host.substr(last_dot);
-  }
+  base::string16 top_level_domain_unicode;
+  GetTopLevelDomain(host, &top_level_domain, &top_level_domain_unicode);
 
   IDNConversionResult result;
   // Do each component of the host separately, since we enforce script matching
   // on a per-component basis.
   base::string16 out16;
   for (size_t component_start = 0, component_end;
-       component_start < input16.length();
-       component_start = component_end + 1) {
+       component_start < host16.length(); component_start = component_end + 1) {
     // Find the end of the component.
-    component_end = input16.find('.', component_start);
+    component_end = host16.find('.', component_start);
     if (component_end == base::string16::npos)
-      component_end = input16.length();  // For getting the last component.
+      component_end = host16.length();  // For getting the last component.
     size_t component_length = component_end - component_start;
     size_t new_component_start = out16.length();
     bool converted_idn = false;
@@ -269,8 +290,9 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
       // Add the substring that we just found.
       bool has_idn_component = false;
       converted_idn = IDNToUnicodeOneComponent(
-          input16.data() + component_start, component_length, top_level_domain,
-          enable_spoof_checks, &out16, &has_idn_component);
+          host16.data() + component_start, component_length, top_level_domain,
+          top_level_domain_unicode, enable_spoof_checks, &out16,
+          &has_idn_component);
       result.has_idn_component |= has_idn_component;
     }
     size_t new_component_length = out16.length() - new_component_start;
@@ -281,7 +303,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
     }
 
     // Need to add the dot we just found (if we found one).
-    if (component_end < input16.length())
+    if (component_end < host16.length())
       out16.push_back('.');
   }
 
@@ -294,7 +316,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
     if (enable_spoof_checks && !result.matching_top_domain.empty()) {
       if (adjustments)
         adjustments->clear();
-      result.result = input16;
+      result.result = host16;
     }
   }
 
@@ -320,9 +342,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments(
 // all even though it's possible to make up look-alike labels with ASCII
 // characters alone.
 bool IsIDNComponentSafe(base::StringPiece16 label,
-                        base::StringPiece top_level_domain) {
-  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label,
-                                                          top_level_domain);
+                        base::StringPiece top_level_domain,
+                        base::StringPiece16 top_level_domain_unicode) {
+  return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(
+      label, top_level_domain, top_level_domain_unicode);
 }
 
 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
@@ -375,6 +398,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
 bool IDNToUnicodeOneComponent(const base::char16* comp,
                               size_t comp_len,
                               base::StringPiece top_level_domain,
+                              base::StringPiece16 top_level_domain_unicode,
                               bool enable_spoof_checks,
                               base::string16* out,
                               bool* has_idn_component) {
@@ -411,8 +435,10 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
 
   if (U_SUCCESS(status) && info.errors == 0) {
     *has_idn_component = true;
-    // Converted successfully. Ensure that the converted component
-    // can be safely displayed to the user.
+    // Converted successfully. At this point the length of the output string
+    // is original_length + output_length which may be shorter than the current
+    // length of |out|. Trim |out| and ensure that the converted component can
+    // be safely displayed to the user.
     out->resize(original_length + output_length);
     if (!enable_spoof_checks) {
       return true;
@@ -420,7 +446,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
     if (IsIDNComponentSafe(
             base::StringPiece16(out->data() + original_length,
                                 base::checked_cast<size_t>(output_length)),
-            top_level_domain)) {
+            top_level_domain, top_level_domain_unicode)) {
       return true;
     }
   }
author	meacer <meacer@chromium.org>	2019-11-05 21:46:26 +0000
committer	Michael Brüning <michael.bruning@qt.io>	2020-03-10 15:48:15 +0000
commit	f938fe1765e30dea5789ddb9113d218abe2ec532 (patch)
tree	13f016f15105c29974179112a97cedb648cc5e7e
parent	2643eee04e099c1b649167ec7e646d7832d19000 (diff)
download	qtwebengine-chromium-f938fe1765e30dea5789ddb9113d218abe2ec532.tar.gz