diff options
3 files changed, 80 insertions, 28 deletions
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc index d0c22538cd4..f394f73cd3f 100644 --- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc +++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc @@ -256,7 +256,8 @@ IDNSpoofChecker::~IDNSpoofChecker() { bool IDNSpoofChecker::SafeToDisplayAsUnicode( base::StringPiece16 label, - base::StringPiece top_level_domain) { + base::StringPiece top_level_domain, + base::StringPiece16 top_level_domain_unicode) { UErrorCode status = U_ZERO_ERROR; int32_t result = uspoof_check(checker_, label.data(), @@ -266,7 +267,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode( if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS)) return false; - icu::UnicodeString label_string(FALSE, label.data(), + icu::UnicodeString label_string(FALSE /* isTerminated */, label.data(), base::checked_cast<int32_t>(label.size())); // A punycode label with 'xn--' prefix is not subject to the URL @@ -284,7 +285,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode( return false; // Disallow Icelandic confusables for domains outside Iceland's ccTLD (.is). - if (label_string.length() > 1 && top_level_domain != ".is" && + if (label_string.length() > 1 && top_level_domain != "is" && icelandic_characters_.containsSome(label_string)) // Disallow Latin Schwa (U+0259) for domains outside Azerbaijan's ccTLD (.az). @@ -309,9 +310,11 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode( if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE && kana_letters_exceptions_.containsNone(label_string) && combining_diacritics_exceptions_.containsNone(label_string)) { - bool is_tld_ascii = !top_level_domain.starts_with(".xn--"); - // Check Cyrillic confusable only for ASCII TLDs. - return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string); + // Check Cyrillic confusable only for TLDs where Cyrillic characters are + // uncommon. + return IsCyrillicTopLevelDomain(top_level_domain, + top_level_domain_unicode) || + !IsMadeOfLatinAlikeCyrillic(label_string); } // Additional checks for |label| with multiple scripts, one of which is Latin. @@ -592,6 +595,21 @@ bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label); } +bool IDNSpoofChecker::IsCyrillicTopLevelDomain( + base::StringPiece tld, + base::StringPiece16 tld_unicode) const { + icu::UnicodeString tld_string( + FALSE /* isTerminated */, tld_unicode.data(), + base::checked_cast<int32_t>(tld_unicode.size())); + if (cyrillic_letters_.containsSome(tld_string)) { + return true; + } + // These ASCII TLDs contain a large number of domains with Cyrillic + // characters. + return tld == "bg" || tld == "by" || tld == "kz" || tld == "pyc" || + tld == "ru" || tld == "su" || tld == "ua" || tld == "uz"; +} + // static void IDNSpoofChecker::SetTrieParamsForTesting( const HuffmanTrieParams& trie_params) { diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h index b981c403e3d..2d899cae84e 100644 --- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h +++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h @@ -54,8 +54,11 @@ class IDNSpoofChecker { // Returns true if |label| is safe to display as Unicode. In the event of // library failure, all IDN inputs will be treated as unsafe. // See the function body for details on the specific safety checks performed. + // top_level_domain_unicode can be empty if top_level_domain is not well + // formed punycode. bool SafeToDisplayAsUnicode(base::StringPiece16 label, - base::StringPiece top_level_domain); + base::StringPiece top_level_domain, + base::StringPiece16 top_level_domain_unicode); // Returns the matching top domain if |hostname| or the last few components of // |hostname| looks similar to one of top domains listed i @@ -82,6 +85,11 @@ class IDNSpoofChecker { // Returns true if all the Cyrillic letters in |label| belong to a set of // Cyrillic letters that look like ASCII Latin letters. bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label); + // Returns true if |tld| is a top level domain most likely to contain a large + // number of Cyrillic domains. |tld_unicode| can be empty if |tld| is not well + // formed punycode. + bool IsCyrillicTopLevelDomain(base::StringPiece tld, + base::StringPiece16 tld_unicode) const; // Used for unit tests. static void SetTrieParamsForTesting(const HuffmanTrieParams& trie_params); diff --git a/chromium/components/url_formatter/url_formatter.cc b/chromium/components/url_formatter/url_formatter.cc index e7d8f37ce8e..4aada4a7ac4 100644 --- a/chromium/components/url_formatter/url_formatter.cc +++ b/chromium/components/url_formatter/url_formatter.cc @@ -35,6 +35,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments( bool IDNToUnicodeOneComponent(const base::char16* comp, size_t comp_len, base::StringPiece top_level_domain, + base::StringPiece16 top_level_domain_unicode, bool enable_spoof_checks, base::string16* out, bool* has_idn_component); @@ -234,6 +235,28 @@ base::string16 FormatViewSourceUrl( base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker = LAZY_INSTANCE_INITIALIZER; +// Computes the top level domain from |host|. top_level_domain_unicode will +// contain the unicode version of top_level_domain. top_level_domain_unicode can +// remain empty if the TLD is not well formed punycode. +void GetTopLevelDomain(base::StringPiece host, + base::StringPiece* top_level_domain, + base::string16* top_level_domain_unicode) { + size_t last_dot = host.rfind('.'); + if (last_dot == base::StringPiece::npos) + return; + + *top_level_domain = host.substr(last_dot + 1); + base::string16 tld16; + tld16.reserve(top_level_domain->length()); + tld16.insert(tld16.end(), top_level_domain->begin(), top_level_domain->end()); + + // Convert the TLD to unicode with the spoof checks disabled. + bool tld_has_idn_component = false; + IDNToUnicodeOneComponent(tld16.data(), tld16.size(), std::string(), + base::string16(), false /* enable_spoof_checks */, + top_level_domain_unicode, &tld_has_idn_component); +} + IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( base::StringPiece host, base::OffsetAdjuster::Adjustments* adjustments, @@ -241,27 +264,25 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( if (adjustments) adjustments->clear(); // Convert the ASCII input to a base::string16 for ICU. - base::string16 input16; - input16.reserve(host.length()); - input16.insert(input16.end(), host.begin(), host.end()); + base::string16 host16; + host16.reserve(host.length()); + host16.insert(host16.end(), host.begin(), host.end()); + // Compute the top level domain to be used in spoof checks later. base::StringPiece top_level_domain; - size_t last_dot = host.rfind('.'); - if (last_dot != base::StringPiece::npos) { - top_level_domain = host.substr(last_dot); - } + base::string16 top_level_domain_unicode; + GetTopLevelDomain(host, &top_level_domain, &top_level_domain_unicode); IDNConversionResult result; // Do each component of the host separately, since we enforce script matching // on a per-component basis. base::string16 out16; for (size_t component_start = 0, component_end; - component_start < input16.length(); - component_start = component_end + 1) { + component_start < host16.length(); component_start = component_end + 1) { // Find the end of the component. - component_end = input16.find('.', component_start); + component_end = host16.find('.', component_start); if (component_end == base::string16::npos) - component_end = input16.length(); // For getting the last component. + component_end = host16.length(); // For getting the last component. size_t component_length = component_end - component_start; size_t new_component_start = out16.length(); bool converted_idn = false; @@ -269,8 +290,9 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( // Add the substring that we just found. bool has_idn_component = false; converted_idn = IDNToUnicodeOneComponent( - input16.data() + component_start, component_length, top_level_domain, - enable_spoof_checks, &out16, &has_idn_component); + host16.data() + component_start, component_length, top_level_domain, + top_level_domain_unicode, enable_spoof_checks, &out16, + &has_idn_component); result.has_idn_component |= has_idn_component; } size_t new_component_length = out16.length() - new_component_start; @@ -281,7 +303,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( } // Need to add the dot we just found (if we found one). - if (component_end < input16.length()) + if (component_end < host16.length()) out16.push_back('.'); } @@ -294,7 +316,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl( if (enable_spoof_checks && !result.matching_top_domain.empty()) { if (adjustments) adjustments->clear(); - result.result = input16; + result.result = host16; } } @@ -320,9 +342,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments( // all even though it's possible to make up look-alike labels with ASCII // characters alone. bool IsIDNComponentSafe(base::StringPiece16 label, - base::StringPiece top_level_domain) { - return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label, - top_level_domain); + base::StringPiece top_level_domain, + base::StringPiece16 top_level_domain_unicode) { + return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode( + label, top_level_domain, top_level_domain_unicode); } // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to @@ -375,6 +398,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER; bool IDNToUnicodeOneComponent(const base::char16* comp, size_t comp_len, base::StringPiece top_level_domain, + base::StringPiece16 top_level_domain_unicode, bool enable_spoof_checks, base::string16* out, bool* has_idn_component) { @@ -411,8 +435,10 @@ bool IDNToUnicodeOneComponent(const base::char16* comp, if (U_SUCCESS(status) && info.errors == 0) { *has_idn_component = true; - // Converted successfully. Ensure that the converted component - // can be safely displayed to the user. + // Converted successfully. At this point the length of the output string + // is original_length + output_length which may be shorter than the current + // length of |out|. Trim |out| and ensure that the converted component can + // be safely displayed to the user. out->resize(original_length + output_length); if (!enable_spoof_checks) { return true; @@ -420,7 +446,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp, if (IsIDNComponentSafe( base::StringPiece16(out->data() + original_length, base::checked_cast<size_t>(output_length)), - top_level_domain)) { + top_level_domain, top_level_domain_unicode)) { return true; } } |