summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc30
-rw-r--r--chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h10
-rw-r--r--chromium/components/url_formatter/url_formatter.cc68
3 files changed, 80 insertions, 28 deletions
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
index d0c22538cd4..f394f73cd3f 100644
--- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
+++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.cc
@@ -256,7 +256,8 @@ IDNSpoofChecker::~IDNSpoofChecker() {
bool IDNSpoofChecker::SafeToDisplayAsUnicode(
base::StringPiece16 label,
- base::StringPiece top_level_domain) {
+ base::StringPiece top_level_domain,
+ base::StringPiece16 top_level_domain_unicode) {
UErrorCode status = U_ZERO_ERROR;
int32_t result =
uspoof_check(checker_, label.data(),
@@ -266,7 +267,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
if (U_FAILURE(status) || (result & USPOOF_ALL_CHECKS))
return false;
- icu::UnicodeString label_string(FALSE, label.data(),
+ icu::UnicodeString label_string(FALSE /* isTerminated */, label.data(),
base::checked_cast<int32_t>(label.size()));
// A punycode label with 'xn--' prefix is not subject to the URL
@@ -284,7 +285,7 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
return false;
// Disallow Icelandic confusables for domains outside Iceland's ccTLD (.is).
- if (label_string.length() > 1 && top_level_domain != ".is" &&
+ if (label_string.length() > 1 && top_level_domain != "is" &&
icelandic_characters_.containsSome(label_string))
// Disallow Latin Schwa (U+0259) for domains outside Azerbaijan's ccTLD (.az).
@@ -309,9 +310,11 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(
if (result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE &&
kana_letters_exceptions_.containsNone(label_string) &&
combining_diacritics_exceptions_.containsNone(label_string)) {
- bool is_tld_ascii = !top_level_domain.starts_with(".xn--");
- // Check Cyrillic confusable only for ASCII TLDs.
- return !is_tld_ascii || !IsMadeOfLatinAlikeCyrillic(label_string);
+ // Check Cyrillic confusable only for TLDs where Cyrillic characters are
+ // uncommon.
+ return IsCyrillicTopLevelDomain(top_level_domain,
+ top_level_domain_unicode) ||
+ !IsMadeOfLatinAlikeCyrillic(label_string);
}
// Additional checks for |label| with multiple scripts, one of which is Latin.
@@ -592,6 +595,21 @@ bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
cyrillic_letters_latin_alike_.containsAll(cyrillic_in_label);
}
+bool IDNSpoofChecker::IsCyrillicTopLevelDomain(
+ base::StringPiece tld,
+ base::StringPiece16 tld_unicode) const {
+ icu::UnicodeString tld_string(
+ FALSE /* isTerminated */, tld_unicode.data(),
+ base::checked_cast<int32_t>(tld_unicode.size()));
+ if (cyrillic_letters_.containsSome(tld_string)) {
+ return true;
+ }
+ // These ASCII TLDs contain a large number of domains with Cyrillic
+ // characters.
+ return tld == "bg" || tld == "by" || tld == "kz" || tld == "pyc" ||
+ tld == "ru" || tld == "su" || tld == "ua" || tld == "uz";
+}
+
// static
void IDNSpoofChecker::SetTrieParamsForTesting(
const HuffmanTrieParams& trie_params) {
diff --git a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
index b981c403e3d..2d899cae84e 100644
--- a/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
+++ b/chromium/components/url_formatter/spoof_checks/idn_spoof_checker.h
@@ -54,8 +54,11 @@ class IDNSpoofChecker {
// Returns true if |label| is safe to display as Unicode. In the event of
// library failure, all IDN inputs will be treated as unsafe.
// See the function body for details on the specific safety checks performed.
+ // top_level_domain_unicode can be empty if top_level_domain is not well
+ // formed punycode.
bool SafeToDisplayAsUnicode(base::StringPiece16 label,
- base::StringPiece top_level_domain);
+ base::StringPiece top_level_domain,
+ base::StringPiece16 top_level_domain_unicode);
// Returns the matching top domain if |hostname| or the last few components of
// |hostname| looks similar to one of top domains listed i
@@ -82,6 +85,11 @@ class IDNSpoofChecker {
// Returns true if all the Cyrillic letters in |label| belong to a set of
// Cyrillic letters that look like ASCII Latin letters.
bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label);
+ // Returns true if |tld| is a top level domain most likely to contain a large
+ // number of Cyrillic domains. |tld_unicode| can be empty if |tld| is not well
+ // formed punycode.
+ bool IsCyrillicTopLevelDomain(base::StringPiece tld,
+ base::StringPiece16 tld_unicode) const;
// Used for unit tests.
static void SetTrieParamsForTesting(const HuffmanTrieParams& trie_params);
diff --git a/chromium/components/url_formatter/url_formatter.cc b/chromium/components/url_formatter/url_formatter.cc
index e7d8f37ce8e..4aada4a7ac4 100644
--- a/chromium/components/url_formatter/url_formatter.cc
+++ b/chromium/components/url_formatter/url_formatter.cc
@@ -35,6 +35,7 @@ IDNConversionResult IDNToUnicodeWithAdjustments(
bool IDNToUnicodeOneComponent(const base::char16* comp,
size_t comp_len,
base::StringPiece top_level_domain,
+ base::StringPiece16 top_level_domain_unicode,
bool enable_spoof_checks,
base::string16* out,
bool* has_idn_component);
@@ -234,6 +235,28 @@ base::string16 FormatViewSourceUrl(
base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =
LAZY_INSTANCE_INITIALIZER;
+// Computes the top level domain from |host|. top_level_domain_unicode will
+// contain the unicode version of top_level_domain. top_level_domain_unicode can
+// remain empty if the TLD is not well formed punycode.
+void GetTopLevelDomain(base::StringPiece host,
+ base::StringPiece* top_level_domain,
+ base::string16* top_level_domain_unicode) {
+ size_t last_dot = host.rfind('.');
+ if (last_dot == base::StringPiece::npos)
+ return;
+
+ *top_level_domain = host.substr(last_dot + 1);
+ base::string16 tld16;
+ tld16.reserve(top_level_domain->length());
+ tld16.insert(tld16.end(), top_level_domain->begin(), top_level_domain->end());
+
+ // Convert the TLD to unicode with the spoof checks disabled.
+ bool tld_has_idn_component = false;
+ IDNToUnicodeOneComponent(tld16.data(), tld16.size(), std::string(),
+ base::string16(), false /* enable_spoof_checks */,
+ top_level_domain_unicode, &tld_has_idn_component);
+}
+
IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
base::StringPiece host,
base::OffsetAdjuster::Adjustments* adjustments,
@@ -241,27 +264,25 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
if (adjustments)
adjustments->clear();
// Convert the ASCII input to a base::string16 for ICU.
- base::string16 input16;
- input16.reserve(host.length());
- input16.insert(input16.end(), host.begin(), host.end());
+ base::string16 host16;
+ host16.reserve(host.length());
+ host16.insert(host16.end(), host.begin(), host.end());
+ // Compute the top level domain to be used in spoof checks later.
base::StringPiece top_level_domain;
- size_t last_dot = host.rfind('.');
- if (last_dot != base::StringPiece::npos) {
- top_level_domain = host.substr(last_dot);
- }
+ base::string16 top_level_domain_unicode;
+ GetTopLevelDomain(host, &top_level_domain, &top_level_domain_unicode);
IDNConversionResult result;
// Do each component of the host separately, since we enforce script matching
// on a per-component basis.
base::string16 out16;
for (size_t component_start = 0, component_end;
- component_start < input16.length();
- component_start = component_end + 1) {
+ component_start < host16.length(); component_start = component_end + 1) {
// Find the end of the component.
- component_end = input16.find('.', component_start);
+ component_end = host16.find('.', component_start);
if (component_end == base::string16::npos)
- component_end = input16.length(); // For getting the last component.
+ component_end = host16.length(); // For getting the last component.
size_t component_length = component_end - component_start;
size_t new_component_start = out16.length();
bool converted_idn = false;
@@ -269,8 +290,9 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
// Add the substring that we just found.
bool has_idn_component = false;
converted_idn = IDNToUnicodeOneComponent(
- input16.data() + component_start, component_length, top_level_domain,
- enable_spoof_checks, &out16, &has_idn_component);
+ host16.data() + component_start, component_length, top_level_domain,
+ top_level_domain_unicode, enable_spoof_checks, &out16,
+ &has_idn_component);
result.has_idn_component |= has_idn_component;
}
size_t new_component_length = out16.length() - new_component_start;
@@ -281,7 +303,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
}
// Need to add the dot we just found (if we found one).
- if (component_end < input16.length())
+ if (component_end < host16.length())
out16.push_back('.');
}
@@ -294,7 +316,7 @@ IDNConversionResult IDNToUnicodeWithAdjustmentsImpl(
if (enable_spoof_checks && !result.matching_top_domain.empty()) {
if (adjustments)
adjustments->clear();
- result.result = input16;
+ result.result = host16;
}
}
@@ -320,9 +342,10 @@ IDNConversionResult UnsafeIDNToUnicodeWithAdjustments(
// all even though it's possible to make up look-alike labels with ASCII
// characters alone.
bool IsIDNComponentSafe(base::StringPiece16 label,
- base::StringPiece top_level_domain) {
- return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(label,
- top_level_domain);
+ base::StringPiece top_level_domain,
+ base::StringPiece16 top_level_domain_unicode) {
+ return g_idn_spoof_checker.Get().SafeToDisplayAsUnicode(
+ label, top_level_domain, top_level_domain_unicode);
}
// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
@@ -375,6 +398,7 @@ base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;
bool IDNToUnicodeOneComponent(const base::char16* comp,
size_t comp_len,
base::StringPiece top_level_domain,
+ base::StringPiece16 top_level_domain_unicode,
bool enable_spoof_checks,
base::string16* out,
bool* has_idn_component) {
@@ -411,8 +435,10 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
if (U_SUCCESS(status) && info.errors == 0) {
*has_idn_component = true;
- // Converted successfully. Ensure that the converted component
- // can be safely displayed to the user.
+ // Converted successfully. At this point the length of the output string
+ // is original_length + output_length which may be shorter than the current
+ // length of |out|. Trim |out| and ensure that the converted component can
+ // be safely displayed to the user.
out->resize(original_length + output_length);
if (!enable_spoof_checks) {
return true;
@@ -420,7 +446,7 @@ bool IDNToUnicodeOneComponent(const base::char16* comp,
if (IsIDNComponentSafe(
base::StringPiece16(out->data() + original_length,
base::checked_cast<size_t>(output_length)),
- top_level_domain)) {
+ top_level_domain, top_level_domain_unicode)) {
return true;
}
}