summary | refs | log | tree | commit | diff
path: root/chromium/components/lookalikes
diff options
context:
space:
mode:
author Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-10-12 14:27:29 +0200
committer Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-10-13 09:35:20 +0000
commit c30a6232df03e1efbd9f3b226777b07e087a1122 (patch)
tree e992f45784689f373bcc38d1b79a239ebe17ee23 /chromium/components/lookalikes
parent 7b5b123ac58f58ffde0f4f6e488bcd09aa4decd3 (diff)
download qtwebengine-chromium-c30a6232df03e1efbd9f3b226777b07e087a1122.tar.gz
BASELINE: Update Chromium to 85.0.4183.140 (85-based)
Change-Id: Iaa42f4680837c57725b1344f108c0196741f6057
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/components/lookalikes')
-rw-r--r-- chromium/components/lookalikes/core/BUILD.gn | 1
-rw-r--r-- chromium/components/lookalikes/core/features.cc | 3
-rw-r--r-- chromium/components/lookalikes/core/features.h | 4
-rw-r--r-- chromium/components/lookalikes/core/lookalike_url_util.cc | 372
-rw-r--r-- chromium/components/lookalikes/core/lookalike_url_util.h | 48
-rw-r--r-- chromium/components/lookalikes/core/lookalike_url_util_unittest.cc | 242
6 files changed, 466 insertions, 204 deletions
diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn
index ac6a4d28be9..65e89c84808 100644
--- a/chromium/components/lookalikes/core/BUILD.gn
+++ b/chromium/components/lookalikes/core/BUILD.gn
@@ -27,6 +27,7 @@ jumbo_source_set("unit_tests") {
deps = [
":core",
+ ":features",
"//net:test_support",
"//testing/gtest",
]
diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc
index faa42a292a2..99d9c417d37 100644
--- a/chromium/components/lookalikes/core/features.cc
+++ b/chromium/components/lookalikes/core/features.cc
@@ -10,5 +10,8 @@ namespace features {
const base::Feature kDetectTargetEmbeddingLookalikes{
"TargetEmbeddingLookalikes", base::FEATURE_DISABLED_BY_DEFAULT};
+const base::Feature kLookalikeInterstitialForPunycode{
+ "LookalikeInterstitialForPunycode", base::FEATURE_DISABLED_BY_DEFAULT};
+
} // namespace features
} // namespace lookalikes
diff --git a/chromium/components/lookalikes/core/features.h b/chromium/components/lookalikes/core/features.h
index 988c1c4043c..453e1146082 100644
--- a/chromium/components/lookalikes/core/features.h
+++ b/chromium/components/lookalikes/core/features.h
@@ -15,6 +15,10 @@ namespace features {
COMPONENT_EXPORT(LOOKALIKES_FEATURES)
extern const base::Feature kDetectTargetEmbeddingLookalikes;
+// This feature enables interstitial warnings for certain punycode domains.
+COMPONENT_EXPORT(LOOKALIKES_FEATURES)
+extern const base::Feature kLookalikeInterstitialForPunycode;
+
} // namespace features
} // namespace lookalikes
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc
index 2e82da34a54..4a350ed122e 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util.cc
@@ -14,6 +14,7 @@
#include "base/memory/singleton.h"
#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_macros.h"
+#include "base/strings/string_piece.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
@@ -36,6 +37,28 @@ const char kHistogramName[] = "NavigationSuggestion.Event";
namespace {
+// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
+// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
+const char kDigitChars[] = "0123456789";
+
+// Minimum length of e2LD protected against target embedding. For example,
+// foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but we don't flag it since
+// "baz" is shorter than kMinE2LDLengthForTargetEmbedding.
+const size_t kMinE2LDLengthForTargetEmbedding = 4;
+
+// Additional common words supplied through a feature parameter. They are
+// checked alongside the static |kCommonWords| list, so new common words can be
+// added using a flag if needed.
+const base::FeatureParam<std::string> kAdditionalCommonWords{
+ &lookalikes::features::kDetectTargetEmbeddingLookalikes,
+ "additional_common_words", ""};
+
+// We might not protect a domain whose e2LD is a common word in target embedding
+// based on the TLD that is paired with it.
+const char* kCommonWords[] = {"shop", "jobs", "live", "info", "study",
+ "asahi", "weather", "health", "forum", "radio",
+ "ideal", "research", "france", "free", "mobile",
+ "sky", "ask"};
+
bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
const url_formatter::Skeletons& skeletons2) {
DCHECK(!skeletons1.empty());
@@ -86,27 +109,18 @@ std::string GetSimilarDomainFromTop500(
}
const std::string top_domain =
- url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton).domain;
+ url_formatter::LookupSkeletonInTopDomains(
+ top_domain_skeleton, url_formatter::SkeletonType::kFull)
+ .domain;
DCHECK(!top_domain.empty());
- // If the only difference between the navigated and top
- // domains is the registry part, this is unlikely to be a spoofing
- // attempt. Ignore this match and continue. E.g. If the navigated domain
- // is google.com.tw and the top domain is google.com.tr, this won't
- // produce a match.
- const std::string top_domain_without_registry =
- url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
- DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
- top_domain_without_registry));
- if (navigated_domain.domain_without_registry ==
- top_domain_without_registry) {
+ if (IsLikelyEditDistanceFalsePositive(navigated_domain,
+ GetDomainInfo(top_domain))) {
continue;
}
// Skip past domains that are allowed to be spoofed.
- if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
- url::kStandardSchemeSeparator +
- top_domain))) {
+ if (target_allowlisted.Run(top_domain)) {
continue;
}
@@ -134,20 +148,12 @@ std::string GetSimilarDomainFromEngagedSites(
continue;
}
- // If the only difference between the navigated and engaged
- // domain is the registry part, this is unlikely to be a spoofing
- // attempt. Ignore this match and continue. E.g. If the navigated
- // domain is google.com.tw and the top domain is google.com.tr, this
- // won't produce a match.
- if (navigated_domain.domain_without_registry ==
- engaged_site.domain_without_registry) {
+ if (IsLikelyEditDistanceFalsePositive(navigated_domain, engaged_site)) {
continue;
}
// Skip past domains that are allowed to be spoofed.
- if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
- url::kStandardSchemeSeparator +
- engaged_site.domain_and_registry))) {
+ if (target_allowlisted.Run(engaged_site.domain_and_registry)) {
continue;
}
@@ -164,61 +170,109 @@ void RecordEvent(NavigationSuggestionEvent event) {
// Returns the parts of the domain that are separated by "." or "-", not
// including the eTLD.
-std::vector<std::string> SplitDomainWithouteTLDIntoTokens(
+//
+// |host_without_etld| must outlive the return value since the vector contains
+// StringPieces.
+std::vector<base::StringPiece> SplitDomainWithouteTLDIntoTokens(
const std::string& host_without_etld) {
- return base::SplitString(host_without_etld, "-.", base::TRIM_WHITESPACE,
- base::SPLIT_WANT_NONEMPTY);
+ return base::SplitStringPiece(host_without_etld, "-.", base::TRIM_WHITESPACE,
+ base::SPLIT_WANT_NONEMPTY);
}
-// Checks whether |domain| is a top domain. If yes, returns true and fills
-// |found_domain| with the matching top domain.
-bool IsTop500Domain(const DomainInfo& domain, std::string* found_domain) {
- for (auto& skeleton : domain.skeletons) {
- // Matching with top domains is only done with skeleton matching. We check
- // if the skeleton of our hostname matches the skeleton of any top domain.
- url_formatter::TopDomainEntry matched_domain =
- url_formatter::IDNSpoofChecker().LookupSkeletonInTopDomains(skeleton);
- // We are only interested in an exact match with a top 500 domain (as
- // opposed to skeleton match). Here we check that the matched domain is a
- // top 500 domain and also the hostname of the matched domain is exactly the
- // same as our input eTLD+1.
- if (matched_domain.is_top_500 &&
- matched_domain.domain == domain.domain_and_registry) {
- *found_domain = matched_domain.domain;
+// Returns whether any subdomain ending in the last entry of |domain_labels| is
+// allowlisted. e.g. if domain_labels = {foo,scholar,google,com}, checks the
+// allowlist for google.com, scholar.google.com, and foo.scholar.google.com.
+bool ASubdomainIsAllowlisted(
+ const base::span<const base::StringPiece>& domain_labels,
+ const LookalikeTargetAllowlistChecker& in_target_allowlist) {
+ DCHECK(domain_labels.size() >= 2);
+ std::string potential_hostname =
+ domain_labels[domain_labels.size() - 1].as_string();
+ // Attach each token from the end to the embedded target to check if that
+ // subdomain has been allowlisted.
+ for (int i = domain_labels.size() - 2; i >= 0; i--) {
+ potential_hostname =
+ domain_labels[i].as_string() + "." + potential_hostname;
+ if (in_target_allowlist.Run(potential_hostname)) {
return true;
}
}
return false;
}
-// Checks if the targeted domain is allowlisted. To check that we need to
-// check all of the subdomains that could be made. The reason is for example
-// in the case of "foo.scholar.google.com.university.edu", "google.com" is
-// considered as the targeted domain. We need to make sure
-// "scholar.google.com" or "foo.scholar.google.com" are not allowlisted
-// before marking the input domain as a target embedding domain.
-bool ASubdomainIsAllowlisted(
- const std::string& embedded_target,
- const base::span<const std::string>& subdomain_labels_so_far,
- const LookalikeTargetAllowlistChecker& in_target_allowlist) {
- const std::string https_scheme =
- url::kHttpsScheme + std::string(url::kStandardSchemeSeparator);
+// Returns the top domain if the top domain without its separators matches the
+// |potential_target| (e.g. googlecom). The matching is a skeleton matching.
+std::string GetMatchingTopDomainWithoutSeparators(
+ const base::StringPiece& potential_target) {
+ const url_formatter::Skeletons skeletons =
+ url_formatter::GetSkeletons(base::UTF8ToUTF16(potential_target));
- if (in_target_allowlist.Run(GURL(https_scheme + embedded_target))) {
+ for (const auto& skeleton : skeletons) {
+ url_formatter::TopDomainEntry matched_domain =
+ url_formatter::LookupSkeletonInTopDomains(
+ skeleton, url_formatter::SkeletonType::kSeparatorsRemoved);
+ if (!matched_domain.domain.empty() &&
+ matched_domain.skeleton_type ==
+ url_formatter::SkeletonType::kSeparatorsRemoved) {
+ return matched_domain.domain;
+ }
+ }
+ return std::string();
+}
+
+// Returns whether |domain| shares the skeleton of an eTLD+1 with an engaged
+// site or a top 500 domain. On a match, |embedded_target| is set to the
+// matching eTLD+1.
+bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
+ const DomainInfo& domain,
+ const std::vector<DomainInfo>& engaged_sites,
+ std::string* embedded_target) {
+ for (const auto& skeleton : domain.skeletons) {
+ for (const auto& engaged_site : engaged_sites) {
+ if (base::Contains(engaged_site.skeletons, skeleton)) {
+ *embedded_target = engaged_site.domain_and_registry;
+ return true;
+ }
+ }
+ }
+ for (const auto& skeleton : domain.skeletons) {
+ const url_formatter::TopDomainEntry top_domain =
+ url_formatter::LookupSkeletonInTopDomains(
+ skeleton, url_formatter::SkeletonType::kFull);
+ if (!top_domain.domain.empty() && top_domain.is_top_500) {
+ *embedded_target = top_domain.domain;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns whether the e2LD of |domain| (its |domain_without_registry|) is a
+// common word, which is a common indication of a likely false positive.
+bool UsesCommonWord(const DomainInfo& domain) {
+ std::vector<std::string> additional_common_words =
+ base::SplitString(kAdditionalCommonWords.Get(), ",",
+ base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+ if (base::Contains(additional_common_words, domain.domain_without_registry)) {
return true;
}
- std::string potential_hostname = embedded_target;
- // Attach each token from the end to the embedded target to check if that
- // subdomain has been allowlisted.
- for (int i = subdomain_labels_so_far.size() - 1; i >= 0; i--) {
- potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname;
- if (in_target_allowlist.Run(GURL(https_scheme + potential_hostname))) {
+ for (auto* common_word : kCommonWords) {
+ if (domain.domain_without_registry == common_word) {
return true;
}
}
return false;
}
+// A domain is allowed to be embedded if its e2LD is a common word or any
+// valid partial subdomain is allowlisted.
+bool IsAllowedToBeEmbedded(
+ const DomainInfo& embedded_target,
+ const base::span<const base::StringPiece>& subdomain_span,
+ const LookalikeTargetAllowlistChecker& in_target_allowlist) {
+ return UsesCommonWord(embedded_target) ||
+ ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist);
+}
+
} // namespace
DomainInfo::DomainInfo(const std::string& arg_hostname,
@@ -236,14 +290,14 @@ DomainInfo::~DomainInfo() = default;
DomainInfo::DomainInfo(const DomainInfo&) = default;
-DomainInfo GetDomainInfo(const GURL& url) {
- if (net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host())) {
+DomainInfo GetDomainInfo(const std::string& hostname) {
+ if (net::HostStringIsLocalhost(hostname) ||
+ net::IsHostnameNonUnique(hostname)) {
return DomainInfo(std::string(), std::string(), std::string(),
url_formatter::IDNConversionResult(),
url_formatter::Skeletons());
}
- const std::string hostname = url.host();
- const std::string domain_and_registry = GetETLDPlusOne(url.host());
+ const std::string domain_and_registry = GetETLDPlusOne(hostname);
const std::string domain_without_registry =
domain_and_registry.empty()
? std::string()
@@ -268,6 +322,10 @@ DomainInfo GetDomainInfo(const GURL& url) {
idn_result, skeletons);
}
+DomainInfo GetDomainInfo(const GURL& url) {
+ return GetDomainInfo(url.host());
+}
+
std::string GetETLDPlusOne(const std::string& hostname) {
return net::registry_controlled_domains::GetDomainAndRegistry(
hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
@@ -314,12 +372,67 @@ bool IsEditDistanceAtMostOne(const base::string16& str1,
return edit_count <= 1;
}
+bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
+ const DomainInfo& matched_domain) {
+ DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
+ matched_domain.domain_and_registry));
+ DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
+ navigated_domain.domain_and_registry));
+ // If the only difference between the domains is the registry part, this is
+ // unlikely to be a spoofing attempt and we should ignore this match. E.g.
+ // exclude matches like google.com.tw and google.com.tr.
+ if (navigated_domain.domain_without_registry ==
+ matched_domain.domain_without_registry) {
+ return true;
+ }
+
+ // If the domains only differ by a numeric suffix on their e2LD (e.g.
+ // site45.tld and site35.tld), then ignore the match.
+ auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry,
+ kDigitChars, base::TRIM_TRAILING);
+ auto matched_trimmed = base::TrimString(
+ matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING);
+ DCHECK_NE(navigated_domain.domain_without_registry,
+ matched_domain.domain_without_registry);
+ // We previously verified that the domains without registries weren't equal,
+ // so if they're equal now, the match must have come from numeric suffixes.
+ if (nav_trimmed == matched_trimmed) {
+ return true;
+ }
+
+ // Ignore domains that only differ by an insertion/substitution at the
+ // start, as these are usually different words, not lookalikes.
+ const auto nav_dom_len = navigated_domain.domain_and_registry.length();
+ const auto matched_dom_len = matched_domain.domain_and_registry.length();
+ const auto& nav_dom = navigated_domain.domain_and_registry;
+ const auto& matched_dom = matched_domain.domain_and_registry;
+ if (nav_dom_len == matched_dom_len) {
+ // e.g. hank vs tank
+ if (nav_dom.substr(1) == matched_dom.substr(1)) {
+ return true;
+ }
+ } else if (nav_dom_len < matched_dom_len) {
+ // e.g. oodle vs poodle
+ if (nav_dom == matched_dom.substr(1)) {
+ return true;
+ }
+ } else { // navigated_dom_len > matched_dom_len
+ // e.g. poodle vs oodle
+ if (nav_dom.substr(1) == matched_dom) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool IsTopDomain(const DomainInfo& domain_info) {
// Top domains are only accessible through their skeletons, so query the top
// domains trie for each skeleton of this domain.
for (const std::string& skeleton : domain_info.skeletons) {
const url_formatter::TopDomainEntry top_domain =
- url_formatter::LookupSkeletonInTopDomains(skeleton);
+ url_formatter::LookupSkeletonInTopDomains(
+ skeleton, url_formatter::SkeletonType::kFull);
if (domain_info.domain_and_registry == top_domain.domain) {
return true;
}
@@ -402,12 +515,18 @@ bool GetMatchingDomain(
}
}
- if (IsTargetEmbeddingLookalike(navigated_domain.hostname, engaged_sites,
- in_target_allowlist, matched_domain)) {
+ TargetEmbeddingType embedding_type =
+ GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites,
+ in_target_allowlist, matched_domain);
+ if (embedding_type == TargetEmbeddingType::kSafetyTip) {
+ *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips;
+ return true;
+ } else if (embedding_type == TargetEmbeddingType::kInterstitial) {
*match_type = LookalikeUrlMatchType::kTargetEmbedding;
return true;
}
+ DCHECK(embedding_type == TargetEmbeddingType::kNone);
return false;
}
@@ -431,84 +550,89 @@ void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) {
case LookalikeUrlMatchType::kSkeletonMatchTop5k:
RecordEvent(NavigationSuggestionEvent::kMatchSkeletonTop5k);
break;
+ case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips:
+ RecordEvent(
+ NavigationSuggestionEvent::kMatchTargetEmbeddingForSafetyTips);
+ break;
+ case LookalikeUrlMatchType::kFailedSpoofChecks:
+ RecordEvent(NavigationSuggestionEvent::kFailedSpoofChecks);
+ break;
case LookalikeUrlMatchType::kNone:
break;
}
}
-bool IsTargetEmbeddingLookalike(
+TargetEmbeddingType GetTargetEmbeddingType(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
std::string* safe_hostname) {
const std::string host_without_etld =
url_formatter::top_domains::HostnameWithoutRegistry(hostname);
- const std::vector<std::string> hostname_tokens_without_etld =
+ const std::vector<base::StringPiece> hostname_tokens_without_etld =
SplitDomainWithouteTLDIntoTokens(host_without_etld);
- // For each token, we look backwards to the previous token to see if
- // "|prev_token|.|token|" forms a top domain or a high engaged domain.
- std::string prev_token;
-
- // We can have domains separated by '-'s or '.'s. In order to find target
- // embedding urls with google.com.com or google-com.com, we get url parts as
- // anything that is between two '-'s or '.'s. We check to see if any two
- // consecutive tokens form a top or highly-engaged domain.
- // Because of the way this matching is working, we can not identify target
- // embedding attacks against domains that contain '-' in their address
- // (e.g programme-tv.net). Also if the eTLD of the target has more than one
- // part, we won't be able to protect it (e.g. google.co.uk).
- for (size_t i = 0; i < hostname_tokens_without_etld.size(); i++) {
- const std::string token = hostname_tokens_without_etld[i];
- const std::string possible_embedded_target = prev_token + "." + token;
- if (prev_token.empty()) {
- prev_token = token;
- continue;
+ // There are O(n^2) potential target embeddings in a domain name. We want to
+ // be comprehensive, but optimize so that usually we needn't check all of
+ // them. We do that by sweeping from the back of the embedding domain, towards
+ // the front, checking for a valid eTLD. If we find one, then we consider the
+ // possible embedded domains that end in that eTLD (i.e. all possible start
+ // points from the beginning of the string onward).
+ for (int end = hostname_tokens_without_etld.size(); end > 0; --end) {
+ base::span<const base::StringPiece> etld_check_span(
+ hostname_tokens_without_etld.data(), end);
+ std::string etld_check_host = base::JoinString(etld_check_span, ".");
+ auto etld_check_dominfo = GetDomainInfo(etld_check_host);
+
+ // Check if the final token is a no-separator target (e.g. "googlecom").
+ // This check happens first so that we can exclude invalid eTLD+1s next.
+ std::string embedded_target = GetMatchingTopDomainWithoutSeparators(
+ hostname_tokens_without_etld[end - 1]);
+ if (!embedded_target.empty() &&
+ !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
+ in_target_allowlist)) {
+ *safe_hostname = embedded_target;
+ return TargetEmbeddingType::kInterstitial;
}
- prev_token = token;
- // Short domains are more likely to be misidentified as being embedded. For
- // example "mi.com", "mk.ru", or "com.ru" are a few examples of domains that
- // could trigger the target embedding heuristic falsely.
- if (possible_embedded_target.size() < 7) {
+ // Exclude otherwise-invalid eTLDs.
+ if (etld_check_dominfo.domain_without_registry.empty()) {
continue;
}
- // We want to protect user's high engaged websites as well as top domains.
- GURL possible_target_url(url::kHttpsScheme +
- std::string(url::kStandardSchemeSeparator) +
- possible_embedded_target);
- DomainInfo possible_target_domain = GetDomainInfo(possible_target_url);
- // We check if the eTLD+1 is a valid domain, otherwise there is no point in
- // checking if it is a top domain or an engaged domain.
- if (possible_target_domain.domain_and_registry.empty()) {
+ // Exclude e2LDs that are too short. <= because domain_without_registry has
+ // a trailing ".".
+ if (etld_check_dominfo.domain_without_registry.length() <=
+ kMinE2LDLengthForTargetEmbedding) {
continue;
}
- *safe_hostname =
- GetMatchingSiteEngagementDomain(engaged_sites, possible_target_domain);
- // |GetMatchingSiteEngagementDomain| uses skeleton matching, we make sure
- // the found engaged site is an exact match of the embedded target.
- if (*safe_hostname != possible_embedded_target) {
- *safe_hostname = std::string();
- }
- if (safe_hostname->empty() &&
- !IsTop500Domain(possible_target_domain, safe_hostname)) {
- continue;
+ // Check for exact matches against engaged sites, among all possible
+ // subdomains ending at |end|.
+ for (int start = 0; start < end - 1; ++start) {
+ const base::span<const base::StringPiece> span(
+ (hostname_tokens_without_etld.data() + start), end - start);
+ auto embedded_hostname = base::JoinString(span, ".");
+ auto embedded_dominfo = GetDomainInfo(embedded_hostname);
+
+ for (auto& engaged_site : engaged_sites) {
+ if (engaged_site.hostname == embedded_dominfo.hostname &&
+ !IsAllowedToBeEmbedded(embedded_dominfo, span,
+ in_target_allowlist)) {
+ *safe_hostname = engaged_site.hostname;
+ return TargetEmbeddingType::kInterstitial;
+ }
+ }
}
- // Check if any subdomain is allowlisted.
- std::vector<std::string> subdomain_labels_so_far(
- hostname_tokens_without_etld.begin(),
- hostname_tokens_without_etld.begin() + i - 1);
- if (!ASubdomainIsAllowlisted(possible_embedded_target,
- subdomain_labels_so_far,
- in_target_allowlist)) {
- return true;
+ // There were no exact engaged site matches, but there may yet still be a
+ // match against the eTLD+1 of an engaged or top site.
+ if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
+ etld_check_dominfo, engaged_sites, safe_hostname) &&
+ !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
+ in_target_allowlist)) {
+ return TargetEmbeddingType::kInterstitial;
}
-
- // A target is found but it was allowlisted.
- *safe_hostname = std::string();
}
- return false;
+ return TargetEmbeddingType::kNone;
}
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h
index 1bc49c24384..00946f6d909 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.h
+++ b/chromium/components/lookalikes/core/lookalike_url_util.h
@@ -20,7 +20,16 @@ extern const char kHistogramName[];
}
using LookalikeTargetAllowlistChecker =
- base::RepeatingCallback<bool(const GURL&)>;
+ base::RepeatingCallback<bool(const std::string&)>;
+
+// Used for |GetTargetEmbeddingType| return value. It shows if the target
+// embedding triggers on the input domain, and if it does, what type of warning
+// should be shown to the user.
+enum class TargetEmbeddingType {
+ kNone = 0,
+ kInterstitial = 1,
+ kSafetyTip = 2,
+};
// Used for UKM. There is only a single LookalikeUrlMatchType per navigation.
enum class LookalikeUrlMatchType {
@@ -33,10 +42,16 @@ enum class LookalikeUrlMatchType {
kTargetEmbedding = 5,
kSkeletonMatchTop500 = 6,
kSkeletonMatchTop5k = 7,
+ kTargetEmbeddingForSafetyTips = 8,
+
+ // The domain name failed IDN spoof checks but didn't match a safe hostname.
+ // As a result, there is no URL to suggest to the user in the form of "Did
+ // you mean <url>?".
+ kFailedSpoofChecks = 9,
// Append new items to the end of the list above; do not modify or replace
// existing values. Comment out obsolete items.
- kMaxValue = kSkeletonMatchTop5k,
+ kMaxValue = kFailedSpoofChecks,
};
// Used for UKM. There is only a single LookalikeUrlBlockingPageUserAction per
@@ -66,10 +81,12 @@ enum class NavigationSuggestionEvent {
kMatchTargetEmbedding = 7,
kMatchSkeletonTop500 = 8,
kMatchSkeletonTop5k = 9,
+ kMatchTargetEmbeddingForSafetyTips = 10,
+ kFailedSpoofChecks = 11,
// Append new items to the end of the list above; do not modify or
// replace existing values. Comment out obsolete items.
- kMaxValue = kMatchSkeletonTop5k,
+ kMaxValue = kFailedSpoofChecks,
};
struct DomainInfo {
@@ -99,9 +116,12 @@ struct DomainInfo {
DomainInfo(const DomainInfo& other);
};
-// Returns a DomainInfo instance computed from |url|. Will return empty fields
-// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is
-// empty.
+// Returns a DomainInfo instance computed from |hostname|. Will return empty
+// fields for non-unique hostnames (e.g. site.test), localhost or sites whose
+// eTLD+1 is empty.
+DomainInfo GetDomainInfo(const std::string& hostname);
+
+// Convenience function for returning GetDomainInfo(url.host()).
DomainInfo GetDomainInfo(const GURL& url);
// Returns true if the Levenshtein distance between |str1| and |str2| is at most
@@ -110,6 +130,13 @@ DomainInfo GetDomainInfo(const GURL& url);
bool IsEditDistanceAtMostOne(const base::string16& str1,
const base::string16& str2);
+// Returns whether |navigated_domain| and |matched_domain| are likely to be edit
+// distance false positives, and thus the user should *not* be warned.
+//
+// Assumes |navigated_domain| and |matched_domain| are edit distance matches.
+bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
+ const DomainInfo& matched_domain);
+
// Returns true if the domain given by |domain_info| is a top domain.
bool IsTopDomain(const DomainInfo& domain_info);
@@ -141,14 +168,19 @@ void RecordUMAFromMatchType(LookalikeUrlMatchType match_type);
// |safe_hostname| to the url of the embedded target domain.
// At the moment we consider the following cases as Target Embedding:
// example-google.com-site.com, example.google.com-site.com,
-// example-google-com-site.com, example.google.com.site.com,
+// example-google-info-site.com, example.google.com.site.com,
// example-googlé.com-site.com where the embedded target is google.com. We
// detect embeddings of top 500 domains and engaged domains. However, to reduce
// false positives, we do not protect domains that are shorter than 7 characters
// long (e.g. com.ru).
// This function checks possible targets against |in_target_allowlist| to skip
// permitted embeddings.
-bool IsTargetEmbeddingLookalike(
+// If no target embedding is found, the return value will be |kNone|.
+// When the target is embedded with another TLD instead of its actual TLD, it
+// should trigger a Safety Tip when the embedded TLD is a ccTLD. In this
+// situation, return value will be |kSafetyTip|. All the other triggers will
+// result in a |kInterstitial| return value.
+TargetEmbeddingType GetTargetEmbeddingType(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
index 4af557b6ad9..1aed2eddeec 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
@@ -6,6 +6,8 @@
#include "base/bind.h"
#include "base/strings/utf_string_conversions.h"
+#include "base/test/scoped_feature_list.h"
+#include "components/lookalikes/core/features.h"
#include "testing/gtest/include/gtest/gtest.h"
TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
@@ -63,102 +65,198 @@ TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
bool result =
IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
base::WideToUTF16(test_case.top_domain));
- EXPECT_EQ(test_case.expected, result);
+ EXPECT_EQ(test_case.expected, result)
+ << "when comparing " << test_case.domain << " with "
+ << test_case.top_domain;
}
}
-bool IsGoogleScholar(const GURL& hostname) {
- return hostname.host() == "scholar.google.com";
+TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) {
+ const struct TestCase {
+ const char* domain;
+ const char* top_domain;
+ bool is_likely_false_positive;
+ } kTestCases[] = {
+ // Most edit distance instances are not likely false positives.
+ {"abcxd.com", "abcyd.com", false}, // Substitution
+ {"abcxd.com", "abcxxd.com", false}, // Deletion
+ {"abcxxd.com", "abcxd.com", false}, // Insertion
+
+ // But we permit cases where the only difference is in the tld.
+ {"abcde.com", "abcde.net", true},
+
+ // We also permit matches that are only due to a numeric suffix,
+ {"abcd1.com", "abcd2.com", true}, // Substitution
+ {"abcde.com", "abcde1.com", true}, // Numeric deletion
+ {"abcde1.com", "abcde.com", true}, // Numeric insertion
+ {"abcd11.com", "abcd21.com", true}, // Not-final-digit substitution
+ {"a.abcd1.com", "abcd2.com", true}, // Only relevant for eTLD+1.
+ // ...and that change must be due to the numeric suffix.
+ {"abcx1.com", "abcy1.com", false}, // Substitution before suffix
+ {"abcd1.com", "abcde1.com", false}, // Deletion before suffix
+ {"abcde1.com", "abcd1.com", false}, // Insertion before suffix
+ {"abcdx.com", "abcdy.com", false}, // Non-numeric substitution at end
+
+ // We also permit matches that are only due to a first-character change,
+ {"xabcd.com", "yabcd.com", true}, // Substitution
+ {"xabcde.com", "abcde.com", true}, // Insertion
+ {"abcde.com", "xabcde.com", true}, // Deletion
+ {"a.abcde.com", "xabcde.com", true}, // For eTLD+1
+ // ...so long as that change is only on the first character, not later.
+ {"abcde.com", "axbcde.com", false}, // Deletion
+ {"axbcde.com", "abcde.com", false}, // Insertion
+ {"axbcde.com", "aybcde.com", false}, // Substitution
+ };
+ for (const TestCase& test_case : kTestCases) {
+ auto navigated =
+ GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
+ url::kStandardSchemeSeparator + test_case.domain));
+ auto matched = GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
+ url::kStandardSchemeSeparator +
+ test_case.top_domain));
+ bool result = IsLikelyEditDistanceFalsePositive(navigated, matched);
+ EXPECT_EQ(test_case.is_likely_false_positive, result)
+ << "when comparing " << test_case.domain << " with "
+ << test_case.top_domain;
+ }
}
-TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
- const std::vector<DomainInfo> engaged_sites = {
- GetDomainInfo(GURL("https://highengagement.com"))};
- const struct TargetEmbeddingHeuristicTestCase {
- const GURL url;
- bool should_trigger;
- } kTestCases[] = {
+bool IsGoogleScholar(const std::string& hostname) {
+ return hostname == "scholar.google.com";
+}
- // Scheme should not affect the outcome.
- {GURL("http://google.com.com"), true},
- {GURL("https://google.com.com"), true},
+struct TargetEmbeddingHeuristicTestCase {
+ const std::string hostname;
+ // Empty when there is no match.
+ const std::string expected_safe_host;
+ const TargetEmbeddingType expected_type;
+};
+TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
+ const std::vector<DomainInfo> kEngagedSites = {
+ GetDomainInfo(GURL("https://highengagement.com")),
+ GetDomainInfo(GURL("https://highengagement.co.uk")),
+ GetDomainInfo(GURL("https://subdomain.highengagement.com")),
+ GetDomainInfo(GURL("https://subdomain.google.com")),
+ };
+ const std::vector<TargetEmbeddingHeuristicTestCase> kTestCases = {
// The length of the hostname should not affect the outcome.
- {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-"
- "outcome-of-this-target-embedding-test-google.com-login.com"),
- true},
- {GURL(
- "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-"
- "outcome-of-this-target-embedding-test.com-login.com"),
- false},
- {GURL(
- "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-"
- "outcome-of-this-target-embedding-test.com-login.com"),
- false},
+ {"this-is-a-very-long-url-but-it-should-not-affect-the-"
+ "outcome-of-this-target-embedding-test-google.com-login.com",
+ "google.com", TargetEmbeddingType::kInterstitial},
+ {"google-com-this-is-a-very-long-url-but-it-should-not-affect-"
+ "the-outcome-of-this-target-embedding-test-login.com",
+ "google.com", TargetEmbeddingType::kInterstitial},
+ {"this-is-a-very-long-url-but-it-should-not-affect-google-the-"
+ "outcome-of-this-target-embedding-test.com-login.com",
+ "", TargetEmbeddingType::kNone},
+ {"google-this-is-a-very-long-url-but-it-should-not-affect-the-"
+ "outcome-of-this-target-embedding-test.com-login.com",
+ "", TargetEmbeddingType::kNone},
// We need an exact skeleton match for our domain, so exclude edit-distance
// matches.
- {GURL("http://goog0le.com-login.com"), false},
+ {"goog0le.com-login.com", "", TargetEmbeddingType::kNone},
- // Unicode characters are currently not handled. As a result, target
- // embedding sites that embed lookalikes of top domains aren't flagged.
- {GURL("http://googlé.com-login.com"), false},
- {GURL("http://sth-googlé.com-sth.com"), false},
+ // Unicode characters should be handled.
+ {"googlé.com-login.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+ {"foo-googlé.com-bar.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
- // The basic state
- {GURL("http://google.com.sth.com"), true},
+ // The basic states
+ {"google.com.foo.com", "google.com", TargetEmbeddingType::kInterstitial},
// A '-' before the domain name should be ignored.
- {GURL("http://sth-google.com-sth.com"), true},
-
+ {"foo-google.com-bar.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
// The embedded target's TLD doesn't necessarily need to be followed by a
// '-' and could be a subdomain by itself.
- {GURL("http://sth-google.com.sth.com"), true},
- {GURL("http://a.b.c.d.e.f.g.h.sth-google.com.sth.com"), true},
- {GURL("http://a.b.c.d.e.f.g.h.google.com-sth.com"), true},
- {GURL("http://1.2.3.4.5.6.google.com-sth.com"), true},
-
+ {"foo-google.com.foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+ {"a.b.c.d.e.f.g.h.foo-google.com.foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+ {"a.b.c.d.e.f.g.h.google.com-foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+ {"1.2.3.4.5.6.google.com-foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
// Target domain could be in the middle of subdomains.
- {GURL("http://sth.google.com.sth.com"), true},
- {GURL("http://sth.google.com-sth.com"), true},
-
+ {"foo.google.com.foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
// The target domain and its TLD should be next to each other.
- {GURL("http://sth-google.l.com-sth.com"), false},
-
- // Target domain should match only with its actual TLD.
- {GURL("http://google.edu.com"), false},
+ {"foo-google.l.com-foo.com", "", TargetEmbeddingType::kNone},
// Target domain might be separated with a dash instead of a dot.
- {GURL("http://sth.google-com-sth.com"), true},
- // Target domain could be an engaged domain
- {GURL("http://highengagement-com-login.com"), true},
- // If target domain is an allowlisted domain, it should not trigger the
- // heuristic.
- {GURL("http://foo.scholar.google-com-login.com"), false},
- // An allowlisted domain will make sure it is not marked as an embedded
- // target. However, other targets could still be embedded in the domain.
- {GURL("http://foo.google.com.scholar.google-com-login.com"), true},
- {GURL("http://foo.scholar.google-com.google.com-login.com"), true},
-
- // Ensure legitimate domains don't trigger the heuristic.
- {GURL("http://google.com"), false},
- {GURL("http://google.co.uk"), false},
- {GURL("http://google.randomreg-login.com"), false},
+ {"foo.google-com-foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+
+ // Allowlisted domains should not trigger the heuristic.
+ {"scholar.google.com.foo.com", "", TargetEmbeddingType::kNone},
+ {"scholar.google.com-google.com.foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+ {"google.com-scholar.google.com.foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+ {"foo.scholar.google.com.foo.com", "", TargetEmbeddingType::kNone},
+ {"scholar.foo.google.com.foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+
+ // Targets should be longer than 6 characters.
+ {"hp.com-foo.com", "", TargetEmbeddingType::kNone},
+
+ // Targets whose e2LD is a common word are not considered embedded targets,
+ // either for all TLDs or only for another-TLD matching.
+ {"foo.jobs.com-foo.com", "", TargetEmbeddingType::kNone},
+ {"foo.office.com-foo.com", "office.com",
+ TargetEmbeddingType::kInterstitial},
+ {"foo.jobs.org-foo.com", "", TargetEmbeddingType::kNone},
+ {"foo.office.org-foo.com", "", TargetEmbeddingType::kNone},
+
+ // Targets could be embedded without their dots and dashes.
+ {"foo.googlecom-foo.com", "google.com",
+ TargetEmbeddingType::kInterstitial},
+
+ // Ensure legitimate domains don't trigger.
+ {"foo.google.com", "", TargetEmbeddingType::kNone},
+ {"foo.bar.google.com", "", TargetEmbeddingType::kNone},
+ {"google.com", "", TargetEmbeddingType::kNone},
+ {"google.co.uk", "", TargetEmbeddingType::kNone},
+ {"google.randomreg-login.com", "", TargetEmbeddingType::kNone},
+ {"com.foo.com", "", TargetEmbeddingType::kNone},
+
+ // Multipart eTLDs should work.
+ {"foo.google.co.uk.foo.com", "google.co.uk",
+ TargetEmbeddingType::kInterstitial},
+ {"foo.highengagement-co-uk.foo.com", "highengagement.co.uk",
+ TargetEmbeddingType::kInterstitial},
+
+ // Engaged sites should trigger as specifically as possible, and should
+ // trigger preferentially to top sites when possible.
+ {"foo.highengagement.com.foo.com", "highengagement.com",
+ TargetEmbeddingType::kInterstitial},
+ {"foo.subdomain.highengagement.com.foo.com",
+ "subdomain.highengagement.com", TargetEmbeddingType::kInterstitial},
+ {"foo.subdomain.google.com.foo.com", "subdomain.google.com",
+ TargetEmbeddingType::kInterstitial},
+ // Skeleton matching should work against engaged sites at the eTLD+1 level.
+ {"subdomain.highéngagement.com-foo.com", "highengagement.com",
+ TargetEmbeddingType::kInterstitial},
};
- for (const auto& kTestCase : kTestCases) {
+ for (auto& test_case : kTestCases) {
std::string safe_hostname;
- if (kTestCase.should_trigger) {
- EXPECT_TRUE(IsTargetEmbeddingLookalike(
- kTestCase.url.host(), engaged_sites,
- base::BindRepeating(&IsGoogleScholar), &safe_hostname))
- << "Expected that \"" << kTestCase.url
- << " should trigger but it didn't.";
+ TargetEmbeddingType embedding_type = GetTargetEmbeddingType(
+ test_case.hostname, kEngagedSites,
+ base::BindRepeating(&IsGoogleScholar), &safe_hostname);
+ if (test_case.expected_type != TargetEmbeddingType::kNone) {
+ EXPECT_EQ(safe_hostname, test_case.expected_safe_host)
+ << test_case.hostname << " should trigger on "
+ << test_case.expected_safe_host << ", but "
+ << (safe_hostname.empty() ? "it didn't trigger at all."
+ : "triggered on " + safe_hostname);
+ EXPECT_EQ(embedding_type, test_case.expected_type);
} else {
- EXPECT_FALSE(IsTargetEmbeddingLookalike(
- kTestCase.url.host(), engaged_sites,
- base::BindRepeating(&IsGoogleScholar), &safe_hostname))
- << "Expected that \"" << kTestCase.url
- << " shouldn't trigger but it did. For URL: " << safe_hostname;
+ EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone)
+ << test_case.hostname << " unexpectedly triggered on "
+ << safe_hostname;
}
}
}