summaryrefslogtreecommitdiff
path: root/chromium/components/lookalikes
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2020-07-16 11:45:35 +0200
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2020-07-17 08:59:23 +0000
commit552906b0f222c5d5dd11b9fd73829d510980461a (patch)
tree3a11e6ed0538a81dd83b20cf3a4783e297f26d91 /chromium/components/lookalikes
parent1b05827804eaf047779b597718c03e7d38344261 (diff)
downloadqtwebengine-chromium-552906b0f222c5d5dd11b9fd73829d510980461a.tar.gz
BASELINE: Update Chromium to 83.0.4103.122
Change-Id: Ie3a82f5bb0076eec2a7c6a6162326b4301ee291e Reviewed-by: Michael Brüning <michael.bruning@qt.io>
Diffstat (limited to 'chromium/components/lookalikes')
-rw-r--r--chromium/components/lookalikes/BUILD.gn32
-rw-r--r--chromium/components/lookalikes/DEPS5
-rw-r--r--chromium/components/lookalikes/OWNERS3
-rw-r--r--chromium/components/lookalikes/README4
-rw-r--r--chromium/components/lookalikes/lookalike_url_util.cc436
-rw-r--r--chromium/components/lookalikes/lookalike_url_util.h144
-rw-r--r--chromium/components/lookalikes/lookalike_url_util_unittest.cc175
7 files changed, 799 insertions, 0 deletions
diff --git a/chromium/components/lookalikes/BUILD.gn b/chromium/components/lookalikes/BUILD.gn
new file mode 100644
index 00000000000..c28b4f05f45
--- /dev/null
+++ b/chromium/components/lookalikes/BUILD.gn
@@ -0,0 +1,32 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import("//build/config/jumbo.gni")
+
+jumbo_static_library("lookalikes") {
+ sources = [
+ "lookalike_url_util.cc",
+ "lookalike_url_util.h",
+ ]
+ deps = [
+ "//base",
+ "//components/security_state/core:features",
+ "//components/url_formatter",
+ "//components/url_formatter/spoof_checks/top_domains:common",
+ "//components/url_formatter/spoof_checks/top_domains:top500_domains",
+ "//components/url_formatter/spoof_checks/top_domains:top500_domains_header",
+ "//net",
+ ]
+}
+
+jumbo_source_set("unit_tests") {
+ testonly = true
+ sources = [ "lookalike_url_util_unittest.cc" ]
+
+ deps = [
+ ":lookalikes",
+ "//net:test_support",
+ "//testing/gtest",
+ ]
+}
diff --git a/chromium/components/lookalikes/DEPS b/chromium/components/lookalikes/DEPS
new file mode 100644
index 00000000000..563bb1e8d24
--- /dev/null
+++ b/chromium/components/lookalikes/DEPS
@@ -0,0 +1,5 @@
+include_rules = [
+ "+components/security_state",
+ "+components/url_formatter",
+ "+net/base",
+]
diff --git a/chromium/components/lookalikes/OWNERS b/chromium/components/lookalikes/OWNERS
new file mode 100644
index 00000000000..1d70c9fef31
--- /dev/null
+++ b/chromium/components/lookalikes/OWNERS
@@ -0,0 +1,3 @@
+file://chrome/browser/lookalikes/OWNERS
+
+# COMPONENT: UI>Browser>Interstitials
diff --git a/chromium/components/lookalikes/README b/chromium/components/lookalikes/README
new file mode 100644
index 00000000000..7aa6eeaccef
--- /dev/null
+++ b/chromium/components/lookalikes/README
@@ -0,0 +1,4 @@
+This directory contains shared code used for the Lookalike URL blocking page.
+
+The lookalike interstitial is triggered when a user visits a domain that is
+similar to the domain of a popular site. \ No newline at end of file
diff --git a/chromium/components/lookalikes/lookalike_url_util.cc b/chromium/components/lookalikes/lookalike_url_util.cc
new file mode 100644
index 00000000000..fa386d5d6f8
--- /dev/null
+++ b/chromium/components/lookalikes/lookalike_url_util.cc
@@ -0,0 +1,436 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/lookalikes/lookalike_url_util.h"
+
+#include <utility>
+
+#include "base/bind.h"
+#include "base/callback.h"
+#include "base/macros.h"
+#include "base/memory/scoped_refptr.h"
+#include "base/memory/singleton.h"
+#include "base/metrics/field_trial_params.h"
+#include "base/metrics/histogram_macros.h"
+#include "base/strings/string_split.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/task/post_task.h"
+#include "base/task/thread_pool.h"
+#include "base/time/default_clock.h"
+#include "components/security_state/core/features.h"
+#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
+#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
+#include "components/url_formatter/url_formatter.h"
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
+#include "net/base/url_util.h"
+
+namespace lookalikes {
+
+const char kHistogramName[] = "NavigationSuggestion.Event";
+const base::FeatureParam<std::string> kImportantTlds{
+ &security_state::features::kSafetyTipUI, "targetembedding_important_tlds",
+ "com,edu,org,gov"};
+
+} // namespace lookalikes
+
+namespace {
+
+// Returns true when the two skeleton sets have at least one skeleton in
+// common. Both sets are expected to be non-empty.
+bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
+                    const url_formatter::Skeletons& skeletons2) {
+  DCHECK(!skeletons1.empty());
+  DCHECK(!skeletons2.empty());
+  bool has_common_skeleton = false;
+  for (auto it = skeletons1.begin();
+       it != skeletons1.end() && !has_common_skeleton; ++it) {
+    has_common_skeleton = base::Contains(skeletons2, *it);
+  }
+  return has_common_skeleton;
+}
+
+// Walks |engaged_sites| in order and returns the eTLD+1 of the first engaged
+// site that shares a skeleton with |navigated_domain|, i.e. a site that the
+// navigated domain may be attempting to spoof. Returns an empty string when no
+// engaged site matches.
+std::string GetMatchingSiteEngagementDomain(
+    const std::vector<DomainInfo>& engaged_sites,
+    const DomainInfo& navigated_domain) {
+  DCHECK(!navigated_domain.domain_and_registry.empty());
+  for (const DomainInfo& candidate : engaged_sites) {
+    DCHECK(!candidate.domain_and_registry.empty());
+    DCHECK_NE(navigated_domain.domain_and_registry,
+              candidate.domain_and_registry);
+    if (!SkeletonsMatch(navigated_domain.skeletons, candidate.skeletons)) {
+      continue;
+    }
+    return candidate.domain_and_registry;
+  }
+  return std::string();
+}
+
+// Returns the first matching top domain with an edit distance of at most one
+// to any skeleton of |navigated_domain|, or an empty string if there is none.
+// This search is done in lexicographic order on the top 500 suitable domains,
+// instead of in order by popularity. This means that the resulting "similar"
+// domain may not be the most popular domain that matches.
+std::string GetSimilarDomainFromTop500(const DomainInfo& navigated_domain) {
+  for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
+    // The UTF-16 form of the navigated skeleton does not depend on the top
+    // domain it is compared against, so convert it once per skeleton instead
+    // of once per top domain entry.
+    const base::string16 navigated_skeleton16 =
+        base::UTF8ToUTF16(navigated_skeleton);
+    for (const char* const top_domain_skeleton :
+         top500_domains::kTop500EditDistanceSkeletons) {
+      if (!IsEditDistanceAtMostOne(navigated_skeleton16,
+                                   base::UTF8ToUTF16(top_domain_skeleton))) {
+        continue;
+      }
+      const std::string top_domain =
+          url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton)
+              .domain;
+      DCHECK(!top_domain.empty());
+      // If the only difference between the navigated and top
+      // domains is the registry part, this is unlikely to be a spoofing
+      // attempt. Ignore this match and continue. E.g. If the navigated domain
+      // is google.com.tw and the top domain is google.com.tr, this won't
+      // produce a match.
+      const std::string top_domain_without_registry =
+          url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
+      DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
+          top_domain_without_registry));
+      if (navigated_domain.domain_without_registry !=
+          top_domain_without_registry) {
+        return top_domain;
+      }
+    }
+  }
+  return std::string();
+}
+
+// Returns the eTLD+1 of the first engaged site that has a skeleton within an
+// edit distance of at most one to a skeleton of |navigated_domain|, or an
+// empty string if there is none. Engaged sites that are not edit distance
+// candidates are skipped.
+std::string GetSimilarDomainFromEngagedSites(
+    const DomainInfo& navigated_domain,
+    const std::vector<DomainInfo>& engaged_sites) {
+  for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
+    // Convert once per navigated skeleton rather than once per comparison.
+    const base::string16 navigated_skeleton16 =
+        base::UTF8ToUTF16(navigated_skeleton);
+    for (const DomainInfo& engaged_site : engaged_sites) {
+      if (!url_formatter::top_domains::IsEditDistanceCandidate(
+              engaged_site.domain_and_registry)) {
+        continue;
+      }
+      // If the only difference between the navigated and engaged domain is
+      // the registry part, this is unlikely to be a spoofing attempt. E.g. if
+      // the navigated domain is google.com.tw and the engaged domain is
+      // google.com.tr, this won't produce a match. The check does not depend
+      // on the skeletons, so such a site is skipped before any edit distance
+      // is computed.
+      if (navigated_domain.domain_without_registry ==
+          engaged_site.domain_without_registry) {
+        continue;
+      }
+      for (const std::string& engaged_skeleton : engaged_site.skeletons) {
+        if (IsEditDistanceAtMostOne(navigated_skeleton16,
+                                    base::UTF8ToUTF16(engaged_skeleton))) {
+          return engaged_site.domain_and_registry;
+        }
+      }
+    }
+  }
+  return std::string();
+}
+
+void RecordEvent(NavigationSuggestionEvent event) {  // Logs |event| to the lookalikes::kHistogramName enumeration histogram.
+ UMA_HISTOGRAM_ENUMERATION(lookalikes::kHistogramName, event);
+}
+
+// Splits |host_without_etld| (a hostname with its eTLD removed) on "." and
+// "-" and returns the resulting tokens. Tokens are whitespace-trimmed and
+// empty tokens are dropped.
+std::vector<base::string16> SplitNoneTLDDomainIntoTokens(
+    const base::string16& host_without_etld) {
+  const base::string16 separators = base::ASCIIToUTF16("-.");
+  std::vector<base::string16> tokens =
+      base::SplitString(host_without_etld, separators, base::TRIM_WHITESPACE,
+                        base::SPLIT_WANT_NONEMPTY);
+  return tokens;
+}
+
+// Combines |e2LD| with every TLD in |important_tlds| and checks whether the
+// resulting domain is a top domain, or a unicode spoof of one. On the first
+// hit, writes the https URL of the targeted top domain to |found_domain| and
+// returns true. Returns false, leaving |found_domain| untouched, otherwise.
+bool IsTopDomainCandidate(const std::set<std::string>& important_tlds,
+                          const base::string16& e2LD,
+                          GURL* found_domain) {
+  // Every candidate URL starts with the same "https://" prefix.
+  const base::string16 https_prefix =
+      base::ASCIIToUTF16(url::kHttpsScheme) +
+      base::ASCIIToUTF16(url::kStandardSchemeSeparator);
+
+  // We need to identify top domains, even when the spoof uses the 'wrong' TLD
+  // (e.g. google.gov). To do that, the embedded domain is paired with each
+  // important TLD in turn and looked up in the top domain list.
+  for (const std::string& tld : important_tlds) {
+    // e2LD is the smallest unit of a domain name that could be registered
+    // (e.g. example in example.com).
+    const base::string16 candidate_host =
+        e2LD + base::ASCIIToUTF16(".") + base::ASCIIToUTF16(tld);
+
+    // A GURL is needed to obtain the DomainInfo that IsTopDomain consumes.
+    const GURL candidate_url(https_prefix + candidate_host);
+    if (IsTopDomain(GetDomainInfo(candidate_url))) {
+      *found_domain = GURL(candidate_url.spec());
+      return true;
+    }
+
+    // Not an exact top domain; check whether it is a unicode spoof of one.
+    const std::string spoofed_top_domain =
+        url_formatter::IDNSpoofChecker()
+            .GetSimilarTopDomain(candidate_host)
+            .domain;
+    if (spoofed_top_domain.empty()) {
+      continue;
+    }
+    *found_domain = GURL(std::string(url::kHttpsScheme) +
+                         url::kStandardSchemeSeparator + spoofed_top_domain);
+    return true;
+  }
+  return false;
+}
+
+} // namespace
+
+DomainInfo::DomainInfo(const std::string& arg_hostname,
+ const std::string& arg_domain_and_registry,
+ const std::string& arg_domain_without_registry,
+ const url_formatter::IDNConversionResult& arg_idn_result,
+ const url_formatter::Skeletons& arg_skeletons)
+ : hostname(arg_hostname),  // Each field is initialized from the argument of the same name.
+ domain_and_registry(arg_domain_and_registry),
+ domain_without_registry(arg_domain_without_registry),
+ idn_result(arg_idn_result),
+ skeletons(arg_skeletons) {}
+
+DomainInfo::~DomainInfo() = default;
+
+DomainInfo::DomainInfo(const DomainInfo&) = default;  // Defaulted member-wise copy.
+
+// Builds the DomainInfo for |url|. All fields are empty for localhost and
+// non-unique hostnames; only |hostname| is set when the eTLD+1 is empty.
+DomainInfo GetDomainInfo(const GURL& url) {
+  const std::string hostname = url.host();
+  if (net::IsLocalhost(url) || net::IsHostnameNonUnique(hostname)) {
+    return DomainInfo(std::string(), std::string(), std::string(),
+                      url_formatter::IDNConversionResult(),
+                      url_formatter::Skeletons());
+  }
+
+  const std::string domain_and_registry = GetETLDPlusOne(hostname);
+  // eTLD+1 can be empty for private domains. In that case there is nothing to
+  // strip the registry from and nothing to compute skeletons for.
+  if (domain_and_registry.empty()) {
+    return DomainInfo(hostname, domain_and_registry, std::string(),
+                      url_formatter::IDNConversionResult(),
+                      url_formatter::Skeletons());
+  }
+
+  // Compute skeletons using eTLD+1, skipping all spoofing checks. Spoofing
+  // checks in url_formatter can cause the converted result to be punycode.
+  // We want to avoid this in order to get an accurate skeleton for the unicode
+  // version of the domain.
+  const url_formatter::IDNConversionResult idn_result =
+      url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry);
+  return DomainInfo(
+      hostname, domain_and_registry,
+      url_formatter::top_domains::HostnameWithoutRegistry(domain_and_registry),
+      idn_result, url_formatter::GetSkeletons(idn_result.result));
+}
+
+// Returns the domain-and-registry (eTLD+1) of |hostname|, with private
+// registries excluded from the registry lookup.
+std::string GetETLDPlusOne(const std::string& hostname) {
+  namespace rcd = net::registry_controlled_domains;
+  return rcd::GetDomainAndRegistry(hostname, rcd::EXCLUDE_PRIVATE_REGISTRIES);
+}
+
+// Returns true if |str1| can be turned into |str2| with at most one
+// insertion, deletion or substitution of a single character. Both strings are
+// scanned once, in parallel.
+bool IsEditDistanceAtMostOne(const base::string16& str1,
+                             const base::string16& str2) {
+  const size_t len1 = str1.size();
+  const size_t len2 = str2.size();
+  // A length difference of two or more needs at least two edits.
+  if (len1 > len2 + 1 || len2 > len1 + 1) {
+    return false;
+  }
+
+  size_t pos1 = 0;
+  size_t pos2 = 0;
+  bool edit_used = false;
+  while (pos1 < len1 && pos2 < len2) {
+    if (str1[pos1] == str2[pos2]) {
+      ++pos1;
+      ++pos2;
+      continue;
+    }
+
+    // A second mismatch means more than one edit is required.
+    if (edit_used) {
+      return false;
+    }
+    edit_used = true;
+
+    if (len1 > len2) {
+      // |str1| is longer, so the mismatch must be an extra character in it.
+      ++pos1;
+    } else if (len2 > len1) {
+      // |str2| is longer, so the mismatch must be an extra character in it.
+      ++pos2;
+    } else {
+      // Equal lengths: the mismatch must be a substituted character.
+      ++pos1;
+      ++pos2;
+    }
+  }
+
+  // Anything left over in either string is an unmatched trailing character,
+  // which costs one more edit.
+  const bool has_leftover = pos1 != len1 || pos2 != len2;
+  return !(has_leftover && edit_used);
+}
+
+// Returns true if the eTLD+1 in |domain_info| is itself a top domain. The top
+// domain list can only be queried by skeleton, so each skeleton of the domain
+// is looked up and the resulting entry is compared against the eTLD+1.
+bool IsTopDomain(const DomainInfo& domain_info) {
+  for (const std::string& skeleton : domain_info.skeletons) {
+    if (url_formatter::LookupSkeletonInTopDomains(skeleton).domain ==
+        domain_info.domain_and_registry) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Decides whether a match of |match_type| for |navigated_domain| should block
+// the navigation: site engagement matches always do, top site matches only do
+// when the matching top domain is flagged as top 500, and every other match
+// type does not.
+bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type,
+                                       const DomainInfo& navigated_domain) {
+  switch (match_type) {
+    case LookalikeUrlMatchType::kSiteEngagement:
+      return true;
+    case LookalikeUrlMatchType::kTopSite:
+      return navigated_domain.idn_result.matching_top_domain.is_top_500;
+    default:
+      return false;
+  }
+}
+
+// Looks for a domain that |navigated_domain| resembles. The checks run in this
+// order and the first hit wins:
+//   1. Skeleton match against |engaged_sites|, then against the top domain
+//      recorded in the IDN conversion result (IDN navigations only).
+//   2. Edit distance match against |engaged_sites|, then against the top 500
+//      domains (edit distance candidates only).
+//   3. Target embedding in the full hostname.
+// On a hit, |matched_domain| and |match_type| are filled in and true is
+// returned; otherwise both are left untouched and false is returned.
+bool GetMatchingDomain(const DomainInfo& navigated_domain,
+                       const std::vector<DomainInfo>& engaged_sites,
+                       std::string* matched_domain,
+                       LookalikeUrlMatchType* match_type) {
+  DCHECK(!navigated_domain.domain_and_registry.empty());
+  DCHECK(matched_domain);
+  DCHECK(match_type);
+
+  const std::string& navigated_etld_plus_one =
+      navigated_domain.domain_and_registry;
+
+  if (navigated_domain.idn_result.has_idn_component) {
+    // If the navigated domain is IDN, check its skeleton against engaged sites
+    // and top domains.
+    const std::string engaged_match =
+        GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
+    if (!engaged_match.empty()) {
+      *matched_domain = engaged_match;
+      *match_type = LookalikeUrlMatchType::kSiteEngagement;
+      return true;
+    }
+
+    const std::string& top_match =
+        navigated_domain.idn_result.matching_top_domain.domain;
+    if (!top_match.empty()) {
+      // In practice, this is not possible since the top domain list does not
+      // contain IDNs, so domain_and_registry can't both have IDN and be a top
+      // domain. Still, sanity check in case the top domain list changes in the
+      // future.
+      // At this point, navigated domain should not be a top domain.
+      DCHECK_NE(navigated_etld_plus_one, top_match);
+      *matched_domain = top_match;
+      *match_type = LookalikeUrlMatchType::kTopSite;
+      return true;
+    }
+  }
+
+  if (url_formatter::top_domains::IsEditDistanceCandidate(
+          navigated_etld_plus_one)) {
+    // If we can't find an exact top domain or an engaged site, try to find an
+    // engaged domain within an edit distance of one.
+    const std::string near_engaged_domain =
+        GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites);
+    if (!near_engaged_domain.empty() &&
+        navigated_etld_plus_one != near_engaged_domain) {
+      *matched_domain = near_engaged_domain;
+      *match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement;
+      return true;
+    }
+
+    // Finally, try to find a top domain within an edit distance of one.
+    const std::string near_top_domain =
+        GetSimilarDomainFromTop500(navigated_domain);
+    if (!near_top_domain.empty() &&
+        navigated_etld_plus_one != near_top_domain) {
+      *matched_domain = near_top_domain;
+      *match_type = LookalikeUrlMatchType::kEditDistance;
+      return true;
+    }
+  }
+
+  // Last resort: look for a top domain embedded in the full hostname. The set
+  // of TLDs considered important comes from the feature parameter.
+  const std::vector<std::string> tld_list =
+      base::SplitString(lookalikes::kImportantTlds.Get(), ",",
+                        base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+  const std::set<std::string> important_tlds(tld_list.begin(), tld_list.end());
+  const GURL navigated_url(std::string(url::kHttpsScheme) +
+                           std::string(url::kStandardSchemeSeparator) +
+                           navigated_domain.hostname);
+  GURL safe_url;
+  if (!IsTargetEmbeddingLookalike(navigated_url, important_tlds, &safe_url)) {
+    return false;
+  }
+  *matched_domain = safe_url.host();
+  *match_type = LookalikeUrlMatchType::kTargetEmbedding;
+  return true;
+}
+
+// Records the NavigationSuggestionEvent that corresponds to |match_type|.
+// Nothing is recorded for LookalikeUrlMatchType::kNone.
+void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) {
+  switch (match_type) {
+    case LookalikeUrlMatchType::kNone:
+      return;
+    case LookalikeUrlMatchType::kTopSite:
+      RecordEvent(NavigationSuggestionEvent::kMatchTopSite);
+      return;
+    case LookalikeUrlMatchType::kSiteEngagement:
+      RecordEvent(NavigationSuggestionEvent::kMatchSiteEngagement);
+      return;
+    case LookalikeUrlMatchType::kEditDistance:
+      RecordEvent(NavigationSuggestionEvent::kMatchEditDistance);
+      return;
+    case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
+      RecordEvent(NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement);
+      return;
+    case LookalikeUrlMatchType::kTargetEmbedding:
+      RecordEvent(NavigationSuggestionEvent::kMatchTargetEmbedding);
+      return;
+  }
+}
+
+// Returns true if the hostname of |url| embeds a top domain in front of its
+// own eTLD (e.g. google.com-login.com). The part of the host before the eTLD
+// is split on '-' and '.'; whenever a token in |important_tlds| directly
+// follows another token, the preceding token is checked as a possible top
+// domain name. On a hit |safe_url| is set to the URL of the embedded top
+// domain; otherwise |safe_url| is reset to an empty GURL.
+bool IsTargetEmbeddingLookalike(const GURL& url,
+                                const std::set<std::string>& important_tlds,
+                                GURL* safe_url) {
+  DCHECK(url.SchemeIsHTTPOrHTTPS());
+
+  const size_t registry_length =
+      net::registry_controlled_domains::GetRegistryLength(
+          url, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
+          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
+
+  // |registry_length| is measured on the punycode-encoded host returned by
+  // url.host(), so the registry (and the dot in front of it) must be removed
+  // from that ASCII host, before the conversion to unicode changes the length
+  // of IDN labels. If no registry was found (a length of zero or
+  // std::string::npos) there is nothing to remove and the whole host is
+  // examined; subtracting in that case would drop a real character or
+  // underflow.
+  std::string ascii_host_without_etld = url.host();
+  const bool has_registry = registry_length != 0 &&
+                            registry_length != std::string::npos &&
+                            registry_length < ascii_host_without_etld.size();
+  if (has_registry) {
+    ascii_host_without_etld.resize(ascii_host_without_etld.size() -
+                                   registry_length - 1);
+  }
+
+  // The remaining checks need the unicode characters of the host, so convert
+  // what is left of it from punycode to unicode.
+  const base::string16 host_without_etld =
+      url_formatter::UnsafeIDNToUnicodeWithDetails(ascii_host_without_etld)
+          .result;
+  const std::vector<base::string16> hostname_tokens_without_etld =
+      SplitNoneTLDDomainIntoTokens(host_without_etld);
+
+  // When we find a valid TLD, we look backwards to the previous token
+  // to see if we can use it to build a top domain.
+  base::string16 prev_part;
+
+  // We could have domains separated by '-'s or '.'s, in order to find target
+  // embedding urls with google.com.com or google-com.com, we get url parts as
+  // anything that is between two '-'s or '.'s. We check to see if an important
+  // TLD is following an important domain.
+  // Because of the way this matching is working, we can not identify target
+  // embedding attacks on legitimate websites that contain '-' in their names
+  // (e.g programme-tv.net).
+  for (const auto& token : hostname_tokens_without_etld) {
+    if (prev_part.empty()) {
+      prev_part = token;
+      continue;
+    }
+
+    const std::string tld = base::UTF16ToUTF8(token);
+    if (base::Contains(important_tlds, tld) &&
+        IsTopDomainCandidate(important_tlds, prev_part, safe_url)) {
+      return true;
+    }
+    prev_part = token;
+  }
+  *safe_url = GURL();
+  return false;
+}
diff --git a/chromium/components/lookalikes/lookalike_url_util.h b/chromium/components/lookalikes/lookalike_url_util.h
new file mode 100644
index 00000000000..2d33bb0dec5
--- /dev/null
+++ b/chromium/components/lookalikes/lookalike_url_util.h
@@ -0,0 +1,144 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_
+#define COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "base/time/time.h"
+#include "components/url_formatter/url_formatter.h"
+#include "url/gurl.h"
+
+class GURL;
+
+namespace lookalikes {
+extern const char kHistogramName[];
+}
+
+// Used for UKM. There is only a single LookalikeUrlMatchType per navigation.
+enum class LookalikeUrlMatchType {
+ kNone = 0,
+ kTopSite = 1,
+ kSiteEngagement = 2,
+ kEditDistance = 3,
+ kEditDistanceSiteEngagement = 4,
+ kTargetEmbedding = 5,
+
+ // Append new items to the end of the list above; do not modify or replace
+ // existing values. Comment out obsolete items.
+ kMaxValue = kTargetEmbedding,
+};
+
+// Used for UKM. There is only a single LookalikeUrlBlockingPageUserAction per
+// navigation.
+enum class LookalikeUrlBlockingPageUserAction {
+ kInterstitialNotShown = 0,
+ kClickThrough = 1,
+ kAcceptSuggestion = 2,
+ kCloseOrBack = 3,
+
+ // Append new items to the end of the list above; do not modify or replace
+ // existing values. Comment out obsolete items.
+ kMaxValue = kCloseOrBack,
+};
+
+// Used for metrics. Multiple events can occur per navigation.
+enum class NavigationSuggestionEvent {
+ kNone = 0,
+ // Interstitial results recorded using security_interstitials::MetricsHelper
+ // kInfobarShown = 1,
+ // kLinkClicked = 2,
+ kMatchTopSite = 3,
+ kMatchSiteEngagement = 4,
+ kMatchEditDistance = 5,
+ kMatchEditDistanceSiteEngagement = 6,
+ kMatchTargetEmbedding = 7,
+
+ // Append new items to the end of the list above; do not modify or
+ // replace existing values. Comment out obsolete items.
+ kMaxValue = kMatchTargetEmbedding,
+};
+
+struct DomainInfo {
+ // The full ASCII hostname, used in detecting target embedding. For
+ // "https://www.google.com/mail" this will be "www.google.com".
+ const std::string hostname;
+ // eTLD+1, used for skeleton and edit distance comparison. Must be ASCII.
+ // Empty for non-unique domains, localhost or sites whose eTLD+1 is empty.
+ const std::string domain_and_registry;
+ // eTLD+1 without the registry part, and with a trailing period. For
+ // "www.google.com", this will be "google.". Used for edit distance
+ // comparisons. Empty for non-unique domains, localhost or sites whose eTLD+1
+ // is empty.
+ const std::string domain_without_registry;
+
+ // Result of IDN conversion of domain_and_registry field.
+ const url_formatter::IDNConversionResult idn_result;
+ // Skeletons of domain_and_registry field.
+ const url_formatter::Skeletons skeletons;
+
+ DomainInfo(const std::string& arg_hostname,
+ const std::string& arg_domain_and_registry,
+ const std::string& arg_domain_without_registry,
+ const url_formatter::IDNConversionResult& arg_idn_result,
+ const url_formatter::Skeletons& arg_skeletons);
+ ~DomainInfo();
+ DomainInfo(const DomainInfo& other);
+};
+
+// Returns a DomainInfo instance computed from |url|. Will return empty fields
+// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is
+// empty.
+DomainInfo GetDomainInfo(const GURL& url);
+
+// Returns true if the Levenshtein distance between |str1| and |str2| is at most
+// one. This has O(max(n,m)) complexity as opposed to O(n*m) of the usual edit
+// distance computation.
+bool IsEditDistanceAtMostOne(const base::string16& str1,
+ const base::string16& str2);
+
+// Returns true if the domain given by |domain_info| is a top domain.
+bool IsTopDomain(const DomainInfo& domain_info);
+
+// Returns eTLD+1 of |hostname|. This excludes private registries, and returns
+// "blogspot.com" for "test.blogspot.com" (blogspot.com is listed as a private
+// registry). We do this to be consistent with url_formatter's top domain list
+// which doesn't have a notion of private registries.
+std::string GetETLDPlusOne(const std::string& hostname);
+
+// Returns true if a lookalike interstitial should be shown.
+bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type,
+ const DomainInfo& navigated_domain);
+
+// Returns true if a domain is visually similar to |navigated_domain|. The
+// matching domain can be a top domain or one of |engaged_sites|. Similarity
+// check is made using visual skeleton, edit distance and target embedding
+// comparison. If this returns true, match details will be written into
+// |matched_domain| and |match_type|. Pointer arguments can't be nullptr.
+bool GetMatchingDomain(const DomainInfo& navigated_domain,
+ const std::vector<DomainInfo>& engaged_sites,
+ std::string* matched_domain,
+ LookalikeUrlMatchType* match_type);
+
+void RecordUMAFromMatchType(LookalikeUrlMatchType match_type);
+
+// Checks to see if a URL is a target embedding lookalike. This function sets
+// |safe_url| to the url of the embedded target domain.
+// At the moment we consider the following cases as Target Embedding:
+// example-google.com-site.com, example.google.com-site.com,
+// example-google-com-site.com, example.google.com.site.com,
+// example-googlé.com-site.com where the embedded target is google.com. In
+// addition to these examples, this function also detects domains embedded with
+// alternate TLDs, if the TLD is included in |important_tlds| (e.g. google.edu
+// instead of google.com in the example URLs above.). To reduce false positives,
+// we exclude cases where the eTLD of the possibly-unsafe domain contains more
+// than just the TLD of the embedded domain. For instance, we exclude
+// foo-google.co.uk.
+bool IsTargetEmbeddingLookalike(const GURL& url,
+ const std::set<std::string>& important_tlds,
+ GURL* safe_url);
+
+#endif // COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_
diff --git a/chromium/components/lookalikes/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/lookalike_url_util_unittest.cc
new file mode 100644
index 00000000000..031aa39ca13
--- /dev/null
+++ b/chromium/components/lookalikes/lookalike_url_util_unittest.cc
@@ -0,0 +1,175 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/lookalikes/lookalike_url_util.h"
+
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// Table-driven check of IsEditDistanceAtMostOne. Each row holds the two
+// strings to compare and whether they are expected to be within one edit of
+// each other.
+TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
+  const struct TestCase {
+    const wchar_t* domain;
+    const wchar_t* top_domain;
+    bool expected;
+  } kTestCases[] = {
+      {L"", L"", true},
+      {L"a", L"a", true},
+      {L"a", L"", true},
+      {L"", L"a", true},
+
+      {L"", L"ab", false},
+      {L"ab", L"", false},
+
+      {L"ab", L"a", true},
+      {L"a", L"ab", true},
+      {L"ab", L"b", true},
+      {L"b", L"ab", true},
+      {L"ab", L"ab", true},
+
+      {L"", L"ab", false},
+      {L"ab", L"", false},
+      {L"a", L"abc", false},
+      {L"abc", L"a", false},
+
+      {L"aba", L"ab", true},
+      {L"ba", L"aba", true},
+      {L"abc", L"ac", true},
+      {L"ac", L"abc", true},
+
+      // Same length.
+      {L"xbc", L"ybc", true},
+      {L"axc", L"ayc", true},
+      {L"abx", L"aby", true},
+
+      // Should also work for non-ASCII.
+      {L"é", L"", true},
+      {L"", L"é", true},
+      {L"tést", L"test", true},
+      {L"test", L"tést", true},
+      {L"tés", L"test", false},
+      {L"test", L"tés", false},
+
+      // Real world test cases.
+      {L"google.com", L"gooogle.com", true},
+      {L"gogle.com", L"google.com", true},
+      {L"googlé.com", L"google.com", true},
+      {L"google.com", L"googlé.com", true},
+      // Different by two characters.
+      {L"google.com", L"goooglé.com", false},
+  };
+  for (const TestCase& test_case : kTestCases) {
+    const bool result =
+        IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
+                                base::WideToUTF16(test_case.top_domain));
+    // Name the compared strings so a failing row can be identified.
+    EXPECT_EQ(test_case.expected, result)
+        << "domain: \"" << base::WideToUTF8(test_case.domain)
+        << "\", top_domain: \"" << base::WideToUTF8(test_case.top_domain)
+        << "\"";
+  }
+}
+
+// Table-driven check of IsTargetEmbeddingLookalike. Each row holds a URL and
+// whether the target embedding heuristic is expected to trigger for it.
+TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
+  const std::set<std::string> important_tlds = {"com", "org", "edu", "gov",
+                                                "co"};
+  const struct TargetEmbeddingHeuristicTestCase {
+    const GURL url;
+    bool should_trigger;
+  } kTestCases[] = {
+
+      // We test everything with the correct TLD and another popular TLD.
+
+      // Scheme should not affect the outcome.
+      {GURL("http://google.com.com"), true},
+      {GURL("https://google.com.com"), true},
+
+      // The length of the url should not affect the outcome.
+      {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-"
+            "outcome-of-this-target-embedding-test-google.com-login.com"),
+       true},
+      {GURL(
+           "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-"
+           "outcome-of-this-target-embedding-test.com-login.com"),
+       false},
+      {GURL(
+           "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-"
+           "outcome-of-this-target-embedding-test.com-login.com"),
+       false},
+
+      // We need exact skeleton match for our domain so exclude edit-distance
+      // matches.
+      {GURL("http://goog0le.com-login.com"), false},
+
+      // Unicode characters should be handled
+      {GURL("http://googlé.com-login.com"), true},
+      {GURL("http://sth-googlé.com-sth.com"), true},
+
+      // The basic state
+      {GURL("http://google.com.sth.com"), true},
+      // - before the domain name should be ignored.
+      {GURL("http://sth-google.com-sth.com"), true},
+
+      // The embedded target's TLD doesn't necessarily need to be followed by a
+      // '-' and could be a subdomain by itself.
+      {GURL("http://sth-google.com.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.sth-google.com.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.google.com-sth.com"), true},
+      {GURL("http://1.2.3.4.5.6.google.com-sth.com"), true},
+
+      // Target domain could be in the middle of subdomains.
+      {GURL("http://sth.google.com.sth.com"), true},
+      {GURL("http://sth.google.com-sth.com"), true},
+
+      // The target domain and its tld should be next to each other.
+      {GURL("http://sth-google.l.com-sth.com"), false},
+
+      {GURL("http://google.edu.com"), true},
+      {GURL("https://google.edu.com"), true},
+      {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-"
+            "outcome-of-this-target-embedding-test-google.edu-login.com"),
+       true},
+      {GURL(
+           "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-"
+           "outcome-of-this-target-embedding-test.edu-login.com"),
+       false},
+      {GURL(
+           "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-"
+           "outcome-of-this-target-embedding-test.edu-login.com"),
+       false},
+      {GURL("http://goog0le.edu-login.com"), false},
+      {GURL("http://googlé.edu-login.com"), true},
+      {GURL("http://sth-googlé.edu-sth.com"), true},
+      {GURL("http://google.edu.sth.com"), true},
+      {GURL("http://sth-google.edu-sth.com"), true},
+      {GURL("http://sth-google.edu.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.sth-google.edu.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.google.edu-sth.com"), true},
+      {GURL("http://1.2.3.4.5.6.google.edu-sth.com"), true},
+      {GURL("http://sth.google.edu.sth.com"), true},
+      {GURL("http://sth.google.edu-sth.com"), true},
+      {GURL("http://sth-google.l.edu-sth.com"), false},
+      {GURL("http://sth-google-l.edu-sth.com"), false},
+      {GURL("http://sth-google.l-edu-sth.com"), false},
+
+      // Target domain might be separated with a dash instead of dot.
+      {GURL("http://sth.google-com-sth.com"), true},
+
+      // Ensure legitimate domains don't trigger.
+      {GURL("http://google.com"), false},
+      {GURL("http://google.co.uk"), false},
+      {GURL("http://google.randomreg-login.com"), false},
+
+  };
+
+  for (const auto& test_case : kTestCases) {
+    GURL safe_url;
+    const bool triggered =
+        IsTargetEmbeddingLookalike(test_case.url, important_tlds, &safe_url);
+    // One expectation covers both outcomes; the message names the URL under
+    // test and the embedded domain that was reported, if any.
+    EXPECT_EQ(test_case.should_trigger, triggered)
+        << "Unexpected target embedding result for \"" << test_case.url
+        << "\" (reported safe URL: \"" << safe_url.spec() << "\")";
+  }
+}