summaryrefslogtreecommitdiff
path: root/chromium/components/lookalikes
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2021-05-20 09:47:09 +0200
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2021-06-07 11:15:42 +0000
commit189d4fd8fad9e3c776873be51938cd31a42b6177 (patch)
tree6497caeff5e383937996768766ab3bb2081a40b2 /chromium/components/lookalikes
parent8bc75099d364490b22f43a7ce366b366c08f4164 (diff)
downloadqtwebengine-chromium-189d4fd8fad9e3c776873be51938cd31a42b6177.tar.gz
BASELINE: Update Chromium to 90.0.4430.221
Change-Id: Iff4d9d18d2fcf1a576f3b1f453010f744a232920 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/components/lookalikes')
-rw-r--r--chromium/components/lookalikes/DIR_METADATA3
-rw-r--r--chromium/components/lookalikes/OWNERS2
-rw-r--r--chromium/components/lookalikes/core/BUILD.gn1
-rw-r--r--chromium/components/lookalikes/core/features.cc5
-rw-r--r--chromium/components/lookalikes/core/features.h10
-rw-r--r--chromium/components/lookalikes/core/lookalike_url_util.cc87
-rw-r--r--chromium/components/lookalikes/core/lookalike_url_util.h2
-rw-r--r--chromium/components/lookalikes/core/lookalike_url_util_unittest.cc32
8 files changed, 120 insertions, 22 deletions
diff --git a/chromium/components/lookalikes/DIR_METADATA b/chromium/components/lookalikes/DIR_METADATA
new file mode 100644
index 00000000000..b3022156e2a
--- /dev/null
+++ b/chromium/components/lookalikes/DIR_METADATA
@@ -0,0 +1,3 @@
+monorail {
+ component: "UI>Browser>Interstitials"
+}
diff --git a/chromium/components/lookalikes/OWNERS b/chromium/components/lookalikes/OWNERS
index 1d70c9fef31..f050018fa29 100644
--- a/chromium/components/lookalikes/OWNERS
+++ b/chromium/components/lookalikes/OWNERS
@@ -1,3 +1 @@
file://chrome/browser/lookalikes/OWNERS
-
-# COMPONENT: UI>Browser>Interstitials
diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn
index 30aab561767..fccb48046c8 100644
--- a/chromium/components/lookalikes/core/BUILD.gn
+++ b/chromium/components/lookalikes/core/BUILD.gn
@@ -18,6 +18,7 @@ static_library("core") {
"//components/security_state/core:features",
"//components/strings",
"//components/url_formatter",
+ "//components/url_formatter/spoof_checks/common_words:common",
"//components/url_formatter/spoof_checks/top_domains:common",
"//components/url_formatter/spoof_checks/top_domains:top500_domains",
"//components/url_formatter/spoof_checks/top_domains:top500_domains_header",
diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc
index dc362a97a1b..d9e4926452d 100644
--- a/chromium/components/lookalikes/core/features.cc
+++ b/chromium/components/lookalikes/core/features.cc
@@ -13,5 +13,10 @@ const base::Feature kDetectTargetEmbeddingLookalikes{
const base::Feature kLookalikeInterstitialForPunycode{
"LookalikeInterstitialForPunycode", base::FEATURE_ENABLED_BY_DEFAULT};
+const base::Feature kLookalikeDigitalAssetLinks{
+ "LookalikeDigitalAssetLinks", base::FEATURE_DISABLED_BY_DEFAULT};
+
+const char kLookalikeDigitalAssetLinksTimeoutParameter[] = "timeout";
+
} // namespace features
} // namespace lookalikes
diff --git a/chromium/components/lookalikes/core/features.h b/chromium/components/lookalikes/core/features.h
index 453e1146082..d85eb6089c2 100644
--- a/chromium/components/lookalikes/core/features.h
+++ b/chromium/components/lookalikes/core/features.h
@@ -19,6 +19,16 @@ extern const base::Feature kDetectTargetEmbeddingLookalikes;
COMPONENT_EXPORT(LOOKALIKES_FEATURES)
extern const base::Feature kLookalikeInterstitialForPunycode;
+// This feature enables Digital Asset Link validations for lookalikes.
+COMPONENT_EXPORT(LOOKALIKES_FEATURES)
+extern const base::Feature kLookalikeDigitalAssetLinks;
+
+// Timeout before giving up on Digital Asset Link manifest fetches. The feature
+// fetches manifests from both the lookalike and the target URLs. If it fails to
+// fetch either manifest within this period, the validation is assumed to fail.
+COMPONENT_EXPORT(LOOKALIKES_FEATURES)
+extern const char kLookalikeDigitalAssetLinksTimeoutParameter[];
+
} // namespace features
} // namespace lookalikes
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc
index 3e8fcddc395..2daa1378f8c 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util.cc
@@ -23,10 +23,12 @@
#include "base/task/post_task.h"
#include "base/task/thread_pool.h"
#include "base/time/default_clock.h"
+#include "base/trace_event/trace_event.h"
#include "base/values.h"
#include "components/lookalikes/core/features.h"
#include "components/security_interstitials/core/pref_names.h"
#include "components/security_state/core/features.h"
+#include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
@@ -56,18 +58,15 @@ const size_t kMinE2LDLengthForTargetEmbedding = 4;
// This list will be added to the static list of common words so common words
// could be added to the list using a flag if needed.
-const base::FeatureParam<std::string> kAdditionalCommonWords{
+const base::FeatureParam<std::string> kRemoveAdditionalCommonWords{
&lookalikes::features::kDetectTargetEmbeddingLookalikes,
"additional_common_words", ""};
// We might not protect a domain whose e2LD is a common word in target embedding
-// based on the TLD that is paired with it.
-const char* kCommonWords[] = {
- "shop", "jobs", "live", "info", "study", "asahi",
- "weather", "health", "forum", "radio", "ideal", "research",
- "france", "free", "mobile", "sky", "ask", "booking",
- "canada", "dating", "dictionary", "express", "hoteles", "hotels",
- "investing", "jharkhand", "nifty"};
+// based on the TLD that is paired with it. This list supplements words from
+// url_formatter::common_words::IsCommonWord().
+const char* kLocalAdditionalCommonWords[] = {"asahi", "hoteles", "jharkhand",
+ "nifty"};
// These domains are plausible lookalike targets, but they also use common words
// in their names. Selectively prevent flagging embeddings where the embedder
@@ -241,6 +240,13 @@ std::string GetMatchingTopDomainWithoutSeparators(
return std::string();
}
+// Returns whether |host| is either a bare eTLD+1 (e.g. 'google.com') or a
+// trivial 'www.' subdomain of one (e.g. 'www.google.com').
+bool IsETLDPlusOneOrTrivialSubdomain(const DomainInfo& host) {
+ return (host.domain_and_registry == host.hostname ||
+ "www." + host.domain_and_registry == host.hostname);
+}
+
// Returns if |etld_plus_one| shares the skeleton of an eTLD+1 with an engaged
// site or a top 500 domain. |embedded_target| is set to matching eTLD+1.
bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
@@ -249,7 +255,11 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
std::string* embedded_target) {
for (const auto& skeleton : domain.skeletons) {
for (const auto& engaged_site : engaged_sites) {
- if (base::Contains(engaged_site.skeletons, skeleton)) {
+ // Skeleton matching only calculates skeletons of the eTLD+1, so only
+ // consider engaged sites that are bare eTLD+1s (or a trivial subdomain)
+ // and are a skeleton match.
+ if (IsETLDPlusOneOrTrivialSubdomain(engaged_site) &&
+ base::Contains(engaged_site.skeletons, skeleton)) {
*embedded_target = engaged_site.domain_and_registry;
return true;
}
@@ -271,17 +281,33 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
// weather.com, ask.com). Target embeddings of these domains are often false
// positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
bool UsesCommonWord(const DomainInfo& domain) {
- std::vector<std::string> additional_common_words =
- base::SplitString(kAdditionalCommonWords.Get(), ",",
- base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
- if (base::Contains(additional_common_words, domain.domain_without_registry)) {
+ // kDomainsPermittedInEndEmbeddings are based on domains with common words,
+ // but they should not be excluded here (and instead are checked later).
+ for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) {
+ if (domain.domain_and_registry == permitted_ending) {
+ return false;
+ }
+ }
+
+ // Search for words in the big common word list.
+ if (url_formatter::common_words::IsCommonWord(
+ domain.domain_without_registry)) {
return true;
}
- for (auto* common_word : kCommonWords) {
+
+ // Also check the local lists.
+ for (auto* common_word : kLocalAdditionalCommonWords) {
if (domain.domain_without_registry == common_word) {
return true;
}
}
+ std::vector<std::string> additional_common_words =
+ base::SplitString(kRemoveAdditionalCommonWords.Get(), ",",
+ base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+ if (base::Contains(additional_common_words, domain.domain_without_registry)) {
+ return true;
+ }
+
return false;
}
@@ -317,9 +343,8 @@ bool IsCrossTLDMatch(const DomainInfo& embedded_target,
}
// Returns whether |embedded_target| is one of kDomainsPermittedInEndEmbeddings
-// and that |embedding_domain| ends with that domain (e.g. is of the form
-// "*-outlook.com" for each example.com in kDomainsPermittedInEndEmbeddings).
-// (e.g. will return true if |embedded_target| matches "evil-office.com"). Only
+// and that |embedding_domain| ends with that domain, e.g. "evil-office.com" is
+// permitted, as "office.com" is in kDomainsPermittedInEndEmbeddings. Only
// impacts Target Embedding matches.
bool EndsWithPermittedDomains(const DomainInfo& embedded_target,
const std::string& embedding_domain) {
@@ -348,6 +373,22 @@ bool IsAllowedToBeEmbedded(
EndsWithPermittedDomains(embedded_target, embedding_domain);
}
+// Returns the first character of |str1| that differs from |str2| at the same
+// position. The strings must differ before the shorter one ends.
+char GetFirstDifferentChar(const std::string& str1, const std::string& str2) {
+ std::string::const_iterator i1 = str1.begin();
+ std::string::const_iterator i2 = str2.begin();
+ while (i1 != str1.end() && i2 != str2.end()) {
+ if (*i1 != *i2) {
+ return *i1;
+ }
+ i1++;
+ i2++;
+ }
+ NOTREACHED();
+ return 0;
+}
+
} // namespace
DomainInfo::DomainInfo(const std::string& arg_hostname,
@@ -366,6 +407,7 @@ DomainInfo::~DomainInfo() = default;
DomainInfo::DomainInfo(const DomainInfo&) = default;
DomainInfo GetDomainInfo(const std::string& hostname) {
+ TRACE_EVENT0("navigation", "GetDomainInfo");
if (net::HostStringIsLocalhost(hostname) ||
net::IsHostnameNonUnique(hostname)) {
return DomainInfo(std::string(), std::string(), std::string(),
@@ -498,6 +540,17 @@ bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
}
}
+ // Ignore domains that only differ by an insertion of a "-".
+ if (nav_dom_len != matched_dom_len) {
+ if (nav_dom_len < matched_dom_len &&
+ GetFirstDifferentChar(matched_dom, nav_dom) == '-') {
+ return true;
+ } else if (nav_dom_len > matched_dom_len &&
+ GetFirstDifferentChar(nav_dom, matched_dom) == '-') {
+ return true;
+ }
+ }
+
return false;
}
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h
index 73d2d8afb0c..c5a27718cef 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.h
+++ b/chromium/components/lookalikes/core/lookalike_url_util.h
@@ -138,7 +138,7 @@ bool IsEditDistanceAtMostOne(const base::string16& str1,
// Returns whether |navigated_domain| and |matched_domain| are likely to be edit
// distance false positives, and thus the user should *not* be warned.
//
-// Assumes |navigated_domain| and |matched_domain| are edit distance matches.
+// Assumes |navigated_domain| and |matched_domain| are at edit distance 1.
bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain);
diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
index c0f049a1528..25f30d6ad8d 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
@@ -105,6 +105,14 @@ TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) {
{"abcde.com", "axbcde.com", false}, // Deletion
{"axbcde.com", "abcde.com", false}, // Insertion
{"axbcde.com", "aybcde.com", false}, // Substitution
+
+ // We permit matches that only differ due to a single "-".
+ {"-abcde.com", "abcde.com", true},
+ {"ab-cde.com", "abcde.com", true},
+ {"abcde-.com", "abcde.com", true},
+ {"abcde.com", "-abcde.com", true},
+ {"abcde.com", "ab-cde.com", true},
+ {"abcde.com", "abcde-.com", true},
};
for (const TestCase& test_case : kTestCases) {
auto navigated =
@@ -134,8 +142,10 @@ struct TargetEmbeddingHeuristicTestCase {
TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
const std::vector<DomainInfo> kEngagedSites = {
GetDomainInfo(GURL("https://highengagement.com")),
+ GetDomainInfo(GURL("https://highengagement.inthesubdomain.com")),
GetDomainInfo(GURL("https://highengagement.co.uk")),
GetDomainInfo(GURL("https://subdomain.highengagement.com")),
+ GetDomainInfo(GURL("https://www.highengagementwithwww.com")),
GetDomainInfo(GURL("https://subdomain.google.com")),
};
const std::vector<TargetEmbeddingHeuristicTestCase> kTestCases = {
@@ -207,6 +217,9 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
TargetEmbeddingType::kInterstitial},
{"foo.jobs.org-foo.com", "", TargetEmbeddingType::kNone},
{"foo.office.org-foo.com", "", TargetEmbeddingType::kNone},
+ // Common words (like 'jobs') are included in the big common word list.
+ // Ensure that the local kLocalAdditionalCommonWords list is also checked.
+ {"foo.hoteles.com-foo.com", "", TargetEmbeddingType::kNone},
// Targets could be embedded without their dots and dashes.
{"googlecom-foo.com", "google.com", TargetEmbeddingType::kInterstitial},
@@ -242,9 +255,19 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
{"foo.subdomain.google.com.foo.com", "subdomain.google.com",
TargetEmbeddingType::kInterstitial},
- // Skeleton matching should work against engaged sites at the eTLD level.
+ // Skeleton matching should work against engaged sites at an eTLD+1 level,
+ {"highengagement.inthesubdomain.com-foo.com",
+ "highengagement.inthesubdomain.com", TargetEmbeddingType::kInterstitial},
+ // but only if the bare eTLD+1, or www.[eTLD+1] has been engaged.
{"subdomain.highéngagement.com-foo.com", "highengagement.com",
TargetEmbeddingType::kInterstitial},
+ {"subdomain.highéngagementwithwww.com-foo.com",
+ "highengagementwithwww.com", TargetEmbeddingType::kInterstitial},
+ {"other.inthésubdomain.com-foo.com", "", TargetEmbeddingType::kNone},
+ // Ideally, we'd be able to combine subdomains and skeleton matching, but
+ // our current algorithm can't detect that precisely.
+ {"highengagement.inthésubdomain.com-foo.com", "",
+ TargetEmbeddingType::kNone},
// Domains should be allowed to embed themselves.
{"highengagement.com.highengagement.com", "", TargetEmbeddingType::kNone},
@@ -291,7 +314,12 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
<< test_case.expected_safe_host << ", but "
<< (safe_hostname.empty() ? "it didn't trigger at all."
: "triggered on " + safe_hostname);
- EXPECT_EQ(embedding_type, test_case.expected_type);
+ EXPECT_EQ(embedding_type, test_case.expected_type)
+ << test_case.hostname << " should trigger on "
+ << test_case.expected_safe_host << " but it returned "
+ << (embedding_type == TargetEmbeddingType::kNone
+ ? "kNone."
+ : "something unexpected");
} else {
EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone)
<< test_case.hostname << " unexpectedly triggered on "