diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-05-20 09:47:09 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-06-07 11:15:42 +0000 |
commit | 189d4fd8fad9e3c776873be51938cd31a42b6177 (patch) | |
tree | 6497caeff5e383937996768766ab3bb2081a40b2 /chromium/components/lookalikes | |
parent | 8bc75099d364490b22f43a7ce366b366c08f4164 (diff) | |
download | qtwebengine-chromium-189d4fd8fad9e3c776873be51938cd31a42b6177.tar.gz |
BASELINE: Update Chromium to 90.0.4430.221
Change-Id: Iff4d9d18d2fcf1a576f3b1f453010f744a232920
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/components/lookalikes')
8 files changed, 120 insertions, 22 deletions
diff --git a/chromium/components/lookalikes/DIR_METADATA b/chromium/components/lookalikes/DIR_METADATA new file mode 100644 index 00000000000..b3022156e2a --- /dev/null +++ b/chromium/components/lookalikes/DIR_METADATA @@ -0,0 +1,3 @@ +monorail { + component: "UI>Browser>Interstitials" +} diff --git a/chromium/components/lookalikes/OWNERS b/chromium/components/lookalikes/OWNERS index 1d70c9fef31..f050018fa29 100644 --- a/chromium/components/lookalikes/OWNERS +++ b/chromium/components/lookalikes/OWNERS @@ -1,3 +1 @@ file://chrome/browser/lookalikes/OWNERS - -# COMPONENT: UI>Browser>Interstitials diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn index 30aab561767..fccb48046c8 100644 --- a/chromium/components/lookalikes/core/BUILD.gn +++ b/chromium/components/lookalikes/core/BUILD.gn @@ -18,6 +18,7 @@ static_library("core") { "//components/security_state/core:features", "//components/strings", "//components/url_formatter", + "//components/url_formatter/spoof_checks/common_words:common", "//components/url_formatter/spoof_checks/top_domains:common", "//components/url_formatter/spoof_checks/top_domains:top500_domains", "//components/url_formatter/spoof_checks/top_domains:top500_domains_header", diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc index dc362a97a1b..d9e4926452d 100644 --- a/chromium/components/lookalikes/core/features.cc +++ b/chromium/components/lookalikes/core/features.cc @@ -13,5 +13,10 @@ const base::Feature kDetectTargetEmbeddingLookalikes{ const base::Feature kLookalikeInterstitialForPunycode{ "LookalikeInterstitialForPunycode", base::FEATURE_ENABLED_BY_DEFAULT}; +const base::Feature kLookalikeDigitalAssetLinks{ + "LookalikeDigitalAssetLinks", base::FEATURE_DISABLED_BY_DEFAULT}; + +const char kLookalikeDigitalAssetLinksTimeoutParameter[] = "timeout"; + } // namespace features } // namespace lookalikes diff --git a/chromium/components/lookalikes/core/features.h b/chromium/components/lookalikes/core/features.h index 453e1146082..d85eb6089c2 100644 --- a/chromium/components/lookalikes/core/features.h +++ b/chromium/components/lookalikes/core/features.h @@ -19,6 +19,16 @@ extern const base::Feature kDetectTargetEmbeddingLookalikes; COMPONENT_EXPORT(LOOKALIKES_FEATURES) extern const base::Feature kLookalikeInterstitialForPunycode; +// This feature enables Digital Asset Link validations for lookalikes. +COMPONENT_EXPORT(LOOKALIKES_FEATURES) +extern const base::Feature kLookalikeDigitalAssetLinks; + +// Timeout before giving up on Digital Asset Link manifest fetches. The feature +// fetches manifests from both the lookalike and the target URLs. If it fails to +// fetch either manifest within this period, the validation is assumed to fail. +COMPONENT_EXPORT(LOOKALIKES_FEATURES) +extern const char kLookalikeDigitalAssetLinksTimeoutParameter[]; + } // namespace features } // namespace lookalikes diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc index 3e8fcddc395..2daa1378f8c 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util.cc +++ b/chromium/components/lookalikes/core/lookalike_url_util.cc @@ -23,10 +23,12 @@ #include "base/task/post_task.h" #include "base/task/thread_pool.h" #include "base/time/default_clock.h" +#include "base/trace_event/trace_event.h" #include "base/values.h" #include "components/lookalikes/core/features.h" #include "components/security_interstitials/core/pref_names.h" #include "components/security_state/core/features.h" +#include "components/url_formatter/spoof_checks/common_words/common_words_util.h" #include "components/url_formatter/spoof_checks/top_domains/top500_domains.h" #include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h" #include "components/url_formatter/url_formatter.h" @@ -56,18 +58,15 @@ const size_t kMinE2LDLengthForTargetEmbedding = 4; // This list will be added to the static list of common words so common words // could be added to the list using a flag if needed. -const base::FeatureParam<std::string> kAdditionalCommonWords{ +const base::FeatureParam<std::string> kRemoveAdditionalCommonWords{ &lookalikes::features::kDetectTargetEmbeddingLookalikes, "additional_common_words", ""}; // We might not protect a domain whose e2LD is a common word in target embedding -// based on the TLD that is paired with it. -const char* kCommonWords[] = { - "shop", "jobs", "live", "info", "study", "asahi", - "weather", "health", "forum", "radio", "ideal", "research", - "france", "free", "mobile", "sky", "ask", "booking", - "canada", "dating", "dictionary", "express", "hoteles", "hotels", - "investing", "jharkhand", "nifty"}; +// based on the TLD that is paired with it. This list supplements words from +// url_formatter::common_words::IsCommonWord(). +const char* kLocalAdditionalCommonWords[] = {"asahi", "hoteles", "jharkhand", + "nifty"}; // These domains are plausible lookalike targets, but they also use common words // in their names. Selectively prevent flagging embeddings where the embedder @@ -241,6 +240,13 @@ std::string GetMatchingTopDomainWithoutSeparators( return std::string(); } +// Returns whether the visited domain is either for a bare eTLD+1 (e.g. +// 'google.com') or a trivial subdomain (e.g. 'www.google.com'). +bool IsETLDPlusOneOrTrivialSubdomain(const DomainInfo& host) { + return (host.domain_and_registry == host.hostname || + "www." + host.domain_and_registry == host.hostname); +} + // Returns if |etld_plus_one| shares the skeleton of an eTLD+1 with an engaged // site or a top 500 domain. |embedded_target| is set to matching eTLD+1. bool DoesETLDPlus1MatchTopDomainOrEngagedSite( @@ -249,7 +255,11 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite( std::string* embedded_target) { for (const auto& skeleton : domain.skeletons) { for (const auto& engaged_site : engaged_sites) { - if (base::Contains(engaged_site.skeletons, skeleton)) { + // Skeleton matching only calculates skeletons of the eTLD+1, so only + // consider engaged sites that are bare eTLD+1s (or a trivial subdomain) + // and are a skeleton match. + if (IsETLDPlusOneOrTrivialSubdomain(engaged_site) && + base::Contains(engaged_site.skeletons, skeleton)) { *embedded_target = engaged_site.domain_and_registry; return true; } @@ -271,17 +281,33 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite( // weather.com, ask.com). Target embeddings of these domains are often false // positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com"). bool UsesCommonWord(const DomainInfo& domain) { - std::vector<std::string> additional_common_words = - base::SplitString(kAdditionalCommonWords.Get(), ",", - base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); - if (base::Contains(additional_common_words, domain.domain_without_registry)) { + // kDomainsPermittedInEndEmbeddings are based on domains with common words, + // but they should not be excluded here (and instead are checked later). + for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) { + if (domain.domain_and_registry == permitted_ending) { + return false; + } + } + + // Search for words in the big common word list. + if (url_formatter::common_words::IsCommonWord( + domain.domain_without_registry)) { return true; } - for (auto* common_word : kCommonWords) { + + // Also check the local lists. + for (auto* common_word : kLocalAdditionalCommonWords) { if (domain.domain_without_registry == common_word) { return true; } } + std::vector<std::string> additional_common_words = + base::SplitString(kRemoveAdditionalCommonWords.Get(), ",", + base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); + if (base::Contains(additional_common_words, domain.domain_without_registry)) { + return true; + } + return false; } @@ -317,9 +343,8 @@ bool IsCrossTLDMatch(const DomainInfo& embedded_target, } // Returns whether |embedded_target| is one of kDomainsPermittedInEndEmbeddings -// and that |embedding_domain| ends with that domain (e.g. is of the form -// "*-outlook.com" for each example.com in kDomainsPermittedInEndEmbeddings). -// (e.g. will return true if |embedded_target| matches "evil-office.com"). Only +// and that |embedding_domain| ends with that domain, e.g. "evil-office.com" is +// permitted, as "office.com" is in kDomainsPermittedInEndEmbeddings. Only // impacts Target Embedding matches. bool EndsWithPermittedDomains(const DomainInfo& embedded_target, const std::string& embedding_domain) { @@ -348,6 +373,22 @@ bool IsAllowedToBeEmbedded( EndsWithPermittedDomains(embedded_target, embedding_domain); } +// Returns the first character of the first string that is different from the +// second string. Strings should be at least 1 edit distance apart. +char GetFirstDifferentChar(const std::string& str1, const std::string& str2) { + std::string::const_iterator i1 = str1.begin(); + std::string::const_iterator i2 = str2.begin(); + while (i1 != str1.end() && i2 != str2.end()) { + if (*i1 != *i2) { + return *i1; + } + i1++; + i2++; + } + NOTREACHED(); + return 0; +} + } // namespace DomainInfo::DomainInfo(const std::string& arg_hostname, @@ -366,6 +407,7 @@ DomainInfo::~DomainInfo() = default; DomainInfo::DomainInfo(const DomainInfo&) = default; DomainInfo GetDomainInfo(const std::string& hostname) { + TRACE_EVENT0("navigation", "GetDomainInfo"); if (net::HostStringIsLocalhost(hostname) || net::IsHostnameNonUnique(hostname)) { return DomainInfo(std::string(), std::string(), std::string(), @@ -498,6 +540,17 @@ bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain, } } + // Ignore domains that only differ by an insertion of a "-". + if (nav_dom_len != matched_dom_len) { + if (nav_dom_len < matched_dom_len && + GetFirstDifferentChar(matched_dom, nav_dom) == '-') { + return true; + } else if (nav_dom_len > matched_dom_len && + GetFirstDifferentChar(nav_dom, matched_dom) == '-') { + return true; + } + } + return false; } diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h index 73d2d8afb0c..c5a27718cef 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util.h +++ b/chromium/components/lookalikes/core/lookalike_url_util.h @@ -138,7 +138,7 @@ bool IsEditDistanceAtMostOne(const base::string16& str1, // Returns whether |navigated_domain| and |matched_domain| are likely to be edit // distance false positives, and thus the user should *not* be warned. // -// Assumes |navigated_domain| and |matched_domain| are edit distance matches. +// Assumes |navigated_domain| and |matched_domain| are edit distance of 1 apart. bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain, const DomainInfo& matched_domain); diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc index c0f049a1528..25f30d6ad8d 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc +++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc @@ -105,6 +105,14 @@ TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) { {"abcde.com", "axbcde.com", false}, // Deletion {"axbcde.com", "abcde.com", false}, // Insertion {"axbcde.com", "aybcde.com", false}, // Substitution + + // We permit matches that only differ due to a single "-". + {"-abcde.com", "abcde.com", true}, + {"ab-cde.com", "abcde.com", true}, + {"abcde-.com", "abcde.com", true}, + {"abcde.com", "-abcde.com", true}, + {"abcde.com", "ab-cde.com", true}, + {"abcde.com", "abcde-.com", true}, }; for (const TestCase& test_case : kTestCases) { auto navigated = @@ -134,8 +142,10 @@ struct TargetEmbeddingHeuristicTestCase { TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { const std::vector<DomainInfo> kEngagedSites = { GetDomainInfo(GURL("https://highengagement.com")), + GetDomainInfo(GURL("https://highengagement.inthesubdomain.com")), GetDomainInfo(GURL("https://highengagement.co.uk")), GetDomainInfo(GURL("https://subdomain.highengagement.com")), + GetDomainInfo(GURL("https://www.highengagementwithwww.com")), GetDomainInfo(GURL("https://subdomain.google.com")), }; const std::vector<TargetEmbeddingHeuristicTestCase> kTestCases = { @@ -207,6 +217,9 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { TargetEmbeddingType::kInterstitial}, {"foo.jobs.org-foo.com", "", TargetEmbeddingType::kNone}, {"foo.office.org-foo.com", "", TargetEmbeddingType::kNone}, + // Common words (like 'jobs' are included in the big common word list. + // Ensure that the supplemental kCommonWords list is also checked. + {"foo.hoteles.com-foo.com", "", TargetEmbeddingType::kNone}, // Targets could be embedded without their dots and dashes. {"googlecom-foo.com", "google.com", TargetEmbeddingType::kInterstitial}, @@ -242,9 +255,19 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { {"foo.subdomain.google.com.foo.com", "subdomain.google.com", TargetEmbeddingType::kInterstitial}, - // Skeleton matching should work against engaged sites at the eTLD level. + // Skeleton matching should work against engaged sites at a eTLD+1 level, + {"highengagement.inthesubdomain.com-foo.com", + "highengagement.inthesubdomain.com", TargetEmbeddingType::kInterstitial}, + // but only if the bare eTLD+1, or www.[eTLD+1] has been engaged. {"subdomain.highéngagement.com-foo.com", "highengagement.com", TargetEmbeddingType::kInterstitial}, + {"subdomain.highéngagementwithwww.com-foo.com", + "highengagementwithwww.com", TargetEmbeddingType::kInterstitial}, + {"other.inthésubdomain.com-foo.com", "", TargetEmbeddingType::kNone}, + // Ideally, we'd be able to combine subdomains and skeleton matching, but + // our current algorithm can't detect that precisely. + {"highengagement.inthésubdomain.com-foo.com", "", + TargetEmbeddingType::kNone}, // Domains should be allowed to embed themselves. {"highengagement.com.highengagement.com", "", TargetEmbeddingType::kNone}, @@ -291,7 +314,12 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { << test_case.expected_safe_host << ", but " << (safe_hostname.empty() ? "it didn't trigger at all." : "triggered on " + safe_hostname); - EXPECT_EQ(embedding_type, test_case.expected_type); + EXPECT_EQ(embedding_type, test_case.expected_type) + << test_case.hostname << " should trigger on " + << test_case.expected_safe_host << " but it returned " + << (embedding_type == TargetEmbeddingType::kNone + ? "kNone." + : "something unexpected"); } else { EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone) << test_case.hostname << " unexpectedly triggered on " |