From 21ba0c5d4bf8fba15dddd97cd693bad2358b77fd Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Fri, 3 Sep 2021 13:32:17 +0200 Subject: BASELINE: Update Chromium to 92.0.4515.166 Change-Id: I42a050486714e9e54fc271f2a8939223a02ae364 --- chromium/components/lookalikes/core/BUILD.gn | 3 + chromium/components/lookalikes/core/DEPS | 2 + chromium/components/lookalikes/core/features.cc | 2 +- .../lookalikes/core/lookalike_url_util.cc | 91 ++++++++++++++++++---- .../lookalikes/core/lookalike_url_util.h | 34 ++++---- .../lookalikes/core/lookalike_url_util_unittest.cc | 68 +++++++++++++--- 6 files changed, 154 insertions(+), 46 deletions(-) (limited to 'chromium/components/lookalikes') diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn index fccb48046c8..c46ad6f7e1d 100644 --- a/chromium/components/lookalikes/core/BUILD.gn +++ b/chromium/components/lookalikes/core/BUILD.gn @@ -14,6 +14,8 @@ static_library("core") { "//base", "//components/pref_registry", "//components/prefs:prefs", + "//components/reputation/core:core", + "//components/reputation/core:proto", "//components/security_interstitials/core", "//components/security_state/core:features", "//components/strings", @@ -36,6 +38,7 @@ source_set("unit_tests") { deps = [ ":core", ":features", + "//components/reputation/core", "//net:test_support", "//testing/gtest", ] diff --git a/chromium/components/lookalikes/core/DEPS b/chromium/components/lookalikes/core/DEPS index a3c048b885f..33c4e659341 100644 --- a/chromium/components/lookalikes/core/DEPS +++ b/chromium/components/lookalikes/core/DEPS @@ -3,4 +3,6 @@ include_rules = [ # should not be introduced. "-content", "-ios/web", + # components/reputation contains the lookalike (safety tips) component. 
+ "+components/reputation/core", ] diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc index 3eb15692239..3d1fb0c01c4 100644 --- a/chromium/components/lookalikes/core/features.cc +++ b/chromium/components/lookalikes/core/features.cc @@ -9,7 +9,7 @@ namespace features { // Note: this flag is ignored on iOS. See lookalike_url_util.cc. const base::Feature kDetectTargetEmbeddingLookalikes{ - "TargetEmbeddingLookalikes", base::FEATURE_DISABLED_BY_DEFAULT}; + "TargetEmbeddingLookalikes", base::FEATURE_ENABLED_BY_DEFAULT}; const base::Feature kLookalikeInterstitialForPunycode{ "LookalikeInterstitialForPunycode", base::FEATURE_ENABLED_BY_DEFAULT}; diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc index 7677aa9660f..a3e5695061d 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util.cc +++ b/chromium/components/lookalikes/core/lookalike_url_util.cc @@ -27,6 +27,7 @@ #include "base/values.h" #include "build/build_config.h" #include "components/lookalikes/core/features.h" +#include "components/reputation/core/safety_tips_config.h" #include "components/security_interstitials/core/pref_names.h" #include "components/security_state/core/features.h" #include "components/url_formatter/spoof_checks/common_words/common_words_util.h" @@ -212,13 +213,12 @@ bool ASubdomainIsAllowlisted( const base::span& domain_labels, const LookalikeTargetAllowlistChecker& in_target_allowlist) { DCHECK(domain_labels.size() >= 2); - std::string potential_hostname = - domain_labels[domain_labels.size() - 1].as_string(); + std::string potential_hostname(domain_labels[domain_labels.size() - 1]); // Attach each token from the end to the embedded target to check if that // subdomain has been allowlisted. for (int i = domain_labels.size() - 2; i >= 0; i--) { potential_hostname = - domain_labels[i].as_string() + "." 
+ potential_hostname; + std::string(domain_labels[i]) + "." + potential_hostname; if (in_target_allowlist.Run(potential_hostname)) { return true; } @@ -286,7 +286,8 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite( // Returns whether the e2LD of the provided domain is a common word (e.g. // weather.com, ask.com). Target embeddings of these domains are often false // positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com"). -bool UsesCommonWord(const DomainInfo& domain) { +bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto, + const DomainInfo& domain) { // kDomainsPermittedInEndEmbeddings are based on domains with common words, // but they should not be excluded here (and instead are checked later). for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) { @@ -301,7 +302,13 @@ bool UsesCommonWord(const DomainInfo& domain) { return true; } - // Also check the local lists. + // Search for words in the component-provided word list. + if (reputation::IsCommonWordInConfigProto(config_proto, + domain.domain_without_registry)) { + return true; + } + + // Search for words in the local word lists. for (auto* common_word : kLocalAdditionalCommonWords) { if (domain.domain_without_registry == common_word) { return true; @@ -323,14 +330,13 @@ bool UsesCommonWord(const DomainInfo& domain) { bool IsEmbeddingItself(const base::span& domain_labels, const std::string& embedding_domain) { DCHECK(domain_labels.size() >= 2); - std::string potential_hostname = - domain_labels[domain_labels.size() - 1].as_string(); + std::string potential_hostname(domain_labels[domain_labels.size() - 1]); // Attach each token from the end to the embedded target to check if that // subdomain is the embedding domain. (e.g. using the earlier example, check // each ["com", "example.com", "foo.example.com"] against "example.com". for (int i = domain_labels.size() - 2; i >= 0; i--) { potential_hostname = - domain_labels[i].as_string() + "." 
+ potential_hostname; + std::string(domain_labels[i]) + "." + potential_hostname; if (embedding_domain == potential_hostname) { return true; } @@ -371,8 +377,9 @@ bool IsAllowedToBeEmbedded( const DomainInfo& embedded_target, const base::span& subdomain_span, const LookalikeTargetAllowlistChecker& in_target_allowlist, - const std::string& embedding_domain) { - return UsesCommonWord(embedded_target) || + const std::string& embedding_domain, + const reputation::SafetyTipsConfig* config_proto) { + return UsesCommonWord(config_proto, embedded_target) || ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist) || IsEmbeddingItself(subdomain_span, embedding_domain) || IsCrossTLDMatch(embedded_target, embedding_domain) || @@ -616,6 +623,7 @@ bool GetMatchingDomain( const DomainInfo& navigated_domain, const std::vector& engaged_sites, const LookalikeTargetAllowlistChecker& in_target_allowlist, + const reputation::SafetyTipsConfig* config_proto, std::string* matched_domain, LookalikeUrlMatchType* match_type) { DCHECK(!navigated_domain.domain_and_registry.empty()); @@ -676,7 +684,7 @@ bool GetMatchingDomain( TargetEmbeddingType embedding_type = GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites, - in_target_allowlist, matched_domain); + in_target_allowlist, config_proto, matched_domain); if (embedding_type == TargetEmbeddingType::kSafetyTip) { *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips; return true; @@ -725,6 +733,37 @@ TargetEmbeddingType GetTargetEmbeddingType( const std::string& hostname, const std::vector& engaged_sites, const LookalikeTargetAllowlistChecker& in_target_allowlist, + const reputation::SafetyTipsConfig* config_proto, + std::string* safe_hostname) { + // Because of how target embeddings are detected (i.e. by sweeping the URL + // from back to front), we're guaranteed to find tail-embedding before other + // target embedding. 
Tail embedding triggers a safety tip, but interstitials + // are more important than safety tips, so if we find a safety tippable + // embedding with SearchForEmbeddings, go search again not permitting safety + // tips to see if we can also find an interstitiallable embedding. + auto result = SearchForEmbeddings( + hostname, engaged_sites, in_target_allowlist, config_proto, + /*safety_tips_allowed=*/true, safe_hostname); + if (result == TargetEmbeddingType::kSafetyTip) { + std::string no_st_safe_hostname; + auto no_st_result = SearchForEmbeddings( + hostname, engaged_sites, in_target_allowlist, config_proto, + /*safety_tips_allowed=*/false, &no_st_safe_hostname); + if (no_st_result == TargetEmbeddingType::kNone) { + return result; + } + *safe_hostname = no_st_safe_hostname; + return no_st_result; + } + return result; +} + +TargetEmbeddingType SearchForEmbeddings( + const std::string& hostname, + const std::vector& engaged_sites, + const LookalikeTargetAllowlistChecker& in_target_allowlist, + const reputation::SafetyTipsConfig* config_proto, + bool safety_tips_allowed, std::string* safe_hostname) { const std::string embedding_domain = GetETLDPlusOne(hostname); const std::vector hostname_tokens = @@ -764,7 +803,8 @@ TargetEmbeddingType GetTargetEmbeddingType( if (no_separator_dominfo.domain_without_registry.length() > kMinE2LDLengthForTargetEmbedding && !IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens, - in_target_allowlist, embedding_domain)) { + in_target_allowlist, embedding_domain, + config_proto)) { *safe_hostname = embedded_target; return TargetEmbeddingType::kInterstitial; } @@ -793,9 +833,17 @@ TargetEmbeddingType GetTargetEmbeddingType( for (auto& engaged_site : engaged_sites) { if (engaged_site.hostname == embedded_dominfo.hostname && !IsAllowedToBeEmbedded(embedded_dominfo, span, in_target_allowlist, - embedding_domain)) { + embedding_domain, config_proto)) { *safe_hostname = engaged_site.hostname; - return 
TargetEmbeddingType::kInterstitial; + // Tail-embedding (e.g. evil-google.com, where the embedding happens + // at the very end of the hostname) is a safety tip, but only when + // safety tips are allowed. If it's tail embedding but we can't create + // a safety tip, keep looking. Non-tail-embeddings are interstitials. + if (end != static_cast<int>(hostname_tokens.size())) { + return TargetEmbeddingType::kInterstitial; + } else if (safety_tips_allowed) { + return TargetEmbeddingType::kSafetyTip; + } // else keep searching. } } } @@ -805,8 +853,17 @@ TargetEmbeddingType GetTargetEmbeddingType( if (DoesETLDPlus1MatchTopDomainOrEngagedSite( etld_check_dominfo, engaged_sites, safe_hostname) && !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span, - in_target_allowlist, embedding_domain)) { - return TargetEmbeddingType::kInterstitial; + in_target_allowlist, embedding_domain, + config_proto)) { + // Tail-embedding (e.g. evil-google.com, where the embedding happens at + // the very end of the hostname) is a safety tip, but only when safety + // tips are allowed. If it's tail embedding but we can't create a safety + // tip, keep looking. Non-tail-embeddings are interstitials. + if (end != static_cast<int>(hostname_tokens.size())) { + return TargetEmbeddingType::kInterstitial; + } else if (safety_tips_allowed) { + return TargetEmbeddingType::kSafetyTip; + } // else keep searching. 
} } return TargetEmbeddingType::kNone; @@ -879,7 +936,7 @@ bool IsAllowedByEnterprisePolicy(const PrefService* pref_service, const GURL& url) { const auto* list = pref_service->GetList(prefs::kLookalikeWarningAllowlistDomains); - for (const auto& domain_val : *list) { + for (const auto& domain_val : list->GetList()) { auto domain = domain_val.GetString(); if (url.DomainIs(domain)) { return true; diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h index 72124acb0ca..e5a8ba87dfa 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util.h +++ b/chromium/components/lookalikes/core/lookalike_url_util.h @@ -9,9 +9,9 @@ #include #include "base/callback.h" -#include "base/time/time.h" #include "components/pref_registry/pref_registry_syncable.h" #include "components/prefs/pref_service.h" +#include "components/reputation/core/safety_tips.pb.h" #include "components/url_formatter/url_formatter.h" #include "url/gurl.h" @@ -163,31 +163,33 @@ bool GetMatchingDomain( const DomainInfo& navigated_domain, const std::vector& engaged_sites, const LookalikeTargetAllowlistChecker& in_target_allowlist, + const reputation::SafetyTipsConfig* config_proto, std::string* matched_domain, LookalikeUrlMatchType* match_type); void RecordUMAFromMatchType(LookalikeUrlMatchType match_type); // Checks to see if a URL is a target embedding lookalike. This function sets -// |safe_hostname| to the url of the embedded target domain. -// At the moment we consider the following cases as Target Embedding: -// example-google.com-site.com, example.google.com-site.com, -// example-google-info-site.com, example.google.com.site.com, -// example-googlé.com-site.com where the embedded target is google.com. We -// detect embeddings of top 500 domains and engaged domains. However, to reduce -// false positives, we do not protect domains that are shorter than 7 characters -// long (e.g. com.ru). 
-// This function checks possible targets against |in_target_allowlist| to skip -// permitted embeddings. -// If no target embedding is found, the return value will be set to |kNonw|. -// When the target is embedded with another TLD instead of its actual TLD, it -// should trigger a Safety Tip when the embedded TLD is a ccTLD. In this -// situation, return value will be |kSafetyTip|. All the other triggers will -// result in a |kInterstitial| return value. +// |safe_hostname| to the url of the embedded target domain. See the unit tests +// for what qualifies as target embedding. TargetEmbeddingType GetTargetEmbeddingType( const std::string& hostname, const std::vector<DomainInfo>& engaged_sites, const LookalikeTargetAllowlistChecker& in_target_allowlist, + const reputation::SafetyTipsConfig* config_proto, + std::string* safe_hostname); + +// Same as GetTargetEmbeddingType, but explicitly state whether or not a safety +// tip is permitted via |safety_tips_allowed|. Safety tips are presently only +// used for tail embedding (e.g. "evil-google.com"). This function may return +// kSafetyTip preferentially to kInterstitial -- call with !safety_tips_allowed +// if you're interested in determining if there's *also* an interstitial. +TargetEmbeddingType SearchForEmbeddings( + const std::string& hostname, + const std::vector<DomainInfo>& engaged_sites, + const LookalikeTargetAllowlistChecker& in_target_allowlist, + const reputation::SafetyTipsConfig* config_proto, + bool safety_tips_allowed, std::string* safe_hostname); // Returns true if a navigation to an IDN should be blocked. 
diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc index 4062ea99861..e7b52ca0a5f 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc +++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc @@ -7,8 +7,22 @@ #include "base/bind.h" #include "base/strings/utf_string_conversions.h" #include "components/lookalikes/core/features.h" +#include "components/reputation/core/safety_tip_test_utils.h" +#include "components/reputation/core/safety_tips_config.h" #include "testing/gtest/include/gtest/gtest.h" +std::string TargetEmbeddingTypeToString(TargetEmbeddingType type) { + switch (type) { + case TargetEmbeddingType::kNone: + return "kNone"; + case TargetEmbeddingType::kInterstitial: + return "kInterstitial"; + case TargetEmbeddingType::kSafetyTip: + return "kSafetyTip"; + } + NOTREACHED(); +} + TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) { const struct TestCase { const wchar_t* domain; @@ -139,7 +153,7 @@ struct TargetEmbeddingHeuristicTestCase { const TargetEmbeddingType expected_type; }; -TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { +TEST(LookalikeUrlUtilTest, TargetEmbedding) { const std::vector kEngagedSites = { GetDomainInfo(GURL("https://highengagement.com")), GetDomainInfo(GURL("https://highengagement.inthesubdomain.com")), @@ -278,12 +292,15 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { {"google.com.google.com", "", TargetEmbeddingType::kNone}, {"www.google.com.google.com", "", TargetEmbeddingType::kNone}, - // Detect embeddings at the end of the domain, too. - {"www-google.com", "google.com", TargetEmbeddingType::kInterstitial}, + // Detect embeddings at the end of the domain, too, but as a Safety Tip. 
+ {"www-google.com", "google.com", TargetEmbeddingType::kSafetyTip}, {"www-highengagement.com", "highengagement.com", - TargetEmbeddingType::kInterstitial}, + TargetEmbeddingType::kSafetyTip}, {"subdomain-highengagement.com", "subdomain.highengagement.com", - TargetEmbeddingType::kInterstitial}, + TargetEmbeddingType::kSafetyTip}, + // If the match duplicates the TLD, it's not quite tail-embedding. + {"google-com.com", "google.com", TargetEmbeddingType::kInterstitial}, + // If there are multiple options, it should choose the more severe one. {"google-com.google-com.com", "google.com", TargetEmbeddingType::kInterstitial}, {"subdomain.google-com.google-com.com", "google.com", @@ -300,14 +317,17 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { // works for domains on the list, but not for others. {"office.com-foo.com", "office.com", TargetEmbeddingType::kInterstitial}, {"example-office.com", "", TargetEmbeddingType::kNone}, - {"example-google.com", "google.com", TargetEmbeddingType::kInterstitial}, + {"example-google.com", "google.com", TargetEmbeddingType::kSafetyTip}, }; + reputation::InitializeBlankLookalikeAllowlistForTesting(); + auto* config_proto = reputation::GetSafetyTipsRemoteConfigProto(); + for (auto& test_case : kTestCases) { std::string safe_hostname; TargetEmbeddingType embedding_type = GetTargetEmbeddingType( test_case.hostname, kEngagedSites, - base::BindRepeating(&IsGoogleScholar), &safe_hostname); + base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname); if (test_case.expected_type != TargetEmbeddingType::kNone) { EXPECT_EQ(safe_hostname, test_case.expected_safe_host) << test_case.hostname << " should trigger on " @@ -315,19 +335,43 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { << (safe_hostname.empty() ? "it didn't trigger at all." 
: "triggered on " + safe_hostname); EXPECT_EQ(embedding_type, test_case.expected_type) - << test_case.hostname << " should trigger on " + << test_case.hostname << " should trigger " + << TargetEmbeddingTypeToString(test_case.expected_type) << " against " << test_case.expected_safe_host << " but it returned " - << (embedding_type == TargetEmbeddingType::kNone - ? "kNone." - : "something unexpected"); + << TargetEmbeddingTypeToString(embedding_type); } else { EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone) - << test_case.hostname << " unexpectedly triggered on " + << test_case.hostname << " unexpectedly triggered " + << TargetEmbeddingTypeToString(embedding_type) << " against " << safe_hostname; } } } +TEST(LookalikeUrlUtilTest, TargetEmbeddingIgnoresComponentWordlist) { + const std::vector kEngagedSites = { + GetDomainInfo(GURL("https://commonword.com")), + GetDomainInfo(GURL("https://uncommonword.com")), + }; + + reputation::SetSafetyTipAllowlistPatterns({}, {}, {"commonword"}); + auto* config_proto = reputation::GetSafetyTipsRemoteConfigProto(); + TargetEmbeddingType embedding_type; + std::string safe_hostname; + + // Engaged sites using uncommon words are still blocked. + embedding_type = GetTargetEmbeddingType( + "uncommonword.com.evil.com", kEngagedSites, + base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname); + EXPECT_EQ(embedding_type, TargetEmbeddingType::kInterstitial); + + // But engaged sites using common words are not blocked. + embedding_type = GetTargetEmbeddingType( + "commonword.com.evil.com", kEngagedSites, + base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname); + EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone); +} + struct GetETLDPlusOneTestCase { const std::string hostname; const std::string expected_etldp1; -- cgit v1.2.1