summaryrefslogtreecommitdiff
path: root/chromium/components/lookalikes
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2021-09-03 13:32:17 +0200
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2021-10-01 14:31:55 +0200
commit21ba0c5d4bf8fba15dddd97cd693bad2358b77fd (patch)
tree91be119f694044dfc1ff9fdc054459e925de9df0 /chromium/components/lookalikes
parent03c549e0392f92c02536d3f86d5e1d8dfa3435ac (diff)
downloadqtwebengine-chromium-21ba0c5d4bf8fba15dddd97cd693bad2358b77fd.tar.gz
BASELINE: Update Chromium to 92.0.4515.166
Change-Id: I42a050486714e9e54fc271f2a8939223a02ae364
Diffstat (limited to 'chromium/components/lookalikes')
-rw-r--r--chromium/components/lookalikes/core/BUILD.gn3
-rw-r--r--chromium/components/lookalikes/core/DEPS2
-rw-r--r--chromium/components/lookalikes/core/features.cc2
-rw-r--r--chromium/components/lookalikes/core/lookalike_url_util.cc91
-rw-r--r--chromium/components/lookalikes/core/lookalike_url_util.h34
-rw-r--r--chromium/components/lookalikes/core/lookalike_url_util_unittest.cc68
6 files changed, 154 insertions, 46 deletions
diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn
index fccb48046c8..c46ad6f7e1d 100644
--- a/chromium/components/lookalikes/core/BUILD.gn
+++ b/chromium/components/lookalikes/core/BUILD.gn
@@ -14,6 +14,8 @@ static_library("core") {
"//base",
"//components/pref_registry",
"//components/prefs:prefs",
+ "//components/reputation/core:core",
+ "//components/reputation/core:proto",
"//components/security_interstitials/core",
"//components/security_state/core:features",
"//components/strings",
@@ -36,6 +38,7 @@ source_set("unit_tests") {
deps = [
":core",
":features",
+ "//components/reputation/core",
"//net:test_support",
"//testing/gtest",
]
diff --git a/chromium/components/lookalikes/core/DEPS b/chromium/components/lookalikes/core/DEPS
index a3c048b885f..33c4e659341 100644
--- a/chromium/components/lookalikes/core/DEPS
+++ b/chromium/components/lookalikes/core/DEPS
@@ -3,4 +3,6 @@ include_rules = [
# should not be introduced.
"-content",
"-ios/web",
+ # components/reputation contains the lookalike (safety tips) component.
+ "+components/reputation/core",
]
diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc
index 3eb15692239..3d1fb0c01c4 100644
--- a/chromium/components/lookalikes/core/features.cc
+++ b/chromium/components/lookalikes/core/features.cc
@@ -9,7 +9,7 @@ namespace features {
// Note: this flag is ignored on iOS. See lookalike_url_util.cc.
const base::Feature kDetectTargetEmbeddingLookalikes{
- "TargetEmbeddingLookalikes", base::FEATURE_DISABLED_BY_DEFAULT};
+ "TargetEmbeddingLookalikes", base::FEATURE_ENABLED_BY_DEFAULT};
const base::Feature kLookalikeInterstitialForPunycode{
"LookalikeInterstitialForPunycode", base::FEATURE_ENABLED_BY_DEFAULT};
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc
index 7677aa9660f..a3e5695061d 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util.cc
@@ -27,6 +27,7 @@
#include "base/values.h"
#include "build/build_config.h"
#include "components/lookalikes/core/features.h"
+#include "components/reputation/core/safety_tips_config.h"
#include "components/security_interstitials/core/pref_names.h"
#include "components/security_state/core/features.h"
#include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
@@ -212,13 +213,12 @@ bool ASubdomainIsAllowlisted(
const base::span<const base::StringPiece>& domain_labels,
const LookalikeTargetAllowlistChecker& in_target_allowlist) {
DCHECK(domain_labels.size() >= 2);
- std::string potential_hostname =
- domain_labels[domain_labels.size() - 1].as_string();
+ std::string potential_hostname(domain_labels[domain_labels.size() - 1]);
// Attach each token from the end to the embedded target to check if that
// subdomain has been allowlisted.
for (int i = domain_labels.size() - 2; i >= 0; i--) {
potential_hostname =
- domain_labels[i].as_string() + "." + potential_hostname;
+ std::string(domain_labels[i]) + "." + potential_hostname;
if (in_target_allowlist.Run(potential_hostname)) {
return true;
}
@@ -286,7 +286,8 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
// Returns whether the e2LD of the provided domain is a common word (e.g.
// weather.com, ask.com). Target embeddings of these domains are often false
// positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
-bool UsesCommonWord(const DomainInfo& domain) {
+bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto,
+ const DomainInfo& domain) {
// kDomainsPermittedInEndEmbeddings are based on domains with common words,
// but they should not be excluded here (and instead are checked later).
for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) {
@@ -301,7 +302,13 @@ bool UsesCommonWord(const DomainInfo& domain) {
return true;
}
- // Also check the local lists.
+ // Search for words in the component-provided word list.
+ if (reputation::IsCommonWordInConfigProto(config_proto,
+ domain.domain_without_registry)) {
+ return true;
+ }
+
+ // Search for words in the local word lists.
for (auto* common_word : kLocalAdditionalCommonWords) {
if (domain.domain_without_registry == common_word) {
return true;
@@ -323,14 +330,13 @@ bool UsesCommonWord(const DomainInfo& domain) {
bool IsEmbeddingItself(const base::span<const base::StringPiece>& domain_labels,
const std::string& embedding_domain) {
DCHECK(domain_labels.size() >= 2);
- std::string potential_hostname =
- domain_labels[domain_labels.size() - 1].as_string();
+ std::string potential_hostname(domain_labels[domain_labels.size() - 1]);
// Attach each token from the end to the embedded target to check if that
// subdomain is the embedding domain. (e.g. using the earlier example, check
// each ["com", "example.com", "foo.example.com"] against "example.com".
for (int i = domain_labels.size() - 2; i >= 0; i--) {
potential_hostname =
- domain_labels[i].as_string() + "." + potential_hostname;
+ std::string(domain_labels[i]) + "." + potential_hostname;
if (embedding_domain == potential_hostname) {
return true;
}
@@ -371,8 +377,9 @@ bool IsAllowedToBeEmbedded(
const DomainInfo& embedded_target,
const base::span<const base::StringPiece>& subdomain_span,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
- const std::string& embedding_domain) {
- return UsesCommonWord(embedded_target) ||
+ const std::string& embedding_domain,
+ const reputation::SafetyTipsConfig* config_proto) {
+ return UsesCommonWord(config_proto, embedded_target) ||
ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist) ||
IsEmbeddingItself(subdomain_span, embedding_domain) ||
IsCrossTLDMatch(embedded_target, embedding_domain) ||
@@ -616,6 +623,7 @@ bool GetMatchingDomain(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
+ const reputation::SafetyTipsConfig* config_proto,
std::string* matched_domain,
LookalikeUrlMatchType* match_type) {
DCHECK(!navigated_domain.domain_and_registry.empty());
@@ -676,7 +684,7 @@ bool GetMatchingDomain(
TargetEmbeddingType embedding_type =
GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites,
- in_target_allowlist, matched_domain);
+ in_target_allowlist, config_proto, matched_domain);
if (embedding_type == TargetEmbeddingType::kSafetyTip) {
*match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips;
return true;
@@ -725,6 +733,37 @@ TargetEmbeddingType GetTargetEmbeddingType(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
+ const reputation::SafetyTipsConfig* config_proto,
+ std::string* safe_hostname) {
+ // Because of how target embeddings are detected (i.e. by sweeping the URL
+ // from back to front), we're guaranteed to find tail-embedding before other
+ // target embedding. Tail embedding triggers a safety tip, but interstitials
+ // are more important than safety tips, so if we find a safety tippable
+ // embedding with SearchForEmbeddings, go search again not permitting safety
+ // tips to see if we can also find an interstitiallable embedding.
+ auto result = SearchForEmbeddings(
+ hostname, engaged_sites, in_target_allowlist, config_proto,
+ /*safety_tips_allowed=*/true, safe_hostname);
+ if (result == TargetEmbeddingType::kSafetyTip) {
+ std::string no_st_safe_hostname;
+ auto no_st_result = SearchForEmbeddings(
+ hostname, engaged_sites, in_target_allowlist, config_proto,
+ /*safety_tips_allowed=*/false, &no_st_safe_hostname);
+ if (no_st_result == TargetEmbeddingType::kNone) {
+ return result;
+ }
+ *safe_hostname = no_st_safe_hostname;
+ return no_st_result;
+ }
+ return result;
+}
+
+TargetEmbeddingType SearchForEmbeddings(
+ const std::string& hostname,
+ const std::vector<DomainInfo>& engaged_sites,
+ const LookalikeTargetAllowlistChecker& in_target_allowlist,
+ const reputation::SafetyTipsConfig* config_proto,
+ bool safety_tips_allowed,
std::string* safe_hostname) {
const std::string embedding_domain = GetETLDPlusOne(hostname);
const std::vector<base::StringPiece> hostname_tokens =
@@ -764,7 +803,8 @@ TargetEmbeddingType GetTargetEmbeddingType(
if (no_separator_dominfo.domain_without_registry.length() >
kMinE2LDLengthForTargetEmbedding &&
!IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens,
- in_target_allowlist, embedding_domain)) {
+ in_target_allowlist, embedding_domain,
+ config_proto)) {
*safe_hostname = embedded_target;
return TargetEmbeddingType::kInterstitial;
}
@@ -793,9 +833,17 @@ TargetEmbeddingType GetTargetEmbeddingType(
for (auto& engaged_site : engaged_sites) {
if (engaged_site.hostname == embedded_dominfo.hostname &&
!IsAllowedToBeEmbedded(embedded_dominfo, span, in_target_allowlist,
- embedding_domain)) {
+ embedding_domain, config_proto)) {
*safe_hostname = engaged_site.hostname;
- return TargetEmbeddingType::kInterstitial;
+ // Tail-embedding (e.g. evil-google.com, where the embedding happens
+ // at the very end of the hostname) is a safety tip, but only when
+ // safety tips are allowed. If it's tail embedding but we can't create
+ // a safety tip, keep looking. Non-tail-embeddings are interstitials.
+ if (end != static_cast<int>(hostname_tokens.size())) {
+ return TargetEmbeddingType::kInterstitial;
+ } else if (safety_tips_allowed) {
+ return TargetEmbeddingType::kSafetyTip;
+ } // else keep searching.
}
}
}
@@ -805,8 +853,17 @@ TargetEmbeddingType GetTargetEmbeddingType(
if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
etld_check_dominfo, engaged_sites, safe_hostname) &&
!IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
- in_target_allowlist, embedding_domain)) {
- return TargetEmbeddingType::kInterstitial;
+ in_target_allowlist, embedding_domain,
+ config_proto)) {
+ // Tail-embedding (e.g. evil-google.com, where the embedding happens at
+ // the very end of the hostname) is a safety tip, but only when safety
+ // tips are allowed. If it's tail embedding but we can't create a safety
+ // tip, keep looking. Non-tail-embeddings are interstitials.
+ if (end != static_cast<int>(hostname_tokens.size())) {
+ return TargetEmbeddingType::kInterstitial;
+ } else if (safety_tips_allowed) {
+ return TargetEmbeddingType::kSafetyTip;
+ } // else keep searching.
}
}
return TargetEmbeddingType::kNone;
@@ -879,7 +936,7 @@ bool IsAllowedByEnterprisePolicy(const PrefService* pref_service,
const GURL& url) {
const auto* list =
pref_service->GetList(prefs::kLookalikeWarningAllowlistDomains);
- for (const auto& domain_val : *list) {
+ for (const auto& domain_val : list->GetList()) {
auto domain = domain_val.GetString();
if (url.DomainIs(domain)) {
return true;
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h
index 72124acb0ca..e5a8ba87dfa 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.h
+++ b/chromium/components/lookalikes/core/lookalike_url_util.h
@@ -9,9 +9,9 @@
#include <vector>
#include "base/callback.h"
-#include "base/time/time.h"
#include "components/pref_registry/pref_registry_syncable.h"
#include "components/prefs/pref_service.h"
+#include "components/reputation/core/safety_tips.pb.h"
#include "components/url_formatter/url_formatter.h"
#include "url/gurl.h"
@@ -163,31 +163,33 @@ bool GetMatchingDomain(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
+ const reputation::SafetyTipsConfig* config_proto,
std::string* matched_domain,
LookalikeUrlMatchType* match_type);
void RecordUMAFromMatchType(LookalikeUrlMatchType match_type);
// Checks to see if a URL is a target embedding lookalike. This function sets
-// |safe_hostname| to the url of the embedded target domain.
-// At the moment we consider the following cases as Target Embedding:
-// example-google.com-site.com, example.google.com-site.com,
-// example-google-info-site.com, example.google.com.site.com,
-// example-googlé.com-site.com where the embedded target is google.com. We
-// detect embeddings of top 500 domains and engaged domains. However, to reduce
-// false positives, we do not protect domains that are shorter than 7 characters
-// long (e.g. com.ru).
-// This function checks possible targets against |in_target_allowlist| to skip
-// permitted embeddings.
-// If no target embedding is found, the return value will be set to |kNonw|.
-// When the target is embedded with another TLD instead of its actual TLD, it
-// should trigger a Safety Tip when the embedded TLD is a ccTLD. In this
-// situation, return value will be |kSafetyTip|. All the other triggers will
-// result in a |kInterstitial| return value.
+// |safe_hostname| to the url of the embedded target domain. See the unit tests
+// for what qualifies as target embedding.
TargetEmbeddingType GetTargetEmbeddingType(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
+ const reputation::SafetyTipsConfig* config_proto,
+ std::string* safe_hostname);
+
+// Same as GetTargetEmbeddingType, but explicitly state whether or not a safety
+// tip is permitted via |safety_tips_allowed|. Safety tips are presently only
+// used for tail embedding (e.g. "evil-google.com"). This function may return
+// kSafetyTip preferentially to kInterstitial -- call with !safety_tips_allowed
+// if you're interested in determining if there's *also* an interstitial.
+TargetEmbeddingType SearchForEmbeddings(
+ const std::string& hostname,
+ const std::vector<DomainInfo>& engaged_sites,
+ const LookalikeTargetAllowlistChecker& in_target_allowlist,
+ const reputation::SafetyTipsConfig* config_proto,
+ bool safety_tips_allowed,
std::string* safe_hostname);
// Returns true if a navigation to an IDN should be blocked.
diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
index 4062ea99861..e7b52ca0a5f 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
@@ -7,8 +7,22 @@
#include "base/bind.h"
#include "base/strings/utf_string_conversions.h"
#include "components/lookalikes/core/features.h"
+#include "components/reputation/core/safety_tip_test_utils.h"
+#include "components/reputation/core/safety_tips_config.h"
#include "testing/gtest/include/gtest/gtest.h"
+std::string TargetEmbeddingTypeToString(TargetEmbeddingType type) {
+ switch (type) {
+ case TargetEmbeddingType::kNone:
+ return "kNone";
+ case TargetEmbeddingType::kInterstitial:
+ return "kInterstitial";
+ case TargetEmbeddingType::kSafetyTip:
+ return "kSafetyTip";
+ }
+ NOTREACHED();
+}
+
TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
const struct TestCase {
const wchar_t* domain;
@@ -139,7 +153,7 @@ struct TargetEmbeddingHeuristicTestCase {
const TargetEmbeddingType expected_type;
};
-TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
+TEST(LookalikeUrlUtilTest, TargetEmbedding) {
const std::vector<DomainInfo> kEngagedSites = {
GetDomainInfo(GURL("https://highengagement.com")),
GetDomainInfo(GURL("https://highengagement.inthesubdomain.com")),
@@ -278,12 +292,15 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
{"google.com.google.com", "", TargetEmbeddingType::kNone},
{"www.google.com.google.com", "", TargetEmbeddingType::kNone},
- // Detect embeddings at the end of the domain, too.
- {"www-google.com", "google.com", TargetEmbeddingType::kInterstitial},
+ // Detect embeddings at the end of the domain, too, but as a Safety Tip.
+ {"www-google.com", "google.com", TargetEmbeddingType::kSafetyTip},
{"www-highengagement.com", "highengagement.com",
- TargetEmbeddingType::kInterstitial},
+ TargetEmbeddingType::kSafetyTip},
{"subdomain-highengagement.com", "subdomain.highengagement.com",
- TargetEmbeddingType::kInterstitial},
+ TargetEmbeddingType::kSafetyTip},
+ // If the match duplicates the TLD, it's not quite tail-embedding.
+ {"google-com.com", "google.com", TargetEmbeddingType::kInterstitial},
+ // If there are multiple options, it should choose the more severe one.
{"google-com.google-com.com", "google.com",
TargetEmbeddingType::kInterstitial},
{"subdomain.google-com.google-com.com", "google.com",
@@ -300,14 +317,17 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
// works for domains on the list, but not for others.
{"office.com-foo.com", "office.com", TargetEmbeddingType::kInterstitial},
{"example-office.com", "", TargetEmbeddingType::kNone},
- {"example-google.com", "google.com", TargetEmbeddingType::kInterstitial},
+ {"example-google.com", "google.com", TargetEmbeddingType::kSafetyTip},
};
+ reputation::InitializeBlankLookalikeAllowlistForTesting();
+ auto* config_proto = reputation::GetSafetyTipsRemoteConfigProto();
+
for (auto& test_case : kTestCases) {
std::string safe_hostname;
TargetEmbeddingType embedding_type = GetTargetEmbeddingType(
test_case.hostname, kEngagedSites,
- base::BindRepeating(&IsGoogleScholar), &safe_hostname);
+ base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname);
if (test_case.expected_type != TargetEmbeddingType::kNone) {
EXPECT_EQ(safe_hostname, test_case.expected_safe_host)
<< test_case.hostname << " should trigger on "
@@ -315,19 +335,43 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
<< (safe_hostname.empty() ? "it didn't trigger at all."
: "triggered on " + safe_hostname);
EXPECT_EQ(embedding_type, test_case.expected_type)
- << test_case.hostname << " should trigger on "
+ << test_case.hostname << " should trigger "
+ << TargetEmbeddingTypeToString(test_case.expected_type) << " against "
<< test_case.expected_safe_host << " but it returned "
- << (embedding_type == TargetEmbeddingType::kNone
- ? "kNone."
- : "something unexpected");
+ << TargetEmbeddingTypeToString(embedding_type);
} else {
EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone)
- << test_case.hostname << " unexpectedly triggered on "
+ << test_case.hostname << " unexpectedly triggered "
+ << TargetEmbeddingTypeToString(embedding_type) << " against "
<< safe_hostname;
}
}
}
+TEST(LookalikeUrlUtilTest, TargetEmbeddingIgnoresComponentWordlist) {
+ const std::vector<DomainInfo> kEngagedSites = {
+ GetDomainInfo(GURL("https://commonword.com")),
+ GetDomainInfo(GURL("https://uncommonword.com")),
+ };
+
+ reputation::SetSafetyTipAllowlistPatterns({}, {}, {"commonword"});
+ auto* config_proto = reputation::GetSafetyTipsRemoteConfigProto();
+ TargetEmbeddingType embedding_type;
+ std::string safe_hostname;
+
+ // Engaged sites using uncommon words are still blocked.
+ embedding_type = GetTargetEmbeddingType(
+ "uncommonword.com.evil.com", kEngagedSites,
+ base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname);
+ EXPECT_EQ(embedding_type, TargetEmbeddingType::kInterstitial);
+
+ // But engaged sites using common words are not blocked.
+ embedding_type = GetTargetEmbeddingType(
+ "commonword.com.evil.com", kEngagedSites,
+ base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname);
+ EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone);
+}
+
struct GetETLDPlusOneTestCase {
const std::string hostname;
const std::string expected_etldp1;