From 21ba0c5d4bf8fba15dddd97cd693bad2358b77fd Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@qt.io>
Date: Fri, 3 Sep 2021 13:32:17 +0200
Subject: BASELINE: Update Chromium to 92.0.4515.166

Change-Id: I42a050486714e9e54fc271f2a8939223a02ae364
---
 chromium/components/lookalikes/core/BUILD.gn       |  3 +
 chromium/components/lookalikes/core/DEPS           |  2 +
 chromium/components/lookalikes/core/features.cc    |  2 +-
 .../lookalikes/core/lookalike_url_util.cc          | 91 ++++++++++++++++++----
 .../lookalikes/core/lookalike_url_util.h           | 34 ++++----
 .../lookalikes/core/lookalike_url_util_unittest.cc | 68 +++++++++++++---
 6 files changed, 154 insertions(+), 46 deletions(-)

(limited to 'chromium/components/lookalikes')
diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn
index fccb48046c8..c46ad6f7e1d 100644
--- a/chromium/components/lookalikes/core/BUILD.gn
+++ b/chromium/components/lookalikes/core/BUILD.gn
@@ -14,6 +14,8 @@ static_library("core") {
     "//base",
     "//components/pref_registry",
     "//components/prefs:prefs",
+    "//components/reputation/core:core",
+    "//components/reputation/core:proto",
     "//components/security_interstitials/core",
     "//components/security_state/core:features",
     "//components/strings",
@@ -36,6 +38,7 @@ source_set("unit_tests") {
   deps = [
     ":core",
     ":features",
+    "//components/reputation/core",
     "//net:test_support",
     "//testing/gtest",
   ]
diff --git a/chromium/components/lookalikes/core/DEPS b/chromium/components/lookalikes/core/DEPS
index a3c048b885f..33c4e659341 100644
--- a/chromium/components/lookalikes/core/DEPS
+++ b/chromium/components/lookalikes/core/DEPS
@@ -3,4 +3,6 @@ include_rules = [
   # should not be introduced.
   "-content",
   "-ios/web",
+  # components/reputation contains the lookalike (safety tips) component.
+  "+components/reputation/core",
 ]
diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc
index 3eb15692239..3d1fb0c01c4 100644
--- a/chromium/components/lookalikes/core/features.cc
+++ b/chromium/components/lookalikes/core/features.cc
@@ -9,7 +9,7 @@ namespace features {
 
 // Note: this flag is ignored on iOS. See lookalike_url_util.cc.
 const base::Feature kDetectTargetEmbeddingLookalikes{
-    "TargetEmbeddingLookalikes", base::FEATURE_DISABLED_BY_DEFAULT};
+    "TargetEmbeddingLookalikes", base::FEATURE_ENABLED_BY_DEFAULT};
 
 const base::Feature kLookalikeInterstitialForPunycode{
     "LookalikeInterstitialForPunycode", base::FEATURE_ENABLED_BY_DEFAULT};
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc
index 7677aa9660f..a3e5695061d 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util.cc
@@ -27,6 +27,7 @@
 #include "base/values.h"
 #include "build/build_config.h"
 #include "components/lookalikes/core/features.h"
+#include "components/reputation/core/safety_tips_config.h"
 #include "components/security_interstitials/core/pref_names.h"
 #include "components/security_state/core/features.h"
 #include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
@@ -212,13 +213,12 @@ bool ASubdomainIsAllowlisted(
     const base::span<const base::StringPiece>& domain_labels,
     const LookalikeTargetAllowlistChecker& in_target_allowlist) {
   DCHECK(domain_labels.size() >= 2);
-  std::string potential_hostname =
-      domain_labels[domain_labels.size() - 1].as_string();
+  std::string potential_hostname(domain_labels[domain_labels.size() - 1]);
   // Attach each token from the end to the embedded target to check if that
   // subdomain has been allowlisted.
   for (int i = domain_labels.size() - 2; i >= 0; i--) {
     potential_hostname =
-        domain_labels[i].as_string() + "." + potential_hostname;
+        std::string(domain_labels[i]) + "." + potential_hostname;
     if (in_target_allowlist.Run(potential_hostname)) {
       return true;
     }
@@ -286,7 +286,8 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
 // Returns whether the e2LD of the provided domain is a common word (e.g.
 // weather.com, ask.com). Target embeddings of these domains are often false
 // positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
-bool UsesCommonWord(const DomainInfo& domain) {
+bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto,
+                    const DomainInfo& domain) {
   // kDomainsPermittedInEndEmbeddings are based on domains with common words,
   // but they should not be excluded here (and instead are checked later).
   for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) {
@@ -301,7 +302,13 @@ bool UsesCommonWord(const DomainInfo& domain) {
     return true;
   }
 
-  // Also check the local lists.
+  // Search for words in the component-provided word list.
+  if (reputation::IsCommonWordInConfigProto(config_proto,
+                                            domain.domain_without_registry)) {
+    return true;
+  }
+
+  // Search for words in the local word lists.
   for (auto* common_word : kLocalAdditionalCommonWords) {
     if (domain.domain_without_registry == common_word) {
       return true;
@@ -323,14 +330,13 @@ bool UsesCommonWord(const DomainInfo& domain) {
 bool IsEmbeddingItself(const base::span<const base::StringPiece>& domain_labels,
                        const std::string& embedding_domain) {
   DCHECK(domain_labels.size() >= 2);
-  std::string potential_hostname =
-      domain_labels[domain_labels.size() - 1].as_string();
+  std::string potential_hostname(domain_labels[domain_labels.size() - 1]);
   // Attach each token from the end to the embedded target to check if that
   // subdomain is the embedding domain. (e.g. using the earlier example, check
   // each ["com", "example.com", "foo.example.com"] against "example.com".
   for (int i = domain_labels.size() - 2; i >= 0; i--) {
     potential_hostname =
-        domain_labels[i].as_string() + "." + potential_hostname;
+        std::string(domain_labels[i]) + "." + potential_hostname;
     if (embedding_domain == potential_hostname) {
       return true;
     }
@@ -371,8 +377,9 @@ bool IsAllowedToBeEmbedded(
     const DomainInfo& embedded_target,
     const base::span<const base::StringPiece>& subdomain_span,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
-    const std::string& embedding_domain) {
-  return UsesCommonWord(embedded_target) ||
+    const std::string& embedding_domain,
+    const reputation::SafetyTipsConfig* config_proto) {
+  return UsesCommonWord(config_proto, embedded_target) ||
          ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist) ||
          IsEmbeddingItself(subdomain_span, embedding_domain) ||
          IsCrossTLDMatch(embedded_target, embedding_domain) ||
@@ -616,6 +623,7 @@ bool GetMatchingDomain(
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
+    const reputation::SafetyTipsConfig* config_proto,
     std::string* matched_domain,
     LookalikeUrlMatchType* match_type) {
   DCHECK(!navigated_domain.domain_and_registry.empty());
@@ -676,7 +684,7 @@ bool GetMatchingDomain(
 
   TargetEmbeddingType embedding_type =
       GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites,
-                             in_target_allowlist, matched_domain);
+                             in_target_allowlist, config_proto, matched_domain);
   if (embedding_type == TargetEmbeddingType::kSafetyTip) {
     *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips;
     return true;
@@ -725,6 +733,37 @@ TargetEmbeddingType GetTargetEmbeddingType(
     const std::string& hostname,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
+    const reputation::SafetyTipsConfig* config_proto,
+    std::string* safe_hostname) {
+  // Because of how target embeddings are detected (i.e. by sweeping the URL
+  // from back to front), we're guaranteed to find a tail embedding before any
+  // other target embedding. Tail embedding only triggers a safety tip, and
+  // interstitials take priority over safety tips. So if SearchForEmbeddings
+  // finds a safety-tip-level embedding, search again with safety tips
+  // disallowed to see whether an interstitial-level embedding also exists.
+  auto result = SearchForEmbeddings(
+      hostname, engaged_sites, in_target_allowlist, config_proto,
+      /*safety_tips_allowed=*/true, safe_hostname);
+  if (result == TargetEmbeddingType::kSafetyTip) {
+    std::string no_st_safe_hostname;
+    auto no_st_result = SearchForEmbeddings(
+        hostname, engaged_sites, in_target_allowlist, config_proto,
+        /*safety_tips_allowed=*/false, &no_st_safe_hostname);
+    if (no_st_result == TargetEmbeddingType::kNone) {
+      return result;
+    }
+    *safe_hostname = no_st_safe_hostname;
+    return no_st_result;
+  }
+  return result;
+}
+
+TargetEmbeddingType SearchForEmbeddings(
+    const std::string& hostname,
+    const std::vector<DomainInfo>& engaged_sites,
+    const LookalikeTargetAllowlistChecker& in_target_allowlist,
+    const reputation::SafetyTipsConfig* config_proto,
+    bool safety_tips_allowed,
     std::string* safe_hostname) {
   const std::string embedding_domain = GetETLDPlusOne(hostname);
   const std::vector<base::StringPiece> hostname_tokens =
@@ -764,7 +803,8 @@ TargetEmbeddingType GetTargetEmbeddingType(
       if (no_separator_dominfo.domain_without_registry.length() >
               kMinE2LDLengthForTargetEmbedding &&
           !IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens,
-                                 in_target_allowlist, embedding_domain)) {
+                                 in_target_allowlist, embedding_domain,
+                                 config_proto)) {
         *safe_hostname = embedded_target;
         return TargetEmbeddingType::kInterstitial;
       }
@@ -793,9 +833,17 @@ TargetEmbeddingType GetTargetEmbeddingType(
       for (auto& engaged_site : engaged_sites) {
         if (engaged_site.hostname == embedded_dominfo.hostname &&
             !IsAllowedToBeEmbedded(embedded_dominfo, span, in_target_allowlist,
-                                   embedding_domain)) {
+                                   embedding_domain, config_proto)) {
           *safe_hostname = engaged_site.hostname;
-          return TargetEmbeddingType::kInterstitial;
+          // Tail-embedding (e.g. evil-google.com, where the embedding happens
+          // at the very end of the hostname) is a safety tip, but only when
+          // safety tips are allowed. If it's tail embedding but we can't create
+          // a safety tip, keep looking.  Non-tail-embeddings are interstitials.
+          if (end != static_cast<int>(hostname_tokens.size())) {
+            return TargetEmbeddingType::kInterstitial;
+          } else if (safety_tips_allowed) {
+            return TargetEmbeddingType::kSafetyTip;
+          }  // else keep searching.
         }
       }
     }
@@ -805,8 +853,17 @@ TargetEmbeddingType GetTargetEmbeddingType(
     if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
             etld_check_dominfo, engaged_sites, safe_hostname) &&
         !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
-                               in_target_allowlist, embedding_domain)) {
-      return TargetEmbeddingType::kInterstitial;
+                               in_target_allowlist, embedding_domain,
+                               config_proto)) {
+      // Tail-embedding (e.g. evil-google.com, where the embedding happens at
+      // the very end of the hostname) is a safety tip, but only when safety
+      // tips are allowed. If it's tail embedding but we can't create a safety
+      // tip, keep looking.  Non-tail-embeddings are interstitials.
+      if (end != static_cast<int>(hostname_tokens.size())) {
+        return TargetEmbeddingType::kInterstitial;
+      } else if (safety_tips_allowed) {
+        return TargetEmbeddingType::kSafetyTip;
+      }  // else keep searching.
     }
   }
   return TargetEmbeddingType::kNone;
@@ -879,7 +936,7 @@ bool IsAllowedByEnterprisePolicy(const PrefService* pref_service,
                                  const GURL& url) {
   const auto* list =
       pref_service->GetList(prefs::kLookalikeWarningAllowlistDomains);
-  for (const auto& domain_val : *list) {
+  for (const auto& domain_val : list->GetList()) {
     auto domain = domain_val.GetString();
     if (url.DomainIs(domain)) {
       return true;
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h
index 72124acb0ca..e5a8ba87dfa 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.h
+++ b/chromium/components/lookalikes/core/lookalike_url_util.h
@@ -9,9 +9,9 @@
 #include <vector>
 
 #include "base/callback.h"
-#include "base/time/time.h"
 #include "components/pref_registry/pref_registry_syncable.h"
 #include "components/prefs/pref_service.h"
+#include "components/reputation/core/safety_tips.pb.h"
 #include "components/url_formatter/url_formatter.h"
 #include "url/gurl.h"
 
@@ -163,31 +163,33 @@ bool GetMatchingDomain(
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
+    const reputation::SafetyTipsConfig* config_proto,
     std::string* matched_domain,
     LookalikeUrlMatchType* match_type);
 
 void RecordUMAFromMatchType(LookalikeUrlMatchType match_type);
 
 // Checks to see if a URL is a target embedding lookalike. This function sets
-// |safe_hostname| to the url of the embedded target domain.
-// At the moment we consider the following cases as Target Embedding:
-// example-google.com-site.com, example.google.com-site.com,
-// example-google-info-site.com, example.google.com.site.com,
-// example-googlé.com-site.com where the embedded target is google.com. We
-// detect embeddings of top 500 domains and engaged domains. However, to reduce
-// false positives, we do not protect domains that are shorter than 7 characters
-// long (e.g. com.ru).
-// This function checks possible targets against |in_target_allowlist| to skip
-// permitted embeddings.
-// If no target embedding is found, the return value will be set to |kNonw|.
-// When the target is embedded with another TLD instead of its actual TLD, it
-// should trigger a Safety Tip when the embedded TLD is a ccTLD. In this
-// situation, return value will be |kSafetyTip|. All the other triggers will
-// result in a |kInterstitial| return value.
+// |safe_hostname| to the url of the embedded target domain. See the unit tests
+// for what qualifies as target embedding.
 TargetEmbeddingType GetTargetEmbeddingType(
     const std::string& hostname,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
+    const reputation::SafetyTipsConfig* config_proto,
+    std::string* safe_hostname);
+
+// Like GetTargetEmbeddingType, but the caller states explicitly, via
+// |safety_tips_allowed|, whether a kSafetyTip result is permitted. Safety tips
+// are presently only used for tail embedding (e.g. "evil-google.com"). May
+// return kSafetyTip in preference to kInterstitial; pass false for
+// |safety_tips_allowed| to find out whether there is *also* an interstitial.
+TargetEmbeddingType SearchForEmbeddings(
+    const std::string& hostname,
+    const std::vector<DomainInfo>& engaged_sites,
+    const LookalikeTargetAllowlistChecker& in_target_allowlist,
+    const reputation::SafetyTipsConfig* config_proto,
+    bool safety_tips_allowed,
     std::string* safe_hostname);
 
 // Returns true if a navigation to an IDN should be blocked.
diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
index 4062ea99861..e7b52ca0a5f 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
@@ -7,8 +7,22 @@
 #include "base/bind.h"
 #include "base/strings/utf_string_conversions.h"
 #include "components/lookalikes/core/features.h"
+#include "components/reputation/core/safety_tip_test_utils.h"
+#include "components/reputation/core/safety_tips_config.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
+std::string TargetEmbeddingTypeToString(TargetEmbeddingType type) {
+  switch (type) {
+    case TargetEmbeddingType::kNone:
+      return "kNone";
+    case TargetEmbeddingType::kInterstitial:
+      return "kInterstitial";
+    case TargetEmbeddingType::kSafetyTip:
+      return "kSafetyTip";
+  }
+  NOTREACHED();  // NOTE(review): no return follows; confirm this can't return.
+}
+
 TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
   const struct TestCase {
     const wchar_t* domain;
@@ -139,7 +153,7 @@ struct TargetEmbeddingHeuristicTestCase {
   const TargetEmbeddingType expected_type;
 };
 
-TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
+TEST(LookalikeUrlUtilTest, TargetEmbedding) {
   const std::vector<DomainInfo> kEngagedSites = {
       GetDomainInfo(GURL("https://highengagement.com")),
       GetDomainInfo(GURL("https://highengagement.inthesubdomain.com")),
@@ -278,12 +292,15 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
       {"google.com.google.com", "", TargetEmbeddingType::kNone},
       {"www.google.com.google.com", "", TargetEmbeddingType::kNone},
 
-      // Detect embeddings at the end of the domain, too.
-      {"www-google.com", "google.com", TargetEmbeddingType::kInterstitial},
+      // Detect embeddings at the end of the domain, too, but as a Safety Tip.
+      {"www-google.com", "google.com", TargetEmbeddingType::kSafetyTip},
       {"www-highengagement.com", "highengagement.com",
-       TargetEmbeddingType::kInterstitial},
+       TargetEmbeddingType::kSafetyTip},
       {"subdomain-highengagement.com", "subdomain.highengagement.com",
-       TargetEmbeddingType::kInterstitial},
+       TargetEmbeddingType::kSafetyTip},
+      // If the match duplicates the TLD, it's not quite tail-embedding.
+      {"google-com.com", "google.com", TargetEmbeddingType::kInterstitial},
+      // If there are multiple options, it should choose the more severe one.
       {"google-com.google-com.com", "google.com",
        TargetEmbeddingType::kInterstitial},
       {"subdomain.google-com.google-com.com", "google.com",
@@ -300,14 +317,17 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
       // works for domains on the list, but not for others.
       {"office.com-foo.com", "office.com", TargetEmbeddingType::kInterstitial},
       {"example-office.com", "", TargetEmbeddingType::kNone},
-      {"example-google.com", "google.com", TargetEmbeddingType::kInterstitial},
+      {"example-google.com", "google.com", TargetEmbeddingType::kSafetyTip},
   };
 
+  reputation::InitializeBlankLookalikeAllowlistForTesting();
+  auto* config_proto = reputation::GetSafetyTipsRemoteConfigProto();
+
   for (auto& test_case : kTestCases) {
     std::string safe_hostname;
     TargetEmbeddingType embedding_type = GetTargetEmbeddingType(
         test_case.hostname, kEngagedSites,
-        base::BindRepeating(&IsGoogleScholar), &safe_hostname);
+        base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname);
     if (test_case.expected_type != TargetEmbeddingType::kNone) {
       EXPECT_EQ(safe_hostname, test_case.expected_safe_host)
           << test_case.hostname << " should trigger on "
@@ -315,19 +335,43 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
           << (safe_hostname.empty() ? "it didn't trigger at all."
                                     : "triggered on " + safe_hostname);
       EXPECT_EQ(embedding_type, test_case.expected_type)
-          << test_case.hostname << " should trigger on "
+          << test_case.hostname << " should trigger "
+          << TargetEmbeddingTypeToString(test_case.expected_type) << " against "
           << test_case.expected_safe_host << " but it returned "
-          << (embedding_type == TargetEmbeddingType::kNone
-                  ? "kNone."
-                  : "something unexpected");
+          << TargetEmbeddingTypeToString(embedding_type);
     } else {
       EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone)
-          << test_case.hostname << " unexpectedly triggered on "
+          << test_case.hostname << " unexpectedly triggered "
+          << TargetEmbeddingTypeToString(embedding_type) << " against "
           << safe_hostname;
     }
   }
 }
 
+TEST(LookalikeUrlUtilTest, TargetEmbeddingIgnoresComponentWordlist) {
+  const std::vector<DomainInfo> kEngagedSites = {
+      GetDomainInfo(GURL("https://commonword.com")),
+      GetDomainInfo(GURL("https://uncommonword.com")),
+  };
+
+  reputation::SetSafetyTipAllowlistPatterns({}, {}, {"commonword"});
+  auto* config_proto = reputation::GetSafetyTipsRemoteConfigProto();
+  TargetEmbeddingType embedding_type;
+  std::string safe_hostname;
+
+  // "uncommonword" was not passed to the config above: still an interstitial.
+  embedding_type = GetTargetEmbeddingType(
+      "uncommonword.com.evil.com", kEngagedSites,
+      base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname);
+  EXPECT_EQ(embedding_type, TargetEmbeddingType::kInterstitial);
+
+  // "commonword" was passed to the config above: no embedding is reported.
+  embedding_type = GetTargetEmbeddingType(
+      "commonword.com.evil.com", kEngagedSites,
+      base::BindRepeating(&IsGoogleScholar), config_proto, &safe_hostname);
+  EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone);
+}
+
 struct GetETLDPlusOneTestCase {
   const std::string hostname;
   const std::string expected_etldp1;
-- 
cgit v1.2.1