BASELINE: Update Chromium to 85.0.4183.14085-based

Change-Id: Iaa42f4680837c57725b1344f108c0196741f6057 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-10-12 14:27:29 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-10-13 09:35:20 +0000
commit: c30a6232df03e1efbd9f3b226777b07e087a1122 (patch)
tree: e992f45784689f373bcc38d1b79a239ebe17ee23 /chromium/components/lookalikes
parent: 7b5b123ac58f58ffde0f4f6e488bcd09aa4decd3 (diff)
download: qtwebengine-chromium-c30a6232df03e1efbd9f3b226777b07e087a1122.tar.gz
6 files changed, 466 insertions, 204 deletions
diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn
index ac6a4d28be9..65e89c84808 100644
--- a/chromium/components/lookalikes/core/BUILD.gn
+++ b/chromium/components/lookalikes/core/BUILD.gn
@@ -27,6 +27,7 @@ jumbo_source_set("unit_tests") {
 
   deps = [
     ":core",
+    ":features",
     "//net:test_support",
     "//testing/gtest",
   ]
diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc
index faa42a292a2..99d9c417d37 100644
--- a/chromium/components/lookalikes/core/features.cc
+++ b/chromium/components/lookalikes/core/features.cc
@@ -10,5 +10,8 @@ namespace features {
 const base::Feature kDetectTargetEmbeddingLookalikes{
     "TargetEmbeddingLookalikes", base::FEATURE_DISABLED_BY_DEFAULT};
 
+const base::Feature kLookalikeInterstitialForPunycode{
+    "LookalikeInterstitialForPunycode", base::FEATURE_DISABLED_BY_DEFAULT};
+
 }  // namespace features
 }  // namespace lookalikes
diff --git a/chromium/components/lookalikes/core/features.h b/chromium/components/lookalikes/core/features.h
index 988c1c4043c..453e1146082 100644
--- a/chromium/components/lookalikes/core/features.h
+++ b/chromium/components/lookalikes/core/features.h
@@ -15,6 +15,10 @@ namespace features {
 COMPONENT_EXPORT(LOOKALIKES_FEATURES)
 extern const base::Feature kDetectTargetEmbeddingLookalikes;
 
+// This feature enables interstitial warnings for certain punycode domains.
+COMPONENT_EXPORT(LOOKALIKES_FEATURES)
+extern const base::Feature kLookalikeInterstitialForPunycode;
+
 }  // namespace features
 }  // namespace lookalikes
 
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc
index 2e82da34a54..4a350ed122e 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util.cc
@@ -14,6 +14,7 @@
 #include "base/memory/singleton.h"
 #include "base/metrics/field_trial_params.h"
 #include "base/metrics/histogram_macros.h"
+#include "base/strings/string_piece.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversions.h"
@@ -36,6 +37,28 @@ const char kHistogramName[] = "NavigationSuggestion.Event";
 
 namespace {
 
+// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
+// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
+const char kDigitChars[] = "0123456789";
+
+// Minimum length of e2LD protected against target embedding. For example,
+// foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but we don't flag it since
+// "baz" is shorter than kMinE2LDLengthForTargetEmbedding.
+const size_t kMinE2LDLengthForTargetEmbedding = 4;
+
+// This list will be added to the static list of common words so common words
+// could be added to the list using a flag if needed.
+const base::FeatureParam<std::string> kAdditionalCommonWords{
+    &lookalikes::features::kDetectTargetEmbeddingLookalikes,
+    "additional_common_words", ""};
+
+// We might not protect a domain whose e2LD is a common word in target embedding
+// based on the TLD that is paired with it.
+const char* kCommonWords[] = {"shop",  "jobs",     "live",   "info",  "study",
+                              "asahi", "weather",  "health", "forum", "radio",
+                              "ideal", "research", "france", "free",  "mobile",
+                              "sky",   "ask"};
+
 bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
                     const url_formatter::Skeletons& skeletons2) {
   DCHECK(!skeletons1.empty());
@@ -86,27 +109,18 @@ std::string GetSimilarDomainFromTop500(
       }
 
       const std::string top_domain =
-          url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton).domain;
+          url_formatter::LookupSkeletonInTopDomains(
+              top_domain_skeleton, url_formatter::SkeletonType::kFull)
+              .domain;
       DCHECK(!top_domain.empty());
 
-      // If the only difference between the navigated and top
-      // domains is the registry part, this is unlikely to be a spoofing
-      // attempt. Ignore this match and continue. E.g. If the navigated domain
-      // is google.com.tw and the top domain is google.com.tr, this won't
-      // produce a match.
-      const std::string top_domain_without_registry =
-          url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
-      DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
-          top_domain_without_registry));
-      if (navigated_domain.domain_without_registry ==
-          top_domain_without_registry) {
+      if (IsLikelyEditDistanceFalsePositive(navigated_domain,
+                                            GetDomainInfo(top_domain))) {
         continue;
       }
 
       // Skip past domains that are allowed to be spoofed.
-      if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
-                                      url::kStandardSchemeSeparator +
-                                      top_domain))) {
+      if (target_allowlisted.Run(top_domain)) {
         continue;
       }
 
@@ -134,20 +148,12 @@ std::string GetSimilarDomainFromEngagedSites(
           continue;
         }
 
-        // If the only difference between the navigated and engaged
-        // domain is the registry part, this is unlikely to be a spoofing
-        // attempt. Ignore this match and continue. E.g. If the navigated
-        // domain is google.com.tw and the top domain is google.com.tr, this
-        // won't produce a match.
-        if (navigated_domain.domain_without_registry ==
-            engaged_site.domain_without_registry) {
+        if (IsLikelyEditDistanceFalsePositive(navigated_domain, engaged_site)) {
           continue;
         }
 
         // Skip past domains that are allowed to be spoofed.
-        if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
-                                        url::kStandardSchemeSeparator +
-                                        engaged_site.domain_and_registry))) {
+        if (target_allowlisted.Run(engaged_site.domain_and_registry)) {
           continue;
         }
 
@@ -164,61 +170,109 @@ void RecordEvent(NavigationSuggestionEvent event) {
 
 // Returns the parts of the domain that are separated by "." or "-", not
 // including the eTLD.
-std::vector<std::string> SplitDomainWithouteTLDIntoTokens(
+//
+// |host_without_etld| must outlive the return value since the vector contains
+// StringPieces.
+std::vector<base::StringPiece> SplitDomainWithouteTLDIntoTokens(
     const std::string& host_without_etld) {
-  return base::SplitString(host_without_etld, "-.", base::TRIM_WHITESPACE,
-                           base::SPLIT_WANT_NONEMPTY);
+  return base::SplitStringPiece(host_without_etld, "-.", base::TRIM_WHITESPACE,
+                                base::SPLIT_WANT_NONEMPTY);
 }
 
-// Checks whether |domain| is a top domain. If yes, returns true and fills
-// |found_domain| with the matching top domain.
-bool IsTop500Domain(const DomainInfo& domain, std::string* found_domain) {
-  for (auto& skeleton : domain.skeletons) {
-    // Matching with top domains is only done with skeleton matching. We check
-    // if the skeleton of our hostname matches the skeleton of any top domain.
-    url_formatter::TopDomainEntry matched_domain =
-        url_formatter::IDNSpoofChecker().LookupSkeletonInTopDomains(skeleton);
-    // We are only interested in an exact match with a top 500 domain (as
-    // opposed to skeleton match). Here we check that the matched domain is a
-    // top 500 domain and also the hostname of the matched domain is exactly the
-    // same as our input eTLD+1.
-    if (matched_domain.is_top_500 &&
-        matched_domain.domain == domain.domain_and_registry) {
-      *found_domain = matched_domain.domain;
+// Returns whether any subdomain ending in the last entry of |domain_labels| is
+// allowlisted. e.g. if domain_labels = {foo,scholar,google,com}, checks the
+// allowlist for google.com, scholar.google.com, and foo.scholar.google.com.
+bool ASubdomainIsAllowlisted(
+    const base::span<const base::StringPiece>& domain_labels,
+    const LookalikeTargetAllowlistChecker& in_target_allowlist) {
+  DCHECK(domain_labels.size() >= 2);
+  std::string potential_hostname =
+      domain_labels[domain_labels.size() - 1].as_string();
+  // Attach each token from the end to the embedded target to check if that
+  // subdomain has been allowlisted.
+  for (int i = domain_labels.size() - 2; i >= 0; i--) {
+    potential_hostname =
+        domain_labels[i].as_string() + "." + potential_hostname;
+    if (in_target_allowlist.Run(potential_hostname)) {
       return true;
     }
   }
   return false;
 }
 
-// Checks if the targeted domain is allowlisted. To check that we need to
-// check all of the subdomains that could be made. The reason is for example
-// in the case of "foo.scholar.google.com.university.edu", "google.com" is
-// considered as the targeted domain. We need to make sure
-// "scholar.google.com" or "foo.scholar.google.com" are not allowlisted
-// before marking the input domain as a target embedding domain.
-bool ASubdomainIsAllowlisted(
-    const std::string& embedded_target,
-    const base::span<const std::string>& subdomain_labels_so_far,
-    const LookalikeTargetAllowlistChecker& in_target_allowlist) {
-  const std::string https_scheme =
-      url::kHttpsScheme + std::string(url::kStandardSchemeSeparator);
+// Returns the top domain if the top domain without its separators matches the
+// |potential_target| (e.g. googlecom). The matching is a skeleton matching.
+std::string GetMatchingTopDomainWithoutSeparators(
+    const base::StringPiece& potential_target) {
+  const url_formatter::Skeletons skeletons =
+      url_formatter::GetSkeletons(base::UTF8ToUTF16(potential_target));
 
-  if (in_target_allowlist.Run(GURL(https_scheme + embedded_target))) {
+  for (const auto& skeleton : skeletons) {
+    url_formatter::TopDomainEntry matched_domain =
+        url_formatter::LookupSkeletonInTopDomains(
+            skeleton, url_formatter::SkeletonType::kSeparatorsRemoved);
+    if (!matched_domain.domain.empty() &&
+        matched_domain.skeleton_type ==
+            url_formatter::SkeletonType::kSeparatorsRemoved) {
+      return matched_domain.domain;
+    }
+  }
+  return std::string();
+}
+
+// Returns if |domain| shares the skeleton of an eTLD+1 with an engaged
+// site or a top 500 domain. |embedded_target| is set to matching eTLD+1.
+bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
+    const DomainInfo& domain,
+    const std::vector<DomainInfo>& engaged_sites,
+    std::string* embedded_target) {
+  for (const auto& skeleton : domain.skeletons) {
+    for (const auto& engaged_site : engaged_sites) {
+      if (base::Contains(engaged_site.skeletons, skeleton)) {
+        *embedded_target = engaged_site.domain_and_registry;
+        return true;
+      }
+    }
+  }
+  for (const auto& skeleton : domain.skeletons) {
+    const url_formatter::TopDomainEntry top_domain =
+        url_formatter::LookupSkeletonInTopDomains(
+            skeleton, url_formatter::SkeletonType::kFull);
+    if (!top_domain.domain.empty() && top_domain.is_top_500) {
+      *embedded_target = top_domain.domain;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns whether |domain|.domain_without_registry is a common word, which is
+// a common indication of a likely false positive.
+bool UsesCommonWord(const DomainInfo& domain) {
+  std::vector<std::string> additional_common_words =
+      base::SplitString(kAdditionalCommonWords.Get(), ",",
+                        base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+  if (base::Contains(additional_common_words, domain.domain_without_registry)) {
     return true;
   }
-  std::string potential_hostname = embedded_target;
-  // Attach each token from the end to the embedded target to check if that
-  // subdomain has been allowlisted.
-  for (int i = subdomain_labels_so_far.size() - 1; i >= 0; i--) {
-    potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname;
-    if (in_target_allowlist.Run(GURL(https_scheme + potential_hostname))) {
+  for (auto* common_word : kCommonWords) {
+    if (domain.domain_without_registry == common_word) {
       return true;
     }
   }
   return false;
 }
 
+// A domain is allowed to be embedded if its e2LD is a common word or any
+// valid partial subdomain is allowlisted.
+bool IsAllowedToBeEmbedded(
+    const DomainInfo& embedded_target,
+    const base::span<const base::StringPiece>& subdomain_span,
+    const LookalikeTargetAllowlistChecker& in_target_allowlist) {
+  return UsesCommonWord(embedded_target) ||
+         ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist);
+}
+
 }  // namespace
 
 DomainInfo::DomainInfo(const std::string& arg_hostname,
@@ -236,14 +290,14 @@ DomainInfo::~DomainInfo() = default;
 
 DomainInfo::DomainInfo(const DomainInfo&) = default;
 
-DomainInfo GetDomainInfo(const GURL& url) {
-  if (net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host())) {
+DomainInfo GetDomainInfo(const std::string& hostname) {
+  if (net::HostStringIsLocalhost(hostname) ||
+      net::IsHostnameNonUnique(hostname)) {
     return DomainInfo(std::string(), std::string(), std::string(),
                       url_formatter::IDNConversionResult(),
                       url_formatter::Skeletons());
   }
-  const std::string hostname = url.host();
-  const std::string domain_and_registry = GetETLDPlusOne(url.host());
+  const std::string domain_and_registry = GetETLDPlusOne(hostname);
   const std::string domain_without_registry =
       domain_and_registry.empty()
           ? std::string()
@@ -268,6 +322,10 @@ DomainInfo GetDomainInfo(const GURL& url) {
                     idn_result, skeletons);
 }
 
+DomainInfo GetDomainInfo(const GURL& url) {
+  return GetDomainInfo(url.host());
+}
+
 std::string GetETLDPlusOne(const std::string& hostname) {
   return net::registry_controlled_domains::GetDomainAndRegistry(
       hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
@@ -314,12 +372,67 @@ bool IsEditDistanceAtMostOne(const base::string16& str1,
   return edit_count <= 1;
 }
 
+bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
+                                       const DomainInfo& matched_domain) {
+  DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
+      matched_domain.domain_and_registry));
+  DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
+      navigated_domain.domain_and_registry));
+  // If the only difference between the domains is the registry part, this is
+  // unlikely to be a spoofing attempt and we should ignore this match.  E.g.
+  // exclude matches like google.com.tw and google.com.tr.
+  if (navigated_domain.domain_without_registry ==
+      matched_domain.domain_without_registry) {
+    return true;
+  }
+
+  // If the domains only differ by a numeric suffix on their e2LD (e.g.
+  // site45.tld and site35.tld), then ignore the match.
+  auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry,
+                                      kDigitChars, base::TRIM_TRAILING);
+  auto matched_trimmed = base::TrimString(
+      matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING);
+  DCHECK_NE(navigated_domain.domain_without_registry,
+            matched_domain.domain_without_registry);
+  // We previously verified that the domains without registries weren't equal,
+  // so if they're equal now, the match must have come from numeric suffixes.
+  if (nav_trimmed == matched_trimmed) {
+    return true;
+  }
+
+  // Ignore domains that only differ by an insertion/substitution at the
+  // start, as these are usually different words, not lookalikes.
+  const auto nav_dom_len = navigated_domain.domain_and_registry.length();
+  const auto matched_dom_len = matched_domain.domain_and_registry.length();
+  const auto& nav_dom = navigated_domain.domain_and_registry;
+  const auto& matched_dom = matched_domain.domain_and_registry;
+  if (nav_dom_len == matched_dom_len) {
+    // e.g. hank vs tank
+    if (nav_dom.substr(1) == matched_dom.substr(1)) {
+      return true;
+    }
+  } else if (nav_dom_len < matched_dom_len) {
+    // e.g. oodle vs poodle
+    if (nav_dom == matched_dom.substr(1)) {
+      return true;
+    }
+  } else {  // nav_dom_len > matched_dom_len
+    // e.g. poodle vs oodle
+    if (nav_dom.substr(1) == matched_dom) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool IsTopDomain(const DomainInfo& domain_info) {
   // Top domains are only accessible through their skeletons, so query the top
   // domains trie for each skeleton of this domain.
   for (const std::string& skeleton : domain_info.skeletons) {
     const url_formatter::TopDomainEntry top_domain =
-        url_formatter::LookupSkeletonInTopDomains(skeleton);
+        url_formatter::LookupSkeletonInTopDomains(
+            skeleton, url_formatter::SkeletonType::kFull);
     if (domain_info.domain_and_registry == top_domain.domain) {
       return true;
     }
@@ -402,12 +515,18 @@ bool GetMatchingDomain(
     }
   }
 
-  if (IsTargetEmbeddingLookalike(navigated_domain.hostname, engaged_sites,
-                                 in_target_allowlist, matched_domain)) {
+  TargetEmbeddingType embedding_type =
+      GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites,
+                             in_target_allowlist, matched_domain);
+  if (embedding_type == TargetEmbeddingType::kSafetyTip) {
+    *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips;
+    return true;
+  } else if (embedding_type == TargetEmbeddingType::kInterstitial) {
     *match_type = LookalikeUrlMatchType::kTargetEmbedding;
     return true;
   }
 
+  DCHECK(embedding_type == TargetEmbeddingType::kNone);
   return false;
 }
 
@@ -431,84 +550,89 @@ void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) {
     case LookalikeUrlMatchType::kSkeletonMatchTop5k:
       RecordEvent(NavigationSuggestionEvent::kMatchSkeletonTop5k);
       break;
+    case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips:
+      RecordEvent(
+          NavigationSuggestionEvent::kMatchTargetEmbeddingForSafetyTips);
+      break;
+    case LookalikeUrlMatchType::kFailedSpoofChecks:
+      RecordEvent(NavigationSuggestionEvent::kFailedSpoofChecks);
+      break;
     case LookalikeUrlMatchType::kNone:
       break;
   }
 }
 
-bool IsTargetEmbeddingLookalike(
+TargetEmbeddingType GetTargetEmbeddingType(
     const std::string& hostname,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
     std::string* safe_hostname) {
   const std::string host_without_etld =
       url_formatter::top_domains::HostnameWithoutRegistry(hostname);
-  const std::vector<std::string> hostname_tokens_without_etld =
+  const std::vector<base::StringPiece> hostname_tokens_without_etld =
       SplitDomainWithouteTLDIntoTokens(host_without_etld);
 
-  // For each token, we look backwards to the previous token to see if
-  // "|prev_token|.|token|" forms a top domain or a high engaged domain.
-  std::string prev_token;
-
-  // We can have domains separated by '-'s or '.'s. In order to find target
-  // embedding urls with google.com.com or google-com.com, we get url parts as
-  // anything that is between two '-'s or '.'s. We check to see if any two
-  // consecutive tokens form a top or highly-engaged domain.
-  // Because of the way this matching is working, we can not identify target
-  // embedding attacks against domains that contain '-' in their address
-  // (e.g programme-tv.net). Also if the eTLD of the target has more than one
-  // part, we won't be able to protect it (e.g. google.co.uk).
-  for (size_t i = 0; i < hostname_tokens_without_etld.size(); i++) {
-    const std::string token = hostname_tokens_without_etld[i];
-    const std::string possible_embedded_target = prev_token + "." + token;
-    if (prev_token.empty()) {
-      prev_token = token;
-      continue;
+  // There are O(n^2) potential target embeddings in a domain name. We want to
+  // be comprehensive, but optimize so that usually we needn't check all of
+  // them. We do that by sweeping from the back of the embedding domain, towards
+  // the front, checking for a valid eTLD. If we find one, then we consider the
+  // possible embedded domains that end in that eTLD (i.e. all possible start
+  // points from the beginning of the string onward).
+  for (int end = hostname_tokens_without_etld.size(); end > 0; --end) {
+    base::span<const base::StringPiece> etld_check_span(
+        hostname_tokens_without_etld.data(), end);
+    std::string etld_check_host = base::JoinString(etld_check_span, ".");
+    auto etld_check_dominfo = GetDomainInfo(etld_check_host);
+
+    // Check if the final token is a no-separator target (e.g. "googlecom").
+    // This check happens first so that we can exclude invalid eTLD+1s next.
+    std::string embedded_target = GetMatchingTopDomainWithoutSeparators(
+        hostname_tokens_without_etld[end - 1]);
+    if (!embedded_target.empty() &&
+        !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
+                               in_target_allowlist)) {
+      *safe_hostname = embedded_target;
+      return TargetEmbeddingType::kInterstitial;
     }
-    prev_token = token;
 
-    // Short domains are more likely to be misidentified as being embedded. For
-    // example "mi.com", "mk.ru", or "com.ru" are a few examples of domains that
-    // could trigger the target embedding heuristic falsely.
-    if (possible_embedded_target.size() < 7) {
+    // Exclude otherwise-invalid eTLDs.
+    if (etld_check_dominfo.domain_without_registry.empty()) {
       continue;
     }
 
-    // We want to protect user's high engaged websites as well as top domains.
-    GURL possible_target_url(url::kHttpsScheme +
-                             std::string(url::kStandardSchemeSeparator) +
-                             possible_embedded_target);
-    DomainInfo possible_target_domain = GetDomainInfo(possible_target_url);
-    // We check if the eTLD+1 is a valid domain, otherwise there is no point in
-    // checking if it is a top domain or an engaged domain.
-    if (possible_target_domain.domain_and_registry.empty()) {
+    // Exclude e2LDs that are too short. <= because domain_without_registry has
+    // a trailing ".".
+    if (etld_check_dominfo.domain_without_registry.length() <=
+        kMinE2LDLengthForTargetEmbedding) {
       continue;
     }
 
-    *safe_hostname =
-        GetMatchingSiteEngagementDomain(engaged_sites, possible_target_domain);
-    // |GetMatchingSiteEngagementDomain| uses skeleton matching, we make sure
-    // the found engaged site is an exact match of the embedded target.
-    if (*safe_hostname != possible_embedded_target) {
-      *safe_hostname = std::string();
-    }
-    if (safe_hostname->empty() &&
-        !IsTop500Domain(possible_target_domain, safe_hostname)) {
-      continue;
+    // Check for exact matches against engaged sites, among all possible
+    // subdomains ending at |end|.
+    for (int start = 0; start < end - 1; ++start) {
+      const base::span<const base::StringPiece> span(
+          (hostname_tokens_without_etld.data() + start), end - start);
+      auto embedded_hostname = base::JoinString(span, ".");
+      auto embedded_dominfo = GetDomainInfo(embedded_hostname);
+
+      for (auto& engaged_site : engaged_sites) {
+        if (engaged_site.hostname == embedded_dominfo.hostname &&
+            !IsAllowedToBeEmbedded(embedded_dominfo, span,
+                                   in_target_allowlist)) {
+          *safe_hostname = engaged_site.hostname;
+          return TargetEmbeddingType::kInterstitial;
+        }
+      }
     }
 
-    // Check if any subdomain is allowlisted.
-    std::vector<std::string> subdomain_labels_so_far(
-        hostname_tokens_without_etld.begin(),
-        hostname_tokens_without_etld.begin() + i - 1);
-    if (!ASubdomainIsAllowlisted(possible_embedded_target,
-                                 subdomain_labels_so_far,
-                                 in_target_allowlist)) {
-      return true;
+    // There were no exact engaged site matches, but there may yet still be a
+    // match against the eTLD+1 of an engaged or top site.
+    if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
+            etld_check_dominfo, engaged_sites, safe_hostname) &&
+        !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
+                               in_target_allowlist)) {
+      return TargetEmbeddingType::kInterstitial;
     }
-
-    // A target is found but it was allowlisted.
-    *safe_hostname = std::string();
   }
-  return false;
+  return TargetEmbeddingType::kNone;
 }
diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h
index 1bc49c24384..00946f6d909 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util.h
+++ b/chromium/components/lookalikes/core/lookalike_url_util.h
@@ -20,7 +20,16 @@ extern const char kHistogramName[];
 }
 
 using LookalikeTargetAllowlistChecker =
-    base::RepeatingCallback<bool(const GURL&)>;
+    base::RepeatingCallback<bool(const std::string&)>;
+
+// Used for |GetTargetEmbeddingType| return value. It shows if the target
+// embedding triggers on the input domain, and if it does, what type of warning
+// should be shown to the user.
+enum class TargetEmbeddingType {
+  kNone = 0,
+  kInterstitial = 1,
+  kSafetyTip = 2,
+};
 
 // Used for UKM. There is only a single LookalikeUrlMatchType per navigation.
 enum class LookalikeUrlMatchType {
@@ -33,10 +42,16 @@ enum class LookalikeUrlMatchType {
   kTargetEmbedding = 5,
   kSkeletonMatchTop500 = 6,
   kSkeletonMatchTop5k = 7,
+  kTargetEmbeddingForSafetyTips = 8,
+
+  // The domain name failed IDN spoof checks but didn't match a safe hostname.
+  // As a result, there is no URL to suggest to the user in the form of "Did
+  // you mean <url>?".
+  kFailedSpoofChecks = 9,
 
   // Append new items to the end of the list above; do not modify or replace
   // existing values. Comment out obsolete items.
-  kMaxValue = kSkeletonMatchTop5k,
+  kMaxValue = kFailedSpoofChecks,
 };
 
 // Used for UKM. There is only a single LookalikeUrlBlockingPageUserAction per
@@ -66,10 +81,12 @@ enum class NavigationSuggestionEvent {
   kMatchTargetEmbedding = 7,
   kMatchSkeletonTop500 = 8,
   kMatchSkeletonTop5k = 9,
+  kMatchTargetEmbeddingForSafetyTips = 10,
+  kFailedSpoofChecks = 11,
 
   // Append new items to the end of the list above; do not modify or
   // replace existing values. Comment out obsolete items.
-  kMaxValue = kMatchSkeletonTop5k,
+  kMaxValue = kFailedSpoofChecks,
 };
 
 struct DomainInfo {
@@ -99,9 +116,12 @@ struct DomainInfo {
   DomainInfo(const DomainInfo& other);
 };
 
-// Returns a DomainInfo instance computed from |url|. Will return empty fields
-// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is
-// empty.
+// Returns a DomainInfo instance computed from |hostname|. Will return empty
+// fields for non-unique hostnames (e.g. site.test), localhost or sites whose
+// eTLD+1 is empty.
+DomainInfo GetDomainInfo(const std::string& hostname);
+
+// Convenience function for returning GetDomainInfo(url.host()).
 DomainInfo GetDomainInfo(const GURL& url);
 
 // Returns true if the Levenshtein distance between |str1| and |str2| is at most
@@ -110,6 +130,13 @@ DomainInfo GetDomainInfo(const GURL& url);
 bool IsEditDistanceAtMostOne(const base::string16& str1,
                              const base::string16& str2);
 
+// Returns whether |navigated_domain| and |matched_domain| are likely to be edit
+// distance false positives, and thus the user should *not* be warned.
+//
+// Assumes |navigated_domain| and |matched_domain| are edit distance matches.
+bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
+                                       const DomainInfo& matched_domain);
+
 // Returns true if the domain given by |domain_info| is a top domain.
 bool IsTopDomain(const DomainInfo& domain_info);
 
@@ -141,14 +168,19 @@ void RecordUMAFromMatchType(LookalikeUrlMatchType match_type);
 // |safe_hostname| to the url of the embedded target domain.
 // At the moment we consider the following cases as Target Embedding:
 // example-google.com-site.com, example.google.com-site.com,
-// example-google-com-site.com, example.google.com.site.com,
+// example-google-info-site.com, example.google.com.site.com,
 // example-googlé.com-site.com where the embedded target is google.com. We
 // detect embeddings of top 500 domains and engaged domains. However, to reduce
 // false positives, we do not protect domains that are shorter than 7 characters
 // long (e.g. com.ru).
 // This function checks possible targets against |in_target_allowlist| to skip
 // permitted embeddings.
-bool IsTargetEmbeddingLookalike(
+// If no target embedding is found, the return value will be set to |kNone|.
+// When the target is embedded with another TLD instead of its actual TLD, it
+// should trigger a Safety Tip when the embedded TLD is a ccTLD. In this
+// situation, return value will be |kSafetyTip|. All the other triggers will
+// result in a |kInterstitial| return value.
+TargetEmbeddingType GetTargetEmbeddingType(
     const std::string& hostname,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
index 4af557b6ad9..1aed2eddeec 100644
--- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
+++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc
@@ -6,6 +6,8 @@
 
 #include "base/bind.h"
 #include "base/strings/utf_string_conversions.h"
+#include "base/test/scoped_feature_list.h"
+#include "components/lookalikes/core/features.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
 TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
@@ -63,102 +65,198 @@ TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
     bool result =
         IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
                                 base::WideToUTF16(test_case.top_domain));
-    EXPECT_EQ(test_case.expected, result);
+    EXPECT_EQ(test_case.expected, result)
+        << "when comparing " << test_case.domain << " with "
+        << test_case.top_domain;
   }
 }
 
-bool IsGoogleScholar(const GURL& hostname) {
-  return hostname.host() == "scholar.google.com";
+TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) {
+  const struct TestCase {
+    const char* domain;
+    const char* top_domain;
+    bool is_likely_false_positive;
+  } kTestCases[] = {
+      // Most edit distance instances are not likely false positives.
+      {"abcxd.com", "abcyd.com", false},   // Substitution
+      {"abcxd.com", "abcxxd.com", false},  // Deletion
+      {"abcxxd.com", "abcxd.com", false},  // Insertion
+
+      // But we permit cases where the only difference is in the tld.
+      {"abcde.com", "abcde.net", true},
+
+      // We also permit matches that are only due to a numeric suffix,
+      {"abcd1.com", "abcd2.com", true},    // Substitution
+      {"abcde.com", "abcde1.com", true},   // Numeric deletion
+      {"abcde1.com", "abcde.com", true},   // Numeric insertion
+      {"abcd11.com", "abcd21.com", true},  // Not-final-digit substitution
+      {"a.abcd1.com", "abcd2.com", true},  // Only relevant for eTLD+1.
+      // ...and that change must be due to the numeric suffix.
+      {"abcx1.com", "abcy1.com", false},   // Substitution before suffix
+      {"abcd1.com", "abcde1.com", false},  // Deletion before suffix
+      {"abcde1.com", "abcd1.com", false},  // Insertion before suffix
+      {"abcdx.com", "abcdy.com", false},   // Non-numeric substitution at end
+
+      // We also permit matches that are only due to a first-character change,
+      {"xabcd.com", "yabcd.com", true},     // Substitution
+      {"xabcde.com", "abcde.com", true},    // Insertion
+      {"abcde.com", "xabcde.com", true},    // Deletion
+      {"a.abcde.com", "xabcde.com", true},  // For eTLD+1
+      // ...so long as that change is only on the first character, not later.
+      {"abcde.com", "axbcde.com", false},   // Deletion
+      {"axbcde.com", "abcde.com", false},   // Insertion
+      {"axbcde.com", "aybcde.com", false},  // Substitution
+  };
+  for (const TestCase& test_case : kTestCases) {
+    auto navigated =
+        GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
+                           url::kStandardSchemeSeparator + test_case.domain));
+    auto matched = GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
+                                      url::kStandardSchemeSeparator +
+                                      test_case.top_domain));
+    bool result = IsLikelyEditDistanceFalsePositive(navigated, matched);
+    EXPECT_EQ(test_case.is_likely_false_positive, result)
+        << "when comparing " << test_case.domain << " with "
+        << test_case.top_domain;
+  }
 }
 
-TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
-  const std::vector<DomainInfo> engaged_sites = {
-      GetDomainInfo(GURL("https://highengagement.com"))};
-  const struct TargetEmbeddingHeuristicTestCase {
-    const GURL url;
-    bool should_trigger;
-  } kTestCases[] = {
+bool IsGoogleScholar(const std::string& hostname) {
+  return hostname == "scholar.google.com";
+}
 
-      // Scheme should not affect the outcome.
-      {GURL("http://google.com.com"), true},
-      {GURL("https://google.com.com"), true},
+struct TargetEmbeddingHeuristicTestCase {
+  const std::string hostname;
+  // Empty when there is no match.
+  const std::string expected_safe_host;
+  const TargetEmbeddingType expected_type;
+};
 
+TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
+  const std::vector<DomainInfo> kEngagedSites = {
+      GetDomainInfo(GURL("https://highengagement.com")),
+      GetDomainInfo(GURL("https://highengagement.co.uk")),
+      GetDomainInfo(GURL("https://subdomain.highengagement.com")),
+      GetDomainInfo(GURL("https://subdomain.google.com")),
+  };
+  const std::vector<TargetEmbeddingHeuristicTestCase> kTestCases = {
       // The length of the url should not affect the outcome.
-      {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-"
-            "outcome-of-this-target-embedding-test-google.com-login.com"),
-       true},
-      {GURL(
-           "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-"
-           "outcome-of-this-target-embedding-test.com-login.com"),
-       false},
-      {GURL(
-           "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-"
-           "outcome-of-this-target-embedding-test.com-login.com"),
-       false},
+      {"this-is-a-very-long-url-but-it-should-not-affect-the-"
+       "outcome-of-this-target-embedding-test-google.com-login.com",
+       "google.com", TargetEmbeddingType::kInterstitial},
+      {"google-com-this-is-a-very-long-url-but-it-should-not-affect-"
+       "the-outcome-of-this-target-embedding-test-login.com",
+       "google.com", TargetEmbeddingType::kInterstitial},
+      {"this-is-a-very-long-url-but-it-should-not-affect-google-the-"
+       "outcome-of-this-target-embedding-test.com-login.com",
+       "", TargetEmbeddingType::kNone},
+      {"google-this-is-a-very-long-url-but-it-should-not-affect-the-"
+       "outcome-of-this-target-embedding-test.com-login.com",
+       "", TargetEmbeddingType::kNone},
 
       // We need exact skeleton match for our domain so exclude edit-distance
       // matches.
-      {GURL("http://goog0le.com-login.com"), false},
+      {"goog0le.com-login.com", "", TargetEmbeddingType::kNone},
 
-      // Unicode characters are currently not handled. As a result, target
-      // embedding sites that embed lookalikes of top domains aren't flagged.
-      {GURL("http://googlé.com-login.com"), false},
-      {GURL("http://sth-googlé.com-sth.com"), false},
+      // Unicode characters should be handled.
+      {"googlé.com-login.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+      {"foo-googlé.com-bar.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
 
-      // The basic state
-      {GURL("http://google.com.sth.com"), true},
+      // The basic states
+      {"google.com.foo.com", "google.com", TargetEmbeddingType::kInterstitial},
       // - before the domain name should be ignored.
-      {GURL("http://sth-google.com-sth.com"), true},
-
+      {"foo-google.com-bar.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
       // The embedded target's TLD doesn't necessarily need to be followed by a
       // '-' and could be a subdomain by itself.
-      {GURL("http://sth-google.com.sth.com"), true},
-      {GURL("http://a.b.c.d.e.f.g.h.sth-google.com.sth.com"), true},
-      {GURL("http://a.b.c.d.e.f.g.h.google.com-sth.com"), true},
-      {GURL("http://1.2.3.4.5.6.google.com-sth.com"), true},
-
+      {"foo-google.com.foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+      {"a.b.c.d.e.f.g.h.foo-google.com.foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+      {"a.b.c.d.e.f.g.h.google.com-foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+      {"1.2.3.4.5.6.google.com-foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
       // Target domain could be in the middle of subdomains.
-      {GURL("http://sth.google.com.sth.com"), true},
-      {GURL("http://sth.google.com-sth.com"), true},
-
+      {"foo.google.com.foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
       // The target domain and its tld should be next to each other.
-      {GURL("http://sth-google.l.com-sth.com"), false},
-
-      // Target domain should match only with its actual TLD.
-      {GURL("http://google.edu.com"), false},
+      {"foo-google.l.com-foo.com", "", TargetEmbeddingType::kNone},
       // Target domain might be separated with a dash instead of dot.
-      {GURL("http://sth.google-com-sth.com"), true},
-      // Target domain could be an engaged domain
-      {GURL("http://highengagement-com-login.com"), true},
-      // If target domain is an allowlisted domain, it should not trigger the
-      // heuristic.
-      {GURL("http://foo.scholar.google-com-login.com"), false},
-      // An allowlisted domain will make sure it is not marked as an embedded
-      // target. However, other targets could still be embedded in the domain.
-      {GURL("http://foo.google.com.scholar.google-com-login.com"), true},
-      {GURL("http://foo.scholar.google-com.google.com-login.com"), true},
-
-      // Ensure legitimate domains don't trigger the heuristic.
-      {GURL("http://google.com"), false},
-      {GURL("http://google.co.uk"), false},
-      {GURL("http://google.randomreg-login.com"), false},
+      {"foo.google-com-foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+
+      // Allowlisted domains should not trigger heuristic.
+      {"scholar.google.com.foo.com", "", TargetEmbeddingType::kNone},
+      {"scholar.google.com-google.com.foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+      {"google.com-scholar.google.com.foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+      {"foo.scholar.google.com.foo.com", "", TargetEmbeddingType::kNone},
+      {"scholar.foo.google.com.foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+
+      // Targets should be longer than 6 characters.
+      {"hp.com-foo.com", "", TargetEmbeddingType::kNone},
+
+      // Targets whose e2LD is a common word are not considered embedded
+      // targets, either for every TLD or only when embedded with another TLD.
+      {"foo.jobs.com-foo.com", "", TargetEmbeddingType::kNone},
+      {"foo.office.com-foo.com", "office.com",
+       TargetEmbeddingType::kInterstitial},
+      {"foo.jobs.org-foo.com", "", TargetEmbeddingType::kNone},
+      {"foo.office.org-foo.com", "", TargetEmbeddingType::kNone},
+
+      // Targets could be embedded without their dots and dashes.
+      {"foo.googlecom-foo.com", "google.com",
+       TargetEmbeddingType::kInterstitial},
+
+      // Ensure legitimate domains don't trigger.
+      {"foo.google.com", "", TargetEmbeddingType::kNone},
+      {"foo.bar.google.com", "", TargetEmbeddingType::kNone},
+      {"google.com", "", TargetEmbeddingType::kNone},
+      {"google.co.uk", "", TargetEmbeddingType::kNone},
+      {"google.randomreg-login.com", "", TargetEmbeddingType::kNone},
+      {"com.foo.com", "", TargetEmbeddingType::kNone},
+
+      // Multipart eTLDs should work.
+      {"foo.google.co.uk.foo.com", "google.co.uk",
+       TargetEmbeddingType::kInterstitial},
+      {"foo.highengagement-co-uk.foo.com", "highengagement.co.uk",
+       TargetEmbeddingType::kInterstitial},
+
+      // Engaged sites should trigger as specifically as possible, and should
+      // trigger preferentially to top sites when possible.
+      {"foo.highengagement.com.foo.com", "highengagement.com",
+       TargetEmbeddingType::kInterstitial},
+      {"foo.subdomain.highengagement.com.foo.com",
+       "subdomain.highengagement.com", TargetEmbeddingType::kInterstitial},
+      {"foo.subdomain.google.com.foo.com", "subdomain.google.com",
+       TargetEmbeddingType::kInterstitial},
 
+      // Skeleton matching should work against engaged sites at the eTLD level.
+      {"subdomain.highéngagement.com-foo.com", "highengagement.com",
+       TargetEmbeddingType::kInterstitial},
   };
 
-  for (const auto& kTestCase : kTestCases) {
+  for (auto& test_case : kTestCases) {
     std::string safe_hostname;
-    if (kTestCase.should_trigger) {
-      EXPECT_TRUE(IsTargetEmbeddingLookalike(
-          kTestCase.url.host(), engaged_sites,
-          base::BindRepeating(&IsGoogleScholar), &safe_hostname))
-          << "Expected that \"" << kTestCase.url
-          << " should trigger but it didn't.";
+    TargetEmbeddingType embedding_type = GetTargetEmbeddingType(
+        test_case.hostname, kEngagedSites,
+        base::BindRepeating(&IsGoogleScholar), &safe_hostname);
+    if (test_case.expected_type != TargetEmbeddingType::kNone) {
+      EXPECT_EQ(safe_hostname, test_case.expected_safe_host)
+          << test_case.hostname << " should trigger on "
+          << test_case.expected_safe_host << ", but "
+          << (safe_hostname.empty() ? "it didn't trigger at all."
+                                    : "triggered on " + safe_hostname);
+      EXPECT_EQ(embedding_type, test_case.expected_type);
     } else {
-      EXPECT_FALSE(IsTargetEmbeddingLookalike(
-          kTestCase.url.host(), engaged_sites,
-          base::BindRepeating(&IsGoogleScholar), &safe_hostname))
-          << "Expected that \"" << kTestCase.url
-          << " shouldn't trigger but it did. For URL: " << safe_hostname;
+      EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone)
+          << test_case.hostname << " unexpectedly triggered on "
+          << safe_hostname;
     }
   }
 }
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2020-10-12 14:27:29 +0200
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2020-10-13 09:35:20 +0000
commit	c30a6232df03e1efbd9f3b226777b07e087a1122 (patch)
tree	e992f45784689f373bcc38d1b79a239ebe17ee23 /chromium/components/lookalikes
parent	7b5b123ac58f58ffde0f4f6e488bcd09aa4decd3 (diff)
download	qtwebengine-chromium-c30a6232df03e1efbd9f3b226777b07e087a1122.tar.gz