BASELINE: Update Chromium to 83.0.4103.122

Change-Id: Ie3a82f5bb0076eec2a7c6a6162326b4301ee291e Reviewed-by: Michael Brüning <michael.bruning@qt.io>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-07-16 11:45:35 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2020-07-17 08:59:23 +0000
commit: 552906b0f222c5d5dd11b9fd73829d510980461a (patch)
tree: 3a11e6ed0538a81dd83b20cf3a4783e297f26d91 /chromium/components/lookalikes
parent: 1b05827804eaf047779b597718c03e7d38344261 (diff)
download: qtwebengine-chromium-552906b0f222c5d5dd11b9fd73829d510980461a.tar.gz
7 files changed, 799 insertions, 0 deletions
diff --git a/chromium/components/lookalikes/BUILD.gn b/chromium/components/lookalikes/BUILD.gn
new file mode 100644
index 00000000000..c28b4f05f45
--- /dev/null
+++ b/chromium/components/lookalikes/BUILD.gn
@@ -0,0 +1,32 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import("//build/config/jumbo.gni")
+
+# Static library containing the shared lookalike URL utilities
+# (lookalike_url_util.{cc,h}).
+jumbo_static_library("lookalikes") {
+  sources = [
+    "lookalike_url_util.cc",
+    "lookalike_url_util.h",
+  ]
+
+  # NOTE(review): lookalike_url_util.h includes "url/gurl.h", but no url target
+  # is listed below -- confirm it is reached transitively through these deps.
+  deps = [
+    "//base",
+    "//components/security_state/core:features",
+    "//components/url_formatter",
+    "//components/url_formatter/spoof_checks/top_domains:common",
+    "//components/url_formatter/spoof_checks/top_domains:top500_domains",
+    "//components/url_formatter/spoof_checks/top_domains:top500_domains_header",
+    "//net",
+  ]
+}
+
+# Unit tests for the utilities in the ":lookalikes" target.
+jumbo_source_set("unit_tests") {
+  testonly = true
+  sources = [ "lookalike_url_util_unittest.cc" ]
+
+  # NOTE(review): the test includes base/strings/utf_string_conversions.h and
+  # uses GURL, but neither //base nor a url target is listed -- confirm they
+  # are provided transitively.
+  deps = [
+    ":lookalikes",
+    "//net:test_support",
+    "//testing/gtest",
+  ]
+}
diff --git a/chromium/components/lookalikes/DEPS b/chromium/components/lookalikes/DEPS
new file mode 100644
index 00000000000..563bb1e8d24
--- /dev/null
+++ b/chromium/components/lookalikes/DEPS
@@ -0,0 +1,5 @@
+include_rules = [
+  "+components/security_state",
+  "+components/url_formatter",
+  "+net/base",
+]
diff --git a/chromium/components/lookalikes/OWNERS b/chromium/components/lookalikes/OWNERS
new file mode 100644
index 00000000000..1d70c9fef31
--- /dev/null
+++ b/chromium/components/lookalikes/OWNERS
@@ -0,0 +1,3 @@
+file://chrome/browser/lookalikes/OWNERS
+
+# COMPONENT: UI>Browser>Interstitials
diff --git a/chromium/components/lookalikes/README b/chromium/components/lookalikes/README
new file mode 100644
index 00000000000..7aa6eeaccef
--- /dev/null
+++ b/chromium/components/lookalikes/README
@@ -0,0 +1,4 @@
+This directory contains shared code used for the Lookalike URL blocking page.
+
+The lookalike interstitial is triggered when a user visits a domain that
+looks similar to the domain of a popular site.
+\ No newline at end of file
diff --git a/chromium/components/lookalikes/lookalike_url_util.cc b/chromium/components/lookalikes/lookalike_url_util.cc
new file mode 100644
index 00000000000..fa386d5d6f8
--- /dev/null
+++ b/chromium/components/lookalikes/lookalike_url_util.cc
@@ -0,0 +1,436 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/lookalikes/lookalike_url_util.h"
+
+#include <utility>
+
+#include "base/bind.h"
+#include "base/callback.h"
+#include "base/macros.h"
+#include "base/memory/scoped_refptr.h"
+#include "base/memory/singleton.h"
+#include "base/metrics/field_trial_params.h"
+#include "base/metrics/histogram_macros.h"
+#include "base/strings/string_split.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/task/post_task.h"
+#include "base/task/thread_pool.h"
+#include "base/time/default_clock.h"
+#include "components/security_state/core/features.h"
+#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
+#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
+#include "components/url_formatter/url_formatter.h"
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
+#include "net/base/url_util.h"
+
+namespace lookalikes {
+
+// Name of the enumeration histogram that RecordEvent() in this file reports
+// NavigationSuggestionEvent values to.
+const char kHistogramName[] = "NavigationSuggestion.Event";
+// Comma-separated list of TLDs, exposed as the
+// "targetembedding_important_tlds" parameter of the kSafetyTipUI feature.
+// GetMatchingDomain() splits this list and hands the resulting set to
+// IsTargetEmbeddingLookalike().
+const base::FeatureParam<std::string> kImportantTlds{
+    &security_state::features::kSafetyTipUI, "targetembedding_important_tlds",
+    "com,edu,org,gov"};
+
+}  // namespace lookalikes
+
+namespace {
+
+// Returns true if |skeletons1| and |skeletons2| share at least one skeleton.
+// Both sets are expected to be non-empty.
+bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
+                    const url_formatter::Skeletons& skeletons2) {
+  DCHECK(!skeletons1.empty());
+  DCHECK(!skeletons2.empty());
+  for (const std::string& candidate : skeletons1) {
+    if (!base::Contains(skeletons2, candidate)) {
+      continue;
+    }
+    // Found a skeleton present in both sets.
+    return true;
+  }
+  return false;
+}
+
+// Returns the eTLD+1 of the first entry in |engaged_sites| that shares a
+// skeleton with |navigated_domain|, i.e. an engaged site that the navigated
+// domain may be attempting to spoof. Returns an empty string when no engaged
+// site matches. |navigated_domain| must have a non-empty eTLD+1, and no engaged
+// site is expected to have the same eTLD+1 as the navigated domain.
+std::string GetMatchingSiteEngagementDomain(
+    const std::vector<DomainInfo>& engaged_sites,
+    const DomainInfo& navigated_domain) {
+  DCHECK(!navigated_domain.domain_and_registry.empty());
+  const url_formatter::Skeletons& navigated_skeletons =
+      navigated_domain.skeletons;
+  for (const DomainInfo& candidate : engaged_sites) {
+    const std::string& candidate_domain = candidate.domain_and_registry;
+    DCHECK(!candidate_domain.empty());
+    DCHECK_NE(navigated_domain.domain_and_registry, candidate_domain);
+    if (!SkeletonsMatch(navigated_skeletons, candidate.skeletons)) {
+      continue;
+    }
+    return candidate_domain;
+  }
+  return std::string();
+}
+
+// Returns the first top domain whose skeleton is within an edit distance of
+// one of any skeleton of |navigated_domain|, or an empty string if there is no
+// such domain. This search is done in lexicographic order on the top 500
+// suitable domains, instead of in order by popularity. This means that the
+// resulting "similar" domain may not be the most popular domain that matches.
+std::string GetSimilarDomainFromTop500(const DomainInfo& navigated_domain) {
+  for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
+    // The UTF-16 form of the navigated skeleton does not depend on the top
+    // domain being examined, so convert it once per skeleton rather than once
+    // per (skeleton, top domain) pair.
+    const base::string16 navigated_skeleton16 =
+        base::UTF8ToUTF16(navigated_skeleton);
+    for (const char* const top_domain_skeleton :
+         top500_domains::kTop500EditDistanceSkeletons) {
+      if (!IsEditDistanceAtMostOne(navigated_skeleton16,
+                                   base::UTF8ToUTF16(top_domain_skeleton))) {
+        continue;
+      }
+      const std::string top_domain =
+          url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton).domain;
+      DCHECK(!top_domain.empty());
+      // If the only difference between the navigated and top domains is the
+      // registry part, this is unlikely to be a spoofing attempt. Ignore this
+      // match and continue. E.g. If the navigated domain is google.com.tw and
+      // the top domain is google.com.tr, this won't produce a match.
+      const std::string top_domain_without_registry =
+          url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
+      DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
+          top_domain_without_registry));
+      if (navigated_domain.domain_without_registry !=
+          top_domain_without_registry) {
+        return top_domain;
+      }
+    }
+  }
+  return std::string();
+}
+
+// Returns the eTLD+1 of the first engaged site that has a skeleton within an
+// edit distance of one of any skeleton of |navigated_domain|, or an empty
+// string if there is no such site. Engaged sites whose eTLD+1 is not an edit
+// distance candidate are skipped.
+std::string GetSimilarDomainFromEngagedSites(
+    const DomainInfo& navigated_domain,
+    const std::vector<DomainInfo>& engaged_sites) {
+  for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
+    // The UTF-16 form of the navigated skeleton is the same for every engaged
+    // site and engaged skeleton, so convert it once here instead of inside the
+    // innermost loop.
+    const base::string16 navigated_skeleton16 =
+        base::UTF8ToUTF16(navigated_skeleton);
+    for (const DomainInfo& engaged_site : engaged_sites) {
+      if (!url_formatter::top_domains::IsEditDistanceCandidate(
+              engaged_site.domain_and_registry)) {
+        continue;
+      }
+      for (const std::string& engaged_skeleton : engaged_site.skeletons) {
+        if (!IsEditDistanceAtMostOne(navigated_skeleton16,
+                                     base::UTF8ToUTF16(engaged_skeleton))) {
+          continue;
+        }
+        // If the only difference between the navigated and engaged domain is
+        // the registry part, this is unlikely to be a spoofing attempt. Ignore
+        // this match and continue. E.g. If the navigated domain is
+        // google.com.tw and the engaged domain is google.com.tr, this won't
+        // produce a match.
+        if (navigated_domain.domain_without_registry !=
+            engaged_site.domain_without_registry) {
+          return engaged_site.domain_and_registry;
+        }
+      }
+    }
+  }
+  return std::string();
+}
+
+// Records |event| to the enumeration histogram named by
+// lookalikes::kHistogramName.
+void RecordEvent(NavigationSuggestionEvent event) {
+  UMA_HISTOGRAM_ENUMERATION(lookalikes::kHistogramName, event);
+}
+
+// Splits |host_without_etld| (a hostname whose eTLD has already been removed)
+// at every "." and "-" and returns the resulting pieces. Pieces are trimmed of
+// whitespace and empty pieces are dropped.
+std::vector<base::string16> SplitNoneTLDDomainIntoTokens(
+    const base::string16& host_without_etld) {
+  const base::string16 separators = base::ASCIIToUTF16("-.");
+  std::vector<base::string16> tokens =
+      base::SplitString(host_without_etld, separators, base::TRIM_WHITESPACE,
+                        base::SPLIT_WANT_NONEMPTY);
+  return tokens;
+}
+
+// Checks whether |e2LD| combined with any TLD in |important_tlds| forms a top
+// domain, or is a unicode spoof of one. e2LD is the smallest unit of a domain
+// name that could be registered (e.g. "example" in example.com). On success,
+// writes an https URL for the matched top domain to |found_domain| and returns
+// true; otherwise returns false and leaves |found_domain| untouched.
+bool IsTopDomainCandidate(const std::set<std::string>& important_tlds,
+                          const base::string16& e2LD,
+                          GURL* found_domain) {
+  // "https://", used to turn bare hostnames into GURLs below.
+  const std::string https_prefix =
+      std::string(url::kHttpsScheme) + url::kStandardSchemeSeparator;
+
+  // The spoof may use the 'wrong' TLD (e.g. google.gov), so every important
+  // TLD is tried in turn against the top domain list.
+  for (const std::string& tld : important_tlds) {
+    const base::string16 candidate_host =
+        e2LD + base::ASCIIToUTF16(".") + base::ASCIIToUTF16(tld);
+
+    // IsTopDomain() needs a DomainInfo, which is built from a GURL.
+    const GURL candidate_url(base::ASCIIToUTF16(https_prefix) + candidate_host);
+    if (IsTopDomain(GetDomainInfo(candidate_url))) {
+      *found_domain = GURL(candidate_url.spec());
+      return true;
+    }
+
+    // Not an exact top domain; see whether it is a unicode spoof of one.
+    const std::string spoofed_top_domain =
+        url_formatter::IDNSpoofChecker()
+            .GetSimilarTopDomain(candidate_host)
+            .domain;
+    if (spoofed_top_domain.empty()) {
+      continue;
+    }
+    *found_domain = GURL(https_prefix + spoofed_top_domain);
+    return true;
+  }
+  return false;
+}
+
+}  // namespace
+
+// Copies each argument into the member of the corresponding name. No
+// validation or derivation is done here; see GetDomainInfo() for how the
+// fields are computed from a URL.
+DomainInfo::DomainInfo(const std::string& arg_hostname,
+                       const std::string& arg_domain_and_registry,
+                       const std::string& arg_domain_without_registry,
+                       const url_formatter::IDNConversionResult& arg_idn_result,
+                       const url_formatter::Skeletons& arg_skeletons)
+    : hostname(arg_hostname),
+      domain_and_registry(arg_domain_and_registry),
+      domain_without_registry(arg_domain_without_registry),
+      idn_result(arg_idn_result),
+      skeletons(arg_skeletons) {}
+
+DomainInfo::~DomainInfo() = default;
+
+// Member-wise copy.
+DomainInfo::DomainInfo(const DomainInfo&) = default;
+
+// Builds a DomainInfo for |url|:
+//  - localhost and non-unique hostnames yield a DomainInfo with all fields
+//    empty;
+//  - hosts whose eTLD+1 is empty yield a DomainInfo with only |hostname| set;
+//  - otherwise every field is populated, with the IDN result and skeletons
+//    computed from the eTLD+1.
+DomainInfo GetDomainInfo(const GURL& url) {
+  const std::string hostname = url.host();
+  if (net::IsLocalhost(url) || net::IsHostnameNonUnique(hostname)) {
+    return DomainInfo(std::string(), std::string(), std::string(),
+                      url_formatter::IDNConversionResult(),
+                      url_formatter::Skeletons());
+  }
+
+  // eTLD+1 can be empty for private domains. Nothing can be derived from it
+  // in that case, so only the hostname is filled in.
+  const std::string domain_and_registry = GetETLDPlusOne(hostname);
+  if (domain_and_registry.empty()) {
+    return DomainInfo(hostname, std::string(), std::string(),
+                      url_formatter::IDNConversionResult(),
+                      url_formatter::Skeletons());
+  }
+
+  const std::string domain_without_registry =
+      url_formatter::top_domains::HostnameWithoutRegistry(domain_and_registry);
+
+  // Skeletons are computed from the eTLD+1 with all spoofing checks skipped.
+  // Spoofing checks in url_formatter can turn the converted result into
+  // punycode, which would not give an accurate skeleton for the unicode
+  // version of the domain.
+  const url_formatter::IDNConversionResult idn_result =
+      url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry);
+  return DomainInfo(hostname, domain_and_registry, domain_without_registry,
+                    idn_result, url_formatter::GetSkeletons(idn_result.result));
+}
+
+// Returns the domain-and-registry (eTLD+1) of |hostname|, with private
+// registries excluded from the registry lookup.
+std::string GetETLDPlusOne(const std::string& hostname) {
+  namespace rcd = net::registry_controlled_domains;
+  return rcd::GetDomainAndRegistry(hostname, rcd::EXCLUDE_PRIVATE_REGISTRIES);
+}
+
+// Returns true if |str1| can be turned into |str2| with at most one single
+// character insertion, deletion or substitution.
+//
+// The strings are scanned once: after their common prefix, the one permitted
+// edit is consumed and the remaining tails must then be identical.
+bool IsEditDistanceAtMostOne(const base::string16& str1,
+                             const base::string16& str2) {
+  const size_t len1 = str1.size();
+  const size_t len2 = str2.size();
+  if (len1 > len2 + 1 || len2 > len1 + 1) {
+    // Lengths differ by two or more, so at least two edits are needed.
+    return false;
+  }
+
+  // Length of the common prefix, bounded by the shorter string.
+  const size_t shorter_len = len1 < len2 ? len1 : len2;
+  size_t prefix_len = 0;
+  while (prefix_len < shorter_len && str1[prefix_len] == str2[prefix_len]) {
+    ++prefix_len;
+  }
+  if (prefix_len == shorter_len) {
+    // One string is a prefix of the other (or they are equal). Given the
+    // length check above, at most one trailing character is left over.
+    return true;
+  }
+
+  // Spend the single allowed edit at the first mismatch: skip the extra
+  // character of the longer string, or one character of each string when the
+  // lengths are equal (substitution).
+  size_t pos1 = prefix_len + (len1 >= len2 ? 1 : 0);
+  size_t pos2 = prefix_len + (len2 >= len1 ? 1 : 0);
+
+  // Both tails now have the same length; any further mismatch would be a
+  // second edit.
+  while (pos1 < len1 && pos2 < len2) {
+    if (str1[pos1] != str2[pos2]) {
+      return false;
+    }
+    ++pos1;
+    ++pos2;
+  }
+  return true;
+}
+
+// Returns true if the eTLD+1 of |domain_info| is itself a top domain. Top
+// domains can only be looked up by skeleton, so each skeleton of the domain is
+// queried and the resulting top domain is compared against the eTLD+1.
+bool IsTopDomain(const DomainInfo& domain_info) {
+  const std::string& etld_plus_one = domain_info.domain_and_registry;
+  for (const std::string& skeleton : domain_info.skeletons) {
+    const url_formatter::TopDomainEntry entry =
+        url_formatter::LookupSkeletonInTopDomains(skeleton);
+    if (entry.domain != etld_plus_one) {
+      continue;
+    }
+    return true;
+  }
+  return false;
+}
+
+// Decides whether a navigation with the given |match_type| should be blocked:
+//  - site engagement matches are always blocked;
+//  - top site matches are blocked only if the matching top domain recorded in
+//    |navigated_domain| is flagged as a top 500 domain;
+//  - every other match type is not blocked.
+bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type,
+                                       const DomainInfo& navigated_domain) {
+  if (match_type == LookalikeUrlMatchType::kTopSite) {
+    return navigated_domain.idn_result.matching_top_domain.is_top_500;
+  }
+  return match_type == LookalikeUrlMatchType::kSiteEngagement;
+}
+
+// Looks for a domain that |navigated_domain| may be imitating. The checks run
+// in the order below and the first hit wins:
+//  1. (IDN domains only) skeleton match against |engaged_sites|
+//     -> kSiteEngagement.
+//  2. (IDN domains only) matching top domain found during IDN conversion
+//     -> kTopSite.
+//  3. (edit distance candidates only) engaged site within an edit distance of
+//     one -> kEditDistanceSiteEngagement.
+//  4. (edit distance candidates only) top 500 domain within an edit distance
+//     of one -> kEditDistance.
+//  5. Target embedding in the full hostname -> kTargetEmbedding.
+// On a hit, writes the matched domain to |matched_domain| and the kind of match
+// to |match_type| and returns true. Returns false, leaving both outputs
+// untouched, if nothing matches. |navigated_domain| must have a non-empty
+// eTLD+1 and both pointers must be non-null.
+bool GetMatchingDomain(const DomainInfo& navigated_domain,
+                       const std::vector<DomainInfo>& engaged_sites,
+                       std::string* matched_domain,
+                       LookalikeUrlMatchType* match_type) {
+  DCHECK(!navigated_domain.domain_and_registry.empty());
+  DCHECK(matched_domain);
+  DCHECK(match_type);
+
+  if (navigated_domain.idn_result.has_idn_component) {
+    // If the navigated domain is IDN, check its skeleton against engaged sites
+    // and top domains.
+    const std::string matched_engaged_domain =
+        GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
+    if (!matched_engaged_domain.empty()) {
+      *matched_domain = matched_engaged_domain;
+      *match_type = LookalikeUrlMatchType::kSiteEngagement;
+      return true;
+    }
+
+    if (!navigated_domain.idn_result.matching_top_domain.domain.empty()) {
+      // In practice, this is not possible since the top domain list does not
+      // contain IDNs, so domain_and_registry can't both have IDN and be a top
+      // domain. Still, sanity check in case the top domain list changes in the
+      // future.
+      // At this point, navigated domain should not be a top domain.
+      DCHECK_NE(navigated_domain.domain_and_registry,
+                navigated_domain.idn_result.matching_top_domain.domain);
+      *matched_domain = navigated_domain.idn_result.matching_top_domain.domain;
+      *match_type = LookalikeUrlMatchType::kTopSite;
+      return true;
+    }
+  }
+
+  if (url_formatter::top_domains::IsEditDistanceCandidate(
+          navigated_domain.domain_and_registry)) {
+    // If we can't find an exact top domain or an engaged site, try to find an
+    // engaged domain within an edit distance of one.
+    const std::string similar_engaged_domain =
+        GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites);
+    if (!similar_engaged_domain.empty() &&
+        navigated_domain.domain_and_registry != similar_engaged_domain) {
+      *matched_domain = similar_engaged_domain;
+      *match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement;
+      return true;
+    }
+
+    // Finally, try to find a top domain within an edit distance of one.
+    const std::string similar_top_domain =
+        GetSimilarDomainFromTop500(navigated_domain);
+    if (!similar_top_domain.empty() &&
+        navigated_domain.domain_and_registry != similar_top_domain) {
+      *matched_domain = similar_top_domain;
+      *match_type = LookalikeUrlMatchType::kEditDistance;
+      return true;
+    }
+  }
+
+  // Last resort: target embedding. The important TLD list comes from the
+  // kImportantTlds feature parameter (comma-separated), and the check runs on
+  // the full hostname rather than on the eTLD+1.
+  GURL safe_url;
+  std::vector<std::string> important_tlds_list =
+      base::SplitString(lookalikes::kImportantTlds.Get(), ",",
+                        base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+  std::set<std::string> important_tlds(important_tlds_list.begin(),
+                                       important_tlds_list.end());
+  if (IsTargetEmbeddingLookalike(
+          GURL(std::string(url::kHttpsScheme) +
+               std::string(url::kStandardSchemeSeparator) +
+               navigated_domain.hostname),
+          important_tlds, &safe_url)) {
+    *matched_domain = safe_url.host();
+    *match_type = LookalikeUrlMatchType::kTargetEmbedding;
+    return true;
+  }
+
+  return false;
+}
+
+// Records the NavigationSuggestionEvent that corresponds to |match_type|.
+// Nothing is recorded for kNone.
+void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) {
+  // kNone doubles as "nothing to record" below; no match type maps to it.
+  NavigationSuggestionEvent event = NavigationSuggestionEvent::kNone;
+  switch (match_type) {
+    case LookalikeUrlMatchType::kNone:
+      return;
+    case LookalikeUrlMatchType::kTopSite:
+      event = NavigationSuggestionEvent::kMatchTopSite;
+      break;
+    case LookalikeUrlMatchType::kSiteEngagement:
+      event = NavigationSuggestionEvent::kMatchSiteEngagement;
+      break;
+    case LookalikeUrlMatchType::kEditDistance:
+      event = NavigationSuggestionEvent::kMatchEditDistance;
+      break;
+    case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
+      event = NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement;
+      break;
+    case LookalikeUrlMatchType::kTargetEmbedding:
+      event = NavigationSuggestionEvent::kMatchTargetEmbedding;
+      break;
+  }
+  if (event != NavigationSuggestionEvent::kNone) {
+    RecordEvent(event);
+  }
+}
+
+// Returns true if the hostname of |url| embeds a top domain in the part that
+// precedes its own eTLD (e.g. "google.com-login.com" embeds google.com). An
+// embedded domain is recognised as a token that forms a top domain (see
+// IsTopDomainCandidate()) immediately followed by a token that is one of
+// |important_tlds|, where tokens are separated by '.' or '-'.
+//
+// On a match, |safe_url| is set to the URL of the embedded top domain;
+// otherwise it is reset to an empty GURL. |url| must be http(s) and |safe_url|
+// must not be null.
+bool IsTargetEmbeddingLookalike(const GURL& url,
+                                const std::set<std::string>& important_tlds,
+                                GURL* safe_url) {
+  DCHECK(url.SchemeIsHTTPOrHTTPS());
+  DCHECK(safe_url);
+
+  // url.host() is the punycode-encoded (ASCII) hostname, and the registry
+  // length below is measured on that same ASCII form.
+  const std::string ascii_host = url.host();
+  const size_t registry_length =
+      net::registry_controlled_domains::GetRegistryLength(
+          url, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
+          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
+
+  // Strip the registry and the dot in front of it while the host is still
+  // ASCII, so that the length and the string it is applied to agree. When no
+  // registry was found (0) or the host could not be processed (npos), nothing
+  // is stripped; this also avoids unsigned wrap-around in the subtraction.
+  std::string ascii_host_without_etld = ascii_host;
+  if (registry_length != 0 && registry_length != std::string::npos &&
+      registry_length < ascii_host.size()) {
+    ascii_host_without_etld.resize(ascii_host.size() - registry_length - 1);
+  }
+
+  // We need all the unicode characters to stay in the host for the checks
+  // below, so convert what is left of the host to unicode.
+  const base::string16 host_without_etld =
+      url_formatter::UnsafeIDNToUnicodeWithDetails(ascii_host_without_etld)
+          .result;
+  const std::vector<base::string16> hostname_tokens_without_etld =
+      SplitNoneTLDDomainIntoTokens(host_without_etld);
+
+  // When we find a valid TLD, we look backwards to the previous token
+  // to see if we can use it to build a top domain.
+  base::string16 prev_part = base::EmptyString16();
+
+  // We could have domains separated by '-'s or '.'s, in order to find target
+  // embedding urls with google.com.com or google-com.com, we get url parts as
+  // anything that is between two '-'s or '.'s. We check to see if an important
+  // TLD is following an important domain.
+  // Because of the way this matching is working, we can not identify target
+  // embedding attacks on legitimate websites that contain '-' in their names
+  // (e.g programme-tv.net).
+  for (const auto& token : hostname_tokens_without_etld) {
+    if (prev_part.empty()) {
+      // First token: there is no preceding token to pair it with yet.
+      prev_part = token;
+      continue;
+    }
+
+    const std::string tld = base::UTF16ToUTF8(token);
+    if (base::Contains(important_tlds, tld) &&
+        IsTopDomainCandidate(important_tlds, prev_part, safe_url)) {
+      return true;
+    }
+    prev_part = token;
+  }
+  *safe_url = GURL();
+  return false;
+}
diff --git a/chromium/components/lookalikes/lookalike_url_util.h b/chromium/components/lookalikes/lookalike_url_util.h
new file mode 100644
index 00000000000..2d33bb0dec5
--- /dev/null
+++ b/chromium/components/lookalikes/lookalike_url_util.h
@@ -0,0 +1,144 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_
+#define COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "base/time/time.h"
+#include "components/url_formatter/url_formatter.h"
+#include "url/gurl.h"
+
+class GURL;
+
+namespace lookalikes {
+// Name of the histogram that NavigationSuggestionEvent values are recorded
+// to. Defined in lookalike_url_util.cc.
+extern const char kHistogramName[];
+}
+
+// Used for UKM. There is only a single LookalikeUrlMatchType per navigation.
+// Describes which heuristic in GetMatchingDomain() produced the match.
+enum class LookalikeUrlMatchType {
+  // No match.
+  kNone = 0,
+  // IDN domain for which IDN conversion reported a matching top domain.
+  kTopSite = 1,
+  // IDN domain that shares a skeleton with an engaged site.
+  kSiteEngagement = 2,
+  // Domain within an edit distance of one of a top 500 domain.
+  kEditDistance = 3,
+  // Domain within an edit distance of one of an engaged site.
+  kEditDistanceSiteEngagement = 4,
+  // Hostname that embeds a top domain (see IsTargetEmbeddingLookalike()).
+  kTargetEmbedding = 5,
+
+  // Append new items to the end of the list above; do not modify or replace
+  // existing values. Comment out obsolete items.
+  kMaxValue = kTargetEmbedding,
+};
+
+// Used for UKM. There is only a single LookalikeUrlBlockingPageUserAction per
+// navigation.
+// NOTE(review): nothing in this component reads or writes these values; their
+// meaning is presumably defined by the code that shows the blocking page --
+// confirm against that code.
+enum class LookalikeUrlBlockingPageUserAction {
+  kInterstitialNotShown = 0,
+  kClickThrough = 1,
+  kAcceptSuggestion = 2,
+  kCloseOrBack = 3,
+
+  // Append new items to the end of the list above; do not modify or replace
+  // existing values. Comment out obsolete items.
+  kMaxValue = kCloseOrBack,
+};
+
+// Used for metrics. Multiple events can occur per navigation.
+// The kMatch* values mirror LookalikeUrlMatchType and are the values recorded
+// by RecordUMAFromMatchType(). The numeric values are not contiguous because
+// values 1 and 2 are retired (see below).
+enum class NavigationSuggestionEvent {
+  kNone = 0,
+  // Interstitial results recorded using security_interstitials::MetricsHelper
+  // kInfobarShown = 1,
+  // kLinkClicked = 2,
+  kMatchTopSite = 3,
+  kMatchSiteEngagement = 4,
+  kMatchEditDistance = 5,
+  kMatchEditDistanceSiteEngagement = 6,
+  kMatchTargetEmbedding = 7,
+
+  // Append new items to the end of the list above; do not modify or
+  // replace existing values. Comment out obsolete items.
+  kMaxValue = kMatchTargetEmbedding,
+};
+
+// Immutable bundle of the hostname-derived data used by the lookalike
+// heuristics. All members are const, so an instance can be copy-constructed
+// but not assigned to. Use GetDomainInfo() to build one from a URL.
+struct DomainInfo {
+  // The full ASCII hostname, used in detecting target embedding. For
+  // "https://www.google.com/mail" this will be "www.google.com".
+  const std::string hostname;
+  // eTLD+1, used for skeleton and edit distance comparison. Must be ASCII.
+  // Empty for non-unique domains, localhost or sites whose eTLD+1 is empty.
+  const std::string domain_and_registry;
+  // eTLD+1 without the registry part, and with a trailing period. For
+  // "www.google.com", this will be "google.". Used for edit distance
+  // comparisons. Empty for non-unique domains, localhost or sites whose eTLD+1
+  // is empty.
+  const std::string domain_without_registry;
+
+  // Result of IDN conversion of domain_and_registry field.
+  const url_formatter::IDNConversionResult idn_result;
+  // Skeletons of domain_and_registry field.
+  const url_formatter::Skeletons skeletons;
+
+  // Stores a copy of each argument in the member of the corresponding name.
+  DomainInfo(const std::string& arg_hostname,
+             const std::string& arg_domain_and_registry,
+             const std::string& arg_domain_without_registry,
+             const url_formatter::IDNConversionResult& arg_idn_result,
+             const url_formatter::Skeletons& arg_skeletons);
+  ~DomainInfo();
+  DomainInfo(const DomainInfo& other);
+};
+
+// Returns a DomainInfo instance computed from |url|. Will return empty fields
+// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is
+// empty. In the last case the |hostname| field is still populated.
+DomainInfo GetDomainInfo(const GURL& url);
+
+// Returns true if the Levenshtein distance between |str1| and |str2| is at most
+// one. This has O(max(n,m)) complexity as opposed to O(n*m) of the usual edit
+// distance computation.
+bool IsEditDistanceAtMostOne(const base::string16& str1,
+                             const base::string16& str2);
+
+// Returns true if the domain given by |domain_info| is a top domain.
+bool IsTopDomain(const DomainInfo& domain_info);
+
+// Returns eTLD+1 of |hostname|. This excludes private registries, and returns
+// "blogspot.com" for "test.blogspot.com" (blogspot.com is listed as a private
+// registry). We do this to be consistent with url_formatter's top domain list
+// which doesn't have a notion of private registries.
+std::string GetETLDPlusOne(const std::string& hostname);
+
+// Returns true if a lookalike interstitial should be shown. This is the case
+// for site engagement matches, and for top site matches whose matching top
+// domain is in the top 500.
+bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type,
+                                       const DomainInfo& navigated_domain);
+
+// Returns true if a domain is visually similar to the hostname of |url|. The
+// matching domain can be a top domain or an engaged site. Similarity
+// check is made using both visual skeleton and edit distance comparison, and
+// finally a target embedding check on the full hostname. If this returns true,
+// the matched domain is written into |matched_domain| and the kind of match
+// into |match_type|; otherwise neither is modified.
+// Pointer arguments can't be nullptr.
+bool GetMatchingDomain(const DomainInfo& navigated_domain,
+                       const std::vector<DomainInfo>& engaged_sites,
+                       std::string* matched_domain,
+                       LookalikeUrlMatchType* match_type);
+
+// Records the NavigationSuggestionEvent corresponding to |match_type| to the
+// lookalikes::kHistogramName histogram. Records nothing for kNone.
+void RecordUMAFromMatchType(LookalikeUrlMatchType match_type);
+
+// Checks to see if a URL is a target embedding lookalike. This function sets
+// |safe_url| to the url of the embedded target domain, or to an empty GURL if
+// no embedded target is found.
+// At the moment we consider the following cases as Target Embedding:
+// example-google.com-site.com, example.google.com-site.com,
+// example-google-com-site.com, example.google.com.site.com,
+// example-googlé.com-site.com where the embedded target is google.com. In
+// addition to these examples, this function also detects domains embedded with
+// alternate TLDs, if the TLD is included in |important_tlds| (e.g. google.edu
+// instead of google.com in the example URLs above.). To reduce false positives,
+// we exclude cases where the eTLD of the possibly-unsafe domain contains more
+// than just the TLD of the embedded domain. For instance, we exclude
+// foo-google.co.uk.
+bool IsTargetEmbeddingLookalike(const GURL& url,
+                                const std::set<std::string>& important_tlds,
+                                GURL* safe_url);
+
+#endif  // COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_
diff --git a/chromium/components/lookalikes/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/lookalike_url_util_unittest.cc
new file mode 100644
index 00000000000..031aa39ca13
--- /dev/null
+++ b/chromium/components/lookalikes/lookalike_url_util_unittest.cc
@@ -0,0 +1,175 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/lookalikes/lookalike_url_util.h"
+
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// Table-driven test of IsEditDistanceAtMostOne(): each row is a pair of strings
+// and whether they are expected to be within an edit distance of one.
+TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
+  const struct TestCase {
+    const wchar_t* domain;
+    const wchar_t* top_domain;
+    bool expected;
+  } kTestCases[] = {
+      {L"", L"", true},
+      {L"a", L"a", true},
+      {L"a", L"", true},
+      {L"", L"a", true},
+
+      // Lengths differ by more than one.
+      {L"", L"ab", false},
+      {L"ab", L"", false},
+      {L"a", L"abc", false},
+      {L"abc", L"a", false},
+
+      {L"ab", L"a", true},
+      {L"a", L"ab", true},
+      {L"ab", L"b", true},
+      {L"b", L"ab", true},
+      {L"ab", L"ab", true},
+
+      {L"aba", L"ab", true},
+      {L"ba", L"aba", true},
+      {L"abc", L"ac", true},
+      {L"ac", L"abc", true},
+
+      // Same length.
+      {L"xbc", L"ybc", true},
+      {L"axc", L"ayc", true},
+      {L"abx", L"aby", true},
+
+      // Should also work for non-ASCII.
+      {L"é", L"", true},
+      {L"", L"é", true},
+      {L"tést", L"test", true},
+      {L"test", L"tést", true},
+      {L"tés", L"test", false},
+      {L"test", L"tés", false},
+
+      // Real world test cases.
+      {L"google.com", L"gooogle.com", true},
+      {L"gogle.com", L"google.com", true},
+      {L"googlé.com", L"google.com", true},
+      {L"google.com", L"googlé.com", true},
+      // Different by two characters.
+      {L"google.com", L"goooglé.com", false},
+  };
+  for (const TestCase& test_case : kTestCases) {
+    const bool result =
+        IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
+                                base::WideToUTF16(test_case.top_domain));
+    // Name the failing pair so a failure can be traced back to its row.
+    EXPECT_EQ(test_case.expected, result)
+        << "domain: \"" << base::WideToUTF8(test_case.domain)
+        << "\", top_domain: \"" << base::WideToUTF8(test_case.top_domain)
+        << "\"";
+  }
+}
+
+// Table-driven test of IsTargetEmbeddingLookalike(): each row is a URL and
+// whether the target embedding heuristic is expected to trigger for it.
+TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
+  const std::set<std::string> important_tlds = {"com", "org", "edu", "gov",
+                                                "co"};
+  const struct TargetEmbeddingHeuristicTestCase {
+    const GURL url;
+    bool should_trigger;
+  } kTestCases[] = {
+
+      // We test everything with the correct TLD and another popular TLD.
+
+      // Scheme should not affect the outcome.
+      {GURL("http://google.com.com"), true},
+      {GURL("https://google.com.com"), true},
+
+      // The length of the url should not affect the outcome.
+      {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-"
+            "outcome-of-this-target-embedding-test-google.com-login.com"),
+       true},
+      {GURL(
+           "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-"
+           "outcome-of-this-target-embedding-test.com-login.com"),
+       false},
+      {GURL(
+           "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-"
+           "outcome-of-this-target-embedding-test.com-login.com"),
+       false},
+
+      // We need exact skeleton match for our domain so exclude edit-distance
+      // matches.
+      {GURL("http://goog0le.com-login.com"), false},
+
+      // Unicode characters should be handled
+      {GURL("http://googlé.com-login.com"), true},
+      {GURL("http://sth-googlé.com-sth.com"), true},
+
+      // The basic state
+      {GURL("http://google.com.sth.com"), true},
+      // - before the domain name should be ignored.
+      {GURL("http://sth-google.com-sth.com"), true},
+
+      // The embedded target's TLD doesn't necessarily need to be followed by a
+      // '-' and could be a subdomain by itself.
+      {GURL("http://sth-google.com.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.sth-google.com.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.google.com-sth.com"), true},
+      {GURL("http://1.2.3.4.5.6.google.com-sth.com"), true},
+
+      // Target domain could be in the middle of subdomains.
+      {GURL("http://sth.google.com.sth.com"), true},
+      {GURL("http://sth.google.com-sth.com"), true},
+
+      // The target domain and its tld should be next to each other.
+      {GURL("http://sth-google.l.com-sth.com"), false},
+
+      {GURL("http://google.edu.com"), true},
+      {GURL("https://google.edu.com"), true},
+      {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-"
+            "outcome-of-this-target-embedding-test-google.edu-login.com"),
+       true},
+      {GURL(
+           "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-"
+           "outcome-of-this-target-embedding-test.edu-login.com"),
+       false},
+      {GURL(
+           "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-"
+           "outcome-of-this-target-embedding-test.edu-login.com"),
+       false},
+      {GURL("http://goog0le.edu-login.com"), false},
+      {GURL("http://googlé.edu-login.com"), true},
+      {GURL("http://sth-googlé.edu-sth.com"), true},
+      {GURL("http://google.edu.sth.com"), true},
+      {GURL("http://sth-google.edu-sth.com"), true},
+      {GURL("http://sth-google.edu.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.sth-google.edu.sth.com"), true},
+      {GURL("http://a.b.c.d.e.f.g.h.google.edu-sth.com"), true},
+      {GURL("http://1.2.3.4.5.6.google.edu-sth.com"), true},
+      {GURL("http://sth.google.edu.sth.com"), true},
+      {GURL("http://sth.google.edu-sth.com"), true},
+      {GURL("http://sth-google.l.edu-sth.com"), false},
+      {GURL("http://sth-google-l.edu-sth.com"), false},
+      {GURL("http://sth-google.l-edu-sth.com"), false},
+
+      // Target domain might be separated with a dash instead of dot.
+      {GURL("http://sth.google-com-sth.com"), true},
+
+      // Ensure legitimate domains don't trigger.
+      {GURL("http://google.com"), false},
+      {GURL("http://google.co.uk"), false},
+      {GURL("http://google.randomreg-login.com"), false},
+
+  };
+
+  for (const auto& test_case : kTestCases) {
+    GURL safe_url;
+    const bool triggered =
+        IsTargetEmbeddingLookalike(test_case.url, important_tlds, &safe_url);
+    if (test_case.should_trigger) {
+      EXPECT_TRUE(triggered) << "Expected that \"" << test_case.url
+                             << "\" should trigger but it didn't.";
+    } else {
+      // |safe_url| shows which embedded target was (wrongly) detected.
+      EXPECT_FALSE(triggered)
+          << "Expected that \"" << test_case.url
+          << "\" shouldn't trigger but it did. For URL: " << safe_url.spec();
+    }
+  }
+}
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2020-07-16 11:45:35 +0200
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2020-07-17 08:59:23 +0000
commit	552906b0f222c5d5dd11b9fd73829d510980461a (patch)
tree	3a11e6ed0538a81dd83b20cf3a4783e297f26d91 /chromium/components/lookalikes
parent	1b05827804eaf047779b597718c03e7d38344261 (diff)
download	qtwebengine-chromium-552906b0f222c5d5dd11b9fd73829d510980461a.tar.gz