diff options
Diffstat (limited to 'chromium/components/lookalikes')
-rw-r--r-- | chromium/components/lookalikes/BUILD.gn | 32 | ||||
-rw-r--r-- | chromium/components/lookalikes/DEPS | 5 | ||||
-rw-r--r-- | chromium/components/lookalikes/OWNERS | 3 | ||||
-rw-r--r-- | chromium/components/lookalikes/README | 4 | ||||
-rw-r--r-- | chromium/components/lookalikes/lookalike_url_util.cc | 436 | ||||
-rw-r--r-- | chromium/components/lookalikes/lookalike_url_util.h | 144 | ||||
-rw-r--r-- | chromium/components/lookalikes/lookalike_url_util_unittest.cc | 175 |
7 files changed, 799 insertions, 0 deletions
diff --git a/chromium/components/lookalikes/BUILD.gn b/chromium/components/lookalikes/BUILD.gn new file mode 100644 index 00000000000..c28b4f05f45 --- /dev/null +++ b/chromium/components/lookalikes/BUILD.gn @@ -0,0 +1,32 @@ +# Copyright 2020 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +import("//build/config/jumbo.gni") + +jumbo_static_library("lookalikes") { + sources = [ + "lookalike_url_util.cc", + "lookalike_url_util.h", + ] + deps = [ + "//base", + "//components/security_state/core:features", + "//components/url_formatter", + "//components/url_formatter/spoof_checks/top_domains:common", + "//components/url_formatter/spoof_checks/top_domains:top500_domains", + "//components/url_formatter/spoof_checks/top_domains:top500_domains_header", + "//net", + ] +} + +jumbo_source_set("unit_tests") { + testonly = true + sources = [ "lookalike_url_util_unittest.cc" ] + + deps = [ + ":lookalikes", + "//net:test_support", + "//testing/gtest", + ] +} diff --git a/chromium/components/lookalikes/DEPS b/chromium/components/lookalikes/DEPS new file mode 100644 index 00000000000..563bb1e8d24 --- /dev/null +++ b/chromium/components/lookalikes/DEPS @@ -0,0 +1,5 @@ +include_rules = [ + "+components/security_state", + "+components/url_formatter", + "+net/base", +] diff --git a/chromium/components/lookalikes/OWNERS b/chromium/components/lookalikes/OWNERS new file mode 100644 index 00000000000..1d70c9fef31 --- /dev/null +++ b/chromium/components/lookalikes/OWNERS @@ -0,0 +1,3 @@ +file://chrome/browser/lookalikes/OWNERS + +# COMPONENT: UI>Browser>Interstitials diff --git a/chromium/components/lookalikes/README b/chromium/components/lookalikes/README new file mode 100644 index 00000000000..7aa6eeaccef --- /dev/null +++ b/chromium/components/lookalikes/README @@ -0,0 +1,4 @@ +This directory contains shared code used for the Lookalike URL blocking page. 
+ +The lookalike interstitial is triggered when a user visits a domain that is +similar to the domain of a popular site.
\ No newline at end of file diff --git a/chromium/components/lookalikes/lookalike_url_util.cc b/chromium/components/lookalikes/lookalike_url_util.cc new file mode 100644 index 00000000000..fa386d5d6f8 --- /dev/null +++ b/chromium/components/lookalikes/lookalike_url_util.cc @@ -0,0 +1,436 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/lookalikes/lookalike_url_util.h" + +#include <utility> + +#include "base/bind.h" +#include "base/callback.h" +#include "base/macros.h" +#include "base/memory/scoped_refptr.h" +#include "base/memory/singleton.h" +#include "base/metrics/field_trial_params.h" +#include "base/metrics/histogram_macros.h" +#include "base/strings/string_split.h" +#include "base/strings/utf_string_conversions.h" +#include "base/task/post_task.h" +#include "base/task/thread_pool.h" +#include "base/time/default_clock.h" +#include "components/security_state/core/features.h" +#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h" +#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h" +#include "components/url_formatter/url_formatter.h" +#include "net/base/registry_controlled_domains/registry_controlled_domain.h" +#include "net/base/url_util.h" + +namespace lookalikes { + +const char kHistogramName[] = "NavigationSuggestion.Event"; +const base::FeatureParam<std::string> kImportantTlds{ + &security_state::features::kSafetyTipUI, "targetembedding_important_tlds", + "com,edu,org,gov"}; + +} // namespace lookalikes + +namespace { + +bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1, + const url_formatter::Skeletons& skeletons2) { + DCHECK(!skeletons1.empty()); + DCHECK(!skeletons2.empty()); + for (const std::string& skeleton1 : skeletons1) { + if (base::Contains(skeletons2, skeleton1)) { + return true; + } + } + return false; +} + +// Returns a site that the user 
has used before that the eTLD+1 in +// |domain_and_registry| may be attempting to spoof, based on skeleton +// comparison. +std::string GetMatchingSiteEngagementDomain( + const std::vector<DomainInfo>& engaged_sites, + const DomainInfo& navigated_domain) { + DCHECK(!navigated_domain.domain_and_registry.empty()); + for (const DomainInfo& engaged_site : engaged_sites) { + DCHECK(!engaged_site.domain_and_registry.empty()); + DCHECK_NE(navigated_domain.domain_and_registry, + engaged_site.domain_and_registry); + if (SkeletonsMatch(navigated_domain.skeletons, engaged_site.skeletons)) { + return engaged_site.domain_and_registry; + } + } + return std::string(); +} + +// Returns the first matching top domain with an edit distance of at most one +// to |domain_and_registry|. This search is done in lexicographic order on the +// top 500 suitable domains, instead of in order by popularity. This means that +// the resulting "similar" domain may not be the most popular domain that +// matches. +std::string GetSimilarDomainFromTop500(const DomainInfo& navigated_domain) { + for (const std::string& navigated_skeleton : navigated_domain.skeletons) { + for (const char* const top_domain_skeleton : + top500_domains::kTop500EditDistanceSkeletons) { + if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton), + base::UTF8ToUTF16(top_domain_skeleton))) { + const std::string top_domain = + url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton) + .domain; + DCHECK(!top_domain.empty()); + // If the only difference between the navigated and top + // domains is the registry part, this is unlikely to be a spoofing + // attempt. Ignore this match and continue. E.g. If the navigated domain + // is google.com.tw and the top domain is google.com.tr, this won't + // produce a match. 
+ const std::string top_domain_without_registry = + url_formatter::top_domains::HostnameWithoutRegistry(top_domain); + DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( + top_domain_without_registry)); + if (navigated_domain.domain_without_registry != + top_domain_without_registry) { + return top_domain; + } + } + } + } + return std::string(); +} + +// Returns the first matching engaged domain with an edit distance of at most +// one to |domain_and_registry|. +std::string GetSimilarDomainFromEngagedSites( + const DomainInfo& navigated_domain, + const std::vector<DomainInfo>& engaged_sites) { + for (const std::string& navigated_skeleton : navigated_domain.skeletons) { + for (const DomainInfo& engaged_site : engaged_sites) { + if (!url_formatter::top_domains::IsEditDistanceCandidate( + engaged_site.domain_and_registry)) { + continue; + } + for (const std::string& engaged_skeleton : engaged_site.skeletons) { + if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton), + base::UTF8ToUTF16(engaged_skeleton))) { + // If the only difference between the navigated and engaged + // domain is the registry part, this is unlikely to be a spoofing + // attempt. Ignore this match and continue. E.g. If the navigated + // domain is google.com.tw and the top domain is google.com.tr, this + // won't produce a match. + if (navigated_domain.domain_without_registry != + engaged_site.domain_without_registry) { + return engaged_site.domain_and_registry; + } + } + } + } + } + return std::string(); +} + +void RecordEvent(NavigationSuggestionEvent event) { + UMA_HISTOGRAM_ENUMERATION(lookalikes::kHistogramName, event); +} + +// Returns the parts of the url that are separated by "." or "-" not including +// the eTLD. 
+std::vector<base::string16> SplitNoneTLDDomainIntoTokens( + const base::string16& host_without_etld) { + return base::SplitString(host_without_etld, base::ASCIIToUTF16("-."), + base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); +} + +// For each possible e2LD+eTLD pair, check whether it forms a top domain. +bool IsTopDomainCandidate(const std::set<std::string>& important_tlds, + const base::string16& e2LD, + GURL* found_domain) { + // We need to identify top domains, even when the spoof uses the 'wrong' TLD + // (e.g. google.gov). To do that, we check the embedded domain with each + // possible |important_tld| against the top domain list. + for (const auto& tld : important_tlds) { + // Create a GURL so we can get a DomainInfo from it for IsTopDomain + // e2LD is the smallest unit of a domain name that could be registered. + // (e.g. example in example.com) + base::string16 target16 = + e2LD + base::ASCIIToUTF16(".") + base::ASCIIToUTF16(tld); + GURL possible_target(base::ASCIIToUTF16(url::kHttpsScheme) + + base::ASCIIToUTF16(url::kStandardSchemeSeparator) + + target16); + DomainInfo possible_target_domain = GetDomainInfo(possible_target); + if (IsTopDomain(possible_target_domain)) { + *found_domain = GURL(possible_target.spec()); + return true; + } + // If no match is found, check if e2LD is a unicode spoof + std::string top_targeted_domain = + url_formatter::IDNSpoofChecker().GetSimilarTopDomain(target16).domain; + if (!top_targeted_domain.empty()) { + *found_domain = GURL(std::string(url::kHttpsScheme) + + url::kStandardSchemeSeparator + top_targeted_domain); + return true; + } + } + return false; +} + +} // namespace + +DomainInfo::DomainInfo(const std::string& arg_hostname, + const std::string& arg_domain_and_registry, + const std::string& arg_domain_without_registry, + const url_formatter::IDNConversionResult& arg_idn_result, + const url_formatter::Skeletons& arg_skeletons) + : hostname(arg_hostname), + domain_and_registry(arg_domain_and_registry), + 
domain_without_registry(arg_domain_without_registry), + idn_result(arg_idn_result), + skeletons(arg_skeletons) {} + +DomainInfo::~DomainInfo() = default; + +DomainInfo::DomainInfo(const DomainInfo&) = default; + +DomainInfo GetDomainInfo(const GURL& url) { + if (net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host())) { + return DomainInfo(std::string(), std::string(), std::string(), + url_formatter::IDNConversionResult(), + url_formatter::Skeletons()); + } + const std::string hostname = url.host(); + const std::string domain_and_registry = GetETLDPlusOne(url.host()); + const std::string domain_without_registry = + domain_and_registry.empty() + ? std::string() + : url_formatter::top_domains::HostnameWithoutRegistry( + domain_and_registry); + + // eTLD+1 can be empty for private domains. + if (domain_and_registry.empty()) { + return DomainInfo(hostname, domain_and_registry, domain_without_registry, + url_formatter::IDNConversionResult(), + url_formatter::Skeletons()); + } + // Compute skeletons using eTLD+1, skipping all spoofing checks. Spoofing + // checks in url_formatter can cause the converted result to be punycode. + // We want to avoid this in order to get an accurate skeleton for the unicode + // version of the domain. 
+ const url_formatter::IDNConversionResult idn_result = + url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry); + const url_formatter::Skeletons skeletons = + url_formatter::GetSkeletons(idn_result.result); + return DomainInfo(hostname, domain_and_registry, domain_without_registry, + idn_result, skeletons); +} + +std::string GetETLDPlusOne(const std::string& hostname) { + return net::registry_controlled_domains::GetDomainAndRegistry( + hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); +} + +bool IsEditDistanceAtMostOne(const base::string16& str1, + const base::string16& str2) { + if (str1.size() > str2.size() + 1 || str2.size() > str1.size() + 1) { + return false; + } + base::string16::const_iterator i = str1.begin(); + base::string16::const_iterator j = str2.begin(); + size_t edit_count = 0; + while (i != str1.end() && j != str2.end()) { + if (*i == *j) { + i++; + j++; + } else { + edit_count++; + if (edit_count > 1) { + return false; + } + + if (str1.size() > str2.size()) { + // First string is longer than the second. This can only happen if the + // first string has an extra character. + i++; + } else if (str2.size() > str1.size()) { + // Second string is longer than the first. This can only happen if the + // second string has an extra character. + j++; + } else { + // Both strings are the same length. This can only happen if the two + // strings differ by a single character. + i++; + j++; + } + } + } + if (i != str1.end() || j != str2.end()) { + // A character at the end did not match. + edit_count++; + } + return edit_count <= 1; +} + +bool IsTopDomain(const DomainInfo& domain_info) { + // Top domains are only accessible through their skeletons, so query the top + // domains trie for each skeleton of this domain. 
+ for (const std::string& skeleton : domain_info.skeletons) { + const url_formatter::TopDomainEntry top_domain = + url_formatter::LookupSkeletonInTopDomains(skeleton); + if (domain_info.domain_and_registry == top_domain.domain) { + return true; + } + } + return false; +} + +bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type, + const DomainInfo& navigated_domain) { + if (match_type == LookalikeUrlMatchType::kSiteEngagement) { + return true; + } + return match_type == LookalikeUrlMatchType::kTopSite && + navigated_domain.idn_result.matching_top_domain.is_top_500; +} + +bool GetMatchingDomain(const DomainInfo& navigated_domain, + const std::vector<DomainInfo>& engaged_sites, + std::string* matched_domain, + LookalikeUrlMatchType* match_type) { + DCHECK(!navigated_domain.domain_and_registry.empty()); + DCHECK(matched_domain); + DCHECK(match_type); + + if (navigated_domain.idn_result.has_idn_component) { + // If the navigated domain is IDN, check its skeleton against engaged sites + // and top domains. + const std::string matched_engaged_domain = + GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain); + if (!matched_engaged_domain.empty()) { + *matched_domain = matched_engaged_domain; + *match_type = LookalikeUrlMatchType::kSiteEngagement; + return true; + } + + if (!navigated_domain.idn_result.matching_top_domain.domain.empty()) { + // In practice, this is not possible since the top domain list does not + // contain IDNs, so domain_and_registry can't both have IDN and be a top + // domain. Still, sanity check in case the top domain list changes in the + // future. + // At this point, navigated domain should not be a top domain. 
+ DCHECK_NE(navigated_domain.domain_and_registry, + navigated_domain.idn_result.matching_top_domain.domain); + *matched_domain = navigated_domain.idn_result.matching_top_domain.domain; + *match_type = LookalikeUrlMatchType::kTopSite; + return true; + } + } + + if (url_formatter::top_domains::IsEditDistanceCandidate( + navigated_domain.domain_and_registry)) { + // If we can't find an exact top domain or an engaged site, try to find an + // engaged domain within an edit distance of one. + const std::string similar_engaged_domain = + GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites); + if (!similar_engaged_domain.empty() && + navigated_domain.domain_and_registry != similar_engaged_domain) { + *matched_domain = similar_engaged_domain; + *match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement; + return true; + } + + // Finally, try to find a top domain within an edit distance of one. + const std::string similar_top_domain = + GetSimilarDomainFromTop500(navigated_domain); + if (!similar_top_domain.empty() && + navigated_domain.domain_and_registry != similar_top_domain) { + *matched_domain = similar_top_domain; + *match_type = LookalikeUrlMatchType::kEditDistance; + return true; + } + } + + GURL safe_url; + std::vector<std::string> important_tlds_list = + base::SplitString(lookalikes::kImportantTlds.Get(), ",", + base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); + std::set<std::string> important_tlds(important_tlds_list.begin(), + important_tlds_list.end()); + if (IsTargetEmbeddingLookalike( + GURL(std::string(url::kHttpsScheme) + + std::string(url::kStandardSchemeSeparator) + + navigated_domain.hostname), + important_tlds, &safe_url)) { + *matched_domain = safe_url.host(); + *match_type = LookalikeUrlMatchType::kTargetEmbedding; + return true; + } + + return false; +} + +void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) { + switch (match_type) { + case LookalikeUrlMatchType::kTopSite: + 
RecordEvent(NavigationSuggestionEvent::kMatchTopSite); + break; + case LookalikeUrlMatchType::kSiteEngagement: + RecordEvent(NavigationSuggestionEvent::kMatchSiteEngagement); + break; + case LookalikeUrlMatchType::kEditDistance: + RecordEvent(NavigationSuggestionEvent::kMatchEditDistance); + break; + case LookalikeUrlMatchType::kEditDistanceSiteEngagement: + RecordEvent(NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement); + break; + case LookalikeUrlMatchType::kTargetEmbedding: + RecordEvent(NavigationSuggestionEvent::kMatchTargetEmbedding); + break; + case LookalikeUrlMatchType::kNone: + break; + } +} + +bool IsTargetEmbeddingLookalike(const GURL& url, + const std::set<std::string>& important_tlds, + GURL* safe_url) { + DCHECK(url.SchemeIsHTTPOrHTTPS()); + + size_t registry_length = net::registry_controlled_domains::GetRegistryLength( + url, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, + net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); + // url.host() will give punycode-encoded hostname, as we need all the unicode + // characters to stay in the url for further check we convert host to unicode + base::string16 host = + url_formatter::UnsafeIDNToUnicodeWithDetails(url.host()).result; + base::string16 host_without_etld = + host.substr(0, host.size() - 1 - registry_length); + const std::vector<base::string16> hostname_tokens_without_etld = + SplitNoneTLDDomainIntoTokens(host_without_etld); + + // When we find a valid TLD, we look backwards to the previous token + // to see if we can use it to build a top domain. + base::string16 prev_part = base::EmptyString16(); + + // We could have domains separated by '-'s or '.'s, in order to find target + // embedding urls with google.com.com or google-com.com, we get url parts as + // anything that is between two '-'s or '.'s. We check to see if an important + // TLD is following an important domain. 
+ // Because of the way this matching is working, we can not identify target + // embedding attacks on legitimate websites that contain '-' in their names + // (e.g programme-tv.net). + for (const auto& token : hostname_tokens_without_etld) { + if (prev_part.empty()) { + prev_part = token; + continue; + } + + const std::string tld = base::UTF16ToUTF8(token); + if (base::Contains(important_tlds, tld) && + IsTopDomainCandidate(important_tlds, prev_part, safe_url)) { + return true; + } + prev_part = token; + } + *safe_url = GURL(); + return false; +} diff --git a/chromium/components/lookalikes/lookalike_url_util.h b/chromium/components/lookalikes/lookalike_url_util.h new file mode 100644 index 00000000000..2d33bb0dec5 --- /dev/null +++ b/chromium/components/lookalikes/lookalike_url_util.h @@ -0,0 +1,144 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_ +#define COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_ + +#include <string> +#include <vector> + +#include "base/time/time.h" +#include "components/url_formatter/url_formatter.h" +#include "url/gurl.h" + +class GURL; + +namespace lookalikes { +extern const char kHistogramName[]; +} + +// Used for UKM. There is only a single LookalikeUrlMatchType per navigation. +enum class LookalikeUrlMatchType { + kNone = 0, + kTopSite = 1, + kSiteEngagement = 2, + kEditDistance = 3, + kEditDistanceSiteEngagement = 4, + kTargetEmbedding = 5, + + // Append new items to the end of the list above; do not modify or replace + // existing values. Comment out obsolete items. + kMaxValue = kTargetEmbedding, +}; + +// Used for UKM. There is only a single LookalikeUrlBlockingPageUserAction per +// navigation. 
+enum class LookalikeUrlBlockingPageUserAction { + kInterstitialNotShown = 0, + kClickThrough = 1, + kAcceptSuggestion = 2, + kCloseOrBack = 3, + + // Append new items to the end of the list above; do not modify or replace + // existing values. Comment out obsolete items. + kMaxValue = kCloseOrBack, +}; + +// Used for metrics. Multiple events can occur per navigation. +enum class NavigationSuggestionEvent { + kNone = 0, + // Interstitial results recorded using security_interstitials::MetricsHelper + // kInfobarShown = 1, + // kLinkClicked = 2, + kMatchTopSite = 3, + kMatchSiteEngagement = 4, + kMatchEditDistance = 5, + kMatchEditDistanceSiteEngagement = 6, + kMatchTargetEmbedding = 7, + + // Append new items to the end of the list above; do not modify or + // replace existing values. Comment out obsolete items. + kMaxValue = kMatchTargetEmbedding, +}; + +struct DomainInfo { + // The full ASCII hostname, used in detecting target embedding. For + // "https://www.google.com/mail" this will be "www.google.com". + const std::string hostname; + // eTLD+1, used for skeleton and edit distance comparison. Must be ASCII. + // Empty for non-unique domains, localhost or sites whose eTLD+1 is empty. + const std::string domain_and_registry; + // eTLD+1 without the registry part, and with a trailing period. For + // "www.google.com", this will be "google.". Used for edit distance + // comparisons. Empty for non-unique domains, localhost or sites whose eTLD+1 + // is empty. + const std::string domain_without_registry; + + // Result of IDN conversion of domain_and_registry field. + const url_formatter::IDNConversionResult idn_result; + // Skeletons of domain_and_registry field. 
+ const url_formatter::Skeletons skeletons; + + DomainInfo(const std::string& arg_hostname, + const std::string& arg_domain_and_registry, + const std::string& arg_domain_without_registry, + const url_formatter::IDNConversionResult& arg_idn_result, + const url_formatter::Skeletons& arg_skeletons); + ~DomainInfo(); + DomainInfo(const DomainInfo& other); +}; + +// Returns a DomainInfo instance computed from |url|. Will return empty fields +// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is +// empty. +DomainInfo GetDomainInfo(const GURL& url); + +// Returns true if the Levenshtein distance between |str1| and |str2| is at most +// one. This has O(max(n,m)) complexity as opposed to O(n*m) of the usual edit +// distance computation. +bool IsEditDistanceAtMostOne(const base::string16& str1, + const base::string16& str2); + +// Returns true if the domain given by |domain_info| is a top domain. +bool IsTopDomain(const DomainInfo& domain_info); + +// Returns eTLD+1 of |hostname|. This excludes private registries, and returns +// "blogspot.com" for "test.blogspot.com" (blogspot.com is listed as a private +// registry). We do this to be consistent with url_formatter's top domain list +// which doesn't have a notion of private registries. +std::string GetETLDPlusOne(const std::string& hostname); + +// Returns true if a lookalike interstitial should be shown. +bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type, + const DomainInfo& navigated_domain); + +// Returns true if |navigated_domain| is visually similar to another domain. +// The matching domain can be a top domain or an engaged site. Similarity is +// checked using visual skeleton, edit distance and target embedding +// comparisons. If this returns true, match details will be written into +// |matched_domain| and |match_type|. Pointer arguments can't be nullptr. 
+bool GetMatchingDomain(const DomainInfo& navigated_domain, + const std::vector<DomainInfo>& engaged_sites, + std::string* matched_domain, + LookalikeUrlMatchType* match_type); + +void RecordUMAFromMatchType(LookalikeUrlMatchType match_type); + +// Checks to see if a URL is a target embedding lookalike. This function sets +// |safe_url| to the url of the embedded target domain. +// At the moment we consider the following cases as Target Embedding: +// example-google.com-site.com, example.google.com-site.com, +// example-google-com-site.com, example.google.com.site.com, +// example-googlé.com-site.com where the embedded target is google.com. In +// addition to these examples, this function also detects domains embedded with +// alternate TLDs, if the TLD is included in |important_tlds| (e.g. google.edu +// instead of google.com in the example URLs above.). To reduce false positives, +// we exclude cases where the eTLD of the possibly-unsafe domain contains more +// than just the TLD of the embedded domain. For instance, we exclude +// foo-google.co.uk. +bool IsTargetEmbeddingLookalike(const GURL& url, + const std::set<std::string>& important_tlds, + GURL* safe_url); + +#endif // COMPONENTS_LOOKALIKES_LOOKALIKE_URL_UTIL_H_ diff --git a/chromium/components/lookalikes/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/lookalike_url_util_unittest.cc new file mode 100644 index 00000000000..031aa39ca13 --- /dev/null +++ b/chromium/components/lookalikes/lookalike_url_util_unittest.cc @@ -0,0 +1,175 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "components/lookalikes/lookalike_url_util.h" + +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) { + const struct TestCase { + const wchar_t* domain; + const wchar_t* top_domain; + bool expected; + } kTestCases[] = { + {L"", L"", true}, + {L"a", L"a", true}, + {L"a", L"", true}, + {L"", L"a", true}, + + {L"", L"ab", false}, + {L"ab", L"", false}, + + {L"ab", L"a", true}, + {L"a", L"ab", true}, + {L"ab", L"b", true}, + {L"b", L"ab", true}, + {L"ab", L"ab", true}, + + {L"", L"ab", false}, + {L"ab", L"", false}, + {L"a", L"abc", false}, + {L"abc", L"a", false}, + + {L"aba", L"ab", true}, + {L"ba", L"aba", true}, + {L"abc", L"ac", true}, + {L"ac", L"abc", true}, + + // Same length. + {L"xbc", L"ybc", true}, + {L"axc", L"ayc", true}, + {L"abx", L"aby", true}, + + // Should also work for non-ASCII. + {L"é", L"", true}, + {L"", L"é", true}, + {L"tést", L"test", true}, + {L"test", L"tést", true}, + {L"tés", L"test", false}, + {L"test", L"tés", false}, + + // Real world test cases. + {L"google.com", L"gooogle.com", true}, + {L"gogle.com", L"google.com", true}, + {L"googlé.com", L"google.com", true}, + {L"google.com", L"googlé.com", true}, + // Different by two characters. + {L"google.com", L"goooglé.com", false}, + }; + for (const TestCase& test_case : kTestCases) { + bool result = + IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain), + base::WideToUTF16(test_case.top_domain)); + EXPECT_EQ(test_case.expected, result); + } +} + +TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { + const std::set<std::string> important_tlds = {"com", "org", "edu", "gov", + "co"}; + const struct TargetEmbeddingHeuristicTestCase { + const GURL url; + bool should_trigger; + } kTestCases[] = { + + // We test everything with the correct TLD and another popular TLD. + + // Scheme should not affect the outcome. 
+ {GURL("http://google.com.com"), true}, + {GURL("https://google.com.com"), true}, + + // The length of the url should not affect the outcome. + {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-" + "outcome-of-this-target-embedding-test-google.com-login.com"), + true}, + {GURL( + "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-" + "outcome-of-this-target-embedding-test.com-login.com"), + false}, + {GURL( + "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-" + "outcome-of-this-target-embedding-test.com-login.com"), + false}, + + // We need exact skeleton match for our domain so exclude edit-distance + // matches. + {GURL("http://goog0le.com-login.com"), false}, + + // Unicode characters should be handled + {GURL("http://googlé.com-login.com"), true}, + {GURL("http://sth-googlé.com-sth.com"), true}, + + // The basic state + {GURL("http://google.com.sth.com"), true}, + // - before the domain name should be ignored. + {GURL("http://sth-google.com-sth.com"), true}, + + // The embedded target's TLD doesn't necessarily need to be followed by a + // '-' and could be a subdomain by itself. + {GURL("http://sth-google.com.sth.com"), true}, + {GURL("http://a.b.c.d.e.f.g.h.sth-google.com.sth.com"), true}, + {GURL("http://a.b.c.d.e.f.g.h.google.com-sth.com"), true}, + {GURL("http://1.2.3.4.5.6.google.com-sth.com"), true}, + + // Target domain could be in the middle of subdomains. + {GURL("http://sth.google.com.sth.com"), true}, + {GURL("http://sth.google.com-sth.com"), true}, + + // The target domain and its tld should be next to each other. 
+ {GURL("http://sth-google.l.com-sth.com"), false}, + + {GURL("http://google.edu.com"), true}, + {GURL("https://google.edu.com"), true}, + {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-" + "outcome-of-this-target-embedding-test-google.edu-login.com"), + true}, + {GURL( + "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-" + "outcome-of-this-target-embedding-test.edu-login.com"), + false}, + {GURL( + "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-" + "outcome-of-this-target-embedding-test.edu-login.com"), + false}, + {GURL("http://goog0le.edu-login.com"), false}, + {GURL("http://googlé.edu-login.com"), true}, + {GURL("http://sth-googlé.edu-sth.com"), true}, + {GURL("http://google.edu.sth.com"), true}, + {GURL("http://sth-google.edu-sth.com"), true}, + {GURL("http://sth-google.edu.sth.com"), true}, + {GURL("http://a.b.c.d.e.f.g.h.sth-google.edu.sth.com"), true}, + {GURL("http://a.b.c.d.e.f.g.h.google.edu-sth.com"), true}, + {GURL("http://1.2.3.4.5.6.google.edu-sth.com"), true}, + {GURL("http://sth.google.edu.sth.com"), true}, + {GURL("http://sth.google.edu-sth.com"), true}, + {GURL("http://sth-google.l.edu-sth.com"), false}, + {GURL("http://sth-google-l.edu-sth.com"), false}, + {GURL("http://sth-google.l-edu-sth.com"), false}, + + // Target domain might be separated with a dash instead of dot. + {GURL("http://sth.google-com-sth.com"), true}, + + // Ensure legitimate domains don't trigger. 
+ {GURL("http://google.com"), false}, + {GURL("http://google.co.uk"), false}, + {GURL("http://google.randomreg-login.com"), false}, + + }; + + for (const auto& kTestCase : kTestCases) { + GURL safe_url = GURL(); + if (kTestCase.should_trigger) { + EXPECT_TRUE( + IsTargetEmbeddingLookalike(kTestCase.url, important_tlds, &safe_url)) + << "Expected that \"" << kTestCase.url + << " should trigger but it didn't."; + } else { + EXPECT_FALSE( + IsTargetEmbeddingLookalike(kTestCase.url, important_tlds, &safe_url)) + << "Expected that \"" << kTestCase.url + << " shouldn't trigger but it did. For URL: " << safe_url.spec(); + } + } +} |