From c30a6232df03e1efbd9f3b226777b07e087a1122 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Mon, 12 Oct 2020 14:27:29 +0200 Subject: BASELINE: Update Chromium to 85.0.4183.140 Change-Id: Iaa42f4680837c57725b1344f108c0196741f6057 Reviewed-by: Allan Sandfeld Jensen --- chromium/components/lookalikes/core/BUILD.gn | 1 + chromium/components/lookalikes/core/features.cc | 3 + chromium/components/lookalikes/core/features.h | 4 + .../lookalikes/core/lookalike_url_util.cc | 372 ++++++++++++++------- .../lookalikes/core/lookalike_url_util.h | 48 ++- .../lookalikes/core/lookalike_url_util_unittest.cc | 242 ++++++++++---- 6 files changed, 466 insertions(+), 204 deletions(-) (limited to 'chromium/components/lookalikes') diff --git a/chromium/components/lookalikes/core/BUILD.gn b/chromium/components/lookalikes/core/BUILD.gn index ac6a4d28be9..65e89c84808 100644 --- a/chromium/components/lookalikes/core/BUILD.gn +++ b/chromium/components/lookalikes/core/BUILD.gn @@ -27,6 +27,7 @@ jumbo_source_set("unit_tests") { deps = [ ":core", + ":features", "//net:test_support", "//testing/gtest", ] diff --git a/chromium/components/lookalikes/core/features.cc b/chromium/components/lookalikes/core/features.cc index faa42a292a2..99d9c417d37 100644 --- a/chromium/components/lookalikes/core/features.cc +++ b/chromium/components/lookalikes/core/features.cc @@ -10,5 +10,8 @@ namespace features { const base::Feature kDetectTargetEmbeddingLookalikes{ "TargetEmbeddingLookalikes", base::FEATURE_DISABLED_BY_DEFAULT}; +const base::Feature kLookalikeInterstitialForPunycode{ + "LookalikeInterstitialForPunycode", base::FEATURE_DISABLED_BY_DEFAULT}; + } // namespace features } // namespace lookalikes diff --git a/chromium/components/lookalikes/core/features.h b/chromium/components/lookalikes/core/features.h index 988c1c4043c..453e1146082 100644 --- a/chromium/components/lookalikes/core/features.h +++ b/chromium/components/lookalikes/core/features.h @@ -15,6 +15,10 @@ namespace features { 
COMPONENT_EXPORT(LOOKALIKES_FEATURES) extern const base::Feature kDetectTargetEmbeddingLookalikes; +// This feature enables interstitial warnings for certain punycode domains. +COMPONENT_EXPORT(LOOKALIKES_FEATURES) +extern const base::Feature kLookalikeInterstitialForPunycode; + } // namespace features } // namespace lookalikes diff --git a/chromium/components/lookalikes/core/lookalike_url_util.cc b/chromium/components/lookalikes/core/lookalike_url_util.cc index 2e82da34a54..4a350ed122e 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util.cc +++ b/chromium/components/lookalikes/core/lookalike_url_util.cc @@ -14,6 +14,7 @@ #include "base/memory/singleton.h" #include "base/metrics/field_trial_params.h" #include "base/metrics/histogram_macros.h" +#include "base/strings/string_piece.h" #include "base/strings/string_split.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" @@ -36,6 +37,28 @@ const char kHistogramName[] = "NavigationSuggestion.Event"; namespace { +// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains +// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored. +const char kDigitChars[] = "0123456789"; + +// Minimum length of e2LD protected against target embedding. For example, +// foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but we don't flag it since +// "baz" is shorter than kMinE2LDLengthForTargetEmbedding. +const size_t kMinE2LDLengthForTargetEmbedding = 4; + +// This list will be added to the static list of common words so common words +// could be added to the list using a flag if needed. +const base::FeatureParam kAdditionalCommonWords{ + &lookalikes::features::kDetectTargetEmbeddingLookalikes, + "additional_common_words", ""}; + +// We might not protect a domain whose e2LD is a common word in target embedding +// based on the TLD that is paired with it. 
+const char* kCommonWords[] = {"shop", "jobs", "live", "info", "study", + "asahi", "weather", "health", "forum", "radio", + "ideal", "research", "france", "free", "mobile", + "sky", "ask"}; + bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1, const url_formatter::Skeletons& skeletons2) { DCHECK(!skeletons1.empty()); @@ -86,27 +109,18 @@ std::string GetSimilarDomainFromTop500( } const std::string top_domain = - url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton).domain; + url_formatter::LookupSkeletonInTopDomains( + top_domain_skeleton, url_formatter::SkeletonType::kFull) + .domain; DCHECK(!top_domain.empty()); - // If the only difference between the navigated and top - // domains is the registry part, this is unlikely to be a spoofing - // attempt. Ignore this match and continue. E.g. If the navigated domain - // is google.com.tw and the top domain is google.com.tr, this won't - // produce a match. - const std::string top_domain_without_registry = - url_formatter::top_domains::HostnameWithoutRegistry(top_domain); - DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( - top_domain_without_registry)); - if (navigated_domain.domain_without_registry == - top_domain_without_registry) { + if (IsLikelyEditDistanceFalsePositive(navigated_domain, + GetDomainInfo(top_domain))) { continue; } // Skip past domains that are allowed to be spoofed. - if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) + - url::kStandardSchemeSeparator + - top_domain))) { + if (target_allowlisted.Run(top_domain)) { continue; } @@ -134,20 +148,12 @@ std::string GetSimilarDomainFromEngagedSites( continue; } - // If the only difference between the navigated and engaged - // domain is the registry part, this is unlikely to be a spoofing - // attempt. Ignore this match and continue. E.g. If the navigated - // domain is google.com.tw and the top domain is google.com.tr, this - // won't produce a match. 
- if (navigated_domain.domain_without_registry == - engaged_site.domain_without_registry) { + if (IsLikelyEditDistanceFalsePositive(navigated_domain, engaged_site)) { continue; } // Skip past domains that are allowed to be spoofed. - if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) + - url::kStandardSchemeSeparator + - engaged_site.domain_and_registry))) { + if (target_allowlisted.Run(engaged_site.domain_and_registry)) { continue; } @@ -164,61 +170,109 @@ void RecordEvent(NavigationSuggestionEvent event) { // Returns the parts of the domain that are separated by "." or "-", not // including the eTLD. -std::vector SplitDomainWithouteTLDIntoTokens( +// +// |host_without_etld| must outlive the return value since the vector contains +// StringPieces. +std::vector SplitDomainWithouteTLDIntoTokens( const std::string& host_without_etld) { - return base::SplitString(host_without_etld, "-.", base::TRIM_WHITESPACE, - base::SPLIT_WANT_NONEMPTY); + return base::SplitStringPiece(host_without_etld, "-.", base::TRIM_WHITESPACE, + base::SPLIT_WANT_NONEMPTY); } -// Checks whether |domain| is a top domain. If yes, returns true and fills -// |found_domain| with the matching top domain. -bool IsTop500Domain(const DomainInfo& domain, std::string* found_domain) { - for (auto& skeleton : domain.skeletons) { - // Matching with top domains is only done with skeleton matching. We check - // if the skeleton of our hostname matches the skeleton of any top domain. - url_formatter::TopDomainEntry matched_domain = - url_formatter::IDNSpoofChecker().LookupSkeletonInTopDomains(skeleton); - // We are only interested in an exact match with a top 500 domain (as - // opposed to skeleton match). Here we check that the matched domain is a - // top 500 domain and also the hostname of the matched domain is exactly the - // same as our input eTLD+1. 
- if (matched_domain.is_top_500 && - matched_domain.domain == domain.domain_and_registry) { - *found_domain = matched_domain.domain; +// Returns whether any subdomain ending in the last entry of |domain_labels| is +// allowlisted. e.g. if domain_labels = {foo,scholar,google,com}, checks the +// allowlist for google.com, scholar.google.com, and foo.scholar.google.com. +bool ASubdomainIsAllowlisted( + const base::span& domain_labels, + const LookalikeTargetAllowlistChecker& in_target_allowlist) { + DCHECK(domain_labels.size() >= 2); + std::string potential_hostname = + domain_labels[domain_labels.size() - 1].as_string(); + // Attach each token from the end to the embedded target to check if that + // subdomain has been allowlisted. + for (int i = domain_labels.size() - 2; i >= 0; i--) { + potential_hostname = + domain_labels[i].as_string() + "." + potential_hostname; + if (in_target_allowlist.Run(potential_hostname)) { return true; } } return false; } -// Checks if the targeted domain is allowlisted. To check that we need to -// check all of the subdomains that could be made. The reason is for example -// in the case of "foo.scholar.google.com.university.edu", "google.com" is -// considered as the targeted domain. We need to make sure -// "scholar.google.com" or "foo.scholar.google.com" are not allowlisted -// before marking the input domain as a target embedding domain. -bool ASubdomainIsAllowlisted( - const std::string& embedded_target, - const base::span& subdomain_labels_so_far, - const LookalikeTargetAllowlistChecker& in_target_allowlist) { - const std::string https_scheme = - url::kHttpsScheme + std::string(url::kStandardSchemeSeparator); +// Returns the top domain if the top domain without its separators matches the +// |potential_target| (e.g. googlecom). The matching is a skeleton matching. 
+std::string GetMatchingTopDomainWithoutSeparators( + const base::StringPiece& potential_target) { + const url_formatter::Skeletons skeletons = + url_formatter::GetSkeletons(base::UTF8ToUTF16(potential_target)); - if (in_target_allowlist.Run(GURL(https_scheme + embedded_target))) { + for (const auto& skeleton : skeletons) { + url_formatter::TopDomainEntry matched_domain = + url_formatter::LookupSkeletonInTopDomains( + skeleton, url_formatter::SkeletonType::kSeparatorsRemoved); + if (!matched_domain.domain.empty() && + matched_domain.skeleton_type == + url_formatter::SkeletonType::kSeparatorsRemoved) { + return matched_domain.domain; + } + } + return std::string(); +} + +// Returns if |etld_plus_one| shares the skeleton of an eTLD+1 with an engaged +// site or a top 500 domain. |embedded_target| is set to matching eTLD+1. +bool DoesETLDPlus1MatchTopDomainOrEngagedSite( + const DomainInfo& domain, + const std::vector& engaged_sites, + std::string* embedded_target) { + for (const auto& skeleton : domain.skeletons) { + for (const auto& engaged_site : engaged_sites) { + if (base::Contains(engaged_site.skeletons, skeleton)) { + *embedded_target = engaged_site.domain_and_registry; + return true; + } + } + } + for (const auto& skeleton : domain.skeletons) { + const url_formatter::TopDomainEntry top_domain = + url_formatter::LookupSkeletonInTopDomains( + skeleton, url_formatter::SkeletonType::kFull); + if (!top_domain.domain.empty() && top_domain.is_top_500) { + *embedded_target = top_domain.domain; + return true; + } + } + return false; +} + +// Returns whether the provided token includes a common word, which is a common +// indication of a likely false positive. 
+bool UsesCommonWord(const DomainInfo& domain) { + std::vector additional_common_words = + base::SplitString(kAdditionalCommonWords.Get(), ",", + base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); + if (base::Contains(additional_common_words, domain.domain_without_registry)) { return true; } - std::string potential_hostname = embedded_target; - // Attach each token from the end to the embedded target to check if that - // subdomain has been allowlisted. - for (int i = subdomain_labels_so_far.size() - 1; i >= 0; i--) { - potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname; - if (in_target_allowlist.Run(GURL(https_scheme + potential_hostname))) { + for (auto* common_word : kCommonWords) { + if (domain.domain_without_registry == common_word) { return true; } } return false; } +// A domain is allowed to be embedded if its e2LD is a common word or any +// valid partial subdomain is allowlisted. +bool IsAllowedToBeEmbedded( + const DomainInfo& embedded_target, + const base::span& subdomain_span, + const LookalikeTargetAllowlistChecker& in_target_allowlist) { + return UsesCommonWord(embedded_target) || + ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist); +} + } // namespace DomainInfo::DomainInfo(const std::string& arg_hostname, @@ -236,14 +290,14 @@ DomainInfo::~DomainInfo() = default; DomainInfo::DomainInfo(const DomainInfo&) = default; -DomainInfo GetDomainInfo(const GURL& url) { - if (net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host())) { +DomainInfo GetDomainInfo(const std::string& hostname) { + if (net::HostStringIsLocalhost(hostname) || + net::IsHostnameNonUnique(hostname)) { return DomainInfo(std::string(), std::string(), std::string(), url_formatter::IDNConversionResult(), url_formatter::Skeletons()); } - const std::string hostname = url.host(); - const std::string domain_and_registry = GetETLDPlusOne(url.host()); + const std::string domain_and_registry = GetETLDPlusOne(hostname); const std::string 
domain_without_registry = domain_and_registry.empty() ? std::string() @@ -268,6 +322,10 @@ DomainInfo GetDomainInfo(const GURL& url) { idn_result, skeletons); } +DomainInfo GetDomainInfo(const GURL& url) { + return GetDomainInfo(url.host()); +} + std::string GetETLDPlusOne(const std::string& hostname) { return net::registry_controlled_domains::GetDomainAndRegistry( hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); @@ -314,12 +372,67 @@ bool IsEditDistanceAtMostOne(const base::string16& str1, return edit_count <= 1; } +bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain, + const DomainInfo& matched_domain) { + DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( + matched_domain.domain_and_registry)); + DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( + navigated_domain.domain_and_registry)); + // If the only difference between the domains is the registry part, this is + // unlikely to be a spoofing attempt and we should ignore this match. E.g. + // exclude matches like google.com.tw and google.com.tr. + if (navigated_domain.domain_without_registry == + matched_domain.domain_without_registry) { + return true; + } + + // If the domains only differ by a numeric suffix on their e2LD (e.g. + // site45.tld and site35.tld), then ignore the match. + auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry, + kDigitChars, base::TRIM_TRAILING); + auto matched_trimmed = base::TrimString( + matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING); + DCHECK_NE(navigated_domain.domain_without_registry, + matched_domain.domain_without_registry); + // We previously verified that the domains without registries weren't equal, + // so if they're equal now, the match must have come from numeric suffixes. 
+ if (nav_trimmed == matched_trimmed) { + return true; + } + + // Ignore domains that only differ by an insertion/substitution at the + // start, as these are usually different words, not lookalikes. + const auto nav_dom_len = navigated_domain.domain_and_registry.length(); + const auto matched_dom_len = matched_domain.domain_and_registry.length(); + const auto& nav_dom = navigated_domain.domain_and_registry; + const auto& matched_dom = matched_domain.domain_and_registry; + if (nav_dom_len == matched_dom_len) { + // e.g. hank vs tank + if (nav_dom.substr(1) == matched_dom.substr(1)) { + return true; + } + } else if (nav_dom_len < matched_dom_len) { + // e.g. oodle vs poodle + if (nav_dom == matched_dom.substr(1)) { + return true; + } + } else { // navigated_dom_len > matched_dom_len + // e.g. poodle vs oodle + if (nav_dom.substr(1) == matched_dom) { + return true; + } + } + + return false; +} + bool IsTopDomain(const DomainInfo& domain_info) { // Top domains are only accessible through their skeletons, so query the top // domains trie for each skeleton of this domain. 
for (const std::string& skeleton : domain_info.skeletons) { const url_formatter::TopDomainEntry top_domain = - url_formatter::LookupSkeletonInTopDomains(skeleton); + url_formatter::LookupSkeletonInTopDomains( + skeleton, url_formatter::SkeletonType::kFull); if (domain_info.domain_and_registry == top_domain.domain) { return true; } @@ -402,12 +515,18 @@ bool GetMatchingDomain( } } - if (IsTargetEmbeddingLookalike(navigated_domain.hostname, engaged_sites, - in_target_allowlist, matched_domain)) { + TargetEmbeddingType embedding_type = + GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites, + in_target_allowlist, matched_domain); + if (embedding_type == TargetEmbeddingType::kSafetyTip) { + *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips; + return true; + } else if (embedding_type == TargetEmbeddingType::kInterstitial) { *match_type = LookalikeUrlMatchType::kTargetEmbedding; return true; } + DCHECK(embedding_type == TargetEmbeddingType::kNone); return false; } @@ -431,84 +550,89 @@ void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) { case LookalikeUrlMatchType::kSkeletonMatchTop5k: RecordEvent(NavigationSuggestionEvent::kMatchSkeletonTop5k); break; + case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips: + RecordEvent( + NavigationSuggestionEvent::kMatchTargetEmbeddingForSafetyTips); + break; + case LookalikeUrlMatchType::kFailedSpoofChecks: + RecordEvent(NavigationSuggestionEvent::kFailedSpoofChecks); + break; case LookalikeUrlMatchType::kNone: break; } } -bool IsTargetEmbeddingLookalike( +TargetEmbeddingType GetTargetEmbeddingType( const std::string& hostname, const std::vector& engaged_sites, const LookalikeTargetAllowlistChecker& in_target_allowlist, std::string* safe_hostname) { const std::string host_without_etld = url_formatter::top_domains::HostnameWithoutRegistry(hostname); - const std::vector hostname_tokens_without_etld = + const std::vector hostname_tokens_without_etld = 
SplitDomainWithouteTLDIntoTokens(host_without_etld); - // For each token, we look backwards to the previous token to see if - // "|prev_token|.|token|" forms a top domain or a high engaged domain. - std::string prev_token; - - // We can have domains separated by '-'s or '.'s. In order to find target - // embedding urls with google.com.com or google-com.com, we get url parts as - // anything that is between two '-'s or '.'s. We check to see if any two - // consecutive tokens form a top or highly-engaged domain. - // Because of the way this matching is working, we can not identify target - // embedding attacks against domains that contain '-' in their address - // (e.g programme-tv.net). Also if the eTLD of the target has more than one - // part, we won't be able to protect it (e.g. google.co.uk). - for (size_t i = 0; i < hostname_tokens_without_etld.size(); i++) { - const std::string token = hostname_tokens_without_etld[i]; - const std::string possible_embedded_target = prev_token + "." + token; - if (prev_token.empty()) { - prev_token = token; - continue; + // There are O(n^2) potential target embeddings in a domain name. We want to + // be comprehensive, but optimize so that usually we needn't check all of + // them. We do that by sweeping from the back of the embedding domain, towards + // the front, checking for a valid eTLD. If we find one, then we consider the + // possible embedded domains that end in that eTLD (i.e. all possible start + // points from the beginning of the string onward). + for (int end = hostname_tokens_without_etld.size(); end > 0; --end) { + base::span etld_check_span( + hostname_tokens_without_etld.data(), end); + std::string etld_check_host = base::JoinString(etld_check_span, "."); + auto etld_check_dominfo = GetDomainInfo(etld_check_host); + + // Check if the final token is a no-separator target (e.g. "googlecom"). + // This check happens first so that we can exclude invalid eTLD+1s next. 
+ std::string embedded_target = GetMatchingTopDomainWithoutSeparators( + hostname_tokens_without_etld[end - 1]); + if (!embedded_target.empty() && + !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span, + in_target_allowlist)) { + *safe_hostname = embedded_target; + return TargetEmbeddingType::kInterstitial; } - prev_token = token; - // Short domains are more likely to be misidentified as being embedded. For - // example "mi.com", "mk.ru", or "com.ru" are a few examples of domains that - // could trigger the target embedding heuristic falsely. - if (possible_embedded_target.size() < 7) { + // Exclude otherwise-invalid eTLDs. + if (etld_check_dominfo.domain_without_registry.empty()) { continue; } - // We want to protect user's high engaged websites as well as top domains. - GURL possible_target_url(url::kHttpsScheme + - std::string(url::kStandardSchemeSeparator) + - possible_embedded_target); - DomainInfo possible_target_domain = GetDomainInfo(possible_target_url); - // We check if the eTLD+1 is a valid domain, otherwise there is no point in - // checking if it is a top domain or an engaged domain. - if (possible_target_domain.domain_and_registry.empty()) { + // Exclude e2LDs that are too short. <= because domain_without_registry has + // a trailing ".". + if (etld_check_dominfo.domain_without_registry.length() <= + kMinE2LDLengthForTargetEmbedding) { continue; } - *safe_hostname = - GetMatchingSiteEngagementDomain(engaged_sites, possible_target_domain); - // |GetMatchingSiteEngagementDomain| uses skeleton matching, we make sure - // the found engaged site is an exact match of the embedded target. - if (*safe_hostname != possible_embedded_target) { - *safe_hostname = std::string(); - } - if (safe_hostname->empty() && - !IsTop500Domain(possible_target_domain, safe_hostname)) { - continue; + // Check for exact matches against engaged sites, among all possible + // subdomains ending at |end|. 
+ for (int start = 0; start < end - 1; ++start) { + const base::span span( + (hostname_tokens_without_etld.data() + start), end - start); + auto embedded_hostname = base::JoinString(span, "."); + auto embedded_dominfo = GetDomainInfo(embedded_hostname); + + for (auto& engaged_site : engaged_sites) { + if (engaged_site.hostname == embedded_dominfo.hostname && + !IsAllowedToBeEmbedded(embedded_dominfo, span, + in_target_allowlist)) { + *safe_hostname = engaged_site.hostname; + return TargetEmbeddingType::kInterstitial; + } + } } - // Check if any subdomain is allowlisted. - std::vector subdomain_labels_so_far( - hostname_tokens_without_etld.begin(), - hostname_tokens_without_etld.begin() + i - 1); - if (!ASubdomainIsAllowlisted(possible_embedded_target, - subdomain_labels_so_far, - in_target_allowlist)) { - return true; + // There were no exact engaged site matches, but there may yet still be a + // match against the eTLD+1 of an engaged or top site. + if (DoesETLDPlus1MatchTopDomainOrEngagedSite( + etld_check_dominfo, engaged_sites, safe_hostname) && + !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span, + in_target_allowlist)) { + return TargetEmbeddingType::kInterstitial; } - - // A target is found but it was allowlisted. - *safe_hostname = std::string(); } - return false; + return TargetEmbeddingType::kNone; } diff --git a/chromium/components/lookalikes/core/lookalike_url_util.h b/chromium/components/lookalikes/core/lookalike_url_util.h index 1bc49c24384..00946f6d909 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util.h +++ b/chromium/components/lookalikes/core/lookalike_url_util.h @@ -20,7 +20,16 @@ extern const char kHistogramName[]; } using LookalikeTargetAllowlistChecker = - base::RepeatingCallback; + base::RepeatingCallback; + +// Used for |GetTargetEmbeddingType| return value. It shows if the target +// embedding triggers on the input domain, and if it does, what type of warning +// should be shown to the user. 
+enum class TargetEmbeddingType { + kNone = 0, + kInterstitial = 1, + kSafetyTip = 2, +}; // Used for UKM. There is only a single LookalikeUrlMatchType per navigation. enum class LookalikeUrlMatchType { @@ -33,10 +42,16 @@ enum class LookalikeUrlMatchType { kTargetEmbedding = 5, kSkeletonMatchTop500 = 6, kSkeletonMatchTop5k = 7, + kTargetEmbeddingForSafetyTips = 8, + + // The domain name failed IDN spoof checks but didn't match a safe hostname. + // As a result, there is no URL to suggest to the user in the form of "Did + // you mean ?". + kFailedSpoofChecks = 9, // Append new items to the end of the list above; do not modify or replace // existing values. Comment out obsolete items. - kMaxValue = kSkeletonMatchTop5k, + kMaxValue = kFailedSpoofChecks, }; // Used for UKM. There is only a single LookalikeUrlBlockingPageUserAction per @@ -66,10 +81,12 @@ enum class NavigationSuggestionEvent { kMatchTargetEmbedding = 7, kMatchSkeletonTop500 = 8, kMatchSkeletonTop5k = 9, + kMatchTargetEmbeddingForSafetyTips = 10, + kFailedSpoofChecks = 11, // Append new items to the end of the list above; do not modify or // replace existing values. Comment out obsolete items. - kMaxValue = kMatchSkeletonTop5k, + kMaxValue = kFailedSpoofChecks, }; struct DomainInfo { @@ -99,9 +116,12 @@ struct DomainInfo { DomainInfo(const DomainInfo& other); }; -// Returns a DomainInfo instance computed from |url|. Will return empty fields -// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is -// empty. +// Returns a DomainInfo instance computed from |hostname|. Will return empty +// fields for non-unique hostnames (e.g. site.test), localhost or sites whose +// eTLD+1 is empty. +DomainInfo GetDomainInfo(const std::string& hostname); + +// Convenience function for returning GetDomainInfo(url.host()). 
DomainInfo GetDomainInfo(const GURL& url); // Returns true if the Levenshtein distance between |str1| and |str2| is at most @@ -110,6 +130,13 @@ DomainInfo GetDomainInfo(const GURL& url); bool IsEditDistanceAtMostOne(const base::string16& str1, const base::string16& str2); +// Returns whether |navigated_domain| and |matched_domain| are likely to be edit +// distance false positives, and thus the user should *not* be warned. +// +// Assumes |navigated_domain| and |matched_domain| are edit distance matches. +bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain, + const DomainInfo& matched_domain); + // Returns true if the domain given by |domain_info| is a top domain. bool IsTopDomain(const DomainInfo& domain_info); @@ -141,14 +168,19 @@ void RecordUMAFromMatchType(LookalikeUrlMatchType match_type); // |safe_hostname| to the url of the embedded target domain. // At the moment we consider the following cases as Target Embedding: // example-google.com-site.com, example.google.com-site.com, -// example-google-com-site.com, example.google.com.site.com, +// example-google-info-site.com, example.google.com.site.com, // example-googlé.com-site.com where the embedded target is google.com. We // detect embeddings of top 500 domains and engaged domains. However, to reduce // false positives, we do not protect domains that are shorter than 7 characters // long (e.g. com.ru). // This function checks possible targets against |in_target_allowlist| to skip // permitted embeddings. -bool IsTargetEmbeddingLookalike( +// If no target embedding is found, the return value will be set to |kNone|. +// When the target is embedded with another TLD instead of its actual TLD, it +// should trigger a Safety Tip when the embedded TLD is a ccTLD. In this +// situation, return value will be |kSafetyTip|. All the other triggers will +// result in a |kInterstitial| return value. 
+TargetEmbeddingType GetTargetEmbeddingType( const std::string& hostname, const std::vector& engaged_sites, const LookalikeTargetAllowlistChecker& in_target_allowlist, diff --git a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc index 4af557b6ad9..1aed2eddeec 100644 --- a/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc +++ b/chromium/components/lookalikes/core/lookalike_url_util_unittest.cc @@ -6,6 +6,8 @@ #include "base/bind.h" #include "base/strings/utf_string_conversions.h" +#include "base/test/scoped_feature_list.h" +#include "components/lookalikes/core/features.h" #include "testing/gtest/include/gtest/gtest.h" TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) { @@ -63,102 +65,198 @@ TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) { bool result = IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain), base::WideToUTF16(test_case.top_domain)); - EXPECT_EQ(test_case.expected, result); + EXPECT_EQ(test_case.expected, result) + << "when comparing " << test_case.domain << " with " + << test_case.top_domain; } } -bool IsGoogleScholar(const GURL& hostname) { - return hostname.host() == "scholar.google.com"; +TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) { + const struct TestCase { + const char* domain; + const char* top_domain; + bool is_likely_false_positive; + } kTestCases[] = { + // Most edit distance instances are not likely false positives. + {"abcxd.com", "abcyd.com", false}, // Substitution + {"abcxd.com", "abcxxd.com", false}, // Deletion + {"abcxxd.com", "abcxd.com", false}, // Insertion + + // But we permit cases where the only difference is in the tld. 
+ {"abcde.com", "abcde.net", true}, + + // We also permit matches that are only due to a numeric suffix, + {"abcd1.com", "abcd2.com", true}, // Substitution + {"abcde.com", "abcde1.com", true}, // Numeric deletion + {"abcde1.com", "abcde.com", true}, // Numeric insertion + {"abcd11.com", "abcd21.com", true}, // Not-final-digit substitution + {"a.abcd1.com", "abcd2.com", true}, // Only relevant for eTLD+1. + // ...and that change must be due to the numeric suffix. + {"abcx1.com", "abcy1.com", false}, // Substitution before suffix + {"abcd1.com", "abcde1.com", false}, // Deletion before suffix + {"abcde1.com", "abcd1.com", false}, // Insertion before suffix + {"abcdx.com", "abcdy.com", false}, // Non-numeric substitution at end + + // We also permit matches that are only due to a first-character change, + {"xabcd.com", "yabcd.com", true}, // Substitution + {"xabcde.com", "abcde.com", true}, // Insertion + {"abcde.com", "xabcde.com", true}, // Deletion + {"a.abcde.com", "xabcde.com", true}, // For eTLD+1 + // ...so long as that change is only on the first character, not later. 
+ {"abcde.com", "axbcde.com", false}, // Deletion + {"axbcde.com", "abcde.com", false}, // Insertion + {"axbcde.com", "aybcde.com", false}, // Substitution + }; + for (const TestCase& test_case : kTestCases) { + auto navigated = + GetDomainInfo(GURL(std::string(url::kHttpsScheme) + + url::kStandardSchemeSeparator + test_case.domain)); + auto matched = GetDomainInfo(GURL(std::string(url::kHttpsScheme) + + url::kStandardSchemeSeparator + + test_case.top_domain)); + bool result = IsLikelyEditDistanceFalsePositive(navigated, matched); + EXPECT_EQ(test_case.is_likely_false_positive, result) + << "when comparing " << test_case.domain << " with " + << test_case.top_domain; + } } -TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { - const std::vector engaged_sites = { - GetDomainInfo(GURL("https://highengagement.com"))}; - const struct TargetEmbeddingHeuristicTestCase { - const GURL url; - bool should_trigger; - } kTestCases[] = { +bool IsGoogleScholar(const std::string& hostname) { + return hostname == "scholar.google.com"; +} - // Scheme should not affect the outcome. - {GURL("http://google.com.com"), true}, - {GURL("https://google.com.com"), true}, +struct TargetEmbeddingHeuristicTestCase { + const std::string hostname; + // Empty when there is no match. + const std::string expected_safe_host; + const TargetEmbeddingType expected_type; +}; +TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) { + const std::vector kEngagedSites = { + GetDomainInfo(GURL("https://highengagement.com")), + GetDomainInfo(GURL("https://highengagement.co.uk")), + GetDomainInfo(GURL("https://subdomain.highengagement.com")), + GetDomainInfo(GURL("https://subdomain.google.com")), + }; + const std::vector kTestCases = { // The length of the url should not affect the outcome. 
- {GURL("http://this-is-a-very-long-url-but-it-should-not-affect-the-" - "outcome-of-this-target-embedding-test-google.com-login.com"), - true}, - {GURL( - "http://this-is-a-very-long-url-but-it-should-not-affect-google-the-" - "outcome-of-this-target-embedding-test.com-login.com"), - false}, - {GURL( - "http://google-this-is-a-very-long-url-but-it-should-not-affect-the-" - "outcome-of-this-target-embedding-test.com-login.com"), - false}, + {"this-is-a-very-long-url-but-it-should-not-affect-the-" + "outcome-of-this-target-embedding-test-google.com-login.com", + "google.com", TargetEmbeddingType::kInterstitial}, + {"google-com-this-is-a-very-long-url-but-it-should-not-affect-" + "the-outcome-of-this-target-embedding-test-login.com", + "google.com", TargetEmbeddingType::kInterstitial}, + {"this-is-a-very-long-url-but-it-should-not-affect-google-the-" + "outcome-of-this-target-embedding-test.com-login.com", + "", TargetEmbeddingType::kNone}, + {"google-this-is-a-very-long-url-but-it-should-not-affect-the-" + "outcome-of-this-target-embedding-test.com-login.com", + "", TargetEmbeddingType::kNone}, // We need exact skeleton match for our domain so exclude edit-distance // matches. - {GURL("http://goog0le.com-login.com"), false}, + {"goog0le.com-login.com", "", TargetEmbeddingType::kNone}, - // Unicode characters are currently not handled. As a result, target - // embedding sites that embed lookalikes of top domains aren't flagged. - {GURL("http://googlé.com-login.com"), false}, - {GURL("http://sth-googlé.com-sth.com"), false}, + // Unicode characters should be handled + {"googlé.com-login.com", "google.com", + TargetEmbeddingType::kInterstitial}, + {"foo-googlé.com-bar.com", "google.com", + TargetEmbeddingType::kInterstitial}, - // The basic state - {GURL("http://google.com.sth.com"), true}, + // The basic states + {"google.com.foo.com", "google.com", TargetEmbeddingType::kInterstitial}, // - before the domain name should be ignored. 
- {GURL("http://sth-google.com-sth.com"), true}, - + {"foo-google.com-bar.com", "google.com", + TargetEmbeddingType::kInterstitial}, // The embedded target's TLD doesn't necessarily need to be followed by a // '-' and could be a subdomain by itself. - {GURL("http://sth-google.com.sth.com"), true}, - {GURL("http://a.b.c.d.e.f.g.h.sth-google.com.sth.com"), true}, - {GURL("http://a.b.c.d.e.f.g.h.google.com-sth.com"), true}, - {GURL("http://1.2.3.4.5.6.google.com-sth.com"), true}, - + {"foo-google.com.foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + {"a.b.c.d.e.f.g.h.foo-google.com.foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + {"a.b.c.d.e.f.g.h.google.com-foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + {"1.2.3.4.5.6.google.com-foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, // Target domain could be in the middle of subdomains. - {GURL("http://sth.google.com.sth.com"), true}, - {GURL("http://sth.google.com-sth.com"), true}, - + {"foo.google.com.foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, // The target domain and its tld should be next to each other. - {GURL("http://sth-google.l.com-sth.com"), false}, - - // Target domain should match only with its actual TLD. - {GURL("http://google.edu.com"), false}, + {"foo-google.l.com-foo.com", "", TargetEmbeddingType::kNone}, // Target domain might be separated with a dash instead of dot. - {GURL("http://sth.google-com-sth.com"), true}, - // Target domain could be an engaged domain - {GURL("http://highengagement-com-login.com"), true}, - // If target domain is an allowlisted domain, it should not trigger the - // heuristic. - {GURL("http://foo.scholar.google-com-login.com"), false}, - // An allowlisted domain will make sure it is not marked as an embedded - // target. However, other targets could still be embedded in the domain. 
- {GURL("http://foo.google.com.scholar.google-com-login.com"), true}, - {GURL("http://foo.scholar.google-com.google.com-login.com"), true}, - - // Ensure legitimate domains don't trigger the heuristic. - {GURL("http://google.com"), false}, - {GURL("http://google.co.uk"), false}, - {GURL("http://google.randomreg-login.com"), false}, + {"foo.google-com-foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + + // Allowlisted domains should not trigger heuristic. + {"scholar.google.com.foo.com", "", TargetEmbeddingType::kNone}, + {"scholar.google.com-google.com.foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + {"google.com-scholar.google.com.foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + {"foo.scholar.google.com.foo.com", "", TargetEmbeddingType::kNone}, + {"scholar.foo.google.com.foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + + // Targets should be longer than 6 characters. + {"hp.com-foo.com", "", TargetEmbeddingType::kNone}, + + // Targets with common words as e2LD are not considered embedded targets + // either for all TLDs or another-TLD matching. + {"foo.jobs.com-foo.com", "", TargetEmbeddingType::kNone}, + {"foo.office.com-foo.com", "office.com", + TargetEmbeddingType::kInterstitial}, + {"foo.jobs.org-foo.com", "", TargetEmbeddingType::kNone}, + {"foo.office.org-foo.com", "", TargetEmbeddingType::kNone}, + + // Targets could be embedded without their dots and dashes. + {"foo.googlecom-foo.com", "google.com", + TargetEmbeddingType::kInterstitial}, + + // Ensure legitimate domains don't trigger. + {"foo.google.com", "", TargetEmbeddingType::kNone}, + {"foo.bar.google.com", "", TargetEmbeddingType::kNone}, + {"google.com", "", TargetEmbeddingType::kNone}, + {"google.co.uk", "", TargetEmbeddingType::kNone}, + {"google.randomreg-login.com", "", TargetEmbeddingType::kNone}, + {"com.foo.com", "", TargetEmbeddingType::kNone}, + + // Multipart eTLDs should work. 
+ {"foo.google.co.uk.foo.com", "google.co.uk", + TargetEmbeddingType::kInterstitial}, + {"foo.highengagement-co-uk.foo.com", "highengagement.co.uk", + TargetEmbeddingType::kInterstitial}, + + // Engaged sites should trigger as specifically as possible, and should + // trigger preferentially to top sites when possible. + {"foo.highengagement.com.foo.com", "highengagement.com", + TargetEmbeddingType::kInterstitial}, + {"foo.subdomain.highengagement.com.foo.com", + "subdomain.highengagement.com", TargetEmbeddingType::kInterstitial}, + {"foo.subdomain.google.com.foo.com", "subdomain.google.com", + TargetEmbeddingType::kInterstitial}, + // Skeleton matching should work against engaged sites at the eTLD level. + {"subdomain.highéngagement.com-foo.com", "highengagement.com", + TargetEmbeddingType::kInterstitial}, }; - for (const auto& kTestCase : kTestCases) { + for (auto& test_case : kTestCases) { std::string safe_hostname; - if (kTestCase.should_trigger) { - EXPECT_TRUE(IsTargetEmbeddingLookalike( - kTestCase.url.host(), engaged_sites, - base::BindRepeating(&IsGoogleScholar), &safe_hostname)) - << "Expected that \"" << kTestCase.url - << " should trigger but it didn't."; + TargetEmbeddingType embedding_type = GetTargetEmbeddingType( + test_case.hostname, kEngagedSites, + base::BindRepeating(&IsGoogleScholar), &safe_hostname); + if (test_case.expected_type != TargetEmbeddingType::kNone) { + EXPECT_EQ(safe_hostname, test_case.expected_safe_host) + << test_case.hostname << " should trigger on " + << test_case.expected_safe_host << ", but " + << (safe_hostname.empty() ? "it didn't trigger at all." + : "triggered on " + safe_hostname); + EXPECT_EQ(embedding_type, test_case.expected_type); } else { - EXPECT_FALSE(IsTargetEmbeddingLookalike( - kTestCase.url.host(), engaged_sites, - base::BindRepeating(&IsGoogleScholar), &safe_hostname)) - << "Expected that \"" << kTestCase.url - << " shouldn't trigger but it did. 
For URL: " << safe_hostname; + EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone) + << test_case.hostname << " unexpectedly triggered on " + << safe_hostname; } } } -- cgit v1.2.1