diff options
Diffstat (limited to 'chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc')
-rw-r--r-- | chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc | 504 |
1 files changed, 504 insertions, 0 deletions
diff --git a/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc new file mode 100644 index 00000000000..f1a9fb37d82 --- /dev/null +++ b/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc @@ -0,0 +1,504 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" + +#include <utility> + +#include "base/bind.h" +#include "base/compiler_specific.h" +#include "base/location.h" +#include "base/logging.h" +#include "base/metrics/histogram_macros.h" +#include "base/single_thread_task_runner.h" +#include "base/strings/string_util.h" +#include "base/threading/thread_task_runner_handle.h" +#include "base/time/time.h" +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" +#include "chrome/renderer/safe_browsing/features.h" +#include "content/public/renderer/render_view.h" +#include "net/base/registry_controlled_domains/registry_controlled_domain.h" +#include "third_party/blink/public/platform/web_string.h" +#include "third_party/blink/public/web/web_element.h" +#include "third_party/blink/public/web/web_element_collection.h" +#include "third_party/blink/public/web/web_local_frame.h" +#include "third_party/blink/public/web/web_view.h" + +namespace safe_browsing { + +// This time should be short enough that it doesn't noticeably disrupt the +// user's interaction with the page. +const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; + +// Experimenting shows that we get a reasonable gain in performance by +// increasing this up to around 10, but there's not much benefit in +// increasing it past that. +const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; + +// This should be longer than we expect feature extraction to take on any +// actual phishing page. +const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; + +// Intermediate state used for computing features. See features.h for +// descriptions of the DOM features that are computed. +struct PhishingDOMFeatureExtractor::PageFeatureState { + // Link related features + int external_links; + std::unordered_set<std::string> external_domains; + int secure_links; + int total_links; + + // Form related features + int num_forms; + int num_text_inputs; + int num_pswd_inputs; + int num_radio_inputs; + int num_check_inputs; + int action_other_domain; + int total_actions; + std::unordered_set<std::string> page_action_urls; + + // Image related features + int img_other_domain; + int total_imgs; + + // How many script tags + int num_script_tags; + + // The time at which we started feature extraction for the current page. + base::TimeTicks start_time; + + // The number of iterations we've done for the current extraction. + int num_iterations; + + explicit PageFeatureState(base::TimeTicks start_time_ticks) + : external_links(0), + secure_links(0), + total_links(0), + num_forms(0), + num_text_inputs(0), + num_pswd_inputs(0), + num_radio_inputs(0), + num_check_inputs(0), + action_other_domain(0), + total_actions(0), + img_other_domain(0), + total_imgs(0), + num_script_tags(0), + start_time(start_time_ticks), + num_iterations(0) {} + + ~PageFeatureState() {} +}; + +// Per-frame state +struct PhishingDOMFeatureExtractor::FrameData { + // This is our reference to document.all, which is an iterator over all + // of the elements in the document. It keeps track of our current position. + blink::WebElementCollection elements; + // The domain of the document URL, stored here so that we don't need to + // recompute it every time it's needed. + std::string domain; +}; + +PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( + FeatureExtractorClock* clock) + : clock_(clock) { + Clear(); +} + +PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { + // The RenderView should have called CancelPendingExtraction() before + // we are destroyed. + CheckNoPendingExtraction(); +} + +void PhishingDOMFeatureExtractor::ExtractFeatures(blink::WebDocument document, + FeatureMap* features, + DoneCallback done_callback) { + // The RenderView should have called CancelPendingExtraction() before + // starting a new extraction, so DCHECK this. + CheckNoPendingExtraction(); + // However, in an opt build, we will go ahead and clean up the pending + // extraction so that we can start in a known state. + CancelPendingExtraction(); + + features_ = features; + done_callback_ = std::move(done_callback); + + page_feature_state_.reset(new PageFeatureState(clock_->Now())); + cur_document_ = document; + + base::ThreadTaskRunnerHandle::Get()->PostTask( + FROM_HERE, + base::BindOnce(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, + weak_factory_.GetWeakPtr())); +} + +void PhishingDOMFeatureExtractor::CancelPendingExtraction() { + // Cancel any pending callbacks, and clear our state. + weak_factory_.InvalidateWeakPtrs(); + Clear(); +} + +void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { + DCHECK(page_feature_state_.get()); + ++page_feature_state_->num_iterations; + base::TimeTicks current_chunk_start_time = clock_->Now(); + + if (cur_document_.IsNull()) { + // This will only happen if we weren't able to get the document for the + // main frame. We'll treat this as an extraction failure. + RunCallback(false); + return; + } + + int num_elements = 0; + for (; !cur_document_.IsNull(); cur_document_ = GetNextDocument()) { + blink::WebElement cur_element; + if (cur_frame_data_.get()) { + // We're resuming traversal of a frame, so just advance to the next + // element. + cur_element = cur_frame_data_->elements.NextItem(); + // When we resume the traversal, the first call to nextItem() potentially + // has to walk through the document again from the beginning, if it was + // modified between our chunks of work. Log how long this takes, so we + // can tell if it's too slow. + UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", + clock_->Now() - current_chunk_start_time); + } else { + // We just moved to a new frame, so update our frame state + // and advance to the first element. + ResetFrameData(); + cur_element = cur_frame_data_->elements.FirstItem(); + } + + for (; !cur_element.IsNull(); + cur_element = cur_frame_data_->elements.NextItem()) { + if (cur_element.HasHTMLTagName("a")) { + HandleLink(cur_element); + } else if (cur_element.HasHTMLTagName("form")) { + HandleForm(cur_element); + } else if (cur_element.HasHTMLTagName("img")) { + HandleImage(cur_element); + } else if (cur_element.HasHTMLTagName("input")) { + HandleInput(cur_element); + } else if (cur_element.HasHTMLTagName("script")) { + HandleScript(cur_element); + } + + if (++num_elements >= kClockCheckGranularity) { + num_elements = 0; + base::TimeTicks now = clock_->Now(); + if (now - page_feature_state_->start_time >= + base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { + DLOG(ERROR) << "Feature extraction took too long, giving up"; + // We expect this to happen infrequently, so record when it does. + UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.DOMFeatureTimeout", 1); + RunCallback(false); + return; + } + base::TimeDelta chunk_elapsed = now - current_chunk_start_time; + if (chunk_elapsed >= + base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { + // The time limit for the current chunk is up, so post a task to + // continue extraction. + // + // Record how much time we actually spent on the chunk. If this is + // much higher than kMaxTimePerChunkMs, we may need to adjust the + // clock granularity. + UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", + chunk_elapsed); + base::ThreadTaskRunnerHandle::Get()->PostTask( + FROM_HERE, + base::BindOnce( + &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, + weak_factory_.GetWeakPtr())); + return; + } + // Otherwise, continue. + } + } + + // We're done with this frame, recalculate the FrameData when we + // advance to the next frame. + cur_frame_data_.reset(); + } + + InsertFeatures(); + RunCallback(true); +} + +void PhishingDOMFeatureExtractor::HandleLink( + const blink::WebElement& element) { + // Count the number of times we link to a different host. + if (!element.HasAttribute("href")) { + DVLOG(1) << "Skipping anchor tag with no href"; + return; + } + + // Retrieve the link and resolve the link in case it's relative. + blink::WebURL full_url = CompleteURL(element, element.GetAttribute("href")); + + std::string domain; + bool is_external = IsExternalDomain(full_url, &domain); + if (domain.empty()) { + DVLOG(1) << "Could not extract domain from link: " << full_url; + return; + } + + if (is_external) { + ++page_feature_state_->external_links; + + // Record each unique domain that we link to. + page_feature_state_->external_domains.insert(domain); + } + + // Check how many are https links. + if (GURL(full_url).SchemeIs("https")) { + ++page_feature_state_->secure_links; + } + + ++page_feature_state_->total_links; +} + +void PhishingDOMFeatureExtractor::HandleForm( + const blink::WebElement& element) { + // Increment the number of forms on this page. + ++page_feature_state_->num_forms; + + // Record whether the action points to a different domain. + if (!element.HasAttribute("action")) { + return; + } + + blink::WebURL full_url = CompleteURL(element, element.GetAttribute("action")); + + page_feature_state_->page_action_urls.insert(full_url.GetString().Utf8()); + + std::string domain; + bool is_external = IsExternalDomain(full_url, &domain); + if (domain.empty()) { + DVLOG(1) << "Could not extract domain from form action: " << full_url; + return; + } + + if (is_external) { + ++page_feature_state_->action_other_domain; + } + ++page_feature_state_->total_actions; +} + +void PhishingDOMFeatureExtractor::HandleImage( + const blink::WebElement& element) { + if (!element.HasAttribute("src")) { + DVLOG(1) << "Skipping img tag with no src"; + } + + // Record whether the image points to a different domain. + blink::WebURL full_url = CompleteURL(element, element.GetAttribute("src")); + std::string domain; + bool is_external = IsExternalDomain(full_url, &domain); + if (domain.empty()) { + DVLOG(1) << "Could not extract domain from image src: " << full_url; + return; + } + + if (is_external) { + ++page_feature_state_->img_other_domain; + } + ++page_feature_state_->total_imgs; +} + +void PhishingDOMFeatureExtractor::HandleInput( + const blink::WebElement& element) { + // The HTML spec says that if the type is unspecified, it defaults to text. + // In addition, any unrecognized type will be treated as a text input. + // + // Note that we use the attribute value rather than + // WebFormControlElement::formControlType() for consistency with the + // way the phishing classification model is created. + std::string type = base::ToLowerASCII(element.GetAttribute("type").Utf8()); + if (type == "password") { + ++page_feature_state_->num_pswd_inputs; + } else if (type == "radio") { + ++page_feature_state_->num_radio_inputs; + } else if (type == "checkbox") { + ++page_feature_state_->num_check_inputs; + } else if (type != "submit" && type != "reset" && type != "file" && + type != "hidden" && type != "image" && type != "button") { + // Note that there are a number of new input types in HTML5 that are not + // handled above. For now, we will consider these as text inputs since + // they could be used to capture user input. + ++page_feature_state_->num_text_inputs; + } +} + +void PhishingDOMFeatureExtractor::HandleScript( + const blink::WebElement& element) { + ++page_feature_state_->num_script_tags; +} + +void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { + DCHECK(done_callback_.is_null()); + DCHECK(!cur_frame_data_.get()); + DCHECK(cur_document_.IsNull()); + if (!done_callback_.is_null() || cur_frame_data_.get() || + !cur_document_.IsNull()) { + LOG(ERROR) << "Extraction in progress, missing call to " + << "CancelPendingExtraction"; + } +} + +void PhishingDOMFeatureExtractor::RunCallback(bool success) { + // Record some timing stats that we can use to evaluate feature extraction + // performance. These include both successful and failed extractions. + DCHECK(page_feature_state_.get()); + UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.DOMFeatureIterations", + page_feature_state_->num_iterations); + UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", + clock_->Now() - page_feature_state_->start_time); + + DCHECK(!done_callback_.is_null()); + std::move(done_callback_).Run(success); + Clear(); +} + +void PhishingDOMFeatureExtractor::Clear() { + features_ = NULL; + done_callback_.Reset(); + cur_frame_data_.reset(NULL); + cur_document_.Reset(); +} + +void PhishingDOMFeatureExtractor::ResetFrameData() { + DCHECK(!cur_document_.IsNull()); + DCHECK(!cur_frame_data_.get()); + + cur_frame_data_.reset(new FrameData()); + cur_frame_data_->elements = cur_document_.All(); + cur_frame_data_->domain = + net::registry_controlled_domains::GetDomainAndRegistry( + cur_document_.Url(), + net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); +} + +blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { + DCHECK(!cur_document_.IsNull()); + blink::WebFrame* frame = cur_document_.GetFrame(); + // Advance to the next frame that contains a document, with no wrapping. + if (frame) { + for (frame = frame->TraverseNext(); frame; frame = frame->TraverseNext()) { + // TODO(dcheng): Verify if the WebDocument::IsNull check is really needed. + if (frame->IsWebLocalFrame() && + !frame->ToWebLocalFrame()->GetDocument().IsNull()) { + return frame->ToWebLocalFrame()->GetDocument(); + } + } + } else { + // Keep track of how often frame traversal got "stuck" due to the + // current subdocument getting removed from the frame tree. + UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.DOMFeatureFrameRemoved", 1); + } + return blink::WebDocument(); +} + +bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, + std::string* domain) const { + DCHECK(domain); + DCHECK(cur_frame_data_.get()); + + if (cur_frame_data_->domain.empty()) { + return false; + } + + // TODO(bryner): Ensure that the url encoding is consistent with the features + // in the model. + if (url.HostIsIPAddress()) { + domain->assign(url.host()); + } else { + domain->assign(net::registry_controlled_domains::GetDomainAndRegistry( + url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES)); + } + + return !domain->empty() && *domain != cur_frame_data_->domain; +} + +blink::WebURL PhishingDOMFeatureExtractor::CompleteURL( + const blink::WebElement& element, + const blink::WebString& partial_url) { + return element.GetDocument().CompleteURL(partial_url); +} + +void PhishingDOMFeatureExtractor::InsertFeatures() { + DCHECK(page_feature_state_.get()); + + if (page_feature_state_->total_links > 0) { + // Add a feature for the fraction of times the page links to an external + // domain vs. an internal domain. + double link_freq = static_cast<double>( + page_feature_state_->external_links) / + page_feature_state_->total_links; + features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); + + // Add a feature for each unique domain that we're linking to + for (const auto& domain : page_feature_state_->external_domains) { + features_->AddBooleanFeature(features::kPageLinkDomain + domain); + } + + // Fraction of links that use https. + double secure_freq = static_cast<double>( + page_feature_state_->secure_links) / page_feature_state_->total_links; + features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); + } + + // Record whether forms appear and whether various form elements appear. + if (page_feature_state_->num_forms > 0) { + features_->AddBooleanFeature(features::kPageHasForms); + } + if (page_feature_state_->num_text_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasTextInputs); + } + if (page_feature_state_->num_pswd_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasPswdInputs); + } + if (page_feature_state_->num_radio_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasRadioInputs); + } + if (page_feature_state_->num_check_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasCheckInputs); + } + + // Record fraction of form actions that point to a different domain. + if (page_feature_state_->total_actions > 0) { + double action_freq = static_cast<double>( + page_feature_state_->action_other_domain) / + page_feature_state_->total_actions; + features_->AddRealFeature(features::kPageActionOtherDomainFreq, + action_freq); + } + + // Add a feature for each unique external action url. + for (const auto& url : page_feature_state_->page_action_urls) { + features_->AddBooleanFeature(features::kPageActionURL + url); + } + + // Record how many image src attributes point to a different domain. + if (page_feature_state_->total_imgs > 0) { + double img_freq = static_cast<double>( + page_feature_state_->img_other_domain) / + page_feature_state_->total_imgs; + features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); + } + + // Record number of script tags (discretized for numerical stability.) + if (page_feature_state_->num_script_tags > 1) { + features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); + if (page_feature_state_->num_script_tags > 6) { + features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); + } + } +} + +} // namespace safe_browsing |