summaryrefslogtreecommitdiff
path: root/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc')
-rw-r--r--chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc504
1 files changed, 504 insertions, 0 deletions
diff --git a/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
new file mode 100644
index 00000000000..f1a9fb37d82
--- /dev/null
+++ b/chromium/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
@@ -0,0 +1,504 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
+
+#include <utility>
+
+#include "base/bind.h"
+#include "base/compiler_specific.h"
+#include "base/location.h"
+#include "base/logging.h"
+#include "base/metrics/histogram_macros.h"
+#include "base/single_thread_task_runner.h"
+#include "base/strings/string_util.h"
+#include "base/threading/thread_task_runner_handle.h"
+#include "base/time/time.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "content/public/renderer/render_view.h"
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
+#include "third_party/blink/public/platform/web_string.h"
+#include "third_party/blink/public/web/web_element.h"
+#include "third_party/blink/public/web/web_element_collection.h"
+#include "third_party/blink/public/web/web_local_frame.h"
+#include "third_party/blink/public/web/web_view.h"
+
+namespace safe_browsing {
+
+// This time should be short enough that it doesn't noticeably disrupt the
+// user's interaction with the page.
+const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
+
+// Experimenting shows that we get a reasonable gain in performance by
+// increasing this up to around 10, but there's not much benefit in
+// increasing it past that.
+const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
+
+// This should be longer than we expect feature extraction to take on any
+// actual phishing page.
+const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
+
+// Intermediate state used for computing features. See features.h for
+// descriptions of the DOM features that are computed.
+struct PhishingDOMFeatureExtractor::PageFeatureState {
+ // Link related features
+ int external_links;
+ std::unordered_set<std::string> external_domains;
+ int secure_links;
+ int total_links;
+
+ // Form related features
+ int num_forms;
+ int num_text_inputs;
+ int num_pswd_inputs;
+ int num_radio_inputs;
+ int num_check_inputs;
+ int action_other_domain;
+ int total_actions;
+ std::unordered_set<std::string> page_action_urls;
+
+ // Image related features
+ int img_other_domain;
+ int total_imgs;
+
+ // How many script tags
+ int num_script_tags;
+
+ // The time at which we started feature extraction for the current page.
+ base::TimeTicks start_time;
+
+ // The number of iterations we've done for the current extraction.
+ int num_iterations;
+
+ explicit PageFeatureState(base::TimeTicks start_time_ticks)
+ : external_links(0),
+ secure_links(0),
+ total_links(0),
+ num_forms(0),
+ num_text_inputs(0),
+ num_pswd_inputs(0),
+ num_radio_inputs(0),
+ num_check_inputs(0),
+ action_other_domain(0),
+ total_actions(0),
+ img_other_domain(0),
+ total_imgs(0),
+ num_script_tags(0),
+ start_time(start_time_ticks),
+ num_iterations(0) {}
+
+ ~PageFeatureState() {}
+};
+
+// Per-frame state
+struct PhishingDOMFeatureExtractor::FrameData {
+ // This is our reference to document.all, which is an iterator over all
+ // of the elements in the document. It keeps track of our current position.
+ blink::WebElementCollection elements;
+ // The domain of the document URL, stored here so that we don't need to
+ // recompute it every time it's needed.
+ std::string domain;
+};
+
+PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
+ FeatureExtractorClock* clock)
+ : clock_(clock) {
+ Clear();
+}
+
+PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
+ // The RenderView should have called CancelPendingExtraction() before
+ // we are destroyed.
+ CheckNoPendingExtraction();
+}
+
+void PhishingDOMFeatureExtractor::ExtractFeatures(blink::WebDocument document,
+ FeatureMap* features,
+ DoneCallback done_callback) {
+ // The RenderView should have called CancelPendingExtraction() before
+ // starting a new extraction, so DCHECK this.
+ CheckNoPendingExtraction();
+ // However, in an opt build, we will go ahead and clean up the pending
+ // extraction so that we can start in a known state.
+ CancelPendingExtraction();
+
+ features_ = features;
+ done_callback_ = std::move(done_callback);
+
+ page_feature_state_.reset(new PageFeatureState(clock_->Now()));
+ cur_document_ = document;
+
+ base::ThreadTaskRunnerHandle::Get()->PostTask(
+ FROM_HERE,
+ base::BindOnce(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
+ weak_factory_.GetWeakPtr()));
+}
+
+void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
+ // Cancel any pending callbacks, and clear our state.
+ weak_factory_.InvalidateWeakPtrs();
+ Clear();
+}
+
+void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
+ DCHECK(page_feature_state_.get());
+ ++page_feature_state_->num_iterations;
+ base::TimeTicks current_chunk_start_time = clock_->Now();
+
+ if (cur_document_.IsNull()) {
+ // This will only happen if we weren't able to get the document for the
+ // main frame. We'll treat this as an extraction failure.
+ RunCallback(false);
+ return;
+ }
+
+ int num_elements = 0;
+ for (; !cur_document_.IsNull(); cur_document_ = GetNextDocument()) {
+ blink::WebElement cur_element;
+ if (cur_frame_data_.get()) {
+ // We're resuming traversal of a frame, so just advance to the next
+ // element.
+ cur_element = cur_frame_data_->elements.NextItem();
+ // When we resume the traversal, the first call to nextItem() potentially
+ // has to walk through the document again from the beginning, if it was
+ // modified between our chunks of work. Log how long this takes, so we
+ // can tell if it's too slow.
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
+ clock_->Now() - current_chunk_start_time);
+ } else {
+ // We just moved to a new frame, so update our frame state
+ // and advance to the first element.
+ ResetFrameData();
+ cur_element = cur_frame_data_->elements.FirstItem();
+ }
+
+ for (; !cur_element.IsNull();
+ cur_element = cur_frame_data_->elements.NextItem()) {
+ if (cur_element.HasHTMLTagName("a")) {
+ HandleLink(cur_element);
+ } else if (cur_element.HasHTMLTagName("form")) {
+ HandleForm(cur_element);
+ } else if (cur_element.HasHTMLTagName("img")) {
+ HandleImage(cur_element);
+ } else if (cur_element.HasHTMLTagName("input")) {
+ HandleInput(cur_element);
+ } else if (cur_element.HasHTMLTagName("script")) {
+ HandleScript(cur_element);
+ }
+
+ if (++num_elements >= kClockCheckGranularity) {
+ num_elements = 0;
+ base::TimeTicks now = clock_->Now();
+ if (now - page_feature_state_->start_time >=
+ base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
+ DLOG(ERROR) << "Feature extraction took too long, giving up";
+ // We expect this to happen infrequently, so record when it does.
+ UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.DOMFeatureTimeout", 1);
+ RunCallback(false);
+ return;
+ }
+ base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
+ if (chunk_elapsed >=
+ base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
+ // The time limit for the current chunk is up, so post a task to
+ // continue extraction.
+ //
+ // Record how much time we actually spent on the chunk. If this is
+ // much higher than kMaxTimePerChunkMs, we may need to adjust the
+ // clock granularity.
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
+ chunk_elapsed);
+ base::ThreadTaskRunnerHandle::Get()->PostTask(
+ FROM_HERE,
+ base::BindOnce(
+ &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
+ weak_factory_.GetWeakPtr()));
+ return;
+ }
+ // Otherwise, continue.
+ }
+ }
+
+ // We're done with this frame, recalculate the FrameData when we
+ // advance to the next frame.
+ cur_frame_data_.reset();
+ }
+
+ InsertFeatures();
+ RunCallback(true);
+}
+
+void PhishingDOMFeatureExtractor::HandleLink(
+ const blink::WebElement& element) {
+ // Count the number of times we link to a different host.
+ if (!element.HasAttribute("href")) {
+ DVLOG(1) << "Skipping anchor tag with no href";
+ return;
+ }
+
+ // Retrieve the link and resolve the link in case it's relative.
+ blink::WebURL full_url = CompleteURL(element, element.GetAttribute("href"));
+
+ std::string domain;
+ bool is_external = IsExternalDomain(full_url, &domain);
+ if (domain.empty()) {
+ DVLOG(1) << "Could not extract domain from link: " << full_url;
+ return;
+ }
+
+ if (is_external) {
+ ++page_feature_state_->external_links;
+
+ // Record each unique domain that we link to.
+ page_feature_state_->external_domains.insert(domain);
+ }
+
+ // Check how many are https links.
+ if (GURL(full_url).SchemeIs("https")) {
+ ++page_feature_state_->secure_links;
+ }
+
+ ++page_feature_state_->total_links;
+}
+
+void PhishingDOMFeatureExtractor::HandleForm(
+ const blink::WebElement& element) {
+ // Increment the number of forms on this page.
+ ++page_feature_state_->num_forms;
+
+ // Record whether the action points to a different domain.
+ if (!element.HasAttribute("action")) {
+ return;
+ }
+
+ blink::WebURL full_url = CompleteURL(element, element.GetAttribute("action"));
+
+ page_feature_state_->page_action_urls.insert(full_url.GetString().Utf8());
+
+ std::string domain;
+ bool is_external = IsExternalDomain(full_url, &domain);
+ if (domain.empty()) {
+ DVLOG(1) << "Could not extract domain from form action: " << full_url;
+ return;
+ }
+
+ if (is_external) {
+ ++page_feature_state_->action_other_domain;
+ }
+ ++page_feature_state_->total_actions;
+}
+
+void PhishingDOMFeatureExtractor::HandleImage(
+ const blink::WebElement& element) {
+ if (!element.HasAttribute("src")) {
+ DVLOG(1) << "Skipping img tag with no src";
+ }
+
+ // Record whether the image points to a different domain.
+ blink::WebURL full_url = CompleteURL(element, element.GetAttribute("src"));
+ std::string domain;
+ bool is_external = IsExternalDomain(full_url, &domain);
+ if (domain.empty()) {
+ DVLOG(1) << "Could not extract domain from image src: " << full_url;
+ return;
+ }
+
+ if (is_external) {
+ ++page_feature_state_->img_other_domain;
+ }
+ ++page_feature_state_->total_imgs;
+}
+
+void PhishingDOMFeatureExtractor::HandleInput(
+ const blink::WebElement& element) {
+ // The HTML spec says that if the type is unspecified, it defaults to text.
+ // In addition, any unrecognized type will be treated as a text input.
+ //
+ // Note that we use the attribute value rather than
+ // WebFormControlElement::formControlType() for consistency with the
+ // way the phishing classification model is created.
+ std::string type = base::ToLowerASCII(element.GetAttribute("type").Utf8());
+ if (type == "password") {
+ ++page_feature_state_->num_pswd_inputs;
+ } else if (type == "radio") {
+ ++page_feature_state_->num_radio_inputs;
+ } else if (type == "checkbox") {
+ ++page_feature_state_->num_check_inputs;
+ } else if (type != "submit" && type != "reset" && type != "file" &&
+ type != "hidden" && type != "image" && type != "button") {
+ // Note that there are a number of new input types in HTML5 that are not
+ // handled above. For now, we will consider these as text inputs since
+ // they could be used to capture user input.
+ ++page_feature_state_->num_text_inputs;
+ }
+}
+
+void PhishingDOMFeatureExtractor::HandleScript(
+ const blink::WebElement& element) {
+ ++page_feature_state_->num_script_tags;
+}
+
+void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
+ DCHECK(done_callback_.is_null());
+ DCHECK(!cur_frame_data_.get());
+ DCHECK(cur_document_.IsNull());
+ if (!done_callback_.is_null() || cur_frame_data_.get() ||
+ !cur_document_.IsNull()) {
+ LOG(ERROR) << "Extraction in progress, missing call to "
+ << "CancelPendingExtraction";
+ }
+}
+
+void PhishingDOMFeatureExtractor::RunCallback(bool success) {
+ // Record some timing stats that we can use to evaluate feature extraction
+ // performance. These include both successful and failed extractions.
+ DCHECK(page_feature_state_.get());
+ UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.DOMFeatureIterations",
+ page_feature_state_->num_iterations);
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
+ clock_->Now() - page_feature_state_->start_time);
+
+ DCHECK(!done_callback_.is_null());
+ std::move(done_callback_).Run(success);
+ Clear();
+}
+
+void PhishingDOMFeatureExtractor::Clear() {
+ features_ = NULL;
+ done_callback_.Reset();
+ cur_frame_data_.reset(NULL);
+ cur_document_.Reset();
+}
+
+void PhishingDOMFeatureExtractor::ResetFrameData() {
+ DCHECK(!cur_document_.IsNull());
+ DCHECK(!cur_frame_data_.get());
+
+ cur_frame_data_.reset(new FrameData());
+ cur_frame_data_->elements = cur_document_.All();
+ cur_frame_data_->domain =
+ net::registry_controlled_domains::GetDomainAndRegistry(
+ cur_document_.Url(),
+ net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
+}
+
+blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
+ DCHECK(!cur_document_.IsNull());
+ blink::WebFrame* frame = cur_document_.GetFrame();
+ // Advance to the next frame that contains a document, with no wrapping.
+ if (frame) {
+ for (frame = frame->TraverseNext(); frame; frame = frame->TraverseNext()) {
+ // TODO(dcheng): Verify if the WebDocument::IsNull check is really needed.
+ if (frame->IsWebLocalFrame() &&
+ !frame->ToWebLocalFrame()->GetDocument().IsNull()) {
+ return frame->ToWebLocalFrame()->GetDocument();
+ }
+ }
+ } else {
+ // Keep track of how often frame traversal got "stuck" due to the
+ // current subdocument getting removed from the frame tree.
+ UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.DOMFeatureFrameRemoved", 1);
+ }
+ return blink::WebDocument();
+}
+
+bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
+ std::string* domain) const {
+ DCHECK(domain);
+ DCHECK(cur_frame_data_.get());
+
+ if (cur_frame_data_->domain.empty()) {
+ return false;
+ }
+
+ // TODO(bryner): Ensure that the url encoding is consistent with the features
+ // in the model.
+ if (url.HostIsIPAddress()) {
+ domain->assign(url.host());
+ } else {
+ domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
+ url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
+ }
+
+ return !domain->empty() && *domain != cur_frame_data_->domain;
+}
+
+blink::WebURL PhishingDOMFeatureExtractor::CompleteURL(
+ const blink::WebElement& element,
+ const blink::WebString& partial_url) {
+ return element.GetDocument().CompleteURL(partial_url);
+}
+
+void PhishingDOMFeatureExtractor::InsertFeatures() {
+ DCHECK(page_feature_state_.get());
+
+ if (page_feature_state_->total_links > 0) {
+ // Add a feature for the fraction of times the page links to an external
+ // domain vs. an internal domain.
+ double link_freq = static_cast<double>(
+ page_feature_state_->external_links) /
+ page_feature_state_->total_links;
+ features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
+
+ // Add a feature for each unique domain that we're linking to
+ for (const auto& domain : page_feature_state_->external_domains) {
+ features_->AddBooleanFeature(features::kPageLinkDomain + domain);
+ }
+
+ // Fraction of links that use https.
+ double secure_freq = static_cast<double>(
+ page_feature_state_->secure_links) / page_feature_state_->total_links;
+ features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
+ }
+
+ // Record whether forms appear and whether various form elements appear.
+ if (page_feature_state_->num_forms > 0) {
+ features_->AddBooleanFeature(features::kPageHasForms);
+ }
+ if (page_feature_state_->num_text_inputs > 0) {
+ features_->AddBooleanFeature(features::kPageHasTextInputs);
+ }
+ if (page_feature_state_->num_pswd_inputs > 0) {
+ features_->AddBooleanFeature(features::kPageHasPswdInputs);
+ }
+ if (page_feature_state_->num_radio_inputs > 0) {
+ features_->AddBooleanFeature(features::kPageHasRadioInputs);
+ }
+ if (page_feature_state_->num_check_inputs > 0) {
+ features_->AddBooleanFeature(features::kPageHasCheckInputs);
+ }
+
+ // Record fraction of form actions that point to a different domain.
+ if (page_feature_state_->total_actions > 0) {
+ double action_freq = static_cast<double>(
+ page_feature_state_->action_other_domain) /
+ page_feature_state_->total_actions;
+ features_->AddRealFeature(features::kPageActionOtherDomainFreq,
+ action_freq);
+ }
+
+ // Add a feature for each unique external action url.
+ for (const auto& url : page_feature_state_->page_action_urls) {
+ features_->AddBooleanFeature(features::kPageActionURL + url);
+ }
+
+ // Record how many image src attributes point to a different domain.
+ if (page_feature_state_->total_imgs > 0) {
+ double img_freq = static_cast<double>(
+ page_feature_state_->img_other_domain) /
+ page_feature_state_->total_imgs;
+ features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
+ }
+
+ // Record number of script tags (discretized for numerical stability.)
+ if (page_feature_state_->num_script_tags > 1) {
+ features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+ if (page_feature_state_->num_script_tags > 6) {
+ features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
+ }
+ }
+}
+
+} // namespace safe_browsing