summaryrefslogtreecommitdiff
path: root/chromium/chrome/renderer/safe_browsing/phishing_classifier.h
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/chrome/renderer/safe_browsing/phishing_classifier.h')
-rw-r--r--chromium/chrome/renderer/safe_browsing/phishing_classifier.h155
1 files changed, 155 insertions, 0 deletions
diff --git a/chromium/chrome/renderer/safe_browsing/phishing_classifier.h b/chromium/chrome/renderer/safe_browsing/phishing_classifier.h
new file mode 100644
index 00000000000..9528c434cce
--- /dev/null
+++ b/chromium/chrome/renderer/safe_browsing/phishing_classifier.h
@@ -0,0 +1,155 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This class handles the process of extracting all of the features from a
+// page and computing a phishiness score. The basic steps are:
+// - Run each feature extractor over the page, building up a FeatureMap of
+// feature -> value.
+// - SHA-256 hash all of the feature names in the map so that they match the
+// supplied model.
+// - Hand the hashed map off to a Scorer, which computes the probability that
+// the page is phishy.
+// - Run the supplied callback with the resulting verdict (phishy or not).
+//
+// For more details, see phishing_*_feature_extractor.h, scorer.h, and
+// client_model.proto.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
+#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
+
+#include <stdint.h>
+
+#include <memory>
+#include <set>
+
+#include "base/callback.h"
+#include "base/macros.h"
+#include "base/memory/weak_ptr.h"
+#include "base/strings/string16.h"
+
+namespace content {
+class RenderFrame;
+}
+
+namespace safe_browsing {
+class ClientPhishingRequest;
+class FeatureExtractorClock;
+class FeatureMap;
+class PhishingDOMFeatureExtractor;
+class PhishingTermFeatureExtractor;
+class PhishingUrlFeatureExtractor;
+class Scorer;
+
+class PhishingClassifier {
+ public:
+ // Callback to be run when phishing classification finishes. The verdict
+ // is a ClientPhishingRequest which contains the verdict computed by the
+ // classifier as well as the extracted features. If the verdict.is_phishing()
+ // is true, the page is considered phishy by the client-side model,
+ // and the browser should ping back to get a final verdict. The
+ // verdict.client_score() is set to kInvalidScore if classification failed.
+ typedef base::OnceCallback<void(const ClientPhishingRequest& /* verdict */)>
+ DoneCallback;
+
+ static const float kInvalidScore;
+
+ // Creates a new PhishingClassifier object that will operate on
+ // |render_frame|. |clock| is used to time feature extractor operations;
+ // the PhishingClassifier takes ownership of |clock|. Note that the
+ // classifier will not be 'ready' until set_phishing_scorer() is called.
+ PhishingClassifier(content::RenderFrame* render_frame,
+ FeatureExtractorClock* clock);
+ virtual ~PhishingClassifier();
+
+ // Sets a scorer for the classifier to use in computing the phishiness score.
+ // |scorer| must live at least as long as the PhishingClassifier. The caller
+ // is expected to cancel any pending classification before setting a phishing
+ // scorer.
+ void set_phishing_scorer(const Scorer* scorer);
+
+ // Returns true if the classifier is ready to classify pages, i.e. it
+ // has had a scorer set via set_phishing_scorer().
+ bool is_ready() const;
+
+ // Called by the RenderView when a page has finished loading. This begins
+ // the feature extraction and scoring process. |page_text| should contain
+ // the plain text of a web page, including any subframes, as returned by
+ // RenderView::CaptureText(). |page_text| is owned by the caller, and must
+ // not be destroyed until either |callback| is run or
+ // CancelPendingClassification() is called.
+ //
+ // To avoid blocking the render thread for too long, phishing classification
+ // may run in several chunks of work, posting a task to the current
+ // MessageLoop to continue processing. Once the scoring process is complete,
+ // |callback| is run on the current thread. PhishingClassifier takes
+ // ownership of the callback.
+ //
+ // It is an error to call BeginClassification if the classifier is not yet
+ // ready.
+ virtual void BeginClassification(const base::string16* page_text,
+ DoneCallback callback);
+
+ // Called by the RenderView (on the render thread) when a page is unloading
+ // or the RenderView is being destroyed. This cancels any extraction that
+ // is in progress. It is an error to call CancelPendingClassification if
+ // the classifier is not yet ready.
+ virtual void CancelPendingClassification();
+
+ private:
+ // Any score equal to or above this value is considered phishy.
+ static const float kPhishyThreshold;
+
+ // Begins the feature extraction process, by extracting URL features and
+ // beginning DOM feature extraction.
+ void BeginFeatureExtraction();
+
+ // Callback to be run when DOM feature extraction is complete.
+ // If it was successful, begins term feature extraction, otherwise
+ // runs the DoneCallback with a non-phishy verdict.
+ void DOMExtractionFinished(bool success);
+
+ // Callback to be run when term feature extraction is complete.
+ // If it was successful, computes a score and runs the DoneCallback.
+ // If extraction was unsuccessful, runs the DoneCallback with a
+ // non-phishy verdict.
+ void TermExtractionFinished(bool success);
+
+ // Helper to verify that there is no pending phishing classification. Dies
+ // in debug builds if the state is not as expected. This is a no-op in
+ // release builds.
+ void CheckNoPendingClassification();
+
+ // Helper method to run the DoneCallback and clear the state.
+ void RunCallback(const ClientPhishingRequest& verdict);
+
+ // Helper to run the DoneCallback when feature extraction has failed.
+ // This always signals a non-phishy verdict for the page, with kInvalidScore.
+ void RunFailureCallback();
+
+ // Clears the current state of the PhishingClassifier.
+ void Clear();
+
+ content::RenderFrame* render_frame_; // owns us
+ const Scorer* scorer_; // owned by the caller
+ std::unique_ptr<FeatureExtractorClock> clock_;
+ std::unique_ptr<PhishingUrlFeatureExtractor> url_extractor_;
+ std::unique_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
+ std::unique_ptr<PhishingTermFeatureExtractor> term_extractor_;
+
+ // State for any in-progress extraction.
+ std::unique_ptr<FeatureMap> features_;
+ std::unique_ptr<std::set<uint32_t>> shingle_hashes_;
+ const base::string16* page_text_; // owned by the caller
+ DoneCallback done_callback_;
+
+ // Used in scheduling BeginFeatureExtraction tasks.
+ // Weak pointers it vends are invalidated if classification is cancelled.
+ base::WeakPtrFactory<PhishingClassifier> weak_factory_{this};
+
+ DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
+};
+
+} // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_