diff options
Diffstat (limited to 'chromium/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h')
-rw-r--r-- | chromium/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/chromium/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chromium/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h new file mode 100644 index 00000000000..d0cc48d7033 --- /dev/null +++ b/chromium/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h @@ -0,0 +1,172 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// PhishingTermFeatureExtractor handles computing term features from the text +// of a web page for the client-side phishing detection model. To do this, it +// takes a list of terms that appear in the model, and scans through the page +// text looking for them. Any terms that appear will cause a corresponding +// features::kPageTerm feature to be added to the FeatureMap. +// +// To make it harder for a phisher to enumerate all of the relevant terms in +// the model, the terms are provided as SHA-256 hashes, rather than plain text. +// +// There is one PhishingTermFeatureExtractor per RenderView. + +#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ +#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ + +#include <stddef.h> +#include <stdint.h> + +#include <memory> +#include <set> +#include <string> +#include <unordered_set> + +#include "base/callback.h" +#include "base/macros.h" +#include "base/memory/weak_ptr.h" +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" + +namespace safe_browsing { +class FeatureExtractorClock; +class FeatureMap; + +class PhishingTermFeatureExtractor { + public: + // Callback to be run when feature extraction finishes. The callback + // argument is true if extraction was successful, false otherwise. + typedef base::OnceCallback<void(bool)> DoneCallback; + + // Creates a PhishingTermFeatureExtractor which will extract features for + // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These + // terms may be multi-word n-grams, with at most |max_words_per_term| words. + // + // |page_word_hashes| contains the murmur3 hashes for all of the individual + // words that make up the terms. Both sets of strings are UTF-8 encoded and + // lowercased prior to hashing. The caller owns both sets of strings, and + // must ensure that they are valid until the PhishingTermFeatureExtractor is + // destroyed. + // + // In addition to extracting page terms, we will also extract text shingling + // sketch, which consists of hashes of N-gram-words (referred to as shingles) + // in the page. |shingle_size| defines N, and |max_shingles_per_page| defines + // the maximum number of unique shingle hashes we extracted per page. + // + // |clock| is used for timing feature extractor operations, and may be mocked + // for testing. The caller keeps ownership of the clock. + PhishingTermFeatureExtractor( + const std::unordered_set<std::string>* page_term_hashes, + const std::unordered_set<uint32_t>* page_word_hashes, + size_t max_words_per_term, + uint32_t murmurhash3_seed, + size_t max_shingles_per_page, + size_t shingle_size, + FeatureExtractorClock* clock); + ~PhishingTermFeatureExtractor(); + + // Begins extracting features from |page_text| into the given FeatureMap. + // |page_text| should contain the plain text of a web page, including any + // subframes, as returned by RenderView::CaptureText(). + // + // To avoid blocking the render thread for too long, the feature extractor + // may run in several chunks of work, posting a task to the current + // MessageLoop to continue processing. Once feature extraction is complete, + // |done_callback| is run on the current thread. + // PhishingTermFeatureExtractor takes ownership of the callback. + // + // |page_text|, |features|, and |shingle_hashes| are owned by the caller, + // and must not be destroyed until either |done_callback| is run or + // CancelPendingExtraction() is called. + void ExtractFeatures(const base::string16* page_text, + FeatureMap* features, + std::set<uint32_t>* shingle_hashes, + DoneCallback done_callback); + + // Cancels any pending feature extraction. The DoneCallback will not be run. + // Must be called if there is a feature extraction in progress when the page + // is unloaded or the PhishingTermFeatureExtractor is destroyed. + void CancelPendingExtraction(); + + private: + struct ExtractionState; + + // The maximum amount of wall time that we will spend on a single extraction + // iteration before pausing to let other MessageLoop tasks run. + static const int kMaxTimePerChunkMs; + + // The number of words that we will process before checking to see whether + // kMaxTimePerChunkMs has elapsed. Since checking the current time can be + // slow, we don't do this on every word processed. + static const int kClockCheckGranularity; + + // The maximum total amount of time that the feature extractor will run + // before giving up on the current page. + static const int kMaxTotalTimeMs; + + // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs + // until a predefined maximum amount of time has elapsed, then posts a task + // to the current MessageLoop to continue extraction. When extraction + // finishes, calls RunCallback(). + void ExtractFeaturesWithTimeout(); + + // Handles a single word in the page text. + void HandleWord(const base::StringPiece16& word); + + // Helper to verify that there is no pending feature extraction. Dies in + // debug builds if the state is not as expected. This is a no-op in release + // builds. + void CheckNoPendingExtraction(); + + // Runs |done_callback_| and then clears all internal state. + void RunCallback(bool success); + + // Clears all internal feature extraction state. + void Clear(); + + // All of the term hashes that we are looking for in the page. + const std::unordered_set<std::string>* page_term_hashes_; + + // Murmur3 hashes of all the individual words in page_term_hashes_. If + // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ + // would contain (hashed) "one" and "two". We do this so that we can have a + // quick out in the common case that the current word we are processing + // doesn't contain any part of one of our terms. + const std::unordered_set<uint32_t>* page_word_hashes_; + + // The maximum number of words in an n-gram. + const size_t max_words_per_term_; + + // The seed for murmurhash3. + const uint32_t murmurhash3_seed_; + + // The maximum number of unique shingle hashes we extract in a page. + const size_t max_shingles_per_page_; + + // The number of words in a shingle. + const size_t shingle_size_; + + // Non-owned pointer to our clock. + FeatureExtractorClock* clock_; + + // The output parameters from the most recent call to ExtractFeatures(). + const base::string16* page_text_; // The caller keeps ownership of this. + FeatureMap* features_; // The caller keeps ownership of this. + std::set<uint32_t>* shingle_hashes_; + DoneCallback done_callback_; + + // Stores the current state of term extraction from |page_text_|. + std::unique_ptr<ExtractionState> state_; + + // Used in scheduling ExtractFeaturesWithTimeout tasks. + // These pointers are invalidated if extraction is cancelled. + base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_{this}; + + DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); +}; + +} // namespace safe_browsing + +#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ |