summaryrefslogtreecommitdiff
path: root/chromium/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
blob: 2bea4a2ea43ba9803f73e4f0273da240edbec443 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <list>
#include <map>
#include <memory>
#include <unordered_set>
#include <utility>

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/location.h"
#include "base/logging.h"
#include "base/metrics/histogram_macros.h"
#include "base/single_thread_task_runner.h"
#include "base/strings/utf_string_conversions.h"
#include "base/threading/thread_task_runner_handle.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"

namespace safe_browsing {

// Upper bound, in milliseconds, on how long a single chunk of extraction work
// may run before ExtractFeaturesWithTimeout() stops and posts a task to
// continue later.
// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Number of words ExtractFeaturesWithTimeout() handles between two
// consecutive reads of the clock.
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
// NOTE(review): the text above talks about ~10 while the value below is 5;
// confirm which one is intended.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// Total time budget, in milliseconds, for one extraction measured from its
// start time.  Once it is used up the extraction gives up and reports failure
// to the done callback.
// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// Bookkeeping for one in-flight extraction.  An instance is created by
// ExtractFeatures() and released again by Clear().
struct PhishingTermFeatureExtractor::ExtractionState {
  // Recently seen words that may still be the beginning of a multi-word
  // term.  Every word is followed by a single space; HandleWord() trims the
  // front of this string based on max_words_per_term_.
  std::string previous_words;

  // The shingle currently being assembled; updated each time a word is
  // handled.
  std::string current_shingle;

  // Byte length of each word in current_shingle, in order.  Each entry counts
  // the trailing space as well, so the entries add up to
  // current_shingle.size().
  std::list<size_t> shingle_word_sizes;

  // Byte length of each word in previous_words, in order.  Each entry counts
  // the trailing space as well, so the entries add up to
  // previous_words.size().
  std::list<size_t> previous_word_sizes;

  // Word-break iterator over the page text.  Left null when initialization
  // fails, which callers must check for.
  std::unique_ptr<base::i18n::BreakIterator> iterator;

  // When feature extraction for the current page began.
  base::TimeTicks start_time;

  // How many chunks of work have been run for the current extraction.
  int num_iterations = 0;

  // Records |start_time_ticks| and tries to set up a word-break iterator over
  // |text|.  On failure an error is logged in debug builds and |iterator|
  // stays null.
  ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
      : start_time(start_time_ticks) {
    auto word_breaker = std::make_unique<base::i18n::BreakIterator>(
        text, base::i18n::BreakIterator::BREAK_WORD);

    if (!word_breaker->Init()) {
      DLOG(ERROR) << "failed to open iterator";
      return;
    }
    iterator = std::move(word_breaker);
  }
};

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const std::unordered_set<std::string>* page_term_hashes,
    const std::unordered_set<uint32_t>* page_word_hashes,
    size_t max_words_per_term,
    uint32_t murmurhash3_seed,
    size_t max_shingles_per_page,
    size_t shingle_size,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      max_shingles_per_page_(max_shingles_per_page),
      shingle_size_(shingle_size),
      clock_(clock) {
  Clear();
}

PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // Being destroyed in the middle of an extraction is a caller error: the
  // RenderView is expected to have called CancelPendingExtraction() already.
  // Verify that nothing is still pending.
  CheckNoPendingExtraction();
}

// Starts an asynchronous extraction of page-term features and shingle hashes
// from |page_text|.
//
// |page_text|, |features| and |shingle_hashes| are kept as raw pointers and
// are dereferenced by the chunks of work that run later, so they must stay
// valid until |done_callback| has run or CancelPendingExtraction() has been
// called.  No text is processed inside this call: the first chunk is posted
// as a task to the current thread's task runner.  |done_callback| is invoked
// with true when the whole text was processed and false when extraction gave
// up (iterator failure or timeout).
void PhishingTermFeatureExtractor::ExtractFeatures(
    const base::string16* page_text,
    FeatureMap* features,
    std::set<uint32_t>* shingle_hashes,
    DoneCallback done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  // One assignment per statement.  These used to be chained with a comma
  // operator, which only worked because of operator precedence.
  page_text_ = page_text;
  features_ = features;
  shingle_hashes_ = shingle_hashes;
  done_callback_ = std::move(done_callback);

  state_ = std::make_unique<ExtractionState>(*page_text_, clock_->Now());
  base::ThreadTaskRunnerHandle::Get()->PostTask(
      FROM_HERE,
      base::BindOnce(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                     weak_factory_.GetWeakPtr()));
}

void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Tasks that were posted with a weak pointer to this object become no-ops
  // once the weak pointers are invalidated.
  weak_factory_.InvalidateWeakPtrs();
  // Drop the stored pointers, the done callback (without running it) and the
  // per-extraction state.
  Clear();
}

void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator.get()) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  int num_words = 0;
  while (state_->iterator->Advance()) {
    if (state_->iterator->IsWord()) {
      const size_t start = state_->iterator->prev();
      const size_t length = state_->iterator->pos() - start;
      HandleWord(base::StringPiece16(page_text_->data() + start, length));
      ++num_words;
    }

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk.  If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::ThreadTaskRunnerHandle::Get()->PostTask(
            FROM_HERE,
            base::BindOnce(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  RunCallback(true);
}

// Processes a single word of page text.  The word is lowercased and converted
// to UTF-8, after which it is used for two independent purposes:
//   1. It is appended to the running shingle.  Once the shingle holds
//      shingle_size_ words its MurmurHash3 is recorded and the oldest word is
//      dropped from it.  The recorded set is kept at no more than
//      max_shingles_per_page_ entries by discarding the largest hash.
//   2. If the word's hash is in page_word_hashes_, the word by itself and
//      every n-gram formed with the remembered previous words is SHA-256
//      hashed; each hash found in page_term_hashes_ adds a boolean feature.
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  const std::string lowered = base::UTF16ToUTF8(base::i18n::ToLower(word));
  // Length of the word plus the single space that separates it from the next.
  const size_t size_with_space = lowered.size() + 1;

  // Part 1: shingle hashes.
  state_->current_shingle += lowered;
  state_->current_shingle += ' ';
  state_->shingle_word_sizes.push_back(size_with_space);
  if (state_->shingle_word_sizes.size() == shingle_size_) {
    shingle_hashes_->insert(
        MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    // Slide the window forward by removing the oldest word.
    state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    state_->shingle_word_sizes.pop_front();
  }
  // Enforce the per-page limit on the number of stored shingle hashes.
  if (shingle_hashes_->size() > max_shingles_per_page_) {
    // The set is ordered, so its last element is the largest hash.
    auto largest = shingle_hashes_->end();
    --largest;
    shingle_hashes_->erase(largest);
  }

  // Part 2: page terms.
  const uint32_t word_hash = MurmurHash3String(lowered, murmurhash3_seed_);

  // Most words are not part of any term; bail out early for those.
  if (page_word_hashes_->count(word_hash) == 0) {
    // No term can span this word, so the remembered words are of no use.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Maps the SHA-256 hash of each candidate term to the term's plaintext.
  std::map<std::string, std::string> candidates;
  candidates[crypto::SHA256HashString(lowered)] = lowered;

  // Build the longer candidates: start with all remembered words plus the new
  // one and repeatedly strip the leading word.  The new word's size has not
  // been pushed onto previous_word_sizes yet, so the loop stops before it
  // would reach the bare word, which is already in |candidates|.
  state_->previous_words.append(lowered);
  std::string ngram = state_->previous_words;
  for (const size_t leading_word_size : state_->previous_word_sizes) {
    candidates[crypto::SHA256HashString(ngram)] = ngram;
    ngram.erase(0, leading_word_size);
  }

  // Emit a feature for every candidate whose hash is a known page term.
  for (const auto& candidate : candidates) {
    if (page_term_hashes_->count(candidate.first) != 0) {
      features_->AddBooleanFeature(features::kPageTerm + candidate.second);
    }
  }

  // The current word now becomes a "previous" word: terminate it with a space
  // and remember its size including that space.  The document's language may
  // not separate words with ASCII spaces; that does not matter as long as
  // this matches the way the model was generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(size_with_space);

  // Limit how many previous words are remembered.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}

// Sanity check that no extraction is in flight, i.e. there is neither a
// stored done callback nor any extraction state.  A violation trips a DCHECK
// and is additionally reported through LOG(ERROR).
void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!state_.get());

  const bool extraction_pending = state_.get() || !done_callback_.is_null();
  if (!extraction_pending)
    return;

  LOG(ERROR) << "Extraction in progress, missing call to "
                "CancelPendingExtraction";
}

void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance.  These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS_1M("SBClientPhishing.TermFeatureIterations",
                          state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  std::move(done_callback_).Run(success);
  Clear();
}

// Returns the extractor to its idle condition: the caller-provided pointers
// are forgotten, the done callback is dropped without being run and the
// per-extraction state is destroyed.
void PhishingTermFeatureExtractor::Clear() {
  page_text_ = nullptr;
  features_ = nullptr;
  shingle_hashes_ = nullptr;
  done_callback_.Reset();
  state_.reset();
}

}  // namespace safe_browsing