summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
blob: e587047a97672af7d1cacaf5283b98e901c7698f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <string>

#include "base/callback.h"
#include "base/hash_tables.h"
#include "base/memory/scoped_ptr.h"
#include "base/message_loop.h"
#include "base/sha2.h"
#include "base/string16.h"
#include "base/stringprintf.h"
#include "base/time.h"
#include "base/utf_string_conversions.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"

using ::testing::ContainerEq;
using ::testing::Return;

namespace safe_browsing {

class PhishingTermFeatureExtractorTest : public ::testing::Test {
 protected:
  virtual void SetUp() {
    base::hash_set<std::string> terms;
    terms.insert("one");
    terms.insert("one one");
    terms.insert("two");
    terms.insert("multi word test");
    terms.insert("capitalization");
    terms.insert("space");
    terms.insert("separator");
    terms.insert("punctuation");
    // Chinese (translation of "hello")
    terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
    // Chinese (translation of "goodbye")
    terms.insert("\xe5\x86\x8d\xe8\xa7\x81");

    for (base::hash_set<std::string>::iterator it = terms.begin();
         it != terms.end(); ++it) {
      term_hashes_.insert(base::SHA256HashString(*it));
    }

    base::hash_set<std::string> words;
    words.insert("one");
    words.insert("two");
    words.insert("multi");
    words.insert("word");
    words.insert("test");
    words.insert("capitalization");
    words.insert("space");
    words.insert("separator");
    words.insert("punctuation");
    words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
    words.insert("\xe5\x86\x8d\xe8\xa7\x81");

    for (base::hash_set<std::string>::iterator it = words.begin();
         it != words.end(); ++it) {
      word_hashes_.insert(base::SHA256HashString(*it));
    }

    extractor_.reset(new PhishingTermFeatureExtractor(
        &term_hashes_,
        &word_hashes_,
        3 /* max_words_per_term */,
        &clock_));
  }

  // Runs the TermFeatureExtractor on |page_text|, waiting for the
  // completion callback.  Returns the success boolean from the callback.
  bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
    success_ = false;
    extractor_->ExtractFeatures(
        page_text,
        features,
        NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone));
    msg_loop_.Run();
    return success_;
  }

  // Completion callback for feature extraction.
  void ExtractionDone(bool success) {
    success_ = success;
    msg_loop_.Quit();
  }

  MessageLoop msg_loop_;
  MockFeatureExtractorClock clock_;
  scoped_ptr<PhishingTermFeatureExtractor> extractor_;
  base::hash_set<std::string> term_hashes_;
  base::hash_set<std::string> word_hashes_;
  bool success_;  // holds the success value from ExtractFeatures
};

TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
  // Extraction timing is not under test here, so the mock clock reports one
  // fixed instant no matter how often it is queried.
  const base::TimeTicks frozen_time = base::TimeTicks::Now();
  EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(frozen_time));

  // Every expected feature name is this prefix followed by the term itself.
  const std::string term_prefix(features::kPageTerm);

  FeatureMap expected;  // starts out empty
  FeatureMap actual;

  // A page containing none of the known terms should yield no features.
  string16 text = ASCIIToUTF16("blah");
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));

  // A repeated word should match both the one-word and the two-word term.
  expected.Clear();
  actual.Clear();
  expected.AddBooleanFeature(term_prefix + "one");
  expected.AddBooleanFeature(term_prefix + "one one");
  text = ASCIIToUTF16("one one");
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));

  // A three-word term embedded between unrelated words.
  expected.Clear();
  actual.Clear();
  expected.AddBooleanFeature(term_prefix + "multi word test");
  text = ASCIIToUTF16("bla bla multi word test bla");
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));

  // All of the words of a term are present, but in the wrong order, so
  // nothing should match.
  expected.Clear();
  actual.Clear();
  text = ASCIIToUTF16("bla bla test word multi bla");
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));

  // Upper-case letters, a hyphen, a newline and punctuation marks should
  // not stop the individual terms from being found.
  expected.Clear();
  actual.Clear();
  expected.AddBooleanFeature(term_prefix + "capitalization");
  expected.AddBooleanFeature(term_prefix + "space");
  expected.AddBooleanFeature(term_prefix + "separator");
  expected.AddBooleanFeature(term_prefix + "punctuation");
  text = ASCIIToUTF16("Capitalization plus non-space\n"
                      "separator... punctuation!");
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));

  // Empty page text should succeed and produce no features.
  expected.Clear();
  actual.Clear();
  text = string16();
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));

  // The Chinese phrase "hello goodbye", written without any spaces.  Both
  // terms should still be found, which checks that terms are separated
  // correctly in languages that don't put spaces between words.
  expected.Clear();
  actual.Clear();
  expected.AddBooleanFeature(term_prefix + "\xe4\xbd\xa0\xe5\xa5\xbd");
  expected.AddBooleanFeature(term_prefix + "\xe5\x86\x8d\xe8\xa7\x81");
  text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));
}

TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
  // In this test the mock clock is stepped forward between reads so that the
  // extraction is forced to run over several iterations.

  // The page is 30 words long: "one", the numbers 0 through 27, then "two".
  // Both terms are only reported if the entire text gets processed.
  string16 text = ASCIIToUTF16("one ");
  for (int n = 0; n != 28; ++n)
    text += ASCIIToUTF16(StringPrintf("%d ", n));
  text += ASCIIToUTF16("two");

  using base::TimeDelta;
  const base::TimeTicks start = base::TimeTicks::Now();

  // First run: the clock moves 15 ms per 10 words processed and 10 ms
  // between chunks.  This relies on kClockCheckGranularity = 10 and
  // kMaxTimePerChunkMs = 20.
  EXPECT_CALL(clock_, Now())
      // Read when extraction starts.
      .WillOnce(Return(start))
      // Read when the first chunk of work starts.
      .WillOnce(Return(start))
      // Read after words 1-10.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(15)))
      // Read after words 11-20.  The chunk time limit has now been
      // exceeded, so a continuation task will be posted.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(30)))
      // Read when the second chunk of work starts.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(40)))
      // Read after words 21-30.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(55)))
      // Last read, used for the histograms.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(56)));

  FeatureMap expected;
  expected.AddBooleanFeature(features::kPageTerm + std::string("one"));
  expected.AddBooleanFeature(features::kPageTerm + std::string("two"));

  FeatureMap actual;
  ASSERT_TRUE(ExtractFeatures(&text, &actual));
  EXPECT_THAT(actual.features(), ContainerEq(expected.features()));
  // Verify and drop the clock expectations above so that they cannot leak
  // into the second run below.
  ::testing::Mock::VerifyAndClearExpectations(&clock_);

  // Second run: same text, but the clock advances so quickly that the
  // extractor's maximum total time is exceeded and extraction should fail.
  // This relies on kMaxTotalTimeMs = 500.
  EXPECT_CALL(clock_, Now())
      // Read when extraction starts.
      .WillOnce(Return(start))
      // Read when the first chunk of work starts.
      .WillOnce(Return(start))
      // Read after words 1-10.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(300)))
      // Read when the second chunk of work starts.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(350)))
      // Read after words 11-20.  This is past the total time limit.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(600)))
      // Last read, used for the histograms.
      .WillOnce(Return(start + TimeDelta::FromMilliseconds(620)));

  actual.Clear();
  EXPECT_FALSE(ExtractFeatures(&text, &actual));
}

}  // namespace safe_browsing