summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
blob: e738dac850a8764a8406772f2515c9bb15816bfa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingDOMFeatureExtractor handles computing DOM-based features for the
// client-side phishing detection model.  These include the presence of various
// types of elements, ratios of external and secure links, and tokens for
// external domains linked to.

#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_

#include <string>

#include "base/callback.h"
#include "base/macros.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "third_party/WebKit/public/web/WebDocument.h"

class GURL;

namespace blink {
class WebElement;
}

namespace safe_browsing {
class FeatureExtractorClock;
class FeatureMap;

// Computes DOM-based features for the client-side phishing detection model.
// A single extraction walks the document handed to ExtractFeatures() (and,
// per GetNextDocument(), the following documents in frame-traversal order),
// accumulates intermediate counts in a PageFeatureState, and finally writes
// the resulting features into the caller-supplied FeatureMap.
class PhishingDOMFeatureExtractor {
 public:
  // Callback to be run when feature extraction finishes.  The callback
  // argument is true if extraction was successful, false otherwise.
  typedef base::Callback<void(bool)> DoneCallback;

  // Creates a PhishingDOMFeatureExtractor instance.
  // |clock| is used for timing feature extractor operations, and may be
  // mocked for testing.  The caller maintains ownership of the clock.
  explicit PhishingDOMFeatureExtractor(FeatureExtractorClock* clock);

  // Virtual because this class declares virtual methods (IsExternalDomain(),
  // CompleteURL()) and can therefore be subclassed; a virtual destructor keeps
  // deletion of a subclass instance through a base-class pointer well-defined.
  virtual ~PhishingDOMFeatureExtractor();

  // Begins extracting features into the given FeatureMap for the page.
  // To avoid blocking the render thread for too long, the feature extractor
  // may run in several chunks of work, posting a task to the current
  // MessageLoop to continue processing.  Once feature extraction is complete,
  // |done_callback| is run on the current thread.  PhishingDOMFeatureExtractor
  // takes ownership of the callback.  |features| stays owned by the caller
  // (see |features_|).
  void ExtractFeatures(blink::WebDocument document,
                       FeatureMap* features,
                       const DoneCallback& done_callback);

  // Cancels any pending feature extraction.  The DoneCallback will not be run.
  // Must be called if there is a feature extraction in progress when the page
  // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
  void CancelPendingExtraction();

 private:
  // Defined in the implementation file; see |cur_frame_data_| and
  // |page_feature_state_| below for what each one holds.
  struct FrameData;
  struct PageFeatureState;

  // The maximum amount of wall time that we will spend on a single extraction
  // iteration before pausing to let other MessageLoop tasks run.
  static const int kMaxTimePerChunkMs;

  // The number of elements that we will process before checking to see whether
  // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
  // slow, we don't do this on every element processed.
  static const int kClockCheckGranularity;

  // The maximum total amount of time that the feature extractor will run
  // before giving up on the current page.
  static const int kMaxTotalTimeMs;

  // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
  // until a predefined maximum amount of time has elapsed, then posts a task
  // to the current MessageLoop to continue extraction.  When extraction
  // finishes, calls RunCallback().
  void ExtractFeaturesWithTimeout();

  // Handlers for the various HTML elements that we compute features for.
  // Since some of the features (such as ratios) cannot be computed until
  // feature extraction is finished, these handlers do not add to the feature
  // map directly.  Instead, they update the values in the PageFeatureState.
  void HandleLink(const blink::WebElement& element);
  void HandleForm(const blink::WebElement& element);
  void HandleImage(const blink::WebElement& element);
  void HandleInput(const blink::WebElement& element);
  void HandleScript(const blink::WebElement& element);

  // Helper to verify that there is no pending feature extraction.  Dies in
  // debug builds if the state is not as expected.  This is a no-op in release
  // builds.
  void CheckNoPendingExtraction();

  // Runs |done_callback_| and then clears all internal state.
  void RunCallback(bool success);

  // Clears all internal feature extraction state.
  void Clear();

  // Called after advancing |cur_document_| to update the state in
  // |cur_frame_data_|.
  void ResetFrameData();

  // Returns the next document in frame-traversal order from cur_document_.
  // If there are no more documents, returns a null WebDocument.
  blink::WebDocument GetNextDocument();

  // Given a URL, checks whether the domain is different from the domain of
  // the current frame's URL.  If so, stores the domain in |domain| and returns
  // true, otherwise returns false.
  // Virtual so that a subclass can substitute its own implementation
  // (presumably for tests -- confirm against the subclasses in the tree).
  virtual bool IsExternalDomain(const GURL& url, std::string* domain) const;

  // Given a partial URL, extend it to a full url based on the current frame's
  // URL.
  // Virtual so that a subclass can substitute its own implementation
  // (presumably for tests -- confirm against the subclasses in the tree).
  virtual blink::WebURL CompleteURL(const blink::WebElement& element,
                                    const blink::WebString& partial_url);

  // Called once all frames have been processed to compute features from the
  // PageFeatureState and add them to |features_|.  See features.h for a
  // description of which features are computed.
  void InsertFeatures();

  // Non-owned pointer to our clock.
  FeatureExtractorClock* clock_;

  // The output parameters from the most recent call to ExtractFeatures().
  FeatureMap* features_;  // The caller keeps ownership of this.
  DoneCallback done_callback_;

  // The current (sub-)document that we are processing.  May be a null document
  // (isNull()) if we are not currently extracting features.
  blink::WebDocument cur_document_;

  // Stores extra state for |cur_document_| that will be persisted until we
  // advance to the next frame.
  scoped_ptr<FrameData> cur_frame_data_;

  // Stores the intermediate data used to create features.  This data is
  // accumulated across all frames in the RenderView.
  scoped_ptr<PageFeatureState> page_feature_state_;

  // Used in scheduling ExtractFeaturesWithTimeout tasks.
  // These pointers are invalidated if extraction is cancelled.
  // Keep this as the last data member: members are destroyed in reverse
  // declaration order, so the factory (and with it any outstanding weak
  // pointers) goes away before the rest of the state is torn down.
  base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_;

  DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
};

}  // namespace safe_browsing

#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_