1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingDOMFeatureExtractor handles computing DOM-based features for the
// client-side phishing detection model. These include the presence of various
// types of elements, ratios of external and secure links, and tokens for
// external domains linked to.
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#include <string>
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/scoped_ptr.h"
#include "base/task.h"
class GURL;
class RenderView;
namespace WebKit {
class WebElement;
class WebFrame;
}
namespace safe_browsing {
class FeatureExtractorClock;
class FeatureMap;
// Computes DOM-based features for the client-side phishing detection model
// from the page loaded in a RenderView. Extraction is started with
// ExtractFeatures(), may be split into several chunks of work posted to the
// current MessageLoop, and reports completion through a DoneCallback.
class PhishingDOMFeatureExtractor {
public:
// Callback to be run when feature extraction finishes. The callback
// argument is true if extraction was successful, false otherwise.
typedef Callback1<bool>::Type DoneCallback;
// Creates a PhishingDOMFeatureExtractor for the specified RenderView.
// The PhishingDOMFeatureExtractor should be destroyed prior to destroying
// the RenderView, since |render_view| is held as a non-owned pointer.
// |clock| is used for timing feature extractor operations, and may be
// mocked for testing. PhishingDOMFeatureExtractor takes ownership of the
// clock.
PhishingDOMFeatureExtractor(RenderView* render_view,
FeatureExtractorClock* clock);
~PhishingDOMFeatureExtractor();
// Begins extracting features into the given FeatureMap for the page
// currently loaded in this object's RenderView. To avoid blocking the
// render thread for too long, the feature extractor may run in several
// chunks of work, posting a task to the current MessageLoop to continue
// processing. Once feature extraction is complete, |done_callback|
// is run on the current thread.
//
// Ownership: the caller keeps ownership of |features|;
// PhishingDOMFeatureExtractor takes ownership of |done_callback|.
void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback);
// Cancels any pending feature extraction. The DoneCallback will not be run.
// Must be called if there is a feature extraction in progress when the page
// is unloaded or the PhishingDOMFeatureExtractor is destroyed.
void CancelPendingExtraction();
private:
// Forward declarations only; both structs are defined outside this header.
// FrameData is the type behind |cur_frame_data_| and PageFeatureState is the
// type behind |page_feature_state_|.
struct FrameData;
struct PageFeatureState;
// NOTE: the three constants below are declared here without values; their
// definitions live outside this header.
//
// The maximum amount of wall time that we will spend on a single extraction
// iteration before pausing to let other MessageLoop tasks run.
static const int kMaxTimePerChunkMs;
// The number of elements that we will process before checking to see whether
// kMaxTimePerChunkMs has elapsed. Since checking the current time can be
// slow, we don't do this on every element processed.
static const int kClockCheckGranularity;
// The maximum total amount of time that the feature extractor will run
// before giving up on the current page.
static const int kMaxTotalTimeMs;
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
// until a predefined maximum amount of time has elapsed, then posts a task
// to the current MessageLoop to continue extraction. When extraction
// finishes, calls RunCallback().
void ExtractFeaturesWithTimeout();
// Handlers for the various HTML elements that we compute features for.
// Since some of the features (such as ratios) cannot be computed until
// feature extraction is finished, these handlers do not add to the feature
// map directly. Instead, they update the values in the PageFeatureState.
// There is one handler per element kind: link, form, image, input, script.
void HandleLink(const WebKit::WebElement& element);
void HandleForm(const WebKit::WebElement& element);
void HandleImage(const WebKit::WebElement& element);
void HandleInput(const WebKit::WebElement& element);
void HandleScript(const WebKit::WebElement& element);
// Helper to verify that there is no pending feature extraction. Dies in
// debug builds if the state is not as expected. This is a no-op in release
// builds.
void CheckNoPendingExtraction();
// Runs |done_callback_| with |success| as its argument and then clears all
// internal state.
void RunCallback(bool success);
// Clears all internal feature extraction state.
void Clear();
// Called after advancing |cur_frame_| to update the state in
// |cur_frame_data_|. Returns true if the state was updated successfully.
bool ResetFrameData();
// Given a URL, checks whether the domain is different from the domain of
// the current frame's URL. If so, stores the domain in |domain| and returns
// true, otherwise returns false.
bool IsExternalDomain(const GURL& url, std::string* domain) const;
// Called once all frames have been processed to compute features from the
// PageFeatureState and add them to |features_|. See features.h for a
// description of which features are computed.
void InsertFeatures();
// Non-owned pointer to the view that we will extract features from.
RenderView* render_view_;
// Owned pointer to our clock (ownership is taken in the constructor).
scoped_ptr<FeatureExtractorClock> clock_;
// The output parameters from the most recent call to ExtractFeatures().
// |features_| is non-owned; |done_callback_| is owned by this object.
FeatureMap* features_; // The caller keeps ownership of this.
scoped_ptr<DoneCallback> done_callback_;
// Non-owned pointer to the current frame that we are processing.
WebKit::WebFrame* cur_frame_;
// Stores extra state for |cur_frame_| that will be persisted until we
// advance to the next frame.
scoped_ptr<FrameData> cur_frame_data_;
// Stores the intermediate data used to create features. This data is
// accumulated across all frames in the RenderView.
scoped_ptr<PageFeatureState> page_feature_state_;
// Used to create ExtractFeaturesWithTimeout tasks.
// These tasks are revoked if extraction is cancelled.
ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_;
// Instances are not meant to be copied or assigned.
DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
};
} // namespace safe_browsing
#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
|