// chrome/renderer/safe_browsing/phishing_classifier.h

// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This class handles the process of extracting all of the features from a
// page and computing a phishyness score.  For more details, see
// phishing_*_feature_extractor.h, scorer.h, and client_model.proto.

#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_

#include "base/basictypes.h"
#include "base/callback.h"
#include "base/scoped_ptr.h"
#include "base/string16.h"
#include "base/task.h"

class RenderView;

namespace safe_browsing {
// Forward declarations.  This header only refers to these types through raw
// pointers and scoped_ptr members, so their full definitions are not needed
// here.
class FeatureExtractorClock;
class FeatureMap;
class PhishingDOMFeatureExtractor;
class PhishingTermFeatureExtractor;
class PhishingUrlFeatureExtractor;
class Scorer;

// Drives client-side phishing classification for a single RenderView: it
// extracts URL, DOM and term features from a loaded page, computes a
// phishyness score with a Scorer, and reports the verdict through a
// DoneCallback.  Copying and assignment are disallowed.
class PhishingClassifier {
 public:
  // Callback to be run when phishing classification finishes.  If the first
  // argument is true, the page is considered phishy by the client-side model,
  // and the browser should ping back to get a final verdict.  The second
  // argument gives the phishyness score which is used in the pingback,
  // or kInvalidScore if classification failed.
  typedef Callback2<bool /* phishy */, double /* phishy_score */>::Type
    DoneCallback;

  // Sentinel score handed to the DoneCallback when classification failed
  // (see RunFailureCallback()).  The value itself is defined outside this
  // header.
  static const double kInvalidScore;

  // Creates a new PhishingClassifier object that will operate on
  // |render_view|.  |scorer| will be used for computing the final score, and
  // must live at least as long as the PhishingClassifier.  |clock| is used to
  // time feature extractor operations, and the PhishingClassifier takes
  // ownership of this object.
  PhishingClassifier(RenderView* render_view,
                     const Scorer* scorer,
                     FeatureExtractorClock* clock);
  ~PhishingClassifier();

  // Called by the RenderView when a page has finished loading.  This begins
  // the feature extraction and scoring process. |page_text| should contain
  // the plain text of a web page, including any subframes, as returned by
  // RenderView::CaptureText().  |page_text| is owned by the caller, and must
  // not be destroyed until either |callback| is run or
  // CancelPendingClassification() is called.
  //
  // To avoid blocking the render thread for too long, phishing classification
  // may run in several chunks of work, posting a task to the current
  // MessageLoop to continue processing.  Once the scoring process is complete,
  // |callback| is run on the current thread.  PhishingClassifier takes
  // ownership of |callback|.
  void BeginClassification(const string16* page_text, DoneCallback* callback);

  // Called by the RenderView (on the render thread) when a page is unloading
  // or the RenderView is being destroyed.  This cancels any extraction that
  // is in progress.
  void CancelPendingClassification();

 private:
  // Any score equal to or above this value is considered phishy.  The value
  // itself is defined outside this header.
  static const double kPhishyThreshold;

  // Begins the feature extraction process, by extracting URL features and
  // beginning DOM feature extraction.
  void BeginFeatureExtraction();

  // Callback to be run when DOM feature extraction is complete.
  // If it was successful, begins term feature extraction, otherwise
  // runs the DoneCallback with a non-phishy verdict.
  void DOMExtractionFinished(bool success);

  // Callback to be run when term feature extraction is complete.
  // If it was successful, computes a score and runs the DoneCallback.
  // If extraction was unsuccessful, runs the DoneCallback with a
  // non-phishy verdict.
  void TermExtractionFinished(bool success);

  // Helper to verify that there is no pending phishing classification.  Dies
  // in debug builds if the state is not as expected.  This is a no-op in
  // release builds.
  void CheckNoPendingClassification();

  // Helper method to run the DoneCallback and clear the state.
  // |phishy| and |phishy_score| are the two DoneCallback arguments.
  void RunCallback(bool phishy, double phishy_score);

  // Helper to run the DoneCallback when feature extraction has failed.
  // This always signals a non-phishy verdict for the page, with kInvalidScore.
  void RunFailureCallback();

  // Clears the current state of the PhishingClassifier.
  void Clear();

  // Long-lived collaborators, supplied at construction or owned by this
  // object.
  RenderView* render_view_;  // owns us
  const Scorer* scorer_;  // owned by the caller
  // Clock passed to the constructor; owned by this object.
  scoped_ptr<FeatureExtractorClock> clock_;
  // One extractor per feature category (URL, DOM, page terms); all owned by
  // this object.
  scoped_ptr<PhishingUrlFeatureExtractor> url_extractor_;
  scoped_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
  scoped_ptr<PhishingTermFeatureExtractor> term_extractor_;

  // State for any in-progress extraction.
  scoped_ptr<FeatureMap> features_;
  const string16* page_text_;  // owned by the caller
  // Callback handed to BeginClassification(); owned by this object.
  scoped_ptr<DoneCallback> done_callback_;

  // Used to create BeginFeatureExtraction tasks.
  // These tasks are revoked if classification is cancelled.
  // NOTE(review): presumably declared last on purpose so that it is destroyed
  // before the other members -- confirm before reordering members.
  ScopedRunnableMethodFactory<PhishingClassifier> method_factory_;

  DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
};

}  // namespace safe_browsing

#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_