summaryrefslogtreecommitdiffstats
path: root/chrome/browser/safe_browsing/browser_feature_extractor.h
blob: 9a4ba3b1d4728eb261d8f19f53aeba3b538e4fb7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// BrowserFeatureExtractor computes various browser features for client-side
// phishing detection.  For now it does a bunch of lookups in the history
// service to see whether a particular URL has been visited before by the
// user.

#ifndef CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_
#define CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_

#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "base/basictypes.h"
#include "base/callback.h"
#include "base/hash_tables.h"
#include "base/memory/scoped_ptr.h"
#include "base/sequenced_task_runner_helpers.h"
#include "base/time.h"
#include "chrome/browser/common/cancelable_request.h"
#include "chrome/browser/history/history_types.h"
#include "chrome/browser/safe_browsing/safe_browsing_service.h"
#include "chrome/browser/safe_browsing/ui_manager.h"
#include "googleurl/src/gurl.h"

class HistoryService;

namespace content {
class WebContents;
}

namespace safe_browsing {
class ClientMalwareRequest;
class ClientPhishingRequest;
class ClientSideDetectionService;

// Maps a string key to a set of strings.  As used by BrowseInfo::ips, the key
// is an IP address (IPv4 or IPv6) and the value is the set of hosts seen on
// that address.
typedef std::map<std::string, std::set<std::string> > IPHostMap;

// Per-navigation information that is handed to BrowserFeatureExtractor
// (see ExtractFeatures() and ExtractMalwareFeatures()) as the input from
// which browse-related features are derived.
struct BrowseInfo {
  // IPv4 and IPv6 addresses from which content was requested while browsing
  // to the current URL, each mapped to the set of hosts seen on that address.
  IPHostMap ips;

  // If a SafeBrowsing interstitial was shown for the current URL
  // this will contain the UnsafeResource struct for that URL.
  // Held in a scoped_ptr, so the resource is owned by this struct.
  // NOTE(review): presumably NULL when no interstitial was shown -- confirm
  // against the code that populates BrowseInfo.
  scoped_ptr<SafeBrowsingUIManager::UnsafeResource> unsafe_resource;

  // List of redirects that lead to the first page on the current host and
  // the current url respectively. These may be the same if the current url
  // is the first page on its host.
  std::vector<GURL> host_redirects;
  std::vector<GURL> url_redirects;

  // The HTTP status code from this navigation.
  // NOTE(review): a plain int has no implicit initial value; verify that the
  // out-of-line constructor initializes it.
  int http_status_code;

  // Constructor and destructor are declared here and defined out of line.
  BrowseInfo();
  ~BrowseInfo();
};

// BrowserFeatureExtractor computes browser-side features for client-side
// phishing detection requests (and, via ExtractMalwareFeatures(), for malware
// requests).  Part of the phishing extraction is asynchronous: history
// queries are issued and their results arrive through the Query*Done()
// callbacks declared below, after which the caller's DoneCallback is run.
//
// All methods of this class must be called on the UI thread (including
// the constructor).
class BrowserFeatureExtractor {
 public:
  // Called when feature extraction is done.  The first argument will be
  // true iff feature extraction succeeded.  The second argument is the
  // phishing request which was modified by the feature extractor.  The
  // DoneCallback takes ownership of the request object.
  typedef base::Callback<void(bool, ClientPhishingRequest*)> DoneCallback;
  // Same shape as DoneCallback, but carrying a ClientMalwareRequest.  No
  // other declaration in this class refers to this type.
  typedef base::Callback<void(bool, ClientMalwareRequest*)> MalwareDoneCallback;

  // The caller keeps ownership of the tab and service objects and is
  // responsible for ensuring that they stay valid for the entire
  // lifetime of this object.
  BrowserFeatureExtractor(content::WebContents* tab,
                          ClientSideDetectionService* service);

  // The destructor will cancel any pending requests.
  virtual ~BrowserFeatureExtractor();

  // Begins extraction of the browser features.  We take ownership
  // of the request object until |callback| is called (see DoneCallback above)
  // and will write the extracted features to the feature map.  Once the
  // feature extraction is complete, |callback| is run on the UI thread.  We
  // take ownership of the |callback| object.  |info| is not required to
  // outlive this call (it may become invalid once ExtractFeatures returns).
  // This method must run on the UI thread.
  virtual void ExtractFeatures(const BrowseInfo* info,
                               ClientPhishingRequest* request,
                               const DoneCallback& callback);

  // Extract the malware related features. The request object is owned by the
  // caller.  Unlike ExtractFeatures(), this method takes no completion
  // callback.
  virtual void ExtractMalwareFeatures(const BrowseInfo* info,
                                      ClientMalwareRequest* request);

 private:
  // NOTE(review): presumably needed so that the object can be destroyed
  // through base::DeleteHelper (deferred deletion) -- confirm with the .cc.
  friend class base::DeleteHelper<BrowserFeatureExtractor>;
  // A request together with the callback to run once its extraction is done.
  typedef std::pair<ClientPhishingRequest*, DoneCallback> ExtractionData;
  // Maps the handle of an outstanding history query to the extraction that
  // is waiting on it.
  typedef std::map<CancelableRequestProvider::Handle,
                   ExtractionData> PendingQueriesMap;

  // Synchronous browser feature extraction.
  void ExtractBrowseInfoFeatures(const BrowseInfo& info,
                                 ClientPhishingRequest* request);

  // Actually starts feature extraction (does the real work).
  void StartExtractFeatures(ClientPhishingRequest* request,
                            const DoneCallback& callback);

  // HistoryService callback which is called when we're done querying URL visits
  // in the history.
  void QueryUrlHistoryDone(CancelableRequestProvider::Handle handle,
                           bool success,
                           const history::URLRow* row,
                           history::VisitVector* visits);

  // HistoryService callback which is called when we're done querying HTTP host
  // visits in the history.
  void QueryHttpHostVisitsDone(CancelableRequestProvider::Handle handle,
                               bool success,
                               int num_visits,
                               base::Time first_visit);

  // HistoryService callback which is called when we're done querying HTTPS host
  // visits in the history.
  void QueryHttpsHostVisitsDone(CancelableRequestProvider::Handle handle,
                                bool success,
                                int num_visits,
                                base::Time first_visit);

  // Helper function which sets the host history features given the
  // number of host visits and the time of the first host visit.  Set
  // |is_http_query| to true if the URL scheme is HTTP and to false if
  // the scheme is HTTPS.
  void SetHostVisitsFeatures(int num_visits,
                             base::Time first_visit,
                             bool is_http_query,
                             ClientPhishingRequest* request);

  // Helper function which stores the request and callback while the history
  // query is being processed.
  void StorePendingQuery(CancelableRequestProvider::Handle handle,
                         ClientPhishingRequest* request,
                         const DoneCallback& callback);

  // Helper function which is the counterpart of StorePendingQuery.  If there
  // is a pending query for the given handle it will return true and set both
  // |*request| and |*callback|.  Otherwise, it will return false.
  bool GetPendingQuery(CancelableRequestProvider::Handle handle,
                       ClientPhishingRequest** request,
                       DoneCallback* callback);

  // Helper function which gets the history service if possible.  Returns
  // true if |*history| was set, and false otherwise.
  bool GetHistoryService(HistoryService** history);

  // Not owned; the caller of the constructor guarantees that both objects
  // outlive this one (see the constructor comment).
  content::WebContents* tab_;
  ClientSideDetectionService* service_;
  // NOTE(review): presumably the consumer used for the history queries whose
  // handles key |pending_queries_| -- confirm with the .cc.
  CancelableRequestConsumer request_consumer_;
  // NOTE(review): base::WeakPtrFactory is declared in
  // base/memory/weak_ptr.h, which this header does not include directly; it
  // is presumably pulled in transitively.  Consider including it explicitly.
  base::WeakPtrFactory<BrowserFeatureExtractor> weak_factory_;

  // Pending extractions (i.e. extractions for which ExtractFeatures was
  // called but not StartExtractFeatures), keyed by request and mapped to the
  // callback supplied with that request.
  std::map<ClientPhishingRequest*, DoneCallback> pending_extractions_;

  // Set of pending queries (i.e., where history->Query...() was called but
  // the history callback hasn't been invoked yet).
  PendingQueriesMap pending_queries_;

  DISALLOW_COPY_AND_ASSIGN(BrowserFeatureExtractor);
};

}  // namespace safe_browsing
#endif  // CHROME_BROWSER_SAFE_BROWSING_BROWSER_FEATURE_EXTRACTOR_H_