summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/safe_browsing/features.h
blob: 82370c154fb548829aeed2929420c128b4690888 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Common types and constants for extracting and evaluating features in the
// client-side phishing detection model.  A feature is simply a string and an
// associated floating-point value between 0 and 1.  The phishing
// classification model contains rules which give an appropriate weight to each
// feature or combination of features.  These values can then be summed to
// compute a final phishiness score.
//
// Some features are boolean features.  If these features are set, they always
// have a value of 0.0 or 1.0.  In practice, the features are only set if the
// value is true (1.0).
//
// We also use token features.  These features have a unique name that is
// constructed from the URL or page contents that we are classifying, for
// example, "UrlDomain=chromium".  These features are also always set to 1.0
// if they are present.
//
// The intermediate storage of the features for a URL is a FeatureMap, which is
// just a thin wrapper around a map of feature name to value.  The entire set
// of features for a URL is extracted before we do any scoring.

#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
#pragma once

#include <string>
#include "base/basictypes.h"
#include "base/hash_tables.h"

namespace safe_browsing {

// Holds the features extracted for a URL as a map from feature name to
// feature value, and enforces constraints on that map such as an upper limit
// on the number of features it may contain.
class FeatureMap {
 public:
  // Sanity-check ceiling on the number of features that will be extracted.
  // Hitting this cap is not expected in practice; it only exists to keep the
  // FeatureMap from growing too large.
  static const size_t kMaxFeatureMapSize;

  FeatureMap();
  ~FeatureMap();

  // Read-only view of the features that have been added so far.
  const base::hash_map<std::string, double>& features() const {
    return features_;
  }

  // Stores |value| for the real-valued feature |name|.  |value| must lie in
  // the range [0.0, 1.0].  Returns true on success; returns false if the
  // value is outside of the allowed range or the feature map exceeds
  // kMaxFeatureMapSize.
  bool AddRealFeature(const std::string& name, double value);

  // Stores the boolean feature |name| with a value of 1.0.  Returns true on
  // success; returns false if the feature map exceeds kMaxFeatureMapSize.
  bool AddBooleanFeature(const std::string& name);

  // Removes every feature from the map.
  void Clear();

 private:
  // Feature name -> feature value.
  base::hash_map<std::string, double> features_;

  DISALLOW_COPY_AND_ASSIGN(FeatureMap);
};

namespace features {
// Constants for the various feature names that we use.
//
// Every constant below is only declared here (extern); the string values are
// defined elsewhere.  The comments follow the feature kinds described in the
// file-level comment:
//   * "Set if ..."        - a boolean feature, present only when true.
//   * "Token feature ..." - a feature whose full name is constructed from the
//                           URL or page contents being classified.
//                           NOTE(review): presumably the constant is the
//                           fixed part of the name that the token is combined
//                           with (cf. "UrlDomain=chromium" in the file-level
//                           comment) - confirm against the definitions.
//   * "The fraction ..."  - a real-valued feature.

////////////////////////////////////////////////////
// URL host features
////////////////////////////////////////////////////

// Set if the URL's hostname is an IP address.
extern const char kUrlHostIsIpAddress[];
// Token feature containing the portion of the hostname controlled by a
// registrar, for example "com" or "co.uk".
extern const char kUrlTldToken[];
// Token feature containing the first host component below the registrar.
// For example, in "www.google.com", the domain would be "google".
extern const char kUrlDomainToken[];
// Token feature containing each host component below the domain.
// For example, in "www.host.example.com", both "www" and "host" would be
// "other host tokens".
extern const char kUrlOtherHostToken[];

////////////////////////////////////////////////////
// Aggregate features for URL host tokens
////////////////////////////////////////////////////

// Set if the number of "other" host tokens for a URL is greater than one.
// Longer hostnames, regardless of the specific tokens, can be a signal that
// the URL is phishy.
extern const char kUrlNumOtherHostTokensGTOne[];
// Set if the number of "other" host tokens for a URL is greater than three.
extern const char kUrlNumOtherHostTokensGTThree[];

////////////////////////////////////////////////////
// URL path token features
////////////////////////////////////////////////////

// Token feature containing each alphanumeric string in the path that is at
// least 3 characters long.  For example, "/abc/d/efg" would have 2 path
// token features, "abc" and "efg".  Query parameters are not included.
extern const char kUrlPathToken[];

////////////////////////////////////////////////////
// DOM HTML form features
////////////////////////////////////////////////////

// Set if the page has any <form> elements.
extern const char kPageHasForms[];
// The fraction of form elements whose |action| attribute points to a
// URL on a different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];

// Set if the page has any <input type="text"> elements
// (includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Set if the page has any <input type="password"> elements.
extern const char kPageHasPswdInputs[];
// Set if the page has any <input type="radio"> elements.
extern const char kPageHasRadioInputs[];
// Set if the page has any <input type="checkbox"> elements.
extern const char kPageHasCheckInputs[];

////////////////////////////////////////////////////
// DOM HTML link features
////////////////////////////////////////////////////

// The fraction of links in the page which point to a domain other than the
// domain of the document.  See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature containing each external domain that is linked to.
extern const char kPageLinkDomain[];
// The fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];

////////////////////////////////////////////////////
// DOM HTML script features
////////////////////////////////////////////////////

// Set if the number of <script> elements in the page is greater than 1.
extern const char kPageNumScriptTagsGTOne[];
// Set if the number of <script> elements in the page is greater than 6.
extern const char kPageNumScriptTagsGTSix[];

////////////////////////////////////////////////////
// Other DOM HTML features
////////////////////////////////////////////////////

// The fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];

////////////////////////////////////////////////////
// Page term features
////////////////////////////////////////////////////

// Token feature for a term (whitespace-delimited) on a page.  Terms can be
// single words or multi-word n-grams.  Rather than adding this feature for
// every possible token on a page, only the terms that are mentioned in the
// classification model are added.
extern const char kPageTerm[];

}  // namespace features
}  // namespace safe_browsing

#endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_