diff options
author | noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-06-14 23:41:52 +0000 |
---|---|---|
committer | noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-06-14 23:41:52 +0000 |
commit | dafb72aad4bbf51769b3af3d5d2337554dec1df0 (patch) | |
tree | 494abfa843883c2ccea13acbf4a8b1b84f8860a7 /chrome/common/safe_browsing | |
parent | 2fa18c24e05d0b56cd6e63bc7cdce89549011dd1 (diff) | |
download | chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.zip chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.tar.gz chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.tar.bz2 |
Initial CL to update the client model more frequently.
BUG=None
TEST=None
Review URL: http://codereview.chromium.org/7057025
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@89098 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/common/safe_browsing')
-rw-r--r-- | chrome/common/safe_browsing/client_model.proto | 76 | ||||
-rw-r--r-- | chrome/common/safe_browsing/csd.proto | 4 |
2 files changed, 80 insertions, 0 deletions
diff --git a/chrome/common/safe_browsing/client_model.proto b/chrome/common/safe_browsing/client_model.proto new file mode 100644 index 0000000..aa704c3 --- /dev/null +++ b/chrome/common/safe_browsing/client_model.proto @@ -0,0 +1,76 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// This proto represents a machine learning model which is used to compute +// the probability that a particular page visited by Chrome is phishing. +// +// Note: since the machine learning model is trained on the server-side and then +// downloaded onto the client it is important that this proto file stays in +// sync with the server-side copy. Otherwise, the client may not be able to +// parse the server generated model anymore. If you want to change this +// protocol definition or you have questions regarding its format please contact +// chrome-anti-phishing@googlegroups.com. + +syntax = "proto2"; + +option optimize_for = LITE_RUNTIME; + +package safe_browsing; + +// This protocol buffer represents a machine learning model that is used in +// client-side phishing detection (in Chrome). The client extracts a set +// of features from every website the user visits. Extracted features map +// feature names to floating point values (e.g., PageSecureLinksFreq -> 0.9). +// +// To compute the phishing score (i.e., the probability that the website is +// phishing) a scorer will simply compute the sum of all rule scores for a +// given set of extracted features. The score of a particular rule corresponds +// to the product of all feature values that are part of the rule times the +// rule weight. If a feature has no value (i.e., is not part of the extracted +// features) its value will be set to zero. The overall score is computed +// by summing up all the rule scores. 
This overall score is a logodds and can +// be converted to a probability like this: +// p = exp(logodds) / (exp(logodds) + 1). +// +// To make it harder for phishers to reverse engineer our machine learning model +// all the features in the model are hashed with a sha256 hash function. The +// feature extractors also hash the extracted features before scoring happens. +message ClientSideModel { + // In order to save some space we store all the hashed strings in a + // single repeated field and then the rules as well as page terms + // and page words refer to an index in that repeated field. All + // hashes are sha256 hashes stored in binary format. + repeated bytes hashes = 1; + + message Rule { + // List of indexes into hashes above which are basically hashed + // features that form the current rule. + repeated int32 feature = 1; + + // The weight for this particular rule. + required float weight = 2; + } + + // List of rules which make up the model + repeated Rule rule = 2; + + // List of indexes that point to the hashed page terms that appear in + // the model. The hashes are computed over page terms that are encoded + // as lowercase UTF-8 strings. + repeated int32 page_term = 3; + + // List of indexes that point to the hashed page words. The page words + // correspond to all words that appear in page terms. If the term + // "one two" is in the list of page terms then "one" and "two" will be + // in the list of page words. + repeated int32 page_word = 4; + + // Page terms in page_term contain at most this many page words. + required int32 max_words_per_term = 5; + + // Model version number. Every model that we train should have a different + // version number and it should always be larger than the previous model + // version. 
+ optional int32 version = 6; +} diff --git a/chrome/common/safe_browsing/csd.proto b/chrome/common/safe_browsing/csd.proto index 6b737b30..e4b3fbe 100644 --- a/chrome/common/safe_browsing/csd.proto +++ b/chrome/common/safe_browsing/csd.proto @@ -43,6 +43,10 @@ message ClientPhishingRequest { // List of features that were extracted. Those are the features that were // sent to the scorer and which resulted in client_score being computed. repeated Feature feature_map = 5; + + // The version number of the model that was used to compute the client-score. + // Copied from ClientSideModel.version(). + optional int32 model_version = 6; } message ClientPhishingResponse { |