Initial CL to update the client model more frequently.

BUG=None TEST=None Review URL: http://codereview.chromium.org/7057025 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@89098 0039d316-1c4b-4281-b951-d872f2087c98
author: noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2011-06-14 23:41:52 +0000
committer: noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2011-06-14 23:41:52 +0000
commit: dafb72aad4bbf51769b3af3d5d2337554dec1df0 (patch)
tree: 494abfa843883c2ccea13acbf4a8b1b84f8860a7 /chrome/common/safe_browsing
parent: 2fa18c24e05d0b56cd6e63bc7cdce89549011dd1 (diff)
download: chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.zip
chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.tar.gz
chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.tar.bz2
2 files changed, 80 insertions, 0 deletions
diff --git a/chrome/common/safe_browsing/client_model.proto b/chrome/common/safe_browsing/client_model.proto
new file mode 100644
index 0000000..aa704c3
--- /dev/null
+++ b/chrome/common/safe_browsing/client_model.proto
@@ -0,0 +1,76 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This proto represents a machine learning model which is used to compute
+// the probability that a particular page visited by Chrome is phishing.
+//
+// Note: since the machine learning model is trained on the server-side and then
+// downloaded onto the client it is important that this proto file stays in
+// sync with the server-side copy.  Otherwise, the client may not be able to
+// parse the server generated model anymore.  If you want to change this
+// protocol definition or you have questions regarding its format please contact
+// chrome-anti-phishing@googlegroups.com.
+
+syntax = "proto2";
+
+option optimize_for = LITE_RUNTIME;
+
+package safe_browsing;
+
+// This protocol buffer represents a machine learning model that is used in
+// client-side phishing detection (in Chrome).  The client extracts a set
+// of features from every website the user visits.  Extracted features map
+// feature names to floating point values (e.g., PageSecureLinksFreq -> 0.9).
+//
+// To compute the phishing score (i.e., the probability that the website is
+// phishing) a scorer will simply compute the sum of all rule scores for a
+// given set of extracted features.  The score of a particular rule corresponds
+// to the product of all feature values that are part of the rule times the
+// rule weight.  If a feature has no value (i.e., is not part of the extracted
+// features) its value will be set to zero.  The overall score is computed
+// by summing up all the rule scores.  This overall score is a logodds and can
+// be converted to a probability like this:
+// p = exp(logodds) / (exp(logodds) + 1).
+//
+// To make it harder for phishers to reverse engineer our machine learning model
+// all the features in the model are hashed with a sha256 hash function.  The
+// feature extractors also hash the extracted features before scoring happens.
+message ClientSideModel {
+  // To save space, all hashed strings are stored in this single repeated
+  // field; rules, page terms and page words then refer to a hash by its
+  // index in this field.  All hashes are sha256 hashes stored in binary
+  // format.
+  repeated bytes hashes = 1;
+
+  message Rule {
+    // Indexes into the hashes field of ClientSideModel; each index refers
+    // to one hashed feature that is part of this rule.
+    repeated int32 feature = 1;
+
+    // Weight of this rule; multiplied with the product of its feature values.
+    required float weight = 2;
+  }
+
+  // List of rules which make up the model.
+  repeated Rule rule = 2;
+
+  // Indexes into hashes for the hashed page terms that appear in the
+  // model.  The hashes are computed over page terms that are encoded as
+  // lowercase UTF-8 strings.
+  repeated int32 page_term = 3;
+
+  // Indexes into hashes for the hashed page words.  The page words are
+  // all the words that appear in page terms: if the term "one two" is in
+  // the list of page terms then "one" and "two" will be in the list of
+  // page words.
+  repeated int32 page_word = 4;
+
+  // Upper bound on the number of page words in any term listed in page_term.
+  required int32 max_words_per_term = 5;
+
+  // Model version number.  Every trained model should get a distinct
+  // version number that is larger than the version of the previous
+  // model.
+  optional int32 version = 6;
+}
diff --git a/chrome/common/safe_browsing/csd.proto b/chrome/common/safe_browsing/csd.proto
index 6b737b30..e4b3fbe 100644
--- a/chrome/common/safe_browsing/csd.proto
+++ b/chrome/common/safe_browsing/csd.proto
@@ -43,6 +43,10 @@ message ClientPhishingRequest {
   // List of features that were extracted.  Those are the features that were
   // sent to the scorer and which resulted in client_score being computed.
   repeated Feature feature_map = 5;
+
+  // The version number of the model that was used to compute the client_score.
+  // Copied from ClientSideModel.version().
+  optional int32 model_version = 6;
 }
 
 message ClientPhishingResponse {
author	noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2011-06-14 23:41:52 +0000
committer	noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2011-06-14 23:41:52 +0000
commit	dafb72aad4bbf51769b3af3d5d2337554dec1df0 (patch)
tree	494abfa843883c2ccea13acbf4a8b1b84f8860a7 /chrome/common/safe_browsing
parent	2fa18c24e05d0b56cd6e63bc7cdce89549011dd1 (diff)
download	chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.zip chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.tar.gz chromium_src-dafb72aad4bbf51769b3af3d5d2337554dec1df0.tar.bz2