// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This proto represents a machine learning model which is used to compute
// the probability that a particular page visited by Chrome is phishing.
//
// Note: since the machine learning model is trained on the server side and
// then downloaded onto the client, it is important that this proto file
// stays in sync with the server-side copy.  Otherwise, the client may not
// be able to parse the server-generated model anymore.  If you want to
// change this protocol definition or you have questions regarding its
// format, please contact chrome-anti-phishing@googlegroups.com.

syntax = "proto2";

option optimize_for = LITE_RUNTIME;

package safe_browsing;

// This protocol buffer represents a machine learning model that is used in
// client-side phishing detection (in Chrome).  The client extracts a set
// of features from every website the user visits.  Extracted features map
// feature names to floating-point values (e.g., PageSecureLinksFreq -> 0.9).
//
// To compute the phishing score (i.e., the probability that the website is
// phishing) a scorer sums the scores of all rules for a given set of
// extracted features.  The score of a particular rule is the product of all
// feature values that are part of the rule, times the rule weight.  If a
// feature has no value (i.e., is not part of the extracted features) its
// value is treated as zero, which zeroes out every rule that references it.
// The overall score is a logodds and can be converted to a probability like
// this:
// p = exp(logodds) / (exp(logodds) + 1).
// A worked sketch of this computation appears at the end of this file.
//
// To make it harder for phishers to reverse engineer our machine learning
// model, all the features in the model are hashed with a sha256 hash
// function.  The feature extractors also hash the extracted features before
// scoring happens.

message ClientSideModel {
  // In order to save some space we store all the hashed strings in a
  // single repeated field; the rules, as well as the page terms and page
  // words, refer to an index into that repeated field.  All hashes are
  // sha256 hashes stored in binary format.
  repeated bytes hashes = 1;

  message Rule {
    // List of indexes into hashes above; these are the hashed features
    // that make up the current rule.
    repeated int32 feature = 1;

    // The weight for this particular rule.
    required float weight = 2;
  }

  // List of rules which make up the model.
  repeated Rule rule = 2;

  // List of indexes that point to the hashed page terms that appear in
  // the model.  The hashes are computed over page terms that are encoded
  // as lowercase UTF-8 strings.
  repeated int32 page_term = 3;

  // List of indexes that point to the hashed page words.  The page words
  // correspond to all words that appear in page terms.  If the term
  // "one two" is in the list of page terms then "one" and "two" will be
  // in the list of page words.
  repeated int32 page_word = 4;

  // Page terms in page_term contain at most this many page words.
  required int32 max_words_per_term = 5;
}
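
// For illustration only, and kept in a comment so this file stays valid
// proto: a minimal C++ sketch of how a scorer could evaluate this model,
// assuming the rule-score formula described in the header comment above.
// ComputeLogOdds, LogOddsToProbability, and the FeatureMap type are
// hypothetical names invented for this example; they are not part of the
// protocol definition, and the real Chromium scorer may differ.
//
//   #include <cmath>
//   #include <map>
//   #include <string>
//
//   // Maps hashed (sha256, binary) feature names to their extracted
//   // values.  Features that were not extracted are simply absent and
//   // are treated as having value 0.
//   typedef std::map<std::string, double> FeatureMap;
//
//   double ComputeLogOdds(const safe_browsing::ClientSideModel& model,
//                         const FeatureMap& features) {
//     double logodds = 0.0;
//     for (int i = 0; i < model.rule_size(); ++i) {
//       const safe_browsing::ClientSideModel::Rule& rule = model.rule(i);
//       // Rule score: product of the rule's feature values times its
//       // weight.
//       double rule_score = rule.weight();
//       for (int j = 0; j < rule.feature_size(); ++j) {
//         // rule.feature(j) is an index into model.hashes.
//         FeatureMap::const_iterator it =
//             features.find(model.hashes(rule.feature(j)));
//         // A missing feature counts as 0, so the whole rule scores 0.
//         rule_score *= (it == features.end()) ? 0.0 : it->second;
//       }
//       logodds += rule_score;
//     }
//     return logodds;
//   }
//
//   // Converts the overall logodds score to a phishing probability:
//   // p = exp(logodds) / (exp(logodds) + 1).
//   double LogOddsToProbability(double logodds) {
//     return std::exp(logodds) / (std::exp(logodds) + 1.0);
//   }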