// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This class handles the process of extracting all of the features from a
// page and computing a phishyness score. For more details, see
// phishing_*_feature_extractor.h, scorer.h, and client_model.proto.
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/scoped_ptr.h"
#include "base/string16.h"
#include "base/task.h"
// Forward declaration; RenderView is declared outside the safe_browsing
// namespace and is only referred to by pointer in this header.
class RenderView;
namespace safe_browsing {
// Forward declarations of safe_browsing types that this header names
// without including their definitions.
class FeatureExtractorClock;
class FeatureMap;
class PhishingDOMFeatureExtractor;
class PhishingTermFeatureExtractor;
class PhishingUrlFeatureExtractor;
class Scorer;
// Drives feature extraction (URL, DOM, and term features) for a single page
// and turns the extracted features into a phishyness score via a Scorer.
class PhishingClassifier {
 public:
  // Callback to be run when phishing classification finishes.  If the first
  // argument is true, the page is considered phishy by the client-side model,
  // and the browser should ping back to get a final verdict.  The second
  // argument gives the phishyness score which is used in the pingback,
  // or kInvalidScore if classification failed.
  typedef Callback2<bool /* phishy */, double /* phishy_score */>::Type
      DoneCallback;

  // Score value passed to the DoneCallback when classification failed.
  static const double kInvalidScore;

  // Creates a new PhishingClassifier object that will operate on
  // |render_view|.  |scorer| will be used for computing the final score, and
  // must live at least as long as the PhishingClassifier.  |clock| is used to
  // time feature extractor operations, and the PhishingClassifier takes
  // ownership of this object.
  PhishingClassifier(RenderView* render_view,
                     const Scorer* scorer,
                     FeatureExtractorClock* clock);
  ~PhishingClassifier();

  // Called by the RenderView when a page has finished loading.  This begins
  // the feature extraction and scoring process.  |page_text| should contain
  // the plain text of a web page, including any subframes, as returned by
  // RenderView::CaptureText().  |page_text| is owned by the caller, and must
  // not be destroyed until either |callback| is run or
  // CancelPendingClassification() is called.
  //
  // To avoid blocking the render thread for too long, phishing classification
  // may run in several chunks of work, posting a task to the current
  // MessageLoop to continue processing.  Once the scoring process is complete,
  // |callback| is run on the current thread.  PhishingClassifier takes
  // ownership of the callback.
  void BeginClassification(const string16* page_text, DoneCallback* callback);

  // Called by the RenderView (on the render thread) when a page is unloading
  // or the RenderView is being destroyed.  This cancels any extraction that
  // is in progress.
  void CancelPendingClassification();

 private:
  // Any score equal to or above this value is considered phishy.
  static const double kPhishyThreshold;

  // Begins the feature extraction process, by extracting URL features and
  // beginning DOM feature extraction.
  void BeginFeatureExtraction();

  // Callback to be run when DOM feature extraction is complete.
  // If it was successful, begins term feature extraction, otherwise
  // runs the DoneCallback with a non-phishy verdict.
  void DOMExtractionFinished(bool success);

  // Callback to be run when term feature extraction is complete.
  // If it was successful, computes a score and runs the DoneCallback.
  // If extraction was unsuccessful, runs the DoneCallback with a
  // non-phishy verdict.
  void TermExtractionFinished(bool success);

  // Helper to verify that there is no pending phishing classification.  Dies
  // in debug builds if the state is not as expected.  This is a no-op in
  // release builds.
  void CheckNoPendingClassification();

  // Helper method to run the DoneCallback and clear the state.
  void RunCallback(bool phishy, double phishy_score);

  // Helper to run the DoneCallback when feature extraction has failed.
  // This always signals a non-phishy verdict for the page, with kInvalidScore.
  void RunFailureCallback();

  // Clears the current state of the PhishingClassifier.
  void Clear();

  RenderView* render_view_;  // owns us
  const Scorer* scorer_;  // owned by the caller
  scoped_ptr<FeatureExtractorClock> clock_;
  scoped_ptr<PhishingUrlFeatureExtractor> url_extractor_;
  scoped_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
  scoped_ptr<PhishingTermFeatureExtractor> term_extractor_;

  // State for any in-progress extraction.
  scoped_ptr<FeatureMap> features_;
  const string16* page_text_;  // owned by the caller
  scoped_ptr<DoneCallback> done_callback_;

  // Used to create BeginFeatureExtraction tasks.
  // These tasks are revoked if classification is cancelled.
  ScopedRunnableMethodFactory<PhishingClassifier> method_factory_;

  DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
};
} // namespace safe_browsing
#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_