// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/renderer/safe_browsing/phishing_classifier.h" #include #include "base/callback.h" #include "base/compiler_specific.h" #include "base/logging.h" #include "base/message_loop.h" #include "base/string_util.h" #include "chrome/common/safe_browsing/csd.pb.h" #include "chrome/common/url_constants.h" #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" #include "chrome/renderer/safe_browsing/features.h" #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" #include "chrome/renderer/safe_browsing/scorer.h" #include "content/renderer/render_view.h" #include "crypto/sha2.h" #include "googleurl/src/gurl.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebDataSource.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebURL.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebURLRequest.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" namespace safe_browsing { const float PhishingClassifier::kInvalidScore = -1.0; const float PhishingClassifier::kPhishyThreshold = 0.5; PhishingClassifier::PhishingClassifier(RenderView* render_view, FeatureExtractorClock* clock) : render_view_(render_view), scorer_(NULL), clock_(clock), ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { Clear(); } PhishingClassifier::~PhishingClassifier() { // The RenderView should have called CancelPendingClassification() before // we are destroyed. CheckNoPendingClassification(); } void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) { DCHECK(!scorer_); scorer_ = scorer; url_extractor_.reset(new PhishingUrlFeatureExtractor); dom_extractor_.reset( new PhishingDOMFeatureExtractor(render_view_, clock_.get())); term_extractor_.reset(new PhishingTermFeatureExtractor( &scorer_->page_terms(), &scorer_->page_words(), scorer_->max_words_per_term(), clock_.get())); } bool PhishingClassifier::is_ready() const { return scorer_ != NULL; } void PhishingClassifier::BeginClassification(const string16* page_text, DoneCallback* done_callback) { DCHECK(is_ready()); // The RenderView should have called CancelPendingClassification() before // starting a new classification, so DCHECK this. CheckNoPendingClassification(); // However, in an opt build, we will go ahead and clean up the pending // classification so that we can start in a known state. CancelPendingClassification(); page_text_ = page_text; done_callback_.reset(done_callback); // For consistency, we always want to invoke the DoneCallback // asynchronously, rather than directly from this method. To ensure that // this is the case, post a task to begin feature extraction on the next // iteration of the message loop. MessageLoop::current()->PostTask( FROM_HERE, method_factory_.NewRunnableMethod( &PhishingClassifier::BeginFeatureExtraction)); } void PhishingClassifier::BeginFeatureExtraction() { WebKit::WebView* web_view = render_view_->webview(); if (!web_view) { RunFailureCallback(); return; } WebKit::WebFrame* frame = web_view->mainFrame(); if (!frame) { RunFailureCallback(); return; } // Check whether the URL is one that we should classify. // Currently, we only classify http: URLs that are GET requests. GURL url(frame->url()); if (!url.SchemeIs(chrome::kHttpScheme)) { RunFailureCallback(); return; } WebKit::WebDataSource* ds = frame->dataSource(); if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) { RunFailureCallback(); return; } features_.reset(new FeatureMap); if (!url_extractor_->ExtractFeatures(url, features_.get())) { RunFailureCallback(); return; } // DOM feature extraction can take awhile, so it runs asynchronously // in several chunks of work and invokes the callback when finished. dom_extractor_->ExtractFeatures( features_.get(), NewCallback(this, &PhishingClassifier::DOMExtractionFinished)); } void PhishingClassifier::CancelPendingClassification() { // Note that cancelling the feature extractors is simply a no-op if they // were not running. DCHECK(is_ready()); dom_extractor_->CancelPendingExtraction(); term_extractor_->CancelPendingExtraction(); method_factory_.RevokeAll(); Clear(); } void PhishingClassifier::DOMExtractionFinished(bool success) { if (success) { // Term feature extraction can take awhile, so it runs asynchronously // in several chunks of work and invokes the callback when finished. term_extractor_->ExtractFeatures( page_text_, features_.get(), NewCallback(this, &PhishingClassifier::TermExtractionFinished)); } else { RunFailureCallback(); } } void PhishingClassifier::TermExtractionFinished(bool success) { if (success) { WebKit::WebView* web_view = render_view_->webview(); if (!web_view) { RunFailureCallback(); return; } WebKit::WebFrame* main_frame = web_view->mainFrame(); if (!main_frame) { RunFailureCallback(); return; } // Hash all of the features so that they match the model, then compute // the score. FeatureMap hashed_features; ClientPhishingRequest verdict; verdict.set_url(main_frame->url().spec()); for (base::hash_map::const_iterator it = features_->features().begin(); it != features_->features().end(); ++it) { VLOG(2) << "Feature: " << it->first << " = " << it->second; bool result = hashed_features.AddRealFeature( crypto::SHA256HashString(it->first), it->second); DCHECK(result); ClientPhishingRequest::Feature* feature = verdict.add_feature_map(); feature->set_name(it->first); feature->set_value(it->second); } float score = static_cast(scorer_->ComputeScore(hashed_features)); verdict.set_client_score(score); verdict.set_is_phishing(score >= kPhishyThreshold); RunCallback(verdict); } else { RunFailureCallback(); } } void PhishingClassifier::CheckNoPendingClassification() { DCHECK(!done_callback_.get()); DCHECK(!page_text_); if (done_callback_.get() || page_text_) { LOG(ERROR) << "Classification in progress, missing call to " << "CancelPendingClassification"; } } void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) { done_callback_->Run(verdict); Clear(); } void PhishingClassifier::RunFailureCallback() { ClientPhishingRequest verdict; // In this case we're not guaranteed to have a valid URL. Just set it // to the empty string to make sure we have a valid protocol buffer. verdict.set_url(""); verdict.set_client_score(kInvalidScore); verdict.set_is_phishing(false); RunCallback(verdict); } void PhishingClassifier::Clear() { page_text_ = NULL; done_callback_.reset(NULL); features_.reset(NULL); } } // namespace safe_browsing