// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/renderer/safe_browsing/phishing_classifier.h" #include #include "base/bind.h" #include "base/callback.h" #include "base/compiler_specific.h" #include "base/location.h" #include "base/logging.h" #include "base/metrics/histogram.h" #include "base/single_thread_task_runner.h" #include "base/strings/string_util.h" #include "base/thread_task_runner_handle.h" #include "chrome/common/safe_browsing/csd.pb.h" #include "chrome/common/url_constants.h" #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" #include "chrome/renderer/safe_browsing/features.h" #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" #include "chrome/renderer/safe_browsing/scorer.h" #include "content/public/renderer/render_frame.h" #include "crypto/sha2.h" #include "third_party/WebKit/public/platform/WebURL.h" #include "third_party/WebKit/public/platform/WebURLRequest.h" #include "third_party/WebKit/public/web/WebDataSource.h" #include "third_party/WebKit/public/web/WebDocument.h" #include "third_party/WebKit/public/web/WebLocalFrame.h" #include "third_party/WebKit/public/web/WebView.h" #include "url/gurl.h" namespace safe_browsing { const float PhishingClassifier::kInvalidScore = -1.0; const float PhishingClassifier::kPhishyThreshold = 0.5; PhishingClassifier::PhishingClassifier(content::RenderFrame* render_frame, FeatureExtractorClock* clock) : render_frame_(render_frame), scorer_(NULL), clock_(clock), weak_factory_(this) { Clear(); } PhishingClassifier::~PhishingClassifier() { // The RenderView should have called CancelPendingClassification() before // we are destroyed. CheckNoPendingClassification(); } void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) { CheckNoPendingClassification(); scorer_ = scorer; if (scorer_) { url_extractor_.reset(new PhishingUrlFeatureExtractor); dom_extractor_.reset(new PhishingDOMFeatureExtractor(clock_.get())); term_extractor_.reset(new PhishingTermFeatureExtractor( &scorer_->page_terms(), &scorer_->page_words(), scorer_->max_words_per_term(), scorer_->murmurhash3_seed(), scorer_->max_shingles_per_page(), scorer_->shingle_size(), clock_.get())); } else { // We're disabling client-side phishing detection, so tear down all // of the relevant objects. url_extractor_.reset(); dom_extractor_.reset(); term_extractor_.reset(); } } bool PhishingClassifier::is_ready() const { return scorer_ != NULL; } void PhishingClassifier::BeginClassification( const base::string16* page_text, const DoneCallback& done_callback) { DCHECK(is_ready()); // The RenderView should have called CancelPendingClassification() before // starting a new classification, so DCHECK this. CheckNoPendingClassification(); // However, in an opt build, we will go ahead and clean up the pending // classification so that we can start in a known state. CancelPendingClassification(); page_text_ = page_text; done_callback_ = done_callback; // For consistency, we always want to invoke the DoneCallback // asynchronously, rather than directly from this method. To ensure that // this is the case, post a task to begin feature extraction on the next // iteration of the message loop. base::ThreadTaskRunnerHandle::Get()->PostTask( FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction, weak_factory_.GetWeakPtr())); } void PhishingClassifier::BeginFeatureExtraction() { blink::WebLocalFrame* frame = render_frame_->GetWebFrame(); // Check whether the URL is one that we should classify. // Currently, we only classify http: URLs that are GET requests. GURL url(frame->document().url()); if (!url.SchemeIs(url::kHttpScheme)) { RunFailureCallback(); return; } blink::WebDataSource* ds = frame->dataSource(); if (!ds || !base::EqualsASCII(base::StringPiece16(ds->request().httpMethod()), "GET")) { RunFailureCallback(); return; } features_.reset(new FeatureMap); if (!url_extractor_->ExtractFeatures(url, features_.get())) { RunFailureCallback(); return; } // DOM feature extraction can take awhile, so it runs asynchronously // in several chunks of work and invokes the callback when finished. dom_extractor_->ExtractFeatures( frame->document(), features_.get(), base::Bind(&PhishingClassifier::DOMExtractionFinished, base::Unretained(this))); } void PhishingClassifier::CancelPendingClassification() { // Note that cancelling the feature extractors is simply a no-op if they // were not running. DCHECK(is_ready()); dom_extractor_->CancelPendingExtraction(); term_extractor_->CancelPendingExtraction(); weak_factory_.InvalidateWeakPtrs(); Clear(); } void PhishingClassifier::DOMExtractionFinished(bool success) { shingle_hashes_.reset(new std::set); if (success) { // Term feature extraction can take awhile, so it runs asynchronously // in several chunks of work and invokes the callback when finished. term_extractor_->ExtractFeatures( page_text_, features_.get(), shingle_hashes_.get(), base::Bind(&PhishingClassifier::TermExtractionFinished, base::Unretained(this))); } else { RunFailureCallback(); } } void PhishingClassifier::TermExtractionFinished(bool success) { if (success) { blink::WebLocalFrame* main_frame = render_frame_->GetWebFrame(); // Hash all of the features so that they match the model, then compute // the score. FeatureMap hashed_features; ClientPhishingRequest verdict; verdict.set_model_version(scorer_->model_version()); verdict.set_url(main_frame->document().url().string().utf8()); for (base::hash_map::const_iterator it = features_->features().begin(); it != features_->features().end(); ++it) { DVLOG(2) << "Feature: " << it->first << " = " << it->second; bool result = hashed_features.AddRealFeature( crypto::SHA256HashString(it->first), it->second); DCHECK(result); ClientPhishingRequest::Feature* feature = verdict.add_feature_map(); feature->set_name(it->first); feature->set_value(it->second); } for (std::set::const_iterator it = shingle_hashes_->begin(); it != shingle_hashes_->end(); ++it) { verdict.add_shingle_hashes(*it); } float score = static_cast(scorer_->ComputeScore(hashed_features)); verdict.set_client_score(score); verdict.set_is_phishing(score >= kPhishyThreshold); RunCallback(verdict); } else { RunFailureCallback(); } } void PhishingClassifier::CheckNoPendingClassification() { DCHECK(done_callback_.is_null()); DCHECK(!page_text_); if (!done_callback_.is_null() || page_text_) { LOG(ERROR) << "Classification in progress, missing call to " << "CancelPendingClassification"; } } void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) { done_callback_.Run(verdict); Clear(); } void PhishingClassifier::RunFailureCallback() { ClientPhishingRequest verdict; // In this case we're not guaranteed to have a valid URL. Just set it // to the empty string to make sure we have a valid protocol buffer. verdict.set_url(""); verdict.set_client_score(kInvalidScore); verdict.set_is_phishing(false); RunCallback(verdict); } void PhishingClassifier::Clear() { page_text_ = NULL; done_callback_.Reset(); features_.reset(NULL); shingle_hashes_.reset(NULL); } } // namespace safe_browsing