diff options
author | cjhopman <cjhopman@chromium.org> | 2015-04-03 13:28:58 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-04-03 20:29:29 +0000 |
commit | adf25383033ff5794a810f725c78e319126b6edb (patch) | |
tree | cf3c3412d2d03c8a713f56ceb36b6a995dbba0d6 /components/dom_distiller | |
parent | 99e3f8806b9653d009990c002d815e8570fb1cee (diff) | |
download | chromium_src-adf25383033ff5794a810f725c78e319126b6edb.zip chromium_src-adf25383033ff5794a810f725c78e319126b6edb.tar.gz chromium_src-adf25383033ff5794a810f725c78e319126b6edb.tar.bz2 |
Add integration of the new heuristics
This adds the trained model, the JavaScript core feature extraction, and simple
functions to apply the old or new heuristics to a content::WebContents*.
BUG=471439
TBR=blundell@chromium.org
Review URL: https://codereview.chromium.org/1047223003
Cr-Commit-Position: refs/heads/master@{#323805}
Diffstat (limited to 'components/dom_distiller')
12 files changed, 358 insertions, 4 deletions
diff --git a/components/dom_distiller/content/BUILD.gn b/components/dom_distiller/content/BUILD.gn index 35af9dd..61dd8e6 100644 --- a/components/dom_distiller/content/BUILD.gn +++ b/components/dom_distiller/content/BUILD.gn @@ -6,6 +6,8 @@ if (!is_ios) { # GYP version: components/dom_distiller.gypi:dom_distiller_content static_library("content") { sources = [ + "distillable_page_utils.cc", + "distillable_page_utils.h", "distiller_page_web_contents.cc", "distiller_page_web_contents.h", "dom_distiller_viewer_source.cc", diff --git a/components/dom_distiller/content/distillable_page_utils.cc b/components/dom_distiller/content/distillable_page_utils.cc new file mode 100644 index 0000000..7f1b3cc --- /dev/null +++ b/components/dom_distiller/content/distillable_page_utils.cc @@ -0,0 +1,74 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/dom_distiller/content/distillable_page_utils.h" + +#include "base/bind.h" +#include "base/message_loop/message_loop.h" +#include "base/strings/utf_string_conversions.h" +#include "base/values.h" +#include "components/dom_distiller/core/distillable_page_detector.h" +#include "components/dom_distiller/core/page_features.h" +#include "content/public/browser/render_frame_host.h" +#include "grit/components_resources.h" +#include "ui/base/resource/resource_bundle.h" + +namespace dom_distiller { +namespace { +void OnOGArticleJsResult(base::Callback<void(bool)> callback, + const base::Value* result) { + bool success = false; + if (result) { + result->GetAsBoolean(&success); + } + callback.Run(success); +} + +void OnExtractFeaturesJsResult(const DistillablePageDetector* detector, + base::Callback<void(bool)> callback, + const base::Value* result) { + callback.Run(detector->Classify(CalculateDerivedFeaturesFromJSON(result))); +} +} // namespace + +void IsOpenGraphArticle(content::WebContents* 
web_contents, + base::Callback<void(bool)> callback) { + content::RenderFrameHost* main_frame = web_contents->GetMainFrame(); + if (!main_frame) { + base::MessageLoop::current()->PostTask(FROM_HERE, + base::Bind(callback, false)); + return; + } + std::string og_article_js = ResourceBundle::GetSharedInstance() + .GetRawDataResource(IDR_IS_DISTILLABLE_JS) + .as_string(); + main_frame->ExecuteJavaScript(base::UTF8ToUTF16(og_article_js), + base::Bind(OnOGArticleJsResult, callback)); +} + +void IsDistillablePage(content::WebContents* web_contents, + base::Callback<void(bool)> callback) { + IsDistillablePageForDetector(web_contents, + DistillablePageDetector::GetDefault(), callback); +} + +void IsDistillablePageForDetector(content::WebContents* web_contents, + const DistillablePageDetector* detector, + base::Callback<void(bool)> callback) { + content::RenderFrameHost* main_frame = web_contents->GetMainFrame(); + if (!main_frame) { + base::MessageLoop::current()->PostTask(FROM_HERE, + base::Bind(callback, false)); + return; + } + std::string extract_features_js = + ResourceBundle::GetSharedInstance() + .GetRawDataResource(IDR_EXTRACT_PAGE_FEATURES_JS) + .as_string(); + main_frame->ExecuteJavaScript( + base::UTF8ToUTF16(extract_features_js), + base::Bind(OnExtractFeaturesJsResult, detector, callback)); +} + +} // namespace dom_distiller diff --git a/components/dom_distiller/content/distillable_page_utils.h b/components/dom_distiller/content/distillable_page_utils.h new file mode 100644 index 0000000..95216bd --- /dev/null +++ b/components/dom_distiller/content/distillable_page_utils.h @@ -0,0 +1,25 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef COMPONENTS_DOM_DISTILLER_CONTENT_DISTILLABLE_PAGE_UTILS_H_ +#define COMPONENTS_DOM_DISTILLER_CONTENT_DISTILLABLE_PAGE_UTILS_H_ + +#include "base/callback.h" +#include "content/public/browser/web_contents.h" + +namespace dom_distiller { + +class DistillablePageDetector; + +void IsOpenGraphArticle(content::WebContents* web_contents, + base::Callback<void(bool)> callback); +void IsDistillablePage(content::WebContents* web_contents, + base::Callback<void(bool)> callback); +// The passed detector must be alive until after the callback is called. +void IsDistillablePageForDetector(content::WebContents* web_contents, + const DistillablePageDetector* detector, + base::Callback<void(bool)> callback); +} + +#endif // COMPONENTS_DOM_DISTILLER_CONTENT_DISTILLABLE_PAGE_UTILS_H_ diff --git a/components/dom_distiller/content/distillable_page_utils_browsertest.cc b/components/dom_distiller/content/distillable_page_utils_browsertest.cc new file mode 100644 index 0000000..9525982 --- /dev/null +++ b/components/dom_distiller/content/distillable_page_utils_browsertest.cc @@ -0,0 +1,169 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "base/bind.h" +#include "base/path_service.h" +#include "base/run_loop.h" +#include "components/dom_distiller/content/distillable_page_utils.h" +#include "components/dom_distiller/core/distillable_page_detector.h" +#include "components/dom_distiller/core/page_features.h" +#include "content/public/browser/browser_context.h" +#include "content/public/browser/render_frame_host.h" +#include "content/public/browser/web_contents_observer.h" +#include "content/public/test/content_browser_test.h" +#include "content/shell/browser/shell.h" +#include "net/test/embedded_test_server/embedded_test_server.h" +#include "ui/base/resource/resource_bundle.h" + +namespace dom_distiller { +namespace { + +const char* kArticlePath = "/og_article.html"; +const char* kNonArticlePath = "/non_og_article.html"; + +class DomDistillerDistillablePageUtilsTest : public content::ContentBrowserTest, + content::WebContentsObserver { + public: + void SetUpOnMainThread() override { + AddComponentsResources(); + SetUpTestServer(); + ContentBrowserTest::SetUpOnMainThread(); + } + + void LoadURL(const std::string& url) { + content::WebContents* current_web_contents = shell()->web_contents(); + Observe(current_web_contents); + base::RunLoop url_loaded_runner; + main_frame_loaded_callback_ = url_loaded_runner.QuitClosure(); + current_web_contents->GetController().LoadURL( + embedded_test_server()->GetURL(url), + content::Referrer(), + ui::PAGE_TRANSITION_TYPED, + std::string()); + url_loaded_runner.Run(); + main_frame_loaded_callback_ = base::Closure(); + Observe(nullptr); + } + + private: + void AddComponentsResources() { + base::FilePath pak_file; + base::FilePath pak_dir; + PathService::Get(base::DIR_MODULE, &pak_dir); + pak_file = + pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak")); + ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath( + pak_file, ui::SCALE_FACTOR_NONE); + } + + void SetUpTestServer() { + base::FilePath path; + 
PathService::Get(base::DIR_SOURCE_ROOT, &path); + path = path.AppendASCII("components/test/data/dom_distiller"); + embedded_test_server()->ServeFilesFromDirectory(path); + ASSERT_TRUE(embedded_test_server()->InitializeAndWaitUntilReady()); + } + + void DocumentLoadedInFrame( + content::RenderFrameHost* render_frame_host) override { + if (!render_frame_host->GetParent()) + main_frame_loaded_callback_.Run(); + } + + base::Closure main_frame_loaded_callback_; +}; + +class ResultHolder { + public: + ResultHolder(base::Closure callback) : callback_(callback) {} + + void OnResult(bool result) { + result_ = result; + callback_.Run(); + } + + bool GetResult() { + return result_; + } + + base::Callback<void(bool)> GetCallback() { + return base::Bind(&ResultHolder::OnResult, base::Unretained(this)); + } + + private: + base::Closure callback_; + bool result_; +}; + +} // namespace + +IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest, TestIsOGArticle) { + LoadURL(kArticlePath); + base::RunLoop run_loop_; + ResultHolder holder(run_loop_.QuitClosure()); + IsOpenGraphArticle(shell()->web_contents(), holder.GetCallback()); + run_loop_.Run(); + ASSERT_TRUE(holder.GetResult()); +} + +IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest, + TestIsNotOGArticle) { + LoadURL(kNonArticlePath); + base::RunLoop run_loop_; + ResultHolder holder(run_loop_.QuitClosure()); + IsOpenGraphArticle(shell()->web_contents(), holder.GetCallback()); + run_loop_.Run(); + ASSERT_FALSE(holder.GetResult()); +} + +IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest, + TestIsDistillablePage) { + scoped_ptr<AdaBoostProto> proto(new AdaBoostProto); + proto->set_num_features(kDerivedFeaturesCount); + proto->set_num_stumps(1); + + StumpProto* stump = proto->add_stump(); + stump->set_feature_number(0); + stump->set_weight(1); + stump->set_split(-1); + scoped_ptr<DistillablePageDetector> detector( + new DistillablePageDetector(proto.Pass())); + EXPECT_DOUBLE_EQ(0.5, 
detector->GetThreshold()); + // The first value of the first feature is either 0 or 1. Since the stump's + // split is -1, the stump weight will be applied to any set of derived + // features. + LoadURL(kArticlePath); + base::RunLoop run_loop_; + ResultHolder holder(run_loop_.QuitClosure()); + IsDistillablePageForDetector(shell()->web_contents(), detector.get(), + holder.GetCallback()); + run_loop_.Run(); + ASSERT_TRUE(holder.GetResult()); +} + +IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest, + TestIsNotDistillablePage) { + scoped_ptr<AdaBoostProto> proto(new AdaBoostProto); + proto->set_num_features(kDerivedFeaturesCount); + proto->set_num_stumps(1); + StumpProto* stump = proto->add_stump(); + stump->set_feature_number(0); + stump->set_weight(-1); + stump->set_split(-1); + scoped_ptr<DistillablePageDetector> detector( + new DistillablePageDetector(proto.Pass())); + EXPECT_DOUBLE_EQ(-0.5, detector->GetThreshold()); + // The first value of the first feature is either 0 or 1. Since the stump's + // split is -1, the stump weight will be applied to any set of derived + // features. 
+ LoadURL(kArticlePath); + base::RunLoop run_loop_; + ResultHolder holder(run_loop_.QuitClosure()); + IsDistillablePageForDetector(shell()->web_contents(), detector.get(), + holder.GetCallback()); + run_loop_.Run(); + ASSERT_FALSE(holder.GetResult()); +} + +} // namespace dom_distiller diff --git a/components/dom_distiller/core/data/distillable_page_model.bin b/components/dom_distiller/core/data/distillable_page_model.bin Binary files differnew file mode 100644 index 0000000..39d02eb --- /dev/null +++ b/components/dom_distiller/core/data/distillable_page_model.bin diff --git a/components/dom_distiller/core/distillable_page_detector.cc b/components/dom_distiller/core/distillable_page_detector.cc index d11e042..aa7ddcf 100644 --- a/components/dom_distiller/core/distillable_page_detector.cc +++ b/components/dom_distiller/core/distillable_page_detector.cc @@ -5,9 +5,25 @@ #include "components/dom_distiller/core/distillable_page_detector.h" #include "base/logging.h" +#include "grit/components_resources.h" +#include "ui/base/resource/resource_bundle.h" namespace dom_distiller { +const DistillablePageDetector* DistillablePageDetector::GetDefault() { + static DistillablePageDetector* detector = nullptr; + if (!detector) { + std::string serialized_proto = + ResourceBundle::GetSharedInstance() + .GetRawDataResource(IDR_DISTILLABLE_PAGE_SERIALIZED_MODEL) + .as_string(); + scoped_ptr<AdaBoostProto> proto(new AdaBoostProto); + CHECK(proto->ParseFromString(serialized_proto)); + detector = new DistillablePageDetector(proto.Pass()); + } + return detector; +} + DistillablePageDetector::DistillablePageDetector( scoped_ptr<AdaBoostProto> proto) : proto_(proto.Pass()), threshold_(0.0) { @@ -30,7 +46,9 @@ bool DistillablePageDetector::Classify( double DistillablePageDetector::Score( const std::vector<double>& features) const { - CHECK(features.size() == size_t(proto_->num_features())); + if (features.size() != size_t(proto_->num_features())) { + return 0.0; + } double score = 0.0; for 
(int i = 0; i < proto_->num_stumps(); ++i) { const StumpProto& stump = proto_->stump(i); diff --git a/components/dom_distiller/core/distillable_page_detector.h b/components/dom_distiller/core/distillable_page_detector.h index 220d8b0..fbe36bf 100644 --- a/components/dom_distiller/core/distillable_page_detector.h +++ b/components/dom_distiller/core/distillable_page_detector.h @@ -18,6 +18,7 @@ namespace dom_distiller { // model. class DistillablePageDetector { public: + static const DistillablePageDetector* GetDefault(); explicit DistillablePageDetector(scoped_ptr<AdaBoostProto> proto); ~DistillablePageDetector(); diff --git a/components/dom_distiller/core/distillable_page_detector_unittest.cc b/components/dom_distiller/core/distillable_page_detector_unittest.cc index 6580fcd..835f8ea 100644 --- a/components/dom_distiller/core/distillable_page_detector_unittest.cc +++ b/components/dom_distiller/core/distillable_page_detector_unittest.cc @@ -91,6 +91,18 @@ TEST(DomDistillerDistillablePageDetectorTest, TestScoreAndClassify) { EXPECT_TRUE(detector->Classify(features)); } +TEST(DomDistillerDistillablePageDetectorTest, TestScoreWrongNumberFeatures) { + scoped_ptr<DistillablePageDetector> detector = + Builder().Stump(0, 1.0, 1.0).Stump(0, 1.4, 2.0).Build(); + EXPECT_DOUBLE_EQ(1.5, detector->GetThreshold()); + + std::vector<double> features; + EXPECT_DOUBLE_EQ(0.0, detector->Score(features)); + features.push_back(-3.0); + features.push_back(1.0); + EXPECT_DOUBLE_EQ(0.0, detector->Score(features)); +} + } diff --git a/components/dom_distiller/core/javascript/extract_features.js b/components/dom_distiller/core/javascript/extract_features.js new file mode 100644 index 0000000..031254c --- /dev/null +++ b/components/dom_distiller/core/javascript/extract_features.js @@ -0,0 +1,28 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +(function() { + function hasOGArticle() { + var elems = document.head.querySelectorAll( + 'meta[property="og:type"],meta[name="og:type"]'); + for (var i in elems) { + if (elems[i].content && elems[i].content.toUpperCase() == 'ARTICLE') { + return true; + } + } + return false; + } + + var body = document.body; + return JSON.stringify({ + 'opengraph': hasOGArticle(), + 'url': document.location.href, + 'numElements': body.querySelectorAll('*').length, + 'numAnchors': body.querySelectorAll('a').length, + 'numForms': body.querySelectorAll('form').length, + 'innerText': body.innerText, + 'textContent': body.textContent, + 'innerHTML': body.innerHTML, + }); +})() diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc index 057adbf..0540588 100644 --- a/components/dom_distiller/core/page_features.cc +++ b/components/dom_distiller/core/page_features.cc @@ -6,6 +6,7 @@ #include <string> +#include "base/json/json_reader.h" #include "third_party/re2/re2/re2.h" namespace dom_distiller { @@ -47,6 +48,8 @@ bool EndsWith(const std::string& t, const std::string& s) { } } +int kDerivedFeaturesCount = 29; + std::vector<double> CalculateDerivedFeatures(bool isOGArticle, const GURL& url, double numElements, @@ -132,7 +135,18 @@ std::vector<double> CalculateDerivedFeatures(bool isOGArticle, return features; } -std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) { +std::vector<double> CalculateDerivedFeaturesFromJSON( + const base::Value* stringified_json) { + std::string stringified; + if (!stringified_json->GetAsString(&stringified)) { + return std::vector<double>(); + } + + scoped_ptr<base::Value> json(base::JSONReader::Read(stringified)); + if (!json) { + return std::vector<double>(); + } + const base::DictionaryValue* dict; if (!json->GetAsDictionary(&dict)) { return std::vector<double>(); diff --git a/components/dom_distiller/core/page_features.h b/components/dom_distiller/core/page_features.h index 
919a90a1..236796b 100644 --- a/components/dom_distiller/core/page_features.h +++ b/components/dom_distiller/core/page_features.h @@ -12,6 +12,9 @@ namespace dom_distiller { +// The length of the derived features vector. +extern int kDerivedFeaturesCount; + // The distillable page detector is a model trained on a list of numeric // features derived from core more complex features of a webpage (like the // body's .textContent). This derives the numeric features for a set of core @@ -31,7 +34,8 @@ std::vector<double> CalculateDerivedFeatures(bool isOGArticle, // Calculates the derived features from the JSON value as returned by the // javascript core feature extraction. -std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json); +std::vector<double> CalculateDerivedFeaturesFromJSON( + const base::Value* stringified_json); } // namespace dom_distiller diff --git a/components/dom_distiller/core/page_features_unittest.cc b/components/dom_distiller/core/page_features_unittest.cc index a863afc..413c55f 100644 --- a/components/dom_distiller/core/page_features_unittest.cc +++ b/components/dom_distiller/core/page_features_unittest.cc @@ -9,6 +9,7 @@ #include "base/files/file_util.h" #include "base/json/json_reader.h" +#include "base/json/json_writer.h" #include "base/memory/scoped_ptr.h" #include "base/path_service.h" #include "testing/gtest/include/gtest/gtest.h" @@ -66,8 +67,14 @@ TEST(DomDistillerPageFeaturesTest, TestCalculateDerivedFeatures) { base::DictionaryValue* core_features; ASSERT_TRUE(input_entries->GetDictionary(i, &entry)); ASSERT_TRUE(entry->GetDictionary("features", &core_features)); + // CalculateDerivedFeaturesFromJSON expects a base::Value of the stringified + // JSON (and not a base::Value of the JSON itself) + std::string stringified_json; + ASSERT_TRUE(base::JSONWriter::Write(core_features, &stringified_json)); + scoped_ptr<base::Value> stringified_value( + new base::StringValue(stringified_json)); std::vector<double> derived( - 
CalculateDerivedFeaturesFromJSON(core_features)); + CalculateDerivedFeaturesFromJSON(stringified_value.get())); ASSERT_EQ(labels.size(), derived.size()); ASSERT_TRUE(expected_output_entries->GetDictionary(i, &entry)); |