summaryrefslogtreecommitdiffstats
path: root/components/dom_distiller
diff options
context:
space:
mode:
authorcjhopman <cjhopman@chromium.org>2015-04-03 13:28:58 -0700
committerCommit bot <commit-bot@chromium.org>2015-04-03 20:29:29 +0000
commitadf25383033ff5794a810f725c78e319126b6edb (patch)
treecf3c3412d2d03c8a713f56ceb36b6a995dbba0d6 /components/dom_distiller
parent99e3f8806b9653d009990c002d815e8570fb1cee (diff)
downloadchromium_src-adf25383033ff5794a810f725c78e319126b6edb.zip
chromium_src-adf25383033ff5794a810f725c78e319126b6edb.tar.gz
chromium_src-adf25383033ff5794a810f725c78e319126b6edb.tar.bz2
Add integration of the new heuristics
This adds the trained model, the JavaScript core feature extraction, and simple functions to apply the old or new heuristics to a web_contents*.
BUG=471439
TBR=blundell@chromium.org
Review URL: https://codereview.chromium.org/1047223003
Cr-Commit-Position: refs/heads/master@{#323805}
Diffstat (limited to 'components/dom_distiller')
-rw-r--r--components/dom_distiller/content/BUILD.gn2
-rw-r--r--components/dom_distiller/content/distillable_page_utils.cc74
-rw-r--r--components/dom_distiller/content/distillable_page_utils.h25
-rw-r--r--components/dom_distiller/content/distillable_page_utils_browsertest.cc169
-rw-r--r--components/dom_distiller/core/data/distillable_page_model.binbin0 -> 2204 bytes
-rw-r--r--components/dom_distiller/core/distillable_page_detector.cc20
-rw-r--r--components/dom_distiller/core/distillable_page_detector.h1
-rw-r--r--components/dom_distiller/core/distillable_page_detector_unittest.cc12
-rw-r--r--components/dom_distiller/core/javascript/extract_features.js28
-rw-r--r--components/dom_distiller/core/page_features.cc16
-rw-r--r--components/dom_distiller/core/page_features.h6
-rw-r--r--components/dom_distiller/core/page_features_unittest.cc9
12 files changed, 358 insertions, 4 deletions
diff --git a/components/dom_distiller/content/BUILD.gn b/components/dom_distiller/content/BUILD.gn
index 35af9dd..61dd8e6 100644
--- a/components/dom_distiller/content/BUILD.gn
+++ b/components/dom_distiller/content/BUILD.gn
@@ -6,6 +6,8 @@ if (!is_ios) {
# GYP version: components/dom_distiller.gypi:dom_distiller_content
static_library("content") {
sources = [
+ "distillable_page_utils.cc",
+ "distillable_page_utils.h",
"distiller_page_web_contents.cc",
"distiller_page_web_contents.h",
"dom_distiller_viewer_source.cc",
diff --git a/components/dom_distiller/content/distillable_page_utils.cc b/components/dom_distiller/content/distillable_page_utils.cc
new file mode 100644
index 0000000..7f1b3cc
--- /dev/null
+++ b/components/dom_distiller/content/distillable_page_utils.cc
@@ -0,0 +1,74 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/dom_distiller/content/distillable_page_utils.h"
+
+#include "base/bind.h"
+#include "base/message_loop/message_loop.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/values.h"
+#include "components/dom_distiller/core/distillable_page_detector.h"
+#include "components/dom_distiller/core/page_features.h"
+#include "content/public/browser/render_frame_host.h"
+#include "grit/components_resources.h"
+#include "ui/base/resource/resource_bundle.h"
+
+namespace dom_distiller {
+namespace {
+// Forwards the outcome of the Open Graph detection script to |callback|.
+// |is_article| keeps its false default when |result| is null; for a non-null
+// |result| it holds whatever GetAsBoolean() stored into it.
+void OnOGArticleJsResult(base::Callback<void(bool)> callback,
+                         const base::Value* result) {
+  bool is_article = false;
+  if (result != nullptr)
+    result->GetAsBoolean(&is_article);
+  callback.Run(is_article);
+}
+
+// Receives the value produced by the feature-extraction script, derives the
+// numeric features from it and reports |detector|'s classification through
+// |callback|.
+//
+// |result| is checked for null before use: CalculateDerivedFeaturesFromJSON
+// dereferences its argument, and OnOGArticleJsResult likewise reports false
+// when the script produced no value.
+void OnExtractFeaturesJsResult(const DistillablePageDetector* detector,
+                               base::Callback<void(bool)> callback,
+                               const base::Value* result) {
+  if (!result) {
+    callback.Run(false);
+    return;
+  }
+  callback.Run(detector->Classify(CalculateDerivedFeaturesFromJSON(result)));
+}
+} // namespace
+
+// Executes the IDR_IS_DISTILLABLE_JS script in the main frame of
+// |web_contents| and hands its outcome to |callback| via OnOGArticleJsResult.
+// When there is no main frame, |callback| is posted to the current message
+// loop with false instead.
+void IsOpenGraphArticle(content::WebContents* web_contents,
+                        base::Callback<void(bool)> callback) {
+  if (content::RenderFrameHost* frame = web_contents->GetMainFrame()) {
+    const std::string script =
+        ResourceBundle::GetSharedInstance()
+            .GetRawDataResource(IDR_IS_DISTILLABLE_JS)
+            .as_string();
+    frame->ExecuteJavaScript(base::UTF8ToUTF16(script),
+                             base::Bind(OnOGArticleJsResult, callback));
+    return;
+  }
+
+  base::MessageLoop::current()->PostTask(FROM_HERE,
+                                         base::Bind(callback, false));
+}
+
+// Convenience wrapper around IsDistillablePageForDetector that classifies
+// with the detector returned by DistillablePageDetector::GetDefault().
+void IsDistillablePage(content::WebContents* web_contents,
+                       base::Callback<void(bool)> callback) {
+  const DistillablePageDetector* default_detector =
+      DistillablePageDetector::GetDefault();
+  IsDistillablePageForDetector(web_contents, default_detector, callback);
+}
+
+// Executes the IDR_EXTRACT_PAGE_FEATURES_JS script in the main frame of
+// |web_contents| and routes its result through OnExtractFeaturesJsResult,
+// which classifies it with |detector| and reports to |callback|. |detector|
+// is bound as a raw pointer, so it has to stay valid until the callback has
+// run. When there is no main frame, |callback| is posted to the current
+// message loop with false instead.
+void IsDistillablePageForDetector(content::WebContents* web_contents,
+                                  const DistillablePageDetector* detector,
+                                  base::Callback<void(bool)> callback) {
+  if (content::RenderFrameHost* frame = web_contents->GetMainFrame()) {
+    const std::string script =
+        ResourceBundle::GetSharedInstance()
+            .GetRawDataResource(IDR_EXTRACT_PAGE_FEATURES_JS)
+            .as_string();
+    frame->ExecuteJavaScript(
+        base::UTF8ToUTF16(script),
+        base::Bind(OnExtractFeaturesJsResult, detector, callback));
+    return;
+  }
+
+  base::MessageLoop::current()->PostTask(FROM_HERE,
+                                         base::Bind(callback, false));
+}
+
+} // namespace dom_distiller
diff --git a/components/dom_distiller/content/distillable_page_utils.h b/components/dom_distiller/content/distillable_page_utils.h
new file mode 100644
index 0000000..95216bd
--- /dev/null
+++ b/components/dom_distiller/content/distillable_page_utils.h
@@ -0,0 +1,25 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_DOM_DISTILLER_CONTENT_DISTILLABLE_PAGE_UTILS_H_
+#define COMPONENTS_DOM_DISTILLER_CONTENT_DISTILLABLE_PAGE_UTILS_H_
+
+#include "base/callback.h"
+#include "content/public/browser/web_contents.h"
+
+namespace dom_distiller {
+
+class DistillablePageDetector;
+
+// Runs the Open Graph detection script in the main frame of |web_contents|
+// and passes its boolean outcome to |callback|. |callback| receives false
+// when |web_contents| has no main frame.
+void IsOpenGraphArticle(content::WebContents* web_contents,
+ base::Callback<void(bool)> callback);
+// Same as IsDistillablePageForDetector, using the detector returned by
+// DistillablePageDetector::GetDefault().
+void IsDistillablePage(content::WebContents* web_contents,
+ base::Callback<void(bool)> callback);
+// Runs the feature-extraction script in the main frame of |web_contents| and
+// passes |detector|'s classification of the extracted features to |callback|.
+// |callback| receives false when |web_contents| has no main frame.
+// The passed detector must be alive until after the callback is called.
+void IsDistillablePageForDetector(content::WebContents* web_contents,
+ const DistillablePageDetector* detector,
+ base::Callback<void(bool)> callback);
+}  // namespace dom_distiller
+
+#endif // COMPONENTS_DOM_DISTILLER_CONTENT_DISTILLABLE_PAGE_UTILS_H_
diff --git a/components/dom_distiller/content/distillable_page_utils_browsertest.cc b/components/dom_distiller/content/distillable_page_utils_browsertest.cc
new file mode 100644
index 0000000..9525982
--- /dev/null
+++ b/components/dom_distiller/content/distillable_page_utils_browsertest.cc
@@ -0,0 +1,169 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/bind.h"
+#include "base/path_service.h"
+#include "base/run_loop.h"
+#include "components/dom_distiller/content/distillable_page_utils.h"
+#include "components/dom_distiller/core/distillable_page_detector.h"
+#include "components/dom_distiller/core/page_features.h"
+#include "content/public/browser/browser_context.h"
+#include "content/public/browser/render_frame_host.h"
+#include "content/public/browser/web_contents_observer.h"
+#include "content/public/test/content_browser_test.h"
+#include "content/shell/browser/shell.h"
+#include "net/test/embedded_test_server/embedded_test_server.h"
+#include "ui/base/resource/resource_bundle.h"
+
+namespace dom_distiller {
+namespace {
+
+// Paths (relative to the embedded test server root) of the pages the tests
+// load. Declared as const arrays so that neither the characters nor the
+// binding itself can be modified.
+const char kArticlePath[] = "/og_article.html";
+const char kNonArticlePath[] = "/non_og_article.html";
+
+// Browser-test fixture that serves components/test/data/dom_distiller through
+// the embedded test server and can block until the main-frame document of a
+// navigation has loaded. WebContentsObserver is named without an access
+// specifier, so it is a private base of this class.
+class DomDistillerDistillablePageUtilsTest : public content::ContentBrowserTest,
+ content::WebContentsObserver {
+ public:
+ // Registers the test resource pak and starts the test server before running
+ // the base-class setup.
+ void SetUpOnMainThread() override {
+ AddComponentsResources();
+ SetUpTestServer();
+ ContentBrowserTest::SetUpOnMainThread();
+ }
+
+ // Navigates the shell's WebContents to |url| (resolved against the embedded
+ // test server) and spins a RunLoop until DocumentLoadedInFrame() fires for
+ // the main frame.
+ void LoadURL(const std::string& url) {
+ content::WebContents* current_web_contents = shell()->web_contents();
+ Observe(current_web_contents);
+ base::RunLoop url_loaded_runner;
+ main_frame_loaded_callback_ = url_loaded_runner.QuitClosure();
+ current_web_contents->GetController().LoadURL(
+ embedded_test_server()->GetURL(url),
+ content::Referrer(),
+ ui::PAGE_TRANSITION_TYPED,
+ std::string());
+ url_loaded_runner.Run();
+ // Drop the quit closure and stop observing so that later load events
+ // cannot run a callback that has already been reset.
+ main_frame_loaded_callback_ = base::Closure();
+ Observe(nullptr);
+ }
+
+ private:
+ // Adds components_tests_resources.pak, looked up next to the running
+ // module, to the shared ResourceBundle.
+ void AddComponentsResources() {
+ base::FilePath pak_file;
+ base::FilePath pak_dir;
+ PathService::Get(base::DIR_MODULE, &pak_dir);
+ pak_file =
+ pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
+ ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
+ pak_file, ui::SCALE_FACTOR_NONE);
+ }
+
+ // Points the embedded test server at the dom_distiller test data directory
+ // under the source root and waits for the server to become ready.
+ void SetUpTestServer() {
+ base::FilePath path;
+ PathService::Get(base::DIR_SOURCE_ROOT, &path);
+ path = path.AppendASCII("components/test/data/dom_distiller");
+ embedded_test_server()->ServeFilesFromDirectory(path);
+ ASSERT_TRUE(embedded_test_server()->InitializeAndWaitUntilReady());
+ }
+
+ // content::WebContentsObserver override. Only a frame without a parent
+ // (the main frame) runs the pending quit closure.
+ void DocumentLoadedInFrame(
+ content::RenderFrameHost* render_frame_host) override {
+ if (!render_frame_host->GetParent())
+ main_frame_loaded_callback_.Run();
+ }
+
+ // Quit closure of the RunLoop currently blocking in LoadURL(); reset to an
+ // empty closure once that RunLoop has finished.
+ base::Closure main_frame_loaded_callback_;
+};
+
+// Captures the bool delivered to a base::Callback<void(bool)> and then runs
+// the closure supplied at construction (the tests pass a RunLoop quit
+// closure), so a test can block until the asynchronous result has arrived.
+class ResultHolder {
+ public:
+  // |callback| is run once per OnResult() call, after the result is stored.
+  // |result_| starts out false so that GetResult() never reads an
+  // uninitialized value.
+  explicit ResultHolder(base::Closure callback)
+      : callback_(callback), result_(false) {}
+
+  // Stores |result| and signals completion by running the closure.
+  void OnResult(bool result) {
+    result_ = result;
+    callback_.Run();
+  }
+
+  // Returns the last value passed to OnResult(), or false if OnResult() has
+  // not been called yet.
+  bool GetResult() const { return result_; }
+
+  // Returns a callback bound to this object. base::Unretained is used, so the
+  // holder must outlive every invocation of the returned callback.
+  base::Callback<void(bool)> GetCallback() {
+    return base::Bind(&ResultHolder::OnResult, base::Unretained(this));
+  }
+
+ private:
+  base::Closure callback_;
+  bool result_;
+};
+
+} // namespace
+
+// Loads kArticlePath and expects IsOpenGraphArticle() to report true.
+IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest, TestIsOGArticle) {
+  LoadURL(kArticlePath);
+
+  base::RunLoop wait_for_result;
+  ResultHolder og_result(wait_for_result.QuitClosure());
+  IsOpenGraphArticle(shell()->web_contents(), og_result.GetCallback());
+  wait_for_result.Run();
+
+  ASSERT_TRUE(og_result.GetResult());
+}
+
+// Loads kNonArticlePath and expects IsOpenGraphArticle() to report false.
+IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest,
+                       TestIsNotOGArticle) {
+  LoadURL(kNonArticlePath);
+
+  base::RunLoop wait_for_result;
+  ResultHolder og_result(wait_for_result.QuitClosure());
+  IsOpenGraphArticle(shell()->web_contents(), og_result.GetCallback());
+  wait_for_result.Run();
+
+  ASSERT_FALSE(og_result.GetResult());
+}
+
+// Classifies kArticlePath with a one-stump model carrying a positive weight
+// and expects IsDistillablePageForDetector() to report true.
+IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest,
+                       TestIsDistillablePage) {
+  scoped_ptr<AdaBoostProto> model(new AdaBoostProto);
+  model->set_num_features(kDerivedFeaturesCount);
+  model->set_num_stumps(1);
+
+  // The first value of the first feature is either 0 or 1. Since the stump's
+  // split is -1, the stump weight will be applied to any set of derived
+  // features.
+  StumpProto* always_on_stump = model->add_stump();
+  always_on_stump->set_feature_number(0);
+  always_on_stump->set_weight(1);
+  always_on_stump->set_split(-1);
+
+  scoped_ptr<DistillablePageDetector> positive_detector(
+      new DistillablePageDetector(model.Pass()));
+  EXPECT_DOUBLE_EQ(0.5, positive_detector->GetThreshold());
+
+  LoadURL(kArticlePath);
+
+  base::RunLoop wait_for_result;
+  ResultHolder classification(wait_for_result.QuitClosure());
+  IsDistillablePageForDetector(shell()->web_contents(),
+                               positive_detector.get(),
+                               classification.GetCallback());
+  wait_for_result.Run();
+
+  ASSERT_TRUE(classification.GetResult());
+}
+
+// Classifies kArticlePath with a one-stump model carrying a negative weight
+// and expects IsDistillablePageForDetector() to report false.
+IN_PROC_BROWSER_TEST_F(DomDistillerDistillablePageUtilsTest,
+                       TestIsNotDistillablePage) {
+  scoped_ptr<AdaBoostProto> model(new AdaBoostProto);
+  model->set_num_features(kDerivedFeaturesCount);
+  model->set_num_stumps(1);
+
+  // The first value of the first feature is either 0 or 1. Since the stump's
+  // split is -1, the stump weight will be applied to any set of derived
+  // features.
+  StumpProto* always_on_stump = model->add_stump();
+  always_on_stump->set_feature_number(0);
+  always_on_stump->set_weight(-1);
+  always_on_stump->set_split(-1);
+
+  scoped_ptr<DistillablePageDetector> negative_detector(
+      new DistillablePageDetector(model.Pass()));
+  EXPECT_DOUBLE_EQ(-0.5, negative_detector->GetThreshold());
+
+  LoadURL(kArticlePath);
+
+  base::RunLoop wait_for_result;
+  ResultHolder classification(wait_for_result.QuitClosure());
+  IsDistillablePageForDetector(shell()->web_contents(),
+                               negative_detector.get(),
+                               classification.GetCallback());
+  wait_for_result.Run();
+
+  ASSERT_FALSE(classification.GetResult());
+}
+
+} // namespace dom_distiller
diff --git a/components/dom_distiller/core/data/distillable_page_model.bin b/components/dom_distiller/core/data/distillable_page_model.bin
new file mode 100644
index 0000000..39d02eb
--- /dev/null
+++ b/components/dom_distiller/core/data/distillable_page_model.bin
Binary files differ
diff --git a/components/dom_distiller/core/distillable_page_detector.cc b/components/dom_distiller/core/distillable_page_detector.cc
index d11e042..aa7ddcf 100644
--- a/components/dom_distiller/core/distillable_page_detector.cc
+++ b/components/dom_distiller/core/distillable_page_detector.cc
@@ -5,9 +5,25 @@
#include "components/dom_distiller/core/distillable_page_detector.h"
#include "base/logging.h"
+#include "grit/components_resources.h"
+#include "ui/base/resource/resource_bundle.h"
namespace dom_distiller {
+// Returns the shared default detector, building it on first use from the
+// IDR_DISTILLABLE_PAGE_SERIALIZED_MODEL resource. The instance is held in a
+// function-local static pointer and is not deleted by this function.
+// CHECK-fails if the resource does not parse as an AdaBoostProto.
+const DistillablePageDetector* DistillablePageDetector::GetDefault() {
+  static DistillablePageDetector* default_detector = nullptr;
+  if (default_detector)
+    return default_detector;
+
+  scoped_ptr<AdaBoostProto> model(new AdaBoostProto);
+  CHECK(model->ParseFromString(
+      ResourceBundle::GetSharedInstance()
+          .GetRawDataResource(IDR_DISTILLABLE_PAGE_SERIALIZED_MODEL)
+          .as_string()));
+  default_detector = new DistillablePageDetector(model.Pass());
+  return default_detector;
+}
+
DistillablePageDetector::DistillablePageDetector(
scoped_ptr<AdaBoostProto> proto)
: proto_(proto.Pass()), threshold_(0.0) {
@@ -30,7 +46,9 @@ bool DistillablePageDetector::Classify(
double DistillablePageDetector::Score(
const std::vector<double>& features) const {
- CHECK(features.size() == size_t(proto_->num_features()));
+ if (features.size() != size_t(proto_->num_features())) {
+ return 0.0;
+ }
double score = 0.0;
for (int i = 0; i < proto_->num_stumps(); ++i) {
const StumpProto& stump = proto_->stump(i);
diff --git a/components/dom_distiller/core/distillable_page_detector.h b/components/dom_distiller/core/distillable_page_detector.h
index 220d8b0..fbe36bf 100644
--- a/components/dom_distiller/core/distillable_page_detector.h
+++ b/components/dom_distiller/core/distillable_page_detector.h
@@ -18,6 +18,7 @@ namespace dom_distiller {
// model.
class DistillablePageDetector {
public:
+ static const DistillablePageDetector* GetDefault();
explicit DistillablePageDetector(scoped_ptr<AdaBoostProto> proto);
~DistillablePageDetector();
diff --git a/components/dom_distiller/core/distillable_page_detector_unittest.cc b/components/dom_distiller/core/distillable_page_detector_unittest.cc
index 6580fcd..835f8ea 100644
--- a/components/dom_distiller/core/distillable_page_detector_unittest.cc
+++ b/components/dom_distiller/core/distillable_page_detector_unittest.cc
@@ -91,6 +91,18 @@ TEST(DomDistillerDistillablePageDetectorTest, TestScoreAndClassify) {
EXPECT_TRUE(detector->Classify(features));
}
+// Score() returns 0.0 when the feature vector's length differs from the
+// model's feature count; this test feeds it an empty and a two-element vector.
+// NOTE(review): assumes the model produced by Builder expects neither 0 nor 2
+// features -- confirm against Builder, otherwise the second expectation does
+// not exercise the mismatch path.
+TEST(DomDistillerDistillablePageDetectorTest, TestScoreWrongNumberFeatures) {
+ scoped_ptr<DistillablePageDetector> detector =
+ Builder().Stump(0, 1.0, 1.0).Stump(0, 1.4, 2.0).Build();
+ EXPECT_DOUBLE_EQ(1.5, detector->GetThreshold());
+
+ std::vector<double> features;
+ // No features at all.
+ EXPECT_DOUBLE_EQ(0.0, detector->Score(features));
+ features.push_back(-3.0);
+ features.push_back(1.0);
+ // Two features.
+ EXPECT_DOUBLE_EQ(0.0, detector->Score(features));
+}
+
}
diff --git a/components/dom_distiller/core/javascript/extract_features.js b/components/dom_distiller/core/javascript/extract_features.js
new file mode 100644
index 0000000..031254c
--- /dev/null
+++ b/components/dom_distiller/core/javascript/extract_features.js
@@ -0,0 +1,28 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Collects the core page features consumed by the distillable-page detector
+// and evaluates to them as a JSON string.
+(function() {
+  // Returns true when the document head contains an og:type meta tag whose
+  // content equals "article" (compared case-insensitively).
+  function hasOGArticle() {
+    var elems = document.head.querySelectorAll(
+        'meta[property="og:type"],meta[name="og:type"]');
+    // Walk the NodeList by index. for...in would also visit every enumerable
+    // non-element property (e.g. "length", "item", or anything the page added
+    // to the prototypes), and a null-valued one would throw on ".content".
+    for (var i = 0; i < elems.length; i++) {
+      var content = elems[i].content;
+      if (content && content.toUpperCase() == 'ARTICLE') {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  var body = document.body;
+  return JSON.stringify({
+    'opengraph': hasOGArticle(),
+    'url': document.location.href,
+    'numElements': body.querySelectorAll('*').length,
+    'numAnchors': body.querySelectorAll('a').length,
+    'numForms': body.querySelectorAll('form').length,
+    'innerText': body.innerText,
+    'textContent': body.textContent,
+    'innerHTML': body.innerHTML,
+  });
+})()
diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc
index 057adbf..0540588 100644
--- a/components/dom_distiller/core/page_features.cc
+++ b/components/dom_distiller/core/page_features.cc
@@ -6,6 +6,7 @@
#include <string>
+#include "base/json/json_reader.h"
#include "third_party/re2/re2/re2.h"
namespace dom_distiller {
@@ -47,6 +48,8 @@ bool EndsWith(const std::string& t, const std::string& s) {
}
}
+int kDerivedFeaturesCount = 29;
+
std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
const GURL& url,
double numElements,
@@ -132,7 +135,18 @@ std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
return features;
}
-std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) {
+std::vector<double> CalculateDerivedFeaturesFromJSON(
+ const base::Value* stringified_json) {
+ std::string stringified;
+ if (!stringified_json->GetAsString(&stringified)) {
+ return std::vector<double>();
+ }
+
+ scoped_ptr<base::Value> json(base::JSONReader::Read(stringified));
+ if (!json) {
+ return std::vector<double>();
+ }
+
const base::DictionaryValue* dict;
if (!json->GetAsDictionary(&dict)) {
return std::vector<double>();
diff --git a/components/dom_distiller/core/page_features.h b/components/dom_distiller/core/page_features.h
index 919a90a1..236796b 100644
--- a/components/dom_distiller/core/page_features.h
+++ b/components/dom_distiller/core/page_features.h
@@ -12,6 +12,9 @@
namespace dom_distiller {
+// The length of the derived features vector.
+extern int kDerivedFeaturesCount;
+
// The distillable page detector is a model trained on a list of numeric
// features derived from core more complex features of a webpage (like the
// body's .textContent). This derives the numeric features for a set of core
@@ -31,7 +34,8 @@ std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
// Calculates the derived features from the JSON value as returned by the
// javascript core feature extraction.
-std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json);
+std::vector<double> CalculateDerivedFeaturesFromJSON(
+ const base::Value* stringified_json);
} // namespace dom_distiller
diff --git a/components/dom_distiller/core/page_features_unittest.cc b/components/dom_distiller/core/page_features_unittest.cc
index a863afc..413c55f 100644
--- a/components/dom_distiller/core/page_features_unittest.cc
+++ b/components/dom_distiller/core/page_features_unittest.cc
@@ -9,6 +9,7 @@
#include "base/files/file_util.h"
#include "base/json/json_reader.h"
+#include "base/json/json_writer.h"
#include "base/memory/scoped_ptr.h"
#include "base/path_service.h"
#include "testing/gtest/include/gtest/gtest.h"
@@ -66,8 +67,14 @@ TEST(DomDistillerPageFeaturesTest, TestCalculateDerivedFeatures) {
base::DictionaryValue* core_features;
ASSERT_TRUE(input_entries->GetDictionary(i, &entry));
ASSERT_TRUE(entry->GetDictionary("features", &core_features));
+ // CalculateDerivedFeaturesFromJSON expects a base::Value of the stringified
+ // JSON (and not a base::Value of the JSON itself)
+ std::string stringified_json;
+ ASSERT_TRUE(base::JSONWriter::Write(core_features, &stringified_json));
+ scoped_ptr<base::Value> stringified_value(
+ new base::StringValue(stringified_json));
std::vector<double> derived(
- CalculateDerivedFeaturesFromJSON(core_features));
+ CalculateDerivedFeaturesFromJSON(stringified_value.get()));
ASSERT_EQ(labels.size(), derived.size());
ASSERT_TRUE(expected_output_entries->GetDictionary(i, &entry));