summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorshashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2014-02-05 05:56:20 +0000
committershashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2014-02-05 05:56:20 +0000
commit415f99b18e6f14c53e08f6574a3bc653049fbaae (patch)
tree6039149dc08176419753d850eba0cc74ff1b1a43
parent640e10a877584a90c8200490c41c937501cca5a3 (diff)
downloadchromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.zip
chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.gz
chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.bz2
Add support for multipage distillation.
Add support for distilling multiple pages, currently support is really basic and just distills pages sequentially. BUG=288015 TEST=included NOTRY=true R=cjhopman@chromium.org Review URL: https://codereview.chromium.org/146843010 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@248867 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--components/dom_distiller.gypi3
-rw-r--r--components/dom_distiller/content/distiller_page_web_contents.cc18
-rw-r--r--components/dom_distiller/content/distiller_page_web_contents_browsertest.cc3
-rw-r--r--components/dom_distiller/content/dom_distiller_viewer_source.cc23
-rw-r--r--components/dom_distiller/core/distiller.cc138
-rw-r--r--components/dom_distiller/core/distiller.h54
-rw-r--r--components/dom_distiller/core/distiller_page.cc8
-rw-r--r--components/dom_distiller/core/distiller_page.h10
-rw-r--r--components/dom_distiller/core/distiller_unittest.cc205
-rw-r--r--components/dom_distiller/core/dom_distiller_service.cc14
-rw-r--r--components/dom_distiller/core/dom_distiller_service.h4
-rw-r--r--components/dom_distiller/core/dom_distiller_service_unittest.cc84
-rw-r--r--components/dom_distiller/core/fake_distiller.cc20
-rw-r--r--components/dom_distiller/core/fake_distiller.h14
-rw-r--r--components/dom_distiller/core/page_distiller.cc91
-rw-r--r--components/dom_distiller/core/page_distiller.h70
-rw-r--r--components/dom_distiller/core/proto/distilled_article.proto21
-rw-r--r--components/dom_distiller/core/proto/distilled_page.proto3
-rw-r--r--components/dom_distiller/core/task_tracker.cc41
-rw-r--r--components/dom_distiller/core/task_tracker.h15
-rw-r--r--components/dom_distiller/core/task_tracker_unittest.cc6
-rw-r--r--third_party/readability/README.chromium2
-rw-r--r--third_party/readability/js/readability.js222
23 files changed, 772 insertions, 297 deletions
diff --git a/components/dom_distiller.gypi b/components/dom_distiller.gypi
index 8cb8901..c22d5c0 100644
--- a/components/dom_distiller.gypi
+++ b/components/dom_distiller.gypi
@@ -84,6 +84,8 @@
'dom_distiller/core/dom_distiller_service.h',
'dom_distiller/core/dom_distiller_store.cc',
'dom_distiller/core/dom_distiller_store.h',
+ 'dom_distiller/core/page_distiller.cc',
+ 'dom_distiller/core/page_distiller.h',
'dom_distiller/core/task_tracker.cc',
'dom_distiller/core/task_tracker.h',
],
@@ -112,6 +114,7 @@
'target_name': 'distilled_page_proto',
'type': 'static_library',
'sources': [
+ 'dom_distiller/core/proto/distilled_article.proto',
'dom_distiller/core/proto/distilled_page.proto',
],
'variables': {
diff --git a/components/dom_distiller/content/distiller_page_web_contents.cc b/components/dom_distiller/content/distiller_page_web_contents.cc
index c0c7ff5..06bd6bc 100644
--- a/components/dom_distiller/content/distiller_page_web_contents.cc
+++ b/components/dom_distiller/content/distiller_page_web_contents.cc
@@ -55,16 +55,22 @@ void DistillerPageWebContents::ExecuteJavaScriptImpl(
base::string16(), // frame_xpath
base::UTF8ToUTF16(script),
base::Bind(&DistillerPage::OnExecuteJavaScriptDone,
- base::Unretained(this)));
+ base::Unretained(this),
+ web_contents_->GetLastCommittedURL()));
}
void DistillerPageWebContents::DidFinishLoad(int64 frame_id,
const GURL& validated_url,
bool is_main_frame,
RenderViewHost* render_view_host) {
- content::WebContentsObserver::Observe(NULL);
- OnLoadURLDone();
+ // TODO(shashishekhar): Find a better way to detect when it is safe to run the
+ // distillation script. Waiting for the entire page to load is really slow.
+ if (is_main_frame) {
+ content::WebContentsObserver::Observe(NULL);
+ OnLoadURLDone();
+ }
}
+
void DistillerPageWebContents::DidFailLoad(
int64 frame_id,
const GURL& validated_url,
@@ -72,8 +78,10 @@ void DistillerPageWebContents::DidFailLoad(
int error_code,
const base::string16& error_description,
RenderViewHost* render_view_host) {
- content::WebContentsObserver::Observe(NULL);
- OnLoadURLFailed();
+ if (is_main_frame) {
+ content::WebContentsObserver::Observe(NULL);
+ OnLoadURLFailed();
+ }
}
} // namespace dom_distiller
diff --git a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc
index 42446e2..c90bd96 100644
--- a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc
+++ b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc
@@ -48,7 +48,8 @@ class DistillerPageWebContentsTest
distiller_page_->ExecuteJavaScript(kScript);
}
- virtual void OnExecuteJavaScriptDone(const base::Value* value) OVERRIDE {
+ virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+ const base::Value* value) OVERRIDE {
value_ = value->DeepCopy();
quit_closure_.Run();
}
diff --git a/components/dom_distiller/content/dom_distiller_viewer_source.cc b/components/dom_distiller/content/dom_distiller_viewer_source.cc
index 2f23a07..8e913d5 100644
--- a/components/dom_distiller/content/dom_distiller_viewer_source.cc
+++ b/components/dom_distiller/content/dom_distiller_viewer_source.cc
@@ -4,6 +4,7 @@
#include "components/dom_distiller/content/dom_distiller_viewer_source.h"
+#include <sstream>
#include <string>
#include <vector>
@@ -12,6 +13,7 @@
#include "base/message_loop/message_loop.h"
#include "base/strings/string_util.h"
#include "components/dom_distiller/core/dom_distiller_service.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
#include "components/dom_distiller/core/proto/distilled_page.pb.h"
#include "components/dom_distiller/core/task_tracker.h"
#include "content/public/browser/render_frame_host.h"
@@ -53,7 +55,8 @@ class RequestViewerHandle : public ViewRequestDelegate {
virtual ~RequestViewerHandle();
// ViewRequestDelegate implementation.
- virtual void OnArticleReady(DistilledPageProto* proto) OVERRIDE;
+ virtual void OnArticleReady(const DistilledArticleProto* article_proto)
+ OVERRIDE;
void TakeViewerHandle(scoped_ptr<ViewerHandle> viewer_handle);
@@ -72,13 +75,21 @@ RequestViewerHandle::RequestViewerHandle(
RequestViewerHandle::~RequestViewerHandle() {}
-void RequestViewerHandle::OnArticleReady(DistilledPageProto* proto) {
- DCHECK(proto);
+void RequestViewerHandle::OnArticleReady(
+ const DistilledArticleProto* article_proto) {
+ DCHECK(article_proto);
std::string title;
std::string unsafe_article_html;
- if (proto->has_title() && proto->has_html()) {
- title = net::EscapeForHTML(proto->title());
- unsafe_article_html = proto->html();
+ if (article_proto->has_title() && article_proto->pages_size() > 0 &&
+ article_proto->pages(0).has_html()) {
+ title = net::EscapeForHTML(article_proto->title());
+ // TODO(shashishekhar): Add support for correcting displaying multiple pages
+ // after discussing the right way to display them.
+ std::ostringstream unsafe_output_stream;
+ for (int page_num = 0; page_num < article_proto->pages_size(); ++page_num) {
+ unsafe_output_stream << article_proto->pages(page_num).html();
+ }
+ unsafe_article_html = unsafe_output_stream.str();
} else {
title = l10n_util::GetStringUTF8(IDS_DOM_DISTILLER_VIEWER_NO_DATA_TITLE);
unsafe_article_html =
diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc
index 84b8a05..7b4c92a 100644
--- a/components/dom_distiller/core/distiller.cc
+++ b/components/dom_distiller/core/distiller.cc
@@ -8,16 +8,19 @@
#include "base/bind.h"
#include "base/callback.h"
-#include "base/strings/stringprintf.h"
+#include "base/strings/string_number_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "base/values.h"
#include "components/dom_distiller/core/distiller_page.h"
#include "components/dom_distiller/core/distiller_url_fetcher.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
#include "components/dom_distiller/core/proto/distilled_page.pb.h"
-#include "grit/dom_distiller_resources.h"
#include "net/url_request/url_request_context_getter.h"
-#include "ui/base/resource/resource_bundle.h"
-#include "url/gurl.h"
+
+namespace {
+// Maximum number of distilled pages in an article.
+const int kMaxPagesInArticle = 32;
+}
namespace dom_distiller {
@@ -39,90 +42,101 @@ scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
DistillerImpl::DistillerImpl(
const DistillerPageFactory& distiller_page_factory,
const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
- : distiller_page_factory_(distiller_page_factory),
- distiller_url_fetcher_factory_(distiller_url_fetcher_factory) {
- distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass();
+ : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
+ distillation_in_progress_(false) {
+ page_distiller_.reset(new PageDistiller(distiller_page_factory));
}
DistillerImpl::~DistillerImpl() {
}
void DistillerImpl::Init() {
- distiller_page_->Init();
+ DCHECK(!distillation_in_progress_);
+ page_distiller_->Init();
+ article_proto_.reset(new DistilledArticleProto());
}
void DistillerImpl::DistillPage(const GURL& url,
const DistillerCallback& distillation_cb) {
+ DCHECK(!distillation_in_progress_);
distillation_cb_ = distillation_cb;
- proto_.reset(new DistilledPageProto());
- proto_->set_url(url.spec());
- LoadURL(url);
+ DistillPage(url);
}
-void DistillerImpl::LoadURL(const GURL& url) {
- distiller_page_->LoadURL(url);
+void DistillerImpl::DistillPage(const GURL& url) {
+ DCHECK(!distillation_in_progress_);
+ if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle &&
+ processed_urls_.find(url.spec()) == processed_urls_.end()) {
+ distillation_in_progress_ = true;
+ // Distill the next page.
+ DCHECK(url.is_valid());
+ DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle);
+ page_distiller_->DistillPage(
+ url,
+ base::Bind(&DistillerImpl::OnPageDistillationFinished,
+ base::Unretained(this),
+ url));
+ } else {
+ RunDistillerCallbackIfDone();
+ }
}
-void DistillerImpl::OnLoadURLDone() {
- GetDistilledContent();
-}
+void DistillerImpl::OnPageDistillationFinished(
+ const GURL& page_url,
+ scoped_ptr<DistilledPageInfo> distilled_page,
+ bool distillation_successful) {
+ DCHECK(distillation_in_progress_);
+ DCHECK(distilled_page.get());
+ if (!distillation_successful) {
+ RunDistillerCallbackIfDone();
+ } else {
+ DistilledPageProto* current_page = article_proto_->add_pages();
+ // Set the title of the article as the title of the first page.
+ if (article_proto_->pages_size() == 1) {
+ article_proto_->set_title(distilled_page->title);
+ }
-void DistillerImpl::GetDistilledContent() {
- std::string script =
- ResourceBundle::GetSharedInstance().GetRawDataResource(
- IDR_DISTILLER_JS).as_string();
- distiller_page_->ExecuteJavaScript(script);
-}
+ current_page->set_url(page_url.spec());
+ current_page->set_html(distilled_page->html);
-void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) {
- std::string result;
- bool fetched_image = false;
- const base::ListValue* result_list = NULL;
- if (!value->GetAsList(&result_list)) {
- DCHECK(proto_);
- distillation_cb_.Run(proto_.Pass());
- return;
- }
- int i = 0;
- for (base::ListValue::const_iterator iter = result_list->begin();
- iter != result_list->end(); ++iter, ++i) {
- std::string item;
- (*iter)->GetAsString(&item);
- // The JavaScript returns an array where the first element is the title,
- // the second element is the article content HTML, and the remaining
- // elements are image URLs referenced in the HTML.
- switch (i) {
- case 0:
- proto_->set_title(item);
- break;
- case 1:
- proto_->set_html(item);
- break;
- default:
- int image_number = i - 2;
- std::string image_id = base::StringPrintf("%d", image_number);
- FetchImage(image_id, item);
- fetched_image = true;
+ GURL next_page_url(distilled_page->next_page_url);
+ if (next_page_url.is_valid()) {
+ // The pages should be in same origin.
+ DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
+ }
+
+ processed_urls_.insert(page_url.spec());
+ distillation_in_progress_ = false;
+ int page_number = article_proto_->pages_size();
+ for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
+ ++img_num) {
+ std::string image_id =
+ base::IntToString(page_number) + "_" + base::IntToString(img_num);
+ FetchImage(current_page, image_id, distilled_page->image_urls[img_num]);
}
+ DistillPage(next_page_url);
}
- if (!fetched_image)
- distillation_cb_.Run(proto_.Pass());
}
-void DistillerImpl::FetchImage(const std::string& image_id,
+void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto,
+ const std::string& image_id,
const std::string& item) {
DistillerURLFetcher* fetcher =
distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
image_fetchers_[image_id] = fetcher;
fetcher->FetchURL(item,
base::Bind(&DistillerImpl::OnFetchImageDone,
- base::Unretained(this), image_id));
+ base::Unretained(this),
+ base::Unretained(distilled_page_proto),
+ image_id));
}
-void DistillerImpl::OnFetchImageDone(const std::string& id,
+void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto,
+ const std::string& id,
const std::string& response) {
- DCHECK(proto_);
- DistilledPageProto_Image* image = proto_->add_image();
+ DCHECK_GT(article_proto_->pages_size(), 0);
+ DCHECK(distilled_page_proto);
+ DistilledPageProto_Image* image = distilled_page_proto->add_image();
image->set_name(id);
image->set_data(response);
DCHECK(image_fetchers_.end() != image_fetchers_.find(id));
@@ -130,8 +144,12 @@ void DistillerImpl::OnFetchImageDone(const std::string& id,
int result = image_fetchers_.erase(id);
delete fetcher;
DCHECK_EQ(1, result);
- if (image_fetchers_.empty()) {
- distillation_cb_.Run(proto_.Pass());
+ RunDistillerCallbackIfDone();
+}
+
+void DistillerImpl::RunDistillerCallbackIfDone() {
+ if (image_fetchers_.empty() && !distillation_in_progress_) {
+ distillation_cb_.Run(article_proto_.Pass());
}
}
diff --git a/components/dom_distiller/core/distiller.h b/components/dom_distiller/core/distiller.h
index adbd41b..9ba5723 100644
--- a/components/dom_distiller/core/distiller.h
+++ b/components/dom_distiller/core/distiller.h
@@ -5,16 +5,14 @@
#ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
#define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
-#include <map>
#include <string>
#include "base/callback.h"
-#include "base/gtest_prod_util.h"
-#include "base/memory/ref_counted.h"
-#include "base/values.h"
-#include "components/dom_distiller/core/distiller_page.h"
+#include "base/containers/hash_tables.h"
+#include "base/memory/scoped_ptr.h"
#include "components/dom_distiller/core/distiller_url_fetcher.h"
-#include "components/dom_distiller/core/proto/distilled_page.pb.h"
+#include "components/dom_distiller/core/page_distiller.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
#include "net/url_request/url_request_context_getter.h"
#include "url/gurl.h"
@@ -24,8 +22,8 @@ class DistillerImpl;
class Distiller {
public:
- typedef base::Callback<void(
- scoped_ptr<DistilledPageProto>)> DistillerCallback;
+ typedef base::Callback<void(scoped_ptr<DistilledArticleProto>)>
+ DistillerCallback;
virtual ~Distiller() {}
// Distills a page, and asynchrounously returns the article HTML to the
@@ -55,8 +53,7 @@ class DistillerFactoryImpl : public DistillerFactory {
};
// Distills a article from a page and associated pages.
-class DistillerImpl : public Distiller,
- public DistillerPage::Delegate {
+class DistillerImpl : public Distiller {
public:
DistillerImpl(
const DistillerPageFactory& distiller_page_factory,
@@ -70,28 +67,35 @@ class DistillerImpl : public Distiller,
virtual void DistillPage(const GURL& url,
const DistillerCallback& callback) OVERRIDE;
- // PageDistillerContext::Delegate
- virtual void OnLoadURLDone() OVERRIDE;
- virtual void OnExecuteJavaScriptDone(const base::Value* value) OVERRIDE;
+ private:
+ void OnFetchImageDone(DistilledPageProto* distilled_page_proto,
+ const std::string& id,
+ const std::string& response);
- void OnFetchImageDone(const std::string& id, const std::string& response);
+ void OnPageDistillationFinished(const GURL& page_url,
+ scoped_ptr<DistilledPageInfo> distilled_page,
+ bool distillation_successful);
- private:
- virtual void LoadURL(const GURL& url);
- virtual void FetchImage(const std::string& image_id, const std::string& item);
+ virtual void FetchImage(DistilledPageProto* distilled_page_proto,
+ const std::string& image_id,
+ const std::string& item);
- // Injects JavaScript to distill a loaded page down to its important content,
- // e.g., extracting a news article from its surrounding boilerplate.
- void GetDistilledContent();
+ // Distills the page and adds the new page to |article_proto|.
+ void DistillPage(const GURL& url);
+
+ // Runs |distillation_cb_| if all distillation callbacks and image fetches are
+ // complete.
+ void RunDistillerCallbackIfDone();
- const DistillerPageFactory& distiller_page_factory_;
const DistillerURLFetcherFactory& distiller_url_fetcher_factory_;
- scoped_ptr<DistillerPage> distiller_page_;
+ scoped_ptr<PageDistiller> page_distiller_;
DistillerCallback distillation_cb_;
- std::map<std::string, DistillerURLFetcher* > image_fetchers_;
-
- scoped_ptr<DistilledPageProto> proto_;
+ base::hash_map<std::string, DistillerURLFetcher*> image_fetchers_;
+ scoped_ptr<DistilledArticleProto> article_proto_;
+ bool distillation_in_progress_;
+ // Set to keep track of which urls are already seen by the distiller.
+ base::hash_set<std::string> processed_urls_;
DISALLOW_COPY_AND_ASSIGN(DistillerImpl);
};
diff --git a/components/dom_distiller/core/distiller_page.cc b/components/dom_distiller/core/distiller_page.cc
index cc486c7..3f4bffe 100644
--- a/components/dom_distiller/core/distiller_page.cc
+++ b/components/dom_distiller/core/distiller_page.cc
@@ -52,16 +52,16 @@ void DistillerPage::OnLoadURLFailed() {
scoped_ptr<base::Value> empty(base::Value::CreateNullValue());
if (!delegate_)
return;
- delegate_->OnExecuteJavaScriptDone(empty.get());
+ delegate_->OnExecuteJavaScriptDone(GURL(), empty.get());
}
-void DistillerPage::OnExecuteJavaScriptDone(
- const base::Value* value) {
+void DistillerPage::OnExecuteJavaScriptDone(const GURL& page_url,
+ const base::Value* value) {
DCHECK_EQ(EXECUTING_JAVASCRIPT, state_);
state_ = PAGE_AVAILABLE;
if (!delegate_)
return;
- delegate_->OnExecuteJavaScriptDone(value);
+ delegate_->OnExecuteJavaScriptDone(page_url, value);
}
} // namespace dom_distiller
diff --git a/components/dom_distiller/core/distiller_page.h b/components/dom_distiller/core/distiller_page.h
index 291ef7e..4e22e8d 100644
--- a/components/dom_distiller/core/distiller_page.h
+++ b/components/dom_distiller/core/distiller_page.h
@@ -22,7 +22,8 @@ class DistillerPage {
public:
virtual ~Delegate() {}
virtual void OnLoadURLDone() {}
- virtual void OnExecuteJavaScriptDone(const base::Value* value) {}
+ virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+ const base::Value* value) {}
};
explicit DistillerPage(Delegate* delegate);
@@ -45,9 +46,10 @@ class DistillerPage {
// a page is available.
void ExecuteJavaScript(const std::string& script);
- // Called when the JavaScript execution completes. |value| contains data
- // returned by the script.
- virtual void OnExecuteJavaScriptDone(const base::Value* value);
+ // Called when the JavaScript execution completes. |page_url| is the url of
+ // the distilled page. |value| contains data returned by the script.
+ virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+ const base::Value* value);
protected:
enum State {
diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc
index 44c2986..3865aef 100644
--- a/components/dom_distiller/core/distiller_unittest.cc
+++ b/components/dom_distiller/core/distiller_unittest.cc
@@ -3,20 +3,26 @@
// found in the LICENSE file.
#include <map>
+#include <string>
+#include <vector>
#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/location.h"
#include "base/memory/scoped_ptr.h"
#include "base/message_loop/message_loop.h"
+#include "base/strings/string_number_conversions.h"
#include "base/values.h"
#include "components/dom_distiller/core/distiller.h"
#include "components/dom_distiller/core/distiller_page.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
#include "components/dom_distiller/core/proto/distilled_page.pb.h"
#include "net/url_request/url_request_context_getter.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
+using std::vector;
+using std::string;
using::testing::Invoke;
using::testing::Return;
using::testing::_;
@@ -25,12 +31,31 @@ namespace {
const char kTitle[] = "Title";
const char kContent[] = "Content";
const char kURL[] = "http://a.com/";
- const char kId0[] = "0";
- const char kId1[] = "1";
- const char kImageURL0[] = "http://a.com/img1.jpg";
- const char kImageURL1[] = "http://a.com/img2.jpg";
- const char kImageData0[] = { 'a', 'b', 'c', 'd', 'e', 0 };
- const char kImageData1[] = { '1', '2', '3', '4', '5', 0 };
+ const size_t kTotalImages = 2;
+ const char* kImageURLs[kTotalImages] = {"http://a.com/img1.jpg",
+ "http://a.com/img2.jpg"};
+ const char* kImageData[kTotalImages] = {"abcde", "12345"};
+
+ const string GetImageName(int page_num, int image_num) {
+ return base::IntToString(page_num) + "_" + base::IntToString(image_num);
+ }
+
+ scoped_ptr<base::ListValue> CreateDistilledValueReturnedFromJS(
+ const string& title,
+ const string& content,
+ const vector<int>& image_indices,
+ const string& next_page_url) {
+ scoped_ptr<base::ListValue> list(new base::ListValue());
+
+ list->AppendString(title);
+ list->AppendString(content);
+ list->AppendString(next_page_url);
+ for (size_t i = 0; i < image_indices.size(); ++i) {
+ list->AppendString(kImageURLs[image_indices[i]]);
+ }
+ return list.Pass();
+ }
+
} // namespace
namespace dom_distiller {
@@ -38,15 +63,15 @@ namespace dom_distiller {
class TestDistillerURLFetcher : public DistillerURLFetcher {
public:
TestDistillerURLFetcher() : DistillerURLFetcher(NULL) {
- responses_[kImageURL0] = std::string(kImageData0);
- responses_[kImageURL1] = std::string(kImageData1);
+ responses_[kImageURLs[0]] = string(kImageData[0]);
+ responses_[kImageURLs[1]] = string(kImageData[1]);
}
- void CallCallback(std::string url, const URLFetcherCallback& callback) {
+ void CallCallback(string url, const URLFetcherCallback& callback) {
callback.Run(responses_[url]);
}
- virtual void FetchURL(const std::string& url,
+ virtual void FetchURL(const string& url,
const URLFetcherCallback& callback) OVERRIDE {
ASSERT_TRUE(base::MessageLoop::current());
base::MessageLoop::current()->PostTask(
@@ -55,7 +80,7 @@ class TestDistillerURLFetcher : public DistillerURLFetcher {
base::Unretained(this), url, callback));
}
- std::map<std::string, std::string> responses_;
+ std::map<string, string> responses_;
};
@@ -73,7 +98,7 @@ class MockDistillerPage : public DistillerPage {
public:
MOCK_METHOD0(InitImpl, void());
MOCK_METHOD1(LoadURLImpl, void(const GURL& gurl));
- MOCK_METHOD1(ExecuteJavaScriptImpl, void(const std::string& script));
+ MOCK_METHOD1(ExecuteJavaScriptImpl, void(const string& script));
explicit MockDistillerPage(DistillerPage::Delegate* delegate)
: DistillerPage(delegate) {}
@@ -96,19 +121,19 @@ class MockDistillerPageFactory : public DistillerPageFactory {
class DistillerTest : public testing::Test {
public:
virtual ~DistillerTest() {}
- void OnDistillPageDone(scoped_ptr<DistilledPageProto> proto) {
- proto_ = proto.Pass();
+ void OnDistillPageDone(scoped_ptr<DistilledArticleProto> proto) {
+ article_proto_ = proto.Pass();
}
protected:
scoped_ptr<DistillerImpl> distiller_;
- scoped_ptr<DistilledPageProto> proto_;
+ scoped_ptr<DistilledArticleProto> article_proto_;
MockDistillerPageFactory page_factory_;
TestDistillerURLFetcherFactory url_fetcher_factory_;
};
-ACTION_P2(DistillerPageOnExecuteJavaScriptDone, distiller_page, list) {
- distiller_page->OnExecuteJavaScriptDone(list);
+ACTION_P3(DistillerPageOnExecuteJavaScriptDone, distiller_page, url, list) {
+ distiller_page->OnExecuteJavaScriptDone(url, list);
}
ACTION_P2(CreateMockDistillerPage, list, kurl) {
@@ -118,18 +143,57 @@ ACTION_P2(CreateMockDistillerPage, list, kurl) {
EXPECT_CALL(*distiller_page, LoadURLImpl(kurl))
.WillOnce(testing::InvokeWithoutArgs(distiller_page,
&DistillerPage::OnLoadURLDone));
- EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_))
- .WillOnce(DistillerPageOnExecuteJavaScriptDone(distiller_page, list));
+ EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_)).WillOnce(
+ DistillerPageOnExecuteJavaScriptDone(distiller_page, kurl, list));
+ return distiller_page;
+}
+
+ACTION_P3(CreateMockDistillerPages, lists, kurls, num_pages) {
+ DistillerPage::Delegate* delegate = arg0;
+ MockDistillerPage* distiller_page = new MockDistillerPage(delegate);
+ EXPECT_CALL(*distiller_page, InitImpl());
+ {
+ testing::InSequence s;
+
+ for (int page = 0; page < num_pages; ++page) {
+ GURL url = GURL(kurls[page]);
+ EXPECT_CALL(*distiller_page, LoadURLImpl(url))
+ .WillOnce(testing::InvokeWithoutArgs(distiller_page,
+ &DistillerPage::OnLoadURLDone));
+ EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_))
+ .WillOnce(DistillerPageOnExecuteJavaScriptDone(
+ distiller_page, url, lists[page].get()));
+ }
+ }
return distiller_page;
}
TEST_F(DistillerTest, DistillPage) {
base::MessageLoopForUI loop;
- scoped_ptr<base::ListValue> list(new base::ListValue());
- list->AppendString(kTitle);
- list->AppendString(kContent);
- list->AppendString(kImageURL0);
- list->AppendString(kImageURL1);
+ scoped_ptr<base::ListValue> list =
+ CreateDistilledValueReturnedFromJS(kTitle, kContent, vector<int>(), "");
+ EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
+ .WillOnce(CreateMockDistillerPage(list.get(), GURL(kURL)));
+ distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
+ distiller_->Init();
+ distiller_->DistillPage(
+ GURL(kURL),
+ base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
+ base::MessageLoop::current()->RunUntilIdle();
+ EXPECT_EQ(kTitle, article_proto_->title());
+ EXPECT_EQ(article_proto_->pages_size(), 1);
+ const DistilledPageProto& first_page = article_proto_->pages(0);
+ EXPECT_EQ(kContent, first_page.html());
+ EXPECT_EQ(kURL, first_page.url());
+}
+
+TEST_F(DistillerTest, DistillPageWithImages) {
+ base::MessageLoopForUI loop;
+ vector<int> image_indices;
+ image_indices.push_back(0);
+ image_indices.push_back(1);
+ scoped_ptr<base::ListValue> list =
+ CreateDistilledValueReturnedFromJS(kTitle, kContent, image_indices, "");
EXPECT_CALL(page_factory_,
CreateDistillerPageMock(_)).WillOnce(
CreateMockDistillerPage(list.get(), GURL(kURL)));
@@ -139,14 +203,91 @@ TEST_F(DistillerTest, DistillPage) {
GURL(kURL),
base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
base::MessageLoop::current()->RunUntilIdle();
- EXPECT_EQ(kTitle, proto_->title());
- EXPECT_EQ(kContent, proto_->html());
- EXPECT_EQ(kURL, proto_->url());
- EXPECT_EQ(2, proto_->image_size());
- EXPECT_EQ(kImageData0, proto_->image(0).data());
- EXPECT_EQ(kId0, proto_->image(0).name());
- EXPECT_EQ(kImageData1, proto_->image(1).data());
- EXPECT_EQ(kId1, proto_->image(1).name());
+ EXPECT_EQ(kTitle, article_proto_->title());
+ EXPECT_EQ(article_proto_->pages_size(), 1);
+ const DistilledPageProto& first_page = article_proto_->pages(0);
+ EXPECT_EQ(kContent, first_page.html());
+ EXPECT_EQ(kURL, first_page.url());
+ EXPECT_EQ(2, first_page.image_size());
+ EXPECT_EQ(kImageData[0], first_page.image(0).data());
+ EXPECT_EQ(GetImageName(1, 0), first_page.image(0).name());
+ EXPECT_EQ(kImageData[1], first_page.image(1).data());
+ EXPECT_EQ(GetImageName(1, 1), first_page.image(1).name());
+}
+
+TEST_F(DistillerTest, DistillMultiplePages) {
+ base::MessageLoopForUI loop;
+ const int kNumPages = 8;
+ vector<int> image_indices[kNumPages];
+ string content[kNumPages];
+ string page_urls[kNumPages];
+ scoped_ptr<base::ListValue> list[kNumPages];
+
+ int next_image_number = 0;
+
+ for (int page_num = 0; page_num < kNumPages; ++page_num) {
+ // Each page has different number of images.
+ int tot_images = (page_num + kTotalImages) % (kTotalImages + 1);
+ for (int img_num = 0; img_num < tot_images; img_num++) {
+ image_indices[page_num].push_back(next_image_number);
+ next_image_number = (next_image_number + 1) % kTotalImages;
+ }
+
+ page_urls[page_num] = "http://a.com/" + base::IntToString(page_num);
+ content[page_num] = "Content for page:" + base::IntToString(page_num);
+ }
+ for (int i = 0; i < kNumPages; ++i) {
+ string next_page_url = "";
+ if (i + 1 < kNumPages)
+ next_page_url = page_urls[i + 1];
+
+ list[i] = CreateDistilledValueReturnedFromJS(
+ kTitle, content[i], image_indices[i], next_page_url);
+ }
+
+ EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
+ .WillOnce(CreateMockDistillerPages(list, page_urls, kNumPages));
+
+ distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
+ distiller_->Init();
+ distiller_->DistillPage(
+ GURL(page_urls[0]),
+ base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
+ base::MessageLoop::current()->RunUntilIdle();
+ EXPECT_EQ(kTitle, article_proto_->title());
+ EXPECT_EQ(article_proto_->pages_size(), kNumPages);
+ for (int page_num = 0; page_num < kNumPages; ++page_num) {
+ const DistilledPageProto& page = article_proto_->pages(page_num);
+ EXPECT_EQ(content[page_num], page.html());
+ EXPECT_EQ(page_urls[page_num], page.url());
+ EXPECT_EQ(image_indices[page_num].size(),
+ static_cast<size_t>(page.image_size()));
+ for (size_t img_num = 0; img_num < image_indices[page_num].size();
+ ++img_num) {
+ EXPECT_EQ(kImageData[image_indices[page_num][img_num]],
+ page.image(img_num).data());
+ EXPECT_EQ(GetImageName(page_num + 1, img_num),
+ page.image(img_num).name());
+ }
+ }
+}
+
+TEST_F(DistillerTest, DistillLinkLoop) {
+ base::MessageLoopForUI loop;
+ // Create a loop, the next page is same as the current page. This could
+ // happen if javascript misparses a next page link.
+ scoped_ptr<base::ListValue> list =
+ CreateDistilledValueReturnedFromJS(kTitle, kContent, vector<int>(), kURL);
+ EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
+ .WillOnce(CreateMockDistillerPage(list.get(), GURL(kURL)));
+ distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
+ distiller_->Init();
+ distiller_->DistillPage(
+ GURL(kURL),
+ base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
+ base::MessageLoop::current()->RunUntilIdle();
+ EXPECT_EQ(kTitle, article_proto_->title());
+ EXPECT_EQ(article_proto_->pages_size(), 1);
}
} // namespace dom_distiller
diff --git a/components/dom_distiller/core/dom_distiller_service.cc b/components/dom_distiller/core/dom_distiller_service.cc
index 797827b..6ee2f2f 100644
--- a/components/dom_distiller/core/dom_distiller_service.cc
+++ b/components/dom_distiller/core/dom_distiller_service.cc
@@ -7,6 +7,7 @@
#include "base/guid.h"
#include "base/message_loop/message_loop.h"
#include "components/dom_distiller/core/dom_distiller_store.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
#include "components/dom_distiller/core/task_tracker.h"
#include "url/gurl.h"
@@ -27,7 +28,7 @@ ArticleEntry CreateSkeletonEntryForUrl(const GURL& url) {
void RunArticleAvailableCallback(
const DomDistillerService::ArticleAvailableCallback& article_cb,
const ArticleEntry& entry,
- DistilledPageProto* proto,
+ const DistilledArticleProto* article_proto,
bool distillation_succeeded) {
article_cb.Run(distillation_succeeded);
}
@@ -189,13 +190,16 @@ void DomDistillerService::CancelTask(TaskTracker* task) {
}
}
-void DomDistillerService::AddDistilledPageToList(const ArticleEntry& entry,
- DistilledPageProto* proto,
- bool distillation_succeeded) {
+void DomDistillerService::AddDistilledPageToList(
+ const ArticleEntry& entry,
+ const DistilledArticleProto* article_proto,
+ bool distillation_succeeded) {
DCHECK(IsEntryValid(entry));
if (distillation_succeeded) {
- DCHECK(proto);
+ DCHECK(article_proto);
+ DCHECK_GT(article_proto->pages_size(), 0);
store_->UpdateEntry(entry);
+ DCHECK_EQ(article_proto->pages_size(), entry.pages_size());
}
}
diff --git a/components/dom_distiller/core/dom_distiller_service.h b/components/dom_distiller/core/dom_distiller_service.h
index ed045cd..630b184 100644
--- a/components/dom_distiller/core/dom_distiller_service.h
+++ b/components/dom_distiller/core/dom_distiller_service.h
@@ -22,7 +22,7 @@ class SyncableService;
namespace dom_distiller {
-class DistilledPageProto;
+class DistilledArticleProto;
class DistillerFactory;
class DomDistillerObserver;
class DomDistillerStoreInterface;
@@ -70,7 +70,7 @@ class DomDistillerService {
private:
void CancelTask(TaskTracker* task);
void AddDistilledPageToList(const ArticleEntry& entry,
- DistilledPageProto* proto,
+ const DistilledArticleProto* article_proto,
bool distillation_succeeded);
TaskTracker* CreateTaskTracker(const ArticleEntry& entry);
diff --git a/components/dom_distiller/core/dom_distiller_service_unittest.cc b/components/dom_distiller/core/dom_distiller_service_unittest.cc
index 35065fd..f3418bb 100644
--- a/components/dom_distiller/core/dom_distiller_service_unittest.cc
+++ b/components/dom_distiller/core/dom_distiller_service_unittest.cc
@@ -9,6 +9,7 @@
#include "base/containers/hash_tables.h"
#include "base/message_loop/message_loop.h"
#include "base/run_loop.h"
+#include "base/strings/string_number_conversions.h"
#include "components/dom_distiller/core/article_entry.h"
#include "components/dom_distiller/core/dom_distiller_model.h"
#include "components/dom_distiller/core/dom_distiller_store.h"
@@ -31,7 +32,7 @@ namespace {
class FakeViewRequestDelegate : public ViewRequestDelegate {
public:
virtual ~FakeViewRequestDelegate() {}
- MOCK_METHOD1(OnArticleReady, void(DistilledPageProto* proto));
+ MOCK_METHOD1(OnArticleReady, void(const DistilledArticleProto* proto));
};
class MockDistillerObserver : public DomDistillerObserver {
@@ -52,11 +53,23 @@ DomDistillerService::ArticleAvailableCallback ArticleCallback(
}
void RunDistillerCallback(FakeDistiller* distiller,
- scoped_ptr<DistilledPageProto> proto) {
+ scoped_ptr<DistilledArticleProto> proto) {
distiller->RunDistillerCallback(proto.Pass());
base::RunLoop().RunUntilIdle();
}
+scoped_ptr<DistilledArticleProto> CreateArticleWithURL(const std::string& url) {
+ scoped_ptr<DistilledArticleProto> proto(new DistilledArticleProto);
+ DistilledPageProto* page = proto->add_pages();
+ page->set_url(url);
+ return proto.Pass();
+}
+
+scoped_ptr<DistilledArticleProto> CreateDefaultArticle() {
+ return CreateArticleWithURL("http://www.example.com/default_article_page1")
+ .Pass();
+}
+
} // namespace
class DomDistillerServiceTest : public testing::Test {
@@ -109,7 +122,7 @@ TEST_F(DomDistillerServiceTest, TestViewEntry) {
ASSERT_FALSE(distiller->GetCallback().is_null());
- scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get()));
RunDistillerCallback(distiller, proto.Pass());
@@ -127,7 +140,7 @@ TEST_F(DomDistillerServiceTest, TestViewUrl) {
ASSERT_FALSE(distiller->GetCallback().is_null());
EXPECT_EQ(url, distiller->GetUrl());
- scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get()));
RunDistillerCallback(distiller, proto.Pass());
@@ -152,7 +165,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleViewUrl) {
ASSERT_FALSE(distiller->GetCallback().is_null());
EXPECT_EQ(url, distiller->GetUrl());
- scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get()));
RunDistillerCallback(distiller, proto.Pass());
@@ -160,7 +173,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleViewUrl) {
ASSERT_FALSE(distiller2->GetCallback().is_null());
EXPECT_EQ(url2, distiller2->GetUrl());
- scoped_ptr<DistilledPageProto> proto2(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto2 = CreateDefaultArticle();
EXPECT_CALL(viewer_delegate2, OnArticleReady(proto2.get()));
RunDistillerCallback(distiller2, proto2.Pass());
@@ -210,7 +223,7 @@ TEST_F(DomDistillerServiceTest, TestAddAndRemoveEntry) {
ASSERT_FALSE(distiller->GetCallback().is_null());
EXPECT_EQ(url, distiller->GetUrl());
- scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto = CreateArticleWithURL(url.spec());
RunDistillerCallback(distiller, proto.Pass());
EXPECT_TRUE(store_->GetEntryByUrl(url, &entry));
@@ -280,7 +293,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleObservers) {
ArticleEntriesUpdated(util::HasExpectedUpdates(expected_updates)));
}
- scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
RunDistillerCallback(distiller, proto.Pass());
// Remove should notify all observers that article is removed.
@@ -316,7 +329,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleCallbacks) {
EXPECT_CALL(article_cb[i], DistillationCompleted(true));
}
- scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+ scoped_ptr<DistilledArticleProto> proto = CreateArticleWithURL(url.spec());
RunDistillerCallback(distiller, proto.Pass());
// Add the same url again, all callbacks should be called with true.
@@ -353,5 +366,58 @@ TEST_F(DomDistillerServiceTest, TestMultipleCallbacksOnRemove) {
base::RunLoop().RunUntilIdle();
}
+TEST_F(DomDistillerServiceTest, TestMultiplePageArticle) {
+ FakeDistiller* distiller = new FakeDistiller(false);
+ EXPECT_CALL(*distiller_factory_, CreateDistillerImpl())
+ .WillOnce(Return(distiller));
+
+ const int kPageCount = 8;
+
+ std::string base_url("http://www.example.com/p");
+ GURL pages_url[kPageCount];
+ for (int page_num = 0; page_num < kPageCount; ++page_num) {
+ pages_url[page_num] = GURL(base_url + base::IntToString(page_num));
+ }
+
+ MockArticleAvailableCallback article_cb;
+ EXPECT_CALL(article_cb, DistillationCompleted(true));
+
+ std::string entry_id =
+ service_->AddToList(pages_url[0], ArticleCallback(&article_cb));
+
+ ArticleEntry entry;
+ ASSERT_FALSE(distiller->GetCallback().is_null());
+ EXPECT_EQ(pages_url[0], distiller->GetUrl());
+
+ // Create the article with pages to pass to the distiller.
+ scoped_ptr<DistilledArticleProto> proto =
+ CreateArticleWithURL(pages_url[0].spec());
+ for (int page_num = 1; page_num < kPageCount; ++page_num) {
+ DistilledPageProto* distilled_page = proto->add_pages();
+ distilled_page->set_url(pages_url[page_num].spec());
+ }
+
+ RunDistillerCallback(distiller, proto.Pass());
+ EXPECT_TRUE(store_->GetEntryByUrl(pages_url[0], &entry));
+
+ EXPECT_EQ(kPageCount, entry.pages_size());
+ // An article should have just one entry.
+ EXPECT_EQ(1u, store_->GetEntries().size());
+
+ // All pages should have correct urls.
+ for (int page_num = 0; page_num < kPageCount; ++page_num) {
+ EXPECT_EQ(pages_url[page_num].spec(), entry.pages(page_num).url());
+ }
+
+ // Should be able to query article using any of the pages url.
+ for (int page_num = 0; page_num < kPageCount; ++page_num) {
+ EXPECT_TRUE(store_->GetEntryByUrl(pages_url[page_num], &entry));
+ }
+
+ service_->RemoveEntry(entry_id);
+ base::RunLoop().RunUntilIdle();
+ EXPECT_EQ(0u, store_->GetEntries().size());
+}
+
} // namespace test
} // namespace dom_distiller
diff --git a/components/dom_distiller/core/fake_distiller.cc b/components/dom_distiller/core/fake_distiller.cc
index 3f1f976..e6cc979 100644
--- a/components/dom_distiller/core/fake_distiller.cc
+++ b/components/dom_distiller/core/fake_distiller.cc
@@ -14,14 +14,26 @@ namespace test {
MockDistillerFactory::MockDistillerFactory() {}
MockDistillerFactory::~MockDistillerFactory() {}
-FakeDistiller::FakeDistiller(bool execute_callback) :
- execute_callback_(execute_callback) {
+FakeDistiller::FakeDistiller(bool execute_callback)
+ : execute_callback_(execute_callback) {
EXPECT_CALL(*this, Die()).Times(testing::AnyNumber());
}
FakeDistiller::~FakeDistiller() { Die(); }
-void FakeDistiller::RunDistillerCallback(scoped_ptr<DistilledPageProto> proto) {
+void FakeDistiller::DistillPage(const GURL& url,
+ const DistillerCallback& callback) {
+ url_ = url;
+ callback_ = callback;
+ if (execute_callback_) {
+ scoped_ptr<DistilledArticleProto> proto(new DistilledArticleProto);
+ proto->add_pages()->set_url(url_.spec());
+ RunDistillerCallback(proto.Pass());
+ }
+}
+
+void FakeDistiller::RunDistillerCallback(
+ scoped_ptr<DistilledArticleProto> proto) {
base::MessageLoop::current()->PostTask(
FROM_HERE,
base::Bind(&FakeDistiller::RunDistillerCallbackInternal,
@@ -30,7 +42,7 @@ void FakeDistiller::RunDistillerCallback(scoped_ptr<DistilledPageProto> proto) {
}
void FakeDistiller::RunDistillerCallbackInternal(
- scoped_ptr<DistilledPageProto> proto) {
+ scoped_ptr<DistilledArticleProto> proto) {
EXPECT_FALSE(callback_.is_null());
callback_.Run(proto.Pass());
callback_.Reset();
diff --git a/components/dom_distiller/core/fake_distiller.h b/components/dom_distiller/core/fake_distiller.h
index dbb00d6..98ab709 100644
--- a/components/dom_distiller/core/fake_distiller.h
+++ b/components/dom_distiller/core/fake_distiller.h
@@ -27,27 +27,21 @@ class MockDistillerFactory : public DistillerFactory {
class FakeDistiller : public Distiller {
public:
- FakeDistiller(bool execute_callback);
+ explicit FakeDistiller(bool execute_callback);
virtual ~FakeDistiller();
MOCK_METHOD0(Die, void());
virtual void DistillPage(const GURL& url,
- const DistillerCallback& callback) OVERRIDE {
- url_ = url;
- callback_ = callback;
- if (execute_callback_) {
- RunDistillerCallback(make_scoped_ptr(new DistilledPageProto));
- }
- }
+ const DistillerCallback& callback) OVERRIDE;
- void RunDistillerCallback(scoped_ptr<DistilledPageProto> proto);
+ void RunDistillerCallback(scoped_ptr<DistilledArticleProto> proto);
GURL GetUrl() { return url_; }
DistillerCallback GetCallback() { return callback_; }
private:
- void RunDistillerCallbackInternal(scoped_ptr<DistilledPageProto> proto);
+ void RunDistillerCallbackInternal(scoped_ptr<DistilledArticleProto> proto);
bool execute_callback_;
GURL url_;
diff --git a/components/dom_distiller/core/page_distiller.cc b/components/dom_distiller/core/page_distiller.cc
new file mode 100644
index 0000000..2d47bd1
--- /dev/null
+++ b/components/dom_distiller/core/page_distiller.cc
@@ -0,0 +1,91 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/dom_distiller/core/page_distiller.h"
+
+#include <map>
+
+#include "base/location.h"
+#include "base/message_loop/message_loop.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/values.h"
+#include "components/dom_distiller/core/distiller_page.h"
+#include "components/dom_distiller/core/distiller_url_fetcher.h"
+#include "grit/dom_distiller_resources.h"
+#include "net/url_request/url_request_context_getter.h"
+#include "ui/base/resource/resource_bundle.h"
+#include "url/gurl.h"
+
+namespace dom_distiller {
+
+DistilledPageInfo::DistilledPageInfo() {}
+
+DistilledPageInfo::~DistilledPageInfo() {}
+
+PageDistiller::PageDistiller(const DistillerPageFactory& distiller_page_factory)
+ : distiller_page_(distiller_page_factory.CreateDistillerPage(this).Pass()) {
+}
+
+PageDistiller::~PageDistiller() {}
+
+void PageDistiller::Init() { distiller_page_->Init(); }
+
+void PageDistiller::DistillPage(const GURL& url,
+ const PageDistillerCallback& callback) {
+ page_distiller_callback_ = callback;
+ LoadURL(url);
+}
+
+void PageDistiller::LoadURL(const GURL& url) { distiller_page_->LoadURL(url); }
+
+void PageDistiller::OnLoadURLDone() { GetDistilledContent(); }
+
+void PageDistiller::GetDistilledContent() {
+ std::string script = ResourceBundle::GetSharedInstance()
+ .GetRawDataResource(IDR_DISTILLER_JS)
+ .as_string();
+ distiller_page_->ExecuteJavaScript(script);
+}
+
+void PageDistiller::OnExecuteJavaScriptDone(const GURL& page_url,
+ const base::Value* value) {
+ scoped_ptr<DistilledPageInfo> page_info(new DistilledPageInfo());
+ std::string result;
+ const base::ListValue* result_list = NULL;
+ if (!value->GetAsList(&result_list)) {
+ base::MessageLoop::current()->PostTask(
+ FROM_HERE,
+ base::Bind(page_distiller_callback_, base::Passed(&page_info), false));
+ } else {
+ int i = 0;
+ for (base::ListValue::const_iterator iter = result_list->begin();
+ iter != result_list->end();
+ ++iter, ++i) {
+ std::string item;
+ (*iter)->GetAsString(&item);
+ // The JavaScript returns an array where the first element is the title,
+ // the second element is the article content HTML, and the remaining
+ // elements are image URLs referenced in the HTML.
+ switch (i) {
+ case 0:
+ page_info->title = item;
+ break;
+ case 1:
+ page_info->html = item;
+ break;
+ case 2: {
+ page_info->next_page_url = item;
+ break;
+ }
+ default:
+ page_info->image_urls.push_back(item);
+ }
+ }
+ base::MessageLoop::current()->PostTask(
+ FROM_HERE,
+ base::Bind(page_distiller_callback_, base::Passed(&page_info), true));
+ }
+}
+
+} // namespace dom_distiller
diff --git a/components/dom_distiller/core/page_distiller.h b/components/dom_distiller/core/page_distiller.h
new file mode 100644
index 0000000..4e32bea
--- /dev/null
+++ b/components/dom_distiller/core/page_distiller.h
@@ -0,0 +1,70 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_
+#define COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_
+
+#include <string>
+#include <vector>
+
+#include "base/callback.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/values.h"
+#include "components/dom_distiller/core/distiller_page.h"
+#include "url/gurl.h"
+
+namespace dom_distiller {
+
+class DistillerImpl;
+
+struct DistilledPageInfo {
+ std::string title;
+ std::string html;
+ std::string next_page_url;
+ std::vector<std::string> image_urls;
+ DistilledPageInfo();
+ ~DistilledPageInfo();
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DistilledPageInfo);
+};
+
+// Distills a single page of an article.
+class PageDistiller : public DistillerPage::Delegate {
+ public:
+ typedef base::Callback<void(scoped_ptr<DistilledPageInfo> distilled_page,
+ bool distillation_successful)>
+ PageDistillerCallback;
+ explicit PageDistiller(const DistillerPageFactory& distiller_page_factory);
+ virtual ~PageDistiller();
+
+ // Creates an execution context. This must be called once before any calls are
+ // made to distill the page.
+ virtual void Init();
+
+ // Distills the |url| and posts the |callback| with results.
+ virtual void DistillPage(const GURL& url,
+ const PageDistillerCallback& callback);
+
+ // DistillerPage::Delegate
+ virtual void OnLoadURLDone() OVERRIDE;
+ virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+ const base::Value* value) OVERRIDE;
+
+ private:
+ virtual void LoadURL(const GURL& url);
+
+ // Injects JavaScript to distill a loaded page down to its important content,
+ // e.g., extracting a news article from its surrounding boilerplate.
+ void GetDistilledContent();
+
+ scoped_ptr<DistillerPage> distiller_page_;
+ PageDistillerCallback page_distiller_callback_;
+
+ DISALLOW_COPY_AND_ASSIGN(PageDistiller);
+};
+
+} // namespace dom_distiller
+
+#endif // COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_
diff --git a/components/dom_distiller/core/proto/distilled_article.proto b/components/dom_distiller/core/proto/distilled_article.proto
new file mode 100644
index 0000000..54284bf
--- /dev/null
+++ b/components/dom_distiller/core/proto/distilled_article.proto
@@ -0,0 +1,21 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Distilled article content.
+
+syntax = "proto2";
+
+option optimize_for = LITE_RUNTIME;
+
+package dom_distiller;
+
+import "distilled_page.proto";
+
+message DistilledArticleProto {
+ // A title for the article.
+ optional string title = 1;
+
+ // The distilled pages in the article.
+ repeated DistilledPageProto pages = 2;
+}
diff --git a/components/dom_distiller/core/proto/distilled_page.proto b/components/dom_distiller/core/proto/distilled_page.proto
index c6f8159..ba71617 100644
--- a/components/dom_distiller/core/proto/distilled_page.proto
+++ b/components/dom_distiller/core/proto/distilled_page.proto
@@ -14,9 +14,6 @@ message DistilledPageProto {
// The URLs used to create this page.
required string url = 1;
- // A title for the distilled page.
- optional string title = 2;
-
// The distilled HTML.
required string html = 3;
diff --git a/components/dom_distiller/core/task_tracker.cc b/components/dom_distiller/core/task_tracker.cc
index ccd5656..1afd482 100644
--- a/components/dom_distiller/core/task_tracker.cc
+++ b/components/dom_distiller/core/task_tracker.cc
@@ -5,6 +5,8 @@
#include "components/dom_distiller/core/task_tracker.h"
#include "base/message_loop/message_loop.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
+#include "components/dom_distiller/core/proto/distilled_page.pb.h"
namespace dom_distiller {
@@ -20,7 +22,8 @@ ViewerHandle::~ViewerHandle() {
TaskTracker::TaskTracker(const ArticleEntry& entry, CancelCallback callback)
: cancel_callback_(callback),
entry_(entry),
- distilled_page_(),
+ distilled_article_(),
+ distillation_complete_(false),
weak_ptr_factory_(this) {}
TaskTracker::~TaskTracker() { DCHECK(viewers_.empty()); }
@@ -51,7 +54,7 @@ void TaskTracker::StartBlobFetcher() {
void TaskTracker::AddSaveCallback(const SaveCallback& callback) {
DCHECK(!callback.is_null());
save_callbacks_.push_back(callback);
- if (distilled_page_) {
+ if (distillation_complete_) {
// Distillation for this task has already completed, and so it can be
// immediately saved.
ScheduleSaveCallbacks(true);
@@ -60,7 +63,7 @@ void TaskTracker::AddSaveCallback(const SaveCallback& callback) {
scoped_ptr<ViewerHandle> TaskTracker::AddViewer(ViewRequestDelegate* delegate) {
viewers_.push_back(delegate);
- if (distilled_page_) {
+ if (distillation_complete_) {
// Distillation for this task has already completed, and so the delegate can
// be immediately told of the result.
base::MessageLoop::current()->PostTask(
@@ -120,12 +123,10 @@ void TaskTracker::ScheduleSaveCallbacks(bool distillation_succeeded) {
void TaskTracker::DoSaveCallbacks(bool distillation_succeeded) {
if (!save_callbacks_.empty()) {
- DistilledPageProto* distilled_proto =
- distillation_succeeded ? distilled_page_.get() : NULL;
-
for (size_t i = 0; i < save_callbacks_.size(); ++i) {
DCHECK(!save_callbacks_[i].is_null());
- save_callbacks_[i].Run(entry_, distilled_proto, distillation_succeeded);
+ save_callbacks_[i].Run(
+ entry_, distilled_article_.get(), distillation_succeeded);
}
save_callbacks_.clear();
@@ -134,21 +135,33 @@ void TaskTracker::DoSaveCallbacks(bool distillation_succeeded) {
}
void TaskTracker::NotifyViewer(ViewRequestDelegate* delegate) {
- DCHECK(distilled_page_);
- delegate->OnArticleReady(distilled_page_.get());
+ DCHECK(distillation_complete_);
+ delegate->OnArticleReady(distilled_article_.get());
}
-void TaskTracker::OnDistilledDataReady(scoped_ptr<DistilledPageProto> proto) {
- distilled_page_ = proto.Pass();
- DCHECK(distilled_page_);
+void TaskTracker::OnDistilledDataReady(
+ scoped_ptr<DistilledArticleProto> distilled_article) {
+ distilled_article_ = distilled_article.Pass();
+ bool distillation_successful = false;
+ if (distilled_article_->pages_size() > 0) {
+ distillation_successful = true;
+ entry_.set_title(distilled_article_->title());
+ // Reset the pages.
+ entry_.clear_pages();
+ for (int i = 0; i < distilled_article_->pages_size(); ++i) {
+ sync_pb::ArticlePage* page = entry_.add_pages();
+ page->set_url(distilled_article_->pages(i).url());
+ }
+ }
+
+ distillation_complete_ = true;
- entry_.set_title(distilled_page_->title());
for (size_t i = 0; i < viewers_.size(); ++i) {
NotifyViewer(viewers_[i]);
}
// Already inside a callback run SaveCallbacks directly.
- DoSaveCallbacks(true);
+ DoSaveCallbacks(distillation_successful);
}
} // namespace dom_distiller
diff --git a/components/dom_distiller/core/task_tracker.h b/components/dom_distiller/core/task_tracker.h
index f545dd9..c4098fe 100644
--- a/components/dom_distiller/core/task_tracker.h
+++ b/components/dom_distiller/core/task_tracker.h
@@ -19,7 +19,7 @@ class GURL;
namespace dom_distiller {
-class DistilledPageProto;
+class DistilledArticleProto;
// A handle to a request to view a DOM distiller entry or URL. The request will
// be cancelled when the handle is destroyed.
@@ -41,10 +41,10 @@ class ViewRequestDelegate {
public:
virtual ~ViewRequestDelegate() {}
// Called when the distilled article contents are available. The
- // DistilledPageProto is owned by a TaskTracker instance and is invalidated
+ // DistilledArticleProto is owned by a TaskTracker instance and is invalidated
// when the corresponding ViewerHandle is destroyed (or when the
// DomDistillerService is destroyed).
- virtual void OnArticleReady(DistilledPageProto* proto) = 0;
+ virtual void OnArticleReady(const DistilledArticleProto* article_proto) = 0;
};
// A TaskTracker manages the various tasks related to viewing, saving,
@@ -68,7 +68,8 @@ class ViewRequestDelegate {
class TaskTracker {
public:
typedef base::Callback<void(TaskTracker*)> CancelCallback;
- typedef base::Callback<void(const ArticleEntry&, DistilledPageProto*, bool)>
+ typedef base::Callback<
+ void(const ArticleEntry&, const DistilledArticleProto*, bool)>
SaveCallback;
TaskTracker(const ArticleEntry& entry, CancelCallback callback);
@@ -90,7 +91,8 @@ class TaskTracker {
bool HasUrl(const GURL& url) const;
private:
- void OnDistilledDataReady(scoped_ptr<DistilledPageProto> distilled_page);
+ void OnDistilledDataReady(
+ scoped_ptr<DistilledArticleProto> distilled_article);
// Posts a task to run DoSaveCallbacks with |distillation_succeeded|.
void ScheduleSaveCallbacks(bool distillation_succeeded);
@@ -113,7 +115,8 @@ class TaskTracker {
std::vector<ViewRequestDelegate*> viewers_;
ArticleEntry entry_;
- scoped_ptr<DistilledPageProto> distilled_page_;
+ scoped_ptr<DistilledArticleProto> distilled_article_;
+ bool distillation_complete_;
// Note: This should remain the last member so it'll be destroyed and
// invalidate its weak pointers before any other members are destroyed.
diff --git a/components/dom_distiller/core/task_tracker_unittest.cc b/components/dom_distiller/core/task_tracker_unittest.cc
index f659fc1..c88becb 100644
--- a/components/dom_distiller/core/task_tracker_unittest.cc
+++ b/components/dom_distiller/core/task_tracker_unittest.cc
@@ -18,7 +18,8 @@ namespace test {
class FakeViewRequestDelegate : public ViewRequestDelegate {
public:
virtual ~FakeViewRequestDelegate() {}
- MOCK_METHOD1(OnArticleReady, void(DistilledPageProto* proto));
+ MOCK_METHOD1(OnArticleReady,
+ void(const DistilledArticleProto* article_proto));
};
class TestCancelCallback {
@@ -36,7 +37,8 @@ class TestCancelCallback {
class MockSaveCallback {
public:
- MOCK_METHOD3(Save, void(const ArticleEntry&, DistilledPageProto*, bool));
+ MOCK_METHOD3(Save,
+ void(const ArticleEntry&, const DistilledArticleProto*, bool));
};
class DomDistillerTaskTrackerTest : public testing::Test {
diff --git a/third_party/readability/README.chromium b/third_party/readability/README.chromium
index a409176..72590db 100644
--- a/third_party/readability/README.chromium
+++ b/third_party/readability/README.chromium
@@ -11,3 +11,5 @@ present it in an easy-to-read form. The CSS of readability is unchanged.
Local Modifications:
- Removed all writes to innerHTML in the JavaScript.
- Removed unneeded styles, footer, scrolling.
+- Removed extraneous whitespace.
+- Exposed nextPageLink as a function.
diff --git a/third_party/readability/js/readability.js b/third_party/readability/js/readability.js
index 68a0286..9cdbb30 100644
--- a/third_party/readability/js/readability.js
+++ b/third_party/readability/js/readability.js
@@ -1,14 +1,20 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Local modifications to this file are described in the README.chromium
+// file.
var dbg = (typeof console !== 'undefined') ? function(s) {
console.log("Readability: " + s);
} : function() {};
/*
- * Readability. An Arc90 Lab Experiment.
+ * Readability. An Arc90 Lab Experiment.
* Website: http://lab.arc90.com/experiments/readability
* Source: http://code.google.com/p/arc90labs-readability
*
- * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
+ * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
*
* Copyright (c) 2010 Arc90 Inc
* Readability is licensed under the Apache License, Version 2.0.
@@ -20,6 +26,7 @@ var readability = {
distilledHTML: '',
distilledArticleContent: null,
+ nextPageLink: '',
version: '1.7.1',
iframeLoads: 0,
@@ -41,7 +48,7 @@ var readability = {
maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
-
+
/**
* All of the regular expressions in use within readability.
* Defined up here so we don't instantiate them repeatedly in loops.
@@ -66,7 +73,7 @@ var readability = {
/**
* Runs readability.
- *
+ *
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
@@ -86,8 +93,11 @@ var readability = {
readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
/* Pull out any possible next page link first */
- var nextPageLink = readability.findNextPageLink(document.body);
-
+ readability.nextPageLink = readability.findNextPageLink(document.body);
+
+ /* We handle processing of nextPage from C++ set nextPageLink to null */
+ var nextPageLink = null;
+
readability.prepDocument();
/* Build readability's DOM tree */
@@ -152,12 +162,8 @@ var readability = {
window.scrollTo(0, 0);
- // TODO(bengr): Remove this assignment of null to nextPageLink when
- // the processing of the next page link is safe.
- nextPageLink = null;
-
if (nextPageLink) {
- /**
+ /**
* Append any additional pages after a small timeout so that people
* can start reading without having to wait for this to finish processing.
**/
@@ -179,16 +185,16 @@ var readability = {
var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
if(readability.reversePageScroll) {
- readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
+ readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
}
else {
- readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
+ readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
}
-
+
return false;
}
};
-
+
document.onkeyup = function(e) {
var code = (window.event) ? event.keyCode : e.keyCode;
if (code === 16) {
@@ -200,7 +206,7 @@ var readability = {
/**
* Run any post-process modifications to article content as necessary.
- *
+ *
* @param Element
* @return void
**/
@@ -226,7 +232,7 @@ var readability = {
for(var i=0, il = images.length; i < il; i+=1) {
var image = images[i];
-
+
if(image.offsetWidth > imageWidthThreshold) {
image.className += " blockImage";
}
@@ -242,7 +248,7 @@ var readability = {
var articleTools = document.createElement("DIV");
articleTools.id = "readTools";
- articleTools.innerHTML =
+ articleTools.innerHTML =
"<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
"<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
"<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
@@ -259,13 +265,13 @@ var readability = {
function sanitizeText() {
return text.replace(/@\w+/, "");
}
-
+
function countMatches(match) {
var matches = text.match(new RegExp(match, "g"));
- return matches !== null ? matches.length : 0;
+ return matches !== null ? matches.length : 0;
}
-
- function isRTL() {
+
+ function isRTL() {
var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
@@ -289,15 +295,15 @@ var readability = {
try {
curTitle = origTitle = document.title;
if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
- curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
+ curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
}
}
catch(e) {}
-
+
if(curTitle.match(/ [\|\-] /))
{
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
-
+
if(curTitle.split(' ').length < 3) {
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
}
@@ -330,7 +336,7 @@ var readability = {
/**
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
- *
+ *
* @return void
**/
prepDocument: function () {
@@ -342,7 +348,7 @@ var readability = {
{
var body = document.createElement("body");
try {
- document.body = body;
+ document.body = body;
}
catch(e) {
document.documentElement.appendChild(body);
@@ -374,11 +380,11 @@ var readability = {
biggestFrameSize = frameSize;
readability.biggestFrame = frames[frameIndex];
}
-
+
if(canAccessFrame && frameSize > bestFrameSize)
{
readability.frameHack = true;
-
+
bestFrame = frames[frameIndex];
bestFrameSize = frameSize;
}
@@ -390,7 +396,7 @@ var readability = {
readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
newBody.style.overflow = 'scroll';
document.body = newBody;
-
+
var frameset = document.getElementsByTagName('frameset')[0];
if(frameset) {
frameset.parentNode.removeChild(frameset); }
@@ -455,7 +461,7 @@ var readability = {
var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
-
+
if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
}
@@ -468,7 +474,7 @@ var readability = {
dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
}
},
-
+
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
@@ -477,7 +483,7 @@ var readability = {
* @return void
**/
initializeNode: function (node) {
- node.readability = {"contentScore": 0};
+ node.readability = {"contentScore": 0};
switch(node.tagName) {
case 'DIV':
@@ -489,7 +495,7 @@ var readability = {
case 'BLOCKQUOTE':
node.readability.contentScore += 3;
break;
-
+
case 'ADDRESS':
case 'OL':
case 'UL':
@@ -511,10 +517,10 @@ var readability = {
node.readability.contentScore -= 5;
break;
}
-
+
node.readability.contentScore += readability.getClassWeight(node);
},
-
+
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@@ -525,7 +531,7 @@ var readability = {
grabArticle: function (pageToClone) {
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
isPaging = (page !== null) ? true: false;
-
+
var page = null;
// Never work on the actual page.
if (isPaging) {
@@ -533,7 +539,7 @@ var readability = {
} else {
page = pageToClone.cloneNode(true);
}
-
+
var allElements = page.getElementsByTagName('*');
/**
@@ -561,7 +567,7 @@ var readability = {
node.parentNode.removeChild(node);
nodeIndex-=1;
continue;
- }
+ }
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
@@ -598,7 +604,7 @@ var readability = {
}
}
}
- }
+ }
}
/**
@@ -640,15 +646,15 @@ var readability = {
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
-
+
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
-
+
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
- grandParentNode.readability.contentScore += contentScore/2;
+ grandParentNode.readability.contentScore += contentScore/2;
}
}
@@ -725,12 +731,12 @@ var readability = {
{
append = true;
}
-
+
if(siblingNode.nodeName === "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
-
+
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
@@ -747,7 +753,7 @@ var readability = {
var nodeToAppend = null;
if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
-
+
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
@@ -765,7 +771,7 @@ var readability = {
s-=1;
sl-=1;
}
-
+
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
@@ -779,15 +785,15 @@ var readability = {
**/
readability.distilledArticleContent = articleContent.cloneNode(true);
//readability.prepArticle(articleContent);
-
+
if (readability.curPageNum === 1) {
var newNode = document.createElement('div');
newNode.id = "readability-page-1";
newNode.setAttribute("class", "page");
readability.moveNodeInnards(articleContent, newNode);
articleContent.appendChild(newNode);
- }
-
+ }
+
/**
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
@@ -813,7 +819,7 @@ var readability = {
return articleContent;
},
-
+
/**
* Removes script tags from the document.
*
@@ -828,12 +834,12 @@ var readability = {
scripts[i].nodeValue="";
scripts[i].removeAttribute('src');
if (scripts[i].parentNode) {
- scripts[i].parentNode.removeChild(scripts[i]);
+ scripts[i].parentNode.removeChild(scripts[i]);
}
}
}
},
-
+
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
@@ -896,18 +902,18 @@ var readability = {
if ( cur.nodeType === 1 ) {
// Remove style attribute(s) :
if(cur.className !== "readability-styled") {
- cur.removeAttribute("style");
+ cur.removeAttribute("style");
}
readability.cleanStyles( cur );
}
cur = cur.nextSibling;
- }
+ }
},
-
+
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
- *
+ *
* @param Element
* @return number (float)
**/
@@ -918,11 +924,11 @@ var readability = {
for(var i=0, il=links.length; i<il;i+=1)
{
linkLength += readability.getInnerText(links[i]).length;
- }
+ }
return linkLength / textLength;
},
-
+
/**
* Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
*
@@ -944,10 +950,10 @@ var readability = {
/* If the type isn't alpha-only, it's probably not actually a file extension. */
if(!possibleType.match(/[^a-zA-Z]/)) {
- segment = segment.split(".")[0];
+ segment = segment.split(".")[0];
}
}
-
+
/**
* EW-CMS specific segment replacement. Ugly.
* Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
@@ -968,7 +974,7 @@ var readability = {
if (i < 2 && segment.match(/^\d{1,2}$/)) {
del = true;
}
-
+
/* If this is the first segment and it's just "index", remove it. */
if(i === 0 && segment.toLowerCase() === "index") {
del = true;
@@ -992,7 +998,7 @@ var readability = {
/**
* Look for any paging links that may occur within the document.
- *
+ *
* @param body
* @return object (array)
**/
@@ -1008,7 +1014,7 @@ var readability = {
*
* Also possible: levenshtein distance? longest common subsequence?
*
- * After we do that, assign each page a score, and
+ * After we do that, assign each page a score, and
**/
for(var i = 0, il = allLinks.length; i < il; i+=1) {
var link = allLinks[i],
@@ -1018,12 +1024,12 @@ var readability = {
if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
continue;
}
-
+
/* If it's on a different domain, skip it. */
if(window.location.host !== linkHref.split(/\/+/g)[1]) {
continue;
}
-
+
var linkText = readability.getInnerText(link);
/* If the linkText looks like it's not the next page, skip it. */
@@ -1036,9 +1042,9 @@ var readability = {
if(!linkHrefLeftover.match(/\d/)) {
continue;
}
-
+
if(!(linkHref in possiblePages)) {
- possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
+ possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
} else {
possiblePages[linkHref].linkText += ' | ' + linkText;
}
@@ -1060,7 +1066,7 @@ var readability = {
if(linkData.match(/pag(e|ing|inat)/i)) {
linkObj.score += 25;
}
- if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
+ if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
/* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
if(!linkObj.linkText.match(readability.regexps.nextLink)) {
linkObj.score -= 65;
@@ -1087,10 +1093,10 @@ var readability = {
/* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
if(!parentNodeClassAndId.match(readability.regexps.positive)) {
linkObj.score -= 25;
- negativeNodeMatch = true;
+ negativeNodeMatch = true;
}
}
-
+
parentNode = parentNode.parentNode;
}
@@ -1152,7 +1158,7 @@ var readability = {
dbg('NEXT PAGE IS ' + nextHref);
readability.parsedPages[nextHref] = true;
- return nextHref;
+ return nextHref;
}
else {
return null;
@@ -1204,7 +1210,7 @@ var readability = {
if (typeof options === 'undefined') { options = {}; }
request.onreadystatechange = respondToReadyState;
-
+
request.open('get', url, true);
request.setRequestHeader('Accept', 'text/html');
@@ -1239,7 +1245,7 @@ var readability = {
articlePage.appendChild(linkDiv);
return;
}
-
+
/**
* Now that we've built the article page DOM element, get the page content
* asynchronously and load the cleaned content into the div we created for it.
@@ -1257,7 +1263,7 @@ var readability = {
return;
} else {
readability.pageETags[eTag] = 1;
- }
+ }
}
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
@@ -1308,7 +1314,7 @@ var readability = {
}
}
}
-
+
readability.removeScripts(content);
readability.moveNodeInnards(content, thisPage);
@@ -1330,9 +1336,9 @@ var readability = {
});
}(nextPageLink, articlePage));
},
-
+
/**
- * Get an elements class/id weight. Uses regular expressions to tell if this
+ * Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
*
* @param Element
@@ -1382,7 +1388,7 @@ var readability = {
var allElements = e.getElementsByTagName('*');
while (i < allElements.length) {
readability.deleteExtraBreaks(allElements[i]);
- i++;
+ i++;
}
},
@@ -1397,7 +1403,7 @@ var readability = {
clean: function (e, tag) {
var targetList = e.getElementsByTagName( tag );
var isEmbed = (tag === 'object' || tag === 'embed');
-
+
for (var y=targetList.length-1; y >= 0; y-=1) {
/* Allow youtube and vimeo videos through as people usually want to see those. */
if(isEmbed) {
@@ -1405,7 +1411,7 @@ var readability = {
for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
attributeValues += targetList[y].attributes[i].value + '|';
}
-
+
/* First, check the elements attributes to see if any of them contain youtube or vimeo */
if (attributeValues.search(readability.regexps.videos) !== -1) {
continue;
@@ -1415,13 +1421,13 @@ var readability = {
if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
continue;
}
-
+
}
targetList[y].parentNode.removeChild(targetList[y]);
}
},
-
+
/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@@ -1446,7 +1452,7 @@ var readability = {
for (var i=curTagsLength-1; i >= 0; i-=1) {
var weight = readability.getClassWeight(tagsList[i]);
var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
-
+
dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
if(weight+contentScore < 0)
@@ -1467,7 +1473,7 @@ var readability = {
var embeds = tagsList[i].getElementsByTagName("embed");
for(var ei=0,il=embeds.length; ei < il; ei+=1) {
if (embeds[ei].src.search(readability.regexps.videos) === -1) {
- embedCount+=1;
+ embedCount+=1;
}
}
@@ -1480,7 +1486,7 @@ var readability = {
} else if(li > p && tag !== "ul" && tag !== "ol") {
toRemove = true;
} else if( input > Math.floor(p/3) ) {
- toRemove = true;
+ toRemove = true;
} else if(contentLength < 25 && (img === 0 || img > 2) ) {
toRemove = true;
} else if(weight < 25 && linkDensity > 0.2) {
@@ -1522,7 +1528,7 @@ var readability = {
addFlag: function(flag) {
readability.flags = readability.flags | flag;
},
-
+
removeFlag: function(flag) {
readability.flags = readability.flags & ~flag;
},
@@ -1591,7 +1597,7 @@ var readability = {
}
return ret;
},
-
+
// Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
// <P> node, and makes all next siblings of that pair children of <P>, up
// until the next pair of <BR> nodes is reached.
@@ -1600,7 +1606,7 @@ var readability = {
var second = readability.isMultipleBr(node, true);
if (!second) {
return;
- }
+ }
// Make all next siblings of the second BR into children of a P.
var p = document.createElement('p');
var curr = second.nextSibling;
@@ -1613,7 +1619,7 @@ var readability = {
curr = next;
}
var ret = curr;
-
+
// Remove all nodes between the first and second BR.
curr = node.nextSibling;
while (curr && curr != second) {
@@ -1625,10 +1631,10 @@ var readability = {
second.parentNode.removeChild(second);
// Replace the first BR with the P.
node.parentNode.replaceChild(p, node);
-
+
return ret;
},
-
+
// Returns true if the NodeList contains a double <BR>.
hasDoubleBr: function(nodeList) {
for (var i = 0; i < nodeList.length; nodeList++) {
@@ -1637,8 +1643,8 @@ var readability = {
}
}
return false;
- },
-
+ },
+
// Replaces double <BR> tags with <P> tags.
replaceDoubleBrsWithPs: function(node) {
var allElements = node.getElementsByTagName('BR');
@@ -1652,8 +1658,8 @@ var readability = {
allElements = document.body.getElementsByTagName('BR');
}
},
-
-
+
+
// Replaces a BR and the whitespace that follows it with a P.
replaceBrWithP: function(node) {
if (!readability.isBrNode(node)) {
@@ -1673,7 +1679,7 @@ var readability = {
node.parentNode.replaceChild(p, node);
return curr;
},
-
+
// Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
// children of the <P>.
replaceBrsWithPs: function(node) {
@@ -1687,27 +1693,27 @@ var readability = {
allElements = document.body.getElementsByTagName('BR');
}
},
-
+
// Replaces any tag with any other tag.
replaceTagsWithTags: function(node, srcTag, destTag) {
var allElements = node.getElementsByTagName(srcTag);
for (var i = 0; i < allElements.length; i++) {
var dest = document.createElement(destTag);
readability.moveNodeInnards(allElements[i], dest);
- node.replaceNode(dest, allElements[i]);
+ allElements[i].parentNode.replaceChild(dest, allElements[i]);
}
},
-
+
// Replaces all <noscript> tags with <p> tags.
replaceNoscriptsWithPs: function(node) {
readability.replaceTagsWithTags(node, 'noscript', 'p');
},
-
+
// Replaces all <font> tags with <span> tags.
replaceFontsWithSpans: function(node) {
readability.replaceTagsWithTags(node, 'font', 'span');
},
-
+
// Returns a list of image URLs in the distilled article.
getImages : function() {
var images = document.getElementsByTagName('img');
@@ -1719,10 +1725,15 @@ var readability = {
}
return result;
},
-
+
// Returns the distilled article HTML from the page(s).
getDistilledArticleHTML : function() {
return readability.distilledHTML;
+ },
+
+ // Returns the next page of this article.
+ getNextPageLink : function() {
+ return readability.nextPageLink;
}
};
@@ -1730,12 +1741,13 @@ var readability = {
// element is the article title, the second element is HTML containing the
// long-form content, and remaining elements are URLs for images referenced by
// that HTML. Each <img> tag in the HTML has an id field set to k - 2, which
-// corresponds to a URL listed at index k in the array returned.
+// corresponds to a URL listed at index k in the array returned.
(function () {
readability.init();
- var result = new Array(2);
+ var result = new Array(3);
result[0] = readability.getArticleTitle();
result[1] = readability.getDistilledArticleHTML();
+ result[2] = readability.getNextPageLink();
return result.concat(readability.getImages());
}())