diff options
author | shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-02-05 05:56:20 +0000 |
---|---|---|
committer | shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-02-05 05:56:20 +0000 |
commit | 415f99b18e6f14c53e08f6574a3bc653049fbaae (patch) | |
tree | 6039149dc08176419753d850eba0cc74ff1b1a43 | |
parent | 640e10a877584a90c8200490c41c937501cca5a3 (diff) | |
download | chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.zip chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.gz chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.bz2 |
Add support for multipage distillation.
Add support for distilling multiple pages, currently support is really
basic and just distills pages sequentially.
BUG=288015
TEST=included
NOTRY=true
R=cjhopman@chromium.org
Review URL: https://codereview.chromium.org/146843010
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@248867 0039d316-1c4b-4281-b951-d872f2087c98
23 files changed, 772 insertions, 297 deletions
diff --git a/components/dom_distiller.gypi b/components/dom_distiller.gypi index 8cb8901..c22d5c0 100644 --- a/components/dom_distiller.gypi +++ b/components/dom_distiller.gypi @@ -84,6 +84,8 @@ 'dom_distiller/core/dom_distiller_service.h', 'dom_distiller/core/dom_distiller_store.cc', 'dom_distiller/core/dom_distiller_store.h', + 'dom_distiller/core/page_distiller.cc', + 'dom_distiller/core/page_distiller.h', 'dom_distiller/core/task_tracker.cc', 'dom_distiller/core/task_tracker.h', ], @@ -112,6 +114,7 @@ 'target_name': 'distilled_page_proto', 'type': 'static_library', 'sources': [ + 'dom_distiller/core/proto/distilled_article.proto', 'dom_distiller/core/proto/distilled_page.proto', ], 'variables': { diff --git a/components/dom_distiller/content/distiller_page_web_contents.cc b/components/dom_distiller/content/distiller_page_web_contents.cc index c0c7ff5..06bd6bc 100644 --- a/components/dom_distiller/content/distiller_page_web_contents.cc +++ b/components/dom_distiller/content/distiller_page_web_contents.cc @@ -55,16 +55,22 @@ void DistillerPageWebContents::ExecuteJavaScriptImpl( base::string16(), // frame_xpath base::UTF8ToUTF16(script), base::Bind(&DistillerPage::OnExecuteJavaScriptDone, - base::Unretained(this))); + base::Unretained(this), + web_contents_->GetLastCommittedURL())); } void DistillerPageWebContents::DidFinishLoad(int64 frame_id, const GURL& validated_url, bool is_main_frame, RenderViewHost* render_view_host) { - content::WebContentsObserver::Observe(NULL); - OnLoadURLDone(); + // TODO(shashishekhar): Find a better way to detect when it is safe to run the + // distillation script. Waiting for the entire page to load is really slow. + if (is_main_frame) { + content::WebContentsObserver::Observe(NULL); + OnLoadURLDone(); + } } + void DistillerPageWebContents::DidFailLoad( int64 frame_id, const GURL& validated_url, @@ -72,8 +78,10 @@ void DistillerPageWebContents::DidFailLoad( int error_code, const base::string16& error_description, RenderViewHost* render_view_host) { - content::WebContentsObserver::Observe(NULL); - OnLoadURLFailed(); + if (is_main_frame) { + content::WebContentsObserver::Observe(NULL); + OnLoadURLFailed(); + } } } // namespace dom_distiller diff --git a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc index 42446e2..c90bd96 100644 --- a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc +++ b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc @@ -48,7 +48,8 @@ class DistillerPageWebContentsTest distiller_page_->ExecuteJavaScript(kScript); } - virtual void OnExecuteJavaScriptDone(const base::Value* value) OVERRIDE { + virtual void OnExecuteJavaScriptDone(const GURL& page_url, + const base::Value* value) OVERRIDE { value_ = value->DeepCopy(); quit_closure_.Run(); } diff --git a/components/dom_distiller/content/dom_distiller_viewer_source.cc b/components/dom_distiller/content/dom_distiller_viewer_source.cc index 2f23a07..8e913d5 100644 --- a/components/dom_distiller/content/dom_distiller_viewer_source.cc +++ b/components/dom_distiller/content/dom_distiller_viewer_source.cc @@ -4,6 +4,7 @@ #include "components/dom_distiller/content/dom_distiller_viewer_source.h" +#include <sstream> #include <string> #include <vector> @@ -12,6 +13,7 @@ #include "base/message_loop/message_loop.h" #include "base/strings/string_util.h" #include "components/dom_distiller/core/dom_distiller_service.h" +#include "components/dom_distiller/core/proto/distilled_article.pb.h" #include "components/dom_distiller/core/proto/distilled_page.pb.h" #include "components/dom_distiller/core/task_tracker.h" #include "content/public/browser/render_frame_host.h" @@ -53,7 +55,8 @@ class RequestViewerHandle : public ViewRequestDelegate { virtual ~RequestViewerHandle(); // ViewRequestDelegate implementation. - virtual void OnArticleReady(DistilledPageProto* proto) OVERRIDE; + virtual void OnArticleReady(const DistilledArticleProto* article_proto) + OVERRIDE; void TakeViewerHandle(scoped_ptr<ViewerHandle> viewer_handle); @@ -72,13 +75,21 @@ RequestViewerHandle::RequestViewerHandle( RequestViewerHandle::~RequestViewerHandle() {} -void RequestViewerHandle::OnArticleReady(DistilledPageProto* proto) { - DCHECK(proto); +void RequestViewerHandle::OnArticleReady( + const DistilledArticleProto* article_proto) { + DCHECK(article_proto); std::string title; std::string unsafe_article_html; - if (proto->has_title() && proto->has_html()) { - title = net::EscapeForHTML(proto->title()); - unsafe_article_html = proto->html(); + if (article_proto->has_title() && article_proto->pages_size() > 0 && + article_proto->pages(0).has_html()) { + title = net::EscapeForHTML(article_proto->title()); + // TODO(shashishekhar): Add support for correcting displaying multiple pages + // after discussing the right way to display them. + std::ostringstream unsafe_output_stream; + for (int page_num = 0; page_num < article_proto->pages_size(); ++page_num) { + unsafe_output_stream << article_proto->pages(page_num).html(); + } + unsafe_article_html = unsafe_output_stream.str(); } else { title = l10n_util::GetStringUTF8(IDS_DOM_DISTILLER_VIEWER_NO_DATA_TITLE); unsafe_article_html = diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc index 84b8a05..7b4c92a 100644 --- a/components/dom_distiller/core/distiller.cc +++ b/components/dom_distiller/core/distiller.cc @@ -8,16 +8,19 @@ #include "base/bind.h" #include "base/callback.h" -#include "base/strings/stringprintf.h" +#include "base/strings/string_number_conversions.h" #include "base/strings/utf_string_conversions.h" #include "base/values.h" #include "components/dom_distiller/core/distiller_page.h" #include "components/dom_distiller/core/distiller_url_fetcher.h" +#include "components/dom_distiller/core/proto/distilled_article.pb.h" #include "components/dom_distiller/core/proto/distilled_page.pb.h" -#include "grit/dom_distiller_resources.h" #include "net/url_request/url_request_context_getter.h" -#include "ui/base/resource/resource_bundle.h" -#include "url/gurl.h" + +namespace { +// Maximum number of distilled pages in an article. +const int kMaxPagesInArticle = 32; +} namespace dom_distiller { @@ -39,90 +42,101 @@ scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { DistillerImpl::DistillerImpl( const DistillerPageFactory& distiller_page_factory, const DistillerURLFetcherFactory& distiller_url_fetcher_factory) - : distiller_page_factory_(distiller_page_factory), - distiller_url_fetcher_factory_(distiller_url_fetcher_factory) { - distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass(); + : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), + distillation_in_progress_(false) { + page_distiller_.reset(new PageDistiller(distiller_page_factory)); } DistillerImpl::~DistillerImpl() { } void DistillerImpl::Init() { - distiller_page_->Init(); + DCHECK(!distillation_in_progress_); + page_distiller_->Init(); + article_proto_.reset(new DistilledArticleProto()); } void DistillerImpl::DistillPage(const GURL& url, const DistillerCallback& distillation_cb) { + DCHECK(!distillation_in_progress_); distillation_cb_ = distillation_cb; - proto_.reset(new DistilledPageProto()); - proto_->set_url(url.spec()); - LoadURL(url); + DistillPage(url); } -void DistillerImpl::LoadURL(const GURL& url) { - distiller_page_->LoadURL(url); +void DistillerImpl::DistillPage(const GURL& url) { + DCHECK(!distillation_in_progress_); + if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && + processed_urls_.find(url.spec()) == processed_urls_.end()) { + distillation_in_progress_ = true; + // Distill the next page. + DCHECK(url.is_valid()); + DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); + page_distiller_->DistillPage( + url, + base::Bind(&DistillerImpl::OnPageDistillationFinished, + base::Unretained(this), + url)); + } else { + RunDistillerCallbackIfDone(); + } } -void DistillerImpl::OnLoadURLDone() { - GetDistilledContent(); -} +void DistillerImpl::OnPageDistillationFinished( + const GURL& page_url, + scoped_ptr<DistilledPageInfo> distilled_page, + bool distillation_successful) { + DCHECK(distillation_in_progress_); + DCHECK(distilled_page.get()); + if (!distillation_successful) { + RunDistillerCallbackIfDone(); + } else { + DistilledPageProto* current_page = article_proto_->add_pages(); + // Set the title of the article as the title of the first page. + if (article_proto_->pages_size() == 1) { + article_proto_->set_title(distilled_page->title); + } -void DistillerImpl::GetDistilledContent() { - std::string script = - ResourceBundle::GetSharedInstance().GetRawDataResource( - IDR_DISTILLER_JS).as_string(); - distiller_page_->ExecuteJavaScript(script); -} + current_page->set_url(page_url.spec()); + current_page->set_html(distilled_page->html); -void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) { - std::string result; - bool fetched_image = false; - const base::ListValue* result_list = NULL; - if (!value->GetAsList(&result_list)) { - DCHECK(proto_); - distillation_cb_.Run(proto_.Pass()); - return; - } - int i = 0; - for (base::ListValue::const_iterator iter = result_list->begin(); - iter != result_list->end(); ++iter, ++i) { - std::string item; - (*iter)->GetAsString(&item); - // The JavaScript returns an array where the first element is the title, - // the second element is the article content HTML, and the remaining - // elements are image URLs referenced in the HTML. - switch (i) { - case 0: - proto_->set_title(item); - break; - case 1: - proto_->set_html(item); - break; - default: - int image_number = i - 2; - std::string image_id = base::StringPrintf("%d", image_number); - FetchImage(image_id, item); - fetched_image = true; + GURL next_page_url(distilled_page->next_page_url); + if (next_page_url.is_valid()) { + // The pages should be in same origin. + DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); + } + + processed_urls_.insert(page_url.spec()); + distillation_in_progress_ = false; + int page_number = article_proto_->pages_size(); + for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); + ++img_num) { + std::string image_id = + base::IntToString(page_number) + "_" + base::IntToString(img_num); + FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); } + DistillPage(next_page_url); } - if (!fetched_image) - distillation_cb_.Run(proto_.Pass()); } -void DistillerImpl::FetchImage(const std::string& image_id, +void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, + const std::string& image_id, const std::string& item) { DistillerURLFetcher* fetcher = distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); image_fetchers_[image_id] = fetcher; fetcher->FetchURL(item, base::Bind(&DistillerImpl::OnFetchImageDone, - base::Unretained(this), image_id)); + base::Unretained(this), + base::Unretained(distilled_page_proto), + image_id)); } -void DistillerImpl::OnFetchImageDone(const std::string& id, +void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, + const std::string& id, const std::string& response) { - DCHECK(proto_); - DistilledPageProto_Image* image = proto_->add_image(); + DCHECK_GT(article_proto_->pages_size(), 0); + DCHECK(distilled_page_proto); + DistilledPageProto_Image* image = distilled_page_proto->add_image(); image->set_name(id); image->set_data(response); DCHECK(image_fetchers_.end() != image_fetchers_.find(id)); @@ -130,8 +144,12 @@ void DistillerImpl::OnFetchImageDone(const std::string& id, int result = image_fetchers_.erase(id); delete fetcher; DCHECK_EQ(1, result); - if (image_fetchers_.empty()) { - distillation_cb_.Run(proto_.Pass()); + RunDistillerCallbackIfDone(); +} + +void DistillerImpl::RunDistillerCallbackIfDone() { + if (image_fetchers_.empty() && !distillation_in_progress_) { + distillation_cb_.Run(article_proto_.Pass()); } } diff --git a/components/dom_distiller/core/distiller.h b/components/dom_distiller/core/distiller.h index adbd41b..9ba5723 100644 --- a/components/dom_distiller/core/distiller.h +++ b/components/dom_distiller/core/distiller.h @@ -5,16 +5,14 @@ #ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_ #define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_ -#include <map> #include <string> #include "base/callback.h" -#include "base/gtest_prod_util.h" -#include "base/memory/ref_counted.h" -#include "base/values.h" -#include "components/dom_distiller/core/distiller_page.h" +#include "base/containers/hash_tables.h" +#include "base/memory/scoped_ptr.h" #include "components/dom_distiller/core/distiller_url_fetcher.h" -#include "components/dom_distiller/core/proto/distilled_page.pb.h" +#include "components/dom_distiller/core/page_distiller.h" +#include "components/dom_distiller/core/proto/distilled_article.pb.h" #include "net/url_request/url_request_context_getter.h" #include "url/gurl.h" @@ -24,8 +22,8 @@ class DistillerImpl; class Distiller { public: - typedef base::Callback<void( - scoped_ptr<DistilledPageProto>)> DistillerCallback; + typedef base::Callback<void(scoped_ptr<DistilledArticleProto>)> + DistillerCallback; virtual ~Distiller() {} // Distills a page, and asynchrounously returns the article HTML to the @@ -55,8 +53,7 @@ class DistillerFactoryImpl : public DistillerFactory { }; // Distills a article from a page and associated pages. -class DistillerImpl : public Distiller, - public DistillerPage::Delegate { +class DistillerImpl : public Distiller { public: DistillerImpl( const DistillerPageFactory& distiller_page_factory, @@ -70,28 +67,35 @@ class DistillerImpl : public Distiller, virtual void DistillPage(const GURL& url, const DistillerCallback& callback) OVERRIDE; - // PageDistillerContext::Delegate - virtual void OnLoadURLDone() OVERRIDE; - virtual void OnExecuteJavaScriptDone(const base::Value* value) OVERRIDE; + private: + void OnFetchImageDone(DistilledPageProto* distilled_page_proto, + const std::string& id, + const std::string& response); - void OnFetchImageDone(const std::string& id, const std::string& response); + void OnPageDistillationFinished(const GURL& page_url, + scoped_ptr<DistilledPageInfo> distilled_page, + bool distillation_successful); - private: - virtual void LoadURL(const GURL& url); - virtual void FetchImage(const std::string& image_id, const std::string& item); + virtual void FetchImage(DistilledPageProto* distilled_page_proto, + const std::string& image_id, + const std::string& item); - // Injects JavaScript to distill a loaded page down to its important content, - // e.g., extracting a news article from its surrounding boilerplate. - void GetDistilledContent(); + // Distills the page and adds the new page to |article_proto|. + void DistillPage(const GURL& url); + + // Runs |distillation_cb_| if all distillation callbacks and image fetches are + // complete. + void RunDistillerCallbackIfDone(); - const DistillerPageFactory& distiller_page_factory_; const DistillerURLFetcherFactory& distiller_url_fetcher_factory_; - scoped_ptr<DistillerPage> distiller_page_; + scoped_ptr<PageDistiller> page_distiller_; DistillerCallback distillation_cb_; - std::map<std::string, DistillerURLFetcher* > image_fetchers_; - - scoped_ptr<DistilledPageProto> proto_; + base::hash_map<std::string, DistillerURLFetcher*> image_fetchers_; + scoped_ptr<DistilledArticleProto> article_proto_; + bool distillation_in_progress_; + // Set to keep track of which urls are already seen by the distiller. + base::hash_set<std::string> processed_urls_; DISALLOW_COPY_AND_ASSIGN(DistillerImpl); }; diff --git a/components/dom_distiller/core/distiller_page.cc b/components/dom_distiller/core/distiller_page.cc index cc486c7..3f4bffe 100644 --- a/components/dom_distiller/core/distiller_page.cc +++ b/components/dom_distiller/core/distiller_page.cc @@ -52,16 +52,16 @@ void DistillerPage::OnLoadURLFailed() { scoped_ptr<base::Value> empty(base::Value::CreateNullValue()); if (!delegate_) return; - delegate_->OnExecuteJavaScriptDone(empty.get()); + delegate_->OnExecuteJavaScriptDone(GURL(), empty.get()); } -void DistillerPage::OnExecuteJavaScriptDone( - const base::Value* value) { +void DistillerPage::OnExecuteJavaScriptDone(const GURL& page_url, + const base::Value* value) { DCHECK_EQ(EXECUTING_JAVASCRIPT, state_); state_ = PAGE_AVAILABLE; if (!delegate_) return; - delegate_->OnExecuteJavaScriptDone(value); + delegate_->OnExecuteJavaScriptDone(page_url, value); } } // namespace dom_distiller diff --git a/components/dom_distiller/core/distiller_page.h b/components/dom_distiller/core/distiller_page.h index 291ef7e..4e22e8d 100644 --- a/components/dom_distiller/core/distiller_page.h +++ b/components/dom_distiller/core/distiller_page.h @@ -22,7 +22,8 @@ class DistillerPage { public: virtual ~Delegate() {} virtual void OnLoadURLDone() {} - virtual void OnExecuteJavaScriptDone(const base::Value* value) {} + virtual void OnExecuteJavaScriptDone(const GURL& page_url, + const base::Value* value) {} }; explicit DistillerPage(Delegate* delegate); @@ -45,9 +46,10 @@ class DistillerPage { // a page is available. void ExecuteJavaScript(const std::string& script); - // Called when the JavaScript execution completes. |value| contains data - // returned by the script. - virtual void OnExecuteJavaScriptDone(const base::Value* value); + // Called when the JavaScript execution completes. |page_url| is the url of + // the distilled page. |value| contains data returned by the script. + virtual void OnExecuteJavaScriptDone(const GURL& page_url, + const base::Value* value); protected: enum State { diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc index 44c2986..3865aef 100644 --- a/components/dom_distiller/core/distiller_unittest.cc +++ b/components/dom_distiller/core/distiller_unittest.cc @@ -3,20 +3,26 @@ // found in the LICENSE file. #include <map> +#include <string> +#include <vector> #include "base/bind.h" #include "base/bind_helpers.h" #include "base/location.h" #include "base/memory/scoped_ptr.h" #include "base/message_loop/message_loop.h" +#include "base/strings/string_number_conversions.h" #include "base/values.h" #include "components/dom_distiller/core/distiller.h" #include "components/dom_distiller/core/distiller_page.h" +#include "components/dom_distiller/core/proto/distilled_article.pb.h" #include "components/dom_distiller/core/proto/distilled_page.pb.h" #include "net/url_request/url_request_context_getter.h" #include "testing/gmock/include/gmock/gmock.h" #include "testing/gtest/include/gtest/gtest.h" +using std::vector; +using std::string; using::testing::Invoke; using::testing::Return; using::testing::_; @@ -25,12 +31,31 @@ namespace { const char kTitle[] = "Title"; const char kContent[] = "Content"; const char kURL[] = "http://a.com/"; - const char kId0[] = "0"; - const char kId1[] = "1"; - const char kImageURL0[] = "http://a.com/img1.jpg"; - const char kImageURL1[] = "http://a.com/img2.jpg"; - const char kImageData0[] = { 'a', 'b', 'c', 'd', 'e', 0 }; - const char kImageData1[] = { '1', '2', '3', '4', '5', 0 }; + const size_t kTotalImages = 2; + const char* kImageURLs[kTotalImages] = {"http://a.com/img1.jpg", + "http://a.com/img2.jpg"}; + const char* kImageData[kTotalImages] = {"abcde", "12345"}; + + const string GetImageName(int page_num, int image_num) { + return base::IntToString(page_num) + "_" + base::IntToString(image_num); + } + + scoped_ptr<base::ListValue> CreateDistilledValueReturnedFromJS( + const string& title, + const string& content, + const vector<int>& image_indices, + const string& next_page_url) { + scoped_ptr<base::ListValue> list(new base::ListValue()); + + list->AppendString(title); + list->AppendString(content); + list->AppendString(next_page_url); + for (size_t i = 0; i < image_indices.size(); ++i) { + list->AppendString(kImageURLs[image_indices[i]]); + } + return list.Pass(); + } + } // namespace namespace dom_distiller { @@ -38,15 +63,15 @@ namespace dom_distiller { class TestDistillerURLFetcher : public DistillerURLFetcher { public: TestDistillerURLFetcher() : DistillerURLFetcher(NULL) { - responses_[kImageURL0] = std::string(kImageData0); - responses_[kImageURL1] = std::string(kImageData1); + responses_[kImageURLs[0]] = string(kImageData[0]); + responses_[kImageURLs[1]] = string(kImageData[1]); } - void CallCallback(std::string url, const URLFetcherCallback& callback) { + void CallCallback(string url, const URLFetcherCallback& callback) { callback.Run(responses_[url]); } - virtual void FetchURL(const std::string& url, + virtual void FetchURL(const string& url, const URLFetcherCallback& callback) OVERRIDE { ASSERT_TRUE(base::MessageLoop::current()); base::MessageLoop::current()->PostTask( @@ -55,7 +80,7 @@ class TestDistillerURLFetcher : public DistillerURLFetcher { base::Unretained(this), url, callback)); } - std::map<std::string, std::string> responses_; + std::map<string, string> responses_; }; @@ -73,7 +98,7 @@ class MockDistillerPage : public DistillerPage { public: MOCK_METHOD0(InitImpl, void()); MOCK_METHOD1(LoadURLImpl, void(const GURL& gurl)); - MOCK_METHOD1(ExecuteJavaScriptImpl, void(const std::string& script)); + MOCK_METHOD1(ExecuteJavaScriptImpl, void(const string& script)); explicit MockDistillerPage(DistillerPage::Delegate* delegate) : DistillerPage(delegate) {} @@ -96,19 +121,19 @@ class MockDistillerPageFactory : public DistillerPageFactory { class DistillerTest : public testing::Test { public: virtual ~DistillerTest() {} - void OnDistillPageDone(scoped_ptr<DistilledPageProto> proto) { - proto_ = proto.Pass(); + void OnDistillPageDone(scoped_ptr<DistilledArticleProto> proto) { + article_proto_ = proto.Pass(); } protected: scoped_ptr<DistillerImpl> distiller_; - scoped_ptr<DistilledPageProto> proto_; + scoped_ptr<DistilledArticleProto> article_proto_; MockDistillerPageFactory page_factory_; TestDistillerURLFetcherFactory url_fetcher_factory_; }; -ACTION_P2(DistillerPageOnExecuteJavaScriptDone, distiller_page, list) { - distiller_page->OnExecuteJavaScriptDone(list); +ACTION_P3(DistillerPageOnExecuteJavaScriptDone, distiller_page, url, list) { + distiller_page->OnExecuteJavaScriptDone(url, list); } ACTION_P2(CreateMockDistillerPage, list, kurl) { @@ -118,18 +143,57 @@ ACTION_P2(CreateMockDistillerPage, list, kurl) { EXPECT_CALL(*distiller_page, LoadURLImpl(kurl)) .WillOnce(testing::InvokeWithoutArgs(distiller_page, &DistillerPage::OnLoadURLDone)); - EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_)) - .WillOnce(DistillerPageOnExecuteJavaScriptDone(distiller_page, list)); + EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_)).WillOnce( + DistillerPageOnExecuteJavaScriptDone(distiller_page, kurl, list)); + return distiller_page; +} + +ACTION_P3(CreateMockDistillerPages, lists, kurls, num_pages) { + DistillerPage::Delegate* delegate = arg0; + MockDistillerPage* distiller_page = new MockDistillerPage(delegate); + EXPECT_CALL(*distiller_page, InitImpl()); + { + testing::InSequence s; + + for (int page = 0; page < num_pages; ++page) { + GURL url = GURL(kurls[page]); + EXPECT_CALL(*distiller_page, LoadURLImpl(url)) + .WillOnce(testing::InvokeWithoutArgs(distiller_page, + &DistillerPage::OnLoadURLDone)); + EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_)) + .WillOnce(DistillerPageOnExecuteJavaScriptDone( + distiller_page, url, lists[page].get())); + } + } return distiller_page; } TEST_F(DistillerTest, DistillPage) { base::MessageLoopForUI loop; - scoped_ptr<base::ListValue> list(new base::ListValue()); - list->AppendString(kTitle); - list->AppendString(kContent); - list->AppendString(kImageURL0); - list->AppendString(kImageURL1); + scoped_ptr<base::ListValue> list = + CreateDistilledValueReturnedFromJS(kTitle, kContent, vector<int>(), ""); + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPage(list.get(), GURL(kURL))); + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + distiller_->Init(); + distiller_->DistillPage( + GURL(kURL), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(article_proto_->pages_size(), 1); + const DistilledPageProto& first_page = article_proto_->pages(0); + EXPECT_EQ(kContent, first_page.html()); + EXPECT_EQ(kURL, first_page.url()); +} + +TEST_F(DistillerTest, DistillPageWithImages) { + base::MessageLoopForUI loop; + vector<int> image_indices; + image_indices.push_back(0); + image_indices.push_back(1); + scoped_ptr<base::ListValue> list = + CreateDistilledValueReturnedFromJS(kTitle, kContent, image_indices, ""); EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)).WillOnce( CreateMockDistillerPage(list.get(), GURL(kURL))); @@ -139,14 +203,91 @@ TEST_F(DistillerTest, DistillPage) { GURL(kURL), base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); base::MessageLoop::current()->RunUntilIdle(); - EXPECT_EQ(kTitle, proto_->title()); - EXPECT_EQ(kContent, proto_->html()); - EXPECT_EQ(kURL, proto_->url()); - EXPECT_EQ(2, proto_->image_size()); - EXPECT_EQ(kImageData0, proto_->image(0).data()); - EXPECT_EQ(kId0, proto_->image(0).name()); - EXPECT_EQ(kImageData1, proto_->image(1).data()); - EXPECT_EQ(kId1, proto_->image(1).name()); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(article_proto_->pages_size(), 1); + const DistilledPageProto& first_page = article_proto_->pages(0); + EXPECT_EQ(kContent, first_page.html()); + EXPECT_EQ(kURL, first_page.url()); + EXPECT_EQ(2, first_page.image_size()); + EXPECT_EQ(kImageData[0], first_page.image(0).data()); + EXPECT_EQ(GetImageName(1, 0), first_page.image(0).name()); + EXPECT_EQ(kImageData[1], first_page.image(1).data()); + EXPECT_EQ(GetImageName(1, 1), first_page.image(1).name()); +} + +TEST_F(DistillerTest, DistillMultiplePages) { + base::MessageLoopForUI loop; + const int kNumPages = 8; + vector<int> image_indices[kNumPages]; + string content[kNumPages]; + string page_urls[kNumPages]; + scoped_ptr<base::ListValue> list[kNumPages]; + + int next_image_number = 0; + + for (int page_num = 0; page_num < kNumPages; ++page_num) { + // Each page has different number of images. + int tot_images = (page_num + kTotalImages) % (kTotalImages + 1); + for (int img_num = 0; img_num < tot_images; img_num++) { + image_indices[page_num].push_back(next_image_number); + next_image_number = (next_image_number + 1) % kTotalImages; + } + + page_urls[page_num] = "http://a.com/" + base::IntToString(page_num); + content[page_num] = "Content for page:" + base::IntToString(page_num); + } + for (int i = 0; i < kNumPages; ++i) { + string next_page_url = ""; + if (i + 1 < kNumPages) + next_page_url = page_urls[i + 1]; + + list[i] = CreateDistilledValueReturnedFromJS( + kTitle, content[i], image_indices[i], next_page_url); + } + + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPages(list, page_urls, kNumPages)); + + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + distiller_->Init(); + distiller_->DistillPage( + GURL(page_urls[0]), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(article_proto_->pages_size(), kNumPages); + for (int page_num = 0; page_num < kNumPages; ++page_num) { + const DistilledPageProto& page = article_proto_->pages(page_num); + EXPECT_EQ(content[page_num], page.html()); + EXPECT_EQ(page_urls[page_num], page.url()); + EXPECT_EQ(image_indices[page_num].size(), + static_cast<size_t>(page.image_size())); + for (size_t img_num = 0; img_num < image_indices[page_num].size(); + ++img_num) { + EXPECT_EQ(kImageData[image_indices[page_num][img_num]], + page.image(img_num).data()); + EXPECT_EQ(GetImageName(page_num + 1, img_num), + page.image(img_num).name()); + } + } +} + +TEST_F(DistillerTest, DistillLinkLoop) { + base::MessageLoopForUI loop; + // Create a loop, the next page is same as the current page. This could + // happen if javascript misparses a next page link. + scoped_ptr<base::ListValue> list = + CreateDistilledValueReturnedFromJS(kTitle, kContent, vector<int>(), kURL); + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPage(list.get(), GURL(kURL))); + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + distiller_->Init(); + distiller_->DistillPage( + GURL(kURL), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(article_proto_->pages_size(), 1); } } // namespace dom_distiller diff --git a/components/dom_distiller/core/dom_distiller_service.cc b/components/dom_distiller/core/dom_distiller_service.cc index 797827b..6ee2f2f 100644 --- a/components/dom_distiller/core/dom_distiller_service.cc +++ b/components/dom_distiller/core/dom_distiller_service.cc @@ -7,6 +7,7 @@ #include "base/guid.h" #include "base/message_loop/message_loop.h" #include "components/dom_distiller/core/dom_distiller_store.h" +#include "components/dom_distiller/core/proto/distilled_article.pb.h" #include "components/dom_distiller/core/task_tracker.h" #include "url/gurl.h" @@ -27,7 +28,7 @@ ArticleEntry CreateSkeletonEntryForUrl(const GURL& url) { void RunArticleAvailableCallback( const DomDistillerService::ArticleAvailableCallback& article_cb, const ArticleEntry& entry, - DistilledPageProto* proto, + const DistilledArticleProto* article_proto, bool distillation_succeeded) { article_cb.Run(distillation_succeeded); } @@ -189,13 +190,16 @@ void DomDistillerService::CancelTask(TaskTracker* task) { } } -void DomDistillerService::AddDistilledPageToList(const ArticleEntry& entry, - DistilledPageProto* proto, - bool distillation_succeeded) { +void DomDistillerService::AddDistilledPageToList( + const ArticleEntry& entry, + const DistilledArticleProto* article_proto, + bool distillation_succeeded) { DCHECK(IsEntryValid(entry)); if (distillation_succeeded) { - DCHECK(proto); + DCHECK(article_proto); + DCHECK_GT(article_proto->pages_size(), 0); store_->UpdateEntry(entry); + DCHECK_EQ(article_proto->pages_size(), entry.pages_size()); } } diff --git a/components/dom_distiller/core/dom_distiller_service.h b/components/dom_distiller/core/dom_distiller_service.h index ed045cd..630b184 100644 --- a/components/dom_distiller/core/dom_distiller_service.h +++ b/components/dom_distiller/core/dom_distiller_service.h @@ -22,7 +22,7 @@ class SyncableService; namespace dom_distiller { -class DistilledPageProto; +class DistilledArticleProto; class DistillerFactory; class DomDistillerObserver; class DomDistillerStoreInterface; @@ -70,7 +70,7 @@ class DomDistillerService { private: void CancelTask(TaskTracker* task); void AddDistilledPageToList(const ArticleEntry& entry, - DistilledPageProto* proto, + const DistilledArticleProto* article_proto, bool distillation_succeeded); TaskTracker* CreateTaskTracker(const ArticleEntry& entry); diff --git a/components/dom_distiller/core/dom_distiller_service_unittest.cc b/components/dom_distiller/core/dom_distiller_service_unittest.cc index 35065fd..f3418bb 100644 --- a/components/dom_distiller/core/dom_distiller_service_unittest.cc +++ b/components/dom_distiller/core/dom_distiller_service_unittest.cc @@ -9,6 +9,7 @@ #include "base/containers/hash_tables.h" #include "base/message_loop/message_loop.h" #include "base/run_loop.h" +#include "base/strings/string_number_conversions.h" #include "components/dom_distiller/core/article_entry.h" #include "components/dom_distiller/core/dom_distiller_model.h" #include "components/dom_distiller/core/dom_distiller_store.h" @@ -31,7 +32,7 @@ namespace { class FakeViewRequestDelegate : public ViewRequestDelegate { public: virtual ~FakeViewRequestDelegate() {} - MOCK_METHOD1(OnArticleReady, void(DistilledPageProto* proto)); + MOCK_METHOD1(OnArticleReady, void(const DistilledArticleProto* proto)); }; class MockDistillerObserver : public DomDistillerObserver { @@ -52,11 +53,23 @@ DomDistillerService::ArticleAvailableCallback ArticleCallback( } void RunDistillerCallback(FakeDistiller* distiller, - scoped_ptr<DistilledPageProto> proto) { + scoped_ptr<DistilledArticleProto> proto) { distiller->RunDistillerCallback(proto.Pass()); base::RunLoop().RunUntilIdle(); } +scoped_ptr<DistilledArticleProto> CreateArticleWithURL(const std::string& url) { + scoped_ptr<DistilledArticleProto> proto(new DistilledArticleProto); + DistilledPageProto* page = proto->add_pages(); + page->set_url(url); + return proto.Pass(); +} + +scoped_ptr<DistilledArticleProto> CreateDefaultArticle() { + return CreateArticleWithURL("http://www.example.com/default_article_page1") + .Pass(); +} + } // namespace class DomDistillerServiceTest : public testing::Test { @@ -109,7 +122,7 @@ TEST_F(DomDistillerServiceTest, TestViewEntry) { ASSERT_FALSE(distiller->GetCallback().is_null()); - scoped_ptr<DistilledPageProto> proto(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle(); EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get())); RunDistillerCallback(distiller, proto.Pass()); @@ -127,7 +140,7 @@ TEST_F(DomDistillerServiceTest, TestViewUrl) { ASSERT_FALSE(distiller->GetCallback().is_null()); EXPECT_EQ(url, distiller->GetUrl()); - scoped_ptr<DistilledPageProto> proto(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle(); EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get())); RunDistillerCallback(distiller, proto.Pass()); @@ -152,7 +165,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleViewUrl) { ASSERT_FALSE(distiller->GetCallback().is_null()); EXPECT_EQ(url, distiller->GetUrl()); - scoped_ptr<DistilledPageProto> proto(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle(); EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get())); RunDistillerCallback(distiller, proto.Pass()); @@ -160,7 +173,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleViewUrl) { ASSERT_FALSE(distiller2->GetCallback().is_null()); EXPECT_EQ(url2, distiller2->GetUrl()); - scoped_ptr<DistilledPageProto> proto2(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto2 = CreateDefaultArticle(); EXPECT_CALL(viewer_delegate2, OnArticleReady(proto2.get())); RunDistillerCallback(distiller2, proto2.Pass()); @@ -210,7 +223,7 @@ TEST_F(DomDistillerServiceTest, TestAddAndRemoveEntry) { ASSERT_FALSE(distiller->GetCallback().is_null()); EXPECT_EQ(url, distiller->GetUrl()); - scoped_ptr<DistilledPageProto> proto(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto = CreateArticleWithURL(url.spec()); RunDistillerCallback(distiller, proto.Pass()); EXPECT_TRUE(store_->GetEntryByUrl(url, &entry)); @@ -280,7 +293,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleObservers) { ArticleEntriesUpdated(util::HasExpectedUpdates(expected_updates))); } - scoped_ptr<DistilledPageProto> proto(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle(); RunDistillerCallback(distiller, proto.Pass()); // Remove should notify all observers that article is removed. @@ -316,7 +329,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleCallbacks) { EXPECT_CALL(article_cb[i], DistillationCompleted(true)); } - scoped_ptr<DistilledPageProto> proto(new DistilledPageProto); + scoped_ptr<DistilledArticleProto> proto = CreateArticleWithURL(url.spec()); RunDistillerCallback(distiller, proto.Pass()); // Add the same url again, all callbacks should be called with true. @@ -353,5 +366,58 @@ TEST_F(DomDistillerServiceTest, TestMultipleCallbacksOnRemove) { base::RunLoop().RunUntilIdle(); } +TEST_F(DomDistillerServiceTest, TestMultiplePageArticle) { + FakeDistiller* distiller = new FakeDistiller(false); + EXPECT_CALL(*distiller_factory_, CreateDistillerImpl()) + .WillOnce(Return(distiller)); + + const int kPageCount = 8; + + std::string base_url("http://www.example.com/p"); + GURL pages_url[kPageCount]; + for (int page_num = 0; page_num < kPageCount; ++page_num) { + pages_url[page_num] = GURL(base_url + base::IntToString(page_num)); + } + + MockArticleAvailableCallback article_cb; + EXPECT_CALL(article_cb, DistillationCompleted(true)); + + std::string entry_id = + service_->AddToList(pages_url[0], ArticleCallback(&article_cb)); + + ArticleEntry entry; + ASSERT_FALSE(distiller->GetCallback().is_null()); + EXPECT_EQ(pages_url[0], distiller->GetUrl()); + + // Create the article with pages to pass to the distiller. + scoped_ptr<DistilledArticleProto> proto = + CreateArticleWithURL(pages_url[0].spec()); + for (int page_num = 1; page_num < kPageCount; ++page_num) { + DistilledPageProto* distilled_page = proto->add_pages(); + distilled_page->set_url(pages_url[page_num].spec()); + } + + RunDistillerCallback(distiller, proto.Pass()); + EXPECT_TRUE(store_->GetEntryByUrl(pages_url[0], &entry)); + + EXPECT_EQ(kPageCount, entry.pages_size()); + // An article should have just one entry. + EXPECT_EQ(1u, store_->GetEntries().size()); + + // All pages should have correct urls. + for (int page_num = 0; page_num < kPageCount; ++page_num) { + EXPECT_EQ(pages_url[page_num].spec(), entry.pages(page_num).url()); + } + + // Should be able to query article using any of the pages url. + for (int page_num = 0; page_num < kPageCount; ++page_num) { + EXPECT_TRUE(store_->GetEntryByUrl(pages_url[page_num], &entry)); + } + + service_->RemoveEntry(entry_id); + base::RunLoop().RunUntilIdle(); + EXPECT_EQ(0u, store_->GetEntries().size()); +} + } // namespace test } // namespace dom_distiller diff --git a/components/dom_distiller/core/fake_distiller.cc b/components/dom_distiller/core/fake_distiller.cc index 3f1f976..e6cc979 100644 --- a/components/dom_distiller/core/fake_distiller.cc +++ b/components/dom_distiller/core/fake_distiller.cc @@ -14,14 +14,26 @@ namespace test { MockDistillerFactory::MockDistillerFactory() {} MockDistillerFactory::~MockDistillerFactory() {} -FakeDistiller::FakeDistiller(bool execute_callback) : - execute_callback_(execute_callback) { +FakeDistiller::FakeDistiller(bool execute_callback) + : execute_callback_(execute_callback) { EXPECT_CALL(*this, Die()).Times(testing::AnyNumber()); } FakeDistiller::~FakeDistiller() { Die(); } -void FakeDistiller::RunDistillerCallback(scoped_ptr<DistilledPageProto> proto) { +void FakeDistiller::DistillPage(const GURL& url, + const DistillerCallback& callback) { + url_ = url; + callback_ = callback; + if (execute_callback_) { + scoped_ptr<DistilledArticleProto> proto(new DistilledArticleProto); + proto->add_pages()->set_url(url_.spec()); + RunDistillerCallback(proto.Pass()); + } +} + +void FakeDistiller::RunDistillerCallback( + scoped_ptr<DistilledArticleProto> proto) { base::MessageLoop::current()->PostTask( FROM_HERE, base::Bind(&FakeDistiller::RunDistillerCallbackInternal, @@ -30,7 +42,7 @@ void FakeDistiller::RunDistillerCallback(scoped_ptr<DistilledPageProto> proto) { } void FakeDistiller::RunDistillerCallbackInternal( - scoped_ptr<DistilledPageProto> proto) { + scoped_ptr<DistilledArticleProto> proto) { EXPECT_FALSE(callback_.is_null()); callback_.Run(proto.Pass()); callback_.Reset(); diff --git a/components/dom_distiller/core/fake_distiller.h b/components/dom_distiller/core/fake_distiller.h index dbb00d6..98ab709 100644 --- a/components/dom_distiller/core/fake_distiller.h +++ b/components/dom_distiller/core/fake_distiller.h @@ -27,27 +27,21 @@ class MockDistillerFactory : public DistillerFactory { class FakeDistiller : public Distiller { public: - FakeDistiller(bool execute_callback); + explicit FakeDistiller(bool execute_callback); virtual ~FakeDistiller(); MOCK_METHOD0(Die, void()); virtual void DistillPage(const GURL& url, - const DistillerCallback& callback) OVERRIDE { - url_ = url; - callback_ = callback; - if (execute_callback_) { - RunDistillerCallback(make_scoped_ptr(new DistilledPageProto)); - } - } + const DistillerCallback& callback) OVERRIDE; - void RunDistillerCallback(scoped_ptr<DistilledPageProto> proto); + void RunDistillerCallback(scoped_ptr<DistilledArticleProto> proto); GURL GetUrl() { return url_; } DistillerCallback GetCallback() { return callback_; } private: - void RunDistillerCallbackInternal(scoped_ptr<DistilledPageProto> proto); + void RunDistillerCallbackInternal(scoped_ptr<DistilledArticleProto> proto); bool execute_callback_; GURL url_; diff --git a/components/dom_distiller/core/page_distiller.cc b/components/dom_distiller/core/page_distiller.cc new file mode 100644 index 0000000..2d47bd1 --- /dev/null +++ b/components/dom_distiller/core/page_distiller.cc @@ -0,0 +1,91 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/dom_distiller/core/page_distiller.h" + +#include <map> + +#include "base/location.h" +#include "base/message_loop/message_loop.h" +#include "base/strings/utf_string_conversions.h" +#include "base/values.h" +#include "components/dom_distiller/core/distiller_page.h" +#include "components/dom_distiller/core/distiller_url_fetcher.h" +#include "grit/dom_distiller_resources.h" +#include "net/url_request/url_request_context_getter.h" +#include "ui/base/resource/resource_bundle.h" +#include "url/gurl.h" + +namespace dom_distiller { + +DistilledPageInfo::DistilledPageInfo() {} + +DistilledPageInfo::~DistilledPageInfo() {} + +PageDistiller::PageDistiller(const DistillerPageFactory& distiller_page_factory) + : distiller_page_(distiller_page_factory.CreateDistillerPage(this).Pass()) { +} + +PageDistiller::~PageDistiller() {} + +void PageDistiller::Init() { distiller_page_->Init(); } + +void PageDistiller::DistillPage(const GURL& url, + const PageDistillerCallback& callback) { + page_distiller_callback_ = callback; + LoadURL(url); +} + +void PageDistiller::LoadURL(const GURL& url) { distiller_page_->LoadURL(url); } + +void PageDistiller::OnLoadURLDone() { GetDistilledContent(); } + +void PageDistiller::GetDistilledContent() { + std::string script = ResourceBundle::GetSharedInstance() + .GetRawDataResource(IDR_DISTILLER_JS) + .as_string(); + distiller_page_->ExecuteJavaScript(script); +} + +void PageDistiller::OnExecuteJavaScriptDone(const GURL& page_url, + const base::Value* value) { + scoped_ptr<DistilledPageInfo> page_info(new DistilledPageInfo()); + std::string result; + const base::ListValue* result_list = NULL; + if (!value->GetAsList(&result_list)) { + base::MessageLoop::current()->PostTask( + FROM_HERE, + base::Bind(page_distiller_callback_, base::Passed(&page_info), false)); + } else { + int i = 0; + for (base::ListValue::const_iterator iter = result_list->begin(); + iter != result_list->end(); + ++iter, ++i) { + std::string item; + (*iter)->GetAsString(&item); + // The JavaScript returns an array where the first element is the title, + // the second element is the article content HTML, and the remaining + // elements are image URLs referenced in the HTML. + switch (i) { + case 0: + page_info->title = item; + break; + case 1: + page_info->html = item; + break; + case 2: { + page_info->next_page_url = item; + break; + } + default: + page_info->image_urls.push_back(item); + } + } + base::MessageLoop::current()->PostTask( + FROM_HERE, + base::Bind(page_distiller_callback_, base::Passed(&page_info), true)); + } +} + +} // namespace dom_distiller diff --git a/components/dom_distiller/core/page_distiller.h b/components/dom_distiller/core/page_distiller.h new file mode 100644 index 0000000..4e32bea --- /dev/null +++ b/components/dom_distiller/core/page_distiller.h @@ -0,0 +1,70 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_ +#define COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_ + +#include <string> +#include <vector> + +#include "base/callback.h" +#include "base/memory/scoped_ptr.h" +#include "base/values.h" +#include "components/dom_distiller/core/distiller_page.h" +#include "url/gurl.h" + +namespace dom_distiller { + +class DistillerImpl; + +struct DistilledPageInfo { + std::string title; + std::string html; + std::string next_page_url; + std::vector<std::string> image_urls; + DistilledPageInfo(); + ~DistilledPageInfo(); + + private: + DISALLOW_COPY_AND_ASSIGN(DistilledPageInfo); +}; + +// Distills a single page of an article. +class PageDistiller : public DistillerPage::Delegate { + public: + typedef base::Callback<void(scoped_ptr<DistilledPageInfo> distilled_page, + bool distillation_successful)> + PageDistillerCallback; + explicit PageDistiller(const DistillerPageFactory& distiller_page_factory); + virtual ~PageDistiller(); + + // Creates an execution context. This must be called once before any calls are + // made to distill the page. + virtual void Init(); + + // Distills the |url| and posts the |callback| with results. + virtual void DistillPage(const GURL& url, + const PageDistillerCallback& callback); + + // DistillerPage::Delegate + virtual void OnLoadURLDone() OVERRIDE; + virtual void OnExecuteJavaScriptDone(const GURL& page_url, + const base::Value* value) OVERRIDE; + + private: + virtual void LoadURL(const GURL& url); + + // Injects JavaScript to distill a loaded page down to its important content, + // e.g., extracting a news article from its surrounding boilerplate. + void GetDistilledContent(); + + scoped_ptr<DistillerPage> distiller_page_; + PageDistillerCallback page_distiller_callback_; + + DISALLOW_COPY_AND_ASSIGN(PageDistiller); +}; + +} // namespace dom_distiller + +#endif // COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_ diff --git a/components/dom_distiller/core/proto/distilled_article.proto b/components/dom_distiller/core/proto/distilled_article.proto new file mode 100644 index 0000000..54284bf --- /dev/null +++ b/components/dom_distiller/core/proto/distilled_article.proto @@ -0,0 +1,21 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Distilled article content. + +syntax = "proto2"; + +option optimize_for = LITE_RUNTIME; + +package dom_distiller; + +import "distilled_page.proto"; + +message DistilledArticleProto { + // A title for the article. + optional string title = 1; + + // The distilled pages in the article. + repeated DistilledPageProto pages = 2; +} diff --git a/components/dom_distiller/core/proto/distilled_page.proto b/components/dom_distiller/core/proto/distilled_page.proto index c6f8159..ba71617 100644 --- a/components/dom_distiller/core/proto/distilled_page.proto +++ b/components/dom_distiller/core/proto/distilled_page.proto @@ -14,9 +14,6 @@ message DistilledPageProto { // The URLs used to create this page. required string url = 1; - // A title for the distilled page. - optional string title = 2; - // The distilled HTML. required string html = 3; diff --git a/components/dom_distiller/core/task_tracker.cc b/components/dom_distiller/core/task_tracker.cc index ccd5656..1afd482 100644 --- a/components/dom_distiller/core/task_tracker.cc +++ b/components/dom_distiller/core/task_tracker.cc @@ -5,6 +5,8 @@ #include "components/dom_distiller/core/task_tracker.h" #include "base/message_loop/message_loop.h" +#include "components/dom_distiller/core/proto/distilled_article.pb.h" +#include "components/dom_distiller/core/proto/distilled_page.pb.h" namespace dom_distiller { @@ -20,7 +22,8 @@ ViewerHandle::~ViewerHandle() { TaskTracker::TaskTracker(const ArticleEntry& entry, CancelCallback callback) : cancel_callback_(callback), entry_(entry), - distilled_page_(), + distilled_article_(), + distillation_complete_(false), weak_ptr_factory_(this) {} TaskTracker::~TaskTracker() { DCHECK(viewers_.empty()); } @@ -51,7 +54,7 @@ void TaskTracker::StartBlobFetcher() { void TaskTracker::AddSaveCallback(const SaveCallback& callback) { DCHECK(!callback.is_null()); save_callbacks_.push_back(callback); - if (distilled_page_) { + if (distillation_complete_) { // Distillation for this task has already completed, and so it can be // immediately saved. ScheduleSaveCallbacks(true); @@ -60,7 +63,7 @@ void TaskTracker::AddSaveCallback(const SaveCallback& callback) { scoped_ptr<ViewerHandle> TaskTracker::AddViewer(ViewRequestDelegate* delegate) { viewers_.push_back(delegate); - if (distilled_page_) { + if (distillation_complete_) { // Distillation for this task has already completed, and so the delegate can // be immediately told of the result. base::MessageLoop::current()->PostTask( @@ -120,12 +123,10 @@ void TaskTracker::ScheduleSaveCallbacks(bool distillation_succeeded) { void TaskTracker::DoSaveCallbacks(bool distillation_succeeded) { if (!save_callbacks_.empty()) { - DistilledPageProto* distilled_proto = - distillation_succeeded ? distilled_page_.get() : NULL; - for (size_t i = 0; i < save_callbacks_.size(); ++i) { DCHECK(!save_callbacks_[i].is_null()); - save_callbacks_[i].Run(entry_, distilled_proto, distillation_succeeded); + save_callbacks_[i].Run( + entry_, distilled_article_.get(), distillation_succeeded); } save_callbacks_.clear(); @@ -134,21 +135,33 @@ void TaskTracker::DoSaveCallbacks(bool distillation_succeeded) { } void TaskTracker::NotifyViewer(ViewRequestDelegate* delegate) { - DCHECK(distilled_page_); - delegate->OnArticleReady(distilled_page_.get()); + DCHECK(distillation_complete_); + delegate->OnArticleReady(distilled_article_.get()); } -void TaskTracker::OnDistilledDataReady(scoped_ptr<DistilledPageProto> proto) { - distilled_page_ = proto.Pass(); - DCHECK(distilled_page_); +void TaskTracker::OnDistilledDataReady( + scoped_ptr<DistilledArticleProto> distilled_article) { + distilled_article_ = distilled_article.Pass(); + bool distillation_successful = false; + if (distilled_article_->pages_size() > 0) { + distillation_successful = true; + entry_.set_title(distilled_article_->title()); + // Reset the pages. + entry_.clear_pages(); + for (int i = 0; i < distilled_article_->pages_size(); ++i) { + sync_pb::ArticlePage* page = entry_.add_pages(); + page->set_url(distilled_article_->pages(i).url()); + } + } + + distillation_complete_ = true; - entry_.set_title(distilled_page_->title()); for (size_t i = 0; i < viewers_.size(); ++i) { NotifyViewer(viewers_[i]); } // Already inside a callback run SaveCallbacks directly. - DoSaveCallbacks(true); + DoSaveCallbacks(distillation_successful); } } // namespace dom_distiller diff --git a/components/dom_distiller/core/task_tracker.h b/components/dom_distiller/core/task_tracker.h index f545dd9..c4098fe 100644 --- a/components/dom_distiller/core/task_tracker.h +++ b/components/dom_distiller/core/task_tracker.h @@ -19,7 +19,7 @@ class GURL; namespace dom_distiller { -class DistilledPageProto; +class DistilledArticleProto; // A handle to a request to view a DOM distiller entry or URL. The request will // be cancelled when the handle is destroyed. @@ -41,10 +41,10 @@ class ViewRequestDelegate { public: virtual ~ViewRequestDelegate() {} // Called when the distilled article contents are available. The - // DistilledPageProto is owned by a TaskTracker instance and is invalidated + // DistilledArticleProto is owned by a TaskTracker instance and is invalidated // when the corresponding ViewerHandle is destroyed (or when the // DomDistillerService is destroyed). - virtual void OnArticleReady(DistilledPageProto* proto) = 0; + virtual void OnArticleReady(const DistilledArticleProto* article_proto) = 0; }; // A TaskTracker manages the various tasks related to viewing, saving, @@ -68,7 +68,8 @@ class ViewRequestDelegate { class TaskTracker { public: typedef base::Callback<void(TaskTracker*)> CancelCallback; - typedef base::Callback<void(const ArticleEntry&, DistilledPageProto*, bool)> + typedef base::Callback< + void(const ArticleEntry&, const DistilledArticleProto*, bool)> SaveCallback; TaskTracker(const ArticleEntry& entry, CancelCallback callback); @@ -90,7 +91,8 @@ class TaskTracker { bool HasUrl(const GURL& url) const; private: - void OnDistilledDataReady(scoped_ptr<DistilledPageProto> distilled_page); + void OnDistilledDataReady( + scoped_ptr<DistilledArticleProto> distilled_article); // Posts a task to run DoSaveCallbacks with |distillation_succeeded|. void ScheduleSaveCallbacks(bool distillation_succeeded); @@ -113,7 +115,8 @@ class TaskTracker { std::vector<ViewRequestDelegate*> viewers_; ArticleEntry entry_; - scoped_ptr<DistilledPageProto> distilled_page_; + scoped_ptr<DistilledArticleProto> distilled_article_; + bool distillation_complete_; // Note: This should remain the last member so it'll be destroyed and // invalidate its weak pointers before any other members are destroyed. diff --git a/components/dom_distiller/core/task_tracker_unittest.cc b/components/dom_distiller/core/task_tracker_unittest.cc index f659fc1..c88becb 100644 --- a/components/dom_distiller/core/task_tracker_unittest.cc +++ b/components/dom_distiller/core/task_tracker_unittest.cc @@ -18,7 +18,8 @@ namespace test { class FakeViewRequestDelegate : public ViewRequestDelegate { public: virtual ~FakeViewRequestDelegate() {} - MOCK_METHOD1(OnArticleReady, void(DistilledPageProto* proto)); + MOCK_METHOD1(OnArticleReady, + void(const DistilledArticleProto* article_proto)); }; class TestCancelCallback { @@ -36,7 +37,8 @@ class TestCancelCallback { class MockSaveCallback { public: - MOCK_METHOD3(Save, void(const ArticleEntry&, DistilledPageProto*, bool)); + MOCK_METHOD3(Save, + void(const ArticleEntry&, const DistilledArticleProto*, bool)); }; class DomDistillerTaskTrackerTest : public testing::Test { diff --git a/third_party/readability/README.chromium b/third_party/readability/README.chromium index a409176..72590db 100644 --- a/third_party/readability/README.chromium +++ b/third_party/readability/README.chromium @@ -11,3 +11,5 @@ present it in an easy-to-read form. The CSS of readability is unchanged. Local Modifications: - Removed all writes to innerHTML in the JavaScript. - Removed unneeded styles, footer, scrolling. +- Removed extraneous whitespace. +- Exposed nextPageLink as a function. diff --git a/third_party/readability/js/readability.js b/third_party/readability/js/readability.js index 68a0286..9cdbb30 100644 --- a/third_party/readability/js/readability.js +++ b/third_party/readability/js/readability.js @@ -1,14 +1,20 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Local modifications to this file are described in the README.chromium +// file. var dbg = (typeof console !== 'undefined') ? function(s) { console.log("Readability: " + s); } : function() {}; /* - * Readability. An Arc90 Lab Experiment. + * Readability. An Arc90 Lab Experiment. * Website: http://lab.arc90.com/experiments/readability * Source: http://code.google.com/p/arc90labs-readability * - * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. + * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. * * Copyright (c) 2010 Arc90 Inc * Readability is licensed under the Apache License, Version 2.0. @@ -20,6 +26,7 @@ var readability = { distilledHTML: '', distilledArticleContent: null, + nextPageLink: '', version: '1.7.1', iframeLoads: 0, @@ -41,7 +48,7 @@ var readability = { maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ - + /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. @@ -66,7 +73,7 @@ var readability = { /** * Runs readability. - * + * * Workflow: * 1. Prep the document by removing script tags, css, etc. * 2. Build readability's DOM tree. @@ -86,8 +93,11 @@ var readability = { readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; /* Pull out any possible next page link first */ - var nextPageLink = readability.findNextPageLink(document.body); - + readability.nextPageLink = readability.findNextPageLink(document.body); + + /* We handle processing of nextPage from C++ set nextPageLink to null */ + var nextPageLink = null; + readability.prepDocument(); /* Build readability's DOM tree */ @@ -152,12 +162,8 @@ var readability = { window.scrollTo(0, 0); - // TODO(bengr): Remove this assignment of null to nextPageLink when - // the processing of the next page link is safe. - nextPageLink = null; - if (nextPageLink) { - /** + /** * Append any additional pages after a small timeout so that people * can start reading without having to wait for this to finish processing. **/ @@ -179,16 +185,16 @@ var readability = { var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); if(readability.reversePageScroll) { - readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); + readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); } else { - readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); + readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); } - + return false; } }; - + document.onkeyup = function(e) { var code = (window.event) ? event.keyCode : e.keyCode; if (code === 16) { @@ -200,7 +206,7 @@ var readability = { /** * Run any post-process modifications to article content as necessary. - * + * * @param Element * @return void **/ @@ -226,7 +232,7 @@ var readability = { for(var i=0, il = images.length; i < il; i+=1) { var image = images[i]; - + if(image.offsetWidth > imageWidthThreshold) { image.className += " blockImage"; } @@ -242,7 +248,7 @@ var readability = { var articleTools = document.createElement("DIV"); articleTools.id = "readTools"; - articleTools.innerHTML = + articleTools.innerHTML = "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; @@ -259,13 +265,13 @@ var readability = { function sanitizeText() { return text.replace(/@\w+/, ""); } - + function countMatches(match) { var matches = text.match(new RegExp(match, "g")); - return matches !== null ? matches.length : 0; + return matches !== null ? matches.length : 0; } - - function isRTL() { + + function isRTL() { var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); @@ -289,15 +295,15 @@ var readability = { try { curTitle = origTitle = document.title; if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ - curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); + curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); } } catch(e) {} - + if(curTitle.match(/ [\|\-] /)) { curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); - + if(curTitle.split(' ').length < 3) { curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); } @@ -330,7 +336,7 @@ var readability = { /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. - * + * * @return void **/ prepDocument: function () { @@ -342,7 +348,7 @@ var readability = { { var body = document.createElement("body"); try { - document.body = body; + document.body = body; } catch(e) { document.documentElement.appendChild(body); @@ -374,11 +380,11 @@ var readability = { biggestFrameSize = frameSize; readability.biggestFrame = frames[frameIndex]; } - + if(canAccessFrame && frameSize > bestFrameSize) { readability.frameHack = true; - + bestFrame = frames[frameIndex]; bestFrameSize = frameSize; } @@ -390,7 +396,7 @@ var readability = { readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); newBody.style.overflow = 'scroll'; document.body = newBody; - + var frameset = document.getElementsByTagName('frameset')[0]; if(frameset) { frameset.parentNode.removeChild(frameset); } @@ -455,7 +461,7 @@ var readability = { var imgCount = articleParagraphs[i].getElementsByTagName('img').length; var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; var objectCount = articleParagraphs[i].getElementsByTagName('object').length; - + if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); } @@ -468,7 +474,7 @@ var readability = { dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); } }, - + /** * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. @@ -477,7 +483,7 @@ var readability = { * @return void **/ initializeNode: function (node) { - node.readability = {"contentScore": 0}; + node.readability = {"contentScore": 0}; switch(node.tagName) { case 'DIV': @@ -489,7 +495,7 @@ var readability = { case 'BLOCKQUOTE': node.readability.contentScore += 3; break; - + case 'ADDRESS': case 'OL': case 'UL': @@ -511,10 +517,10 @@ var readability = { node.readability.contentScore -= 5; break; } - + node.readability.contentScore += readability.getClassWeight(node); }, - + /*** * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. @@ -525,7 +531,7 @@ var readability = { grabArticle: function (pageToClone) { var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), isPaging = (page !== null) ? true: false; - + var page = null; // Never work on the actual page. if (isPaging) { @@ -533,7 +539,7 @@ var readability = { } else { page = pageToClone.cloneNode(true); } - + var allElements = page.getElementsByTagName('*'); /** @@ -561,7 +567,7 @@ var readability = { node.parentNode.removeChild(node); nodeIndex-=1; continue; - } + } } if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { @@ -598,7 +604,7 @@ var readability = { } } } - } + } } /** @@ -640,15 +646,15 @@ var readability = { /* Add points for any commas within this paragraph */ contentScore += innerText.split(',').length; - + /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ contentScore += Math.min(Math.floor(innerText.length / 100), 3); - + /* Add the score to the parent. The grandparent gets half. */ parentNode.readability.contentScore += contentScore; if(grandParentNode) { - grandParentNode.readability.contentScore += contentScore/2; + grandParentNode.readability.contentScore += contentScore/2; } } @@ -725,12 +731,12 @@ var readability = { { append = true; } - + if(siblingNode.nodeName === "P") { var linkDensity = readability.getLinkDensity(siblingNode); var nodeContent = readability.getInnerText(siblingNode); var nodeLength = nodeContent.length; - + if(nodeLength > 80 && linkDensity < 0.25) { append = true; @@ -747,7 +753,7 @@ var readability = { var nodeToAppend = null; if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - + dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); nodeToAppend = document.createElement("DIV"); try { @@ -765,7 +771,7 @@ var readability = { s-=1; sl-=1; } - + /* To ensure a node does not interfere with readability styles, remove its classnames */ nodeToAppend.className = ""; @@ -779,15 +785,15 @@ var readability = { **/ readability.distilledArticleContent = articleContent.cloneNode(true); //readability.prepArticle(articleContent); - + if (readability.curPageNum === 1) { var newNode = document.createElement('div'); newNode.id = "readability-page-1"; newNode.setAttribute("class", "page"); readability.moveNodeInnards(articleContent, newNode); articleContent.appendChild(newNode); - } - + } + /** * Now that we've gone through the full algorithm, check to see if we got any meaningful content. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher @@ -813,7 +819,7 @@ var readability = { return articleContent; }, - + /** * Removes script tags from the document. * @@ -828,12 +834,12 @@ var readability = { scripts[i].nodeValue=""; scripts[i].removeAttribute('src'); if (scripts[i].parentNode) { - scripts[i].parentNode.removeChild(scripts[i]); + scripts[i].parentNode.removeChild(scripts[i]); } } } }, - + /** * Get the inner text of a node - cross browser compatibly. * This also strips out any excess whitespace to be found. @@ -896,18 +902,18 @@ var readability = { if ( cur.nodeType === 1 ) { // Remove style attribute(s) : if(cur.className !== "readability-styled") { - cur.removeAttribute("style"); + cur.removeAttribute("style"); } readability.cleanStyles( cur ); } cur = cur.nextSibling; - } + } }, - + /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. - * + * * @param Element * @return number (float) **/ @@ -918,11 +924,11 @@ var readability = { for(var i=0, il=links.length; i<il;i+=1) { linkLength += readability.getInnerText(links[i]).length; - } + } return linkLength / textLength; }, - + /** * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. * @@ -944,10 +950,10 @@ var readability = { /* If the type isn't alpha-only, it's probably not actually a file extension. */ if(!possibleType.match(/[^a-zA-Z]/)) { - segment = segment.split(".")[0]; + segment = segment.split(".")[0]; } } - + /** * EW-CMS specific segment replacement. Ugly. * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html @@ -968,7 +974,7 @@ var readability = { if (i < 2 && segment.match(/^\d{1,2}$/)) { del = true; } - + /* If this is the first segment and it's just "index", remove it. */ if(i === 0 && segment.toLowerCase() === "index") { del = true; @@ -992,7 +998,7 @@ var readability = { /** * Look for any paging links that may occur within the document. - * + * * @param body * @return object (array) **/ @@ -1008,7 +1014,7 @@ var readability = { * * Also possible: levenshtein distance? longest common subsequence? * - * After we do that, assign each page a score, and + * After we do that, assign each page a score, and **/ for(var i = 0, il = allLinks.length; i < il; i+=1) { var link = allLinks[i], @@ -1018,12 +1024,12 @@ var readability = { if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) { continue; } - + /* If it's on a different domain, skip it. */ if(window.location.host !== linkHref.split(/\/+/g)[1]) { continue; } - + var linkText = readability.getInnerText(link); /* If the linkText looks like it's not the next page, skip it. */ @@ -1036,9 +1042,9 @@ var readability = { if(!linkHrefLeftover.match(/\d/)) { continue; } - + if(!(linkHref in possiblePages)) { - possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; + possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; } else { possiblePages[linkHref].linkText += ' | ' + linkText; } @@ -1060,7 +1066,7 @@ var readability = { if(linkData.match(/pag(e|ing|inat)/i)) { linkObj.score += 25; } - if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, + if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ if(!linkObj.linkText.match(readability.regexps.nextLink)) { linkObj.score -= 65; @@ -1087,10 +1093,10 @@ var readability = { /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ if(!parentNodeClassAndId.match(readability.regexps.positive)) { linkObj.score -= 25; - negativeNodeMatch = true; + negativeNodeMatch = true; } } - + parentNode = parentNode.parentNode; } @@ -1152,7 +1158,7 @@ var readability = { dbg('NEXT PAGE IS ' + nextHref); readability.parsedPages[nextHref] = true; - return nextHref; + return nextHref; } else { return null; @@ -1204,7 +1210,7 @@ var readability = { if (typeof options === 'undefined') { options = {}; } request.onreadystatechange = respondToReadyState; - + request.open('get', url, true); request.setRequestHeader('Accept', 'text/html'); @@ -1239,7 +1245,7 @@ var readability = { articlePage.appendChild(linkDiv); return; } - + /** * Now that we've built the article page DOM element, get the page content * asynchronously and load the cleaned content into the div we created for it. @@ -1257,7 +1263,7 @@ var readability = { return; } else { readability.pageETags[eTag] = 1; - } + } } // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. @@ -1308,7 +1314,7 @@ var readability = { } } } - + readability.removeScripts(content); readability.moveNodeInnards(content, thisPage); @@ -1330,9 +1336,9 @@ var readability = { }); }(nextPageLink, articlePage)); }, - + /** - * Get an elements class/id weight. Uses regular expressions to tell if this + * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. * * @param Element @@ -1382,7 +1388,7 @@ var readability = { var allElements = e.getElementsByTagName('*'); while (i < allElements.length) { readability.deleteExtraBreaks(allElements[i]); - i++; + i++; } }, @@ -1397,7 +1403,7 @@ var readability = { clean: function (e, tag) { var targetList = e.getElementsByTagName( tag ); var isEmbed = (tag === 'object' || tag === 'embed'); - + for (var y=targetList.length-1; y >= 0; y-=1) { /* Allow youtube and vimeo videos through as people usually want to see those. */ if(isEmbed) { @@ -1405,7 +1411,7 @@ var readability = { for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { attributeValues += targetList[y].attributes[i].value + '|'; } - + /* First, check the elements attributes to see if any of them contain youtube or vimeo */ if (attributeValues.search(readability.regexps.videos) !== -1) { continue; @@ -1415,13 +1421,13 @@ var readability = { if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { continue; } - + } targetList[y].parentNode.removeChild(targetList[y]); } }, - + /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. @@ -1446,7 +1452,7 @@ var readability = { for (var i=curTagsLength-1; i >= 0; i-=1) { var weight = readability.getClassWeight(tagsList[i]); var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; - + dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); if(weight+contentScore < 0) @@ -1467,7 +1473,7 @@ var readability = { var embeds = tagsList[i].getElementsByTagName("embed"); for(var ei=0,il=embeds.length; ei < il; ei+=1) { if (embeds[ei].src.search(readability.regexps.videos) === -1) { - embedCount+=1; + embedCount+=1; } } @@ -1480,7 +1486,7 @@ var readability = { } else if(li > p && tag !== "ul" && tag !== "ol") { toRemove = true; } else if( input > Math.floor(p/3) ) { - toRemove = true; + toRemove = true; } else if(contentLength < 25 && (img === 0 || img > 2) ) { toRemove = true; } else if(weight < 25 && linkDensity > 0.2) { @@ -1522,7 +1528,7 @@ var readability = { addFlag: function(flag) { readability.flags = readability.flags | flag; }, - + removeFlag: function(flag) { readability.flags = readability.flags & ~flag; }, @@ -1591,7 +1597,7 @@ var readability = { } return ret; }, - + // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a // <P> node, and makes all next siblings of that pair children of <P>, up // until the next pair of <BR> nodes is reached. @@ -1600,7 +1606,7 @@ var readability = { var second = readability.isMultipleBr(node, true); if (!second) { return; - } + } // Make all next siblings of the second BR into children of a P. var p = document.createElement('p'); var curr = second.nextSibling; @@ -1613,7 +1619,7 @@ var readability = { curr = next; } var ret = curr; - + // Remove all nodes between the first and second BR. curr = node.nextSibling; while (curr && curr != second) { @@ -1625,10 +1631,10 @@ var readability = { second.parentNode.removeChild(second); // Replace the first BR with the P. node.parentNode.replaceChild(p, node); - + return ret; }, - + // Returns true if the NodeList contains a double <BR>. hasDoubleBr: function(nodeList) { for (var i = 0; i < nodeList.length; nodeList++) { @@ -1637,8 +1643,8 @@ var readability = { } } return false; - }, - + }, + // Replaces double <BR> tags with <P> tags. replaceDoubleBrsWithPs: function(node) { var allElements = node.getElementsByTagName('BR'); @@ -1652,8 +1658,8 @@ var readability = { allElements = document.body.getElementsByTagName('BR'); } }, - - + + // Replaces a BR and the whitespace that follows it with a P. replaceBrWithP: function(node) { if (!readability.isBrNode(node)) { @@ -1673,7 +1679,7 @@ var readability = { node.parentNode.replaceChild(p, node); return curr; }, - + // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag // children of the <P>. replaceBrsWithPs: function(node) { @@ -1687,27 +1693,27 @@ var readability = { allElements = document.body.getElementsByTagName('BR'); } }, - + // Replaces any tag with any other tag. replaceTagsWithTags: function(node, srcTag, destTag) { var allElements = node.getElementsByTagName(srcTag); for (var i = 0; i < allElements.length; i++) { var dest = document.createElement(destTag); readability.moveNodeInnards(allElements[i], dest); - node.replaceNode(dest, allElements[i]); + allElements[i].parentNode.replaceChild(dest, allElements[i]); } }, - + // Replaces all <noscript> tags with <p> tags. replaceNoscriptsWithPs: function(node) { readability.replaceTagsWithTags(node, 'noscript', 'p'); }, - + // Replaces all <font> tags with <span> tags. replaceFontsWithSpans: function(node) { readability.replaceTagsWithTags(node, 'font', 'span'); }, - + // Returns a list of image URLs in the distilled article. getImages : function() { var images = document.getElementsByTagName('img'); @@ -1719,10 +1725,15 @@ var readability = { } return result; }, - + // Returns the distilled article HTML from the page(s). getDistilledArticleHTML : function() { return readability.distilledHTML; + }, + + // Returns the next page of this article. + getNextPageLink : function() { + return readability.nextPageLink; } }; @@ -1730,12 +1741,13 @@ var readability = { // element is the article title, the second element is HTML containing the // long-form content, and remaining elements are URLs for images referenced by // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which -// corresponds to a URL listed at index k in the array returned. +// corresponds to a URL listed at index k in the array returned. (function () { readability.init(); - var result = new Array(2); + var result = new Array(3); result[0] = readability.getArticleTitle(); result[1] = readability.getDistilledArticleHTML(); + result[2] = readability.getNextPageLink(); return result.concat(readability.getImages()); }()) |