Add support for multipage distillation.

Add support for distilling multiple pages, currently support is really basic and just distills pages sequentially. BUG=288015 TEST=included NOTRY=true R=cjhopman@chromium.org Review URL: https://codereview.chromium.org/146843010 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@248867 0039d316-1c4b-4281-b951-d872f2087c98
author: shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2014-02-05 05:56:20 +0000
committer: shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2014-02-05 05:56:20 +0000
commit: 415f99b18e6f14c53e08f6574a3bc653049fbaae (patch)
tree: 6039149dc08176419753d850eba0cc74ff1b1a43
parent: 640e10a877584a90c8200490c41c937501cca5a3 (diff)
download: chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.zip
chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.gz
chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.bz2
23 files changed, 772 insertions, 297 deletions
diff --git a/components/dom_distiller.gypi b/components/dom_distiller.gypi
index 8cb8901..c22d5c0 100644
--- a/components/dom_distiller.gypi
+++ b/components/dom_distiller.gypi
@@ -84,6 +84,8 @@
             'dom_distiller/core/dom_distiller_service.h',
             'dom_distiller/core/dom_distiller_store.cc',
             'dom_distiller/core/dom_distiller_store.h',
+            'dom_distiller/core/page_distiller.cc',
+            'dom_distiller/core/page_distiller.h',
             'dom_distiller/core/task_tracker.cc',
             'dom_distiller/core/task_tracker.h',
           ],
@@ -112,6 +114,7 @@
           'target_name': 'distilled_page_proto',
           'type': 'static_library',
           'sources': [
+            'dom_distiller/core/proto/distilled_article.proto',
             'dom_distiller/core/proto/distilled_page.proto',
           ],
           'variables': {
diff --git a/components/dom_distiller/content/distiller_page_web_contents.cc b/components/dom_distiller/content/distiller_page_web_contents.cc
index c0c7ff5..06bd6bc 100644
--- a/components/dom_distiller/content/distiller_page_web_contents.cc
+++ b/components/dom_distiller/content/distiller_page_web_contents.cc
@@ -55,16 +55,22 @@ void DistillerPageWebContents::ExecuteJavaScriptImpl(
       base::string16(),  // frame_xpath
       base::UTF8ToUTF16(script),
       base::Bind(&DistillerPage::OnExecuteJavaScriptDone,
-                 base::Unretained(this)));
+                 base::Unretained(this),
+                 web_contents_->GetLastCommittedURL()));
 }
 
 void DistillerPageWebContents::DidFinishLoad(int64 frame_id,
                                              const GURL& validated_url,
                                              bool is_main_frame,
                                              RenderViewHost* render_view_host) {
-  content::WebContentsObserver::Observe(NULL);
-  OnLoadURLDone();
+  // TODO(shashishekhar): Find a better way to detect when it is safe to run the
+  // distillation script. Waiting for the entire page to load is really slow.
+  if (is_main_frame) {
+    content::WebContentsObserver::Observe(NULL);
+    OnLoadURLDone();
+  }
 }
+
 void DistillerPageWebContents::DidFailLoad(
     int64 frame_id,
     const GURL& validated_url,
@@ -72,8 +78,10 @@ void DistillerPageWebContents::DidFailLoad(
     int error_code,
     const base::string16& error_description,
     RenderViewHost* render_view_host) {
-  content::WebContentsObserver::Observe(NULL);
-  OnLoadURLFailed();
+  if (is_main_frame) {
+    content::WebContentsObserver::Observe(NULL);
+    OnLoadURLFailed();
+  }
 }
 
 }  // namespace dom_distiller
diff --git a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc
index 42446e2..c90bd96 100644
--- a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc
+++ b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc
@@ -48,7 +48,8 @@ class DistillerPageWebContentsTest
     distiller_page_->ExecuteJavaScript(kScript);
   }
 
-  virtual void OnExecuteJavaScriptDone(const base::Value* value) OVERRIDE {
+  virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+                                       const base::Value* value) OVERRIDE {
     value_ = value->DeepCopy();
     quit_closure_.Run();
   }
diff --git a/components/dom_distiller/content/dom_distiller_viewer_source.cc b/components/dom_distiller/content/dom_distiller_viewer_source.cc
index 2f23a07..8e913d5 100644
--- a/components/dom_distiller/content/dom_distiller_viewer_source.cc
+++ b/components/dom_distiller/content/dom_distiller_viewer_source.cc
@@ -4,6 +4,7 @@
 
 #include "components/dom_distiller/content/dom_distiller_viewer_source.h"
 
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -12,6 +13,7 @@
 #include "base/message_loop/message_loop.h"
 #include "base/strings/string_util.h"
 #include "components/dom_distiller/core/dom_distiller_service.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
 #include "components/dom_distiller/core/task_tracker.h"
 #include "content/public/browser/render_frame_host.h"
@@ -53,7 +55,8 @@ class RequestViewerHandle : public ViewRequestDelegate {
   virtual ~RequestViewerHandle();
 
   // ViewRequestDelegate implementation.
-  virtual void OnArticleReady(DistilledPageProto* proto) OVERRIDE;
+  virtual void OnArticleReady(const DistilledArticleProto* article_proto)
+      OVERRIDE;
 
   void TakeViewerHandle(scoped_ptr<ViewerHandle> viewer_handle);
 
@@ -72,13 +75,21 @@ RequestViewerHandle::RequestViewerHandle(
 
 RequestViewerHandle::~RequestViewerHandle() {}
 
-void RequestViewerHandle::OnArticleReady(DistilledPageProto* proto) {
-  DCHECK(proto);
+void RequestViewerHandle::OnArticleReady(
+    const DistilledArticleProto* article_proto) {
+  DCHECK(article_proto);
   std::string title;
   std::string unsafe_article_html;
-  if (proto->has_title() && proto->has_html()) {
-    title = net::EscapeForHTML(proto->title());
-    unsafe_article_html = proto->html();
+  if (article_proto->has_title() && article_proto->pages_size() > 0 &&
+      article_proto->pages(0).has_html()) {
+    title = net::EscapeForHTML(article_proto->title());
+    // TODO(shashishekhar): Add support for correcting displaying multiple pages
+    // after discussing the right way to display them.
+    std::ostringstream unsafe_output_stream;
+    for (int page_num = 0; page_num < article_proto->pages_size(); ++page_num) {
+      unsafe_output_stream << article_proto->pages(page_num).html();
+    }
+    unsafe_article_html = unsafe_output_stream.str();
   } else {
     title = l10n_util::GetStringUTF8(IDS_DOM_DISTILLER_VIEWER_NO_DATA_TITLE);
     unsafe_article_html =
diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc
index 84b8a05..7b4c92a 100644
--- a/components/dom_distiller/core/distiller.cc
+++ b/components/dom_distiller/core/distiller.cc
@@ -8,16 +8,19 @@
 
 #include "base/bind.h"
 #include "base/callback.h"
-#include "base/strings/stringprintf.h"
+#include "base/strings/string_number_conversions.h"
 #include "base/strings/utf_string_conversions.h"
 #include "base/values.h"
 #include "components/dom_distiller/core/distiller_page.h"
 #include "components/dom_distiller/core/distiller_url_fetcher.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
-#include "grit/dom_distiller_resources.h"
 #include "net/url_request/url_request_context_getter.h"
-#include "ui/base/resource/resource_bundle.h"
-#include "url/gurl.h"
+
+namespace {
+// Maximum number of distilled pages in an article.
+const int kMaxPagesInArticle = 32;
+}
 
 namespace dom_distiller {
 
@@ -39,90 +42,101 @@ scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
 DistillerImpl::DistillerImpl(
     const DistillerPageFactory& distiller_page_factory,
     const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
-  : distiller_page_factory_(distiller_page_factory),
-    distiller_url_fetcher_factory_(distiller_url_fetcher_factory) {
-  distiller_page_ = distiller_page_factory_.CreateDistillerPage(this).Pass();
+    : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
+      distillation_in_progress_(false) {
+  page_distiller_.reset(new PageDistiller(distiller_page_factory));
 }
 
 DistillerImpl::~DistillerImpl() {
 }
 
 void DistillerImpl::Init() {
-  distiller_page_->Init();
+  DCHECK(!distillation_in_progress_);
+  page_distiller_->Init();
+  article_proto_.reset(new DistilledArticleProto());
 }
 
 void DistillerImpl::DistillPage(const GURL& url,
                             const DistillerCallback& distillation_cb) {
+  DCHECK(!distillation_in_progress_);
   distillation_cb_ = distillation_cb;
-  proto_.reset(new DistilledPageProto());
-  proto_->set_url(url.spec());
-  LoadURL(url);
+  DistillPage(url);
 }
 
-void DistillerImpl::LoadURL(const GURL& url) {
-  distiller_page_->LoadURL(url);
+void DistillerImpl::DistillPage(const GURL& url) {
+  DCHECK(!distillation_in_progress_);
+  if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle &&
+      processed_urls_.find(url.spec()) == processed_urls_.end()) {
+    distillation_in_progress_ = true;
+    // Distill the next page.
+    DCHECK(url.is_valid());
+    DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle);
+    page_distiller_->DistillPage(
+        url,
+        base::Bind(&DistillerImpl::OnPageDistillationFinished,
+                   base::Unretained(this),
+                   url));
+  } else {
+    RunDistillerCallbackIfDone();
+  }
 }
 
-void DistillerImpl::OnLoadURLDone() {
-  GetDistilledContent();
-}
+void DistillerImpl::OnPageDistillationFinished(
+    const GURL& page_url,
+    scoped_ptr<DistilledPageInfo> distilled_page,
+    bool distillation_successful) {
+  DCHECK(distillation_in_progress_);
+  DCHECK(distilled_page.get());
+  if (!distillation_successful) {
+    RunDistillerCallbackIfDone();
+  } else {
+    DistilledPageProto* current_page = article_proto_->add_pages();
+    // Set the title of the article as the title of the first page.
+    if (article_proto_->pages_size() == 1) {
+      article_proto_->set_title(distilled_page->title);
+    }
 
-void DistillerImpl::GetDistilledContent() {
-  std::string script =
-      ResourceBundle::GetSharedInstance().GetRawDataResource(
-          IDR_DISTILLER_JS).as_string();
-  distiller_page_->ExecuteJavaScript(script);
-}
+    current_page->set_url(page_url.spec());
+    current_page->set_html(distilled_page->html);
 
-void DistillerImpl::OnExecuteJavaScriptDone(const base::Value* value) {
-  std::string result;
-  bool fetched_image = false;
-  const base::ListValue* result_list = NULL;
-  if (!value->GetAsList(&result_list)) {
-    DCHECK(proto_);
-    distillation_cb_.Run(proto_.Pass());
-    return;
-  }
-  int i = 0;
-  for (base::ListValue::const_iterator iter = result_list->begin();
-      iter != result_list->end(); ++iter, ++i) {
-    std::string item;
-    (*iter)->GetAsString(&item);
-    // The JavaScript returns an array where the first element is the title,
-    // the second element is the article content HTML, and the remaining
-    // elements are image URLs referenced in the HTML.
-    switch (i) {
-      case 0:
-        proto_->set_title(item);
-        break;
-      case 1:
-        proto_->set_html(item);
-        break;
-      default:
-        int image_number = i - 2;
-        std::string image_id = base::StringPrintf("%d", image_number);
-        FetchImage(image_id, item);
-        fetched_image = true;
+    GURL next_page_url(distilled_page->next_page_url);
+    if (next_page_url.is_valid()) {
+      // The pages should be in same origin.
+      DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
+    }
+
+    processed_urls_.insert(page_url.spec());
+    distillation_in_progress_ = false;
+    int page_number = article_proto_->pages_size();
+    for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
+         ++img_num) {
+      std::string image_id =
+          base::IntToString(page_number) + "_" + base::IntToString(img_num);
+      FetchImage(current_page, image_id, distilled_page->image_urls[img_num]);
     }
+    DistillPage(next_page_url);
   }
-  if (!fetched_image)
-    distillation_cb_.Run(proto_.Pass());
 }
 
-void DistillerImpl::FetchImage(const std::string& image_id,
+void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto,
+                               const std::string& image_id,
                                const std::string& item) {
   DistillerURLFetcher* fetcher =
       distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
   image_fetchers_[image_id] = fetcher;
   fetcher->FetchURL(item,
                     base::Bind(&DistillerImpl::OnFetchImageDone,
-                               base::Unretained(this), image_id));
+                               base::Unretained(this),
+                               base::Unretained(distilled_page_proto),
+                               image_id));
 }
 
-void DistillerImpl::OnFetchImageDone(const std::string& id,
+void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto,
+                                     const std::string& id,
                                      const std::string& response) {
-  DCHECK(proto_);
-  DistilledPageProto_Image* image = proto_->add_image();
+  DCHECK_GT(article_proto_->pages_size(), 0);
+  DCHECK(distilled_page_proto);
+  DistilledPageProto_Image* image = distilled_page_proto->add_image();
   image->set_name(id);
   image->set_data(response);
   DCHECK(image_fetchers_.end() != image_fetchers_.find(id));
@@ -130,8 +144,12 @@ void DistillerImpl::OnFetchImageDone(const std::string& id,
   int result = image_fetchers_.erase(id);
   delete fetcher;
   DCHECK_EQ(1, result);
-  if (image_fetchers_.empty()) {
-    distillation_cb_.Run(proto_.Pass());
+  RunDistillerCallbackIfDone();
+}
+
+void DistillerImpl::RunDistillerCallbackIfDone() {
+  if (image_fetchers_.empty() && !distillation_in_progress_) {
+    distillation_cb_.Run(article_proto_.Pass());
   }
 }
 
diff --git a/components/dom_distiller/core/distiller.h b/components/dom_distiller/core/distiller.h
index adbd41b..9ba5723 100644
--- a/components/dom_distiller/core/distiller.h
+++ b/components/dom_distiller/core/distiller.h
@@ -5,16 +5,14 @@
 #ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
 #define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
 
-#include <map>
 #include <string>
 
 #include "base/callback.h"
-#include "base/gtest_prod_util.h"
-#include "base/memory/ref_counted.h"
-#include "base/values.h"
-#include "components/dom_distiller/core/distiller_page.h"
+#include "base/containers/hash_tables.h"
+#include "base/memory/scoped_ptr.h"
 #include "components/dom_distiller/core/distiller_url_fetcher.h"
-#include "components/dom_distiller/core/proto/distilled_page.pb.h"
+#include "components/dom_distiller/core/page_distiller.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
 #include "net/url_request/url_request_context_getter.h"
 #include "url/gurl.h"
 
@@ -24,8 +22,8 @@ class DistillerImpl;
 
 class Distiller {
  public:
-  typedef base::Callback<void(
-      scoped_ptr<DistilledPageProto>)> DistillerCallback;
+  typedef base::Callback<void(scoped_ptr<DistilledArticleProto>)>
+      DistillerCallback;
   virtual ~Distiller() {}
 
   // Distills a page, and asynchrounously returns the article HTML to the
@@ -55,8 +53,7 @@ class DistillerFactoryImpl : public DistillerFactory {
 };
 
 // Distills a article from a page and associated pages.
-class DistillerImpl : public Distiller,
-                      public DistillerPage::Delegate {
+class DistillerImpl : public Distiller {
  public:
   DistillerImpl(
       const DistillerPageFactory& distiller_page_factory,
@@ -70,28 +67,35 @@ class DistillerImpl : public Distiller,
   virtual void DistillPage(const GURL& url,
                            const DistillerCallback& callback) OVERRIDE;
 
-  // PageDistillerContext::Delegate
-  virtual void OnLoadURLDone() OVERRIDE;
-  virtual void OnExecuteJavaScriptDone(const base::Value* value) OVERRIDE;
+ private:
+  void OnFetchImageDone(DistilledPageProto* distilled_page_proto,
+                        const std::string& id,
+                        const std::string& response);
 
-  void OnFetchImageDone(const std::string& id, const std::string& response);
+  void OnPageDistillationFinished(const GURL& page_url,
+                                  scoped_ptr<DistilledPageInfo> distilled_page,
+                                  bool distillation_successful);
 
- private:
-  virtual void LoadURL(const GURL& url);
-  virtual void FetchImage(const std::string& image_id, const std::string& item);
+  virtual void FetchImage(DistilledPageProto* distilled_page_proto,
+                          const std::string& image_id,
+                          const std::string& item);
 
-  // Injects JavaScript to distill a loaded page down to its important content,
-  // e.g., extracting a news article from its surrounding boilerplate.
-  void GetDistilledContent();
+  // Distills the page and adds the new page to |article_proto|.
+  void DistillPage(const GURL& url);
+
+  // Runs |distillation_cb_| if all distillation callbacks and image fetches are
+  // complete.
+  void RunDistillerCallbackIfDone();
 
-  const DistillerPageFactory& distiller_page_factory_;
   const DistillerURLFetcherFactory& distiller_url_fetcher_factory_;
-  scoped_ptr<DistillerPage> distiller_page_;
+  scoped_ptr<PageDistiller> page_distiller_;
   DistillerCallback distillation_cb_;
 
-  std::map<std::string, DistillerURLFetcher* > image_fetchers_;
-
-  scoped_ptr<DistilledPageProto> proto_;
+  base::hash_map<std::string, DistillerURLFetcher*> image_fetchers_;
+  scoped_ptr<DistilledArticleProto> article_proto_;
+  bool distillation_in_progress_;
+  // Set to keep track of which urls are already seen by the distiller.
+  base::hash_set<std::string> processed_urls_;
 
   DISALLOW_COPY_AND_ASSIGN(DistillerImpl);
 };
diff --git a/components/dom_distiller/core/distiller_page.cc b/components/dom_distiller/core/distiller_page.cc
index cc486c7..3f4bffe 100644
--- a/components/dom_distiller/core/distiller_page.cc
+++ b/components/dom_distiller/core/distiller_page.cc
@@ -52,16 +52,16 @@ void DistillerPage::OnLoadURLFailed() {
   scoped_ptr<base::Value> empty(base::Value::CreateNullValue());
   if (!delegate_)
     return;
-  delegate_->OnExecuteJavaScriptDone(empty.get());
+  delegate_->OnExecuteJavaScriptDone(GURL(), empty.get());
 }
 
-void DistillerPage::OnExecuteJavaScriptDone(
-    const base::Value* value) {
+void DistillerPage::OnExecuteJavaScriptDone(const GURL& page_url,
+                                            const base::Value* value) {
   DCHECK_EQ(EXECUTING_JAVASCRIPT, state_);
   state_ = PAGE_AVAILABLE;
   if (!delegate_)
     return;
-  delegate_->OnExecuteJavaScriptDone(value);
+  delegate_->OnExecuteJavaScriptDone(page_url, value);
 }
 
 }  // namespace dom_distiller
diff --git a/components/dom_distiller/core/distiller_page.h b/components/dom_distiller/core/distiller_page.h
index 291ef7e..4e22e8d 100644
--- a/components/dom_distiller/core/distiller_page.h
+++ b/components/dom_distiller/core/distiller_page.h
@@ -22,7 +22,8 @@ class DistillerPage {
    public:
     virtual ~Delegate() {}
     virtual void OnLoadURLDone() {}
-    virtual void OnExecuteJavaScriptDone(const base::Value* value) {}
+    virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+                                         const base::Value* value) {}
   };
 
   explicit DistillerPage(Delegate* delegate);
@@ -45,9 +46,10 @@ class DistillerPage {
   // a page is available.
   void ExecuteJavaScript(const std::string& script);
 
-  // Called when the JavaScript execution completes. |value| contains data
-  // returned by the script.
-  virtual void OnExecuteJavaScriptDone(const base::Value* value);
+  // Called when the JavaScript execution completes. |page_url| is the url of
+  // the distilled page. |value| contains data returned by the script.
+  virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+                                       const base::Value* value);
 
  protected:
   enum State {
diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc
index 44c2986..3865aef 100644
--- a/components/dom_distiller/core/distiller_unittest.cc
+++ b/components/dom_distiller/core/distiller_unittest.cc
@@ -3,20 +3,26 @@
 // found in the LICENSE file.
 
 #include <map>
+#include <string>
+#include <vector>
 
 #include "base/bind.h"
 #include "base/bind_helpers.h"
 #include "base/location.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/message_loop/message_loop.h"
+#include "base/strings/string_number_conversions.h"
 #include "base/values.h"
 #include "components/dom_distiller/core/distiller.h"
 #include "components/dom_distiller/core/distiller_page.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
 #include "net/url_request/url_request_context_getter.h"
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
+using std::vector;
+using std::string;
 using::testing::Invoke;
 using::testing::Return;
 using::testing::_;
@@ -25,12 +31,31 @@ namespace {
   const char kTitle[] = "Title";
   const char kContent[] = "Content";
   const char kURL[] = "http://a.com/";
-  const char kId0[] = "0";
-  const char kId1[] = "1";
-  const char kImageURL0[] = "http://a.com/img1.jpg";
-  const char kImageURL1[] = "http://a.com/img2.jpg";
-  const char kImageData0[] = { 'a', 'b', 'c', 'd', 'e', 0 };
-  const char kImageData1[] = { '1', '2', '3', '4', '5', 0 };
+  const size_t kTotalImages = 2;
+  const char* kImageURLs[kTotalImages] = {"http://a.com/img1.jpg",
+                                          "http://a.com/img2.jpg"};
+  const char* kImageData[kTotalImages] = {"abcde", "12345"};
+
+  const string GetImageName(int page_num, int image_num) {
+    return base::IntToString(page_num) + "_" + base::IntToString(image_num);
+  }
+
+  scoped_ptr<base::ListValue> CreateDistilledValueReturnedFromJS(
+      const string& title,
+      const string& content,
+      const vector<int>& image_indices,
+      const string& next_page_url) {
+    scoped_ptr<base::ListValue> list(new base::ListValue());
+
+    list->AppendString(title);
+    list->AppendString(content);
+    list->AppendString(next_page_url);
+    for (size_t i = 0; i < image_indices.size(); ++i) {
+      list->AppendString(kImageURLs[image_indices[i]]);
+    }
+    return list.Pass();
+  }
+
 }  // namespace
 
 namespace dom_distiller {
@@ -38,15 +63,15 @@ namespace dom_distiller {
 class TestDistillerURLFetcher : public DistillerURLFetcher {
  public:
   TestDistillerURLFetcher() : DistillerURLFetcher(NULL) {
-    responses_[kImageURL0] = std::string(kImageData0);
-    responses_[kImageURL1] = std::string(kImageData1);
+    responses_[kImageURLs[0]] = string(kImageData[0]);
+    responses_[kImageURLs[1]] = string(kImageData[1]);
   }
 
-  void CallCallback(std::string url, const URLFetcherCallback& callback) {
+  void CallCallback(string url, const URLFetcherCallback& callback) {
     callback.Run(responses_[url]);
   }
 
-  virtual void FetchURL(const std::string& url,
+  virtual void FetchURL(const string& url,
                         const URLFetcherCallback& callback) OVERRIDE {
     ASSERT_TRUE(base::MessageLoop::current());
     base::MessageLoop::current()->PostTask(
@@ -55,7 +80,7 @@ class TestDistillerURLFetcher : public DistillerURLFetcher {
                    base::Unretained(this), url, callback));
   }
 
-  std::map<std::string, std::string> responses_;
+  std::map<string, string> responses_;
 };
 
 
@@ -73,7 +98,7 @@ class MockDistillerPage : public DistillerPage {
  public:
   MOCK_METHOD0(InitImpl, void());
   MOCK_METHOD1(LoadURLImpl, void(const GURL& gurl));
-  MOCK_METHOD1(ExecuteJavaScriptImpl, void(const std::string& script));
+  MOCK_METHOD1(ExecuteJavaScriptImpl, void(const string& script));
 
   explicit MockDistillerPage(DistillerPage::Delegate* delegate)
       : DistillerPage(delegate) {}
@@ -96,19 +121,19 @@ class MockDistillerPageFactory : public DistillerPageFactory {
 class DistillerTest : public testing::Test {
  public:
   virtual ~DistillerTest() {}
-  void OnDistillPageDone(scoped_ptr<DistilledPageProto> proto) {
-    proto_ = proto.Pass();
+  void OnDistillPageDone(scoped_ptr<DistilledArticleProto> proto) {
+    article_proto_ = proto.Pass();
   }
 
  protected:
   scoped_ptr<DistillerImpl> distiller_;
-  scoped_ptr<DistilledPageProto> proto_;
+  scoped_ptr<DistilledArticleProto> article_proto_;
   MockDistillerPageFactory page_factory_;
   TestDistillerURLFetcherFactory url_fetcher_factory_;
 };
 
-ACTION_P2(DistillerPageOnExecuteJavaScriptDone, distiller_page, list) {
-  distiller_page->OnExecuteJavaScriptDone(list);
+ACTION_P3(DistillerPageOnExecuteJavaScriptDone, distiller_page, url, list) {
+  distiller_page->OnExecuteJavaScriptDone(url, list);
 }
 
 ACTION_P2(CreateMockDistillerPage, list, kurl) {
@@ -118,18 +143,57 @@ ACTION_P2(CreateMockDistillerPage, list, kurl) {
   EXPECT_CALL(*distiller_page, LoadURLImpl(kurl))
       .WillOnce(testing::InvokeWithoutArgs(distiller_page,
                                            &DistillerPage::OnLoadURLDone));
-  EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_))
-      .WillOnce(DistillerPageOnExecuteJavaScriptDone(distiller_page, list));
+  EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_)).WillOnce(
+      DistillerPageOnExecuteJavaScriptDone(distiller_page, kurl, list));
+  return distiller_page;
+}
+
+ACTION_P3(CreateMockDistillerPages, lists, kurls, num_pages) {
+  DistillerPage::Delegate* delegate = arg0;
+  MockDistillerPage* distiller_page = new MockDistillerPage(delegate);
+  EXPECT_CALL(*distiller_page, InitImpl());
+  {
+    testing::InSequence s;
+
+    for (int page = 0; page < num_pages; ++page) {
+      GURL url = GURL(kurls[page]);
+      EXPECT_CALL(*distiller_page, LoadURLImpl(url))
+          .WillOnce(testing::InvokeWithoutArgs(distiller_page,
+                                               &DistillerPage::OnLoadURLDone));
+      EXPECT_CALL(*distiller_page, ExecuteJavaScriptImpl(_))
+          .WillOnce(DistillerPageOnExecuteJavaScriptDone(
+              distiller_page, url, lists[page].get()));
+    }
+  }
   return distiller_page;
 }
 
 TEST_F(DistillerTest, DistillPage) {
   base::MessageLoopForUI loop;
-  scoped_ptr<base::ListValue> list(new base::ListValue());
-  list->AppendString(kTitle);
-  list->AppendString(kContent);
-  list->AppendString(kImageURL0);
-  list->AppendString(kImageURL1);
+  scoped_ptr<base::ListValue> list =
+      CreateDistilledValueReturnedFromJS(kTitle, kContent, vector<int>(), "");
+  EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
+      .WillOnce(CreateMockDistillerPage(list.get(), GURL(kURL)));
+  distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
+  distiller_->Init();
+  distiller_->DistillPage(
+      GURL(kURL),
+      base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_EQ(kTitle, article_proto_->title());
+  EXPECT_EQ(article_proto_->pages_size(), 1);
+  const DistilledPageProto& first_page = article_proto_->pages(0);
+  EXPECT_EQ(kContent, first_page.html());
+  EXPECT_EQ(kURL, first_page.url());
+}
+
+TEST_F(DistillerTest, DistillPageWithImages) {
+  base::MessageLoopForUI loop;
+  vector<int> image_indices;
+  image_indices.push_back(0);
+  image_indices.push_back(1);
+  scoped_ptr<base::ListValue> list =
+      CreateDistilledValueReturnedFromJS(kTitle, kContent, image_indices, "");
   EXPECT_CALL(page_factory_,
               CreateDistillerPageMock(_)).WillOnce(
                   CreateMockDistillerPage(list.get(), GURL(kURL)));
@@ -139,14 +203,91 @@ TEST_F(DistillerTest, DistillPage) {
       GURL(kURL),
       base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
   base::MessageLoop::current()->RunUntilIdle();
-  EXPECT_EQ(kTitle, proto_->title());
-  EXPECT_EQ(kContent, proto_->html());
-  EXPECT_EQ(kURL, proto_->url());
-  EXPECT_EQ(2, proto_->image_size());
-  EXPECT_EQ(kImageData0, proto_->image(0).data());
-  EXPECT_EQ(kId0, proto_->image(0).name());
-  EXPECT_EQ(kImageData1, proto_->image(1).data());
-  EXPECT_EQ(kId1, proto_->image(1).name());
+  EXPECT_EQ(kTitle, article_proto_->title());
+  EXPECT_EQ(article_proto_->pages_size(), 1);
+  const DistilledPageProto& first_page = article_proto_->pages(0);
+  EXPECT_EQ(kContent, first_page.html());
+  EXPECT_EQ(kURL, first_page.url());
+  EXPECT_EQ(2, first_page.image_size());
+  EXPECT_EQ(kImageData[0], first_page.image(0).data());
+  EXPECT_EQ(GetImageName(1, 0), first_page.image(0).name());
+  EXPECT_EQ(kImageData[1], first_page.image(1).data());
+  EXPECT_EQ(GetImageName(1, 1), first_page.image(1).name());
+}
+
+TEST_F(DistillerTest, DistillMultiplePages) {
+  base::MessageLoopForUI loop;
+  const int kNumPages = 8;
+  vector<int> image_indices[kNumPages];
+  string content[kNumPages];
+  string page_urls[kNumPages];
+  scoped_ptr<base::ListValue> list[kNumPages];
+
+  int next_image_number = 0;
+
+  for (int page_num = 0; page_num < kNumPages; ++page_num) {
+    // Each page has different number of images.
+    int tot_images = (page_num + kTotalImages) % (kTotalImages + 1);
+    for (int img_num = 0; img_num < tot_images; img_num++) {
+      image_indices[page_num].push_back(next_image_number);
+      next_image_number = (next_image_number + 1) % kTotalImages;
+    }
+
+    page_urls[page_num] = "http://a.com/" + base::IntToString(page_num);
+    content[page_num] = "Content for page:" + base::IntToString(page_num);
+  }
+  for (int i = 0; i < kNumPages; ++i) {
+    string next_page_url = "";
+    if (i + 1 < kNumPages)
+      next_page_url = page_urls[i + 1];
+
+    list[i] = CreateDistilledValueReturnedFromJS(
+        kTitle, content[i], image_indices[i], next_page_url);
+  }
+
+  EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
+      .WillOnce(CreateMockDistillerPages(list, page_urls, kNumPages));
+
+  distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
+  distiller_->Init();
+  distiller_->DistillPage(
+      GURL(page_urls[0]),
+      base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_EQ(kTitle, article_proto_->title());
+  EXPECT_EQ(article_proto_->pages_size(), kNumPages);
+  for (int page_num = 0; page_num < kNumPages; ++page_num) {
+    const DistilledPageProto& page = article_proto_->pages(page_num);
+    EXPECT_EQ(content[page_num], page.html());
+    EXPECT_EQ(page_urls[page_num], page.url());
+    EXPECT_EQ(image_indices[page_num].size(),
+              static_cast<size_t>(page.image_size()));
+    for (size_t img_num = 0; img_num < image_indices[page_num].size();
+         ++img_num) {
+      EXPECT_EQ(kImageData[image_indices[page_num][img_num]],
+                page.image(img_num).data());
+      EXPECT_EQ(GetImageName(page_num + 1, img_num),
+                page.image(img_num).name());
+    }
+  }
+}
+
+TEST_F(DistillerTest, DistillLinkLoop) {
+  base::MessageLoopForUI loop;
+  // Create a loop, the next page is same as the current page. This could
+  // happen if javascript misparses a next page link.
+  scoped_ptr<base::ListValue> list =
+      CreateDistilledValueReturnedFromJS(kTitle, kContent, vector<int>(), kURL);
+  EXPECT_CALL(page_factory_, CreateDistillerPageMock(_))
+      .WillOnce(CreateMockDistillerPage(list.get(), GURL(kURL)));
+  distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_));
+  distiller_->Init();
+  distiller_->DistillPage(
+      GURL(kURL),
+      base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this)));
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_EQ(kTitle, article_proto_->title());
+  EXPECT_EQ(article_proto_->pages_size(), 1);
 }
 
 }  // namespace dom_distiller
diff --git a/components/dom_distiller/core/dom_distiller_service.cc b/components/dom_distiller/core/dom_distiller_service.cc
index 797827b..6ee2f2f 100644
--- a/components/dom_distiller/core/dom_distiller_service.cc
+++ b/components/dom_distiller/core/dom_distiller_service.cc
@@ -7,6 +7,7 @@
 #include "base/guid.h"
 #include "base/message_loop/message_loop.h"
 #include "components/dom_distiller/core/dom_distiller_store.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
 #include "components/dom_distiller/core/task_tracker.h"
 #include "url/gurl.h"
 
@@ -27,7 +28,7 @@ ArticleEntry CreateSkeletonEntryForUrl(const GURL& url) {
 void RunArticleAvailableCallback(
     const DomDistillerService::ArticleAvailableCallback& article_cb,
     const ArticleEntry& entry,
-    DistilledPageProto* proto,
+    const DistilledArticleProto* article_proto,
     bool distillation_succeeded) {
   article_cb.Run(distillation_succeeded);
 }
@@ -189,13 +190,16 @@ void DomDistillerService::CancelTask(TaskTracker* task) {
   }
 }
 
-void DomDistillerService::AddDistilledPageToList(const ArticleEntry& entry,
-                                                 DistilledPageProto* proto,
-                                                 bool distillation_succeeded) {
+void DomDistillerService::AddDistilledPageToList(
+    const ArticleEntry& entry,
+    const DistilledArticleProto* article_proto,
+    bool distillation_succeeded) {
   DCHECK(IsEntryValid(entry));
   if (distillation_succeeded) {
-    DCHECK(proto);
+    DCHECK(article_proto);
+    DCHECK_GT(article_proto->pages_size(), 0);
     store_->UpdateEntry(entry);
+    DCHECK_EQ(article_proto->pages_size(), entry.pages_size());
   }
 }
 
diff --git a/components/dom_distiller/core/dom_distiller_service.h b/components/dom_distiller/core/dom_distiller_service.h
index ed045cd..630b184 100644
--- a/components/dom_distiller/core/dom_distiller_service.h
+++ b/components/dom_distiller/core/dom_distiller_service.h
@@ -22,7 +22,7 @@ class SyncableService;
 
 namespace dom_distiller {
 
-class DistilledPageProto;
+class DistilledArticleProto;
 class DistillerFactory;
 class DomDistillerObserver;
 class DomDistillerStoreInterface;
@@ -70,7 +70,7 @@ class DomDistillerService {
  private:
   void CancelTask(TaskTracker* task);
   void AddDistilledPageToList(const ArticleEntry& entry,
-                              DistilledPageProto* proto,
+                              const DistilledArticleProto* article_proto,
                               bool distillation_succeeded);
 
   TaskTracker* CreateTaskTracker(const ArticleEntry& entry);
diff --git a/components/dom_distiller/core/dom_distiller_service_unittest.cc b/components/dom_distiller/core/dom_distiller_service_unittest.cc
index 35065fd..f3418bb 100644
--- a/components/dom_distiller/core/dom_distiller_service_unittest.cc
+++ b/components/dom_distiller/core/dom_distiller_service_unittest.cc
@@ -9,6 +9,7 @@
 #include "base/containers/hash_tables.h"
 #include "base/message_loop/message_loop.h"
 #include "base/run_loop.h"
+#include "base/strings/string_number_conversions.h"
 #include "components/dom_distiller/core/article_entry.h"
 #include "components/dom_distiller/core/dom_distiller_model.h"
 #include "components/dom_distiller/core/dom_distiller_store.h"
@@ -31,7 +32,7 @@ namespace {
 class FakeViewRequestDelegate : public ViewRequestDelegate {
  public:
   virtual ~FakeViewRequestDelegate() {}
-  MOCK_METHOD1(OnArticleReady, void(DistilledPageProto* proto));
+  MOCK_METHOD1(OnArticleReady, void(const DistilledArticleProto* proto));
 };
 
 class MockDistillerObserver : public DomDistillerObserver {
@@ -52,11 +53,23 @@ DomDistillerService::ArticleAvailableCallback ArticleCallback(
 }
 
 void RunDistillerCallback(FakeDistiller* distiller,
-                          scoped_ptr<DistilledPageProto> proto) {
+                          scoped_ptr<DistilledArticleProto> proto) {
   distiller->RunDistillerCallback(proto.Pass());
   base::RunLoop().RunUntilIdle();
 }
 
+scoped_ptr<DistilledArticleProto> CreateArticleWithURL(const std::string& url) {
+  scoped_ptr<DistilledArticleProto> proto(new DistilledArticleProto);
+  DistilledPageProto* page = proto->add_pages();
+  page->set_url(url);
+  return proto.Pass();
+}
+
+scoped_ptr<DistilledArticleProto> CreateDefaultArticle() {
+  return CreateArticleWithURL("http://www.example.com/default_article_page1")
+      .Pass();
+}
+
 }  // namespace
 
 class DomDistillerServiceTest : public testing::Test {
@@ -109,7 +122,7 @@ TEST_F(DomDistillerServiceTest, TestViewEntry) {
 
   ASSERT_FALSE(distiller->GetCallback().is_null());
 
-  scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
   EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get()));
 
   RunDistillerCallback(distiller, proto.Pass());
@@ -127,7 +140,7 @@ TEST_F(DomDistillerServiceTest, TestViewUrl) {
   ASSERT_FALSE(distiller->GetCallback().is_null());
   EXPECT_EQ(url, distiller->GetUrl());
 
-  scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
   EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get()));
 
   RunDistillerCallback(distiller, proto.Pass());
@@ -152,7 +165,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleViewUrl) {
   ASSERT_FALSE(distiller->GetCallback().is_null());
   EXPECT_EQ(url, distiller->GetUrl());
 
-  scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
   EXPECT_CALL(viewer_delegate, OnArticleReady(proto.get()));
 
   RunDistillerCallback(distiller, proto.Pass());
@@ -160,7 +173,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleViewUrl) {
   ASSERT_FALSE(distiller2->GetCallback().is_null());
   EXPECT_EQ(url2, distiller2->GetUrl());
 
-  scoped_ptr<DistilledPageProto> proto2(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto2 = CreateDefaultArticle();
   EXPECT_CALL(viewer_delegate2, OnArticleReady(proto2.get()));
 
   RunDistillerCallback(distiller2, proto2.Pass());
@@ -210,7 +223,7 @@ TEST_F(DomDistillerServiceTest, TestAddAndRemoveEntry) {
   ASSERT_FALSE(distiller->GetCallback().is_null());
   EXPECT_EQ(url, distiller->GetUrl());
 
-  scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto = CreateArticleWithURL(url.spec());
   RunDistillerCallback(distiller, proto.Pass());
 
   EXPECT_TRUE(store_->GetEntryByUrl(url, &entry));
@@ -280,7 +293,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleObservers) {
         ArticleEntriesUpdated(util::HasExpectedUpdates(expected_updates)));
   }
 
-  scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto = CreateDefaultArticle();
   RunDistillerCallback(distiller, proto.Pass());
 
   // Remove should notify all observers that article is removed.
@@ -316,7 +329,7 @@ TEST_F(DomDistillerServiceTest, TestMultipleCallbacks) {
     EXPECT_CALL(article_cb[i], DistillationCompleted(true));
   }
 
-  scoped_ptr<DistilledPageProto> proto(new DistilledPageProto);
+  scoped_ptr<DistilledArticleProto> proto = CreateArticleWithURL(url.spec());
   RunDistillerCallback(distiller, proto.Pass());
 
   // Add the same url again, all callbacks should be called with true.
@@ -353,5 +366,58 @@ TEST_F(DomDistillerServiceTest, TestMultipleCallbacksOnRemove) {
   base::RunLoop().RunUntilIdle();
 }
 
+TEST_F(DomDistillerServiceTest, TestMultiplePageArticle) {
+  FakeDistiller* distiller = new FakeDistiller(false);
+  EXPECT_CALL(*distiller_factory_, CreateDistillerImpl())
+      .WillOnce(Return(distiller));
+
+  const int kPageCount = 8;
+
+  std::string base_url("http://www.example.com/p");
+  GURL pages_url[kPageCount];
+  for (int page_num = 0; page_num < kPageCount; ++page_num) {
+    pages_url[page_num] = GURL(base_url + base::IntToString(page_num));
+  }
+
+  MockArticleAvailableCallback article_cb;
+  EXPECT_CALL(article_cb, DistillationCompleted(true));
+
+  std::string entry_id =
+      service_->AddToList(pages_url[0], ArticleCallback(&article_cb));
+
+  ArticleEntry entry;
+  ASSERT_FALSE(distiller->GetCallback().is_null());
+  EXPECT_EQ(pages_url[0], distiller->GetUrl());
+
+  // Create the article with pages to pass to the distiller.
+  scoped_ptr<DistilledArticleProto> proto =
+      CreateArticleWithURL(pages_url[0].spec());
+  for (int page_num = 1; page_num < kPageCount; ++page_num) {
+    DistilledPageProto* distilled_page = proto->add_pages();
+    distilled_page->set_url(pages_url[page_num].spec());
+  }
+
+  RunDistillerCallback(distiller, proto.Pass());
+  EXPECT_TRUE(store_->GetEntryByUrl(pages_url[0], &entry));
+
+  EXPECT_EQ(kPageCount, entry.pages_size());
+  // An article should have just one entry.
+  EXPECT_EQ(1u, store_->GetEntries().size());
+
+  // All pages should have correct urls.
+  for (int page_num = 0; page_num < kPageCount; ++page_num) {
+    EXPECT_EQ(pages_url[page_num].spec(), entry.pages(page_num).url());
+  }
+
+  // Should be able to query article using any of the pages url.
+  for (int page_num = 0; page_num < kPageCount; ++page_num) {
+    EXPECT_TRUE(store_->GetEntryByUrl(pages_url[page_num], &entry));
+  }
+
+  service_->RemoveEntry(entry_id);
+  base::RunLoop().RunUntilIdle();
+  EXPECT_EQ(0u, store_->GetEntries().size());
+}
+
 }  // namespace test
 }  // namespace dom_distiller
diff --git a/components/dom_distiller/core/fake_distiller.cc b/components/dom_distiller/core/fake_distiller.cc
index 3f1f976..e6cc979 100644
--- a/components/dom_distiller/core/fake_distiller.cc
+++ b/components/dom_distiller/core/fake_distiller.cc
@@ -14,14 +14,26 @@ namespace test {
 MockDistillerFactory::MockDistillerFactory() {}
 MockDistillerFactory::~MockDistillerFactory() {}
 
-FakeDistiller::FakeDistiller(bool execute_callback) :
-      execute_callback_(execute_callback) {
+FakeDistiller::FakeDistiller(bool execute_callback)
+    : execute_callback_(execute_callback) {
   EXPECT_CALL(*this, Die()).Times(testing::AnyNumber());
 }
 
 FakeDistiller::~FakeDistiller() { Die(); }
 
-void FakeDistiller::RunDistillerCallback(scoped_ptr<DistilledPageProto> proto) {
+void FakeDistiller::DistillPage(const GURL& url,
+                                const DistillerCallback& callback) {
+  url_ = url;
+  callback_ = callback;
+  if (execute_callback_) {
+    scoped_ptr<DistilledArticleProto> proto(new DistilledArticleProto);
+    proto->add_pages()->set_url(url_.spec());
+    RunDistillerCallback(proto.Pass());
+  }
+}
+
+void FakeDistiller::RunDistillerCallback(
+    scoped_ptr<DistilledArticleProto> proto) {
   base::MessageLoop::current()->PostTask(
       FROM_HERE,
       base::Bind(&FakeDistiller::RunDistillerCallbackInternal,
@@ -30,7 +42,7 @@ void FakeDistiller::RunDistillerCallback(scoped_ptr<DistilledPageProto> proto) {
 }
 
 void FakeDistiller::RunDistillerCallbackInternal(
-    scoped_ptr<DistilledPageProto> proto) {
+    scoped_ptr<DistilledArticleProto> proto) {
   EXPECT_FALSE(callback_.is_null());
   callback_.Run(proto.Pass());
   callback_.Reset();
diff --git a/components/dom_distiller/core/fake_distiller.h b/components/dom_distiller/core/fake_distiller.h
index dbb00d6..98ab709 100644
--- a/components/dom_distiller/core/fake_distiller.h
+++ b/components/dom_distiller/core/fake_distiller.h
@@ -27,27 +27,21 @@ class MockDistillerFactory : public DistillerFactory {
 
 class FakeDistiller : public Distiller {
  public:
-  FakeDistiller(bool execute_callback);
+  explicit FakeDistiller(bool execute_callback);
   virtual ~FakeDistiller();
   MOCK_METHOD0(Die, void());
 
   virtual void DistillPage(const GURL& url,
-                           const DistillerCallback& callback) OVERRIDE {
-    url_ = url;
-    callback_ = callback;
-    if (execute_callback_) {
-      RunDistillerCallback(make_scoped_ptr(new DistilledPageProto));
-    }
-  }
+                           const DistillerCallback& callback) OVERRIDE;
 
-  void RunDistillerCallback(scoped_ptr<DistilledPageProto> proto);
+  void RunDistillerCallback(scoped_ptr<DistilledArticleProto> proto);
 
   GURL GetUrl() { return url_; }
 
   DistillerCallback GetCallback() { return callback_; }
 
  private:
-  void RunDistillerCallbackInternal(scoped_ptr<DistilledPageProto> proto);
+  void RunDistillerCallbackInternal(scoped_ptr<DistilledArticleProto> proto);
 
   bool execute_callback_;
   GURL url_;
diff --git a/components/dom_distiller/core/page_distiller.cc b/components/dom_distiller/core/page_distiller.cc
new file mode 100644
index 0000000..2d47bd1
--- /dev/null
+++ b/components/dom_distiller/core/page_distiller.cc
@@ -0,0 +1,91 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/dom_distiller/core/page_distiller.h"
+
+#include <map>
+
+#include "base/location.h"
+#include "base/message_loop/message_loop.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/values.h"
+#include "components/dom_distiller/core/distiller_page.h"
+#include "components/dom_distiller/core/distiller_url_fetcher.h"
+#include "grit/dom_distiller_resources.h"
+#include "net/url_request/url_request_context_getter.h"
+#include "ui/base/resource/resource_bundle.h"
+#include "url/gurl.h"
+
+namespace dom_distiller {
+
+DistilledPageInfo::DistilledPageInfo() {}
+
+DistilledPageInfo::~DistilledPageInfo() {}
+
+PageDistiller::PageDistiller(const DistillerPageFactory& distiller_page_factory)
+    : distiller_page_(distiller_page_factory.CreateDistillerPage(this).Pass()) {
+}
+
+PageDistiller::~PageDistiller() {}
+
+void PageDistiller::Init() { distiller_page_->Init(); }
+
+void PageDistiller::DistillPage(const GURL& url,
+                                const PageDistillerCallback& callback) {
+  page_distiller_callback_ = callback;
+  LoadURL(url);
+}
+
+void PageDistiller::LoadURL(const GURL& url) { distiller_page_->LoadURL(url); }
+
+void PageDistiller::OnLoadURLDone() { GetDistilledContent(); }
+
+void PageDistiller::GetDistilledContent() {
+  std::string script = ResourceBundle::GetSharedInstance()
+                           .GetRawDataResource(IDR_DISTILLER_JS)
+                           .as_string();
+  distiller_page_->ExecuteJavaScript(script);
+}
+
+void PageDistiller::OnExecuteJavaScriptDone(const GURL& page_url,
+                                            const base::Value* value) {
+  scoped_ptr<DistilledPageInfo> page_info(new DistilledPageInfo());
+  std::string result;
+  const base::ListValue* result_list = NULL;
+  if (!value->GetAsList(&result_list)) {
+    base::MessageLoop::current()->PostTask(
+        FROM_HERE,
+        base::Bind(page_distiller_callback_, base::Passed(&page_info), false));
+  } else {
+    int i = 0;
+    for (base::ListValue::const_iterator iter = result_list->begin();
+         iter != result_list->end();
+         ++iter, ++i) {
+      std::string item;
+      (*iter)->GetAsString(&item);
+      // The JavaScript returns an array where the first element is the title,
+      // the second element is the article content HTML, and the remaining
+      // elements are image URLs referenced in the HTML.
+      switch (i) {
+        case 0:
+          page_info->title = item;
+          break;
+        case 1:
+          page_info->html = item;
+          break;
+        case 2: {
+          page_info->next_page_url = item;
+          break;
+        }
+        default:
+          page_info->image_urls.push_back(item);
+      }
+    }
+    base::MessageLoop::current()->PostTask(
+        FROM_HERE,
+        base::Bind(page_distiller_callback_, base::Passed(&page_info), true));
+  }
+}
+
+}  // namespace dom_distiller
diff --git a/components/dom_distiller/core/page_distiller.h b/components/dom_distiller/core/page_distiller.h
new file mode 100644
index 0000000..4e32bea
--- /dev/null
+++ b/components/dom_distiller/core/page_distiller.h
@@ -0,0 +1,70 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_
+#define COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_
+
+#include <string>
+#include <vector>
+
+#include "base/callback.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/values.h"
+#include "components/dom_distiller/core/distiller_page.h"
+#include "url/gurl.h"
+
+namespace dom_distiller {
+
+class DistillerImpl;
+
+struct DistilledPageInfo {
+  std::string title;
+  std::string html;
+  std::string next_page_url;
+  std::vector<std::string> image_urls;
+  DistilledPageInfo();
+  ~DistilledPageInfo();
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(DistilledPageInfo);
+};
+
+// Distills a single page of an article.
+class PageDistiller : public DistillerPage::Delegate {
+ public:
+  typedef base::Callback<void(scoped_ptr<DistilledPageInfo> distilled_page,
+                              bool distillation_successful)>
+      PageDistillerCallback;
+  explicit PageDistiller(const DistillerPageFactory& distiller_page_factory);
+  virtual ~PageDistiller();
+
+  // Creates an execution context. This must be called once before any calls are
+  // made to distill the page.
+  virtual void Init();
+
+  // Distills the |url| and posts the |callback| with results.
+  virtual void DistillPage(const GURL& url,
+                           const PageDistillerCallback& callback);
+
+  // DistillerPage::Delegate
+  virtual void OnLoadURLDone() OVERRIDE;
+  virtual void OnExecuteJavaScriptDone(const GURL& page_url,
+                                       const base::Value* value) OVERRIDE;
+
+ private:
+  virtual void LoadURL(const GURL& url);
+
+  // Injects JavaScript to distill a loaded page down to its important content,
+  // e.g., extracting a news article from its surrounding boilerplate.
+  void GetDistilledContent();
+
+  scoped_ptr<DistillerPage> distiller_page_;
+  PageDistillerCallback page_distiller_callback_;
+
+  DISALLOW_COPY_AND_ASSIGN(PageDistiller);
+};
+
+}  // namespace dom_distiller
+
+#endif  // COMPONENTS_DOM_DISTILLER_CORE_PAGE_DISTILLER_H_
diff --git a/components/dom_distiller/core/proto/distilled_article.proto b/components/dom_distiller/core/proto/distilled_article.proto
new file mode 100644
index 0000000..54284bf
--- /dev/null
+++ b/components/dom_distiller/core/proto/distilled_article.proto
@@ -0,0 +1,21 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Distilled article content.
+
+syntax = "proto2";
+
+option optimize_for = LITE_RUNTIME;
+
+package dom_distiller;
+
+import "distilled_page.proto";
+
+message DistilledArticleProto {
+  // A title for the article.
+  optional string title = 1;
+
+  // The distilled pages in the article.
+  repeated DistilledPageProto pages = 2;
+}
diff --git a/components/dom_distiller/core/proto/distilled_page.proto b/components/dom_distiller/core/proto/distilled_page.proto
index c6f8159..ba71617 100644
--- a/components/dom_distiller/core/proto/distilled_page.proto
+++ b/components/dom_distiller/core/proto/distilled_page.proto
@@ -14,9 +14,6 @@ message DistilledPageProto {
   // The URLs used to create this page.
   required string url = 1;
 
-  // A title for the distilled page.
-  optional string title = 2;
-
   // The distilled HTML.
   required string html = 3;
 
diff --git a/components/dom_distiller/core/task_tracker.cc b/components/dom_distiller/core/task_tracker.cc
index ccd5656..1afd482 100644
--- a/components/dom_distiller/core/task_tracker.cc
+++ b/components/dom_distiller/core/task_tracker.cc
@@ -5,6 +5,8 @@
 #include "components/dom_distiller/core/task_tracker.h"
 
 #include "base/message_loop/message_loop.h"
+#include "components/dom_distiller/core/proto/distilled_article.pb.h"
+#include "components/dom_distiller/core/proto/distilled_page.pb.h"
 
 namespace dom_distiller {
 
@@ -20,7 +22,8 @@ ViewerHandle::~ViewerHandle() {
 TaskTracker::TaskTracker(const ArticleEntry& entry, CancelCallback callback)
     : cancel_callback_(callback),
       entry_(entry),
-      distilled_page_(),
+      distilled_article_(),
+      distillation_complete_(false),
       weak_ptr_factory_(this) {}
 
 TaskTracker::~TaskTracker() { DCHECK(viewers_.empty()); }
@@ -51,7 +54,7 @@ void TaskTracker::StartBlobFetcher() {
 void TaskTracker::AddSaveCallback(const SaveCallback& callback) {
   DCHECK(!callback.is_null());
   save_callbacks_.push_back(callback);
-  if (distilled_page_) {
+  if (distillation_complete_) {
     // Distillation for this task has already completed, and so it can be
     // immediately saved.
     ScheduleSaveCallbacks(true);
@@ -60,7 +63,7 @@ void TaskTracker::AddSaveCallback(const SaveCallback& callback) {
 
 scoped_ptr<ViewerHandle> TaskTracker::AddViewer(ViewRequestDelegate* delegate) {
   viewers_.push_back(delegate);
-  if (distilled_page_) {
+  if (distillation_complete_) {
     // Distillation for this task has already completed, and so the delegate can
     // be immediately told of the result.
     base::MessageLoop::current()->PostTask(
@@ -120,12 +123,10 @@ void TaskTracker::ScheduleSaveCallbacks(bool distillation_succeeded) {
 
 void TaskTracker::DoSaveCallbacks(bool distillation_succeeded) {
   if (!save_callbacks_.empty()) {
-    DistilledPageProto* distilled_proto =
-        distillation_succeeded ? distilled_page_.get() : NULL;
-
     for (size_t i = 0; i < save_callbacks_.size(); ++i) {
       DCHECK(!save_callbacks_[i].is_null());
-      save_callbacks_[i].Run(entry_, distilled_proto, distillation_succeeded);
+      save_callbacks_[i].Run(
+          entry_, distilled_article_.get(), distillation_succeeded);
     }
 
     save_callbacks_.clear();
@@ -134,21 +135,33 @@ void TaskTracker::DoSaveCallbacks(bool distillation_succeeded) {
 }
 
 void TaskTracker::NotifyViewer(ViewRequestDelegate* delegate) {
-  DCHECK(distilled_page_);
-  delegate->OnArticleReady(distilled_page_.get());
+  DCHECK(distillation_complete_);
+  delegate->OnArticleReady(distilled_article_.get());
 }
 
-void TaskTracker::OnDistilledDataReady(scoped_ptr<DistilledPageProto> proto) {
-  distilled_page_ = proto.Pass();
-  DCHECK(distilled_page_);
+void TaskTracker::OnDistilledDataReady(
+    scoped_ptr<DistilledArticleProto> distilled_article) {
+  distilled_article_ = distilled_article.Pass();
+  bool distillation_successful = false;
+  if (distilled_article_->pages_size() > 0) {
+    distillation_successful = true;
+    entry_.set_title(distilled_article_->title());
+    // Reset the pages.
+    entry_.clear_pages();
+    for (int i = 0; i < distilled_article_->pages_size(); ++i) {
+      sync_pb::ArticlePage* page = entry_.add_pages();
+      page->set_url(distilled_article_->pages(i).url());
+    }
+  }
+
+  distillation_complete_ = true;
 
-  entry_.set_title(distilled_page_->title());
   for (size_t i = 0; i < viewers_.size(); ++i) {
     NotifyViewer(viewers_[i]);
   }
 
   // Already inside a callback run SaveCallbacks directly.
-  DoSaveCallbacks(true);
+  DoSaveCallbacks(distillation_successful);
 }
 
 }  // namespace dom_distiller
diff --git a/components/dom_distiller/core/task_tracker.h b/components/dom_distiller/core/task_tracker.h
index f545dd9..c4098fe 100644
--- a/components/dom_distiller/core/task_tracker.h
+++ b/components/dom_distiller/core/task_tracker.h
@@ -19,7 +19,7 @@ class GURL;
 
 namespace dom_distiller {
 
-class DistilledPageProto;
+class DistilledArticleProto;
 
 // A handle to a request to view a DOM distiller entry or URL. The request will
 // be cancelled when the handle is destroyed.
@@ -41,10 +41,10 @@ class ViewRequestDelegate {
  public:
   virtual ~ViewRequestDelegate() {}
   // Called when the distilled article contents are available. The
-  // DistilledPageProto is owned by a TaskTracker instance and is invalidated
+  // DistilledArticleProto is owned by a TaskTracker instance and is invalidated
   // when the corresponding ViewerHandle is destroyed (or when the
   // DomDistillerService is destroyed).
-  virtual void OnArticleReady(DistilledPageProto* proto) = 0;
+  virtual void OnArticleReady(const DistilledArticleProto* article_proto) = 0;
 };
 
 // A TaskTracker manages the various tasks related to viewing, saving,
@@ -68,7 +68,8 @@ class ViewRequestDelegate {
 class TaskTracker {
  public:
   typedef base::Callback<void(TaskTracker*)> CancelCallback;
-  typedef base::Callback<void(const ArticleEntry&, DistilledPageProto*, bool)>
+  typedef base::Callback<
+      void(const ArticleEntry&, const DistilledArticleProto*, bool)>
       SaveCallback;
 
   TaskTracker(const ArticleEntry& entry, CancelCallback callback);
@@ -90,7 +91,8 @@ class TaskTracker {
   bool HasUrl(const GURL& url) const;
 
  private:
-  void OnDistilledDataReady(scoped_ptr<DistilledPageProto> distilled_page);
+  void OnDistilledDataReady(
+      scoped_ptr<DistilledArticleProto> distilled_article);
   // Posts a task to run DoSaveCallbacks with |distillation_succeeded|.
   void ScheduleSaveCallbacks(bool distillation_succeeded);
 
@@ -113,7 +115,8 @@ class TaskTracker {
   std::vector<ViewRequestDelegate*> viewers_;
 
   ArticleEntry entry_;
-  scoped_ptr<DistilledPageProto> distilled_page_;
+  scoped_ptr<DistilledArticleProto> distilled_article_;
+  bool distillation_complete_;
 
   // Note: This should remain the last member so it'll be destroyed and
   // invalidate its weak pointers before any other members are destroyed.
diff --git a/components/dom_distiller/core/task_tracker_unittest.cc b/components/dom_distiller/core/task_tracker_unittest.cc
index f659fc1..c88becb 100644
--- a/components/dom_distiller/core/task_tracker_unittest.cc
+++ b/components/dom_distiller/core/task_tracker_unittest.cc
@@ -18,7 +18,8 @@ namespace test {
 class FakeViewRequestDelegate : public ViewRequestDelegate {
  public:
   virtual ~FakeViewRequestDelegate() {}
-  MOCK_METHOD1(OnArticleReady, void(DistilledPageProto* proto));
+  MOCK_METHOD1(OnArticleReady,
+               void(const DistilledArticleProto* article_proto));
 };
 
 class TestCancelCallback {
@@ -36,7 +37,8 @@ class TestCancelCallback {
 
 class MockSaveCallback {
  public:
-  MOCK_METHOD3(Save, void(const ArticleEntry&, DistilledPageProto*, bool));
+  MOCK_METHOD3(Save,
+               void(const ArticleEntry&, const DistilledArticleProto*, bool));
 };
 
 class DomDistillerTaskTrackerTest : public testing::Test {
diff --git a/third_party/readability/README.chromium b/third_party/readability/README.chromium
index a409176..72590db 100644
--- a/third_party/readability/README.chromium
+++ b/third_party/readability/README.chromium
@@ -11,3 +11,5 @@ present it in an easy-to-read form. The CSS of readability is unchanged.
 Local Modifications:
 - Removed all writes to innerHTML in the JavaScript.
 - Removed unneeded styles, footer, scrolling.
+- Removed extraneous whitespace.
+- Exposed nextPageLink as a function.
diff --git a/third_party/readability/js/readability.js b/third_party/readability/js/readability.js
index 68a0286..9cdbb30 100644
--- a/third_party/readability/js/readability.js
+++ b/third_party/readability/js/readability.js
@@ -1,14 +1,20 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Local modifications to this file are described in the README.chromium
+// file.
 
 var dbg = (typeof console !== 'undefined') ? function(s) {
     console.log("Readability: " + s);
 } : function() {};
 
 /*
- * Readability. An Arc90 Lab Experiment. 
+ * Readability. An Arc90 Lab Experiment.
  * Website: http://lab.arc90.com/experiments/readability
  * Source:  http://code.google.com/p/arc90labs-readability
  *
- * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. 
+ * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
  *
  * Copyright (c) 2010 Arc90 Inc
  * Readability is licensed under the Apache License, Version 2.0.
@@ -20,6 +26,7 @@ var readability = {
 
     distilledHTML: '',
     distilledArticleContent: null,
+    nextPageLink: '',
 
     version:                '1.7.1',
     iframeLoads:             0,
@@ -41,7 +48,7 @@ var readability = {
     maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
     parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
     pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
-    
+
     /**
      * All of the regular expressions in use within readability.
      * Defined up here so we don't instantiate them repeatedly in loops.
@@ -66,7 +73,7 @@ var readability = {
 
     /**
      * Runs readability.
-     * 
+     *
      * Workflow:
      *  1. Prep the document by removing script tags, css, etc.
      *  2. Build readability's DOM tree.
@@ -86,8 +93,11 @@ var readability = {
         readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
 
         /* Pull out any possible next page link first */
-        var nextPageLink = readability.findNextPageLink(document.body);
-        
+        readability.nextPageLink = readability.findNextPageLink(document.body);
+
+        /* We handle processing of nextPage from C++ set nextPageLink to null */
+        var nextPageLink = null;
+
         readability.prepDocument();
 
         /* Build readability's DOM tree */
@@ -152,12 +162,8 @@ var readability = {
 
         window.scrollTo(0, 0);
 
-        // TODO(bengr): Remove this assignment of null to nextPageLink when
-        //              the processing of the next page link is safe.
-        nextPageLink = null;
-
         if (nextPageLink) {
-            /** 
+            /**
              * Append any additional pages after a small timeout so that people
              * can start reading without having to wait for this to finish processing.
             **/
@@ -179,16 +185,16 @@ var readability = {
                 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
 
                 if(readability.reversePageScroll) {
-                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);                   
+                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
                 }
                 else {
-                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);                   
+                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
                 }
-                
+
                 return false;
             }
         };
-        
+
         document.onkeyup = function(e) {
             var code = (window.event) ? event.keyCode : e.keyCode;
             if (code === 16) {
@@ -200,7 +206,7 @@ var readability = {
 
     /**
      * Run any post-process modifications to article content as necessary.
-     * 
+     *
      * @param Element
      * @return void
     **/
@@ -226,7 +232,7 @@ var readability = {
 
         for(var i=0, il = images.length; i < il; i+=1) {
             var image = images[i];
-            
+
             if(image.offsetWidth > imageWidthThreshold) {
                 image.className += " blockImage";
             }
@@ -242,7 +248,7 @@ var readability = {
         var articleTools = document.createElement("DIV");
 
         articleTools.id        = "readTools";
-        articleTools.innerHTML = 
+        articleTools.innerHTML =
             "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
             "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
             "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
@@ -259,13 +265,13 @@ var readability = {
         function sanitizeText() {
             return text.replace(/@\w+/, "");
         }
-        
+
         function countMatches(match) {
             var matches = text.match(new RegExp(match, "g"));
-            return matches !== null ? matches.length : 0; 
+            return matches !== null ? matches.length : 0;
         }
-        
-        function isRTL() {            
+
+        function isRTL() {
             var count_heb =  countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
             var count_arb =  countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
 
@@ -289,15 +295,15 @@ var readability = {
         try {
             curTitle = origTitle = document.title;
             if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
-                curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); 
+                curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
             }
         }
         catch(e) {}
-        
+
         if(curTitle.match(/ [\|\-] /))
         {
             curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
-            
+
             if(curTitle.split(' ').length < 3) {
                 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
             }
@@ -330,7 +336,7 @@ var readability = {
     /**
      * Prepare the HTML document for readability to scrape it.
      * This includes things like stripping javascript, CSS, and handling terrible markup.
-     * 
+     *
      * @return void
      **/
     prepDocument: function () {
@@ -342,7 +348,7 @@ var readability = {
         {
             var body = document.createElement("body");
             try {
-                document.body = body;       
+                document.body = body;
             }
             catch(e) {
                 document.documentElement.appendChild(body);
@@ -374,11 +380,11 @@ var readability = {
                     biggestFrameSize         = frameSize;
                     readability.biggestFrame = frames[frameIndex];
                 }
-                
+
                 if(canAccessFrame && frameSize > bestFrameSize)
                 {
                     readability.frameHack = true;
-    
+
                     bestFrame = frames[frameIndex];
                     bestFrameSize = frameSize;
                 }
@@ -390,7 +396,7 @@ var readability = {
                 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
                 newBody.style.overflow = 'scroll';
                 document.body = newBody;
-                
+
                 var frameset = document.getElementsByTagName('frameset')[0];
                 if(frameset) {
                     frameset.parentNode.removeChild(frameset); }
@@ -455,7 +461,7 @@ var readability = {
             var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
             var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
             var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
-            
+
             if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
                 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
             }
@@ -468,7 +474,7 @@ var readability = {
             dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
         }
     },
-    
+
     /**
      * Initialize a node with the readability object. Also checks the
      * className/id for special names to add to its score.
@@ -477,7 +483,7 @@ var readability = {
      * @return void
     **/
     initializeNode: function (node) {
-        node.readability = {"contentScore": 0};         
+        node.readability = {"contentScore": 0};
 
         switch(node.tagName) {
             case 'DIV':
@@ -489,7 +495,7 @@ var readability = {
             case 'BLOCKQUOTE':
                 node.readability.contentScore += 3;
                 break;
-                
+
             case 'ADDRESS':
             case 'OL':
             case 'UL':
@@ -511,10 +517,10 @@ var readability = {
                 node.readability.contentScore -= 5;
                 break;
         }
-       
+
         node.readability.contentScore += readability.getClassWeight(node);
     },
-    
+
     /***
      * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
      *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@@ -525,7 +531,7 @@ var readability = {
     grabArticle: function (pageToClone) {
         var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
             isPaging = (page !== null) ? true: false;
-    
+
         var page = null;
         // Never work on the actual page.
         if (isPaging) {
@@ -533,7 +539,7 @@ var readability = {
         } else {
             page = pageToClone.cloneNode(true);
         }
-     
+
         var allElements = page.getElementsByTagName('*');
 
         /**
@@ -561,7 +567,7 @@ var readability = {
                     node.parentNode.removeChild(node);
                     nodeIndex-=1;
                     continue;
-                }               
+                }
             }
 
             if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
@@ -598,7 +604,7 @@ var readability = {
                         }
                     }
                 }
-            } 
+            }
         }
 
         /**
@@ -640,15 +646,15 @@ var readability = {
 
             /* Add points for any commas within this paragraph */
             contentScore += innerText.split(',').length;
-            
+
             /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
             contentScore += Math.min(Math.floor(innerText.length / 100), 3);
-           
+
             /* Add the score to the parent. The grandparent gets half. */
             parentNode.readability.contentScore += contentScore;
 
             if(grandParentNode) {
-                grandParentNode.readability.contentScore += contentScore/2;             
+                grandParentNode.readability.contentScore += contentScore/2;
             }
         }
 
@@ -725,12 +731,12 @@ var readability = {
             {
                 append = true;
             }
-            
+
             if(siblingNode.nodeName === "P") {
                 var linkDensity = readability.getLinkDensity(siblingNode);
                 var nodeContent = readability.getInnerText(siblingNode);
                 var nodeLength  = nodeContent.length;
-                
+
                 if(nodeLength > 80 && linkDensity < 0.25)
                 {
                     append = true;
@@ -747,7 +753,7 @@ var readability = {
                 var nodeToAppend = null;
                 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
                     /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
-                    
+
                     dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
                     nodeToAppend = document.createElement("DIV");
                     try {
@@ -765,7 +771,7 @@ var readability = {
                     s-=1;
                     sl-=1;
                 }
-                
+
                 /* To ensure a node does not interfere with readability styles, remove its classnames */
                 nodeToAppend.className = "";
 
@@ -779,15 +785,15 @@ var readability = {
         **/
         readability.distilledArticleContent = articleContent.cloneNode(true);
         //readability.prepArticle(articleContent);
-      
+
         if (readability.curPageNum === 1) {
             var newNode = document.createElement('div');
             newNode.id = "readability-page-1";
             newNode.setAttribute("class", "page");
             readability.moveNodeInnards(articleContent, newNode);
             articleContent.appendChild(newNode);
-        } 
-      
+        }
+
         /**
          * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
          * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
@@ -813,7 +819,7 @@ var readability = {
 
         return articleContent;
     },
-    
+
     /**
      * Removes script tags from the document.
      *
@@ -828,12 +834,12 @@ var readability = {
                 scripts[i].nodeValue="";
                 scripts[i].removeAttribute('src');
                 if (scripts[i].parentNode) {
-                        scripts[i].parentNode.removeChild(scripts[i]);          
+                        scripts[i].parentNode.removeChild(scripts[i]);
                 }
             }
         }
     },
-    
+
     /**
      * Get the inner text of a node - cross browser compatibly.
      * This also strips out any excess whitespace to be found.
@@ -896,18 +902,18 @@ var readability = {
             if ( cur.nodeType === 1 ) {
                 // Remove style attribute(s) :
                 if(cur.className !== "readability-styled") {
-                    cur.removeAttribute("style");                   
+                    cur.removeAttribute("style");
                 }
                 readability.cleanStyles( cur );
             }
             cur = cur.nextSibling;
-        }           
+        }
     },
-    
+
     /**
      * Get the density of links as a percentage of the content
      * This is the amount of text that is inside a link divided by the total text in the node.
-     * 
+     *
      * @param Element
      * @return number (float)
     **/
@@ -918,11 +924,11 @@ var readability = {
         for(var i=0, il=links.length; i<il;i+=1)
         {
             linkLength += readability.getInnerText(links[i]).length;
-        }       
+        }
 
         return linkLength / textLength;
     },
-    
+
     /**
      * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
      *
@@ -944,10 +950,10 @@ var readability = {
 
                 /* If the type isn't alpha-only, it's probably not actually a file extension. */
                 if(!possibleType.match(/[^a-zA-Z]/)) {
-                    segment = segment.split(".")[0];                    
+                    segment = segment.split(".")[0];
                 }
             }
-            
+
             /**
              * EW-CMS specific segment replacement. Ugly.
              * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
@@ -968,7 +974,7 @@ var readability = {
             if (i < 2 && segment.match(/^\d{1,2}$/)) {
                 del = true;
             }
-            
+
             /* If this is the first segment and it's just "index", remove it. */
             if(i === 0 && segment.toLowerCase() === "index") {
                 del = true;
@@ -992,7 +998,7 @@ var readability = {
 
     /**
      * Look for any paging links that may occur within the document.
-     * 
+     *
      * @param body
      * @return object (array)
     **/
@@ -1008,7 +1014,7 @@ var readability = {
          *
          * Also possible: levenshtein distance? longest common subsequence?
          *
-         * After we do that, assign each page a score, and 
+         * After we do that, assign each page a score, and
         **/
         for(var i = 0, il = allLinks.length; i < il; i+=1) {
             var link     = allLinks[i],
@@ -1018,12 +1024,12 @@ var readability = {
             if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
                 continue;
             }
-            
+
             /* If it's on a different domain, skip it. */
             if(window.location.host !== linkHref.split(/\/+/g)[1]) {
                 continue;
             }
-            
+
             var linkText = readability.getInnerText(link);
 
             /* If the linkText looks like it's not the next page, skip it. */
@@ -1036,9 +1042,9 @@ var readability = {
             if(!linkHrefLeftover.match(/\d/)) {
                 continue;
             }
-            
+
             if(!(linkHref in possiblePages)) {
-                possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};             
+                possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
             } else {
                 possiblePages[linkHref].linkText += ' | ' + linkText;
             }
@@ -1060,7 +1066,7 @@ var readability = {
             if(linkData.match(/pag(e|ing|inat)/i)) {
                 linkObj.score += 25;
             }
-            if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 
+            if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
                 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
                 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
                     linkObj.score -= 65;
@@ -1087,10 +1093,10 @@ var readability = {
                     /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
                     if(!parentNodeClassAndId.match(readability.regexps.positive)) {
                         linkObj.score -= 25;
-                        negativeNodeMatch = true;                       
+                        negativeNodeMatch = true;
                     }
                 }
-                
+
                 parentNode = parentNode.parentNode;
             }
 
@@ -1152,7 +1158,7 @@ var readability = {
 
             dbg('NEXT PAGE IS ' + nextHref);
             readability.parsedPages[nextHref] = true;
-            return nextHref;            
+            return nextHref;
         }
         else {
             return null;
@@ -1204,7 +1210,7 @@ var readability = {
         if (typeof options === 'undefined') { options = {}; }
 
         request.onreadystatechange = respondToReadyState;
-        
+
         request.open('get', url, true);
         request.setRequestHeader('Accept', 'text/html');
 
@@ -1239,7 +1245,7 @@ var readability = {
             articlePage.appendChild(linkDiv);
             return;
         }
-        
+
         /**
          * Now that we've built the article page DOM element, get the page content
          * asynchronously and load the cleaned content into the div we created for it.
@@ -1257,7 +1263,7 @@ var readability = {
                             return;
                         } else {
                             readability.pageETags[eTag] = 1;
-                        }                       
+                        }
                     }
 
                     // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
@@ -1308,7 +1314,7 @@ var readability = {
                             }
                         }
                     }
-                    
+
                     readability.removeScripts(content);
 
                     readability.moveNodeInnards(content, thisPage);
@@ -1330,9 +1336,9 @@ var readability = {
             });
         }(nextPageLink, articlePage));
     },
-    
+
     /**
-     * Get an elements class/id weight. Uses regular expressions to tell if this 
+     * Get an elements class/id weight. Uses regular expressions to tell if this
      * element looks good or bad.
      *
      * @param Element
@@ -1382,7 +1388,7 @@ var readability = {
         var allElements = e.getElementsByTagName('*');
         while (i < allElements.length) {
             readability.deleteExtraBreaks(allElements[i]);
-            i++; 
+            i++;
         }
     },
 
@@ -1397,7 +1403,7 @@ var readability = {
     clean: function (e, tag) {
         var targetList = e.getElementsByTagName( tag );
         var isEmbed    = (tag === 'object' || tag === 'embed');
-        
+
         for (var y=targetList.length-1; y >= 0; y-=1) {
             /* Allow youtube and vimeo videos through as people usually want to see those. */
             if(isEmbed) {
@@ -1405,7 +1411,7 @@ var readability = {
                 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
                     attributeValues += targetList[y].attributes[i].value + '|';
                 }
-                
+
                 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
                 if (attributeValues.search(readability.regexps.videos) !== -1) {
                     continue;
@@ -1415,13 +1421,13 @@ var readability = {
                 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
                     continue;
                 }
-                
+
             }
 
             targetList[y].parentNode.removeChild(targetList[y]);
         }
     },
-    
+
     /**
      * Clean an element of all tags of type "tag" if they look fishy.
      * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@@ -1446,7 +1452,7 @@ var readability = {
         for (var i=curTagsLength-1; i >= 0; i-=1) {
             var weight = readability.getClassWeight(tagsList[i]);
             var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
-            
+
             dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
 
             if(weight+contentScore < 0)
@@ -1467,7 +1473,7 @@ var readability = {
                 var embeds     = tagsList[i].getElementsByTagName("embed");
                 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
                     if (embeds[ei].src.search(readability.regexps.videos) === -1) {
-                      embedCount+=1; 
+                      embedCount+=1;
                     }
                 }
 
@@ -1480,7 +1486,7 @@ var readability = {
                 } else if(li > p && tag !== "ul" && tag !== "ol") {
                     toRemove = true;
                 } else if( input > Math.floor(p/3) ) {
-                    toRemove = true; 
+                    toRemove = true;
                 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
                     toRemove = true;
                 } else if(weight < 25 && linkDensity > 0.2) {
@@ -1522,7 +1528,7 @@ var readability = {
     addFlag: function(flag) {
         readability.flags = readability.flags | flag;
     },
- 
+
     removeFlag: function(flag) {
         readability.flags = readability.flags & ~flag;
     },
@@ -1591,7 +1597,7 @@ var readability = {
         }
         return ret;
     },
-  
+
     // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
     // <P> node, and makes all next siblings of that pair children of <P>, up
     // until the next pair of <BR> nodes is reached.
@@ -1600,7 +1606,7 @@ var readability = {
         var second = readability.isMultipleBr(node, true);
         if (!second) {
             return;
-        } 
+        }
         // Make all next siblings of the second BR into children of a P.
         var p = document.createElement('p');
         var curr = second.nextSibling;
@@ -1613,7 +1619,7 @@ var readability = {
             curr = next;
         }
         var ret = curr;
-  
+
         // Remove all nodes between the first and second BR.
         curr = node.nextSibling;
         while (curr && curr != second) {
@@ -1625,10 +1631,10 @@ var readability = {
         second.parentNode.removeChild(second);
         // Replace the first BR with the P.
         node.parentNode.replaceChild(p, node);
-  
+
         return ret;
     },
-  
+
     // Returns true if the NodeList contains a double <BR>.
     hasDoubleBr: function(nodeList) {
         for (var i = 0; i < nodeList.length; nodeList++) {
@@ -1637,8 +1643,8 @@ var readability = {
             }
         }
         return false;
-    }, 
-    
+    },
+
     // Replaces double <BR> tags with <P> tags.
     replaceDoubleBrsWithPs: function(node) {
         var allElements = node.getElementsByTagName('BR');
@@ -1652,8 +1658,8 @@ var readability = {
             allElements = document.body.getElementsByTagName('BR');
         }
     },
-  
-  
+
+
     // Replaces a BR and the whitespace that follows it with a P.
     replaceBrWithP: function(node) {
         if (!readability.isBrNode(node)) {
@@ -1673,7 +1679,7 @@ var readability = {
         node.parentNode.replaceChild(p, node);
         return curr;
     },
-  
+
     // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
     // children of the <P>.
     replaceBrsWithPs: function(node) {
@@ -1687,27 +1693,27 @@ var readability = {
             allElements = document.body.getElementsByTagName('BR');
         }
     },
-  
+
     // Replaces any tag with any other tag.
     replaceTagsWithTags: function(node, srcTag, destTag) {
         var allElements = node.getElementsByTagName(srcTag);
         for (var i = 0; i < allElements.length; i++) {
             var dest = document.createElement(destTag);
             readability.moveNodeInnards(allElements[i], dest);
-            node.replaceNode(dest, allElements[i]);
+            allElements[i].parentNode.replaceChild(dest, allElements[i]);
         }
     },
-  
+
     // Replaces all <noscript> tags with <p> tags.
     replaceNoscriptsWithPs: function(node) {
         readability.replaceTagsWithTags(node, 'noscript', 'p');
     },
-  
+
     // Replaces all <font> tags with <span> tags.
     replaceFontsWithSpans: function(node) {
         readability.replaceTagsWithTags(node, 'font', 'span');
     },
-  
+
     // Returns a list of image URLs in the distilled article.
     getImages : function() {
         var images = document.getElementsByTagName('img');
@@ -1719,10 +1725,15 @@ var readability = {
         }
         return result;
     },
-  
+
     // Returns the distilled article HTML from the page(s).
     getDistilledArticleHTML : function() {
         return readability.distilledHTML;
+    },
+
+    // Returns the next page of this article.
+    getNextPageLink : function() {
+        return readability.nextPageLink;
     }
 };
 
@@ -1730,12 +1741,13 @@ var readability = {
 // element is the article title, the second element is HTML containing the
 // long-form content, and remaining elements are URLs for images referenced by
 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which
-// corresponds to a URL listed at index k in the array returned. 
+// corresponds to a URL listed at index k in the array returned.
 (function () {
     readability.init();
-    var result = new Array(2);
+    var result = new Array(3);
     result[0] = readability.getArticleTitle();
     result[1] = readability.getDistilledArticleHTML();
+    result[2] = readability.getNextPageLink();
     return result.concat(readability.getImages());
 }())
author	shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2014-02-05 05:56:20 +0000
committer	shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2014-02-05 05:56:20 +0000
commit	415f99b18e6f14c53e08f6574a3bc653049fbaae (patch)
tree	6039149dc08176419753d850eba0cc74ff1b1a43
parent	640e10a877584a90c8200490c41c937501cca5a3 (diff)
download	chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.zip chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.gz chromium_src-415f99b18e6f14c53e08f6574a3bc653049fbaae.tar.bz2