diff options
author | shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-02-15 05:44:59 +0000 |
---|---|---|
committer | shashishekhar@chromium.org <shashishekhar@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-02-15 05:44:59 +0000 |
commit | 178277c96c4dc1ab93e3be2adf73bc91256ba453 (patch) | |
tree | a66ebf93071fd6ff97807dc84440ff6a3dd52c10 /components | |
parent | 77a3bfabad6e282d23d840f7019399710e9a5737 (diff) | |
download | chromium_src-178277c96c4dc1ab93e3be2adf73bc91256ba453.zip chromium_src-178277c96c4dc1ab93e3be2adf73bc91256ba453.tar.gz chromium_src-178277c96c4dc1ab93e3be2adf73bc91256ba453.tar.bz2 |
Store page no for distilled pages undergoing distillation.
In order to support distillation of previous pages and enable
meaningful incremental updates for viewers, distiller needs
to maintain page number information for pages under distillation.
This information will be used to add support for incremental updates
and distilling previous pages of an article.
BUG=288015
TEST=Covered by existing tests + added tests for failure and
page limit for distiller.
Review URL: https://codereview.chromium.org/130543003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@251546 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'components')
-rw-r--r-- | components/dom_distiller/core/distiller.cc | 182 | ||||
-rw-r--r-- | components/dom_distiller/core/distiller.h | 81 | ||||
-rw-r--r-- | components/dom_distiller/core/distiller_unittest.cc | 114 |
3 files changed, 319 insertions, 58 deletions
diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc index 8e155eb..9bd8ba2 100644 --- a/components/dom_distiller/core/distiller.cc +++ b/components/dom_distiller/core/distiller.cc @@ -21,7 +21,7 @@ namespace { // Maximum number of distilled pages in an article. -const int kMaxPagesInArticle = 32; +const size_t kMaxPagesInArticle = 32; } namespace dom_distiller { @@ -41,64 +41,105 @@ scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() { return distiller.PassAs<Distiller>(); } +DistillerImpl::DistilledPageData::DistilledPageData() {} + +DistillerImpl::DistilledPageData::~DistilledPageData() {} + DistillerImpl::DistillerImpl( const DistillerPageFactory& distiller_page_factory, const DistillerURLFetcherFactory& distiller_url_fetcher_factory) : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), - distillation_in_progress_(false) { + max_pages_in_article_(kMaxPagesInArticle) { page_distiller_.reset(new PageDistiller(distiller_page_factory)); } -DistillerImpl::~DistillerImpl() { - DCHECK(image_fetchers_.empty()); - DCHECK(!distillation_in_progress_); -} +DistillerImpl::~DistillerImpl() { DCHECK(AreAllPagesFinished()); } void DistillerImpl::Init() { - DCHECK(!distillation_in_progress_); + DCHECK(AreAllPagesFinished()); page_distiller_->Init(); - article_proto_.reset(new DistilledArticleProto()); +} + +void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) { + max_pages_in_article_ = max_num_pages; +} + +bool DistillerImpl::AreAllPagesFinished() const { + return started_pages_index_.empty() && waiting_pages_.empty(); +} + +size_t DistillerImpl::TotalPageCount() const { + return waiting_pages_.size() + started_pages_index_.size() + + finished_pages_index_.size(); +} + +void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) { + if (!IsPageNumberInUse(page_num) && url.is_valid() && + TotalPageCount() < max_pages_in_article_ && + seen_urls_.find(url.spec()) == seen_urls_.end()) { + waiting_pages_[page_num] = url; + } +} + +bool DistillerImpl::IsPageNumberInUse(int page_num) const { + return waiting_pages_.find(page_num) != waiting_pages_.end() || + started_pages_index_.find(page_num) != started_pages_index_.end() || + finished_pages_index_.find(page_num) != finished_pages_index_.end(); +} + +DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index) + const { + DCHECK_LT(index, pages_.size()); + DistilledPageData* page_data = pages_[index]; + DCHECK(page_data); + return page_data; } void DistillerImpl::DistillPage(const GURL& url, const DistillerCallback& distillation_cb) { - DCHECK(!distillation_in_progress_); + DCHECK(AreAllPagesFinished()); distillation_cb_ = distillation_cb; - DistillPage(url); + + AddToDistillationQueue(0, url); + DistillNextPage(); } -void DistillerImpl::DistillPage(const GURL& url) { - DCHECK(!distillation_in_progress_); - if (url.is_valid() && article_proto_->pages_size() < kMaxPagesInArticle && - processed_urls_.find(url.spec()) == processed_urls_.end()) { - distillation_in_progress_ = true; - // Distill the next page. +void DistillerImpl::DistillNextPage() { + if (!waiting_pages_.empty()) { + std::map<int, GURL>::iterator front = waiting_pages_.begin(); + int page_num = front->first; + const GURL url = front->second; + + waiting_pages_.erase(front); DCHECK(url.is_valid()); - DCHECK_LT(article_proto_->pages_size(), kMaxPagesInArticle); + DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end()); + DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); + seen_urls_.insert(url.spec()); + pages_.push_back(new DistilledPageData()); + started_pages_index_[page_num] = pages_.size() - 1; page_distiller_->DistillPage( url, base::Bind(&DistillerImpl::OnPageDistillationFinished, base::Unretained(this), + page_num, url)); - } else { - RunDistillerCallbackIfDone(); } } void DistillerImpl::OnPageDistillationFinished( + int page_num, const GURL& page_url, scoped_ptr<DistilledPageInfo> distilled_page, bool distillation_successful) { - DCHECK(distillation_in_progress_); DCHECK(distilled_page.get()); - if (!distillation_successful) { - RunDistillerCallbackIfDone(); - } else { - DistilledPageProto* current_page = article_proto_->add_pages(); - // Set the title of the article as the title of the first page. - if (article_proto_->pages_size() == 1) { - article_proto_->set_title(distilled_page->title); - } + DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); + if (distillation_successful) { + DistilledPageData* page_data = + GetPageAtIndex(started_pages_index_[page_num]); + DistilledPageProto* current_page = new DistilledPageProto(); + page_data->proto.reset(current_page); + page_data->page_num = page_num; + page_data->title = distilled_page->title; current_page->set_url(page_url.spec()); current_page->set_html(distilled_page->html); @@ -107,59 +148,106 @@ void DistillerImpl::OnPageDistillationFinished( if (next_page_url.is_valid()) { // The pages should be in same origin. DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); + AddToDistillationQueue(page_num + 1, next_page_url); } - processed_urls_.insert(page_url.spec()); - distillation_in_progress_ = false; - int page_number = article_proto_->pages_size(); for (size_t img_num = 0; img_num < distilled_page->image_urls.size(); ++img_num) { std::string image_id = - base::IntToString(page_number) + "_" + base::IntToString(img_num); - FetchImage(current_page, image_id, distilled_page->image_urls[img_num]); + base::IntToString(page_num + 1) + "_" + base::IntToString(img_num); + FetchImage(page_num, image_id, distilled_page->image_urls[img_num]); } - DistillPage(next_page_url); + + AddPageIfDone(page_num); + DistillNextPage(); + } else { + started_pages_index_.erase(page_num); + RunDistillerCallbackIfDone(); } } -void DistillerImpl::FetchImage(DistilledPageProto* distilled_page_proto, +void DistillerImpl::FetchImage(int page_num, const std::string& image_id, const std::string& item) { + DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); + DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); DistillerURLFetcher* fetcher = distiller_url_fetcher_factory_.CreateDistillerURLFetcher(); - image_fetchers_.push_back(fetcher); + page_data->image_fetchers_.push_back(fetcher); + fetcher->FetchURL(item, base::Bind(&DistillerImpl::OnFetchImageDone, base::Unretained(this), - base::Unretained(distilled_page_proto), + page_num, base::Unretained(fetcher), image_id)); } -void DistillerImpl::OnFetchImageDone(DistilledPageProto* distilled_page_proto, +void DistillerImpl::OnFetchImageDone(int page_num, DistillerURLFetcher* url_fetcher, const std::string& id, const std::string& response) { - DCHECK_GT(article_proto_->pages_size(), 0); - DCHECK(distilled_page_proto); + DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); + DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); + DCHECK(page_data->proto); DCHECK(url_fetcher); ScopedVector<DistillerURLFetcher>::iterator fetcher_it = - std::find(image_fetchers_.begin(), image_fetchers_.end(), url_fetcher); + std::find(page_data->image_fetchers_.begin(), + page_data->image_fetchers_.end(), + url_fetcher); - DCHECK(fetcher_it != image_fetchers_.end()); + DCHECK(fetcher_it != page_data->image_fetchers_.end()); // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone // callback is invoked by the |url_fetcher|. - image_fetchers_.weak_erase(fetcher_it); + page_data->image_fetchers_.weak_erase(fetcher_it); base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher); - DistilledPageProto_Image* image = distilled_page_proto->add_image(); + + DistilledPageProto_Image* image = page_data->proto->add_image(); image->set_name(id); image->set_data(response); - RunDistillerCallbackIfDone(); + + AddPageIfDone(page_num); +} + +void DistillerImpl::AddPageIfDone(int page_num) { + DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end()); + DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end()); + DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]); + if (page_data->image_fetchers_.empty()) { + finished_pages_index_[page_num] = started_pages_index_[page_num]; + started_pages_index_.erase(page_num); + RunDistillerCallbackIfDone(); + } } void DistillerImpl::RunDistillerCallbackIfDone() { - if (image_fetchers_.empty() && !distillation_in_progress_) { - distillation_cb_.Run(article_proto_.Pass()); + DCHECK(!distillation_cb_.is_null()); + if (AreAllPagesFinished()) { + bool first_page = true; + scoped_ptr<DistilledArticleProto> article_proto( + new DistilledArticleProto()); + // Stitch the pages back into the article. + for (std::map<int, size_t>::iterator it = finished_pages_index_.begin(); + it != finished_pages_index_.end();) { + DistilledPageData* page_data = GetPageAtIndex(it->second); + *(article_proto->add_pages()) = *(page_data->proto); + + if (first_page) { + article_proto->set_title(page_data->title); + first_page = false; + } + + finished_pages_index_.erase(it++); + } + + pages_.clear(); + DCHECK_LE(static_cast<size_t>(article_proto->pages_size()), + max_pages_in_article_); + + DCHECK(pages_.empty()); + DCHECK(finished_pages_index_.empty()); + distillation_cb_.Run(article_proto.Pass()); + distillation_cb_.Reset(); } } diff --git a/components/dom_distiller/core/distiller.h b/components/dom_distiller/core/distiller.h index 5490750..0b3a49f 100644 --- a/components/dom_distiller/core/distiller.h +++ b/components/dom_distiller/core/distiller.h @@ -68,36 +68,97 @@ class DistillerImpl : public Distiller { virtual void DistillPage(const GURL& url, const DistillerCallback& callback) OVERRIDE; + void SetMaxNumPagesInArticle(size_t max_num_pages); + private: - void OnFetchImageDone(DistilledPageProto* distilled_page_proto, + // In case of multiple pages, the Distiller maintains state of multiple pages + // as page numbers relative to the page number where distillation started. + // E.g. if distillation starts at page 2 for a 3 page article. The relative + // page numbers assigned to pages will be [-1,0,1]. + + // Class representing the state of a page under distillation. + struct DistilledPageData { + DistilledPageData(); + virtual ~DistilledPageData(); + // Relative page number of the page. + int page_num; + std::string title; + ScopedVector<DistillerURLFetcher> image_fetchers_; + scoped_ptr<DistilledPageProto> proto; + + private: + DISALLOW_COPY_AND_ASSIGN(DistilledPageData); + }; + + void OnFetchImageDone(int page_num, DistillerURLFetcher* url_fetcher, const std::string& id, const std::string& response); - void OnPageDistillationFinished(const GURL& page_url, + void OnPageDistillationFinished(int page_num, + const GURL& page_url, scoped_ptr<DistilledPageInfo> distilled_page, bool distillation_successful); - virtual void FetchImage(DistilledPageProto* distilled_page_proto, + virtual void FetchImage(int page_num, const std::string& image_id, const std::string& item); - // Distills the page and adds the new page to |article_proto|. - void DistillPage(const GURL& url); + // Distills the next page. + void DistillNextPage(); + + // Adds the |url| to |pages_to_be_distilled| if |page_num| is a valid relative + // page number and |url| is valid. Ignores duplicate pages and urls. + void AddToDistillationQueue(int page_num, const GURL& url); + + // Check if |page_num| is a valid relative page number, i.e. page with + // |page_num| is either under distillation or has already completed + // distillation. + bool IsPageNumberInUse(int page_num) const; + + bool AreAllPagesFinished() const; + + // Total number of pages in the article that the distiller knows of, this + // includes pages that are pending distillation. + size_t TotalPageCount() const; // Runs |distillation_cb_| if all distillation callbacks and image fetches are // complete. void RunDistillerCallbackIfDone(); + // Checks if page |distilled_page_data| has finished distillation, including + // all image fetches. + void AddPageIfDone(int page_num); + + DistilledPageData* GetPageAtIndex(size_t index) const; + const DistillerURLFetcherFactory& distiller_url_fetcher_factory_; scoped_ptr<PageDistiller> page_distiller_; DistillerCallback distillation_cb_; - ScopedVector<DistillerURLFetcher> image_fetchers_; - scoped_ptr<DistilledArticleProto> article_proto_; - bool distillation_in_progress_; - // Set to keep track of which urls are already seen by the distiller. - base::hash_set<std::string> processed_urls_; + // Set of pages that are under distillation or have finished distillation. + // |started_pages_index_| and |finished_pages_index_| maintains the mapping + // from page number to the indices in |pages_|. + ScopedVector<DistilledPageData> pages_; + + // Maps page numbers of finished pages to the indices in |pages_|. + std::map<int, size_t> finished_pages_index_; + + // Maps page numbers of pages under distillation to the indices in |pages_|. + // If a page is |started_pages_| that means it is still waiting for an action + // (distillation or image fetch) to finish. + base::hash_map<int, size_t> started_pages_index_; + + // The list of pages that are still waiting for distillation to start. + // This is a map, to make distiller prefer distilling lower page numbers + // first. + std::map<int, GURL> waiting_pages_; + + // Set to keep track of which urls are already seen by the distiller. Used to + // prevent distiller from distilling the same url twice. + base::hash_set<std::string> seen_urls_; + + size_t max_pages_in_article_; DISALLOW_COPY_AND_ASSIGN(DistillerImpl); }; diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc index 3865aef..092433b 100644 --- a/components/dom_distiller/core/distiller_unittest.cc +++ b/components/dom_distiller/core/distiller_unittest.cc @@ -117,7 +117,6 @@ class MockDistillerPageFactory : public DistillerPageFactory { } }; - class DistillerTest : public testing::Test { public: virtual ~DistillerTest() {} @@ -290,4 +289,117 @@ TEST_F(DistillerTest, DistillLinkLoop) { EXPECT_EQ(article_proto_->pages_size(), 1); } +TEST_F(DistillerTest, CheckMaxPageLimit) { + base::MessageLoopForUI loop; + const size_t kMaxPagesInArticle = 10; + string page_urls[kMaxPagesInArticle]; + scoped_ptr<base::ListValue> list[kMaxPagesInArticle]; + + // Note: Next page url of the last page of article is set. So distiller will + // try to do kMaxPagesInArticle + 1 calls if the max article limit does not + // work. + string url_prefix = "http://a.com/"; + for (size_t page_num = 0; page_num < kMaxPagesInArticle; ++page_num) { + page_urls[page_num] = url_prefix + base::IntToString(page_num + 1); + string content = "Content for page:" + base::IntToString(page_num); + string next_page_url = url_prefix + base::IntToString(page_num + 2); + list[page_num] = CreateDistilledValueReturnedFromJS( + kTitle, content, vector<int>(), next_page_url); + } + + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPages( + list, page_urls, static_cast<int>(kMaxPagesInArticle))); + + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + + distiller_->SetMaxNumPagesInArticle(kMaxPagesInArticle); + + distiller_->Init(); + distiller_->DistillPage( + GURL(page_urls[0]), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(kMaxPagesInArticle, + static_cast<size_t>(article_proto_->pages_size())); + + // Now check if distilling an article with exactly the page limit works by + // resetting the next page url of the last page of the article. + list[kMaxPagesInArticle - 1] = + CreateDistilledValueReturnedFromJS(kTitle, "Content", vector<int>(), ""); + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPages( + list, page_urls, static_cast<int>(kMaxPagesInArticle))); + + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + distiller_->SetMaxNumPagesInArticle(kMaxPagesInArticle); + + distiller_->Init(); + distiller_->DistillPage( + GURL(page_urls[0]), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(kMaxPagesInArticle, + static_cast<size_t>(article_proto_->pages_size())); +} + +TEST_F(DistillerTest, SinglePageDistillationFailure) { + base::MessageLoopForUI loop; + // To simulate failure return a null value. + scoped_ptr<base::Value> nullValue(base::Value::CreateNullValue()); + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPage(nullValue.get(), GURL(kURL))); + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + distiller_->Init(); + distiller_->DistillPage( + GURL(kURL), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ("", article_proto_->title()); + EXPECT_EQ(0, article_proto_->pages_size()); +} + +TEST_F(DistillerTest, MultiplePagesDistillationFailure) { + base::MessageLoopForUI loop; + const int kNumPages = 8; + string content[kNumPages]; + string page_urls[kNumPages]; + scoped_ptr<base::Value> distilled_values[kNumPages]; + // The page number of the failed page. + int failed_page_num = 3; + string url_prefix = "http://a.com/"; + for (int page_num = 0; page_num < kNumPages; ++page_num) { + page_urls[page_num] = url_prefix + base::IntToString(page_num); + content[page_num] = "Content for page:" + base::IntToString(page_num); + string next_page_url = url_prefix + base::IntToString(page_num + 1); + if (page_num != failed_page_num) { + distilled_values[page_num] = CreateDistilledValueReturnedFromJS( + kTitle, content[page_num], vector<int>(), next_page_url); + } else { + distilled_values[page_num].reset(base::Value::CreateNullValue()); + } + } + + // Expect only calls till the failed page number. + EXPECT_CALL(page_factory_, CreateDistillerPageMock(_)) + .WillOnce(CreateMockDistillerPages( + distilled_values, page_urls, failed_page_num + 1)); + + distiller_.reset(new DistillerImpl(page_factory_, url_fetcher_factory_)); + distiller_->Init(); + distiller_->DistillPage( + GURL(page_urls[0]), + base::Bind(&DistillerTest::OnDistillPageDone, base::Unretained(this))); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_EQ(kTitle, article_proto_->title()); + EXPECT_EQ(article_proto_->pages_size(), failed_page_num); + for (int page_num = 0; page_num < failed_page_num; ++page_num) { + const DistilledPageProto& page = article_proto_->pages(page_num); + EXPECT_EQ(content[page_num], page.html()); + EXPECT_EQ(page_urls[page_num], page.url()); + } +} + } // namespace dom_distiller |