diff options
author | kuan <kuan@chromium.org> | 2014-10-27 18:06:22 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2014-10-28 01:06:36 +0000 |
commit | 42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8 (patch) | |
tree | 72b8bccbb7b1139eae7dc50a35cc2795efd60f44 /components/dom_distiller | |
parent | b2237a9c8e8c7c078157a02566bcd2f64f3eca94 (diff) | |
download | chromium_src-42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8.zip chromium_src-42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8.tar.gz chromium_src-42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8.tar.bz2 |
add pagination info to DistilledPageProto
- this is part of the implementation of scoring mechanism for next/prev page
links.
- pagination info will be passed to the scoring framework.
- standalone content extractor outputs the page links.
- modified tests to verify pagination info.
BUG=425952
Review URL: https://codereview.chromium.org/678543002
Cr-Commit-Position: refs/heads/master@{#301527}
Diffstat (limited to 'components/dom_distiller')
4 files changed, 63 insertions, 12 deletions
diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc index 3976d89..b5bd95d 100644 --- a/components/dom_distiller/core/distiller.cc +++ b/components/dom_distiller/core/distiller.cc @@ -177,6 +177,8 @@ void DistillerImpl::OnPageDistillationFinished( // The pages should be in same origin. DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin()); AddToDistillationQueue(page_num + 1, next_page_url); + page_data->distilled_page_proto->data.mutable_pagination_info()-> + set_next_page(next_page_url.spec()); } } @@ -185,6 +187,16 @@ void DistillerImpl::OnPageDistillationFinished( if (prev_page_url.is_valid()) { DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin()); AddToDistillationQueue(page_num - 1, prev_page_url); + page_data->distilled_page_proto->data.mutable_pagination_info()-> + set_prev_page(prev_page_url.spec()); + } + } + + if (pagination_info.has_canonical_page()) { + GURL canonical_page_url(pagination_info.canonical_page()); + if (canonical_page_url.is_valid()) { + page_data->distilled_page_proto->data.mutable_pagination_info()-> + set_canonical_page(canonical_page_url.spec()); } } } diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc index a6ef019..ed70ec5 100644 --- a/components/dom_distiller/core/distiller_unittest.cc +++ b/components/dom_distiller/core/distiller_unittest.cc @@ -137,20 +137,29 @@ void VerifyIncrementalUpdatesMatch( } } +string GenerateNextPageUrl(const std::string& url_prefix, size_t page_num, + size_t pages_size) { + return page_num + 1 < pages_size ? + url_prefix + base::IntToString(page_num + 1) : ""; +} + +string GeneratePrevPageUrl(const std::string& url_prefix, size_t page_num) { + return page_num > 0 ? url_prefix + base::IntToString(page_num - 1) : ""; +} + scoped_ptr<MultipageDistillerData> CreateMultipageDistillerDataWithoutImages( size_t pages_size) { scoped_ptr<MultipageDistillerData> result(new MultipageDistillerData()); - string url_prefix = "http://a.com/"; + string url_prefix = kURL; for (size_t page_num = 0; page_num < pages_size; ++page_num) { result->page_urls.push_back(url_prefix + base::IntToString(page_num)); result->content.push_back("Content for page:" + base::IntToString(page_num)); result->image_ids.push_back(vector<int>()); - string next_page_url = (page_num + 1 < pages_size) - ? url_prefix + base::IntToString(page_num + 1) - : ""; + string next_page_url = + GenerateNextPageUrl(url_prefix, page_num, pages_size); string prev_page_url = - (page_num > 0) ? result->page_urls[page_num - 1] : ""; + GeneratePrevPageUrl(url_prefix, page_num); scoped_ptr<base::Value> distilled_value = CreateDistilledValueReturnedFromJS(kTitle, result->content[page_num], @@ -165,10 +174,13 @@ scoped_ptr<MultipageDistillerData> CreateMultipageDistillerDataWithoutImages( void VerifyArticleProtoMatchesMultipageData( const dom_distiller::DistilledArticleProto* article_proto, const MultipageDistillerData* distiller_data, - size_t pages_size) { - ASSERT_EQ(pages_size, static_cast<size_t>(article_proto->pages_size())); + size_t distilled_pages_size, + size_t total_pages_size) { + ASSERT_EQ(distilled_pages_size, + static_cast<size_t>(article_proto->pages_size())); EXPECT_EQ(kTitle, article_proto->title()); - for (size_t page_num = 0; page_num < pages_size; ++page_num) { + std::string url_prefix = kURL; + for (size_t page_num = 0; page_num < distilled_pages_size; ++page_num) { const dom_distiller::DistilledPageProto& page = article_proto->pages(page_num); EXPECT_EQ(distiller_data->content[page_num], page.html()); @@ -182,6 +194,13 @@ void VerifyArticleProtoMatchesMultipageData( EXPECT_EQ(GetImageName(page_num + 1, img_num), page.image(img_num).name()); } + std::string expected_next_page_url = + GenerateNextPageUrl(url_prefix, page_num, total_pages_size); + std::string expected_prev_page_url = + GeneratePrevPageUrl(url_prefix, page_num); + EXPECT_EQ(expected_next_page_url, page.pagination_info().next_page()); + EXPECT_EQ(expected_prev_page_url, page.pagination_info().prev_page()); + EXPECT_FALSE(page.pagination_info().has_canonical_page()); } } @@ -390,7 +409,7 @@ TEST_F(DistillerTest, DistillMultiplePages) { CreateMockDistillerPages(distiller_data.get(), kNumPages, 0).Pass()); base::MessageLoop::current()->RunUntilIdle(); VerifyArticleProtoMatchesMultipageData( - article_proto_.get(), distiller_data.get(), kNumPages); + article_proto_.get(), distiller_data.get(), kNumPages, kNumPages); } TEST_F(DistillerTest, DistillLinkLoop) { @@ -498,7 +517,7 @@ TEST_F(DistillerTest, MultiplePagesDistillationFailure) { base::MessageLoop::current()->RunUntilIdle(); EXPECT_EQ(kTitle, article_proto_->title()); VerifyArticleProtoMatchesMultipageData( - article_proto_.get(), distiller_data.get(), failed_page_num); + article_proto_.get(), distiller_data.get(), failed_page_num, kNumPages); } TEST_F(DistillerTest, DistillPreviousPage) { @@ -517,7 +536,7 @@ TEST_F(DistillerTest, DistillPreviousPage) { distiller_data.get(), kNumPages, start_page_num).Pass()); base::MessageLoop::current()->RunUntilIdle(); VerifyArticleProtoMatchesMultipageData( - article_proto_.get(), distiller_data.get(), kNumPages); + article_proto_.get(), distiller_data.get(), kNumPages, kNumPages); } TEST_F(DistillerTest, IncrementalUpdates) { @@ -562,7 +581,7 @@ TEST_F(DistillerTest, IncrementalUpdatesDoNotDeleteFinalArticle) { // Should still be able to access article and pages. VerifyArticleProtoMatchesMultipageData( - article_proto_.get(), distiller_data.get(), kNumPages); + article_proto_.get(), distiller_data.get(), kNumPages, kNumPages); } TEST_F(DistillerTest, DeletingArticleDoesNotInterfereWithUpdates) { diff --git a/components/dom_distiller/core/proto/distilled_page.proto b/components/dom_distiller/core/proto/distilled_page.proto index 1670b3d..5ece78f 100644 --- a/components/dom_distiller/core/proto/distilled_page.proto +++ b/components/dom_distiller/core/proto/distilled_page.proto @@ -42,4 +42,13 @@ message DistilledPageProto { // text directionality optional string text_direction = 7; + + message PaginationInfo { + optional string next_page = 1; + optional string prev_page = 2; + optional string canonical_page = 3; + } + + // Pagination info for this page. + optional PaginationInfo pagination_info = 8; } diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc index 150fd31..b6a5a4e 100644 --- a/components/dom_distiller/standalone/content_extractor.cc +++ b/components/dom_distiller/standalone/content_extractor.cc @@ -138,12 +138,23 @@ std::string GetReadableArticleString( output << "Article Title: " << article_proto.title() << std::endl; output << "# of pages: " << article_proto.pages_size() << std::endl; for (int i = 0; i < article_proto.pages_size(); ++i) { + if (i > 0) output << std::endl; const DistilledPageProto& page = article_proto.pages(i); output << "Page " << i << std::endl; output << "URL: " << page.url() << std::endl; output << "Content: " << page.html() << std::endl; if (page.has_debug_info() && page.debug_info().has_log()) output << "Log: " << page.debug_info().log() << std::endl; + if (page.has_pagination_info()) { + if (page.pagination_info().has_next_page()) { + output << "Next Page: " << page.pagination_info().next_page() + << std::endl; + } + if (page.pagination_info().has_prev_page()) { + output << "Prev Page: " << page.pagination_info().prev_page() + << std::endl; + } + } } return output.str(); } |