summaryrefslogtreecommitdiffstats
path: root/components/dom_distiller
diff options
context:
space:
mode:
authorkuan <kuan@chromium.org>2014-10-27 18:06:22 -0700
committerCommit bot <commit-bot@chromium.org>2014-10-28 01:06:36 +0000
commit42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8 (patch)
tree72b8bccbb7b1139eae7dc50a35cc2795efd60f44 /components/dom_distiller
parentb2237a9c8e8c7c078157a02566bcd2f64f3eca94 (diff)
downloadchromium_src-42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8.zip
chromium_src-42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8.tar.gz
chromium_src-42a54f5f7d5da2ea5ccc1d0ffda4a18ea2314ae8.tar.bz2
add pagination info to DistilledPageProto
- this is part of the implementation of scoring mechanism for next/prev page links. - pagination info will be passed to the scoring framework. - standalone content extractor outputs the page links. - modified tests to verify pagination info. BUG=425952 Review URL: https://codereview.chromium.org/678543002 Cr-Commit-Position: refs/heads/master@{#301527}
Diffstat (limited to 'components/dom_distiller')
-rw-r--r--components/dom_distiller/core/distiller.cc12
-rw-r--r--components/dom_distiller/core/distiller_unittest.cc43
-rw-r--r--components/dom_distiller/core/proto/distilled_page.proto9
-rw-r--r--components/dom_distiller/standalone/content_extractor.cc11
4 files changed, 63 insertions, 12 deletions
diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc
index 3976d89..b5bd95d 100644
--- a/components/dom_distiller/core/distiller.cc
+++ b/components/dom_distiller/core/distiller.cc
@@ -177,6 +177,8 @@ void DistillerImpl::OnPageDistillationFinished(
// The pages should be in same origin.
DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
AddToDistillationQueue(page_num + 1, next_page_url);
+ page_data->distilled_page_proto->data.mutable_pagination_info()->
+ set_next_page(next_page_url.spec());
}
}
@@ -185,6 +187,16 @@ void DistillerImpl::OnPageDistillationFinished(
if (prev_page_url.is_valid()) {
DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
AddToDistillationQueue(page_num - 1, prev_page_url);
+ page_data->distilled_page_proto->data.mutable_pagination_info()->
+ set_prev_page(prev_page_url.spec());
+ }
+ }
+
+ if (pagination_info.has_canonical_page()) {
+ GURL canonical_page_url(pagination_info.canonical_page());
+ if (canonical_page_url.is_valid()) {
+ page_data->distilled_page_proto->data.mutable_pagination_info()->
+ set_canonical_page(canonical_page_url.spec());
}
}
}
diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc
index a6ef019..ed70ec5 100644
--- a/components/dom_distiller/core/distiller_unittest.cc
+++ b/components/dom_distiller/core/distiller_unittest.cc
@@ -137,20 +137,29 @@ void VerifyIncrementalUpdatesMatch(
}
}
+string GenerateNextPageUrl(const std::string& url_prefix, size_t page_num,
+ size_t pages_size) {
+ return page_num + 1 < pages_size ?
+ url_prefix + base::IntToString(page_num + 1) : "";
+}
+
+string GeneratePrevPageUrl(const std::string& url_prefix, size_t page_num) {
+ return page_num > 0 ? url_prefix + base::IntToString(page_num - 1) : "";
+}
+
scoped_ptr<MultipageDistillerData> CreateMultipageDistillerDataWithoutImages(
size_t pages_size) {
scoped_ptr<MultipageDistillerData> result(new MultipageDistillerData());
- string url_prefix = "http://a.com/";
+ string url_prefix = kURL;
for (size_t page_num = 0; page_num < pages_size; ++page_num) {
result->page_urls.push_back(url_prefix + base::IntToString(page_num));
result->content.push_back("Content for page:" +
base::IntToString(page_num));
result->image_ids.push_back(vector<int>());
- string next_page_url = (page_num + 1 < pages_size)
- ? url_prefix + base::IntToString(page_num + 1)
- : "";
+ string next_page_url =
+ GenerateNextPageUrl(url_prefix, page_num, pages_size);
string prev_page_url =
- (page_num > 0) ? result->page_urls[page_num - 1] : "";
+ GeneratePrevPageUrl(url_prefix, page_num);
scoped_ptr<base::Value> distilled_value =
CreateDistilledValueReturnedFromJS(kTitle,
result->content[page_num],
@@ -165,10 +174,13 @@ scoped_ptr<MultipageDistillerData> CreateMultipageDistillerDataWithoutImages(
void VerifyArticleProtoMatchesMultipageData(
const dom_distiller::DistilledArticleProto* article_proto,
const MultipageDistillerData* distiller_data,
- size_t pages_size) {
- ASSERT_EQ(pages_size, static_cast<size_t>(article_proto->pages_size()));
+ size_t distilled_pages_size,
+ size_t total_pages_size) {
+ ASSERT_EQ(distilled_pages_size,
+ static_cast<size_t>(article_proto->pages_size()));
EXPECT_EQ(kTitle, article_proto->title());
- for (size_t page_num = 0; page_num < pages_size; ++page_num) {
+ std::string url_prefix = kURL;
+ for (size_t page_num = 0; page_num < distilled_pages_size; ++page_num) {
const dom_distiller::DistilledPageProto& page =
article_proto->pages(page_num);
EXPECT_EQ(distiller_data->content[page_num], page.html());
@@ -182,6 +194,13 @@ void VerifyArticleProtoMatchesMultipageData(
EXPECT_EQ(GetImageName(page_num + 1, img_num),
page.image(img_num).name());
}
+ std::string expected_next_page_url =
+ GenerateNextPageUrl(url_prefix, page_num, total_pages_size);
+ std::string expected_prev_page_url =
+ GeneratePrevPageUrl(url_prefix, page_num);
+ EXPECT_EQ(expected_next_page_url, page.pagination_info().next_page());
+ EXPECT_EQ(expected_prev_page_url, page.pagination_info().prev_page());
+ EXPECT_FALSE(page.pagination_info().has_canonical_page());
}
}
@@ -390,7 +409,7 @@ TEST_F(DistillerTest, DistillMultiplePages) {
CreateMockDistillerPages(distiller_data.get(), kNumPages, 0).Pass());
base::MessageLoop::current()->RunUntilIdle();
VerifyArticleProtoMatchesMultipageData(
- article_proto_.get(), distiller_data.get(), kNumPages);
+ article_proto_.get(), distiller_data.get(), kNumPages, kNumPages);
}
TEST_F(DistillerTest, DistillLinkLoop) {
@@ -498,7 +517,7 @@ TEST_F(DistillerTest, MultiplePagesDistillationFailure) {
base::MessageLoop::current()->RunUntilIdle();
EXPECT_EQ(kTitle, article_proto_->title());
VerifyArticleProtoMatchesMultipageData(
- article_proto_.get(), distiller_data.get(), failed_page_num);
+ article_proto_.get(), distiller_data.get(), failed_page_num, kNumPages);
}
TEST_F(DistillerTest, DistillPreviousPage) {
@@ -517,7 +536,7 @@ TEST_F(DistillerTest, DistillPreviousPage) {
distiller_data.get(), kNumPages, start_page_num).Pass());
base::MessageLoop::current()->RunUntilIdle();
VerifyArticleProtoMatchesMultipageData(
- article_proto_.get(), distiller_data.get(), kNumPages);
+ article_proto_.get(), distiller_data.get(), kNumPages, kNumPages);
}
TEST_F(DistillerTest, IncrementalUpdates) {
@@ -562,7 +581,7 @@ TEST_F(DistillerTest, IncrementalUpdatesDoNotDeleteFinalArticle) {
// Should still be able to access article and pages.
VerifyArticleProtoMatchesMultipageData(
- article_proto_.get(), distiller_data.get(), kNumPages);
+ article_proto_.get(), distiller_data.get(), kNumPages, kNumPages);
}
TEST_F(DistillerTest, DeletingArticleDoesNotInterfereWithUpdates) {
diff --git a/components/dom_distiller/core/proto/distilled_page.proto b/components/dom_distiller/core/proto/distilled_page.proto
index 1670b3d..5ece78f 100644
--- a/components/dom_distiller/core/proto/distilled_page.proto
+++ b/components/dom_distiller/core/proto/distilled_page.proto
@@ -42,4 +42,13 @@ message DistilledPageProto {
// text directionality
optional string text_direction = 7;
+
+ message PaginationInfo {
+ optional string next_page = 1;
+ optional string prev_page = 2;
+ optional string canonical_page = 3;
+ }
+
+ // Pagination info for this page.
+ optional PaginationInfo pagination_info = 8;
}
diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc
index 150fd31..b6a5a4e 100644
--- a/components/dom_distiller/standalone/content_extractor.cc
+++ b/components/dom_distiller/standalone/content_extractor.cc
@@ -138,12 +138,23 @@ std::string GetReadableArticleString(
output << "Article Title: " << article_proto.title() << std::endl;
output << "# of pages: " << article_proto.pages_size() << std::endl;
for (int i = 0; i < article_proto.pages_size(); ++i) {
+ if (i > 0) output << std::endl;
const DistilledPageProto& page = article_proto.pages(i);
output << "Page " << i << std::endl;
output << "URL: " << page.url() << std::endl;
output << "Content: " << page.html() << std::endl;
if (page.has_debug_info() && page.debug_info().has_log())
output << "Log: " << page.debug_info().log() << std::endl;
+ if (page.has_pagination_info()) {
+ if (page.pagination_info().has_next_page()) {
+ output << "Next Page: " << page.pagination_info().next_page()
+ << std::endl;
+ }
+ if (page.pagination_info().has_prev_page()) {
+ output << "Prev Page: " << page.pagination_info().prev_page()
+ << std::endl;
+ }
+ }
}
return output.str();
}