From 07e92a5667bfa8b18e3681edcbbdb7df2f7ae454 Mon Sep 17 00:00:00 2001 From: "cjhopman@chromium.org" Date: Fri, 23 May 2014 07:27:31 +0000 Subject: Pull DomDistillerOptions up to the DistillerFactory This allows the creator of the DistillerFactory to set the DomDistillerOptions for all pages distilled by Distillers from that factory. Add an option to the content_extractor to extract just the text from the page. DEPENDSON= https://codereview.chromium.org/270663005/ https://codereview.chromium.org/286453002/ Review URL: https://codereview.chromium.org/286583002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@272431 0039d316-1c4b-4281-b951-d872f2087c98 --- .../dom_distiller/dom_distiller_service_factory.cc | 3 +- components/dom_distiller.gypi | 1 + .../distiller_page_web_contents_browsertest.cc | 1 + components/dom_distiller/core/distiller.cc | 15 +++-- components/dom_distiller/core/distiller.h | 9 ++- components/dom_distiller/core/distiller_page.cc | 7 ++- components/dom_distiller/core/distiller_page.h | 9 ++- .../dom_distiller/core/distiller_unittest.cc | 69 ++++++++++++++++------ .../dom_distiller/standalone/content_extractor.cc | 10 +++- third_party/dom_distiller_js/dom_distiller_js.gyp | 2 +- 10 files changed, 92 insertions(+), 34 deletions(-) diff --git a/chrome/browser/dom_distiller/dom_distiller_service_factory.cc b/chrome/browser/dom_distiller/dom_distiller_service_factory.cc index 035b8f4..92ae35c 100644 --- a/chrome/browser/dom_distiller/dom_distiller_service_factory.cc +++ b/chrome/browser/dom_distiller/dom_distiller_service_factory.cc @@ -63,7 +63,8 @@ KeyedService* DomDistillerServiceFactory::BuildServiceInstanceFor( scoped_ptr distiller_url_fetcher_factory( new DistillerURLFetcherFactory(profile->GetRequestContext())); scoped_ptr distiller_factory( - new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass())); + new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), + dom_distiller::proto::DomDistillerOptions())); DomDistillerContextKeyedService* service = new DomDistillerContextKeyedService( diff --git a/components/dom_distiller.gypi b/components/dom_distiller.gypi index a86746f..092a26a 100644 --- a/components/dom_distiller.gypi +++ b/components/dom_distiller.gypi @@ -48,6 +48,7 @@ ], 'export_dependent_settings': [ 'distilled_page_proto', + '../third_party/dom_distiller_js/dom_distiller_js.gyp:dom_distiller_js_proto', ], 'sources': [ 'dom_distiller/android/component_jni_registrar.cc', diff --git a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc index 72b572f..5d1cd2a 100644 --- a/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc +++ b/components/dom_distiller/content/distiller_page_web_contents_browsertest.cc @@ -36,6 +36,7 @@ class DistillerPageWebContentsTest : public ContentBrowserTest { quit_closure_ = quit_closure; distiller_page_->DistillPage( embedded_test_server()->GetURL(url), + dom_distiller::proto::DomDistillerOptions(), base::Bind(&DistillerPageWebContentsTest::OnPageDistillationFinished, this)); } diff --git a/components/dom_distiller/core/distiller.cc b/components/dom_distiller/core/distiller.cc index 070e681..472b453 100644 --- a/components/dom_distiller/core/distiller.cc +++ b/components/dom_distiller/core/distiller.cc @@ -29,15 +29,17 @@ const size_t kMaxPagesInArticle = 32; namespace dom_distiller { DistillerFactoryImpl::DistillerFactoryImpl( - scoped_ptr distiller_url_fetcher_factory) - : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) { + scoped_ptr distiller_url_fetcher_factory, + const dom_distiller::proto::DomDistillerOptions& dom_distiller_options) + : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()), + dom_distiller_options_(dom_distiller_options) { } DistillerFactoryImpl::~DistillerFactoryImpl() {} scoped_ptr DistillerFactoryImpl::CreateDistiller() { - scoped_ptr distiller( - new DistillerImpl(*distiller_url_fetcher_factory_)); + scoped_ptr distiller(new DistillerImpl( + *distiller_url_fetcher_factory_, dom_distiller_options_)); return distiller.PassAs(); } @@ -46,8 +48,10 @@ DistillerImpl::DistilledPageData::DistilledPageData() {} DistillerImpl::DistilledPageData::~DistilledPageData() {} DistillerImpl::DistillerImpl( - const DistillerURLFetcherFactory& distiller_url_fetcher_factory) + const DistillerURLFetcherFactory& distiller_url_fetcher_factory, + const dom_distiller::proto::DomDistillerOptions& dom_distiller_options) : distiller_url_fetcher_factory_(distiller_url_fetcher_factory), + dom_distiller_options_(dom_distiller_options), max_pages_in_article_(kMaxPagesInArticle), destruction_allowed_(true), weak_factory_(this) { @@ -120,6 +124,7 @@ void DistillerImpl::DistillNextPage() { started_pages_index_[page_num] = pages_.size() - 1; distiller_page_->DistillPage( url, + dom_distiller_options_, base::Bind(&DistillerImpl::OnPageDistillationFinished, weak_factory_.GetWeakPtr(), page_num, diff --git a/components/dom_distiller/core/distiller.h b/components/dom_distiller/core/distiller.h index 106967b..648949e 100644 --- a/components/dom_distiller/core/distiller.h +++ b/components/dom_distiller/core/distiller.h @@ -56,19 +56,22 @@ class DistillerFactory { class DistillerFactoryImpl : public DistillerFactory { public: DistillerFactoryImpl( - scoped_ptr distiller_url_fetcher_factory); + scoped_ptr distiller_url_fetcher_factory, + const dom_distiller::proto::DomDistillerOptions& dom_distiller_options); virtual ~DistillerFactoryImpl(); virtual scoped_ptr CreateDistiller() OVERRIDE; private: scoped_ptr distiller_url_fetcher_factory_; + dom_distiller::proto::DomDistillerOptions dom_distiller_options_; }; // Distills a article from a page and associated pages. class DistillerImpl : public Distiller { public: DistillerImpl( - const DistillerURLFetcherFactory& distiller_url_fetcher_factory); + const DistillerURLFetcherFactory& distiller_url_fetcher_factory, + const dom_distiller::proto::DomDistillerOptions& dom_distiller_options); virtual ~DistillerImpl(); virtual void DistillPage( @@ -147,6 +150,8 @@ class DistillerImpl : public Distiller { const DistillerURLFetcherFactory& distiller_url_fetcher_factory_; scoped_ptr distiller_page_; + + dom_distiller::proto::DomDistillerOptions dom_distiller_options_; DistillationFinishedCallback finished_cb_; DistillationUpdateCallback update_cb_; diff --git a/components/dom_distiller/core/distiller_page.cc b/components/dom_distiller/core/distiller_page.cc index ceee2a8..a44531c 100644 --- a/components/dom_distiller/core/distiller_page.cc +++ b/components/dom_distiller/core/distiller_page.cc @@ -58,14 +58,15 @@ DistillerPage::DistillerPage() : ready_(true) {} DistillerPage::~DistillerPage() {} -void DistillerPage::DistillPage(const GURL& gurl, - const DistillerPageCallback& callback) { +void DistillerPage::DistillPage( + const GURL& gurl, + const dom_distiller::proto::DomDistillerOptions options, + const DistillerPageCallback& callback) { DCHECK(ready_); // It is only possible to distill one page at a time. |ready_| is reset when // the callback to OnDistillationDone happens. ready_ = false; distiller_page_callback_ = callback; - dom_distiller::proto::DomDistillerOptions options; DistillPageImpl(gurl, GetDistillerScriptWithOptions(options)); } diff --git a/components/dom_distiller/core/distiller_page.h b/components/dom_distiller/core/distiller_page.h index a30d5e7..429a676 100644 --- a/components/dom_distiller/core/distiller_page.h +++ b/components/dom_distiller/core/distiller_page.h @@ -11,6 +11,7 @@ #include "base/memory/scoped_ptr.h" #include "base/memory/weak_ptr.h" #include "base/values.h" +#include "third_party/dom_distiller_js/dom_distiller.pb.h" #include "url/gurl.h" namespace dom_distiller { @@ -44,9 +45,11 @@ class DistillerPage { // Loads a URL. |OnDistillationDone| is called when the load completes or // fails. May be called when the distiller is idle. Callers can assume that, - // for a given |url|, any DistillerPage implementation will extract the same - // content. - void DistillPage(const GURL& url, const DistillerPageCallback& callback); + // for a given |url| and |options|, any DistillerPage implementation will + // extract the same content. + void DistillPage(const GURL& url, + const dom_distiller::proto::DomDistillerOptions options, + const DistillerPageCallback& callback); // Called when the JavaScript execution completes. |page_url| is the url of // the distilled page. |value| contains data returned by the script. diff --git a/components/dom_distiller/core/distiller_unittest.cc b/components/dom_distiller/core/distiller_unittest.cc index 7157e71..109912f 100644 --- a/components/dom_distiller/core/distiller_unittest.cc +++ b/components/dom_distiller/core/distiller_unittest.cc @@ -12,6 +12,7 @@ #include "base/location.h" #include "base/memory/scoped_ptr.h" #include "base/message_loop/message_loop.h" +#include "base/path_service.h" #include "base/strings/string_number_conversions.h" #include "base/values.h" #include "components/dom_distiller/core/article_distillation_update.h" @@ -25,6 +26,7 @@ #include "testing/gtest/include/gtest/gtest.h" #include "third_party/dom_distiller_js/dom_distiller.pb.h" #include "third_party/dom_distiller_js/dom_distiller_json_converter.h" +#include "ui/base/resource/resource_bundle.h" using std::vector; using std::string; @@ -32,6 +34,9 @@ using ::testing::Invoke; using ::testing::Return; using ::testing::_; +using dom_distiller::proto::DomDistillerOptions; +using dom_distiller::proto::DomDistillerResult; + namespace { const char kTitle[] = "Title"; const char kContent[] = "Content"; @@ -51,7 +56,7 @@ scoped_ptr CreateDistilledValueReturnedFromJS( const vector& image_indices, const string& next_page_url, const string& prev_page_url = "") { - dom_distiller::proto::DomDistillerResult result; + DomDistillerResult result; result.set_title(title); result.mutable_distilled_content()->set_html(content); result.mutable_pagination_info()->set_next_page(next_page_url); @@ -181,6 +186,15 @@ void VerifyArticleProtoMatchesMultipageData( } } +void AddComponentsResources() { + base::FilePath pak_file; + base::FilePath pak_dir; + PathService::Get(base::DIR_MODULE, &pak_dir); + pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak")); + ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath( + pak_file, ui::SCALE_FACTOR_NONE); +} + } // namespace namespace dom_distiller { @@ -198,7 +212,7 @@ class TestDistillerURLFetcher : public DistillerURLFetcher { virtual void FetchURL(const string& url, const URLFetcherCallback& callback) OVERRIDE { - DCHECK(callback_.is_null()); + ASSERT_FALSE(callback.is_null()); url_ = url; callback_ = callback; if (!delay_fetch_) { @@ -208,6 +222,7 @@ class TestDistillerURLFetcher : public DistillerURLFetcher { void PostCallbackTask() { ASSERT_TRUE(base::MessageLoop::current()); + ASSERT_FALSE(callback_.is_null()); base::MessageLoop::current()->PostTask( FROM_HERE, base::Bind(callback_, responses_[url_])); } @@ -241,6 +256,10 @@ class DistillerTest : public testing::Test { public: virtual ~DistillerTest() {} + virtual void SetUp() OVERRIDE { + AddComponentsResources(); + } + void OnDistillArticleDone(scoped_ptr proto) { article_proto_ = proto.Pass(); } @@ -311,7 +330,8 @@ TEST_F(DistillerTest, DistillPage) { base::MessageLoopForUI loop; scoped_ptr result = CreateDistilledValueReturnedFromJS(kTitle, kContent, vector(), ""); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(kURL, CreateMockDistillerPage(result.get(), GURL(kURL)).Pass()); base::MessageLoop::current()->RunUntilIdle(); EXPECT_EQ(kTitle, article_proto_->title()); @@ -328,7 +348,8 @@ TEST_F(DistillerTest, DistillPageWithImages) { image_indices.push_back(1); scoped_ptr result = CreateDistilledValueReturnedFromJS(kTitle, kContent, image_indices, ""); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(kURL, CreateMockDistillerPage(result.get(), GURL(kURL)).Pass()); base::MessageLoop::current()->RunUntilIdle(); EXPECT_EQ(kTitle, article_proto_->title()); @@ -362,7 +383,8 @@ TEST_F(DistillerTest, DistillMultiplePages) { distiller_data->image_ids.push_back(image_indices); } - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage( distiller_data->page_urls[0], CreateMockDistillerPages(distiller_data.get(), kNumPages, 0).Pass()); @@ -377,7 +399,8 @@ TEST_F(DistillerTest, DistillLinkLoop) { // happen if javascript misparses a next page link. scoped_ptr result = CreateDistilledValueReturnedFromJS(kTitle, kContent, vector(), kURL); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(kURL, CreateMockDistillerPage(result.get(), GURL(kURL)).Pass()); base::MessageLoop::current()->RunUntilIdle(); EXPECT_EQ(kTitle, article_proto_->title()); @@ -404,7 +427,8 @@ TEST_F(DistillerTest, CheckMaxPageLimitExtraPage) { distiller_data->distilled_values.pop_back(); distiller_data->distilled_values.push_back(last_page_data.release()); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); distiller_->SetMaxNumPagesInArticle(kMaxPagesInArticle); @@ -423,7 +447,8 @@ TEST_F(DistillerTest, CheckMaxPageLimitExactLimit) { scoped_ptr distiller_data = CreateMultipageDistillerDataWithoutImages(kMaxPagesInArticle); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); // Check if distilling an article with exactly the page limit works. distiller_->SetMaxNumPagesInArticle(kMaxPagesInArticle); @@ -441,7 +466,8 @@ TEST_F(DistillerTest, SinglePageDistillationFailure) { base::MessageLoopForUI loop; // To simulate failure return a null value. scoped_ptr nullValue(base::Value::CreateNullValue()); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(kURL, CreateMockDistillerPage(nullValue.get(), GURL(kURL)).Pass()); base::MessageLoop::current()->RunUntilIdle(); @@ -464,7 +490,8 @@ TEST_F(DistillerTest, MultiplePagesDistillationFailure) { distiller_data->distilled_values.begin() + failed_page_num, base::Value::CreateNullValue()); // Expect only calls till the failed page number. - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(distiller_data->page_urls[0], CreateMockDistillerPages( distiller_data.get(), failed_page_num + 1, 0).Pass()); @@ -483,7 +510,8 @@ TEST_F(DistillerTest, DistillPreviousPage) { scoped_ptr distiller_data = CreateMultipageDistillerDataWithoutImages(kNumPages); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(distiller_data->page_urls[start_page_num], CreateMockDistillerPages( distiller_data.get(), kNumPages, start_page_num).Pass()); @@ -501,7 +529,8 @@ TEST_F(DistillerTest, IncrementalUpdates) { scoped_ptr distiller_data = CreateMultipageDistillerDataWithoutImages(kNumPages); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(distiller_data->page_urls[start_page_num], CreateMockDistillerPages( distiller_data.get(), kNumPages, start_page_num).Pass()); @@ -521,7 +550,8 @@ TEST_F(DistillerTest, IncrementalUpdatesDoNotDeleteFinalArticle) { scoped_ptr distiller_data = CreateMultipageDistillerDataWithoutImages(kNumPages); - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(distiller_data->page_urls[start_page_num], CreateMockDistillerPages( distiller_data.get(), kNumPages, start_page_num).Pass()); @@ -543,7 +573,8 @@ TEST_F(DistillerTest, DeletingArticleDoesNotInterfereWithUpdates) { // The page number of the article on which distillation starts. int start_page_num = 3; - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(distiller_data->page_urls[start_page_num], CreateMockDistillerPages( distiller_data.get(), kNumPages, start_page_num).Pass()); @@ -565,10 +596,11 @@ TEST_F(DistillerTest, CancelWithDelayedImageFetchCallback) { scoped_ptr distilled_value = CreateDistilledValueReturnedFromJS(kTitle, kContent, image_indices, ""); TestDistillerURLFetcher* delayed_fetcher = new TestDistillerURLFetcher(true); - MockDistillerURLFetcherFactory url_fetcher_factory; - EXPECT_CALL(url_fetcher_factory, CreateDistillerURLFetcher()) + MockDistillerURLFetcherFactory mock_url_fetcher_factory; + EXPECT_CALL(mock_url_fetcher_factory, CreateDistillerURLFetcher()) .WillOnce(Return(delayed_fetcher)); - distiller_.reset(new DistillerImpl(url_fetcher_factory)); + distiller_.reset( + new DistillerImpl(mock_url_fetcher_factory, DomDistillerOptions())); DistillPage( kURL, CreateMockDistillerPage(distilled_value.get(), GURL(kURL)).Pass()); base::MessageLoop::current()->RunUntilIdle(); @@ -585,7 +617,8 @@ TEST_F(DistillerTest, CancelWithDelayedJSCallback) { scoped_ptr distilled_value = CreateDistilledValueReturnedFromJS(kTitle, kContent, vector(), ""); MockDistillerPage* distiller_page = NULL; - distiller_.reset(new DistillerImpl(url_fetcher_factory_)); + distiller_.reset( + new DistillerImpl(url_fetcher_factory_, DomDistillerOptions())); DistillPage(kURL, CreateMockDistillerPageWithPendingJSCallback(&distiller_page, GURL(kURL))); diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc index 760b165..dd69495 100644 --- a/components/dom_distiller/standalone/content_extractor.cc +++ b/components/dom_distiller/standalone/content_extractor.cc @@ -22,6 +22,7 @@ #include "content/public/test/content_browser_test.h" #include "content/shell/browser/shell.h" #include "net/dns/mock_host_resolver.h" +#include "third_party/dom_distiller_js/dom_distiller.pb.h" #include "ui/base/resource/resource_bundle.h" using content::ContentBrowserTest; @@ -43,6 +44,8 @@ const char* kOutputFile = "output-file"; // output. const char* kShouldOutputBinary = "output-binary"; +const char* kExtractTextOnly = "extract-text-only"; + scoped_ptr CreateDomDistillerService( content::BrowserContext* context, const base::FilePath& db_path) { @@ -61,8 +64,13 @@ scoped_ptr CreateDomDistillerService( new DistillerPageWebContentsFactory(context)); scoped_ptr distiller_url_fetcher_factory( new DistillerURLFetcherFactory(context->GetRequestContext())); + + dom_distiller::proto::DomDistillerOptions options; + if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) { + options.set_extract_text_only(true); + } scoped_ptr distiller_factory( - new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass())); + new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options)); return scoped_ptr(new DomDistillerService( dom_distiller_store.PassAs(), diff --git a/third_party/dom_distiller_js/dom_distiller_js.gyp b/third_party/dom_distiller_js/dom_distiller_js.gyp index 1e6641b..aaa96ed 100644 --- a/third_party/dom_distiller_js/dom_distiller_js.gyp +++ b/third_party/dom_distiller_js/dom_distiller_js.gyp @@ -11,7 +11,7 @@ 'proto_in_dir': 'package/proto', 'proto_out_dir': 'third_party/dom_distiller_js', }, - 'all_dependent_settings': { + 'direct_dependent_settings': { 'include_dirs': ['package/proto_gen'], }, 'includes': [ '../../build/protoc.gypi', ], -- cgit v1.1