diff options
author | cjhopman@chromium.org <cjhopman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-06-12 22:02:28 +0000 |
---|---|---|
committer | cjhopman@chromium.org <cjhopman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2014-06-12 22:02:28 +0000 |
commit | d10be7ee3903086680c0f31cfbf9c6207a253a69 (patch) | |
tree | c2df7f05b4cd3727cfc5b3ef4bf472538d15d4b4 | |
parent | ab5f281c977f91e519c49faba93559cb4825f7b8 (diff) | |
download | chromium_src-d10be7ee3903086680c0f31cfbf9c6207a253a69.zip chromium_src-d10be7ee3903086680c0f31cfbf9c6207a253a69.tar.gz chromium_src-d10be7ee3903086680c0f31cfbf9c6207a253a69.tar.bz2 |
Make content_extractor support multiple-url requests
This adds the --urls flag, which accepts a space-separated list of URLs
to distill.
This adds a straightforward mechanism for running several of these
requests concurrently (up to kMaxExtractorTasks at a time).
Once all requests are finished, they will be printed (in the order in
which they appeared in --urls). If printing binary, each serialized
protobuf will be preceded by its size (as a little-endian uint32,
written via WriteLittleEndian32 — not a varint32).
TBR=ben@
Review URL: https://codereview.chromium.org/276553002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276819 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | components/dom_distiller/DEPS | 1 | ||||
-rw-r--r-- | components/dom_distiller/standalone/content_extractor.cc | 147 |
2 files changed, 114 insertions, 34 deletions
diff --git a/components/dom_distiller/DEPS b/components/dom_distiller/DEPS index e886236..b81b29f 100644 --- a/components/dom_distiller/DEPS +++ b/components/dom_distiller/DEPS @@ -1,4 +1,5 @@ include_rules = [ + "+google", # For third_party/protobuf. "+grit", # For generated headers. "+jni", "+sync/api", diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc index e851a71..0b609ed 100644 --- a/components/dom_distiller/standalone/content_extractor.cc +++ b/components/dom_distiller/standalone/content_extractor.cc @@ -10,6 +10,7 @@ #include "base/path_service.h" #include "base/run_loop.h" #include "base/strings/string_number_conversions.h" +#include "base/strings/string_split.h" #include "components/dom_distiller/content/distiller_page_web_contents.h" #include "components/dom_distiller/core/distiller.h" #include "components/dom_distiller/core/dom_distiller_database.h" @@ -22,6 +23,8 @@ #include "content/public/browser/browser_thread.h" #include "content/public/test/content_browser_test.h" #include "content/shell/browser/shell.h" +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream_impl_lite.h" #include "net/dns/mock_host_resolver.h" #include "third_party/dom_distiller_js/dom_distiller.pb.h" #include "ui/base/resource/resource_bundle.h" @@ -35,6 +38,9 @@ namespace { // The url to distill. const char* kUrlSwitch = "url"; +// A space-separated list of urls to distill. +const char* kUrlsSwitch = "urls"; + // Indicates that DNS resolution should be disabled for this test. const char* kDisableDnsSwitch = "disable-dns"; @@ -51,6 +57,9 @@ const char* kExtractTextOnly = "extract-text-only"; // Indicates to include debug output. const char* kDebugLevel = "debug-level"; +// Maximum number of concurrent started extractor requests. 
+const int kMaxExtractorTasks = 8; + scoped_ptr<DomDistillerService> CreateDomDistillerService( content::BrowserContext* context, const base::FilePath& db_path) { @@ -100,29 +109,30 @@ void AddComponentsResources() { pak_file, ui::SCALE_FACTOR_NONE); } -void LogArticle(const DistilledArticleProto& article_proto) { - std::stringstream output; - if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) { - output << article_proto.SerializeAsString(); - } else { - output << "Article Title: " << article_proto.title() << std::endl; - output << "# of pages: " << article_proto.pages_size() << std::endl; - for (int i = 0; i < article_proto.pages_size(); ++i) { - const DistilledPageProto& page = article_proto.pages(i); - output << "Page " << i << std::endl; - output << "URL: " << page.url() << std::endl; - output << "Content: " << page.html() << std::endl; - } - } +bool WriteProtobufWithSize( + const google::protobuf::MessageLite& message, + google::protobuf::io::ZeroCopyOutputStream* output_stream) { + google::protobuf::io::CodedOutputStream coded_output(output_stream); + + // Write the size. 
+ const int size = message.ByteSize(); + coded_output.WriteLittleEndian32(size); + message.SerializeWithCachedSizes(&coded_output); + return !coded_output.HadError(); +} - std::string data = output.str(); - if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) { - base::FilePath filename = - CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile); - base::WriteFile(filename, data.c_str(), data.size()); - } else { - VLOG(0) << data; +std::string GetReadableArticleString( + const DistilledArticleProto& article_proto) { + std::stringstream output; + output << "Article Title: " << article_proto.title() << std::endl; + output << "# of pages: " << article_proto.pages_size() << std::endl; + for (int i = 0; i < article_proto.pages_size(); ++i) { + const DistilledPageProto& page = article_proto.pages(i); + output << "Page " << i << std::endl; + output << "URL: " << page.url() << std::endl; + output << "Content: " << page.html() << std::endl; } + return output.str(); } } // namespace @@ -139,19 +149,34 @@ class ContentExtractionRequest : public ViewRequestDelegate { return *article_proto_; } - static scoped_ptr<ContentExtractionRequest> CreateForCommandLine( + static ScopedVector<ContentExtractionRequest> CreateForCommandLine( const CommandLine& command_line) { - GURL url; + ScopedVector<ContentExtractionRequest> requests; if (command_line.HasSwitch(kUrlSwitch)) { + GURL url; std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch); url = GURL(url_string); + if (url.is_valid()) { + requests.push_back(new ContentExtractionRequest(url)); + } + } else if (command_line.HasSwitch(kUrlsSwitch)) { + std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch); + std::vector<std::string> urls; + base::SplitString(urls_string, ' ', &urls); + for (size_t i = 0; i < urls.size(); ++i) { + GURL url(urls[i]); + if (url.is_valid()) { + requests.push_back(new ContentExtractionRequest(url)); + } else { + ADD_FAILURE() << "Bad url"; + } + } } - if 
(!url.is_valid()) { + if (requests.empty()) { ADD_FAILURE() << "No valid url provided"; - return scoped_ptr<ContentExtractionRequest>(); } - return scoped_ptr<ContentExtractionRequest>( - new ContentExtractionRequest(url)); + + return requests.Pass(); } private: @@ -175,6 +200,15 @@ class ContentExtractionRequest : public ViewRequestDelegate { }; class ContentExtractor : public ContentBrowserTest { + public: + ContentExtractor() + : pending_tasks_(0), + max_tasks_(kMaxExtractorTasks), + next_request_(0), + output_data_(), + protobuf_output_stream_( + new google::protobuf::io::StringOutputStream(&output_data_)) {} + // Change behavior of the default host resolver to avoid DNS lookup errors, so // we can make network calls. virtual void SetUpOnMainThread() OVERRIDE { @@ -198,10 +232,18 @@ class ContentExtractor : public ContentBrowserTest { service_ = CreateDomDistillerService(context, db_dir_.path()); const CommandLine& command_line = *CommandLine::ForCurrentProcess(); - request_ = ContentExtractionRequest::CreateForCommandLine(command_line); - request_->Start( - service_.get(), - base::Bind(&ContentExtractor::Finish, base::Unretained(this))); + requests_ = ContentExtractionRequest::CreateForCommandLine(command_line); + PumpQueue(); + } + + void PumpQueue() { + while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) { + requests_[next_request_]->Start( + service_.get(), + base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this))); + ++next_request_; + ++pending_tasks_; + } } private: @@ -221,18 +263,55 @@ class ContentExtractor : public ContentBrowserTest { mock_host_resolver_override_.reset(); } + void FinishRequest() { + --pending_tasks_; + if (next_request_ == requests_.size() && pending_tasks_ == 0) { + Finish(); + } else { + PumpQueue(); + } + } + + void DoArticleOutput() { + for (size_t i = 0; i < requests_.size(); ++i) { + const DistilledArticleProto& article = requests_[i]->GetArticleCopy(); + if 
(CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) { + WriteProtobufWithSize(article, protobuf_output_stream_.get()); + } else { + output_data_ += GetReadableArticleString(article) + "\n"; + } + } + + if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) { + base::FilePath filename = + CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile); + ASSERT_EQ( + (int)output_data_.size(), + base::WriteFile(filename, output_data_.c_str(), output_data_.size())); + } else { + VLOG(0) << output_data_; + } + } + void Finish() { - LogArticle(request_->GetArticleCopy()); - request_.reset(); + DoArticleOutput(); + requests_.clear(); service_.reset(); base::MessageLoop::current()->PostTask( FROM_HERE, base::MessageLoop::QuitWhenIdleClosure()); } + size_t pending_tasks_; + size_t max_tasks_; + size_t next_request_; + base::ScopedTempDir db_dir_; scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_; scoped_ptr<DomDistillerService> service_; - scoped_ptr<ContentExtractionRequest> request_; + ScopedVector<ContentExtractionRequest> requests_; + + std::string output_data_; + scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_; }; IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) { |