summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorcjhopman@chromium.org <cjhopman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2014-06-12 22:02:28 +0000
committercjhopman@chromium.org <cjhopman@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2014-06-12 22:02:28 +0000
commitd10be7ee3903086680c0f31cfbf9c6207a253a69 (patch)
treec2df7f05b4cd3727cfc5b3ef4bf472538d15d4b4
parentab5f281c977f91e519c49faba93559cb4825f7b8 (diff)
downloadchromium_src-d10be7ee3903086680c0f31cfbf9c6207a253a69.zip
chromium_src-d10be7ee3903086680c0f31cfbf9c6207a253a69.tar.gz
chromium_src-d10be7ee3903086680c0f31cfbf9c6207a253a69.tar.bz2
Make content_extractor support a multiple-url request
This adds the --urls flag which accepts a space-separated list of urls to distill. This adds a pretty straightforward approach to having multiple of these requests happening at the same time. Once all requests are finished, they will be printed (in the order that they appeared in --urls). If printing binary, each serialized protobuf will be preceded by its size (as a little-endian uint32, via CodedOutputStream::WriteLittleEndian32). TBR=ben@ Review URL: https://codereview.chromium.org/276553002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@276819 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--components/dom_distiller/DEPS1
-rw-r--r--components/dom_distiller/standalone/content_extractor.cc147
2 files changed, 114 insertions, 34 deletions
diff --git a/components/dom_distiller/DEPS b/components/dom_distiller/DEPS
index e886236..b81b29f 100644
--- a/components/dom_distiller/DEPS
+++ b/components/dom_distiller/DEPS
@@ -1,4 +1,5 @@
include_rules = [
+ "+google", # For third_party/protobuf.
"+grit", # For generated headers.
"+jni",
"+sync/api",
diff --git a/components/dom_distiller/standalone/content_extractor.cc b/components/dom_distiller/standalone/content_extractor.cc
index e851a71..0b609ed 100644
--- a/components/dom_distiller/standalone/content_extractor.cc
+++ b/components/dom_distiller/standalone/content_extractor.cc
@@ -10,6 +10,7 @@
#include "base/path_service.h"
#include "base/run_loop.h"
#include "base/strings/string_number_conversions.h"
+#include "base/strings/string_split.h"
#include "components/dom_distiller/content/distiller_page_web_contents.h"
#include "components/dom_distiller/core/distiller.h"
#include "components/dom_distiller/core/dom_distiller_database.h"
@@ -22,6 +23,8 @@
#include "content/public/browser/browser_thread.h"
#include "content/public/test/content_browser_test.h"
#include "content/shell/browser/shell.h"
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
#include "net/dns/mock_host_resolver.h"
#include "third_party/dom_distiller_js/dom_distiller.pb.h"
#include "ui/base/resource/resource_bundle.h"
@@ -35,6 +38,9 @@ namespace {
// The url to distill.
const char* kUrlSwitch = "url";
+// A space-separated list of urls to distill.
+const char* kUrlsSwitch = "urls";
+
// Indicates that DNS resolution should be disabled for this test.
const char* kDisableDnsSwitch = "disable-dns";
@@ -51,6 +57,9 @@ const char* kExtractTextOnly = "extract-text-only";
// Indicates to include debug output.
const char* kDebugLevel = "debug-level";
+// Maximum number of concurrent started extractor requests.
+const int kMaxExtractorTasks = 8;
+
scoped_ptr<DomDistillerService> CreateDomDistillerService(
content::BrowserContext* context,
const base::FilePath& db_path) {
@@ -100,29 +109,30 @@ void AddComponentsResources() {
pak_file, ui::SCALE_FACTOR_NONE);
}
-void LogArticle(const DistilledArticleProto& article_proto) {
- std::stringstream output;
- if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
- output << article_proto.SerializeAsString();
- } else {
- output << "Article Title: " << article_proto.title() << std::endl;
- output << "# of pages: " << article_proto.pages_size() << std::endl;
- for (int i = 0; i < article_proto.pages_size(); ++i) {
- const DistilledPageProto& page = article_proto.pages(i);
- output << "Page " << i << std::endl;
- output << "URL: " << page.url() << std::endl;
- output << "Content: " << page.html() << std::endl;
- }
- }
+bool WriteProtobufWithSize(
+ const google::protobuf::MessageLite& message,
+ google::protobuf::io::ZeroCopyOutputStream* output_stream) {
+ google::protobuf::io::CodedOutputStream coded_output(output_stream);
+
+ // Write the size.
+ const int size = message.ByteSize();
+ coded_output.WriteLittleEndian32(size);
+ message.SerializeWithCachedSizes(&coded_output);
+ return !coded_output.HadError();
+}
- std::string data = output.str();
- if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
- base::FilePath filename =
- CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
- base::WriteFile(filename, data.c_str(), data.size());
- } else {
- VLOG(0) << data;
+std::string GetReadableArticleString(
+ const DistilledArticleProto& article_proto) {
+ std::stringstream output;
+ output << "Article Title: " << article_proto.title() << std::endl;
+ output << "# of pages: " << article_proto.pages_size() << std::endl;
+ for (int i = 0; i < article_proto.pages_size(); ++i) {
+ const DistilledPageProto& page = article_proto.pages(i);
+ output << "Page " << i << std::endl;
+ output << "URL: " << page.url() << std::endl;
+ output << "Content: " << page.html() << std::endl;
}
+ return output.str();
}
} // namespace
@@ -139,19 +149,34 @@ class ContentExtractionRequest : public ViewRequestDelegate {
return *article_proto_;
}
- static scoped_ptr<ContentExtractionRequest> CreateForCommandLine(
+ static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
const CommandLine& command_line) {
- GURL url;
+ ScopedVector<ContentExtractionRequest> requests;
if (command_line.HasSwitch(kUrlSwitch)) {
+ GURL url;
std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
url = GURL(url_string);
+ if (url.is_valid()) {
+ requests.push_back(new ContentExtractionRequest(url));
+ }
+ } else if (command_line.HasSwitch(kUrlsSwitch)) {
+ std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
+ std::vector<std::string> urls;
+ base::SplitString(urls_string, ' ', &urls);
+ for (size_t i = 0; i < urls.size(); ++i) {
+ GURL url(urls[i]);
+ if (url.is_valid()) {
+ requests.push_back(new ContentExtractionRequest(url));
+ } else {
+ ADD_FAILURE() << "Bad url";
+ }
+ }
}
- if (!url.is_valid()) {
+ if (requests.empty()) {
ADD_FAILURE() << "No valid url provided";
- return scoped_ptr<ContentExtractionRequest>();
}
- return scoped_ptr<ContentExtractionRequest>(
- new ContentExtractionRequest(url));
+
+ return requests.Pass();
}
private:
@@ -175,6 +200,15 @@ class ContentExtractionRequest : public ViewRequestDelegate {
};
class ContentExtractor : public ContentBrowserTest {
+ public:
+ ContentExtractor()
+ : pending_tasks_(0),
+ max_tasks_(kMaxExtractorTasks),
+ next_request_(0),
+ output_data_(),
+ protobuf_output_stream_(
+ new google::protobuf::io::StringOutputStream(&output_data_)) {}
+
// Change behavior of the default host resolver to avoid DNS lookup errors, so
// we can make network calls.
virtual void SetUpOnMainThread() OVERRIDE {
@@ -198,10 +232,18 @@ class ContentExtractor : public ContentBrowserTest {
service_ = CreateDomDistillerService(context,
db_dir_.path());
const CommandLine& command_line = *CommandLine::ForCurrentProcess();
- request_ = ContentExtractionRequest::CreateForCommandLine(command_line);
- request_->Start(
- service_.get(),
- base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
+ requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
+ PumpQueue();
+ }
+
+ void PumpQueue() {
+ while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
+ requests_[next_request_]->Start(
+ service_.get(),
+ base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
+ ++next_request_;
+ ++pending_tasks_;
+ }
}
private:
@@ -221,18 +263,55 @@ class ContentExtractor : public ContentBrowserTest {
mock_host_resolver_override_.reset();
}
+ void FinishRequest() {
+ --pending_tasks_;
+ if (next_request_ == requests_.size() && pending_tasks_ == 0) {
+ Finish();
+ } else {
+ PumpQueue();
+ }
+ }
+
+ void DoArticleOutput() {
+ for (size_t i = 0; i < requests_.size(); ++i) {
+ const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
+ if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
+ WriteProtobufWithSize(article, protobuf_output_stream_.get());
+ } else {
+ output_data_ += GetReadableArticleString(article) + "\n";
+ }
+ }
+
+ if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
+ base::FilePath filename =
+ CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
+ ASSERT_EQ(
+ (int)output_data_.size(),
+ base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
+ } else {
+ VLOG(0) << output_data_;
+ }
+ }
+
void Finish() {
- LogArticle(request_->GetArticleCopy());
- request_.reset();
+ DoArticleOutput();
+ requests_.clear();
service_.reset();
base::MessageLoop::current()->PostTask(
FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
}
+ size_t pending_tasks_;
+ size_t max_tasks_;
+ size_t next_request_;
+
base::ScopedTempDir db_dir_;
scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
scoped_ptr<DomDistillerService> service_;
- scoped_ptr<ContentExtractionRequest> request_;
+ ScopedVector<ContentExtractionRequest> requests_;
+
+ std::string output_data_;
+ scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
};
IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {