diff options
-rw-r--r-- | chrome/browser/download/save_page_browsertest.cc | 5 | ||||
-rw-r--r-- | content/browser/download/mhtml_generation_manager.cc | 98 | ||||
-rw-r--r-- | content/browser/download/mhtml_generation_manager.h | 11 | ||||
-rw-r--r-- | content/browser/frame_host/render_frame_host_impl.cc | 8 | ||||
-rw-r--r-- | content/browser/frame_host/render_frame_host_impl.h | 6 | ||||
-rw-r--r-- | content/common/frame_messages.h | 72 | ||||
-rw-r--r-- | content/renderer/render_frame_impl.cc | 86 | ||||
-rw-r--r-- | content/renderer/render_frame_impl.h | 10 | ||||
-rw-r--r-- | third_party/WebKit/Source/core/page/PageSerializer.cpp | 32 | ||||
-rw-r--r-- | third_party/WebKit/Source/core/page/PageSerializer.h | 28 | ||||
-rw-r--r-- | third_party/WebKit/Source/web/WebPageSerializer.cpp | 55 | ||||
-rw-r--r-- | third_party/WebKit/Source/web/tests/PageSerializerTest.cpp | 8 | ||||
-rw-r--r-- | third_party/WebKit/public/web/WebPageSerializer.h | 26 |
13 files changed, 288 insertions, 157 deletions
diff --git a/chrome/browser/download/save_page_browsertest.cc b/chrome/browser/download/save_page_browsertest.cc index a7e0a54..fbbe0e3 100644 --- a/chrome/browser/download/save_page_browsertest.cc +++ b/chrome/browser/download/save_page_browsertest.cc @@ -908,10 +908,7 @@ IN_PROC_BROWSER_TEST_F(SavePageSitePerProcessBrowserTest, SaveAsMHTML) { count++; pos++; } - // TODO(lukasza): Need to dedupe savable resources (i.e. 1.png) across frames. - // This will be fixed by crrev.com/1417323006. - // EXPECT_EQ(1, count) - // << "Verify number of image/png parts in the mhtml output"; + EXPECT_EQ(1, count) << "Verify number of image/png parts in the mhtml output"; } // Test suite that verifies that the frame tree "looks" the same before diff --git a/content/browser/download/mhtml_generation_manager.cc b/content/browser/download/mhtml_generation_manager.cc index 9937db7..c1cad61 100644 --- a/content/browser/download/mhtml_generation_manager.cc +++ b/content/browser/download/mhtml_generation_manager.cc @@ -18,12 +18,14 @@ #include "base/strings/string_number_conversions.h" #include "base/strings/stringprintf.h" #include "content/browser/frame_host/frame_tree_node.h" +#include "content/browser/frame_host/render_frame_host_impl.h" #include "content/common/frame_messages.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/render_frame_host.h" #include "content/public/browser/render_process_host.h" #include "content/public/browser/render_process_host_observer.h" #include "content/public/browser/web_contents.h" +#include "url/gurl.h" namespace content { @@ -38,14 +40,25 @@ class MHTMLGenerationManager::Job : public RenderProcessHostObserver { GenerateMHTMLCallback callback() const { return callback_; } + // Handler for FrameHostMsg_SerializeAsMHTMLResponse (a notification from the + // renderer that the MHTML generation for previous frame has finished). + // Returns |true| upon success; |false| otherwise. + bool OnSerializeAsMHTMLResponse( + RenderFrameHostImpl* sender, + const std::set<std::string>& digests_of_uris_of_serialized_resources); + // Sends IPC to the renderer, asking for MHTML generation of the next frame. // // Returns true if the message was sent successfully; false otherwise. bool SendToNextRenderFrame(); // Indicates if more calls to SendToNextRenderFrame are needed. - bool HasMoreFramesToProcess() const { - return !pending_frame_tree_node_ids_.empty(); + bool IsDone() const { + bool waiting_for_response_from_renderer = + frame_tree_node_id_of_busy_frame_ != + FrameTreeNode::kFrameTreeNodeInvalidId; + bool no_more_requests_to_send = pending_frame_tree_node_ids_.empty(); + return !waiting_for_response_from_renderer && no_more_requests_to_send; } // Close the file on the file thread and respond back on the UI thread with @@ -73,12 +86,17 @@ class MHTMLGenerationManager::Job : public RenderProcessHostObserver { // See also MHTMLGenerationManager::id_to_job_ map. int job_id_; + // The IDs of frames that still need to be processed. + std::queue<int> pending_frame_tree_node_ids_; + + // Identifies a frame to which we've sent FrameMsg_SerializeAsMHTML but for + // which we didn't yet process FrameHostMsg_SerializeAsMHTMLResponse via + // OnSerializeAsMHTMLResponse. + int frame_tree_node_id_of_busy_frame_; + // The handle to the file the MHTML is saved to for the browser process. base::File browser_file_; - // The IDs of frames we still need to process. - std::queue<int> pending_frame_tree_node_ids_; - // Map from frames into content ids (see WebPageSerializer::generateMHTMLParts // for more details about what "content ids" are and how they are used). std::map<int, std::string> frame_tree_node_to_content_id_; @@ -86,6 +104,10 @@ class MHTMLGenerationManager::Job : public RenderProcessHostObserver { // MIME multipart boundary to use in the MHTML doc. std::string mhtml_boundary_marker_; + // Digests of URIs of already generated MHTML parts. + std::set<std::string> digests_of_already_serialized_uris_; + std::string salt_; + // The callback to call once generation is complete. GenerateMHTMLCallback callback_; @@ -100,7 +122,9 @@ MHTMLGenerationManager::Job::Job(int job_id, WebContents* web_contents, GenerateMHTMLCallback callback) : job_id_(job_id), + frame_tree_node_id_of_busy_frame_(FrameTreeNode::kFrameTreeNodeInvalidId), mhtml_boundary_marker_(GenerateMHTMLBoundaryMarker()), + salt_(base::GenerateGUID()), callback_(callback), observed_renderer_process_host_(this) { DCHECK_CURRENTLY_ON(BrowserThread::UI); @@ -144,9 +168,13 @@ bool MHTMLGenerationManager::Job::SendToNextRenderFrame() { DCHECK(browser_file_.IsValid()); DCHECK_LT(0u, pending_frame_tree_node_ids_.size()); + FrameMsg_SerializeAsMHTML_Params ipc_params; + ipc_params.job_id = job_id_; + ipc_params.mhtml_boundary_marker = mhtml_boundary_marker_; + int frame_tree_node_id = pending_frame_tree_node_ids_.front(); pending_frame_tree_node_ids_.pop(); - bool is_last_frame = pending_frame_tree_node_ids_.empty(); + ipc_params.is_last_frame = pending_frame_tree_node_ids_.empty(); FrameTreeNode* ftn = FrameTreeNode::GloballyFindByID(frame_tree_node_id); if (!ftn) // The contents went away. @@ -157,12 +185,21 @@ bool MHTMLGenerationManager::Job::SendToNextRenderFrame() { observed_renderer_process_host_.RemoveAll(); observed_renderer_process_host_.Add(rfh->GetProcess()); - IPC::PlatformFileForTransit renderer_file = IPC::GetFileHandleForProcess( + // Tell the renderer to skip (= deduplicate) already covered MHTML parts. + ipc_params.salt = salt_; + ipc_params.digests_of_uris_to_skip = digests_of_already_serialized_uris_; + + ipc_params.destination_file = IPC::GetFileHandleForProcess( browser_file_.GetPlatformFile(), rfh->GetProcess()->GetHandle(), false); // |close_source_handle|. - rfh->Send(new FrameMsg_SerializeAsMHTML( - rfh->GetRoutingID(), job_id_, renderer_file, mhtml_boundary_marker_, - CreateFrameRoutingIdToContentId(rfh->GetSiteInstance()), is_last_frame)); + ipc_params.frame_routing_id_to_content_id = + CreateFrameRoutingIdToContentId(rfh->GetSiteInstance()); + + // Send the IPC asking the renderer to serialize the frame. + DCHECK_EQ(FrameTreeNode::kFrameTreeNodeInvalidId, + frame_tree_node_id_of_busy_frame_); + frame_tree_node_id_of_busy_frame_ = frame_tree_node_id; + rfh->Send(new FrameMsg_SerializeAsMHTML(rfh->GetRoutingID(), ipc_params)); return true; } @@ -207,6 +244,31 @@ void MHTMLGenerationManager::Job::CloseFile( callback); } +bool MHTMLGenerationManager::Job::OnSerializeAsMHTMLResponse( + RenderFrameHostImpl* sender, + const std::set<std::string>& digests_of_uris_of_serialized_resources) { + // Sanitize renderer input / reject unexpected messages. + int sender_id = sender->frame_tree_node()->frame_tree_node_id(); + if (sender_id != frame_tree_node_id_of_busy_frame_) { + NOTREACHED(); + return false; // Report failure. + } + frame_tree_node_id_of_busy_frame_ = FrameTreeNode::kFrameTreeNodeInvalidId; + + // Renderer should be deduping resources with the same uris. + DCHECK_EQ(0u, base::STLSetIntersection<std::set<std::string>>( + digests_of_already_serialized_uris_, + digests_of_uris_of_serialized_resources).size()); + digests_of_already_serialized_uris_.insert( + digests_of_uris_of_serialized_resources.begin(), + digests_of_uris_of_serialized_resources.end()); + + if (pending_frame_tree_node_ids_.empty()) + return true; // Report success. + + return SendToNextRenderFrame(); +} + // static int64_t MHTMLGenerationManager::Job::CloseFileOnFileThread(base::File file) { DCHECK_CURRENTLY_ON(BrowserThread::FILE); @@ -260,9 +322,11 @@ void MHTMLGenerationManager::SaveMHTML(WebContents* web_contents, job_id)); } -void MHTMLGenerationManager::OnSavedFrameAsMHTML( +void MHTMLGenerationManager::OnSerializeAsMHTMLResponse( + RenderFrameHostImpl* sender, int job_id, - bool mhtml_generation_in_renderer_succeeded) { + bool mhtml_generation_in_renderer_succeeded, + const std::set<std::string>& digests_of_uris_of_serialized_resources) { DCHECK_CURRENTLY_ON(BrowserThread::UI); if (!mhtml_generation_in_renderer_succeeded) { @@ -274,14 +338,14 @@ void MHTMLGenerationManager::OnSavedFrameAsMHTML( if (!job) return; - if (job->HasMoreFramesToProcess()) { - if (!job->SendToNextRenderFrame()) { - JobFinished(job_id, JobStatus::FAILURE); - } + if (!job->OnSerializeAsMHTMLResponse( + sender, digests_of_uris_of_serialized_resources)) { + JobFinished(job_id, JobStatus::FAILURE); return; } - JobFinished(job_id, JobStatus::SUCCESS); + if (job->IsDone()) + JobFinished(job_id, JobStatus::SUCCESS); } // static diff --git a/content/browser/download/mhtml_generation_manager.h b/content/browser/download/mhtml_generation_manager.h index e697e13..b418e54 100644 --- a/content/browser/download/mhtml_generation_manager.h +++ b/content/browser/download/mhtml_generation_manager.h @@ -8,12 +8,15 @@ #include <stdint.h> #include <map> +#include <set> +#include <string> #include "base/files/file.h" #include "base/macros.h" #include "base/memory/singleton.h" #include "base/process/process.h" #include "ipc/ipc_platform_file.h" +#include "url/gurl.h" namespace base { class FilePath; @@ -21,6 +24,7 @@ class FilePath; namespace content { +class RenderFrameHostImpl; class WebContents; // The class and all of its members live on the UI thread. Only static methods @@ -42,8 +46,11 @@ class MHTMLGenerationManager { // Handler for FrameHostMsg_SerializeAsMHTMLResponse (a notification from the // renderer that the MHTML generation finished for a single frame). - void OnSavedFrameAsMHTML(int job_id, - bool mhtml_generation_in_renderer_succeeded); + void OnSerializeAsMHTMLResponse( + RenderFrameHostImpl* sender, + int job_id, + bool mhtml_generation_in_renderer_succeeded, + const std::set<std::string>& digests_of_uris_of_serialized_resources); private: friend struct base::DefaultSingletonTraits<MHTMLGenerationManager>; diff --git a/content/browser/frame_host/render_frame_host_impl.cc b/content/browser/frame_host/render_frame_host_impl.cc index a74c597..1834494 100644 --- a/content/browser/frame_host/render_frame_host_impl.cc +++ b/content/browser/frame_host/render_frame_host_impl.cc @@ -1714,8 +1714,12 @@ void RenderFrameHostImpl::OnDidChangeLoadProgress(double load_progress) { frame_tree_node_->DidChangeLoadProgress(load_progress); } -void RenderFrameHostImpl::OnSerializeAsMHTMLResponse(int job_id, bool success) { - MHTMLGenerationManager::GetInstance()->OnSavedFrameAsMHTML(job_id, success); +void RenderFrameHostImpl::OnSerializeAsMHTMLResponse( + int job_id, + bool success, + const std::set<std::string>& digests_of_uris_of_serialized_resources) { + MHTMLGenerationManager::GetInstance()->OnSerializeAsMHTMLResponse( + this, job_id, success, digests_of_uris_of_serialized_resources); } #if defined(OS_MACOSX) || defined(OS_ANDROID) diff --git a/content/browser/frame_host/render_frame_host_impl.h b/content/browser/frame_host/render_frame_host_impl.h index cc190ac..07f78db 100644 --- a/content/browser/frame_host/render_frame_host_impl.h +++ b/content/browser/frame_host/render_frame_host_impl.h @@ -9,6 +9,7 @@ #include <stdint.h> #include <map> +#include <set> #include <string> #include <vector> @@ -620,7 +621,10 @@ class CONTENT_EXPORT RenderFrameHostImpl void OnDidStartLoading(bool to_different_document); void OnDidStopLoading(); void OnDidChangeLoadProgress(double load_progress); - void OnSerializeAsMHTMLResponse(int job_id, bool success); + void OnSerializeAsMHTMLResponse( + int job_id, + bool success, + const std::set<std::string>& digests_of_uris_of_serialized_resources); #if defined(OS_MACOSX) || defined(OS_ANDROID) void OnShowPopup(const FrameHostMsg_ShowPopup_Params& params); diff --git a/content/common/frame_messages.h b/content/common/frame_messages.h index 561e0e9..59f3284 100644 --- a/content/common/frame_messages.h +++ b/content/common/frame_messages.h @@ -8,6 +8,11 @@ #include <stddef.h> #include <stdint.h> +#include <map> +#include <set> +#include <string> +#include <vector> + #include "build/build_config.h" #include "cc/surfaces/surface_id.h" #include "cc/surfaces/surface_sequence.h" @@ -453,6 +458,38 @@ IPC_STRUCT_TRAITS_BEGIN(content::SavableSubframe) IPC_STRUCT_TRAITS_MEMBER(routing_id) IPC_STRUCT_TRAITS_END() +IPC_STRUCT_BEGIN(FrameMsg_SerializeAsMHTML_Params) + // Job id - used to match responses to requests. + IPC_STRUCT_MEMBER(int, job_id) + + // Destination file handle. + IPC_STRUCT_MEMBER(IPC::PlatformFileForTransit, destination_file) + + // MHTML boundary marker / MIME multipart boundary maker. The same + // |mhtml_boundary_marker| should be used for serialization of each frame. + IPC_STRUCT_MEMBER(std::string, mhtml_boundary_marker) + + // Frame to content-id map. + // Keys are routing ids of either RenderFrames or RenderFrameProxies. + // Values are MHTML content-ids - see WebPageSerializer::generateMHTMLParts. + IPC_STRUCT_MEMBER(FrameMsg_SerializeAsMHTML_FrameRoutingIdToContentIdMap, + frame_routing_id_to_content_id) + + // |digests_of_uris_to_skip| contains digests of uris of MHTML parts that + // should be skipped. This helps deduplicate mhtml parts across frames. + // SECURITY NOTE: Sha256 digests (rather than uris) are used to prevent + // disclosing uris to other renderer processes; the digests should be + // generated using SHA256HashString function from crypto/sha2.h and hashing + // |salt + url.spec()|. + IPC_STRUCT_MEMBER(std::set<std::string>, digests_of_uris_to_skip) + + // Salt used for |digests_of_uris_to_skip|. + IPC_STRUCT_MEMBER(std::string, salt) + + // If |is_last_frame| is true, then an MHTML footer will be generated. + IPC_STRUCT_MEMBER(bool, is_last_frame) +IPC_STRUCT_END() + #if defined(OS_MACOSX) || defined(OS_ANDROID) // This message is used for supporting popup menus on Mac OS X and Android using // native controls. See the FrameHostMsg_ShowPopup message. @@ -752,28 +789,11 @@ IPC_MESSAGE_ROUTED1(FrameMsg_GetSerializedHtmlWithLocalLinks, FrameMsg_GetSerializedHtmlWithLocalLinks_Map) // Serialize target frame and its resources into MHTML and write it into the -// provided destination file handle. -// -// When starting generation of a new MHTML document, one needs to start by -// sending FrameMsg_SerializeAsMHTML for the *main* frame (main frame needs to -// be the first part in the MHTML document + main frame will trigger generation -// of the MHTML header). -// -// The same |mhtml_boundary_marker| should be used for serialization of each -// frame (this string will be used as a mime multipart boundary within the mhtml -// document). -// -// For more details about frame to content id map please see -// WebPageSerializer::generateMHTMLParts method. -// -// |is_last_frame| controls whether the serializer in the renderer will -// emit the MHTML footer. -IPC_MESSAGE_ROUTED5(FrameMsg_SerializeAsMHTML, - int /* job_id (used to match responses to requests) */, - IPC::PlatformFileForTransit /* destination file handle */, - std::string /* mhtml boundary marker */, - FrameMsg_SerializeAsMHTML_FrameRoutingIdToContentIdMap, - bool /* is last frame */) +// provided destination file handle. Note that when serializing multiple +// frames, one needs to serialize the *main* frame first (the main frame +// needs to go first according to RFC2557 + the main frame will trigger +// generation of the MHTML header). +IPC_MESSAGE_ROUTED1(FrameMsg_SerializeAsMHTML, FrameMsg_SerializeAsMHTML_Params) IPC_MESSAGE_ROUTED1(FrameMsg_SetFrameOwnerProperties, blink::WebFrameOwnerProperties /* frame_owner_properties */) @@ -1319,9 +1339,11 @@ IPC_MESSAGE_ROUTED2(FrameHostMsg_SerializedHtmlWithLocalLinksResponse, bool /* end of data? */) // Response to FrameMsg_SerializeAsMHTML. -IPC_MESSAGE_ROUTED2(FrameHostMsg_SerializeAsMHTMLResponse, - int /* job_id (used to match responses to requests) */, - bool /* true if success, false if error */) +IPC_MESSAGE_ROUTED3( + FrameHostMsg_SerializeAsMHTMLResponse, + int /* job_id (used to match responses to requests) */, + bool /* true if success, false if error */, + std::set<std::string> /* digests of uris of serialized resources */) // Sent when the renderer updates hint for importance of a tab. IPC_MESSAGE_ROUTED1(FrameHostMsg_UpdatePageImportanceSignals, diff --git a/content/renderer/render_frame_impl.cc b/content/renderer/render_frame_impl.cc index 180be85..a6d24f2 100644 --- a/content/renderer/render_frame_impl.cc +++ b/content/renderer/render_frame_impl.cc @@ -17,10 +17,12 @@ #include "base/files/file.h" #include "base/i18n/char_iterator.h" #include "base/logging.h" +#include "base/macros.h" #include "base/memory/shared_memory.h" #include "base/memory/weak_ptr.h" #include "base/metrics/histogram.h" #include "base/process/process.h" +#include "base/stl_util.h" #include "base/strings/string16.h" #include "base/strings/utf_string_conversions.h" #include "base/thread_task_runner_handle.h" @@ -127,6 +129,7 @@ #include "content/renderer/web_frame_utils.h" #include "content/renderer/web_ui_extension.h" #include "content/renderer/websharedworker_proxy.h" +#include "crypto/sha2.h" #include "gin/modules/module_registry.h" #include "media/audio/audio_output_device.h" #include "media/base/audio_renderer_mixer_input.h" @@ -580,6 +583,55 @@ WebString ConvertRelativePathToHtmlAttribute(const base::FilePath& path) { path.NormalizePathSeparatorsTo(FILE_PATH_LITERAL('/')).AsUTF8Unsafe()); } +// Implementation of WebPageSerializer::MHTMLPartsGenerationDelegate that +// 1. Bases shouldSkipResource and getContentID responses on contents of +// FrameMsg_SerializeAsMHTML_Params. +// 2. Stores digests of urls of serialized resources (i.e. urls reported via +// shouldSkipResource) into |digests_of_uris_of_serialized_resources| passed +// to the constructor. +class MHTMLPartsGenerationDelegate + : public WebPageSerializer::MHTMLPartsGenerationDelegate { + public: + MHTMLPartsGenerationDelegate( + const FrameMsg_SerializeAsMHTML_Params& params, + std::set<std::string>* digests_of_uris_of_serialized_resources) + : params_(params), + digests_of_uris_of_serialized_resources_( + digests_of_uris_of_serialized_resources) { + DCHECK(digests_of_uris_of_serialized_resources_); + } + + bool shouldSkipResource(const WebURL& url) override { + std::string digest = + crypto::SHA256HashString(params_.salt + GURL(url).spec()); + + // Skip if the |url| already covered by serialization of an *earlier* frame. + if (ContainsKey(params_.digests_of_uris_to_skip, digest)) + return true; + + // Let's record |url| as being serialized for the *current* frame. + auto pair = digests_of_uris_of_serialized_resources_->insert(digest); + bool insertion_took_place = pair.second; + DCHECK(insertion_took_place); // Blink should dedupe within a frame. + + return false; + } + + WebString getContentID(const WebFrame& frame) override { + int routing_id = GetRoutingIdForFrameOrProxy(const_cast<WebFrame*>(&frame)); + auto it = params_.frame_routing_id_to_content_id.find(routing_id); + DCHECK(it != params_.frame_routing_id_to_content_id.end()); + const std::string& content_id = it->second; + return WebString::fromUTF8(content_id); + } + + private: + const FrameMsg_SerializeAsMHTML_Params& params_; + std::set<std::string>* digests_of_uris_of_serialized_resources_; + + DISALLOW_COPY_AND_ASSIGN(MHTMLPartsGenerationDelegate); +}; + bool IsContentWithCertificateErrorsRelevantToUI( const blink::WebURL& url, const blink::WebCString& security_info, @@ -4766,28 +4818,18 @@ void RenderFrameImpl::OnGetSerializedHtmlWithLocalLinks( } void RenderFrameImpl::OnSerializeAsMHTML( - int job_id, - IPC::PlatformFileForTransit file_for_transit, - const std::string& std_mhtml_boundary, - const std::map<int, std::string>& frame_routing_id_to_content_id, - bool is_last_frame) { + const FrameMsg_SerializeAsMHTML_Params& params) { // Unpack IPC payload. - base::File file = IPC::PlatformFileForTransitToFile(file_for_transit); - const WebString mhtml_boundary = WebString::fromUTF8(std_mhtml_boundary); + base::File file = IPC::PlatformFileForTransitToFile(params.destination_file); + const WebString mhtml_boundary = + WebString::fromUTF8(params.mhtml_boundary_marker); DCHECK(!mhtml_boundary.isEmpty()); - std::vector<std::pair<WebFrame*, WebString>> web_frame_to_content_id; - for (const auto& it : frame_routing_id_to_content_id) { - const std::string& content_id = it.second; - WebFrame* web_frame = GetWebFrameFromRoutingIdForFrameOrProxy(it.first); - if (!web_frame) - continue; - - web_frame_to_content_id.push_back( - std::make_pair(web_frame, WebString::fromUTF8(content_id))); - } WebData data; bool success = true; + std::set<std::string> digests_of_uris_of_serialized_resources; + MHTMLPartsGenerationDelegate delegate( + params, &digests_of_uris_of_serialized_resources); // Generate MHTML header if needed. if (IsMainFrame()) { @@ -4800,8 +4842,8 @@ void RenderFrameImpl::OnSerializeAsMHTML( // Generate MHTML parts. if (success) { - data = WebPageSerializer::generateMHTMLParts( - mhtml_boundary, GetWebFrame(), false, web_frame_to_content_id); + data = WebPageSerializer::generateMHTMLParts(mhtml_boundary, GetWebFrame(), + false, &delegate); // TODO(jcivelli): write the chunks in deferred tasks to give a chance to // the message loop to process other events. if (file.WriteAtCurrentPos(data.data(), data.size()) < 0) { @@ -4810,7 +4852,7 @@ void RenderFrameImpl::OnSerializeAsMHTML( } // Generate MHTML footer if needed. - if (success && is_last_frame) { + if (success && params.is_last_frame) { data = WebPageSerializer::generateMHTMLFooter(mhtml_boundary); if (file.WriteAtCurrentPos(data.data(), data.size()) < 0) { success = false; @@ -4819,7 +4861,9 @@ void RenderFrameImpl::OnSerializeAsMHTML( // Cleanup and notify the browser process about completion. file.Close(); // Need to flush file contents before sending IPC response. - Send(new FrameHostMsg_SerializeAsMHTMLResponse(routing_id_, job_id, success)); + Send(new FrameHostMsg_SerializeAsMHTMLResponse( + routing_id_, params.job_id, success, + digests_of_uris_of_serialized_resources)); } void RenderFrameImpl::OpenURL(const GURL& url, diff --git a/content/renderer/render_frame_impl.h b/content/renderer/render_frame_impl.h index 76e2925..7efc773 100644 --- a/content/renderer/render_frame_impl.h +++ b/content/renderer/render_frame_impl.h @@ -48,6 +48,7 @@ #include "third_party/WebKit/public/web/WebPageSerializerClient.h" #include "third_party/WebKit/public/web/WebScriptExecutionCallback.h" #include "ui/gfx/range/range.h" +#include "url/gurl.h" #if defined(ENABLE_PLUGINS) #include "content/renderer/pepper/plugin_power_saver_helper.h" @@ -61,10 +62,10 @@ #include "media/mojo/interfaces/service_factory.mojom.h" #endif -class GURL; class TransportDIB; struct FrameMsg_NewFrame_WidgetParams; struct FrameMsg_PostMessage_Params; +struct FrameMsg_SerializeAsMHTML_Params; struct FrameMsg_TextTrackSettings_Params; namespace blink { @@ -790,12 +791,7 @@ class CONTENT_EXPORT RenderFrameImpl void OnGetSavableResourceLinks(); void OnGetSerializedHtmlWithLocalLinks( const std::map<GURL, base::FilePath>& url_to_local_path); - void OnSerializeAsMHTML( - int job_id, - IPC::PlatformFileForTransit file_for_transit, - const std::string& mhtml_boundary_marker, - const std::map<int, std::string>& frame_routing_id_to_content_id, - bool is_last_frame); + void OnSerializeAsMHTML(const FrameMsg_SerializeAsMHTML_Params& params); // Requests that the browser process navigates to |url|. If // |is_history_navigation_in_new_child| is true, the browser process should diff --git a/third_party/WebKit/Source/core/page/PageSerializer.cpp b/third_party/WebKit/Source/core/page/PageSerializer.cpp index d2bfd21..a3fe2fe 100644 --- a/third_party/WebKit/Source/core/page/PageSerializer.cpp +++ b/third_party/WebKit/Source/core/page/PageSerializer.cpp @@ -84,7 +84,7 @@ static bool shouldIgnoreElement(const Element& element) class SerializerMarkupAccumulator : public MarkupAccumulator { STACK_ALLOCATED(); public: - SerializerMarkupAccumulator(PageSerializer*, const Document&, WillBeHeapVector<RawPtrWillBeMember<Node>>&); + SerializerMarkupAccumulator(PageSerializer::Delegate&, const Document&, WillBeHeapVector<RawPtrWillBeMember<Node>>&); ~SerializerMarkupAccumulator() override; protected: @@ -103,7 +103,7 @@ private: const String& attributeName, const String& attributeValue); - PageSerializer* m_serializer; + PageSerializer::Delegate& m_delegate; RawPtrWillBeMember<const Document> m_document; // FIXME: |PageSerializer| uses |m_nodes| for collecting nodes in document @@ -116,9 +116,9 @@ private: WillBeHeapHashSet<RawPtrWillBeMember<const Element>> m_elementsWithRewrittenLinks; }; -SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node>>& nodes) +SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer::Delegate& delegate, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node>>& nodes) : MarkupAccumulator(ResolveAllURLs) - , m_serializer(serializer) + , m_delegate(delegate) , m_document(&document) , m_nodes(nodes) { @@ -137,11 +137,7 @@ void SerializerMarkupAccumulator::appendText(StringBuilder& result, Text& text) bool SerializerMarkupAccumulator::shouldIgnoreAttribute(const Attribute& attribute) { - PageSerializer::Delegate* delegate = m_serializer->delegate(); - if (delegate) - return delegate->shouldIgnoreAttribute(attribute); - - return MarkupAccumulator::shouldIgnoreAttribute(attribute); + return m_delegate.shouldIgnoreAttribute(attribute); } void SerializerMarkupAccumulator::appendElement(StringBuilder& result, Element& element, Namespaces* namespaces) @@ -176,9 +172,8 @@ void SerializerMarkupAccumulator::appendAttribute( && attribute.name() == HTMLNames::srcdocAttr; if (isLinkAttribute || isSrcDocAttribute) { // Check if the delegate wants to do link rewriting for the element. - PageSerializer::Delegate* delegate = m_serializer->delegate(); String newLinkForTheElement; - if (delegate && delegate->rewriteLink(element, newLinkForTheElement)) { + if (m_delegate.rewriteLink(element, newLinkForTheElement)) { if (isLinkAttribute) { // Rewrite element links. appendRewrittenAttribute( @@ -247,7 +242,7 @@ void SerializerMarkupAccumulator::appendRewrittenAttribute( PageSerializer::PageSerializer( Vector<SerializedResource>& resources, - Delegate* delegate) + Delegate& delegate) : m_resources(&resources) , m_delegate(delegate) { @@ -267,7 +262,7 @@ void PageSerializer::serializeFrame(const LocalFrame& frame) } WillBeHeapVector<RawPtrWillBeMember<Node>> serializedNodes; - SerializerMarkupAccumulator accumulator(this, document, serializedNodes); + SerializerMarkupAccumulator accumulator(m_delegate, document, serializedNodes); String text = serializeNodes<EditingStrategy>(accumulator, document, IncludeNode); CString frameHTML = document.encoding().encode(text, WTF::EntitiesForUnencodables); @@ -302,7 +297,6 @@ void PageSerializer::serializeFrame(const LocalFrame& frame) if (CSSStyleSheet* sheet = linkElement.sheet()) { KURL url = document.completeURL(linkElement.getAttribute(HTMLNames::hrefAttr)); serializeCSSStyleSheet(*sheet, url); - ASSERT(m_resourceURLs.contains(url)); } } else if (isHTMLStyleElement(element)) { HTMLStyleElement& styleElement = toHTMLStyleElement(element); @@ -332,7 +326,7 @@ void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KUR serializeCSSRule(rule); } - if (url.isValid() && !m_resourceURLs.contains(url)) { + if (shouldAddURL(url)) { WTF::TextEncoding textEncoding(styleSheet.contents()->charset()); ASSERT(textEncoding.isValid()); String textString = cssText.toString(); @@ -392,7 +386,8 @@ void PageSerializer::serializeCSSRule(CSSRule* rule) bool PageSerializer::shouldAddURL(const KURL& url) { - return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData(); + return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData() + && !m_delegate.shouldSkipResource(url); } void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer> data, const KURL& url) @@ -469,11 +464,6 @@ void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document& } } -PageSerializer::Delegate* PageSerializer::delegate() -{ - return m_delegate; -} - // Returns MOTW (Mark of the Web) declaration before html tag which is in // HTML comment, e.g. "<!-- saved from url=(%04d)%s -->" // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. diff --git a/third_party/WebKit/Source/core/page/PageSerializer.h b/third_party/WebKit/Source/core/page/PageSerializer.h index bd99ede..0de4675 100644 --- a/third_party/WebKit/Source/core/page/PageSerializer.h +++ b/third_party/WebKit/Source/core/page/PageSerializer.h @@ -68,7 +68,11 @@ class CORE_EXPORT PageSerializer final { public: class Delegate { public: - virtual bool shouldIgnoreAttribute(const Attribute&) = 0; + // Controls whether HTML serialization should skip the given attribute. + virtual bool shouldIgnoreAttribute(const Attribute&) + { + return false; + } // Method allowing the Delegate control which URLs are written into the // generated html document. @@ -79,14 +83,24 @@ public: // (i.e. in place of img.src or iframe.src or object.data). // // If no link rewriting is desired, this method should return false. - virtual bool rewriteLink(const Element&, String& rewrittenLink) = 0; + virtual bool rewriteLink(const Element&, String& rewrittenLink) + { + return false; + } + + // Tells whether to skip serialization of a subresource with a given URI. + // Used to deduplicate resources across multiple frames. + virtual bool shouldSkipResource(const KURL&) + { + return false; + } }; // Constructs a serializer that will write output to the given vector of - // SerializedResources and use the optional Delegate for controlling some - // serialization aspects. Callers need to ensure that the Delegate stays + // SerializedResources and uses the Delegate for controlling some + // serialization aspects. Callers need to ensure that both arguments stay // alive until the PageSerializer gets destroyed. - PageSerializer(Vector<SerializedResource>&, Delegate*); + PageSerializer(Vector<SerializedResource>&, Delegate&); // Initiates the serialization of the frame. All serialized content and // retrieved resources are added to the Vector passed to the constructor. @@ -94,8 +108,6 @@ public: // Subsequent resources are images, css, etc. void serializeFrame(const LocalFrame&); - Delegate* delegate(); - static String markOfTheWebDeclaration(const KURL&); private: @@ -118,7 +130,7 @@ private: Vector<SerializedResource>* m_resources; ListHashSet<KURL> m_resourceURLs; - Delegate* m_delegate; + Delegate& m_delegate; }; } // namespace blink diff --git a/third_party/WebKit/Source/web/WebPageSerializer.cpp b/third_party/WebKit/Source/web/WebPageSerializer.cpp index afb89bc..bb9d48b 100644 --- a/third_party/WebKit/Source/web/WebPageSerializer.cpp +++ b/third_party/WebKit/Source/web/WebPageSerializer.cpp @@ -69,28 +69,21 @@ namespace blink { namespace { -using ContentIDMap = WillBeHeapHashMap<RawPtrWillBeMember<Frame>, String>; - -class MHTMLPageSerializerDelegate final : - public NoBaseWillBeGarbageCollected<MHTMLPageSerializerDelegate>, - public PageSerializer::Delegate { +class MHTMLPageSerializerDelegate final : public PageSerializer::Delegate { WTF_MAKE_NONCOPYABLE(MHTMLPageSerializerDelegate); public: - MHTMLPageSerializerDelegate(const ContentIDMap& frameToContentID); + explicit MHTMLPageSerializerDelegate(WebPageSerializer::MHTMLPartsGenerationDelegate&); bool shouldIgnoreAttribute(const Attribute&) override; bool rewriteLink(const Element&, String& rewrittenLink) override; - -#if ENABLE(OILPAN) - void trace(Visitor* visitor) { visitor->trace(m_frameToContentID); } -#endif + bool shouldSkipResource(const KURL&) override; private: - const ContentIDMap& m_frameToContentID; + WebPageSerializer::MHTMLPartsGenerationDelegate& m_webDelegate; }; MHTMLPageSerializerDelegate::MHTMLPageSerializerDelegate( - const ContentIDMap& frameToContentID) - : m_frameToContentID(frameToContentID) + WebPageSerializer::MHTMLPartsGenerationDelegate& webDelegate) + : m_webDelegate(webDelegate) { } @@ -114,7 +107,8 @@ bool MHTMLPageSerializerDelegate::rewriteLink( if (!frame) return false; - KURL cidURI = MHTMLParser::convertContentIDToURI(m_frameToContentID.get(frame)); + WebString contentID = m_webDelegate.getContentID(*WebFrame::fromFrame(frame)); + KURL cidURI = MHTMLParser::convertContentIDToURI(contentID); ASSERT(cidURI.isValid()); if (isHTMLFrameElementBase(&element)) { @@ -135,20 +129,9 @@ bool MHTMLPageSerializerDelegate::rewriteLink( return false; } -ContentIDMap createFrameToContentIDMap( - const WebVector<std::pair<WebFrame*, WebString>>& webFrameToContentID) +bool MHTMLPageSerializerDelegate::shouldSkipResource(const KURL& url) { - ContentIDMap result; - for (const auto& it : webFrameToContentID) { - WebFrame* webFrame = it.first; - const WebString& webContentID = it.second; - - Frame* frame = webFrame->toImplBase()->frame(); - String contentID(webContentID); - - result.add(frame, contentID); - } - return result; + return m_webDelegate.shouldSkipResource(url); } } // namespace @@ -167,28 +150,34 @@ WebData WebPageSerializer::generateMHTMLHeader( WebData WebPageSerializer::generateMHTMLParts( const WebString& boundary, WebLocalFrame* webFrame, bool useBinaryEncoding, - const WebVector<std::pair<WebFrame*, WebString>>& webFrameToContentID) + MHTMLPartsGenerationDelegate* webDelegate) { + ASSERT(webFrame); + ASSERT(webDelegate); + // Translate arguments from public to internal blink APIs. LocalFrame* frame = toWebLocalFrameImpl(webFrame)->frame(); MHTMLArchive::EncodingPolicy encodingPolicy = useBinaryEncoding ? MHTMLArchive::EncodingPolicy::UseBinaryEncoding : MHTMLArchive::EncodingPolicy::UseDefaultEncoding; - ContentIDMap frameToContentID = createFrameToContentIDMap(webFrameToContentID); // Serialize. Vector<SerializedResource> resources; - MHTMLPageSerializerDelegate delegate(frameToContentID); - PageSerializer serializer(resources, &delegate); + MHTMLPageSerializerDelegate coreDelegate(*webDelegate); + PageSerializer serializer(resources, coreDelegate); serializer.serializeFrame(*frame); + // Get Content-ID for the frame being serialized. + String frameContentID = webDelegate->getContentID(*webFrame); + ASSERT(!frameContentID.isEmpty()); + // Encode serializer's output as MHTML. RefPtr<SharedBuffer> output = SharedBuffer::create(); bool isFirstResource = true; for (const SerializedResource& resource : resources) { // Frame is the 1st resource (see PageSerializer::serializeFrame doc - // comment). Frames need a Content-ID header. - String contentID = isFirstResource ? frameToContentID.get(frame) : String(); + // comment). Frames get a Content-ID header. + String contentID = isFirstResource ? frameContentID : String(); MHTMLArchive::generateMHTMLPart( boundary, contentID, encodingPolicy, resource, *output); diff --git a/third_party/WebKit/Source/web/tests/PageSerializerTest.cpp b/third_party/WebKit/Source/web/tests/PageSerializerTest.cpp index a52501c..f333ef5 100644 --- a/third_party/WebKit/Source/web/tests/PageSerializerTest.cpp +++ b/third_party/WebKit/Source/web/tests/PageSerializerTest.cpp @@ -119,7 +119,7 @@ protected: void serialize(const char* url) { FrameTestHelpers::loadFrame(m_helper.webView()->mainFrame(), KURL(m_baseUrl, url).string().utf8().data()); - PageSerializer serializer(m_resources, this); + PageSerializer serializer(m_resources, *this); Frame* frame = m_helper.webViewImpl()->mainFrameImpl()->frame(); for (; frame; frame = frame->tree().traverseNext()) { // This is safe, because tests do not do cross-site navigation @@ -168,12 +168,6 @@ private: } // PageSerializer::Delegate implementation. - bool shouldIgnoreAttribute(const Attribute&) override - { - return false; - } - - // PageSerializer::Delegate implementation. bool rewriteLink(const Element& element, String& rewrittenLink) { String completeURL; diff --git a/third_party/WebKit/public/web/WebPageSerializer.h b/third_party/WebKit/public/web/WebPageSerializer.h index 7a59b6a..44d5f94 100644 --- a/third_party/WebKit/public/web/WebPageSerializer.h +++ b/third_party/WebKit/public/web/WebPageSerializer.h @@ -63,21 +63,29 @@ public: BLINK_EXPORT static WebData generateMHTMLHeader( const WebString& boundary, WebLocalFrame*); - // Generates and returns MHTML parts for the given frame and all the + // Delegate for controling the behavior of generateMHTMLParts method. + class MHTMLPartsGenerationDelegate { + public: + // Tells whether to skip serialization of a subresource with a given URI. + // Used to deduplicate resources across multiple frames. + virtual bool shouldSkipResource(const WebURL&) = 0; + + // Returns a Content-ID to be used for the given frame. + // See rfc2557 - section 8.3 - "Use of the Content-ID header and CID URLs". + // Format note - the returned string should be of the form "<foo@bar.com>" + // (i.e. the strings should include the angle brackets). + virtual WebString getContentID(const WebFrame&) = 0; + }; + + // Generates and returns MHTML parts for the given frame and the // savable resources underneath. // // Same |boundary| needs to used for all generateMHTMLHeader and // generateMHTMLParts and generateMHTMLFooter calls that belong to the same // MHTML document (see also rfc1341, section 7.2.1, "boundary" description). - // - // |frameToContentID| is used for 1) emitting cid: scheme uri links for - // subframes and 2) emitting MIME Content-ID headers. - // See rfc2557 - section 8.3 - "Use of the Content-ID header and CID URLs". - // Format note - |frameToContentID| should contain strings of the form - // "<foo@bar.com>" (i.e. the strings should include the angle brackets). BLINK_EXPORT static WebData generateMHTMLParts( const WebString& boundary, WebLocalFrame*, bool useBinaryEncoding, - const WebVector<std::pair<WebFrame*, WebString>>& frameToContentID); + MHTMLPartsGenerationDelegate*); // Generates and returns an MHTML footer. // @@ -92,7 +100,7 @@ public: // This function will serialize the specified frame to HTML data. // We have a data buffer to temporary saving generated html data. We will - // sequentially call WebPageSeriazlierClient once the data buffer is full. + // sequentially call WebPageSerializerClient once the data buffer is full. // // Return false means if no data has been serialized (i.e. because // the target frame didn't have a valid url). |