// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "content/browser/download/mhtml_generation_manager.h" #include #include #include #include "base/bind.h" #include "base/files/file.h" #include "base/guid.h" #include "base/macros.h" #include "base/scoped_observer.h" #include "base/stl_util.h" #include "base/strings/stringprintf.h" #include "content/browser/bad_message.h" #include "content/browser/frame_host/frame_tree_node.h" #include "content/browser/frame_host/render_frame_host_impl.h" #include "content/common/frame_messages.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/render_frame_host.h" #include "content/public/browser/render_process_host.h" #include "content/public/browser/render_process_host_observer.h" #include "content/public/browser/web_contents.h" #include "net/base/mime_util.h" namespace content { // The class and all of its members live on the UI thread. Only static methods // are executed on other threads. class MHTMLGenerationManager::Job : public RenderProcessHostObserver { public: Job(int job_id, WebContents* web_contents, GenerateMHTMLCallback callback); ~Job() override; int id() const { return job_id_; } void set_browser_file(base::File file) { browser_file_ = std::move(file); } GenerateMHTMLCallback callback() const { return callback_; } // Handler for FrameHostMsg_SerializeAsMHTMLResponse (a notification from the // renderer that the MHTML generation for previous frame has finished). // Returns |true| upon success; |false| otherwise. bool OnSerializeAsMHTMLResponse( RenderFrameHostImpl* sender, const std::set& digests_of_uris_of_serialized_resources); // Sends IPC to the renderer, asking for MHTML generation of the next frame. // // Returns true if the message was sent successfully; false otherwise. bool SendToNextRenderFrame(); // Indicates if more calls to SendToNextRenderFrame are needed. bool IsDone() const { bool waiting_for_response_from_renderer = frame_tree_node_id_of_busy_frame_ != FrameTreeNode::kFrameTreeNodeInvalidId; bool no_more_requests_to_send = pending_frame_tree_node_ids_.empty(); return !waiting_for_response_from_renderer && no_more_requests_to_send; } // Close the file on the file thread and respond back on the UI thread with // file size. void CloseFile(base::Callback callback); // RenderProcessHostObserver: void RenderProcessExited(RenderProcessHost* host, base::TerminationStatus status, int exit_code) override; void RenderProcessHostDestroyed(RenderProcessHost* host) override; private: static int64_t CloseFileOnFileThread(base::File file); void AddFrame(RenderFrameHost* render_frame_host); // Creates a new map with values (content ids) the same as in // |frame_tree_node_to_content_id_| map, but with the keys translated from // frame_tree_node_id into a |site_instance|-specific routing_id. std::map CreateFrameRoutingIdToContentId( SiteInstance* site_instance); // Id used to map renderer responses to jobs. // See also MHTMLGenerationManager::id_to_job_ map. int job_id_; // The IDs of frames that still need to be processed. std::queue pending_frame_tree_node_ids_; // Identifies a frame to which we've sent FrameMsg_SerializeAsMHTML but for // which we didn't yet process FrameHostMsg_SerializeAsMHTMLResponse via // OnSerializeAsMHTMLResponse. int frame_tree_node_id_of_busy_frame_; // The handle to the file the MHTML is saved to for the browser process. base::File browser_file_; // Map from frames to content ids (see WebFrameSerializer::generateMHTMLParts // for more details about what "content ids" are and how they are used). std::map frame_tree_node_to_content_id_; // MIME multipart boundary to use in the MHTML doc. std::string mhtml_boundary_marker_; // Digests of URIs of already generated MHTML parts. std::set digests_of_already_serialized_uris_; std::string salt_; // The callback to call once generation is complete. GenerateMHTMLCallback callback_; // RAII helper for registering this Job as a RenderProcessHost observer. ScopedObserver observed_renderer_process_host_; DISALLOW_COPY_AND_ASSIGN(Job); }; MHTMLGenerationManager::Job::Job(int job_id, WebContents* web_contents, GenerateMHTMLCallback callback) : job_id_(job_id), frame_tree_node_id_of_busy_frame_(FrameTreeNode::kFrameTreeNodeInvalidId), mhtml_boundary_marker_(net::GenerateMimeMultipartBoundary()), salt_(base::GenerateGUID()), callback_(callback), observed_renderer_process_host_(this) { DCHECK_CURRENTLY_ON(BrowserThread::UI); web_contents->ForEachFrame(base::Bind( &MHTMLGenerationManager::Job::AddFrame, base::Unretained(this))); // Safe because ForEachFrame is synchronous. // Main frame needs to be processed first. DCHECK(!pending_frame_tree_node_ids_.empty()); DCHECK(FrameTreeNode::GloballyFindByID(pending_frame_tree_node_ids_.front()) ->parent() == nullptr); } MHTMLGenerationManager::Job::~Job() { DCHECK_CURRENTLY_ON(BrowserThread::UI); } std::map MHTMLGenerationManager::Job::CreateFrameRoutingIdToContentId( SiteInstance* site_instance) { std::map result; for (const auto& it : frame_tree_node_to_content_id_) { int ftn_id = it.first; const std::string& content_id = it.second; FrameTreeNode* ftn = FrameTreeNode::GloballyFindByID(ftn_id); if (!ftn) continue; int routing_id = ftn->render_manager()->GetRoutingIdForSiteInstance(site_instance); if (routing_id == MSG_ROUTING_NONE) continue; result[routing_id] = content_id; } return result; } bool MHTMLGenerationManager::Job::SendToNextRenderFrame() { DCHECK(browser_file_.IsValid()); DCHECK_LT(0u, pending_frame_tree_node_ids_.size()); FrameMsg_SerializeAsMHTML_Params ipc_params; ipc_params.job_id = job_id_; ipc_params.mhtml_boundary_marker = mhtml_boundary_marker_; int frame_tree_node_id = pending_frame_tree_node_ids_.front(); pending_frame_tree_node_ids_.pop(); ipc_params.is_last_frame = pending_frame_tree_node_ids_.empty(); FrameTreeNode* ftn = FrameTreeNode::GloballyFindByID(frame_tree_node_id); if (!ftn) // The contents went away. return false; RenderFrameHost* rfh = ftn->current_frame_host(); // Get notified if the target of the IPC message dies between responding. observed_renderer_process_host_.RemoveAll(); observed_renderer_process_host_.Add(rfh->GetProcess()); // Tell the renderer to skip (= deduplicate) already covered MHTML parts. ipc_params.salt = salt_; ipc_params.digests_of_uris_to_skip = digests_of_already_serialized_uris_; ipc_params.destination_file = IPC::GetFileHandleForProcess( browser_file_.GetPlatformFile(), rfh->GetProcess()->GetHandle(), false); // |close_source_handle|. ipc_params.frame_routing_id_to_content_id = CreateFrameRoutingIdToContentId(rfh->GetSiteInstance()); // Send the IPC asking the renderer to serialize the frame. DCHECK_EQ(FrameTreeNode::kFrameTreeNodeInvalidId, frame_tree_node_id_of_busy_frame_); frame_tree_node_id_of_busy_frame_ = frame_tree_node_id; rfh->Send(new FrameMsg_SerializeAsMHTML(rfh->GetRoutingID(), ipc_params)); return true; } void MHTMLGenerationManager::Job::RenderProcessExited( RenderProcessHost* host, base::TerminationStatus status, int exit_code) { DCHECK_CURRENTLY_ON(BrowserThread::UI); MHTMLGenerationManager::GetInstance()->RenderProcessExited(this); } void MHTMLGenerationManager::Job::AddFrame(RenderFrameHost* render_frame_host) { auto* rfhi = static_cast(render_frame_host); int frame_tree_node_id = rfhi->frame_tree_node()->frame_tree_node_id(); pending_frame_tree_node_ids_.push(frame_tree_node_id); std::string guid = base::GenerateGUID(); std::string content_id = base::StringPrintf("", frame_tree_node_id, guid.c_str()); frame_tree_node_to_content_id_[frame_tree_node_id] = content_id; } void MHTMLGenerationManager::Job::RenderProcessHostDestroyed( RenderProcessHost* host) { DCHECK_CURRENTLY_ON(BrowserThread::UI); observed_renderer_process_host_.Remove(host); } void MHTMLGenerationManager::Job::CloseFile( base::Callback callback) { DCHECK_CURRENTLY_ON(BrowserThread::UI); if (!browser_file_.IsValid()) { callback.Run(-1); return; } BrowserThread::PostTaskAndReplyWithResult( BrowserThread::FILE, FROM_HERE, base::Bind(&MHTMLGenerationManager::Job::CloseFileOnFileThread, base::Passed(std::move(browser_file_))), callback); } bool MHTMLGenerationManager::Job::OnSerializeAsMHTMLResponse( RenderFrameHostImpl* sender, const std::set& digests_of_uris_of_serialized_resources) { // Sanitize renderer input / reject unexpected messages. int sender_id = sender->frame_tree_node()->frame_tree_node_id(); if (sender_id != frame_tree_node_id_of_busy_frame_) { ReceivedBadMessage(sender->GetProcess(), bad_message::DWNLD_INVALID_SERIALIZE_AS_MHTML_RESPONSE); return false; // Report failure. } frame_tree_node_id_of_busy_frame_ = FrameTreeNode::kFrameTreeNodeInvalidId; // Renderer should be deduping resources with the same uris. DCHECK_EQ(0u, base::STLSetIntersection>( digests_of_already_serialized_uris_, digests_of_uris_of_serialized_resources).size()); digests_of_already_serialized_uris_.insert( digests_of_uris_of_serialized_resources.begin(), digests_of_uris_of_serialized_resources.end()); if (pending_frame_tree_node_ids_.empty()) return true; // Report success. return SendToNextRenderFrame(); } // static int64_t MHTMLGenerationManager::Job::CloseFileOnFileThread(base::File file) { DCHECK_CURRENTLY_ON(BrowserThread::FILE); DCHECK(file.IsValid()); int64_t file_size = file.GetLength(); file.Close(); return file_size; } MHTMLGenerationManager* MHTMLGenerationManager::GetInstance() { return base::Singleton::get(); } MHTMLGenerationManager::MHTMLGenerationManager() : next_job_id_(0) {} MHTMLGenerationManager::~MHTMLGenerationManager() { STLDeleteValues(&id_to_job_); } void MHTMLGenerationManager::SaveMHTML(WebContents* web_contents, const base::FilePath& file_path, const GenerateMHTMLCallback& callback) { DCHECK_CURRENTLY_ON(BrowserThread::UI); int job_id = NewJob(web_contents, callback); BrowserThread::PostTaskAndReplyWithResult( BrowserThread::FILE, FROM_HERE, base::Bind(&MHTMLGenerationManager::CreateFile, file_path), base::Bind(&MHTMLGenerationManager::OnFileAvailable, base::Unretained(this), // Safe b/c |this| is a singleton. job_id)); } void MHTMLGenerationManager::OnSerializeAsMHTMLResponse( RenderFrameHostImpl* sender, int job_id, bool mhtml_generation_in_renderer_succeeded, const std::set& digests_of_uris_of_serialized_resources) { DCHECK_CURRENTLY_ON(BrowserThread::UI); Job* job = FindJob(job_id); if (!job) { ReceivedBadMessage(sender->GetProcess(), bad_message::DWNLD_INVALID_SERIALIZE_AS_MHTML_RESPONSE); return; } if (!mhtml_generation_in_renderer_succeeded) { JobFinished(job, JobStatus::FAILURE); return; } if (!job->OnSerializeAsMHTMLResponse( sender, digests_of_uris_of_serialized_resources)) { JobFinished(job, JobStatus::FAILURE); return; } if (job->IsDone()) JobFinished(job, JobStatus::SUCCESS); } // static base::File MHTMLGenerationManager::CreateFile(const base::FilePath& file_path) { DCHECK_CURRENTLY_ON(BrowserThread::FILE); // SECURITY NOTE: A file descriptor to the file created below will be passed // to multiple renderer processes which (in out-of-process iframes mode) can // act on behalf of separate web principals. Therefore it is important to // only allow writing to the file and forbid reading from the file (as this // would allow reading content generated by other renderers / other web // principals). uint32_t file_flags = base::File::FLAG_CREATE_ALWAYS | base::File::FLAG_WRITE; base::File browser_file(file_path, file_flags); if (!browser_file.IsValid()) { LOG(ERROR) << "Failed to create file to save MHTML at: " << file_path.value(); } return browser_file; } void MHTMLGenerationManager::OnFileAvailable(int job_id, base::File browser_file) { DCHECK_CURRENTLY_ON(BrowserThread::UI); Job* job = FindJob(job_id); DCHECK(job); if (!browser_file.IsValid()) { LOG(ERROR) << "Failed to create file"; JobFinished(job, JobStatus::FAILURE); return; } job->set_browser_file(std::move(browser_file)); if (!job->SendToNextRenderFrame()) { JobFinished(job, JobStatus::FAILURE); } } void MHTMLGenerationManager::JobFinished(Job* job, JobStatus job_status) { DCHECK_CURRENTLY_ON(BrowserThread::UI); DCHECK(job); job->CloseFile( base::Bind(&MHTMLGenerationManager::OnFileClosed, base::Unretained(this), // Safe b/c |this| is a singleton. job->id(), job_status)); } void MHTMLGenerationManager::OnFileClosed(int job_id, JobStatus job_status, int64_t file_size) { DCHECK_CURRENTLY_ON(BrowserThread::UI); Job* job = FindJob(job_id); DCHECK(job); job->callback().Run(job_status == JobStatus::SUCCESS ? file_size : -1); id_to_job_.erase(job_id); delete job; } int MHTMLGenerationManager::NewJob(WebContents* web_contents, const GenerateMHTMLCallback& callback) { DCHECK_CURRENTLY_ON(BrowserThread::UI); int job_id = next_job_id_++; Job* job = new Job(job_id, web_contents, callback); id_to_job_[job_id] = job; return job_id; } MHTMLGenerationManager::Job* MHTMLGenerationManager::FindJob(int job_id) { DCHECK_CURRENTLY_ON(BrowserThread::UI); IDToJobMap::iterator iter = id_to_job_.find(job_id); if (iter == id_to_job_.end()) { NOTREACHED(); return nullptr; } return iter->second; } void MHTMLGenerationManager::RenderProcessExited(Job* job) { DCHECK_CURRENTLY_ON(BrowserThread::UI); DCHECK(job); JobFinished(job, JobStatus::FAILURE); } } // namespace content