// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "pdf/document_loader.h" #include "base/logging.h" #include "base/strings/string_util.h" #include "net/http/http_util.h" #include "ppapi/c/pp_errors.h" #include "ppapi/cpp/url_loader.h" #include "ppapi/cpp/url_request_info.h" #include "ppapi/cpp/url_response_info.h" namespace chrome_pdf { namespace { // Document below size will be downloaded in one chunk. const uint32_t kMinFileSize = 64 * 1024; // If the headers have a byte-range response, writes the start and end // positions and returns true if at least the start position was parsed. // The end position will be set to 0 if it was not found or parsed from the // response. // Returns false if not even a start position could be parsed. bool GetByteRange(const std::string& headers, uint32_t* start, uint32_t* end) { net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n"); while (it.GetNext()) { if (base::LowerCaseEqualsASCII(it.name(), "content-range")) { std::string range = it.values().c_str(); if (base::StartsWith(range, "bytes", base::CompareCase::INSENSITIVE_ASCII)) { range = range.substr(strlen("bytes")); std::string::size_type pos = range.find('-'); std::string range_end; if (pos != std::string::npos) range_end = range.substr(pos + 1); base::TrimWhitespaceASCII(range, base::TRIM_LEADING, &range); base::TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end); *start = atoi(range.c_str()); *end = atoi(range_end.c_str()); return true; } } } return false; } // If the headers have a multi-part response, returns the boundary name. // Otherwise returns an empty string. std::string GetMultiPartBoundary(const std::string& headers) { net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n"); while (it.GetNext()) { if (base::LowerCaseEqualsASCII(it.name(), "content-type")) { std::string type = base::ToLowerASCII(it.values()); if (base::StartsWith(type, "multipart/", base::CompareCase::SENSITIVE)) { const char* boundary = strstr(type.c_str(), "boundary="); if (!boundary) { NOTREACHED(); break; } return std::string(boundary + 9); } } } return std::string(); } bool IsValidContentType(const std::string& type) { return (base::EndsWith(type, "/pdf", base::CompareCase::INSENSITIVE_ASCII) || base::EndsWith(type, ".pdf", base::CompareCase::INSENSITIVE_ASCII) || base::EndsWith(type, "/x-pdf", base::CompareCase::INSENSITIVE_ASCII) || base::EndsWith(type, "/*", base::CompareCase::INSENSITIVE_ASCII) || base::EndsWith(type, "/acrobat", base::CompareCase::INSENSITIVE_ASCII) || base::EndsWith(type, "/unknown", base::CompareCase::INSENSITIVE_ASCII)); } } // namespace DocumentLoader::Client::~Client() { } DocumentLoader::DocumentLoader(Client* client) : client_(client), partial_document_(false), request_pending_(false), current_pos_(0), current_chunk_size_(0), current_chunk_read_(0), document_size_(0), header_request_(true), is_multipart_(false) { loader_factory_.Initialize(this); } DocumentLoader::~DocumentLoader() { } bool DocumentLoader::Init(const pp::URLLoader& loader, const std::string& url, const std::string& headers) { DCHECK(url_.empty()); url_ = url; loader_ = loader; std::string response_headers; if (!headers.empty()) { response_headers = headers; } else { pp::URLResponseInfo response = loader_.GetResponseInfo(); pp::Var headers_var = response.GetHeaders(); if (headers_var.is_string()) { response_headers = headers_var.AsString(); } } bool accept_ranges_bytes = false; bool content_encoded = false; uint32_t content_length = 0; std::string type; std::string disposition; // This happens for PDFs not loaded from http(s) sources. if (response_headers == "Content-Type: text/plain") { if (!base::StartsWith(url, "http://", base::CompareCase::INSENSITIVE_ASCII) && !base::StartsWith(url, "https://", base::CompareCase::INSENSITIVE_ASCII)) { type = "application/pdf"; } } if (type.empty() && !response_headers.empty()) { net::HttpUtil::HeadersIterator it(response_headers.begin(), response_headers.end(), "\n"); while (it.GetNext()) { if (base::LowerCaseEqualsASCII(it.name(), "content-length")) { content_length = atoi(it.values().c_str()); } else if (base::LowerCaseEqualsASCII(it.name(), "accept-ranges")) { accept_ranges_bytes = base::LowerCaseEqualsASCII(it.values(), "bytes"); } else if (base::LowerCaseEqualsASCII(it.name(), "content-encoding")) { content_encoded = true; } else if (base::LowerCaseEqualsASCII(it.name(), "content-type")) { type = it.values(); size_t semi_colon_pos = type.find(';'); if (semi_colon_pos != std::string::npos) { type = type.substr(0, semi_colon_pos); } TrimWhitespaceASCII(type, base::TRIM_ALL, &type); } else if (base::LowerCaseEqualsASCII(it.name(), "content-disposition")) { disposition = it.values(); } } } if (!type.empty() && !IsValidContentType(type)) return false; if (base::StartsWith(disposition, "attachment", base::CompareCase::INSENSITIVE_ASCII)) return false; if (content_length > 0) chunk_stream_.Preallocate(content_length); document_size_ = content_length; requests_count_ = 0; // Enable partial loading only if file size is above the threshold. // It will allow avoiding latency for multiple requests. if (content_length > kMinFileSize && accept_ranges_bytes && !content_encoded) { LoadPartialDocument(); } else { LoadFullDocument(); } return true; } void DocumentLoader::LoadPartialDocument() { partial_document_ = true; // Force the main request to be cancelled, since if we're a full-frame plugin // there could be other references to the loader. loader_.Close(); loader_ = pp::URLLoader(); // Download file header. header_request_ = true; RequestData(0, std::min(GetRequestSize(), document_size_)); } void DocumentLoader::LoadFullDocument() { partial_document_ = false; chunk_buffer_.clear(); ReadMore(); } bool DocumentLoader::IsDocumentComplete() const { if (document_size_ == 0) // Document size unknown. return false; return IsDataAvailable(0, document_size_); } uint32_t DocumentLoader::GetAvailableData() const { if (document_size_ == 0) { // If document size is unknown. return current_pos_; } std::vector > ranges; chunk_stream_.GetMissedRanges(0, document_size_, &ranges); uint32_t available = document_size_; for (const auto& range : ranges) available -= range.second; return available; } void DocumentLoader::ClearPendingRequests() { // The first item in the queue is pending (need to keep it in the queue). if (pending_requests_.size() > 1) { // Remove all elements except the first one. pending_requests_.erase(++pending_requests_.begin(), pending_requests_.end()); } } bool DocumentLoader::GetBlock(uint32_t position, uint32_t size, void* buf) const { return chunk_stream_.ReadData(position, size, buf); } bool DocumentLoader::IsDataAvailable(uint32_t position, uint32_t size) const { return chunk_stream_.IsRangeAvailable(position, size); } void DocumentLoader::RequestData(uint32_t position, uint32_t size) { DCHECK(partial_document_); // We have some artefact request from // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after // document is complete. // We need this fix in PDFIum. Adding this as a work around. // Bug: http://code.google.com/p/chromium/issues/detail?id=79996 // Test url: // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf if (IsDocumentComplete()) return; pending_requests_.push_back(std::pair(position, size)); DownloadPendingRequests(); } void DocumentLoader::DownloadPendingRequests() { if (request_pending_ || pending_requests_.empty()) return; // Remove already completed requests. // By design DownloadPendingRequests() should have at least 1 request in the // queue. ReadComplete() will remove the last pending comment from the queue. while (pending_requests_.size() > 1) { if (IsDataAvailable(pending_requests_.front().first, pending_requests_.front().second)) { pending_requests_.pop_front(); } else { break; } } uint32_t pos = pending_requests_.front().first; uint32_t size = pending_requests_.front().second; if (IsDataAvailable(pos, size)) { ReadComplete(); return; } // If current request has been partially downloaded already, split it into // a few smaller requests. std::vector > ranges; chunk_stream_.GetMissedRanges(pos, size, &ranges); if (!ranges.empty()) { pending_requests_.pop_front(); pending_requests_.insert(pending_requests_.begin(), ranges.begin(), ranges.end()); pos = pending_requests_.front().first; size = pending_requests_.front().second; } uint32_t cur_request_size = GetRequestSize(); // If size is less than default request, try to expand download range for // more optimal download. if (size < cur_request_size && partial_document_) { // First, try to expand block towards the end of the file. uint32_t new_pos = pos; uint32_t new_size = cur_request_size; if (pos + new_size > document_size_) new_size = document_size_ - pos; std::vector > ranges; if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) { new_pos = ranges[0].first; new_size = ranges[0].second; } // Second, try to expand block towards the beginning of the file. if (new_size < cur_request_size) { uint32_t block_end = new_pos + new_size; if (block_end > cur_request_size) { new_pos = block_end - cur_request_size; } else { new_pos = 0; } new_size = block_end - new_pos; if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) { new_pos = ranges.back().first; new_size = ranges.back().second; } } pos = new_pos; size = new_size; } size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos); size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1); if (pos - last_byte_before < cur_request_size) { size = pos + size - last_byte_before; pos = last_byte_before; } if ((pos + size < first_byte_after) && (pos + size + cur_request_size >= first_byte_after)) size = first_byte_after - pos; request_pending_ = true; // Start downloading first pending request. loader_.Close(); loader_ = client_->CreateURLLoader(); pp::CompletionCallback callback = loader_factory_.NewCallback(&DocumentLoader::DidOpen); pp::URLRequestInfo request = GetRequest(pos, size); requests_count_++; int rv = loader_.Open(request, callback); if (rv != PP_OK_COMPLETIONPENDING) callback.Run(rv); } pp::URLRequestInfo DocumentLoader::GetRequest(uint32_t position, uint32_t size) const { pp::URLRequestInfo request(client_->GetPluginInstance()); request.SetURL(url_); request.SetMethod("GET"); request.SetFollowRedirects(true); request.SetCustomReferrerURL(url_); const size_t kBufSize = 100; char buf[kBufSize]; // According to rfc2616, byte range specifies position of the first and last // bytes in the requested range inclusively. Therefore we should subtract 1 // from the position + size, to get index of the last byte that needs to be // downloaded. base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position, position + size - 1); pp::Var header(buf); request.SetHeaders(header); return request; } void DocumentLoader::DidOpen(int32_t result) { if (result != PP_OK) { NOTREACHED(); return; } int32_t http_code = loader_.GetResponseInfo().GetStatusCode(); if (http_code >= 400 && http_code < 500) { // Error accessing resource. 4xx error indicate subsequent requests // will fail too. // E.g. resource has been removed from the server while loading it. // https://code.google.com/p/chromium/issues/detail?id=414827 return; } is_multipart_ = false; current_chunk_size_ = 0; current_chunk_read_ = 0; pp::Var headers_var = loader_.GetResponseInfo().GetHeaders(); std::string headers; if (headers_var.is_string()) headers = headers_var.AsString(); std::string boundary = GetMultiPartBoundary(headers); if (!boundary.empty()) { // Leave position untouched for now, when we read the data we'll get it. is_multipart_ = true; multipart_boundary_ = boundary; } else { // Need to make sure that the server returned a byte-range, since it's // possible for a server to just ignore our bye-range request and just // return the entire document even if it supports byte-range requests. // i.e. sniff response to // http://www.act.org/compass/sample/pdf/geometry.pdf current_pos_ = 0; uint32_t start_pos, end_pos; if (GetByteRange(headers, &start_pos, &end_pos)) { current_pos_ = start_pos; if (end_pos && end_pos > start_pos) current_chunk_size_ = end_pos - start_pos + 1; } } ReadMore(); } void DocumentLoader::ReadMore() { pp::CompletionCallback callback = loader_factory_.NewCallback(&DocumentLoader::DidRead); int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback); if (rv != PP_OK_COMPLETIONPENDING) callback.Run(rv); } void DocumentLoader::DidRead(int32_t result) { if (result > 0) { char* start = buffer_; size_t length = result; if (is_multipart_ && result > 2) { for (int i = 2; i < result; ++i) { if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') || (i >= 4 && buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' && buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) { uint32_t start_pos, end_pos; if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) { current_pos_ = start_pos; start += i; length -= i; if (end_pos && end_pos > start_pos) current_chunk_size_ = end_pos - start_pos + 1; } break; } } // Reset this flag so we don't look inside the buffer in future calls of // DidRead for this response. Note that this code DOES NOT handle multi- // part responses with more than one part (we don't issue them at the // moment, so they shouldn't arrive). is_multipart_ = false; } if (current_chunk_size_ && current_chunk_read_ + length > current_chunk_size_) length = current_chunk_size_ - current_chunk_read_; if (length) { if (document_size_ > 0) { chunk_stream_.WriteData(current_pos_, start, length); } else { // If we did not get content-length in the response, we can't // preallocate buffer for the entire document. Resizing array causing // memory fragmentation issues on the large files and OOM exceptions. // To fix this, we collect all chunks of the file to the list and // concatenate them together after request is complete. chunk_buffer_.push_back(std::vector()); chunk_buffer_.back().resize(length); memcpy(&(chunk_buffer_.back()[0]), start, length); } current_pos_ += length; current_chunk_read_ += length; client_->OnNewDataAvailable(); } ReadMore(); } else if (result == PP_OK) { ReadComplete(); } else { NOTREACHED(); } } void DocumentLoader::ReadComplete() { if (!partial_document_) { if (document_size_ == 0) { // For the document with no 'content-length" specified we've collected all // the chunks already. Let's allocate final document buffer and copy them // over. chunk_stream_.Preallocate(current_pos_); uint32_t pos = 0; for (auto& chunk : chunk_buffer_) { chunk_stream_.WriteData(pos, &(chunk[0]), chunk.size()); pos += chunk.size(); } chunk_buffer_.clear(); } document_size_ = current_pos_; client_->OnDocumentComplete(); return; } request_pending_ = false; pending_requests_.pop_front(); // If there are more pending request - continue downloading. if (!pending_requests_.empty()) { DownloadPendingRequests(); return; } if (IsDocumentComplete()) { client_->OnDocumentComplete(); return; } if (header_request_) client_->OnPartialDocumentLoaded(); else client_->OnPendingRequestComplete(); header_request_ = false; // The OnPendingRequestComplete could have added more requests. if (!pending_requests_.empty()) { DownloadPendingRequests(); } else { // Document is not complete and we have no outstanding requests. // Let's keep downloading PDF file in small chunks. uint32_t pos = chunk_stream_.GetFirstMissingByte(); std::vector > ranges; chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges); DCHECK(!ranges.empty()); RequestData(ranges[0].first, ranges[0].second); } } uint32_t DocumentLoader::GetRequestSize() const { // Document loading strategy: // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we // double the size (64k), and so on, until we cap max request size at 2M for // 71 or more requests. uint32_t limited_count = std::min(std::max(requests_count_, 10u), 70u); return 32 * 1024 * (1 << ((limited_count - 1) / 10u)); } } // namespace chrome_pdf