Minimize the number of range requests made by PDFium

Previously, Chrome would make a series of range requests for linearized PDFs, starting at 32 KB and slowly increasing. On high-latency connections, this is considerably slower than using a single request. Now Chrome makes range requests that are as large as possible and cancels them if the renderer needs data from a different place in the document. This significantly reduces the number of range requests performed by Chrome.

BUG=460201,78264

Review URL: https://codereview.chromium.org/1506023002

Cr-Commit-Position: refs/heads/master@{#364235}
author: spelchat <spelchat@chromium.org> 2015-12-09 16:44:15 -0800
committer: Commit bot <commit-bot@chromium.org> 2015-12-10 00:45:12 +0000
commit: 3ba2a28104c4d6feef3efd71d9be73f085886f53 (patch)
tree: 2c05f055b78bed8cf7faf7ad5af3918595e74d46 /pdf
parent: 0eabbb80369d664feb34db3ea4e3f9247f535e95 (diff)
download: chromium_src-3ba2a28104c4d6feef3efd71d9be73f085886f53.zip
chromium_src-3ba2a28104c4d6feef3efd71d9be73f085886f53.tar.gz
chromium_src-3ba2a28104c4d6feef3efd71d9be73f085886f53.tar.bz2
4 files changed, 150 insertions, 125 deletions
diff --git a/pdf/chunk_stream.cc b/pdf/chunk_stream.cc
index 7ac8f97..e580151 100644
--- a/pdf/chunk_stream.cc
+++ b/pdf/chunk_stream.cc
@@ -17,7 +17,7 @@
 
 namespace chrome_pdf {
 
-ChunkStream::ChunkStream() {
+ChunkStream::ChunkStream() : stream_size_(0) {
 }
 
 ChunkStream::~ChunkStream() {
@@ -26,10 +26,12 @@ ChunkStream::~ChunkStream() {
 void ChunkStream::Clear() {
   chunks_.clear();
   data_.clear();
+  stream_size_ = 0;
 }
 
 void ChunkStream::Preallocate(size_t stream_size) {
   data_.reserve(stream_size);
+  stream_size_ = stream_size;
 }
 
 size_t ChunkStream::GetSize() {
@@ -150,7 +152,7 @@ size_t ChunkStream::GetFirstMissingByte() const {
   return begin->first > 0 ? 0 : begin->second;
 }
 
-size_t ChunkStream::GetLastByteBefore(size_t offset) const {
+size_t ChunkStream::GetFirstMissingByteInInterval(size_t offset) const {
   if (chunks_.empty())
     return 0;
   std::map<size_t, size_t>::const_iterator it = chunks_.upper_bound(offset);
@@ -160,13 +162,13 @@ size_t ChunkStream::GetLastByteBefore(size_t offset) const {
   return it->first + it->second;
 }
 
-size_t ChunkStream::GetFirstByteAfter(size_t offset) const {
+size_t ChunkStream::GetLastMissingByteInInterval(size_t offset) const {
   if (chunks_.empty())
-    return 0;
+    return stream_size_ - 1;
   std::map<size_t, size_t>::const_iterator it = chunks_.upper_bound(offset);
   if (it == chunks_.end())
-    return data_.size();
-  return it->first;
+    return stream_size_ - 1;
+  return it->first - 1;
 }
 
 }  // namespace chrome_pdf
diff --git a/pdf/chunk_stream.h b/pdf/chunk_stream.h
index fac1ec6..048f958 100644
--- a/pdf/chunk_stream.h
+++ b/pdf/chunk_stream.h
@@ -8,6 +8,7 @@
 #include <stddef.h>
 
 #include <map>
+#include <utility>
 #include <vector>
 
 namespace chrome_pdf {
@@ -33,16 +34,20 @@ class ChunkStream {
   bool IsRangeAvailable(size_t offset, size_t size) const;
   size_t GetFirstMissingByte() const;
 
-  size_t GetLastByteBefore(size_t offset) const;
-  size_t GetFirstByteAfter(size_t offset) const;
+  // Finds the first byte of the missing byte interval that offset belongs to.
+  size_t GetFirstMissingByteInInterval(size_t offset) const;
+  // Returns the last byte of the missing byte interval that offset belongs to.
+  size_t GetLastMissingByteInInterval(size_t offset) const;
 
  private:
   std::vector<unsigned char> data_;
 
   // Pair, first - begining of the chunk, second - size of the chunk.
   std::map<size_t, size_t> chunks_;
+
+  size_t stream_size_;
 };
 
 };  // namespace chrome_pdf
 
-#endif
+#endif  // PDF_CHUNK_STREAM_H_
diff --git a/pdf/document_loader.cc b/pdf/document_loader.cc
index 5bbed1a..89e7467 100644
--- a/pdf/document_loader.cc
+++ b/pdf/document_loader.cc
@@ -16,9 +16,6 @@ namespace chrome_pdf {
 
 namespace {
 
-// Document below size will be downloaded in one chunk.
-const uint32_t kMinFileSize = 64 * 1024;
-
 // If the headers have a byte-range response, writes the start and end
 // positions and returns true if at least the start position was parsed.
 // The end position will be set to 0 if it was not found or parsed from the
@@ -176,14 +173,18 @@ bool DocumentLoader::Init(const pp::URLLoader& loader,
 }
 
 void DocumentLoader::LoadPartialDocument() {
+  // The current request is a full request (not a range request) so it starts at
+  // 0 and ends at |document_size_|.
+  current_chunk_size_ = document_size_;
+  current_pos_ = 0;
+  current_request_offset_ = 0;
+  current_request_size_ = 0;
+  current_request_extended_size_ = document_size_;
+  request_pending_ = true;
+
   partial_document_ = true;
-  // Force the main request to be cancelled, since if we're a full-frame plugin
-  // there could be other references to the loader.
-  loader_.Close();
-  loader_ = pp::URLLoader();
-  // Download file header.
   header_request_ = true;
-  RequestData(0, std::min(GetRequestSize(), document_size_));
+  ReadMore();
 }
 
 void DocumentLoader::LoadFullDocument() {
@@ -212,12 +213,8 @@ uint32_t DocumentLoader::GetAvailableData() const {
 }
 
 void DocumentLoader::ClearPendingRequests() {
-  // The first item in the queue is pending (need to keep it in the queue).
-  if (pending_requests_.size() > 1) {
-    // Remove all elements except the first one.
-    pending_requests_.erase(++pending_requests_.begin(),
-                            pending_requests_.end());
-  }
+  pending_requests_.erase(pending_requests_.begin(),
+                          pending_requests_.end());
 }
 
 bool DocumentLoader::GetBlock(uint32_t position,
@@ -247,86 +244,74 @@ void DocumentLoader::RequestData(uint32_t position, uint32_t size) {
   DownloadPendingRequests();
 }
 
+void DocumentLoader::RemoveCompletedRanges() {
+  // Split every request that has been partially downloaded already into smaller
+  // requests.
+  std::vector<std::pair<size_t, size_t> > ranges;
+  auto it = pending_requests_.begin();
+  while (it != pending_requests_.end()) {
+    chunk_stream_.GetMissedRanges(it->first, it->second, &ranges);
+    pending_requests_.insert(it, ranges.begin(), ranges.end());
+    ranges.clear();
+    pending_requests_.erase(it++);
+  }
+}
+
 void DocumentLoader::DownloadPendingRequests() {
-  if (request_pending_ || pending_requests_.empty())
+  if (request_pending_)
     return;
 
-  // Remove already completed requests.
-  // By design DownloadPendingRequests() should have at least 1 request in the
-  // queue. ReadComplete() will remove the last pending comment from the queue.
-  while (pending_requests_.size() > 1) {
-    if (IsDataAvailable(pending_requests_.front().first,
-                        pending_requests_.front().second)) {
-      pending_requests_.pop_front();
-    } else {
-      break;
+  uint32_t pos;
+  uint32_t size;
+  if (pending_requests_.empty()) {
+    // If the document is not complete and we have no outstanding requests,
+    // download what's left for as long as no other request gets added to
+    // |pending_requests_|.
+    pos = chunk_stream_.GetFirstMissingByte();
+    if (pos >= document_size_) {
+      // We're done downloading the document.
+      return;
     }
-  }
-
-  uint32_t pos = pending_requests_.front().first;
-  uint32_t size = pending_requests_.front().second;
-  if (IsDataAvailable(pos, size)) {
-    ReadComplete();
-    return;
-  }
+    // Start with size 0, we'll set |current_request_extended_size_| to > 0.
+    // This way this request will get cancelled as soon as the renderer wants
+    // another portion of the document.
+    size = 0;
+  } else {
+    RemoveCompletedRanges();
 
-  // If current request has been partially downloaded already, split it into
-  // a few smaller requests.
-  std::vector<std::pair<size_t, size_t> > ranges;
-  chunk_stream_.GetMissedRanges(pos, size, &ranges);
-  if (!ranges.empty()) {
-    pending_requests_.pop_front();
-    pending_requests_.insert(pending_requests_.begin(),
-                             ranges.begin(), ranges.end());
     pos = pending_requests_.front().first;
     size = pending_requests_.front().second;
-  }
-
-  uint32_t cur_request_size = GetRequestSize();
-  // If size is less than default request, try to expand download range for
-  // more optimal download.
-  if (size < cur_request_size && partial_document_) {
-    // First, try to expand block towards the end of the file.
-    uint32_t new_pos = pos;
-    uint32_t new_size = cur_request_size;
-    if (pos + new_size > document_size_)
-      new_size = document_size_ - pos;
-
-    std::vector<std::pair<size_t, size_t> > ranges;
-    if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
-      new_pos = ranges[0].first;
-      new_size = ranges[0].second;
+    if (IsDataAvailable(pos, size)) {
+      ReadComplete();
+      return;
     }
+  }
 
-    // Second, try to expand block towards the beginning of the file.
-    if (new_size < cur_request_size) {
-      uint32_t block_end = new_pos + new_size;
-      if (block_end > cur_request_size) {
-        new_pos = block_end - cur_request_size;
-      } else {
-        new_pos = 0;
-      }
-      new_size = block_end - new_pos;
-
-      if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
-        new_pos = ranges.back().first;
-        new_size = ranges.back().second;
-      }
+  size_t last_byte_before = chunk_stream_.GetFirstMissingByteInInterval(pos);
+  if (size < kDefaultRequestSize) {
+    // Try to extend before pos, up to size |kDefaultRequestSize|.
+    if (pos + size - last_byte_before > kDefaultRequestSize) {
+      pos += size - kDefaultRequestSize;
+      size = kDefaultRequestSize;
+    } else {
+      size += pos - last_byte_before;
+      pos = last_byte_before;
     }
-    pos = new_pos;
-    size = new_size;
   }
-
-  size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
-  size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
-  if (pos - last_byte_before < cur_request_size) {
-    size = pos + size - last_byte_before;
+  if (pos - last_byte_before < kDefaultRequestSize) {
+    // Don't leave a gap smaller than |kDefaultRequestSize|.
+    size += pos - last_byte_before;
     pos = last_byte_before;
   }
 
-  if ((pos + size < first_byte_after) &&
-      (pos + size + cur_request_size >= first_byte_after))
-    size = first_byte_after - pos;
+  current_request_offset_ = pos;
+  current_request_size_ = size;
+
+  // Extend the request until the next downloaded byte or the end of the
+  // document.
+  size_t last_missing_byte =
+      chunk_stream_.GetLastMissingByteInInterval(pos + size - 1);
+  current_request_extended_size_ = last_missing_byte - pos + 1;
 
   request_pending_ = true;
 
@@ -335,7 +320,7 @@ void DocumentLoader::DownloadPendingRequests() {
   loader_ = client_->CreateURLLoader();
   pp::CompletionCallback callback =
       loader_factory_.NewCallback(&DocumentLoader::DidOpen);
-  pp::URLRequestInfo request = GetRequest(pos, size);
+  pp::URLRequestInfo request = GetRequest(pos, current_request_extended_size_);
   requests_count_++;
   int rv = loader_.Open(request, callback);
   if (rv != PP_OK_COMPLETIONPENDING)
@@ -469,14 +454,51 @@ void DocumentLoader::DidRead(int32_t result) {
       current_chunk_read_ += length;
       client_->OnNewDataAvailable();
     }
+
+    // Only call the renderer if we allow partial loading.
+    if (!partial_document_) {
+      ReadMore();
+      return;
+    }
+
+    UpdateRendering();
+    RemoveCompletedRanges();
+
+    if (!pending_requests_.empty()) {
+      // If there are pending requests and the current content we're downloading
+      // doesn't satisfy any of these requests, cancel the current request to
+      // fullfill those more important requests.
+      bool satisfying_pending_request =
+            SatisfyingRequest(current_request_offset_, current_request_size_);
+      for (const auto& pending_request : pending_requests_) {
+        if (SatisfyingRequest(pending_request.first, pending_request.second)) {
+          satisfying_pending_request = true;
+          break;
+        }
+      }
+      // Cancel the request as it's not satisfying any request from the
+      // renderer, unless the current request is finished in which case we let
+      // it finish cleanly.
+      if (!satisfying_pending_request &&
+          current_pos_ < current_request_offset_ +
+          current_request_extended_size_) {
+        loader_.Close();
+      }
+    }
+
     ReadMore();
-  } else if (result == PP_OK) {
+  } else if (result == PP_OK || result == PP_ERROR_ABORTED) {
     ReadComplete();
   } else {
     NOTREACHED();
   }
 }
 
+bool DocumentLoader::SatisfyingRequest(size_t offset, size_t size) const {
+  return offset <= current_pos_ + kDefaultRequestSize &&
+      current_pos_ < offset + size;
+}
+
 void DocumentLoader::ReadComplete() {
   if (!partial_document_) {
     if (document_size_ == 0) {
@@ -497,46 +519,22 @@ void DocumentLoader::ReadComplete() {
   }
 
   request_pending_ = false;
-  pending_requests_.pop_front();
-
-  // If there are more pending request - continue downloading.
-  if (!pending_requests_.empty()) {
-    DownloadPendingRequests();
-    return;
-  }
 
   if (IsDocumentComplete()) {
     client_->OnDocumentComplete();
     return;
   }
 
+  UpdateRendering();
+  DownloadPendingRequests();
+}
+
+void DocumentLoader::UpdateRendering() {
   if (header_request_)
     client_->OnPartialDocumentLoaded();
   else
     client_->OnPendingRequestComplete();
   header_request_ = false;
-
-  // The OnPendingRequestComplete could have added more requests.
-  if (!pending_requests_.empty()) {
-    DownloadPendingRequests();
-  } else {
-    // Document is not complete and we have no outstanding requests.
-    // Let's keep downloading PDF file in small chunks.
-    uint32_t pos = chunk_stream_.GetFirstMissingByte();
-    std::vector<std::pair<size_t, size_t> > ranges;
-    chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
-    DCHECK(!ranges.empty());
-    RequestData(ranges[0].first, ranges[0].second);
-  }
-}
-
-uint32_t DocumentLoader::GetRequestSize() const {
-  // Document loading strategy:
-  // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
-  // double the size (64k), and so on, until we cap max request size at 2M for
-  // 71 or more requests.
-  uint32_t limited_count = std::min(std::max(requests_count_, 10u), 70u);
-  return 32 * 1024 * (1 << ((limited_count - 1) / 10u));
 }
 
 }  // namespace chrome_pdf
diff --git a/pdf/document_loader.h b/pdf/document_loader.h
index 4e734a0..7e175de 100644
--- a/pdf/document_loader.h
+++ b/pdf/document_loader.h
@@ -7,6 +7,7 @@
 
 #include <list>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "base/basictypes.h"
@@ -14,8 +15,6 @@
 #include "ppapi/cpp/url_loader.h"
 #include "ppapi/utility/completion_callback_factory.h"
 
-#define kDefaultRequestSize 32768u
-
 namespace chrome_pdf {
 
 class DocumentLoader {
@@ -81,12 +80,24 @@ class DocumentLoader {
   void LoadFullDocument();
   // Download pending requests.
   void DownloadPendingRequests();
+  // Remove completed ranges.
+  void RemoveCompletedRanges();
+  // Returns true if we are already in progress satisfying the request, or just
+  // about ready to start. This helps us avoid expensive jumping around, and
+  // even worse leaving tiny gaps in the byte stream that might have to be
+  // filled later.
+  bool SatisfyingRequest(size_t pos, size_t size) const;
   // Called when we complete server request and read all data from it.
   void ReadComplete();
   // Creates request to download size byte of data data starting from position.
   pp::URLRequestInfo GetRequest(uint32_t position, uint32_t size) const;
-  // Returns current request size in bytes.
-  uint32_t GetRequestSize() const;
+  // Updates the rendering by the Client.
+  void UpdateRendering();
+
+  // Document below size will be downloaded in one chunk.
+  static const uint32_t kMinFileSize = 64 * 1024;
+  // Number was chosen in crbug.com/78264#c8
+  enum { kDefaultRequestSize = 65536 };
 
   Client* client_;
   std::string url_;
@@ -97,6 +108,15 @@ class DocumentLoader {
   bool request_pending_;
   typedef std::list<std::pair<size_t, size_t> > PendingRequests;
   PendingRequests pending_requests_;
+  // The starting position of the HTTP request currently being processed.
+  size_t current_request_offset_;
+  // The size of the byte range the current HTTP request must download before
+  // being cancelled.
+  size_t current_request_size_;
+  // The actual byte range size of the current HTTP request. This may be larger
+  // than |current_request_size_| and the request may be cancelled before
+  // reaching |current_request_offset_| + |current_request_extended_size_|.
+  size_t current_request_extended_size_;
   char buffer_[kDefaultRequestSize];
   uint32_t current_pos_;
   uint32_t current_chunk_size_;
author	spelchat <spelchat@chromium.org>	2015-12-09 16:44:15 -0800
committer	Commit bot <commit-bot@chromium.org>	2015-12-10 00:45:12 +0000
commit	3ba2a28104c4d6feef3efd71d9be73f085886f53 (patch)
tree	2c05f055b78bed8cf7faf7ad5af3918595e74d46 /pdf
parent	0eabbb80369d664feb34db3ea4e3f9247f535e95 (diff)
download	chromium_src-3ba2a28104c4d6feef3efd71d9be73f085886f53.zip chromium_src-3ba2a28104c4d6feef3efd71d9be73f085886f53.tar.gz chromium_src-3ba2a28104c4d6feef3efd71d9be73f085886f53.tar.bz2