• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "pdf/document_loader.h"
6 
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
14 
15 namespace chrome_pdf {
16 
17 // Document below size will be downloaded in one chunk.
18 const uint32 kMinFileSize = 64*1024;
19 
DocumentLoader(Client * client)20 DocumentLoader::DocumentLoader(Client* client)
21     : client_(client), partial_document_(false), request_pending_(false),
22       current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
23       document_size_(0), header_request_(true), is_multipart_(false) {
24   loader_factory_.Initialize(this);
25 }
26 
~DocumentLoader()27 DocumentLoader::~DocumentLoader() {
28 }
29 
Init(const pp::URLLoader & loader,const std::string & url,const std::string & headers)30 bool DocumentLoader::Init(const pp::URLLoader& loader,
31                           const std::string& url,
32                           const std::string& headers) {
33   DCHECK(url_.empty());
34   url_ = url;
35   loader_ = loader;
36 
37   std::string response_headers;
38   if (!headers.empty()) {
39     response_headers = headers;
40   } else {
41     pp::URLResponseInfo response = loader_.GetResponseInfo();
42     pp::Var headers_var = response.GetHeaders();
43 
44     if (headers_var.is_string()) {
45       response_headers = headers_var.AsString();
46     }
47   }
48 
49   bool accept_ranges_bytes = false;
50   bool content_encoded = false;
51   uint32 content_length = 0;
52   std::string type;
53   std::string disposition;
54   if (!response_headers.empty()) {
55     net::HttpUtil::HeadersIterator it(response_headers.begin(),
56                                       response_headers.end(), "\n");
57     while (it.GetNext()) {
58       if (LowerCaseEqualsASCII(it.name(), "content-length")) {
59         content_length = atoi(it.values().c_str());
60       } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
61         accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
62       } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
63         content_encoded = true;
64       } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
65         type = it.values();
66         size_t semi_colon_pos = type.find(';');
67         if (semi_colon_pos != std::string::npos) {
68           type = type.substr(0, semi_colon_pos);
69         }
70         TrimWhitespace(type, base::TRIM_ALL, &type);
71       } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
72         disposition = it.values();
73       }
74     }
75   }
76   if (!type.empty() &&
77       !EndsWith(type, "/pdf", false) &&
78       !EndsWith(type, ".pdf", false) &&
79       !EndsWith(type, "/x-pdf", false) &&
80       !EndsWith(type, "/*", false) &&
81       !EndsWith(type, "/acrobat", false) &&
82       !EndsWith(type, "/unknown", false)) {
83     return false;
84   }
85   if (StartsWithASCII(disposition, "attachment", false)) {
86     return false;
87   }
88 
89   if (content_length > 0)
90     chunk_stream_.Preallocate(content_length);
91 
92   document_size_ = content_length;
93   requests_count_ = 0;
94 
95   // Enable partial loading only if file size is above the threshold.
96   // It will allow avoiding latency for multiple requests.
97   if (content_length > kMinFileSize &&
98       accept_ranges_bytes &&
99       !content_encoded) {
100     LoadPartialDocument();
101   } else {
102     LoadFullDocument();
103   }
104   return true;
105 }
106 
LoadPartialDocument()107 void DocumentLoader::LoadPartialDocument() {
108   partial_document_ = true;
109   // Force the main request to be cancelled, since if we're a full-frame plugin
110   // there could be other references to the loader.
111   loader_.Close();
112   loader_ = pp::URLLoader();
113   // Download file header.
114   header_request_ = true;
115   RequestData(0, std::min(GetRequestSize(), document_size_));
116 }
117 
LoadFullDocument()118 void DocumentLoader::LoadFullDocument() {
119   partial_document_ = false;
120   chunk_buffer_.clear();
121   ReadMore();
122 }
123 
IsDocumentComplete() const124 bool DocumentLoader::IsDocumentComplete() const {
125   if (document_size_ == 0)  // Document size unknown.
126     return false;
127   return IsDataAvailable(0, document_size_);
128 }
129 
GetAvailableData() const130 uint32 DocumentLoader::GetAvailableData() const {
131   if (document_size_ == 0) {  // If document size is unknown.
132     return current_pos_;
133   }
134 
135   std::vector<std::pair<size_t, size_t> > ranges;
136   chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
137   uint32 available = document_size_;
138   std::vector<std::pair<size_t, size_t> >::iterator it;
139   for (it = ranges.begin(); it != ranges.end(); ++it) {
140     available -= it->second;
141   }
142   return available;
143 }
144 
ClearPendingRequests()145 void DocumentLoader::ClearPendingRequests() {
146   // The first item in the queue is pending (need to keep it in the queue).
147   if (pending_requests_.size() > 1) {
148     // Remove all elements except the first one.
149     pending_requests_.erase(++pending_requests_.begin(),
150                             pending_requests_.end());
151   }
152 }
153 
GetBlock(uint32 position,uint32 size,void * buf) const154 bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
155   return chunk_stream_.ReadData(position, size, buf);
156 }
157 
IsDataAvailable(uint32 position,uint32 size) const158 bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
159   return chunk_stream_.IsRangeAvailable(position, size);
160 }
161 
RequestData(uint32 position,uint32 size)162 void DocumentLoader::RequestData(uint32 position, uint32 size) {
163   DCHECK(partial_document_);
164 
165   // We have some artefact request from
166   // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
167   // document is complete.
168   // We need this fix in PDFIum. Adding this as a work around.
169   // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
170   // Test url:
171   // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
172   if (IsDocumentComplete())
173     return;
174 
175   pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
176   DownloadPendingRequests();
177 }
178 
DownloadPendingRequests()179 void DocumentLoader::DownloadPendingRequests() {
180   if (request_pending_ || pending_requests_.empty())
181     return;
182 
183   // Remove already completed requests.
184   // By design DownloadPendingRequests() should have at least 1 request in the
185   // queue. ReadComplete() will remove the last pending comment from the queue.
186   while (pending_requests_.size() > 1) {
187     if (IsDataAvailable(pending_requests_.front().first,
188                         pending_requests_.front().second)) {
189       pending_requests_.pop_front();
190     } else {
191       break;
192     }
193   }
194 
195   uint32 pos = pending_requests_.front().first;
196   uint32 size = pending_requests_.front().second;
197   if (IsDataAvailable(pos, size)) {
198     ReadComplete();
199     return;
200   }
201 
202   // If current request has been partially downloaded already, split it into
203   // a few smaller requests.
204   std::vector<std::pair<size_t, size_t> > ranges;
205   chunk_stream_.GetMissedRanges(pos, size, &ranges);
206   if (ranges.size() > 0) {
207     pending_requests_.pop_front();
208     pending_requests_.insert(pending_requests_.begin(),
209                              ranges.begin(), ranges.end());
210     pos = pending_requests_.front().first;
211     size = pending_requests_.front().second;
212   }
213 
214   uint32 cur_request_size = GetRequestSize();
215   // If size is less than default request, try to expand download range for
216   // more optimal download.
217   if (size < cur_request_size && partial_document_) {
218     // First, try to expand block towards the end of the file.
219     uint32 new_pos = pos;
220     uint32 new_size = cur_request_size;
221     if (pos + new_size > document_size_)
222       new_size = document_size_ - pos;
223 
224     std::vector<std::pair<size_t, size_t> > ranges;
225     if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
226       new_pos = ranges[0].first;
227       new_size = ranges[0].second;
228     }
229 
230     // Second, try to expand block towards the beginning of the file.
231     if (new_size < cur_request_size) {
232       uint32 block_end = new_pos + new_size;
233       if (block_end > cur_request_size) {
234         new_pos = block_end - cur_request_size;
235       } else {
236         new_pos = 0;
237       }
238       new_size = block_end - new_pos;
239 
240       if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
241         new_pos = ranges.back().first;
242         new_size = ranges.back().second;
243       }
244     }
245     pos = new_pos;
246     size = new_size;
247   }
248 
249   size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
250   size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
251   if (pos - last_byte_before < cur_request_size) {
252     size = pos + size - last_byte_before;
253     pos = last_byte_before;
254   }
255 
256   if ((pos + size < first_byte_after) &&
257       (pos + size + cur_request_size >= first_byte_after))
258     size = first_byte_after - pos;
259 
260   request_pending_ = true;
261 
262   // Start downloading first pending request.
263   loader_.Close();
264   loader_ = client_->CreateURLLoader();
265   pp::CompletionCallback callback =
266       loader_factory_.NewCallback(&DocumentLoader::DidOpen);
267   pp::URLRequestInfo request = GetRequest(pos, size);
268   requests_count_++;
269   int rv = loader_.Open(request, callback);
270   if (rv != PP_OK_COMPLETIONPENDING)
271     callback.Run(rv);
272 }
273 
GetRequest(uint32 position,uint32 size) const274 pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
275                                               uint32 size) const {
276   pp::URLRequestInfo request(client_->GetPluginInstance());
277   request.SetURL(url_.c_str());
278   request.SetMethod("GET");
279   request.SetFollowRedirects(true);
280 
281   const size_t kBufSize = 100;
282   char buf[kBufSize];
283   // According to rfc2616, byte range specifies position of the first and last
284   // bytes in the requested range inclusively. Therefore we should subtract 1
285   // from the position + size, to get index of the last byte that needs to be
286   // downloaded.
287   base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
288                  position + size - 1);
289   pp::Var header(buf);
290   request.SetHeaders(header);
291 
292   return request;
293 }
294 
DidOpen(int32_t result)295 void DocumentLoader::DidOpen(int32_t result) {
296   if (result != PP_OK) {
297     NOTREACHED();
298     return;
299   }
300 
301   is_multipart_ = false;
302   current_chunk_size_ = 0;
303   current_chunk_read_ = 0;
304 
305   pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
306   std::string headers;
307   if (headers_var.is_string())
308     headers = headers_var.AsString();
309 
310   std::string boundary = GetMultiPartBoundary(headers);
311   if (boundary.size()) {
312     // Leave position untouched for now, when we read the data we'll get it.
313     is_multipart_ = true;
314     multipart_boundary_ = boundary;
315   } else {
316     // Need to make sure that the server returned a byte-range, since it's
317     // possible for a server to just ignore our bye-range request and just
318     // return the entire document even if it supports byte-range requests.
319     // i.e. sniff response to
320     // http://www.act.org/compass/sample/pdf/geometry.pdf
321     current_pos_ = 0;
322     uint32 start_pos, end_pos;
323     if (GetByteRange(headers, &start_pos, &end_pos)) {
324       current_pos_ = start_pos;
325       if (end_pos && end_pos > start_pos)
326         current_chunk_size_ = end_pos - start_pos + 1;
327     }
328   }
329 
330   ReadMore();
331 }
332 
GetByteRange(const std::string & headers,uint32 * start,uint32 * end)333 bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
334                                   uint32* end) {
335   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
336   while (it.GetNext()) {
337     if (LowerCaseEqualsASCII(it.name(), "content-range")) {
338       std::string range = it.values().c_str();
339       if (StartsWithASCII(range, "bytes", false)) {
340         range = range.substr(strlen("bytes"));
341         std::string::size_type pos = range.find('-');
342         std::string range_end;
343         if (pos != std::string::npos)
344           range_end = range.substr(pos + 1);
345         TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
346         TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
347         *start = atoi(range.c_str());
348         *end = atoi(range_end.c_str());
349         return true;
350       }
351     }
352   }
353   return false;
354 }
355 
GetMultiPartBoundary(const std::string & headers)356 std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
357   net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
358   while (it.GetNext()) {
359     if (LowerCaseEqualsASCII(it.name(), "content-type")) {
360       std::string type = base::StringToLowerASCII(it.values());
361       if (StartsWithASCII(type, "multipart/", true)) {
362         const char* boundary = strstr(type.c_str(), "boundary=");
363         if (!boundary) {
364           NOTREACHED();
365           break;
366         }
367 
368         return std::string(boundary + 9);
369       }
370     }
371   }
372   return std::string();
373 }
374 
ReadMore()375 void DocumentLoader::ReadMore() {
376   pp::CompletionCallback callback =
377         loader_factory_.NewCallback(&DocumentLoader::DidRead);
378   int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
379   if (rv != PP_OK_COMPLETIONPENDING)
380     callback.Run(rv);
381 }
382 
DidRead(int32_t result)383 void DocumentLoader::DidRead(int32_t result) {
384   if (result > 0) {
385     char* start = buffer_;
386     size_t length = result;
387     if (is_multipart_ && result > 2) {
388       for (int i = 2; i < result; ++i) {
389         if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
390             (i >= 4 &&
391              buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
392              buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
393           uint32 start_pos, end_pos;
394           if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
395             current_pos_ = start_pos;
396             start += i;
397             length -= i;
398             if (end_pos && end_pos > start_pos)
399               current_chunk_size_ = end_pos - start_pos + 1;
400           }
401           break;
402         }
403       }
404 
405       // Reset this flag so we don't look inside the buffer in future calls of
406       // DidRead for this response.  Note that this code DOES NOT handle multi-
407       // part responses with more than one part (we don't issue them at the
408       // moment, so they shouldn't arrive).
409       is_multipart_ = false;
410     }
411 
412     if (current_chunk_size_ &&
413         current_chunk_read_ + length > current_chunk_size_)
414       length = current_chunk_size_ - current_chunk_read_;
415 
416     if (length) {
417       if (document_size_ > 0) {
418         chunk_stream_.WriteData(current_pos_, start, length);
419       } else {
420         // If we did not get content-length in the response, we can't
421         // preallocate buffer for the entire document. Resizing array causing
422         // memory fragmentation issues on the large files and OOM exceptions.
423         // To fix this, we collect all chunks of the file to the list and
424         // concatenate them together after request is complete.
425         chunk_buffer_.push_back(std::vector<unsigned char>());
426         chunk_buffer_.back().resize(length);
427         memcpy(&(chunk_buffer_.back()[0]), start, length);
428       }
429       current_pos_ += length;
430       current_chunk_read_ += length;
431       client_->OnNewDataAvailable();
432     }
433     ReadMore();
434   } else if (result == PP_OK) {
435     ReadComplete();
436   } else {
437     NOTREACHED();
438   }
439 }
440 
ReadComplete()441 void DocumentLoader::ReadComplete() {
442   if (!partial_document_) {
443     if (document_size_ == 0) {
444       // For the document with no 'content-length" specified we've collected all
445       // the chunks already. Let's allocate final document buffer and copy them
446       // over.
447       chunk_stream_.Preallocate(current_pos_);
448       uint32 pos = 0;
449       std::list<std::vector<unsigned char> >::iterator it;
450       for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
451         chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
452         pos += it->size();
453       }
454       chunk_buffer_.clear();
455     }
456     document_size_ = current_pos_;
457     client_->OnDocumentComplete();
458     return;
459   }
460 
461   request_pending_ = false;
462   pending_requests_.pop_front();
463 
464   // If there are more pending request - continue downloading.
465   if (!pending_requests_.empty()) {
466     DownloadPendingRequests();
467     return;
468   }
469 
470   if (IsDocumentComplete()) {
471     client_->OnDocumentComplete();
472     return;
473   }
474 
475   if (header_request_)
476     client_->OnPartialDocumentLoaded();
477   else
478     client_->OnPendingRequestComplete();
479   header_request_ = false;
480 
481   // The OnPendingRequestComplete could have added more requests.
482   if (!pending_requests_.empty()) {
483     DownloadPendingRequests();
484   } else {
485     // Document is not complete and we have no outstanding requests.
486     // Let's keep downloading PDF file in small chunks.
487     uint32 pos = chunk_stream_.GetFirstMissingByte();
488     std::vector<std::pair<size_t, size_t> > ranges;
489     chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
490     DCHECK(ranges.size() > 0);
491     RequestData(ranges[0].first, ranges[0].second);
492   }
493 }
494 
GetRequestSize() const495 uint32 DocumentLoader::GetRequestSize() const {
496   // Document loading strategy:
497   // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
498   // double the size (64k), and so on, until we cap max request size at 2M for
499   // 71 or more requests.
500   uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u);
501   return 32*1024 * (1 << ((limited_count - 1) / 10u));
502 }
503 
504 }  // namespace chrome_pdf
505