1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "pdf/document_loader.h"
6
7#include "base/logging.h"
8#include "base/strings/string_util.h"
9#include "net/http/http_util.h"
10#include "ppapi/c/pp_errors.h"
11#include "ppapi/cpp/url_loader.h"
12#include "ppapi/cpp/url_request_info.h"
13#include "ppapi/cpp/url_response_info.h"
14
15namespace chrome_pdf {
16
17// Document below size will be downloaded in one chunk.
18const uint32 kMinFileSize = 64*1024;
19
20DocumentLoader::DocumentLoader(Client* client)
21    : client_(client), partial_document_(false), request_pending_(false),
22      current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
23      document_size_(0), header_request_(true), is_multipart_(false) {
24  loader_factory_.Initialize(this);
25}
26
27DocumentLoader::~DocumentLoader() {
28}
29
30bool DocumentLoader::Init(const pp::URLLoader& loader,
31                          const std::string& url,
32                          const std::string& headers) {
33  DCHECK(url_.empty());
34  url_ = url;
35  loader_ = loader;
36
37  std::string response_headers;
38  if (!headers.empty()) {
39    response_headers = headers;
40  } else {
41    pp::URLResponseInfo response = loader_.GetResponseInfo();
42    pp::Var headers_var = response.GetHeaders();
43
44    if (headers_var.is_string()) {
45      response_headers = headers_var.AsString();
46    }
47  }
48
49  bool accept_ranges_bytes = false;
50  bool content_encoded = false;
51  uint32 content_length = 0;
52  std::string type;
53  std::string disposition;
54  if (!response_headers.empty()) {
55    net::HttpUtil::HeadersIterator it(response_headers.begin(),
56                                      response_headers.end(), "\n");
57    while (it.GetNext()) {
58      if (LowerCaseEqualsASCII(it.name(), "content-length")) {
59        content_length = atoi(it.values().c_str());
60      } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
61        accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
62      } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
63        content_encoded = true;
64      } else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
65        type = it.values();
66        size_t semi_colon_pos = type.find(';');
67        if (semi_colon_pos != std::string::npos) {
68          type = type.substr(0, semi_colon_pos);
69        }
70        TrimWhitespace(type, base::TRIM_ALL, &type);
71      } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
72        disposition = it.values();
73      }
74    }
75  }
76  if (!type.empty() &&
77      !EndsWith(type, "/pdf", false) &&
78      !EndsWith(type, ".pdf", false) &&
79      !EndsWith(type, "/x-pdf", false) &&
80      !EndsWith(type, "/*", false) &&
81      !EndsWith(type, "/acrobat", false) &&
82      !EndsWith(type, "/unknown", false)) {
83    return false;
84  }
85  if (StartsWithASCII(disposition, "attachment", false)) {
86    return false;
87  }
88
89  if (content_length > 0)
90    chunk_stream_.Preallocate(content_length);
91
92  document_size_ = content_length;
93  requests_count_ = 0;
94
95  // Enable partial loading only if file size is above the threshold.
96  // It will allow avoiding latency for multiple requests.
97  if (content_length > kMinFileSize &&
98      accept_ranges_bytes &&
99      !content_encoded) {
100    LoadPartialDocument();
101  } else {
102    LoadFullDocument();
103  }
104  return true;
105}
106
107void DocumentLoader::LoadPartialDocument() {
108  partial_document_ = true;
109  // Force the main request to be cancelled, since if we're a full-frame plugin
110  // there could be other references to the loader.
111  loader_.Close();
112  loader_ = pp::URLLoader();
113  // Download file header.
114  header_request_ = true;
115  RequestData(0, std::min(GetRequestSize(), document_size_));
116}
117
118void DocumentLoader::LoadFullDocument() {
119  partial_document_ = false;
120  chunk_buffer_.clear();
121  ReadMore();
122}
123
124bool DocumentLoader::IsDocumentComplete() const {
125  if (document_size_ == 0)  // Document size unknown.
126    return false;
127  return IsDataAvailable(0, document_size_);
128}
129
130uint32 DocumentLoader::GetAvailableData() const {
131  if (document_size_ == 0) {  // If document size is unknown.
132    return current_pos_;
133  }
134
135  std::vector<std::pair<size_t, size_t> > ranges;
136  chunk_stream_.GetMissedRanges(0, document_size_, &ranges);
137  uint32 available = document_size_;
138  std::vector<std::pair<size_t, size_t> >::iterator it;
139  for (it = ranges.begin(); it != ranges.end(); ++it) {
140    available -= it->second;
141  }
142  return available;
143}
144
145void DocumentLoader::ClearPendingRequests() {
146  // The first item in the queue is pending (need to keep it in the queue).
147  if (pending_requests_.size() > 1) {
148    // Remove all elements except the first one.
149    pending_requests_.erase(++pending_requests_.begin(),
150                            pending_requests_.end());
151  }
152}
153
154bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
155  return chunk_stream_.ReadData(position, size, buf);
156}
157
158bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
159  return chunk_stream_.IsRangeAvailable(position, size);
160}
161
162void DocumentLoader::RequestData(uint32 position, uint32 size) {
163  DCHECK(partial_document_);
164
165  // We have some artefact request from
166  // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
167  // document is complete.
168  // We need this fix in PDFIum. Adding this as a work around.
169  // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
170  // Test url:
171  // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
172  if (IsDocumentComplete())
173    return;
174
175  pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
176  DownloadPendingRequests();
177}
178
179void DocumentLoader::DownloadPendingRequests() {
180  if (request_pending_ || pending_requests_.empty())
181    return;
182
183  // Remove already completed requests.
184  // By design DownloadPendingRequests() should have at least 1 request in the
185  // queue. ReadComplete() will remove the last pending comment from the queue.
186  while (pending_requests_.size() > 1) {
187    if (IsDataAvailable(pending_requests_.front().first,
188                        pending_requests_.front().second)) {
189      pending_requests_.pop_front();
190    } else {
191      break;
192    }
193  }
194
195  uint32 pos = pending_requests_.front().first;
196  uint32 size = pending_requests_.front().second;
197  if (IsDataAvailable(pos, size)) {
198    ReadComplete();
199    return;
200  }
201
202  // If current request has been partially downloaded already, split it into
203  // a few smaller requests.
204  std::vector<std::pair<size_t, size_t> > ranges;
205  chunk_stream_.GetMissedRanges(pos, size, &ranges);
206  if (ranges.size() > 0) {
207    pending_requests_.pop_front();
208    pending_requests_.insert(pending_requests_.begin(),
209                             ranges.begin(), ranges.end());
210    pos = pending_requests_.front().first;
211    size = pending_requests_.front().second;
212  }
213
214  uint32 cur_request_size = GetRequestSize();
215  // If size is less than default request, try to expand download range for
216  // more optimal download.
217  if (size < cur_request_size && partial_document_) {
218    // First, try to expand block towards the end of the file.
219    uint32 new_pos = pos;
220    uint32 new_size = cur_request_size;
221    if (pos + new_size > document_size_)
222      new_size = document_size_ - pos;
223
224    std::vector<std::pair<size_t, size_t> > ranges;
225    if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
226      new_pos = ranges[0].first;
227      new_size = ranges[0].second;
228    }
229
230    // Second, try to expand block towards the beginning of the file.
231    if (new_size < cur_request_size) {
232      uint32 block_end = new_pos + new_size;
233      if (block_end > cur_request_size) {
234        new_pos = block_end - cur_request_size;
235      } else {
236        new_pos = 0;
237      }
238      new_size = block_end - new_pos;
239
240      if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
241        new_pos = ranges.back().first;
242        new_size = ranges.back().second;
243      }
244    }
245    pos = new_pos;
246    size = new_size;
247  }
248
249  size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
250  size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
251  if (pos - last_byte_before < cur_request_size) {
252    size = pos + size - last_byte_before;
253    pos = last_byte_before;
254  }
255
256  if ((pos + size < first_byte_after) &&
257      (pos + size + cur_request_size >= first_byte_after))
258    size = first_byte_after - pos;
259
260  request_pending_ = true;
261
262  // Start downloading first pending request.
263  loader_.Close();
264  loader_ = client_->CreateURLLoader();
265  pp::CompletionCallback callback =
266      loader_factory_.NewCallback(&DocumentLoader::DidOpen);
267  pp::URLRequestInfo request = GetRequest(pos, size);
268  requests_count_++;
269  int rv = loader_.Open(request, callback);
270  if (rv != PP_OK_COMPLETIONPENDING)
271    callback.Run(rv);
272}
273
274pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
275                                              uint32 size) const {
276  pp::URLRequestInfo request(client_->GetPluginInstance());
277  request.SetURL(url_.c_str());
278  request.SetMethod("GET");
279  request.SetFollowRedirects(true);
280
281  const size_t kBufSize = 100;
282  char buf[kBufSize];
283  // According to rfc2616, byte range specifies position of the first and last
284  // bytes in the requested range inclusively. Therefore we should subtract 1
285  // from the position + size, to get index of the last byte that needs to be
286  // downloaded.
287  base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position,
288                 position + size - 1);
289  pp::Var header(buf);
290  request.SetHeaders(header);
291
292  return request;
293}
294
295void DocumentLoader::DidOpen(int32_t result) {
296  if (result != PP_OK) {
297    NOTREACHED();
298    return;
299  }
300
301  is_multipart_ = false;
302  current_chunk_size_ = 0;
303  current_chunk_read_ = 0;
304
305  pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
306  std::string headers;
307  if (headers_var.is_string())
308    headers = headers_var.AsString();
309
310  std::string boundary = GetMultiPartBoundary(headers);
311  if (boundary.size()) {
312    // Leave position untouched for now, when we read the data we'll get it.
313    is_multipart_ = true;
314    multipart_boundary_ = boundary;
315  } else {
316    // Need to make sure that the server returned a byte-range, since it's
317    // possible for a server to just ignore our bye-range request and just
318    // return the entire document even if it supports byte-range requests.
319    // i.e. sniff response to
320    // http://www.act.org/compass/sample/pdf/geometry.pdf
321    current_pos_ = 0;
322    uint32 start_pos, end_pos;
323    if (GetByteRange(headers, &start_pos, &end_pos)) {
324      current_pos_ = start_pos;
325      if (end_pos && end_pos > start_pos)
326        current_chunk_size_ = end_pos - start_pos + 1;
327    }
328  }
329
330  ReadMore();
331}
332
333bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
334                                  uint32* end) {
335  net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
336  while (it.GetNext()) {
337    if (LowerCaseEqualsASCII(it.name(), "content-range")) {
338      std::string range = it.values().c_str();
339      if (StartsWithASCII(range, "bytes", false)) {
340        range = range.substr(strlen("bytes"));
341        std::string::size_type pos = range.find('-');
342        std::string range_end;
343        if (pos != std::string::npos)
344          range_end = range.substr(pos + 1);
345        TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
346        TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
347        *start = atoi(range.c_str());
348        *end = atoi(range_end.c_str());
349        return true;
350      }
351    }
352  }
353  return false;
354}
355
356std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
357  net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
358  while (it.GetNext()) {
359    if (LowerCaseEqualsASCII(it.name(), "content-type")) {
360      std::string type = base::StringToLowerASCII(it.values());
361      if (StartsWithASCII(type, "multipart/", true)) {
362        const char* boundary = strstr(type.c_str(), "boundary=");
363        if (!boundary) {
364          NOTREACHED();
365          break;
366        }
367
368        return std::string(boundary + 9);
369      }
370    }
371  }
372  return std::string();
373}
374
375void DocumentLoader::ReadMore() {
376  pp::CompletionCallback callback =
377        loader_factory_.NewCallback(&DocumentLoader::DidRead);
378  int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
379  if (rv != PP_OK_COMPLETIONPENDING)
380    callback.Run(rv);
381}
382
383void DocumentLoader::DidRead(int32_t result) {
384  if (result > 0) {
385    char* start = buffer_;
386    size_t length = result;
387    if (is_multipart_ && result > 2) {
388      for (int i = 2; i < result; ++i) {
389        if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
390            (i >= 4 &&
391             buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
392             buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
393          uint32 start_pos, end_pos;
394          if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
395            current_pos_ = start_pos;
396            start += i;
397            length -= i;
398            if (end_pos && end_pos > start_pos)
399              current_chunk_size_ = end_pos - start_pos + 1;
400          }
401          break;
402        }
403      }
404
405      // Reset this flag so we don't look inside the buffer in future calls of
406      // DidRead for this response.  Note that this code DOES NOT handle multi-
407      // part responses with more than one part (we don't issue them at the
408      // moment, so they shouldn't arrive).
409      is_multipart_ = false;
410    }
411
412    if (current_chunk_size_ &&
413        current_chunk_read_ + length > current_chunk_size_)
414      length = current_chunk_size_ - current_chunk_read_;
415
416    if (length) {
417      if (document_size_ > 0) {
418        chunk_stream_.WriteData(current_pos_, start, length);
419      } else {
420        // If we did not get content-length in the response, we can't
421        // preallocate buffer for the entire document. Resizing array causing
422        // memory fragmentation issues on the large files and OOM exceptions.
423        // To fix this, we collect all chunks of the file to the list and
424        // concatenate them together after request is complete.
425        chunk_buffer_.push_back(std::vector<unsigned char>());
426        chunk_buffer_.back().resize(length);
427        memcpy(&(chunk_buffer_.back()[0]), start, length);
428      }
429      current_pos_ += length;
430      current_chunk_read_ += length;
431      client_->OnNewDataAvailable();
432    }
433    ReadMore();
434  } else if (result == PP_OK) {
435    ReadComplete();
436  } else {
437    NOTREACHED();
438  }
439}
440
441void DocumentLoader::ReadComplete() {
442  if (!partial_document_) {
443    if (document_size_ == 0) {
444      // For the document with no 'content-length" specified we've collected all
445      // the chunks already. Let's allocate final document buffer and copy them
446      // over.
447      chunk_stream_.Preallocate(current_pos_);
448      uint32 pos = 0;
449      std::list<std::vector<unsigned char> >::iterator it;
450      for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
451        chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
452        pos += it->size();
453      }
454      chunk_buffer_.clear();
455    }
456    document_size_ = current_pos_;
457    client_->OnDocumentComplete();
458    return;
459  }
460
461  request_pending_ = false;
462  pending_requests_.pop_front();
463
464  // If there are more pending request - continue downloading.
465  if (!pending_requests_.empty()) {
466    DownloadPendingRequests();
467    return;
468  }
469
470  if (IsDocumentComplete()) {
471    client_->OnDocumentComplete();
472    return;
473  }
474
475  if (header_request_)
476    client_->OnPartialDocumentLoaded();
477  else
478    client_->OnPendingRequestComplete();
479  header_request_ = false;
480
481  // The OnPendingRequestComplete could have added more requests.
482  if (!pending_requests_.empty()) {
483    DownloadPendingRequests();
484  } else {
485    // Document is not complete and we have no outstanding requests.
486    // Let's keep downloading PDF file in small chunks.
487    uint32 pos = chunk_stream_.GetFirstMissingByte();
488    std::vector<std::pair<size_t, size_t> > ranges;
489    chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
490    DCHECK(ranges.size() > 0);
491    RequestData(ranges[0].first, ranges[0].second);
492  }
493}
494
495uint32 DocumentLoader::GetRequestSize() const {
496  // Document loading strategy:
497  // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
498  // double the size (64k), and so on, until we cap max request size at 2M for
499  // 71 or more requests.
500  uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u);
501  return 32*1024 * (1 << ((limited_count - 1) / 10u));
502}
503
504}  // namespace chrome_pdf
505