1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "pdf/document_loader.h" 6 7#include "base/logging.h" 8#include "base/strings/string_util.h" 9#include "net/http/http_util.h" 10#include "ppapi/c/pp_errors.h" 11#include "ppapi/cpp/url_loader.h" 12#include "ppapi/cpp/url_request_info.h" 13#include "ppapi/cpp/url_response_info.h" 14 15namespace chrome_pdf { 16 17// Document below size will be downloaded in one chunk. 18const uint32 kMinFileSize = 64*1024; 19 20DocumentLoader::DocumentLoader(Client* client) 21 : client_(client), partial_document_(false), request_pending_(false), 22 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0), 23 document_size_(0), header_request_(true), is_multipart_(false) { 24 loader_factory_.Initialize(this); 25} 26 27DocumentLoader::~DocumentLoader() { 28} 29 30bool DocumentLoader::Init(const pp::URLLoader& loader, 31 const std::string& url, 32 const std::string& headers) { 33 DCHECK(url_.empty()); 34 url_ = url; 35 loader_ = loader; 36 37 std::string response_headers; 38 if (!headers.empty()) { 39 response_headers = headers; 40 } else { 41 pp::URLResponseInfo response = loader_.GetResponseInfo(); 42 pp::Var headers_var = response.GetHeaders(); 43 44 if (headers_var.is_string()) { 45 response_headers = headers_var.AsString(); 46 } 47 } 48 49 bool accept_ranges_bytes = false; 50 bool content_encoded = false; 51 uint32 content_length = 0; 52 std::string type; 53 std::string disposition; 54 if (!response_headers.empty()) { 55 net::HttpUtil::HeadersIterator it(response_headers.begin(), 56 response_headers.end(), "\n"); 57 while (it.GetNext()) { 58 if (LowerCaseEqualsASCII(it.name(), "content-length")) { 59 content_length = atoi(it.values().c_str()); 60 } else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) { 61 accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes"); 62 } else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) { 63 content_encoded = true; 64 } else if (LowerCaseEqualsASCII(it.name(), "content-type")) { 65 type = it.values(); 66 size_t semi_colon_pos = type.find(';'); 67 if (semi_colon_pos != std::string::npos) { 68 type = type.substr(0, semi_colon_pos); 69 } 70 TrimWhitespace(type, base::TRIM_ALL, &type); 71 } else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) { 72 disposition = it.values(); 73 } 74 } 75 } 76 if (!type.empty() && 77 !EndsWith(type, "/pdf", false) && 78 !EndsWith(type, ".pdf", false) && 79 !EndsWith(type, "/x-pdf", false) && 80 !EndsWith(type, "/*", false) && 81 !EndsWith(type, "/acrobat", false) && 82 !EndsWith(type, "/unknown", false)) { 83 return false; 84 } 85 if (StartsWithASCII(disposition, "attachment", false)) { 86 return false; 87 } 88 89 if (content_length > 0) 90 chunk_stream_.Preallocate(content_length); 91 92 document_size_ = content_length; 93 requests_count_ = 0; 94 95 // Enable partial loading only if file size is above the threshold. 96 // It will allow avoiding latency for multiple requests. 97 if (content_length > kMinFileSize && 98 accept_ranges_bytes && 99 !content_encoded) { 100 LoadPartialDocument(); 101 } else { 102 LoadFullDocument(); 103 } 104 return true; 105} 106 107void DocumentLoader::LoadPartialDocument() { 108 partial_document_ = true; 109 // Force the main request to be cancelled, since if we're a full-frame plugin 110 // there could be other references to the loader. 111 loader_.Close(); 112 loader_ = pp::URLLoader(); 113 // Download file header. 114 header_request_ = true; 115 RequestData(0, std::min(GetRequestSize(), document_size_)); 116} 117 118void DocumentLoader::LoadFullDocument() { 119 partial_document_ = false; 120 chunk_buffer_.clear(); 121 ReadMore(); 122} 123 124bool DocumentLoader::IsDocumentComplete() const { 125 if (document_size_ == 0) // Document size unknown. 126 return false; 127 return IsDataAvailable(0, document_size_); 128} 129 130uint32 DocumentLoader::GetAvailableData() const { 131 if (document_size_ == 0) { // If document size is unknown. 132 return current_pos_; 133 } 134 135 std::vector<std::pair<size_t, size_t> > ranges; 136 chunk_stream_.GetMissedRanges(0, document_size_, &ranges); 137 uint32 available = document_size_; 138 std::vector<std::pair<size_t, size_t> >::iterator it; 139 for (it = ranges.begin(); it != ranges.end(); ++it) { 140 available -= it->second; 141 } 142 return available; 143} 144 145void DocumentLoader::ClearPendingRequests() { 146 // The first item in the queue is pending (need to keep it in the queue). 147 if (pending_requests_.size() > 1) { 148 // Remove all elements except the first one. 149 pending_requests_.erase(++pending_requests_.begin(), 150 pending_requests_.end()); 151 } 152} 153 154bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const { 155 return chunk_stream_.ReadData(position, size, buf); 156} 157 158bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const { 159 return chunk_stream_.IsRangeAvailable(position, size); 160} 161 162void DocumentLoader::RequestData(uint32 position, uint32 size) { 163 DCHECK(partial_document_); 164 165 // We have some artefact request from 166 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after 167 // document is complete. 168 // We need this fix in PDFIum. Adding this as a work around. 169 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996 170 // Test url: 171 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf 172 if (IsDocumentComplete()) 173 return; 174 175 pending_requests_.push_back(std::pair<size_t, size_t>(position, size)); 176 DownloadPendingRequests(); 177} 178 179void DocumentLoader::DownloadPendingRequests() { 180 if (request_pending_ || pending_requests_.empty()) 181 return; 182 183 // Remove already completed requests. 184 // By design DownloadPendingRequests() should have at least 1 request in the 185 // queue. ReadComplete() will remove the last pending comment from the queue. 186 while (pending_requests_.size() > 1) { 187 if (IsDataAvailable(pending_requests_.front().first, 188 pending_requests_.front().second)) { 189 pending_requests_.pop_front(); 190 } else { 191 break; 192 } 193 } 194 195 uint32 pos = pending_requests_.front().first; 196 uint32 size = pending_requests_.front().second; 197 if (IsDataAvailable(pos, size)) { 198 ReadComplete(); 199 return; 200 } 201 202 // If current request has been partially downloaded already, split it into 203 // a few smaller requests. 204 std::vector<std::pair<size_t, size_t> > ranges; 205 chunk_stream_.GetMissedRanges(pos, size, &ranges); 206 if (ranges.size() > 0) { 207 pending_requests_.pop_front(); 208 pending_requests_.insert(pending_requests_.begin(), 209 ranges.begin(), ranges.end()); 210 pos = pending_requests_.front().first; 211 size = pending_requests_.front().second; 212 } 213 214 uint32 cur_request_size = GetRequestSize(); 215 // If size is less than default request, try to expand download range for 216 // more optimal download. 217 if (size < cur_request_size && partial_document_) { 218 // First, try to expand block towards the end of the file. 219 uint32 new_pos = pos; 220 uint32 new_size = cur_request_size; 221 if (pos + new_size > document_size_) 222 new_size = document_size_ - pos; 223 224 std::vector<std::pair<size_t, size_t> > ranges; 225 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) { 226 new_pos = ranges[0].first; 227 new_size = ranges[0].second; 228 } 229 230 // Second, try to expand block towards the beginning of the file. 231 if (new_size < cur_request_size) { 232 uint32 block_end = new_pos + new_size; 233 if (block_end > cur_request_size) { 234 new_pos = block_end - cur_request_size; 235 } else { 236 new_pos = 0; 237 } 238 new_size = block_end - new_pos; 239 240 if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) { 241 new_pos = ranges.back().first; 242 new_size = ranges.back().second; 243 } 244 } 245 pos = new_pos; 246 size = new_size; 247 } 248 249 size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos); 250 size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1); 251 if (pos - last_byte_before < cur_request_size) { 252 size = pos + size - last_byte_before; 253 pos = last_byte_before; 254 } 255 256 if ((pos + size < first_byte_after) && 257 (pos + size + cur_request_size >= first_byte_after)) 258 size = first_byte_after - pos; 259 260 request_pending_ = true; 261 262 // Start downloading first pending request. 263 loader_.Close(); 264 loader_ = client_->CreateURLLoader(); 265 pp::CompletionCallback callback = 266 loader_factory_.NewCallback(&DocumentLoader::DidOpen); 267 pp::URLRequestInfo request = GetRequest(pos, size); 268 requests_count_++; 269 int rv = loader_.Open(request, callback); 270 if (rv != PP_OK_COMPLETIONPENDING) 271 callback.Run(rv); 272} 273 274pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position, 275 uint32 size) const { 276 pp::URLRequestInfo request(client_->GetPluginInstance()); 277 request.SetURL(url_.c_str()); 278 request.SetMethod("GET"); 279 request.SetFollowRedirects(true); 280 281 const size_t kBufSize = 100; 282 char buf[kBufSize]; 283 // According to rfc2616, byte range specifies position of the first and last 284 // bytes in the requested range inclusively. Therefore we should subtract 1 285 // from the position + size, to get index of the last byte that needs to be 286 // downloaded. 287 base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position, 288 position + size - 1); 289 pp::Var header(buf); 290 request.SetHeaders(header); 291 292 return request; 293} 294 295void DocumentLoader::DidOpen(int32_t result) { 296 if (result != PP_OK) { 297 NOTREACHED(); 298 return; 299 } 300 301 is_multipart_ = false; 302 current_chunk_size_ = 0; 303 current_chunk_read_ = 0; 304 305 pp::Var headers_var = loader_.GetResponseInfo().GetHeaders(); 306 std::string headers; 307 if (headers_var.is_string()) 308 headers = headers_var.AsString(); 309 310 std::string boundary = GetMultiPartBoundary(headers); 311 if (boundary.size()) { 312 // Leave position untouched for now, when we read the data we'll get it. 313 is_multipart_ = true; 314 multipart_boundary_ = boundary; 315 } else { 316 // Need to make sure that the server returned a byte-range, since it's 317 // possible for a server to just ignore our bye-range request and just 318 // return the entire document even if it supports byte-range requests. 319 // i.e. sniff response to 320 // http://www.act.org/compass/sample/pdf/geometry.pdf 321 current_pos_ = 0; 322 uint32 start_pos, end_pos; 323 if (GetByteRange(headers, &start_pos, &end_pos)) { 324 current_pos_ = start_pos; 325 if (end_pos && end_pos > start_pos) 326 current_chunk_size_ = end_pos - start_pos + 1; 327 } 328 } 329 330 ReadMore(); 331} 332 333bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start, 334 uint32* end) { 335 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n"); 336 while (it.GetNext()) { 337 if (LowerCaseEqualsASCII(it.name(), "content-range")) { 338 std::string range = it.values().c_str(); 339 if (StartsWithASCII(range, "bytes", false)) { 340 range = range.substr(strlen("bytes")); 341 std::string::size_type pos = range.find('-'); 342 std::string range_end; 343 if (pos != std::string::npos) 344 range_end = range.substr(pos + 1); 345 TrimWhitespaceASCII(range, base::TRIM_LEADING, &range); 346 TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end); 347 *start = atoi(range.c_str()); 348 *end = atoi(range_end.c_str()); 349 return true; 350 } 351 } 352 } 353 return false; 354} 355 356std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) { 357 net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n"); 358 while (it.GetNext()) { 359 if (LowerCaseEqualsASCII(it.name(), "content-type")) { 360 std::string type = base::StringToLowerASCII(it.values()); 361 if (StartsWithASCII(type, "multipart/", true)) { 362 const char* boundary = strstr(type.c_str(), "boundary="); 363 if (!boundary) { 364 NOTREACHED(); 365 break; 366 } 367 368 return std::string(boundary + 9); 369 } 370 } 371 } 372 return std::string(); 373} 374 375void DocumentLoader::ReadMore() { 376 pp::CompletionCallback callback = 377 loader_factory_.NewCallback(&DocumentLoader::DidRead); 378 int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback); 379 if (rv != PP_OK_COMPLETIONPENDING) 380 callback.Run(rv); 381} 382 383void DocumentLoader::DidRead(int32_t result) { 384 if (result > 0) { 385 char* start = buffer_; 386 size_t length = result; 387 if (is_multipart_ && result > 2) { 388 for (int i = 2; i < result; ++i) { 389 if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') || 390 (i >= 4 && 391 buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' && 392 buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) { 393 uint32 start_pos, end_pos; 394 if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) { 395 current_pos_ = start_pos; 396 start += i; 397 length -= i; 398 if (end_pos && end_pos > start_pos) 399 current_chunk_size_ = end_pos - start_pos + 1; 400 } 401 break; 402 } 403 } 404 405 // Reset this flag so we don't look inside the buffer in future calls of 406 // DidRead for this response. Note that this code DOES NOT handle multi- 407 // part responses with more than one part (we don't issue them at the 408 // moment, so they shouldn't arrive). 409 is_multipart_ = false; 410 } 411 412 if (current_chunk_size_ && 413 current_chunk_read_ + length > current_chunk_size_) 414 length = current_chunk_size_ - current_chunk_read_; 415 416 if (length) { 417 if (document_size_ > 0) { 418 chunk_stream_.WriteData(current_pos_, start, length); 419 } else { 420 // If we did not get content-length in the response, we can't 421 // preallocate buffer for the entire document. Resizing array causing 422 // memory fragmentation issues on the large files and OOM exceptions. 423 // To fix this, we collect all chunks of the file to the list and 424 // concatenate them together after request is complete. 425 chunk_buffer_.push_back(std::vector<unsigned char>()); 426 chunk_buffer_.back().resize(length); 427 memcpy(&(chunk_buffer_.back()[0]), start, length); 428 } 429 current_pos_ += length; 430 current_chunk_read_ += length; 431 client_->OnNewDataAvailable(); 432 } 433 ReadMore(); 434 } else if (result == PP_OK) { 435 ReadComplete(); 436 } else { 437 NOTREACHED(); 438 } 439} 440 441void DocumentLoader::ReadComplete() { 442 if (!partial_document_) { 443 if (document_size_ == 0) { 444 // For the document with no 'content-length" specified we've collected all 445 // the chunks already. Let's allocate final document buffer and copy them 446 // over. 447 chunk_stream_.Preallocate(current_pos_); 448 uint32 pos = 0; 449 std::list<std::vector<unsigned char> >::iterator it; 450 for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) { 451 chunk_stream_.WriteData(pos, &((*it)[0]), it->size()); 452 pos += it->size(); 453 } 454 chunk_buffer_.clear(); 455 } 456 document_size_ = current_pos_; 457 client_->OnDocumentComplete(); 458 return; 459 } 460 461 request_pending_ = false; 462 pending_requests_.pop_front(); 463 464 // If there are more pending request - continue downloading. 465 if (!pending_requests_.empty()) { 466 DownloadPendingRequests(); 467 return; 468 } 469 470 if (IsDocumentComplete()) { 471 client_->OnDocumentComplete(); 472 return; 473 } 474 475 if (header_request_) 476 client_->OnPartialDocumentLoaded(); 477 else 478 client_->OnPendingRequestComplete(); 479 header_request_ = false; 480 481 // The OnPendingRequestComplete could have added more requests. 482 if (!pending_requests_.empty()) { 483 DownloadPendingRequests(); 484 } else { 485 // Document is not complete and we have no outstanding requests. 486 // Let's keep downloading PDF file in small chunks. 487 uint32 pos = chunk_stream_.GetFirstMissingByte(); 488 std::vector<std::pair<size_t, size_t> > ranges; 489 chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges); 490 DCHECK(ranges.size() > 0); 491 RequestData(ranges[0].first, ranges[0].second); 492 } 493} 494 495uint32 DocumentLoader::GetRequestSize() const { 496 // Document loading strategy: 497 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we 498 // double the size (64k), and so on, until we cap max request size at 2M for 499 // 71 or more requests. 500 uint32 limited_count = std::min(std::max(requests_count_, 10u), 70u); 501 return 32*1024 * (1 << ((limited_count - 1) / 10u)); 502} 503 504} // namespace chrome_pdf 505