0
Files
src/pdf/document_loader.cc
raymes@chromium.org 69fe5c7781 Fix loading PDFs with no HTTP response headers in OOP PDF
Currently, if no response headers are passed into the plugin in OOP PDF, then the
headers fetched by the URLLoader that loaded the stream URL will be used, but the
MIME type for the blob URL is always plain text, which causes the document
loader to refuse to load the document. Currently, if no headers are passed
to the in process plugin, we always try to load the document, so this just
mimics that behavior in the OOP plugin. When the in-process plugin is removed
we can change the document loader and remove the hack.

This also removes blob: URLs from the MIME type check in document loader
because they are no longer ever passed in.

BUG=303491

Review URL: https://codereview.chromium.org/301223004

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@274081 0039d316-1c4b-4281-b951-d872f2087c98
2014-05-31 22:45:50 +00:00

505 lines
16 KiB
C++

// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "pdf/document_loader.h"
#include "base/logging.h"
#include "base/strings/string_util.h"
#include "net/http/http_util.h"
#include "ppapi/c/pp_errors.h"
#include "ppapi/cpp/url_loader.h"
#include "ppapi/cpp/url_request_info.h"
#include "ppapi/cpp/url_response_info.h"
namespace chrome_pdf {
// Size threshold in bytes. Init() only enables partial (range request) loading
// when the reported Content-Length is larger than this value; anything at or
// below it is downloaded in a single request.
const uint32 kMinFileSize = 64 * 1024;
// Creates an idle loader that reports progress to |client|. No network
// activity happens until Init() is called.
DocumentLoader::DocumentLoader(Client* client)
    : client_(client), partial_document_(false), request_pending_(false),
      current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
      document_size_(0), header_request_(true), is_multipart_(false) {
  // |requests_count_| is read by GetRequestSize() but is otherwise only
  // assigned in Init(). Give it a defined value here as well so the object is
  // never observed with an indeterminate counter. It is assigned in the body
  // (rather than the initializer list) so that this change does not depend on
  // the member declaration order in the header.
  requests_count_ = 0;
  loader_factory_.Initialize(this);
}
// Nothing to release explicitly; members clean up after themselves.
DocumentLoader::~DocumentLoader() {}
bool DocumentLoader::Init(const pp::URLLoader& loader,
const std::string& url,
const std::string& headers) {
DCHECK(url_.empty());
url_ = url;
loader_ = loader;
std::string response_headers;
if (!headers.empty()) {
response_headers = headers;
} else {
pp::URLResponseInfo response = loader_.GetResponseInfo();
pp::Var headers_var = response.GetHeaders();
if (headers_var.is_string()) {
response_headers = headers_var.AsString();
}
}
bool accept_ranges_bytes = false;
bool content_encoded = false;
uint32 content_length = 0;
std::string type;
std::string disposition;
if (!response_headers.empty()) {
net::HttpUtil::HeadersIterator it(response_headers.begin(),
response_headers.end(), "\n");
while (it.GetNext()) {
if (LowerCaseEqualsASCII(it.name(), "content-length")) {
content_length = atoi(it.values().c_str());
} else if (LowerCaseEqualsASCII(it.name(), "accept-ranges")) {
accept_ranges_bytes = LowerCaseEqualsASCII(it.values(), "bytes");
} else if (LowerCaseEqualsASCII(it.name(), "content-encoding")) {
content_encoded = true;
} else if (LowerCaseEqualsASCII(it.name(), "content-type")) {
type = it.values();
size_t semi_colon_pos = type.find(';');
if (semi_colon_pos != std::string::npos) {
type = type.substr(0, semi_colon_pos);
}
TrimWhitespace(type, base::TRIM_ALL, &type);
} else if (LowerCaseEqualsASCII(it.name(), "content-disposition")) {
disposition = it.values();
}
}
}
if (!type.empty() &&
!EndsWith(type, "/pdf", false) &&
!EndsWith(type, ".pdf", false) &&
!EndsWith(type, "/x-pdf", false) &&
!EndsWith(type, "/*", false) &&
!EndsWith(type, "/acrobat", false) &&
!EndsWith(type, "/unknown", false)) {
return false;
}
if (StartsWithASCII(disposition, "attachment", false)) {
return false;
}
if (content_length > 0)
chunk_stream_.Preallocate(content_length);
document_size_ = content_length;
requests_count_ = 0;
// Enable partial loading only if file size is above the threshold.
// It will allow avoiding latency for multiple requests.
if (content_length > kMinFileSize &&
accept_ranges_bytes &&
!content_encoded) {
LoadPartialDocument();
} else {
LoadFullDocument();
}
return true;
}
void DocumentLoader::LoadPartialDocument() {
partial_document_ = true;
// Force the main request to be cancelled, since if we're a full-frame plugin
// there could be other references to the loader.
loader_.Close();
loader_ = pp::URLLoader();
// Download file header.
header_request_ = true;
RequestData(0, std::min(GetRequestSize(), document_size_));
}
// Downloads the whole document by continuing to read from the current loader.
void DocumentLoader::LoadFullDocument() {
  partial_document_ = false;
  // Start from an empty chunk list; DidRead() appends to it when the document
  // size is unknown.
  chunk_buffer_.clear();
  ReadMore();
}
// Returns true once every byte in [0, document_size_) is available. A document
// whose size is still unknown (0) is never reported as complete.
bool DocumentLoader::IsDocumentComplete() const {
  const bool size_known = document_size_ != 0;
  return size_known && IsDataAvailable(0, document_size_);
}
// Returns the number of document bytes downloaded so far. When the document
// size is unknown this is the current read position; otherwise it is the
// document size minus the lengths of all ranges still missing.
uint32 DocumentLoader::GetAvailableData() const {
  if (document_size_ == 0)
    return current_pos_;

  typedef std::vector<std::pair<size_t, size_t> > RangeList;
  RangeList missing;
  chunk_stream_.GetMissedRanges(0, document_size_, &missing);

  uint32 available = document_size_;
  for (size_t i = 0; i < missing.size(); ++i) {
    // |second| is the length of the missing range.
    available -= missing[i].second;
  }
  return available;
}
// Discards every queued request except the one at the front of the queue,
// which is the request currently pending and therefore has to stay.
void DocumentLoader::ClearPendingRequests() {
  if (pending_requests_.size() <= 1)
    return;

  pending_requests_.erase(++pending_requests_.begin(),
                          pending_requests_.end());
}
// Copies |size| bytes starting at |position| into |buf| by delegating to the
// chunk stream, and returns the chunk stream's result.
bool DocumentLoader::GetBlock(uint32 position, uint32 size, void* buf) const {
  const bool read_ok = chunk_stream_.ReadData(position, size, buf);
  return read_ok;
}
// Asks the chunk stream whether the |size| bytes starting at |position| are
// available, and returns its answer.
bool DocumentLoader::IsDataAvailable(uint32 position, uint32 size) const {
  const bool available = chunk_stream_.IsRangeAvailable(position, size);
  return available;
}
// Queues a request for |size| bytes starting at |position| and kicks off
// downloading. Only valid while loading in partial mode. The request is
// ignored once the document is already complete.
void DocumentLoader::RequestData(uint32 position, uint32 size) {
  DCHECK(partial_document_);

  // Workaround: PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail
  // can still issue a stray request after the document has finished loading.
  // The proper fix belongs in PDFium.
  // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
  // Test url:
  // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
  if (!IsDocumentComplete()) {
    pending_requests_.push_back(std::pair<size_t, size_t>(position, size));
    DownloadPendingRequests();
  }
}
// Issues the network request for the entry at the front of
// |pending_requests_|. Does nothing when a request is already in flight or the
// queue is empty. Before the request is sent, the range is split around data
// that |chunk_stream_| already holds, grown towards GetRequestSize() when it
// is smaller than that (partial loading only), and stretched towards the
// positions reported by GetLastByteBefore()/GetFirstByteAfter() when those lie
// within one request size. Completion of the open is delivered to DidOpen().
void DocumentLoader::DownloadPendingRequests() {
if (request_pending_ || pending_requests_.empty())
return;
// Remove already completed requests from the head of the queue, but always
// leave at least one entry: ReadComplete() pops the last pending request
// from the queue itself.
while (pending_requests_.size() > 1) {
if (IsDataAvailable(pending_requests_.front().first,
pending_requests_.front().second)) {
pending_requests_.pop_front();
} else {
break;
}
}
// Each queue entry is a (position, size) pair.
uint32 pos = pending_requests_.front().first;
uint32 size = pending_requests_.front().second;
// The remaining head request is already satisfied; ReadComplete() pops it and
// decides what to do next.
if (IsDataAvailable(pos, size)) {
ReadComplete();
return;
}
// If current request has been partially downloaded already, split it into
// a few smaller requests that replace it at the front of the queue.
std::vector<std::pair<size_t, size_t> > ranges;
chunk_stream_.GetMissedRanges(pos, size, &ranges);
if (ranges.size() > 0) {
pending_requests_.pop_front();
pending_requests_.insert(pending_requests_.begin(),
ranges.begin(), ranges.end());
pos = pending_requests_.front().first;
size = pending_requests_.front().second;
}
uint32 cur_request_size = GetRequestSize();
// If size is less than default request, try to expand download range for
// more optimal download.
if (size < cur_request_size && partial_document_) {
// First, try to expand block towards the end of the file, clamped to the
// document size.
uint32 new_pos = pos;
uint32 new_size = cur_request_size;
if (pos + new_size > document_size_)
new_size = document_size_ - pos;
// NOTE: this declaration shadows the |ranges| vector declared above.
std::vector<std::pair<size_t, size_t> > ranges;
// Presumably GetMissedRanges() returns true when part of the range is still
// missing - confirm against ChunkStream. The first missing range is used.
if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
new_pos = ranges[0].first;
new_size = ranges[0].second;
}
// Second, try to expand block towards the beginning of the file, keeping
// the end of the block fixed.
if (new_size < cur_request_size) {
uint32 block_end = new_pos + new_size;
if (block_end > cur_request_size) {
new_pos = block_end - cur_request_size;
} else {
new_pos = 0;
}
new_size = block_end - new_pos;
// NOTE(review): |ranges| may still hold entries from the forward pass
// unless GetMissedRanges() clears its output - confirm. The last missing
// range is used here.
if (chunk_stream_.GetMissedRanges(new_pos, new_size, &ranges)) {
new_pos = ranges.back().first;
new_size = ranges.back().second;
}
}
pos = new_pos;
size = new_size;
}
// Look up the neighbouring positions reported by the chunk stream on either
// side of the request (exact semantics are defined by ChunkStream).
size_t last_byte_before = chunk_stream_.GetLastByteBefore(pos);
size_t first_byte_after = chunk_stream_.GetFirstByteAfter(pos + size - 1);
// Pull the start of the request back to |last_byte_before| when it is less
// than one request size away.
if (pos - last_byte_before < cur_request_size) {
size = pos + size - last_byte_before;
pos = last_byte_before;
}
// Push the end of the request forward to |first_byte_after| when that is at
// most one request size beyond the current end.
if ((pos + size < first_byte_after) &&
(pos + size + cur_request_size >= first_byte_after))
size = first_byte_after - pos;
request_pending_ = true;
// Start downloading first pending request: close the previous loader, obtain
// a fresh one from the client and open the ranged request on it.
loader_.Close();
loader_ = client_->CreateURLLoader();
pp::CompletionCallback callback =
loader_factory_.NewCallback(&DocumentLoader::DidOpen);
pp::URLRequestInfo request = GetRequest(pos, size);
requests_count_++;
int rv = loader_.Open(request, callback);
// When Open() does not report completion-pending, the callback is run
// directly with the returned code.
if (rv != PP_OK_COMPLETIONPENDING)
callback.Run(rv);
}
// Builds a GET request for |url_| that asks for |size| bytes starting at
// |position| through an HTTP Range header. Redirects are followed.
// |size| must be non-zero: the last-byte index is computed as
// position + size - 1.
pp::URLRequestInfo DocumentLoader::GetRequest(uint32 position,
                                              uint32 size) const {
  pp::URLRequestInfo request(client_->GetPluginInstance());
  request.SetURL(url_.c_str());
  request.SetMethod("GET");
  request.SetFollowRedirects(true);

  const size_t kBufSize = 100;
  char buf[kBufSize];
  // According to rfc2616, byte range specifies position of the first and last
  // bytes in the requested range inclusively. Therefore we should subtract 1
  // from the position + size, to get index of the last byte that needs to be
  // downloaded.
  //
  // The offsets are unsigned, so they are formatted with %u. Formatting them
  // with %d would emit a negative number for offsets of 2^31 and above.
  base::snprintf(buf, kBufSize, "Range: bytes=%u-%u",
                 static_cast<unsigned int>(position),
                 static_cast<unsigned int>(position + size - 1));
  pp::Var header(buf);
  request.SetHeaders(header);

  return request;
}
void DocumentLoader::DidOpen(int32_t result) {
if (result != PP_OK) {
NOTREACHED();
return;
}
is_multipart_ = false;
current_chunk_size_ = 0;
current_chunk_read_ = 0;
pp::Var headers_var = loader_.GetResponseInfo().GetHeaders();
std::string headers;
if (headers_var.is_string())
headers = headers_var.AsString();
std::string boundary = GetMultiPartBoundary(headers);
if (boundary.size()) {
// Leave position untouched for now, when we read the data we'll get it.
is_multipart_ = true;
multipart_boundary_ = boundary;
} else {
// Need to make sure that the server returned a byte-range, since it's
// possible for a server to just ignore our bye-range request and just
// return the entire document even if it supports byte-range requests.
// i.e. sniff response to
// http://www.act.org/compass/sample/pdf/geometry.pdf
current_pos_ = 0;
uint32 start_pos, end_pos;
if (GetByteRange(headers, &start_pos, &end_pos)) {
current_pos_ = start_pos;
if (end_pos && end_pos > start_pos)
current_chunk_size_ = end_pos - start_pos + 1;
}
}
ReadMore();
}
bool DocumentLoader::GetByteRange(const std::string& headers, uint32* start,
uint32* end) {
net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
while (it.GetNext()) {
if (LowerCaseEqualsASCII(it.name(), "content-range")) {
std::string range = it.values().c_str();
if (StartsWithASCII(range, "bytes", false)) {
range = range.substr(strlen("bytes"));
std::string::size_type pos = range.find('-');
std::string range_end;
if (pos != std::string::npos)
range_end = range.substr(pos + 1);
TrimWhitespaceASCII(range, base::TRIM_LEADING, &range);
TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end);
*start = atoi(range.c_str());
*end = atoi(range_end.c_str());
return true;
}
}
}
return false;
}
// Returns the text following "boundary=" in the first multipart Content-Type
// header found in |headers|, or an empty string when the response is not
// multipart (or the multipart header has no boundary parameter).
//
// The header is matched case-insensitively, but the returned boundary keeps
// the exact characters sent by the server: the parameter is located in a
// lower-cased copy and then cut out of the original value.
std::string DocumentLoader::GetMultiPartBoundary(const std::string& headers) {
  static const char kBoundaryParam[] = "boundary=";
  const size_t kBoundaryParamLength = sizeof(kBoundaryParam) - 1;

  net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n");
  while (it.GetNext()) {
    if (LowerCaseEqualsASCII(it.name(), "content-type")) {
      const std::string original = it.values();
      // ASCII lower-casing keeps the length, so offsets found in |lowered| are
      // valid in |original| too.
      const std::string lowered = StringToLowerASCII(original);
      if (StartsWithASCII(lowered, "multipart/", true)) {
        const std::string::size_type param_pos = lowered.find(kBoundaryParam);
        if (param_pos == std::string::npos) {
          NOTREACHED();
          break;
        }

        return original.substr(param_pos + kBoundaryParamLength);
      }
    }
  }
  return std::string();
}
void DocumentLoader::ReadMore() {
pp::CompletionCallback callback =
loader_factory_.NewCallback(&DocumentLoader::DidRead);
int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback);
if (rv != PP_OK_COMPLETIONPENDING)
callback.Run(rv);
}
void DocumentLoader::DidRead(int32_t result) {
if (result > 0) {
char* start = buffer_;
size_t length = result;
if (is_multipart_ && result > 2) {
for (int i = 2; i < result; ++i) {
if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') ||
(i >= 4 &&
buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' &&
buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) {
uint32 start_pos, end_pos;
if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) {
current_pos_ = start_pos;
start += i;
length -= i;
if (end_pos && end_pos > start_pos)
current_chunk_size_ = end_pos - start_pos + 1;
}
break;
}
}
// Reset this flag so we don't look inside the buffer in future calls of
// DidRead for this response. Note that this code DOES NOT handle multi-
// part responses with more than one part (we don't issue them at the
// moment, so they shouldn't arrive).
is_multipart_ = false;
}
if (current_chunk_size_ &&
current_chunk_read_ + length > current_chunk_size_)
length = current_chunk_size_ - current_chunk_read_;
if (length) {
if (document_size_ > 0) {
chunk_stream_.WriteData(current_pos_, start, length);
} else {
// If we did not get content-length in the response, we can't
// preallocate buffer for the entire document. Resizing array causing
// memory fragmentation issues on the large files and OOM exceptions.
// To fix this, we collect all chunks of the file to the list and
// concatenate them together after request is complete.
chunk_buffer_.push_back(std::vector<unsigned char>());
chunk_buffer_.back().resize(length);
memcpy(&(chunk_buffer_.back()[0]), start, length);
}
current_pos_ += length;
current_chunk_read_ += length;
client_->OnNewDataAvailable();
}
ReadMore();
} else if (result == PP_OK) {
ReadComplete();
} else {
NOTREACHED();
}
}
void DocumentLoader::ReadComplete() {
if (!partial_document_) {
if (document_size_ == 0) {
// For the document with no 'content-length" specified we've collected all
// the chunks already. Let's allocate final document buffer and copy them
// over.
chunk_stream_.Preallocate(current_pos_);
uint32 pos = 0;
std::list<std::vector<unsigned char> >::iterator it;
for (it = chunk_buffer_.begin(); it != chunk_buffer_.end(); ++it) {
chunk_stream_.WriteData(pos, &((*it)[0]), it->size());
pos += it->size();
}
chunk_buffer_.clear();
}
document_size_ = current_pos_;
client_->OnDocumentComplete();
return;
}
request_pending_ = false;
pending_requests_.pop_front();
// If there are more pending request - continue downloading.
if (!pending_requests_.empty()) {
DownloadPendingRequests();
return;
}
if (IsDocumentComplete()) {
client_->OnDocumentComplete();
return;
}
if (header_request_)
client_->OnPartialDocumentLoaded();
else
client_->OnPendingRequestComplete();
header_request_ = false;
// The OnPendingRequestComplete could have added more requests.
if (!pending_requests_.empty()) {
DownloadPendingRequests();
} else {
// Document is not complete and we have no outstanding requests.
// Let's keep downloading PDF file in small chunks.
uint32 pos = chunk_stream_.GetFirstMissingByte();
std::vector<std::pair<size_t, size_t> > ranges;
chunk_stream_.GetMissedRanges(pos, GetRequestSize(), &ranges);
DCHECK(ranges.size() > 0);
RequestData(ranges[0].first, ranges[0].second);
}
}
// Returns the size in bytes to use for the next range request.
//
// Document loading strategy: while |requests_count_| is at most 10 the request
// size is 32k. It then doubles for every further block of 10 requests (64k for
// counts 11-20, 128k for 21-30, ...) and is capped at 2M, which is reached
// once the count is 61 or more.
uint32 DocumentLoader::GetRequestSize() const {
  const uint32 kBaseRequestSize = 32 * 1024;
  const uint32 clamped_count = std::min(std::max(requests_count_, 10u), 70u);
  const uint32 doublings = (clamped_count - 1) / 10u;
  return kBaseRequestSize << doublings;
}
} // namespace chrome_pdf