pdfium_page.cc revision 6e8cce623b6e4fe0c9e4af605d675dd9d0338c38
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "pdf/pdfium/pdfium_page.h"
6
7#include <math.h>
8
9#include "base/logging.h"
10#include "base/strings/string_number_conversions.h"
11#include "base/strings/string_util.h"
12#include "base/strings/utf_string_conversions.h"
13#include "base/values.h"
14#include "pdf/pdfium/pdfium_engine.h"
15
// Tolerance passed to FPDFText_GetCharIndexAtPos() when doing hit detection.
// A typed constant is used instead of a preprocessor macro so the name obeys
// normal scoping rules and has a well-defined type.
const double kTolerance = 20.0;

// Dictionary Value key names for returning the accessible page content as JSON.
const char kPageWidth[] = "width";
const char kPageHeight[] = "height";
const char kPageTextBox[] = "textBox";
const char kTextBoxLeft[] = "left";
const char kTextBoxTop[] = "top";
const char kTextBoxWidth[] = "width";
const char kTextBoxHeight[] = "height";
const char kTextBoxFontSize[] = "fontSize";
const char kTextBoxNodes[] = "textNodes";
const char kTextNodeType[] = "type";
const char kTextNodeText[] = "text";
const char kTextNodeURL[] = "url";
// Values stored under the kTextNodeType key.
const char kTextNodeTypeText[] = "text";
const char kTextNodeTypeURL[] = "url";
// Prefix prepended to the target page number of an in-document link.
const char kDocLinkURLPrefix[] = "#page";
35
36namespace chrome_pdf {
37
38PDFiumPage::PDFiumPage(PDFiumEngine* engine,
39                       int i,
40                       const pp::Rect& r,
41                       bool available)
42    : engine_(engine),
43      page_(NULL),
44      text_page_(NULL),
45      index_(i),
46      rect_(r),
47      calculated_links_(false),
48      available_(available) {
49}
50
51PDFiumPage::~PDFiumPage() {
52  Unload();
53}
54
55void PDFiumPage::Unload() {
56  if (text_page_) {
57    FPDFText_ClosePage(text_page_);
58    text_page_ = NULL;
59  }
60
61  if (page_) {
62    if (engine_->form()) {
63      FORM_OnBeforeClosePage(page_, engine_->form());
64    }
65    FPDF_ClosePage(page_);
66    page_ = NULL;
67  }
68}
69
70FPDF_PAGE PDFiumPage::GetPage() {
71  ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
72  if (!available_)
73    return NULL;
74  if (!page_) {
75    page_ = FPDF_LoadPage(engine_->doc(), index_);
76    if (page_ && engine_->form()) {
77      FORM_OnAfterLoadPage(page_, engine_->form());
78    }
79  }
80  return page_;
81}
82
83FPDF_PAGE PDFiumPage::GetPrintPage() {
84  ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
85  if (!available_)
86    return NULL;
87  if (!page_)
88    page_ = FPDF_LoadPage(engine_->doc(), index_);
89  return page_;
90}
91
92void PDFiumPage::ClosePrintPage() {
93  if (page_) {
94    FPDF_ClosePage(page_);
95    page_ = NULL;
96  }
97}
98
99FPDF_TEXTPAGE PDFiumPage::GetTextPage() {
100  if (!available_)
101    return NULL;
102  if (!text_page_)
103    text_page_ = FPDFText_LoadPage(GetPage());
104  return text_page_;
105}
106
107base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {
108  base::DictionaryValue* node = new base::DictionaryValue();
109
110  if (!available_)
111    return node;
112
113  double width = FPDF_GetPageWidth(GetPage());
114  double height = FPDF_GetPageHeight(GetPage());
115
116  base::ListValue* text = new base::ListValue();
117  int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
118  for (int i = 0; i < box_count; i++) {
119    double left, top, right, bottom;
120    FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom);
121    text->Append(
122        GetTextBoxAsValue(height, left, top, right, bottom, rotation));
123  }
124
125  node->SetDouble(kPageWidth, width);
126  node->SetDouble(kPageHeight, height);
127  node->Set(kPageTextBox, text);  // Takes ownership of |text|
128
129  return node;
130}
131
132base::Value* PDFiumPage::GetTextBoxAsValue(double page_height,
133                                           double left, double top,
134                                           double right, double bottom,
135                                           int rotation) {
136  base::string16 text_utf16;
137  int char_count =
138    FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0);
139  if (char_count > 0) {
140    unsigned short* data = reinterpret_cast<unsigned short*>(
141        WriteInto(&text_utf16, char_count + 1));
142    FPDFText_GetBoundedText(GetTextPage(),
143                            left, top, right, bottom,
144                            data, char_count);
145  }
146  std::string text_utf8 = base::UTF16ToUTF8(text_utf16);
147
148  FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top);
149  Area area;
150  std::vector<LinkTarget> targets;
151  if (link) {
152    targets.push_back(LinkTarget());
153    area = GetLinkTarget(link, &targets[0]);
154  } else {
155    pp::Rect rect(
156        PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation));
157    GetLinks(rect, &targets);
158    area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA;
159  }
160
161  int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top,
162                                              kTolerance, kTolerance);
163  double font_size = FPDFText_GetFontSize(GetTextPage(), char_index);
164
165  base::DictionaryValue* node = new base::DictionaryValue();
166  node->SetDouble(kTextBoxLeft, left);
167  node->SetDouble(kTextBoxTop, page_height - top);
168  node->SetDouble(kTextBoxWidth, right - left);
169  node->SetDouble(kTextBoxHeight, top - bottom);
170  node->SetDouble(kTextBoxFontSize, font_size);
171
172  base::ListValue* text_nodes = new base::ListValue();
173
174  if (area == DOCLINK_AREA) {
175    std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page);
176    text_nodes->Append(CreateURLNode(text_utf8, url));
177  } else if (area == WEBLINK_AREA && link) {
178    text_nodes->Append(CreateURLNode(text_utf8, targets[0].url));
179  } else if (area == WEBLINK_AREA && !link) {
180    size_t start = 0;
181    for (size_t i = 0; i < targets.size(); ++i) {
182      // Remove the extra NULL character at end.
183      // Otherwise, find() will not return any matches.
184      if (targets[i].url.size() > 0 &&
185          targets[i].url[targets[i].url.size() - 1] == '\0') {
186        targets[i].url.resize(targets[i].url.size() - 1);
187      }
188      // There should only ever be one NULL character
189      DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0');
190
191      // PDFium may change the case of generated links.
192      std::string lowerCaseURL = base::StringToLowerASCII(targets[i].url);
193      std::string lowerCaseText = base::StringToLowerASCII(text_utf8);
194      size_t pos = lowerCaseText.find(lowerCaseURL, start);
195      size_t length = targets[i].url.size();
196      if (pos == std::string::npos) {
197        // Check if the link is a "mailto:" URL
198        if (lowerCaseURL.compare(0, 7, "mailto:") == 0) {
199          pos = lowerCaseText.find(lowerCaseURL.substr(7), start);
200          length -= 7;
201        }
202
203        if (pos == std::string::npos) {
204          // No match has been found.  This should never happen.
205          continue;
206        }
207      }
208
209      std::string before_text = text_utf8.substr(start, pos - start);
210      if (before_text.size() > 0)
211        text_nodes->Append(CreateTextNode(before_text));
212      std::string link_text = text_utf8.substr(pos, length);
213      text_nodes->Append(CreateURLNode(link_text, targets[i].url));
214
215      start = pos + length;
216    }
217    std::string before_text = text_utf8.substr(start);
218    if (before_text.size() > 0)
219      text_nodes->Append(CreateTextNode(before_text));
220  } else {
221    text_nodes->Append(CreateTextNode(text_utf8));
222  }
223
224  node->Set(kTextBoxNodes, text_nodes);  // Takes ownership of |text_nodes|.
225  return node;
226}
227
228base::Value* PDFiumPage::CreateTextNode(std::string text) {
229  base::DictionaryValue* node = new base::DictionaryValue();
230  node->SetString(kTextNodeType, kTextNodeTypeText);
231  node->SetString(kTextNodeText, text);
232  return node;
233}
234
235base::Value* PDFiumPage::CreateURLNode(std::string text, std::string url) {
236  base::DictionaryValue* node = new base::DictionaryValue();
237  node->SetString(kTextNodeType, kTextNodeTypeURL);
238  node->SetString(kTextNodeText, text);
239  node->SetString(kTextNodeURL, url);
240  return node;
241}
242
243PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,
244                                          int rotation,
245                                          int* char_index,
246                                          LinkTarget* target) {
247  if (!available_)
248    return NONSELECTABLE_AREA;
249  pp::Point point2 = point - rect_.point();
250  double new_x, new_y;
251  FPDF_DeviceToPage(GetPage(), 0, 0, rect_.width(), rect_.height(),
252        rotation, point2.x(), point2.y(), &new_x, &new_y);
253
254  int rv = FPDFText_GetCharIndexAtPos(
255      GetTextPage(), new_x, new_y, kTolerance, kTolerance);
256  *char_index = rv;
257
258  FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), new_x, new_y);
259  if (link) {
260    // We don't handle all possible link types of the PDF. For example,
261    // launch actions, cross-document links, etc.
262    // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
263    // and we should proceed with area detection.
264    PDFiumPage::Area area = GetLinkTarget(link, target);
265    if (area != PDFiumPage::NONSELECTABLE_AREA)
266      return area;
267  }
268
269  if (rv < 0)
270    return NONSELECTABLE_AREA;
271
272  return GetLink(*char_index, target) != -1 ? WEBLINK_AREA : TEXT_AREA;
273}
274
275base::char16 PDFiumPage::GetCharAtIndex(int index) {
276  if (!available_)
277    return L'\0';
278  return static_cast<base::char16>(FPDFText_GetUnicode(GetTextPage(), index));
279}
280
281int PDFiumPage::GetCharCount() {
282  if (!available_)
283    return 0;
284  return FPDFText_CountChars(GetTextPage());
285}
286
287PDFiumPage::Area PDFiumPage::GetLinkTarget(
288    FPDF_LINK link, PDFiumPage::LinkTarget* target) {
289  FPDF_DEST dest = FPDFLink_GetDest(engine_->doc(), link);
290  if (dest != NULL)
291    return GetDestinationTarget(dest, target);
292
293  FPDF_ACTION action = FPDFLink_GetAction(link);
294  if (action) {
295    switch (FPDFAction_GetType(action)) {
296      case PDFACTION_GOTO: {
297          FPDF_DEST dest = FPDFAction_GetDest(engine_->doc(), action);
298          if (dest)
299            return GetDestinationTarget(dest, target);
300          // TODO(gene): We don't fully support all types of the in-document
301          // links. Need to implement that. There is a bug to track that:
302          // http://code.google.com/p/chromium/issues/detail?id=55776
303        } break;
304      case PDFACTION_URI: {
305          if (target) {
306            size_t buffer_size =
307                FPDFAction_GetURIPath(engine_->doc(), action, NULL, 0);
308            if (buffer_size > 1) {
309              void* data = WriteInto(&target->url, buffer_size);
310              FPDFAction_GetURIPath(engine_->doc(), action, data, buffer_size);
311            }
312          }
313          return WEBLINK_AREA;
314        } break;
315      // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
316      // at the moment.
317    }
318  }
319
320  return NONSELECTABLE_AREA;
321}
322
323PDFiumPage::Area PDFiumPage::GetDestinationTarget(
324    FPDF_DEST destination, PDFiumPage::LinkTarget* target) {
325  int page_index = FPDFDest_GetPageIndex(engine_->doc(), destination);
326  if (target) {
327    target->page = page_index;
328  }
329  return DOCLINK_AREA;
330}
331
332int PDFiumPage::GetLink(int char_index, PDFiumPage::LinkTarget* target) {
333  if (!available_)
334    return -1;
335
336  CalculateLinks();
337
338  // Get the bounding box of the rect again, since it might have moved because
339  // of the tolerance above.
340  double left, right, bottom, top;
341  FPDFText_GetCharBox(GetTextPage(), char_index, &left, &right, &bottom, &top);
342
343  pp::Point origin(
344      PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0).point());
345  for (size_t i = 0; i < links_.size(); ++i) {
346    for (size_t j = 0; j < links_[i].rects.size(); ++j) {
347      if (links_[i].rects[j].Contains(origin)) {
348        if (target)
349          target->url = links_[i].url;
350        return i;
351      }
352    }
353  }
354  return -1;
355}
356
357std::vector<int> PDFiumPage::GetLinks(pp::Rect text_area,
358                                      std::vector<LinkTarget>* targets) {
359  if (!available_)
360    return std::vector<int>();
361
362  CalculateLinks();
363
364  std::vector<int> links;
365
366  for (size_t i = 0; i < links_.size(); ++i) {
367    for (size_t j = 0; j < links_[i].rects.size(); ++j) {
368      if (links_[i].rects[j].Intersects(text_area)) {
369        if (targets) {
370          LinkTarget target;
371          target.url = links_[i].url;
372          targets->push_back(target);
373        }
374        links.push_back(i);
375      }
376    }
377  }
378  return links;
379}
380
381void PDFiumPage::CalculateLinks() {
382  if (calculated_links_)
383    return;
384
385  calculated_links_ = true;
386  FPDF_PAGELINK links = FPDFLink_LoadWebLinks(GetTextPage());
387  int count = FPDFLink_CountWebLinks(links);
388  for (int i = 0; i < count; ++i) {
389    base::string16 url;
390    int url_length = FPDFLink_GetURL(links, i, NULL, 0);
391    if (url_length > 1) {  // WriteInto needs at least 2 characters.
392      unsigned short* data =
393          reinterpret_cast<unsigned short*>(WriteInto(&url, url_length));
394      FPDFLink_GetURL(links, i, data, url_length);
395    }
396    Link link;
397    link.url = base::UTF16ToUTF8(url);
398
399    // If the link cannot be converted to a pp::Var, then it is not possible to
400    // pass it to JS. In this case, ignore the link like other PDF viewers.
401    // See http://crbug.com/312882 for an example.
402    pp::Var link_var(link.url);
403    if (!link_var.is_string())
404      continue;
405
406    // Make sure all the characters in the URL are valid per RFC 1738.
407    // http://crbug.com/340326 has a sample bad PDF.
408    // GURL does not work correctly, e.g. it just strips \t \r \n.
409    bool is_invalid_url = false;
410    for (size_t j = 0; j < link.url.length(); ++j) {
411      // Control characters are not allowed.
412      // 0x7F is also a control character.
413      // 0x80 and above are not in US-ASCII.
414      if (link.url[j] < ' ' || link.url[j] >= '\x7F') {
415        is_invalid_url = true;
416        break;
417      }
418    }
419    if (is_invalid_url)
420      continue;
421
422    int rect_count = FPDFLink_CountRects(links, i);
423    for (int j = 0; j < rect_count; ++j) {
424      double left, top, right, bottom;
425      FPDFLink_GetRect(links, i, j, &left, &top, &right, &bottom);
426      link.rects.push_back(
427          PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0));
428    }
429    links_.push_back(link);
430  }
431  FPDFLink_CloseWebLinks(links);
432}
433
434pp::Rect PDFiumPage::PageToScreen(const pp::Point& offset,
435                                  double zoom,
436                                  double left,
437                                  double top,
438                                  double right,
439                                  double bottom,
440                                  int rotation) {
441  if (!available_)
442    return pp::Rect();
443
444  int new_left, new_top, new_right, new_bottom;
445  FPDF_PageToDevice(
446      page_,
447      static_cast<int>((rect_.x() - offset.x()) * zoom),
448      static_cast<int>((rect_.y() - offset.y()) * zoom),
449      static_cast<int>(ceil(rect_.width() * zoom)),
450      static_cast<int>(ceil(rect_.height() * zoom)),
451      rotation, left, top, &new_left, &new_top);
452  FPDF_PageToDevice(
453      page_,
454      static_cast<int>((rect_.x() - offset.x()) * zoom),
455      static_cast<int>((rect_.y() - offset.y()) * zoom),
456      static_cast<int>(ceil(rect_.width() * zoom)),
457      static_cast<int>(ceil(rect_.height() * zoom)),
458      rotation, right, bottom, &new_right, &new_bottom);
459
460  // If the PDF is rotated, the horizontal/vertical coordinates could be
461  // flipped.  See
462  // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
463  if (new_right < new_left)
464    std::swap(new_right, new_left);
465  if (new_bottom < new_top)
466    std::swap(new_bottom, new_top);
467
468  return pp::Rect(
469      new_left, new_top, new_right - new_left + 1, new_bottom - new_top + 1);
470}
471
472PDFiumPage::Link::Link() {
473}
474
475PDFiumPage::Link::~Link() {
476}
477
478}  // namespace chrome_pdf
479