1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "pdf/pdfium/pdfium_page.h"
6
7#include <math.h>
8
9#include "base/logging.h"
10#include "base/strings/string_number_conversions.h"
11#include "base/strings/string_util.h"
12#include "base/strings/utf_string_conversions.h"
13#include "base/values.h"
14#include "pdf/pdfium/pdfium_engine.h"
15
// Used when doing hit detection: passed as both the x and the y tolerance to
// FPDFText_GetCharIndexAtPos(). Declared as a typed constant rather than a
// macro so that it is type-checked and cannot silently rewrite other tokens.
const double kTolerance = 20.0;

// Dictionary Value key names for returning the accessible page content as JSON.
// Page-level keys.
const char kPageWidth[] = "width";
const char kPageHeight[] = "height";
const char kPageTextBox[] = "textBox";
// Text box keys. Note that the width/height key strings are the same as the
// page-level ones; they are used on a different dictionary.
const char kTextBoxLeft[] = "left";
const char kTextBoxTop[] = "top";
const char kTextBoxWidth[] = "width";
const char kTextBoxHeight[] = "height";
const char kTextBoxFontSize[] = "fontSize";
const char kTextBoxNodes[] = "textNodes";
// Text node keys.
const char kTextNodeType[] = "type";
const char kTextNodeText[] = "text";
const char kTextNodeURL[] = "url";
// Values stored under kTextNodeType.
const char kTextNodeTypeText[] = "text";
const char kTextNodeTypeURL[] = "url";
// Prefix of the URL generated for links that point inside the document; the
// destination page index is appended to it.
const char kDocLinkURLPrefix[] = "#page";
35
36namespace chrome_pdf {
37
38PDFiumPage::PDFiumPage(PDFiumEngine* engine,
39                       int i,
40                       const pp::Rect& r,
41                       bool available)
42    : engine_(engine),
43      page_(NULL),
44      text_page_(NULL),
45      index_(i),
46      rect_(r),
47      calculated_links_(false),
48      available_(available) {
49}
50
51PDFiumPage::~PDFiumPage() {
52}
53
54void PDFiumPage::Unload() {
55  if (text_page_) {
56    FPDFText_ClosePage(text_page_);
57    text_page_ = NULL;
58  }
59
60  if (page_) {
61    if (engine_->form()) {
62      FORM_OnBeforeClosePage(page_, engine_->form());
63    }
64    FPDF_ClosePage(page_);
65    page_ = NULL;
66  }
67}
68
69FPDF_PAGE PDFiumPage::GetPage() {
70  ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
71  if (!available_)
72    return NULL;
73  if (!page_) {
74    page_ = FPDF_LoadPage(engine_->doc(), index_);
75    if (page_ && engine_->form()) {
76      FORM_OnAfterLoadPage(page_, engine_->form());
77    }
78  }
79  return page_;
80}
81
82FPDF_PAGE PDFiumPage::GetPrintPage() {
83  ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
84  if (!available_)
85    return NULL;
86  if (!page_)
87    page_ = FPDF_LoadPage(engine_->doc(), index_);
88  return page_;
89}
90
91void PDFiumPage::ClosePrintPage() {
92  if (page_) {
93    FPDF_ClosePage(page_);
94    page_ = NULL;
95  }
96}
97
98FPDF_TEXTPAGE PDFiumPage::GetTextPage() {
99  if (!available_)
100    return NULL;
101  if (!text_page_)
102    text_page_ = FPDFText_LoadPage(GetPage());
103  return text_page_;
104}
105
106base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {
107  base::DictionaryValue* node = new base::DictionaryValue();
108
109  if (!available_)
110    return node;
111
112  double width = FPDF_GetPageWidth(GetPage());
113  double height = FPDF_GetPageHeight(GetPage());
114
115  base::ListValue* text = new base::ListValue();
116  int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
117  for (int i = 0; i < box_count; i++) {
118    double left, top, right, bottom;
119    FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom);
120    text->Append(
121        GetTextBoxAsValue(height, left, top, right, bottom, rotation));
122  }
123
124  node->SetDouble(kPageWidth, width);
125  node->SetDouble(kPageHeight, height);
126  node->Set(kPageTextBox, text);  // Takes ownership of |text|
127
128  return node;
129}
130
131base::Value* PDFiumPage::GetTextBoxAsValue(double page_height,
132                                           double left, double top,
133                                           double right, double bottom,
134                                           int rotation) {
135  base::string16 text_utf16;
136  int char_count =
137    FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0);
138  if (char_count > 0) {
139    unsigned short* data = reinterpret_cast<unsigned short*>(
140        WriteInto(&text_utf16, char_count + 1));
141    FPDFText_GetBoundedText(GetTextPage(),
142                            left, top, right, bottom,
143                            data, char_count);
144  }
145  std::string text_utf8 = base::UTF16ToUTF8(text_utf16);
146
147  FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top);
148  Area area;
149  std::vector<LinkTarget> targets;
150  if (link) {
151    targets.push_back(LinkTarget());
152    area = GetLinkTarget(link, &targets[0]);
153  } else {
154    pp::Rect rect(
155        PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation));
156    GetLinks(rect, &targets);
157    area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA;
158  }
159
160  int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top,
161                                              kTolerance, kTolerance);
162  double font_size = FPDFText_GetFontSize(GetTextPage(), char_index);
163
164  base::DictionaryValue* node = new base::DictionaryValue();
165  node->SetDouble(kTextBoxLeft, left);
166  node->SetDouble(kTextBoxTop, page_height - top);
167  node->SetDouble(kTextBoxWidth, right - left);
168  node->SetDouble(kTextBoxHeight, top - bottom);
169  node->SetDouble(kTextBoxFontSize, font_size);
170
171  base::ListValue* text_nodes = new base::ListValue();
172
173  if (area == DOCLINK_AREA) {
174    std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page);
175    text_nodes->Append(CreateURLNode(text_utf8, url));
176  } else if (area == WEBLINK_AREA && link) {
177    text_nodes->Append(CreateURLNode(text_utf8, targets[0].url));
178  } else if (area == WEBLINK_AREA && !link) {
179    size_t start = 0;
180    for (size_t i = 0; i < targets.size(); ++i) {
181      // Remove the extra NULL character at end.
182      // Otherwise, find() will not return any matches.
183      if (targets[i].url.size() > 0 &&
184          targets[i].url[targets[i].url.size() - 1] == '\0') {
185        targets[i].url.resize(targets[i].url.size() - 1);
186      }
187      // There should only ever be one NULL character
188      DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0');
189
190      // PDFium may change the case of generated links.
191      std::string lowerCaseURL = base::StringToLowerASCII(targets[i].url);
192      std::string lowerCaseText = base::StringToLowerASCII(text_utf8);
193      size_t pos = lowerCaseText.find(lowerCaseURL, start);
194      size_t length = targets[i].url.size();
195      if (pos == std::string::npos) {
196        // Check if the link is a "mailto:" URL
197        if (lowerCaseURL.compare(0, 7, "mailto:") == 0) {
198          pos = lowerCaseText.find(lowerCaseURL.substr(7), start);
199          length -= 7;
200        }
201
202        if (pos == std::string::npos) {
203          // No match has been found.  This should never happen.
204          continue;
205        }
206      }
207
208      std::string before_text = text_utf8.substr(start, pos - start);
209      if (before_text.size() > 0)
210        text_nodes->Append(CreateTextNode(before_text));
211      std::string link_text = text_utf8.substr(pos, length);
212      text_nodes->Append(CreateURLNode(link_text, targets[i].url));
213
214      start = pos + length;
215    }
216    std::string before_text = text_utf8.substr(start);
217    if (before_text.size() > 0)
218      text_nodes->Append(CreateTextNode(before_text));
219  } else {
220    text_nodes->Append(CreateTextNode(text_utf8));
221  }
222
223  node->Set(kTextBoxNodes, text_nodes);  // Takes ownership of |text_nodes|.
224  return node;
225}
226
227base::Value* PDFiumPage::CreateTextNode(std::string text) {
228  base::DictionaryValue* node = new base::DictionaryValue();
229  node->SetString(kTextNodeType, kTextNodeTypeText);
230  node->SetString(kTextNodeText, text);
231  return node;
232}
233
234base::Value* PDFiumPage::CreateURLNode(std::string text, std::string url) {
235  base::DictionaryValue* node = new base::DictionaryValue();
236  node->SetString(kTextNodeType, kTextNodeTypeURL);
237  node->SetString(kTextNodeText, text);
238  node->SetString(kTextNodeURL, url);
239  return node;
240}
241
242PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,
243                                          int rotation,
244                                          int* char_index,
245                                          LinkTarget* target) {
246  if (!available_)
247    return NONSELECTABLE_AREA;
248  pp::Point point2 = point - rect_.point();
249  double new_x, new_y;
250  FPDF_DeviceToPage(GetPage(), 0, 0, rect_.width(), rect_.height(),
251        rotation, point2.x(), point2.y(), &new_x, &new_y);
252
253  int rv = FPDFText_GetCharIndexAtPos(
254      GetTextPage(), new_x, new_y, kTolerance, kTolerance);
255  *char_index = rv;
256
257  FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), new_x, new_y);
258  if (link) {
259    // We don't handle all possible link types of the PDF. For example,
260    // launch actions, cross-document links, etc.
261    // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
262    // and we should proceed with area detection.
263    PDFiumPage::Area area = GetLinkTarget(link, target);
264    if (area != PDFiumPage::NONSELECTABLE_AREA)
265      return area;
266  }
267
268  if (rv < 0)
269    return NONSELECTABLE_AREA;
270
271  return GetLink(*char_index, target) != -1 ? WEBLINK_AREA : TEXT_AREA;
272}
273
274base::char16 PDFiumPage::GetCharAtIndex(int index) {
275  if (!available_)
276    return L'\0';
277  return static_cast<base::char16>(FPDFText_GetUnicode(GetTextPage(), index));
278}
279
280int PDFiumPage::GetCharCount() {
281  if (!available_)
282    return 0;
283  return FPDFText_CountChars(GetTextPage());
284}
285
286PDFiumPage::Area PDFiumPage::GetLinkTarget(
287    FPDF_LINK link, PDFiumPage::LinkTarget* target) {
288  FPDF_DEST dest = FPDFLink_GetDest(engine_->doc(), link);
289  if (dest != NULL)
290    return GetDestinationTarget(dest, target);
291
292  FPDF_ACTION action = FPDFLink_GetAction(link);
293  if (action) {
294    switch (FPDFAction_GetType(action)) {
295      case PDFACTION_GOTO: {
296          FPDF_DEST dest = FPDFAction_GetDest(engine_->doc(), action);
297          if (dest)
298            return GetDestinationTarget(dest, target);
299          // TODO(gene): We don't fully support all types of the in-document
300          // links. Need to implement that. There is a bug to track that:
301          // http://code.google.com/p/chromium/issues/detail?id=55776
302        } break;
303      case PDFACTION_URI: {
304          if (target) {
305            size_t buffer_size =
306                FPDFAction_GetURIPath(engine_->doc(), action, NULL, 0);
307            if (buffer_size > 1) {
308              void* data = WriteInto(&target->url, buffer_size);
309              FPDFAction_GetURIPath(engine_->doc(), action, data, buffer_size);
310            }
311          }
312          return WEBLINK_AREA;
313        } break;
314      // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
315      // at the moment.
316    }
317  }
318
319  return NONSELECTABLE_AREA;
320}
321
322PDFiumPage::Area PDFiumPage::GetDestinationTarget(
323    FPDF_DEST destination, PDFiumPage::LinkTarget* target) {
324  int page_index = FPDFDest_GetPageIndex(engine_->doc(), destination);
325  if (target) {
326    target->page = page_index;
327  }
328  return DOCLINK_AREA;
329}
330
331int PDFiumPage::GetLink(int char_index, PDFiumPage::LinkTarget* target) {
332  if (!available_)
333    return -1;
334
335  CalculateLinks();
336
337  // Get the bounding box of the rect again, since it might have moved because
338  // of the tolerance above.
339  double left, right, bottom, top;
340  FPDFText_GetCharBox(GetTextPage(), char_index, &left, &right, &bottom, &top);
341
342  pp::Point origin(
343      PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0).point());
344  for (size_t i = 0; i < links_.size(); ++i) {
345    for (size_t j = 0; j < links_[i].rects.size(); ++j) {
346      if (links_[i].rects[j].Contains(origin)) {
347        if (target)
348          target->url = links_[i].url;
349        return i;
350      }
351    }
352  }
353  return -1;
354}
355
356std::vector<int> PDFiumPage::GetLinks(pp::Rect text_area,
357                                      std::vector<LinkTarget>* targets) {
358  if (!available_)
359    return std::vector<int>();
360
361  CalculateLinks();
362
363  std::vector<int> links;
364
365  for (size_t i = 0; i < links_.size(); ++i) {
366    for (size_t j = 0; j < links_[i].rects.size(); ++j) {
367      if (links_[i].rects[j].Intersects(text_area)) {
368        if (targets) {
369          LinkTarget target;
370          target.url = links_[i].url;
371          targets->push_back(target);
372        }
373        links.push_back(i);
374      }
375    }
376  }
377  return links;
378}
379
380void PDFiumPage::CalculateLinks() {
381  if (calculated_links_)
382    return;
383
384  calculated_links_ = true;
385  FPDF_PAGELINK links = FPDFLink_LoadWebLinks(GetTextPage());
386  int count = FPDFLink_CountWebLinks(links);
387  for (int i = 0; i < count; ++i) {
388    base::string16 url;
389    int url_length = FPDFLink_GetURL(links, i, NULL, 0);
390    if (url_length > 1) {  // WriteInto needs at least 2 characters.
391      unsigned short* data =
392          reinterpret_cast<unsigned short*>(WriteInto(&url, url_length));
393      FPDFLink_GetURL(links, i, data, url_length);
394    }
395    Link link;
396    link.url = base::UTF16ToUTF8(url);
397
398    // If the link cannot be converted to a pp::Var, then it is not possible to
399    // pass it to JS. In this case, ignore the link like other PDF viewers.
400    // See http://crbug.com/312882 for an example.
401    pp::Var link_var(link.url);
402    if (!link_var.is_string())
403      continue;
404
405    // Make sure all the characters in the URL are valid per RFC 1738.
406    // http://crbug.com/340326 has a sample bad PDF.
407    // GURL does not work correctly, e.g. it just strips \t \r \n.
408    bool is_invalid_url = false;
409    for (size_t j = 0; j < link.url.length(); ++j) {
410      // Control characters are not allowed.
411      // 0x7F is also a control character.
412      // 0x80 and above are not in US-ASCII.
413      if (link.url[j] < ' ' || link.url[j] >= '\x7F') {
414        is_invalid_url = true;
415        break;
416      }
417    }
418    if (is_invalid_url)
419      continue;
420
421    int rect_count = FPDFLink_CountRects(links, i);
422    for (int j = 0; j < rect_count; ++j) {
423      double left, top, right, bottom;
424      FPDFLink_GetRect(links, i, j, &left, &top, &right, &bottom);
425      link.rects.push_back(
426          PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0));
427    }
428    links_.push_back(link);
429  }
430  FPDFLink_CloseWebLinks(links);
431}
432
433pp::Rect PDFiumPage::PageToScreen(const pp::Point& offset,
434                                  double zoom,
435                                  double left,
436                                  double top,
437                                  double right,
438                                  double bottom,
439                                  int rotation) {
440  if (!available_)
441    return pp::Rect();
442
443  int new_left, new_top, new_right, new_bottom;
444  FPDF_PageToDevice(
445      page_,
446      static_cast<int>((rect_.x() - offset.x()) * zoom),
447      static_cast<int>((rect_.y() - offset.y()) * zoom),
448      static_cast<int>(ceil(rect_.width() * zoom)),
449      static_cast<int>(ceil(rect_.height() * zoom)),
450      rotation, left, top, &new_left, &new_top);
451  FPDF_PageToDevice(
452      page_,
453      static_cast<int>((rect_.x() - offset.x()) * zoom),
454      static_cast<int>((rect_.y() - offset.y()) * zoom),
455      static_cast<int>(ceil(rect_.width() * zoom)),
456      static_cast<int>(ceil(rect_.height() * zoom)),
457      rotation, right, bottom, &new_right, &new_bottom);
458
459  // If the PDF is rotated, the horizontal/vertical coordinates could be
460  // flipped.  See
461  // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
462  if (new_right < new_left)
463    std::swap(new_right, new_left);
464  if (new_bottom < new_top)
465    std::swap(new_bottom, new_top);
466
467  return pp::Rect(
468      new_left, new_top, new_right - new_left + 1, new_bottom - new_top + 1);
469}
470
471PDFiumPage::Link::Link() {
472}
473
474PDFiumPage::Link::~Link() {
475}
476
477}  // namespace chrome_pdf
478