cpdf_linkextract.cpp revision 4d3acf4ec42bf6e838f9060103aff98fbf170794
1// Copyright 2016 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdftext/cpdf_linkextract.h"
8
9#include <vector>
10
11#include "core/fpdftext/cpdf_textpage.h"
12#include "core/fxcrt/fx_ext.h"
13#include "core/fxcrt/fx_string.h"
14#include "core/fxcrt/fx_system.h"
15
16CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
17    : m_pTextPage(pTextPage) {}
18
19CPDF_LinkExtract::~CPDF_LinkExtract() {}
20
21void CPDF_LinkExtract::ExtractLinks() {
22  m_LinkArray.clear();
23  if (!m_pTextPage->IsParsed())
24    return;
25
26  m_strPageText = m_pTextPage->GetPageText(0, -1);
27  if (m_strPageText.IsEmpty())
28    return;
29
30  ParseLink();
31}
32
33void CPDF_LinkExtract::ParseLink() {
34  int start = 0, pos = 0;
35  int TotalChar = m_pTextPage->CountChars();
36  while (pos < TotalChar) {
37    FPDF_CHAR_INFO pageChar;
38    m_pTextPage->GetCharInfo(pos, &pageChar);
39    if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
40        pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
41      int nCount = pos - start;
42      if (pos == TotalChar - 1)
43        nCount++;
44      CFX_WideString strBeCheck;
45      strBeCheck = m_pTextPage->GetPageText(start, nCount);
46      if (strBeCheck.GetLength() > 5) {
47        while (strBeCheck.GetLength() > 0) {
48          FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
49          if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
50            strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
51            nCount--;
52          } else {
53            break;
54          }
55        }
56        if (nCount > 5 &&
57            (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
58          m_LinkArray.push_back({start, nCount, strBeCheck});
59        }
60      }
61      start = ++pos;
62    } else {
63      pos++;
64    }
65  }
66}
67
68bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
69  CFX_WideString str = strBeCheck;
70  str.MakeLower();
71  if (str.Find(L"http://www.") != -1) {
72    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
73    return true;
74  }
75  if (str.Find(L"http://") != -1) {
76    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
77    return true;
78  }
79  if (str.Find(L"https://www.") != -1) {
80    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
81    return true;
82  }
83  if (str.Find(L"https://") != -1) {
84    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
85    return true;
86  }
87  if (str.Find(L"www.") != -1) {
88    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
89    strBeCheck = L"http://" + strBeCheck;
90    return true;
91  }
92  return false;
93}
94
95bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
96  int aPos = str.Find(L'@');
97  // Invalid when no '@'.
98  if (aPos < 1)
99    return false;
100
101  // Check the local part.
102  int pPos = aPos;  // Used to track the position of '@' or '.'.
103  for (int i = aPos - 1; i >= 0; i--) {
104    FX_WCHAR ch = str.GetAt(i);
105    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
106      continue;
107
108    if (ch != L'.' || i == pPos - 1 || i == 0) {
109      if (i == aPos - 1) {
110        // There is '.' or invalid char before '@'.
111        return false;
112      }
113      // End extracting for other invalid chars, '.' at the beginning, or
114      // consecutive '.'.
115      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
116      str = str.Right(str.GetLength() - removed_len);
117      break;
118    }
119    // Found a valid '.'.
120    pPos = i;
121  }
122
123  // Check the domain name part.
124  aPos = str.Find(L'@');
125  if (aPos < 1)
126    return false;
127
128  str.TrimRight(L'.');
129  // At least one '.' in domain name, but not at the beginning.
130  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
131  // Check whether we should remove this check.
132  int ePos = str.Find(L'.', aPos + 1);
133  if (ePos == -1 || ePos == aPos + 1)
134    return false;
135
136  // Validate all other chars in domain name.
137  int nLen = str.GetLength();
138  pPos = 0;  // Used to track the position of '.'.
139  for (int i = aPos + 1; i < nLen; i++) {
140    FX_WCHAR wch = str.GetAt(i);
141    if (wch == L'-' || FXSYS_iswalnum(wch))
142      continue;
143
144    if (wch != L'.' || i == pPos + 1) {
145      // Domain name should end before invalid char.
146      int host_end = i == pPos + 1 ? i - 2 : i - 1;
147      if (pPos > 0 && host_end - aPos >= 3) {
148        // Trim the ending invalid chars if there is at least one '.' and name.
149        str = str.Left(host_end + 1);
150        break;
151      }
152      return false;
153    }
154    pPos = i;
155  }
156
157  if (str.Find(L"mailto:") == -1)
158    str = L"mailto:" + str;
159
160  return true;
161}
162
163CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
164  return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
165}
166
167std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
168  if (index >= m_LinkArray.size())
169    return std::vector<CFX_FloatRect>();
170
171  return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
172                                   m_LinkArray[index].m_Count);
173}
174