cpdf_linkextract.cpp revision 4d3acf4ec42bf6e838f9060103aff98fbf170794
1// Copyright 2016 PDFium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7#include "core/fpdftext/cpdf_linkextract.h" 8 9#include <vector> 10 11#include "core/fpdftext/cpdf_textpage.h" 12#include "core/fxcrt/fx_ext.h" 13#include "core/fxcrt/fx_string.h" 14#include "core/fxcrt/fx_system.h" 15 16CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) 17 : m_pTextPage(pTextPage) {} 18 19CPDF_LinkExtract::~CPDF_LinkExtract() {} 20 21void CPDF_LinkExtract::ExtractLinks() { 22 m_LinkArray.clear(); 23 if (!m_pTextPage->IsParsed()) 24 return; 25 26 m_strPageText = m_pTextPage->GetPageText(0, -1); 27 if (m_strPageText.IsEmpty()) 28 return; 29 30 ParseLink(); 31} 32 33void CPDF_LinkExtract::ParseLink() { 34 int start = 0, pos = 0; 35 int TotalChar = m_pTextPage->CountChars(); 36 while (pos < TotalChar) { 37 FPDF_CHAR_INFO pageChar; 38 m_pTextPage->GetCharInfo(pos, &pageChar); 39 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || 40 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { 41 int nCount = pos - start; 42 if (pos == TotalChar - 1) 43 nCount++; 44 CFX_WideString strBeCheck; 45 strBeCheck = m_pTextPage->GetPageText(start, nCount); 46 if (strBeCheck.GetLength() > 5) { 47 while (strBeCheck.GetLength() > 0) { 48 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); 49 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { 50 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); 51 nCount--; 52 } else { 53 break; 54 } 55 } 56 if (nCount > 5 && 57 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { 58 m_LinkArray.push_back({start, nCount, strBeCheck}); 59 } 60 } 61 start = ++pos; 62 } else { 63 pos++; 64 } 65 } 66} 67 68bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { 69 CFX_WideString str = strBeCheck; 70 str.MakeLower(); 71 if (str.Find(L"http://www.") != -1) { 72 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); 73 return true; 74 } 75 if (str.Find(L"http://") != -1) { 76 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); 77 return true; 78 } 79 if (str.Find(L"https://www.") != -1) { 80 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); 81 return true; 82 } 83 if (str.Find(L"https://") != -1) { 84 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); 85 return true; 86 } 87 if (str.Find(L"www.") != -1) { 88 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 89 strBeCheck = L"http://" + strBeCheck; 90 return true; 91 } 92 return false; 93} 94 95bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 96 int aPos = str.Find(L'@'); 97 // Invalid when no '@'. 98 if (aPos < 1) 99 return false; 100 101 // Check the local part. 102 int pPos = aPos; // Used to track the position of '@' or '.'. 103 for (int i = aPos - 1; i >= 0; i--) { 104 FX_WCHAR ch = str.GetAt(i); 105 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) 106 continue; 107 108 if (ch != L'.' || i == pPos - 1 || i == 0) { 109 if (i == aPos - 1) { 110 // There is '.' or invalid char before '@'. 111 return false; 112 } 113 // End extracting for other invalid chars, '.' at the beginning, or 114 // consecutive '.'. 115 int removed_len = i == pPos - 1 ? i + 2 : i + 1; 116 str = str.Right(str.GetLength() - removed_len); 117 break; 118 } 119 // Found a valid '.'. 120 pPos = i; 121 } 122 123 // Check the domain name part. 124 aPos = str.Find(L'@'); 125 if (aPos < 1) 126 return false; 127 128 str.TrimRight(L'.'); 129 // At least one '.' in domain name, but not at the beginning. 130 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. 131 // Check whether we should remove this check. 132 int ePos = str.Find(L'.', aPos + 1); 133 if (ePos == -1 || ePos == aPos + 1) 134 return false; 135 136 // Validate all other chars in domain name. 137 int nLen = str.GetLength(); 138 pPos = 0; // Used to track the position of '.'. 139 for (int i = aPos + 1; i < nLen; i++) { 140 FX_WCHAR wch = str.GetAt(i); 141 if (wch == L'-' || FXSYS_iswalnum(wch)) 142 continue; 143 144 if (wch != L'.' || i == pPos + 1) { 145 // Domain name should end before invalid char. 146 int host_end = i == pPos + 1 ? i - 2 : i - 1; 147 if (pPos > 0 && host_end - aPos >= 3) { 148 // Trim the ending invalid chars if there is at least one '.' and name. 149 str = str.Left(host_end + 1); 150 break; 151 } 152 return false; 153 } 154 pPos = i; 155 } 156 157 if (str.Find(L"mailto:") == -1) 158 str = L"mailto:" + str; 159 160 return true; 161} 162 163CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { 164 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; 165} 166 167std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { 168 if (index >= m_LinkArray.size()) 169 return std::vector<CFX_FloatRect>(); 170 171 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, 172 m_LinkArray[index].m_Count); 173} 174