14d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann// Copyright 2016 PDFium Authors. All rights reserved. 24d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann// Use of this source code is governed by a BSD-style license that can be 34d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann// found in the LICENSE file. 44d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 54d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 64d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 74d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fpdftext/cpdf_linkextract.h" 84d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 94d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include <vector> 104d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 114d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fpdftext/cpdf_textpage.h" 124d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fxcrt/fx_ext.h" 134d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fxcrt/fx_string.h" 144d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fxcrt/fx_system.h" 154d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 164d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. MoltmannCPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) 174d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann : m_pTextPage(pTextPage) {} 184d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 194d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. MoltmannCPDF_LinkExtract::~CPDF_LinkExtract() {} 204d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 214d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannvoid CPDF_LinkExtract::ExtractLinks() { 224d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann m_LinkArray.clear(); 234d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (!m_pTextPage->IsParsed()) 244d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return; 254d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 264d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann m_strPageText = m_pTextPage->GetPageText(0, -1); 274d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (m_strPageText.IsEmpty()) 284d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return; 294d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 304d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann ParseLink(); 314d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann} 324d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 334d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannvoid CPDF_LinkExtract::ParseLink() { 344d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int start = 0, pos = 0; 354d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int TotalChar = m_pTextPage->CountChars(); 364d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann while (pos < TotalChar) { 374d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann FPDF_CHAR_INFO pageChar; 384d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann m_pTextPage->GetCharInfo(pos, &pageChar); 394d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || 404d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { 414d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int nCount = pos - start; 424d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (pos == TotalChar - 1) 434d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann nCount++; 444d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann CFX_WideString strBeCheck; 454d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = m_pTextPage->GetPageText(start, nCount); 464d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (strBeCheck.GetLength() > 5) { 474d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann while (strBeCheck.GetLength() > 0) { 484d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); 494d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { 504d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); 514d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann nCount--; 524d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } else { 534d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann break; 544d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 554d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 564d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (nCount > 5 && 574d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { 584d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann m_LinkArray.push_back({start, nCount, strBeCheck}); 594d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 604d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 614d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann start = ++pos; 624d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } else { 634d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann pos++; 644d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 654d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 664d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann} 674d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 684d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannbool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { 694d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann CFX_WideString str = strBeCheck; 704d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann str.MakeLower(); 714d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (str.Find(L"http://www.") != -1) { 724d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); 734d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return true; 744d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 754d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (str.Find(L"http://") != -1) { 764d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); 774d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return true; 784d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 794d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (str.Find(L"https://www.") != -1) { 804d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); 814d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return true; 824d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 834d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (str.Find(L"https://") != -1) { 844d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); 854d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return true; 864d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 874d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (str.Find(L"www.") != -1) { 884d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 894d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann strBeCheck = L"http://" + strBeCheck; 904d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return true; 914d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 924d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return false; 934d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann} 944d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 954d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannbool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 964d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int aPos = str.Find(L'@'); 974d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Invalid when no '@'. 984d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (aPos < 1) 994d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return false; 1004d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1014d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Check the local part. 1024d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int pPos = aPos; // Used to track the position of '@' or '.'. 1034d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann for (int i = aPos - 1; i >= 0; i--) { 1044d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann FX_WCHAR ch = str.GetAt(i); 1054d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) 1064d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann continue; 1074d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1084d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (ch != L'.' || i == pPos - 1 || i == 0) { 1094d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (i == aPos - 1) { 1104d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // There is '.' or invalid char before '@'. 1114d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return false; 1124d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 1134d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // End extracting for other invalid chars, '.' at the beginning, or 1144d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // consecutive '.'. 1154d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int removed_len = i == pPos - 1 ? i + 2 : i + 1; 1164d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann str = str.Right(str.GetLength() - removed_len); 1174d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann break; 1184d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 1194d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Found a valid '.'. 1204d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann pPos = i; 1214d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 1224d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1234d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Check the domain name part. 1244d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann aPos = str.Find(L'@'); 1254d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (aPos < 1) 1264d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return false; 1274d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1284d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann str.TrimRight(L'.'); 1294d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // At least one '.' in domain name, but not at the beginning. 1304d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // TODO(weili): RFC5322 allows domain names to be a local name without '.'. 1314d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Check whether we should remove this check. 1324d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int ePos = str.Find(L'.', aPos + 1); 1334d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (ePos == -1 || ePos == aPos + 1) 1344d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return false; 1354d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1364d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Validate all other chars in domain name. 1374d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int nLen = str.GetLength(); 1384d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann pPos = 0; // Used to track the position of '.'. 1394d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann for (int i = aPos + 1; i < nLen; i++) { 1404d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann FX_WCHAR wch = str.GetAt(i); 1414d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (wch == L'-' || FXSYS_iswalnum(wch)) 1424d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann continue; 1434d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1444d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (wch != L'.' || i == pPos + 1) { 1454d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Domain name should end before invalid char. 1464d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann int host_end = i == pPos + 1 ? i - 2 : i - 1; 1474d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (pPos > 0 && host_end - aPos >= 3) { 1484d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann // Trim the ending invalid chars if there is at least one '.' and name. 1494d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann str = str.Left(host_end + 1); 1504d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann break; 1514d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 1524d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return false; 1534d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 1544d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann pPos = i; 1554d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann } 1564d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1574d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (str.Find(L"mailto:") == -1) 1584d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann str = L"mailto:" + str; 1594d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1604d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return true; 1614d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann} 1624d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1634d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. MoltmannCFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { 1644d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; 1654d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann} 1664d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1674d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannstd::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { 1684d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann if (index >= m_LinkArray.size()) 1694d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return std::vector<CFX_FloatRect>(); 1704d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann 1714d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, 1724d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann m_LinkArray[index].m_Count); 1734d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann} 174