// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
64d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
74d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fpdftext/cpdf_linkextract.h"
84d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
94d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include <vector>
104d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
114d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fpdftext/cpdf_textpage.h"
124d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fxcrt/fx_ext.h"
134d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fxcrt/fx_string.h"
144d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann#include "core/fxcrt/fx_system.h"
154d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
164d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. MoltmannCPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
174d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    : m_pTextPage(pTextPage) {}
184d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
194d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. MoltmannCPDF_LinkExtract::~CPDF_LinkExtract() {}
204d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
214d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannvoid CPDF_LinkExtract::ExtractLinks() {
224d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  m_LinkArray.clear();
234d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (!m_pTextPage->IsParsed())
244d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return;
254d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
264d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  m_strPageText = m_pTextPage->GetPageText(0, -1);
274d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (m_strPageText.IsEmpty())
284d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return;
294d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
304d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  ParseLink();
314d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann}
324d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
334d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannvoid CPDF_LinkExtract::ParseLink() {
344d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  int start = 0, pos = 0;
354d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  int TotalChar = m_pTextPage->CountChars();
364d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  while (pos < TotalChar) {
374d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    FPDF_CHAR_INFO pageChar;
384d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    m_pTextPage->GetCharInfo(pos, &pageChar);
394d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
404d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
414d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      int nCount = pos - start;
424d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      if (pos == TotalChar - 1)
434d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        nCount++;
444d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      CFX_WideString strBeCheck;
454d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      strBeCheck = m_pTextPage->GetPageText(start, nCount);
464d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      if (strBeCheck.GetLength() > 5) {
474d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        while (strBeCheck.GetLength() > 0) {
484d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann          FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
494d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann          if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
504d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann            strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
514d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann            nCount--;
524d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann          } else {
534d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann            break;
544d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann          }
554d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        }
564d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        if (nCount > 5 &&
574d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann            (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
584d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann          m_LinkArray.push_back({start, nCount, strBeCheck});
594d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        }
604d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      }
614d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      start = ++pos;
624d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    } else {
634d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      pos++;
644d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    }
654d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
664d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann}
674d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
684d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannbool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
694d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  CFX_WideString str = strBeCheck;
704d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  str.MakeLower();
714d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (str.Find(L"http://www.") != -1) {
724d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
734d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return true;
744d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
754d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (str.Find(L"http://") != -1) {
764d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
774d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return true;
784d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
794d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (str.Find(L"https://www.") != -1) {
804d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
814d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return true;
824d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
834d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (str.Find(L"https://") != -1) {
844d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
854d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return true;
864d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
874d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (str.Find(L"www.") != -1) {
884d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
894d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    strBeCheck = L"http://" + strBeCheck;
904d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return true;
914d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
924d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  return false;
934d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann}
944d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
954d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannbool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
964d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  int aPos = str.Find(L'@');
974d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // Invalid when no '@'.
984d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (aPos < 1)
994d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return false;
1004d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1014d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // Check the local part.
1024d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  int pPos = aPos;  // Used to track the position of '@' or '.'.
1034d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  for (int i = aPos - 1; i >= 0; i--) {
1044d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    FX_WCHAR ch = str.GetAt(i);
1054d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
1064d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      continue;
1074d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1084d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    if (ch != L'.' || i == pPos - 1 || i == 0) {
1094d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      if (i == aPos - 1) {
1104d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        // There is '.' or invalid char before '@'.
1114d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        return false;
1124d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      }
1134d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      // End extracting for other invalid chars, '.' at the beginning, or
1144d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      // consecutive '.'.
1154d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
1164d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      str = str.Right(str.GetLength() - removed_len);
1174d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      break;
1184d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    }
1194d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    // Found a valid '.'.
1204d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    pPos = i;
1214d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
1224d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1234d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // Check the domain name part.
1244d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  aPos = str.Find(L'@');
1254d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (aPos < 1)
1264d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return false;
1274d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1284d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  str.TrimRight(L'.');
1294d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // At least one '.' in domain name, but not at the beginning.
1304d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
1314d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // Check whether we should remove this check.
1324d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  int ePos = str.Find(L'.', aPos + 1);
1334d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (ePos == -1 || ePos == aPos + 1)
1344d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return false;
1354d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1364d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  // Validate all other chars in domain name.
1374d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  int nLen = str.GetLength();
1384d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  pPos = 0;  // Used to track the position of '.'.
1394d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  for (int i = aPos + 1; i < nLen; i++) {
1404d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    FX_WCHAR wch = str.GetAt(i);
1414d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    if (wch == L'-' || FXSYS_iswalnum(wch))
1424d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      continue;
1434d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1444d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    if (wch != L'.' || i == pPos + 1) {
1454d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      // Domain name should end before invalid char.
1464d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      int host_end = i == pPos + 1 ? i - 2 : i - 1;
1474d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      if (pPos > 0 && host_end - aPos >= 3) {
1484d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        // Trim the ending invalid chars if there is at least one '.' and name.
1494d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        str = str.Left(host_end + 1);
1504d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann        break;
1514d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      }
1524d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann      return false;
1534d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    }
1544d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    pPos = i;
1554d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  }
1564d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1574d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (str.Find(L"mailto:") == -1)
1584d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    str = L"mailto:" + str;
1594d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1604d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  return true;
1614d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann}
1624d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1634d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. MoltmannCFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
1644d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
1654d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann}
1664d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1674d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmannstd::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
1684d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  if (index >= m_LinkArray.size())
1694d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann    return std::vector<CFX_FloatRect>();
1704d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann
1714d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann  return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
1724d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann                                   m_LinkArray[index].m_Count);
1734d3acf4ec42bf6e838f9060103aff98fbf170794Philip P. Moltmann}
174