1// Copyright 2016 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdftext/cpdf_textpagefind.h"
8
9#include <cwchar>
10#include <cwctype>
11#include <vector>
12
13#include "core/fpdftext/cpdf_textpage.h"
14#include "core/fxcrt/fx_string.h"
15#include "core/fxcrt/fx_system.h"
16#include "third_party/base/stl_util.h"
17
18namespace {
19
20bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
21  if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) ||
22      (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
23      (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
24      (curChar >= 0x0400 && curChar <= 0x04FF) ||
25      (curChar >= 0x0500 && curChar <= 0x052F) ||
26      (curChar >= 0xA640 && curChar <= 0xA69F) ||
27      (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
28      (curChar >= 0x2000 && curChar <= 0x206F)) {
29    return false;
30  }
31  return true;
32}
33
34}  // namespace
35
36CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
37    : m_pTextPage(pTextPage),
38      m_flags(0),
39      m_findNextStart(-1),
40      m_findPreStart(-1),
41      m_bMatchCase(false),
42      m_bMatchWholeWord(false),
43      m_resStart(0),
44      m_resEnd(-1),
45      m_IsFind(false) {
46  m_strText = m_pTextPage->GetPageText();
47  int nCount = pTextPage->CountChars();
48  if (nCount)
49    m_CharIndex.push_back(0);
50  for (int i = 0; i < nCount; i++) {
51    FPDF_CHAR_INFO info;
52    pTextPage->GetCharInfo(i, &info);
53    int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
54    if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
55        info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
56      if (indexSize % 2) {
57        m_CharIndex.push_back(1);
58      } else {
59        if (indexSize <= 0)
60          continue;
61        m_CharIndex[indexSize - 1] += 1;
62      }
63    } else {
64      if (indexSize % 2) {
65        if (indexSize <= 0)
66          continue;
67        m_CharIndex[indexSize - 1] = i + 1;
68      } else {
69        m_CharIndex.push_back(i + 1);
70      }
71    }
72  }
73  int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
74  if (indexSize % 2)
75    m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
76}
77
78CPDF_TextPageFind::~CPDF_TextPageFind() {}
79
80int CPDF_TextPageFind::GetCharIndex(int index) const {
81  return m_pTextPage->CharIndexFromTextIndex(index);
82}
83
84bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
85                                  int flags,
86                                  int startPos) {
87  if (!m_pTextPage)
88    return false;
89  if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
90    m_strText = m_pTextPage->GetPageText();
91  CFX_WideString findwhatStr = findwhat;
92  m_findWhat = findwhatStr;
93  m_flags = flags;
94  m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
95  if (m_strText.IsEmpty()) {
96    m_IsFind = false;
97    return true;
98  }
99  FX_STRSIZE len = findwhatStr.GetLength();
100  if (!m_bMatchCase) {
101    findwhatStr.MakeLower();
102    m_strText.MakeLower();
103  }
104  m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
105  m_findNextStart = startPos;
106  if (startPos == -1)
107    m_findPreStart = m_strText.GetLength() - 1;
108  else
109    m_findPreStart = startPos;
110  m_csFindWhatArray.clear();
111  int i = 0;
112  while (i < len) {
113    if (findwhatStr.GetAt(i) != ' ')
114      break;
115    i++;
116  }
117  if (i < len)
118    ExtractFindWhat(findwhatStr);
119  else
120    m_csFindWhatArray.push_back(findwhatStr);
121  if (m_csFindWhatArray.empty())
122    return false;
123  m_IsFind = true;
124  m_resStart = 0;
125  m_resEnd = -1;
126  return true;
127}
128
129bool CPDF_TextPageFind::FindNext() {
130  if (!m_pTextPage)
131    return false;
132  m_resArray.clear();
133  if (m_findNextStart == -1)
134    return false;
135  if (m_strText.IsEmpty()) {
136    m_IsFind = false;
137    return m_IsFind;
138  }
139  int strLen = m_strText.GetLength();
140  if (m_findNextStart > strLen - 1) {
141    m_IsFind = false;
142    return m_IsFind;
143  }
144  int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
145  int nResultPos = 0;
146  int nStartPos = 0;
147  nStartPos = m_findNextStart;
148  bool bSpaceStart = false;
149  for (int iWord = 0; iWord < nCount; iWord++) {
150    CFX_WideString csWord = m_csFindWhatArray[iWord];
151    if (csWord.IsEmpty()) {
152      if (iWord == nCount - 1) {
153        FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
154        if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
155            strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
156          nResultPos = nStartPos + 1;
157          break;
158        }
159        iWord = -1;
160      } else if (iWord == 0) {
161        bSpaceStart = true;
162      }
163      continue;
164    }
165    int endIndex;
166    nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
167    if (nResultPos == -1) {
168      m_IsFind = false;
169      return m_IsFind;
170    }
171    endIndex = nResultPos + csWord.GetLength() - 1;
172    if (iWord == 0)
173      m_resStart = nResultPos;
174    bool bMatch = true;
175    if (iWord != 0 && !bSpaceStart) {
176      int PreResEndPos = nStartPos;
177      int curChar = csWord.GetAt(0);
178      CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
179      int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
180      if (nStartPos == nResultPos &&
181          !(IsIgnoreSpaceCharacter(lastChar) ||
182            IsIgnoreSpaceCharacter(curChar))) {
183        bMatch = false;
184      }
185      for (int d = PreResEndPos; d < nResultPos; d++) {
186        FX_WCHAR strInsert = m_strText.GetAt(d);
187        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
188            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
189          bMatch = false;
190          break;
191        }
192      }
193    } else if (bSpaceStart) {
194      if (nResultPos > 0) {
195        FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
196        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
197            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
198          bMatch = false;
199          m_resStart = nResultPos;
200        } else {
201          m_resStart = nResultPos - 1;
202        }
203      }
204    }
205    if (m_bMatchWholeWord && bMatch) {
206      bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
207    }
208    nStartPos = endIndex + 1;
209    if (!bMatch) {
210      iWord = -1;
211      if (bSpaceStart)
212        nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
213      else
214        nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
215    }
216  }
217  m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
218  m_IsFind = true;
219  int resStart = GetCharIndex(m_resStart);
220  int resEnd = GetCharIndex(m_resEnd);
221  m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
222  if (m_flags & FPDFTEXT_CONSECUTIVE) {
223    m_findNextStart = m_resStart + 1;
224    m_findPreStart = m_resEnd - 1;
225  } else {
226    m_findNextStart = m_resEnd + 1;
227    m_findPreStart = m_resStart - 1;
228  }
229  return m_IsFind;
230}
231
232bool CPDF_TextPageFind::FindPrev() {
233  if (!m_pTextPage)
234    return false;
235  m_resArray.clear();
236  if (m_strText.IsEmpty() || m_findPreStart < 0) {
237    m_IsFind = false;
238    return m_IsFind;
239  }
240  CPDF_TextPageFind findEngine(m_pTextPage);
241  bool ret = findEngine.FindFirst(m_findWhat, m_flags);
242  if (!ret) {
243    m_IsFind = false;
244    return m_IsFind;
245  }
246  int order = -1, MatchedCount = 0;
247  while (ret) {
248    ret = findEngine.FindNext();
249    if (ret) {
250      int order1 = findEngine.GetCurOrder();
251      int MatchedCount1 = findEngine.GetMatchedCount();
252      if (((order1 + MatchedCount1) - 1) > m_findPreStart)
253        break;
254      order = order1;
255      MatchedCount = MatchedCount1;
256    }
257  }
258  if (order == -1) {
259    m_IsFind = false;
260    return m_IsFind;
261  }
262  m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
263  m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
264  m_IsFind = true;
265  m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
266  if (m_flags & FPDFTEXT_CONSECUTIVE) {
267    m_findNextStart = m_resStart + 1;
268    m_findPreStart = m_resEnd - 1;
269  } else {
270    m_findNextStart = m_resEnd + 1;
271    m_findPreStart = m_resStart - 1;
272  }
273  return m_IsFind;
274}
275
276void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
277  if (findwhat.IsEmpty())
278    return;
279  int index = 0;
280  while (1) {
281    CFX_WideString csWord = TEXT_EMPTY;
282    int ret =
283        ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
284    if (csWord.IsEmpty()) {
285      if (ret) {
286        m_csFindWhatArray.push_back(L"");
287        index++;
288        continue;
289      } else {
290        break;
291      }
292    }
293    int pos = 0;
294    while (pos < csWord.GetLength()) {
295      CFX_WideString curStr = csWord.Mid(pos, 1);
296      FX_WCHAR curChar = csWord.GetAt(pos);
297      if (IsIgnoreSpaceCharacter(curChar)) {
298        if (pos > 0 && curChar == 0x2019) {
299          pos++;
300          continue;
301        }
302        if (pos > 0)
303          m_csFindWhatArray.push_back(csWord.Mid(0, pos));
304        m_csFindWhatArray.push_back(curStr);
305        if (pos == csWord.GetLength() - 1) {
306          csWord.clear();
307          break;
308        }
309        csWord = csWord.Right(csWord.GetLength() - pos - 1);
310        pos = 0;
311        continue;
312      }
313      pos++;
314    }
315    if (!csWord.IsEmpty())
316      m_csFindWhatArray.push_back(csWord);
317    index++;
318  }
319}
320
321bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
322                                         int startPos,
323                                         int endPos) {
324  FX_WCHAR char_left = 0;
325  FX_WCHAR char_right = 0;
326  int char_count = endPos - startPos + 1;
327  if (char_count < 1)
328    return false;
329  if (char_count == 1 && csPageText.GetAt(startPos) > 255)
330    return true;
331  if (startPos - 1 >= 0)
332    char_left = csPageText.GetAt(startPos - 1);
333  if (startPos + char_count < csPageText.GetLength())
334    char_right = csPageText.GetAt(startPos + char_count);
335  if ((char_left > 'A' && char_left < 'a') ||
336      (char_left > 'a' && char_left < 'z') ||
337      (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
338      (char_right > 'A' && char_right < 'a') ||
339      (char_right > 'a' && char_right < 'z') ||
340      (char_right > 0xfb00 && char_right < 0xfb06) ||
341      std::iswdigit(char_right)) {
342    return false;
343  }
344  if (!(('A' > char_left || char_left > 'Z') &&
345        ('a' > char_left || char_left > 'z') &&
346        ('A' > char_right || char_right > 'Z') &&
347        ('a' > char_right || char_right > 'z'))) {
348    return false;
349  }
350  if (char_count > 0) {
351    if (csPageText.GetAt(startPos) >= L'0' &&
352        csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
353        char_left <= L'9') {
354      return false;
355    }
356    if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
357        char_right >= L'0' && char_right <= L'9') {
358      return false;
359    }
360  }
361  return true;
362}
363
364bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
365                                         const FX_WCHAR* lpszFullString,
366                                         int iSubString,
367                                         FX_WCHAR chSep) {
368  if (!lpszFullString)
369    return false;
370  while (iSubString--) {
371    lpszFullString = std::wcschr(lpszFullString, chSep);
372    if (!lpszFullString) {
373      rString.clear();
374      return false;
375    }
376    lpszFullString++;
377    while (*lpszFullString == chSep)
378      lpszFullString++;
379  }
380  const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep);
381  int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
382                     : (int)FXSYS_wcslen(lpszFullString);
383  ASSERT(nLen >= 0);
384  FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
385               nLen * sizeof(FX_WCHAR));
386  rString.ReleaseBuffer();
387  return true;
388}
389
390CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
391  CFX_WideString str2;
392  str2.clear();
393  int nlen = str.GetLength();
394  for (int i = nlen - 1; i >= 0; i--)
395    str2 += str.GetAt(i);
396  return str2;
397}
398
399int CPDF_TextPageFind::GetCurOrder() const {
400  return GetCharIndex(m_resStart);
401}
402
403int CPDF_TextPageFind::GetMatchedCount() const {
404  int resStart = GetCharIndex(m_resStart);
405  int resEnd = GetCharIndex(m_resEnd);
406  return resEnd - resStart + 1;
407}
408