1// Copyright 2016 PDFium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7#include "core/fpdftext/cpdf_textpagefind.h" 8 9#include <cwchar> 10#include <cwctype> 11#include <vector> 12 13#include "core/fpdftext/cpdf_textpage.h" 14#include "core/fxcrt/fx_string.h" 15#include "core/fxcrt/fx_system.h" 16#include "third_party/base/stl_util.h" 17 18namespace { 19 20bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) { 21 if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) || 22 (curChar >= 0xFE70 && curChar <= 0xFEFF) || 23 (curChar >= 0xFB50 && curChar <= 0xFDFF) || 24 (curChar >= 0x0400 && curChar <= 0x04FF) || 25 (curChar >= 0x0500 && curChar <= 0x052F) || 26 (curChar >= 0xA640 && curChar <= 0xA69F) || 27 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || 28 (curChar >= 0x2000 && curChar <= 0x206F)) { 29 return false; 30 } 31 return true; 32} 33 34} // namespace 35 36CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) 37 : m_pTextPage(pTextPage), 38 m_flags(0), 39 m_findNextStart(-1), 40 m_findPreStart(-1), 41 m_bMatchCase(false), 42 m_bMatchWholeWord(false), 43 m_resStart(0), 44 m_resEnd(-1), 45 m_IsFind(false) { 46 m_strText = m_pTextPage->GetPageText(); 47 int nCount = pTextPage->CountChars(); 48 if (nCount) 49 m_CharIndex.push_back(0); 50 for (int i = 0; i < nCount; i++) { 51 FPDF_CHAR_INFO info; 52 pTextPage->GetCharInfo(i, &info); 53 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 54 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL || 55 info.m_Flag == FPDFTEXT_CHAR_GENERATED) { 56 if (indexSize % 2) { 57 m_CharIndex.push_back(1); 58 } else { 59 if (indexSize <= 0) 60 continue; 61 m_CharIndex[indexSize - 1] += 1; 62 } 63 } else { 64 if (indexSize % 2) { 65 if (indexSize <= 0) 66 continue; 67 m_CharIndex[indexSize - 1] = i + 1; 68 } else { 69 m_CharIndex.push_back(i + 1); 70 } 71 } 72 } 73 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 74 if (indexSize % 2) 75 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); 76} 77 78CPDF_TextPageFind::~CPDF_TextPageFind() {} 79 80int CPDF_TextPageFind::GetCharIndex(int index) const { 81 return m_pTextPage->CharIndexFromTextIndex(index); 82} 83 84bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, 85 int flags, 86 int startPos) { 87 if (!m_pTextPage) 88 return false; 89 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) 90 m_strText = m_pTextPage->GetPageText(); 91 CFX_WideString findwhatStr = findwhat; 92 m_findWhat = findwhatStr; 93 m_flags = flags; 94 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; 95 if (m_strText.IsEmpty()) { 96 m_IsFind = false; 97 return true; 98 } 99 FX_STRSIZE len = findwhatStr.GetLength(); 100 if (!m_bMatchCase) { 101 findwhatStr.MakeLower(); 102 m_strText.MakeLower(); 103 } 104 m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD); 105 m_findNextStart = startPos; 106 if (startPos == -1) 107 m_findPreStart = m_strText.GetLength() - 1; 108 else 109 m_findPreStart = startPos; 110 m_csFindWhatArray.clear(); 111 int i = 0; 112 while (i < len) { 113 if (findwhatStr.GetAt(i) != ' ') 114 break; 115 i++; 116 } 117 if (i < len) 118 ExtractFindWhat(findwhatStr); 119 else 120 m_csFindWhatArray.push_back(findwhatStr); 121 if (m_csFindWhatArray.empty()) 122 return false; 123 m_IsFind = true; 124 m_resStart = 0; 125 m_resEnd = -1; 126 return true; 127} 128 129bool CPDF_TextPageFind::FindNext() { 130 if (!m_pTextPage) 131 return false; 132 m_resArray.clear(); 133 if (m_findNextStart == -1) 134 return false; 135 if (m_strText.IsEmpty()) { 136 m_IsFind = false; 137 return m_IsFind; 138 } 139 int strLen = m_strText.GetLength(); 140 if (m_findNextStart > strLen - 1) { 141 m_IsFind = false; 142 return m_IsFind; 143 } 144 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); 145 int nResultPos = 0; 146 int nStartPos = 0; 147 nStartPos = m_findNextStart; 148 bool bSpaceStart = false; 149 for (int iWord = 0; iWord < nCount; iWord++) { 150 CFX_WideString csWord = m_csFindWhatArray[iWord]; 151 if (csWord.IsEmpty()) { 152 if (iWord == nCount - 1) { 153 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); 154 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || 155 strInsert == TEXT_RETURN_CHAR || strInsert == 160) { 156 nResultPos = nStartPos + 1; 157 break; 158 } 159 iWord = -1; 160 } else if (iWord == 0) { 161 bSpaceStart = true; 162 } 163 continue; 164 } 165 int endIndex; 166 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); 167 if (nResultPos == -1) { 168 m_IsFind = false; 169 return m_IsFind; 170 } 171 endIndex = nResultPos + csWord.GetLength() - 1; 172 if (iWord == 0) 173 m_resStart = nResultPos; 174 bool bMatch = true; 175 if (iWord != 0 && !bSpaceStart) { 176 int PreResEndPos = nStartPos; 177 int curChar = csWord.GetAt(0); 178 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; 179 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); 180 if (nStartPos == nResultPos && 181 !(IsIgnoreSpaceCharacter(lastChar) || 182 IsIgnoreSpaceCharacter(curChar))) { 183 bMatch = false; 184 } 185 for (int d = PreResEndPos; d < nResultPos; d++) { 186 FX_WCHAR strInsert = m_strText.GetAt(d); 187 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && 188 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 189 bMatch = false; 190 break; 191 } 192 } 193 } else if (bSpaceStart) { 194 if (nResultPos > 0) { 195 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); 196 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && 197 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 198 bMatch = false; 199 m_resStart = nResultPos; 200 } else { 201 m_resStart = nResultPos - 1; 202 } 203 } 204 } 205 if (m_bMatchWholeWord && bMatch) { 206 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); 207 } 208 nStartPos = endIndex + 1; 209 if (!bMatch) { 210 iWord = -1; 211 if (bSpaceStart) 212 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); 213 else 214 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); 215 } 216 } 217 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1; 218 m_IsFind = true; 219 int resStart = GetCharIndex(m_resStart); 220 int resEnd = GetCharIndex(m_resEnd); 221 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); 222 if (m_flags & FPDFTEXT_CONSECUTIVE) { 223 m_findNextStart = m_resStart + 1; 224 m_findPreStart = m_resEnd - 1; 225 } else { 226 m_findNextStart = m_resEnd + 1; 227 m_findPreStart = m_resStart - 1; 228 } 229 return m_IsFind; 230} 231 232bool CPDF_TextPageFind::FindPrev() { 233 if (!m_pTextPage) 234 return false; 235 m_resArray.clear(); 236 if (m_strText.IsEmpty() || m_findPreStart < 0) { 237 m_IsFind = false; 238 return m_IsFind; 239 } 240 CPDF_TextPageFind findEngine(m_pTextPage); 241 bool ret = findEngine.FindFirst(m_findWhat, m_flags); 242 if (!ret) { 243 m_IsFind = false; 244 return m_IsFind; 245 } 246 int order = -1, MatchedCount = 0; 247 while (ret) { 248 ret = findEngine.FindNext(); 249 if (ret) { 250 int order1 = findEngine.GetCurOrder(); 251 int MatchedCount1 = findEngine.GetMatchedCount(); 252 if (((order1 + MatchedCount1) - 1) > m_findPreStart) 253 break; 254 order = order1; 255 MatchedCount = MatchedCount1; 256 } 257 } 258 if (order == -1) { 259 m_IsFind = false; 260 return m_IsFind; 261 } 262 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); 263 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); 264 m_IsFind = true; 265 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount); 266 if (m_flags & FPDFTEXT_CONSECUTIVE) { 267 m_findNextStart = m_resStart + 1; 268 m_findPreStart = m_resEnd - 1; 269 } else { 270 m_findNextStart = m_resEnd + 1; 271 m_findPreStart = m_resStart - 1; 272 } 273 return m_IsFind; 274} 275 276void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { 277 if (findwhat.IsEmpty()) 278 return; 279 int index = 0; 280 while (1) { 281 CFX_WideString csWord = TEXT_EMPTY; 282 int ret = 283 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR); 284 if (csWord.IsEmpty()) { 285 if (ret) { 286 m_csFindWhatArray.push_back(L""); 287 index++; 288 continue; 289 } else { 290 break; 291 } 292 } 293 int pos = 0; 294 while (pos < csWord.GetLength()) { 295 CFX_WideString curStr = csWord.Mid(pos, 1); 296 FX_WCHAR curChar = csWord.GetAt(pos); 297 if (IsIgnoreSpaceCharacter(curChar)) { 298 if (pos > 0 && curChar == 0x2019) { 299 pos++; 300 continue; 301 } 302 if (pos > 0) 303 m_csFindWhatArray.push_back(csWord.Mid(0, pos)); 304 m_csFindWhatArray.push_back(curStr); 305 if (pos == csWord.GetLength() - 1) { 306 csWord.clear(); 307 break; 308 } 309 csWord = csWord.Right(csWord.GetLength() - pos - 1); 310 pos = 0; 311 continue; 312 } 313 pos++; 314 } 315 if (!csWord.IsEmpty()) 316 m_csFindWhatArray.push_back(csWord); 317 index++; 318 } 319} 320 321bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, 322 int startPos, 323 int endPos) { 324 FX_WCHAR char_left = 0; 325 FX_WCHAR char_right = 0; 326 int char_count = endPos - startPos + 1; 327 if (char_count < 1) 328 return false; 329 if (char_count == 1 && csPageText.GetAt(startPos) > 255) 330 return true; 331 if (startPos - 1 >= 0) 332 char_left = csPageText.GetAt(startPos - 1); 333 if (startPos + char_count < csPageText.GetLength()) 334 char_right = csPageText.GetAt(startPos + char_count); 335 if ((char_left > 'A' && char_left < 'a') || 336 (char_left > 'a' && char_left < 'z') || 337 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || 338 (char_right > 'A' && char_right < 'a') || 339 (char_right > 'a' && char_right < 'z') || 340 (char_right > 0xfb00 && char_right < 0xfb06) || 341 std::iswdigit(char_right)) { 342 return false; 343 } 344 if (!(('A' > char_left || char_left > 'Z') && 345 ('a' > char_left || char_left > 'z') && 346 ('A' > char_right || char_right > 'Z') && 347 ('a' > char_right || char_right > 'z'))) { 348 return false; 349 } 350 if (char_count > 0) { 351 if (csPageText.GetAt(startPos) >= L'0' && 352 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && 353 char_left <= L'9') { 354 return false; 355 } 356 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && 357 char_right >= L'0' && char_right <= L'9') { 358 return false; 359 } 360 } 361 return true; 362} 363 364bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, 365 const FX_WCHAR* lpszFullString, 366 int iSubString, 367 FX_WCHAR chSep) { 368 if (!lpszFullString) 369 return false; 370 while (iSubString--) { 371 lpszFullString = std::wcschr(lpszFullString, chSep); 372 if (!lpszFullString) { 373 rString.clear(); 374 return false; 375 } 376 lpszFullString++; 377 while (*lpszFullString == chSep) 378 lpszFullString++; 379 } 380 const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep); 381 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString) 382 : (int)FXSYS_wcslen(lpszFullString); 383 ASSERT(nLen >= 0); 384 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, 385 nLen * sizeof(FX_WCHAR)); 386 rString.ReleaseBuffer(); 387 return true; 388} 389 390CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { 391 CFX_WideString str2; 392 str2.clear(); 393 int nlen = str.GetLength(); 394 for (int i = nlen - 1; i >= 0; i--) 395 str2 += str.GetAt(i); 396 return str2; 397} 398 399int CPDF_TextPageFind::GetCurOrder() const { 400 return GetCharIndex(m_resStart); 401} 402 403int CPDF_TextPageFind::GetMatchedCount() const { 404 int resStart = GetCharIndex(m_resStart); 405 int resEnd = GetCharIndex(m_resEnd); 406 return resEnd - resStart + 1; 407} 408