1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include <algorithm>
8#include <cctype>
9#include <cwctype>
10#include <memory>
11
12#include "core/include/fpdfapi/fpdf_module.h"
13#include "core/include/fpdfapi/fpdf_page.h"
14#include "core/include/fpdfapi/fpdf_pageobj.h"
15#include "core/include/fpdfapi/fpdf_resource.h"
16#include "core/include/fpdftext/fpdf_text.h"
17#include "core/include/fxcrt/fx_bidi.h"
18#include "core/include/fxcrt/fx_ext.h"
19#include "core/include/fxcrt/fx_ucd.h"
20#include "text_int.h"
21
22namespace {
23
24FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
25  if (curChar < 255) {
26    return FALSE;
27  }
28  if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
29      (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
30      (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
31      (curChar >= 0x0400 && curChar <= 0x04FF) ||
32      (curChar >= 0x0500 && curChar <= 0x052F) ||
33      (curChar >= 0xA640 && curChar <= 0xA69F) ||
34      (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
35      (curChar >= 0x2000 && curChar <= 0x206F)) {
36    return FALSE;
37  }
38  return TRUE;
39}
40
41FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
42  if (threshold < 300) {
43    return threshold / 2.0f;
44  }
45  if (threshold < 500) {
46    return threshold / 4.0f;
47  }
48  if (threshold < 700) {
49    return threshold / 5.0f;
50  }
51  return threshold / 6.0f;
52}
53
54FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
55                             const CFX_Matrix& matrix) {
56  FX_FLOAT baseSpace = 0.0;
57  const int nItems = pTextObj->CountItems();
58  if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
59    FX_BOOL bAllChar = TRUE;
60    FX_FLOAT spacing = matrix.TransformDistance(
61        pTextObj->m_TextState.GetObject()->m_CharSpace);
62    baseSpace = spacing;
63    for (int i = 0; i < nItems; i++) {
64      CPDF_TextObjectItem item;
65      pTextObj->GetItemInfo(i, &item);
66      if (item.m_CharCode == (FX_DWORD)-1) {
67        FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
68        FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000;
69        baseSpace = std::min(baseSpace, kerning + spacing);
70        bAllChar = FALSE;
71      }
72    }
73    if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
74      baseSpace = 0.0;
75    }
76  }
77  return baseSpace;
78}
79
80const FX_FLOAT kDefaultFontSize = 1.0f;
81
82}  // namespace
83
84CPDFText_ParseOptions::CPDFText_ParseOptions()
85    : m_bGetCharCodeOnly(FALSE),
86      m_bNormalizeObjs(TRUE),
87      m_bOutputHyphen(FALSE) {}
88
89IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
90                                             int flags) {
91  return new CPDF_TextPage(pPage, flags);
92}
93
94IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
95    const IPDF_TextPage* pTextPage) {
96  return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr;
97}
98
99IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
100  return new CPDF_LinkExtract();
101}
102
// Single wide characters.
#define TEXT_BLANK_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'

// Wide string literals.
#define TEXT_EMPTY L""
#define TEXT_BLANK L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"

// Numeric gap-delta constant; not referenced within this part of the file.
#define TEXT_CHARRATIO_GAPDELTA 0.070
111
112CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
113    : m_pPage(pPage),
114      m_charList(512),
115      m_TempCharList(50),
116      m_parserflag(flags),
117      m_pPreTextObj(nullptr),
118      m_bIsParsed(false),
119      m_TextlineDir(-1),
120      m_CurlineRect(0, 0, 0, 0) {
121  m_TextBuf.EstimateSize(0, 10240);
122  pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
123                          (int)pPage->GetPageHeight(), 0);
124}
125
126void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) {
127  m_ParseOptions.m_bNormalizeObjs = bNormalize;
128}
129bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
130  switch (charInfo.m_Unicode) {
131    case 0x2:
132    case 0x3:
133    case 0x93:
134    case 0x94:
135    case 0x96:
136    case 0x97:
137    case 0x98:
138    case 0xfffe:
139      return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
140    default:
141      return false;
142  }
143}
144FX_BOOL CPDF_TextPage::ParseTextPage() {
145  m_bIsParsed = false;
146  if (!m_pPage)
147    return FALSE;
148
149  m_TextBuf.Clear();
150  m_charList.RemoveAll();
151  m_pPreTextObj = NULL;
152  ProcessObject();
153  m_bIsParsed = true;
154  if (!m_ParseOptions.m_bGetCharCodeOnly) {
155    m_CharIndex.RemoveAll();
156    int nCount = m_charList.GetSize();
157    if (nCount) {
158      m_CharIndex.Add(0);
159    }
160    for (int i = 0; i < nCount; i++) {
161      int indexSize = m_CharIndex.GetSize();
162      FX_BOOL bNormal = FALSE;
163      PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i);
164      if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
165        bNormal = TRUE;
166      } else if (charinfo.m_Unicode == 0 || IsControlChar(charinfo)) {
167        bNormal = FALSE;
168      } else {
169        bNormal = TRUE;
170      }
171      if (bNormal) {
172        if (indexSize % 2) {
173          m_CharIndex.Add(1);
174        } else {
175          if (indexSize <= 0) {
176            continue;
177          }
178          m_CharIndex.SetAt(indexSize - 1,
179                            m_CharIndex.GetAt(indexSize - 1) + 1);
180        }
181      } else {
182        if (indexSize % 2) {
183          if (indexSize <= 0) {
184            continue;
185          }
186          m_CharIndex.SetAt(indexSize - 1, i + 1);
187        } else {
188          m_CharIndex.Add(i + 1);
189        }
190      }
191    }
192    int indexSize = m_CharIndex.GetSize();
193    if (indexSize % 2) {
194      m_CharIndex.RemoveAt(indexSize - 1);
195    }
196  }
197  return TRUE;
198}
199int CPDF_TextPage::CountChars() const {
200  if (m_ParseOptions.m_bGetCharCodeOnly) {
201    return m_TextBuf.GetSize();
202  }
203  return m_charList.GetSize();
204}
205int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
206  int indexSize = m_CharIndex.GetSize();
207  int count = 0;
208  for (int i = 0; i < indexSize; i += 2) {
209    count += m_CharIndex.GetAt(i + 1);
210    if (count > TextIndex) {
211      return TextIndex - count + m_CharIndex.GetAt(i + 1) +
212             m_CharIndex.GetAt(i);
213    }
214  }
215  return -1;
216}
217int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
218  int indexSize = m_CharIndex.GetSize();
219  int count = 0;
220  for (int i = 0; i < indexSize; i += 2) {
221    count += m_CharIndex.GetAt(i + 1);
222    if (m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
223      if (CharIndex - m_CharIndex.GetAt(i) < 0) {
224        return -1;
225      }
226      return CharIndex - m_CharIndex.GetAt(i) + count -
227             m_CharIndex.GetAt(i + 1);
228    }
229  }
230  return -1;
231}
232void CPDF_TextPage::GetRectArray(int start,
233                                 int nCount,
234                                 CFX_RectArray& rectArray) const {
235  if (m_ParseOptions.m_bGetCharCodeOnly) {
236    return;
237  }
238  if (start < 0 || nCount == 0) {
239    return;
240  }
241  if (!m_bIsParsed) {
242    return;
243  }
244  PAGECHAR_INFO info_curchar;
245  CPDF_TextObject* pCurObj = NULL;
246  CFX_FloatRect rect;
247  int curPos = start;
248  FX_BOOL flagNewRect = TRUE;
249  if (nCount + start > m_charList.GetSize() || nCount == -1) {
250    nCount = m_charList.GetSize() - start;
251  }
252  while (nCount--) {
253    info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++);
254    if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
255      continue;
256    }
257    if (info_curchar.m_CharBox.Width() < 0.01 ||
258        info_curchar.m_CharBox.Height() < 0.01) {
259      continue;
260    }
261    if (!pCurObj) {
262      pCurObj = info_curchar.m_pTextObj;
263    }
264    if (pCurObj != info_curchar.m_pTextObj) {
265      rectArray.Add(rect);
266      pCurObj = info_curchar.m_pTextObj;
267      flagNewRect = TRUE;
268    }
269    if (flagNewRect) {
270      FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY;
271      CFX_Matrix matrix, matrix_reverse;
272      info_curchar.m_pTextObj->GetTextMatrix(&matrix);
273      matrix.Concat(info_curchar.m_Matrix);
274      matrix_reverse.SetReverse(matrix);
275      matrix_reverse.Transform(orgX, orgY);
276      rect.left = info_curchar.m_CharBox.left;
277      rect.right = info_curchar.m_CharBox.right;
278      if (pCurObj->GetFont()->GetTypeDescent()) {
279        rect.bottom = orgY +
280                      pCurObj->GetFont()->GetTypeDescent() *
281                          pCurObj->GetFontSize() / 1000;
282        FX_FLOAT xPosTemp = orgX;
283        matrix.Transform(xPosTemp, rect.bottom);
284      } else {
285        rect.bottom = info_curchar.m_CharBox.bottom;
286      }
287      if (pCurObj->GetFont()->GetTypeAscent()) {
288        rect.top =
289            orgY +
290            pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
291        FX_FLOAT xPosTemp =
292            orgX +
293            GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
294                pCurObj->GetFontSize() / 1000;
295        matrix.Transform(xPosTemp, rect.top);
296      } else {
297        rect.top = info_curchar.m_CharBox.top;
298      }
299      flagNewRect = FALSE;
300      rect = info_curchar.m_CharBox;
301      rect.Normalize();
302    } else {
303      info_curchar.m_CharBox.Normalize();
304      if (rect.left > info_curchar.m_CharBox.left) {
305        rect.left = info_curchar.m_CharBox.left;
306      }
307      if (rect.right < info_curchar.m_CharBox.right) {
308        rect.right = info_curchar.m_CharBox.right;
309      }
310      if (rect.top < info_curchar.m_CharBox.top) {
311        rect.top = info_curchar.m_CharBox.top;
312      }
313      if (rect.bottom > info_curchar.m_CharBox.bottom) {
314        rect.bottom = info_curchar.m_CharBox.bottom;
315      }
316    }
317  }
318  rectArray.Add(rect);
319  return;
320}
321int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
322                                 FX_FLOAT xTolerance,
323                                 FX_FLOAT yTolerance) const {
324  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
325    return -3;
326
327  int pos = 0;
328  int NearPos = -1;
329  double xdif = 5000, ydif = 5000;
330  while (pos < m_charList.GetSize()) {
331    PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
332    CFX_FloatRect charrect = charinfo.m_CharBox;
333    if (charrect.Contains(point.x, point.y)) {
334      break;
335    }
336    if (xTolerance > 0 || yTolerance > 0) {
337      CFX_FloatRect charRectExt;
338      charrect.Normalize();
339      charRectExt.left = charrect.left - xTolerance / 2;
340      charRectExt.right = charrect.right + xTolerance / 2;
341      charRectExt.top = charrect.top + yTolerance / 2;
342      charRectExt.bottom = charrect.bottom - yTolerance / 2;
343      if (charRectExt.Contains(point.x, point.y)) {
344        double curXdif, curYdif;
345        curXdif = FXSYS_fabs(point.x - charrect.left) <
346                          FXSYS_fabs(point.x - charrect.right)
347                      ? FXSYS_fabs(point.x - charrect.left)
348                      : FXSYS_fabs(point.x - charrect.right);
349        curYdif = FXSYS_fabs(point.y - charrect.bottom) <
350                          FXSYS_fabs(point.y - charrect.top)
351                      ? FXSYS_fabs(point.y - charrect.bottom)
352                      : FXSYS_fabs(point.y - charrect.top);
353        if (curYdif + curXdif < xdif + ydif) {
354          ydif = curYdif;
355          xdif = curXdif;
356          NearPos = pos;
357        }
358      }
359    }
360    ++pos;
361  }
362  if (pos >= m_charList.GetSize()) {
363    pos = NearPos;
364  }
365  return pos;
366}
367CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
368  CFX_WideString strText;
369  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
370    return strText;
371
372  int nCount = m_charList.GetSize();
373  int pos = 0;
374  FX_FLOAT posy = 0;
375  FX_BOOL IsContainPreChar = FALSE;
376  FX_BOOL ISAddLineFeed = FALSE;
377  while (pos < nCount) {
378    PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
379    if (IsRectIntersect(rect, charinfo.m_CharBox)) {
380      if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar &&
381          ISAddLineFeed) {
382        posy = charinfo.m_OriginY;
383        if (strText.GetLength() > 0) {
384          strText += L"\r\n";
385        }
386      }
387      IsContainPreChar = TRUE;
388      ISAddLineFeed = FALSE;
389      if (charinfo.m_Unicode) {
390        strText += charinfo.m_Unicode;
391      }
392    } else if (charinfo.m_Unicode == 32) {
393      if (IsContainPreChar && charinfo.m_Unicode) {
394        strText += charinfo.m_Unicode;
395        IsContainPreChar = FALSE;
396        ISAddLineFeed = FALSE;
397      }
398    } else {
399      IsContainPreChar = FALSE;
400      ISAddLineFeed = TRUE;
401    }
402  }
403  return strText;
404}
405void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect,
406                                        CFX_RectArray& resRectArray) const {
407  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
408    return;
409
410  CFX_FloatRect curRect;
411  FX_BOOL flagNewRect = TRUE;
412  CPDF_TextObject* pCurObj = NULL;
413  int nCount = m_charList.GetSize();
414  int pos = 0;
415  while (pos < nCount) {
416    PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
417    if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
418      continue;
419    }
420    if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
421      if (!pCurObj) {
422        pCurObj = info_curchar.m_pTextObj;
423      }
424      if (pCurObj != info_curchar.m_pTextObj) {
425        resRectArray.Add(curRect);
426        pCurObj = info_curchar.m_pTextObj;
427        flagNewRect = TRUE;
428      }
429      if (flagNewRect) {
430        curRect = info_curchar.m_CharBox;
431        flagNewRect = FALSE;
432        curRect.Normalize();
433      } else {
434        info_curchar.m_CharBox.Normalize();
435        if (curRect.left > info_curchar.m_CharBox.left) {
436          curRect.left = info_curchar.m_CharBox.left;
437        }
438        if (curRect.right < info_curchar.m_CharBox.right) {
439          curRect.right = info_curchar.m_CharBox.right;
440        }
441        if (curRect.top < info_curchar.m_CharBox.top) {
442          curRect.top = info_curchar.m_CharBox.top;
443        }
444        if (curRect.bottom > info_curchar.m_CharBox.bottom) {
445          curRect.bottom = info_curchar.m_CharBox.bottom;
446        }
447      }
448    }
449  }
450  resRectArray.Add(curRect);
451  return;
452}
453int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x,
454                                 FX_FLOAT y,
455                                 FX_FLOAT xTolerance,
456                                 FX_FLOAT yTolerance) const {
457  if (m_ParseOptions.m_bGetCharCodeOnly) {
458    return -3;
459  }
460  CPDF_Point point(x, y);
461  return GetIndexAtPos(point, xTolerance, yTolerance);
462}
463
464void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
465  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
466    return;
467
468  if (index < 0 || index >= m_charList.GetSize())
469    return;
470
471  const PAGECHAR_INFO* charinfo =
472      static_cast<PAGECHAR_INFO*>(m_charList.GetAt(index));
473  info->m_Charcode = charinfo->m_CharCode;
474  info->m_OriginX = charinfo->m_OriginX;
475  info->m_OriginY = charinfo->m_OriginY;
476  info->m_Unicode = charinfo->m_Unicode;
477  info->m_Flag = charinfo->m_Flag;
478  info->m_CharBox = charinfo->m_CharBox;
479  info->m_pTextObj = charinfo->m_pTextObj;
480  if (charinfo->m_pTextObj && charinfo->m_pTextObj->GetFont()) {
481    info->m_FontSize = charinfo->m_pTextObj->GetFontSize();
482  } else {
483    info->m_FontSize = kDefaultFontSize;
484  }
485  info->m_Matrix.Copy(charinfo->m_Matrix);
486}
487
488void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
489                                             int32_t& nCount) const {
490  PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
491  PAGECHAR_INFO charinfo2 =
492      *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
493  if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
494      FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
495    return;
496  }
497  if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
498    PAGECHAR_INFO charinfo1 = charinfo;
499    int startIndex = start;
500    while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
501           charinfo1.m_Index == charinfo.m_Index) {
502      startIndex--;
503      if (startIndex < 0) {
504        break;
505      }
506      charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
507    }
508    startIndex++;
509    start = startIndex;
510  }
511  if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
512    PAGECHAR_INFO charinfo3 = charinfo2;
513    int endIndex = start + nCount - 1;
514    while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
515           charinfo3.m_Index == charinfo2.m_Index) {
516      endIndex++;
517      if (endIndex >= m_charList.GetSize()) {
518        break;
519      }
520      charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
521    }
522    endIndex--;
523    nCount = endIndex - start + 1;
524  }
525}
526CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
527  if (!m_bIsParsed || nCount == 0)
528    return L"";
529
530  if (start < 0)
531    start = 0;
532
533  if (nCount == -1) {
534    nCount = m_charList.GetSize() - start;
535    return m_TextBuf.GetWideString().Mid(start,
536                                         m_TextBuf.GetWideString().GetLength());
537  }
538  if (nCount <= 0 || m_charList.GetSize() <= 0) {
539    return L"";
540  }
541  if (nCount + start > m_charList.GetSize() - 1) {
542    nCount = m_charList.GetSize() - start;
543  }
544  if (nCount <= 0) {
545    return L"";
546  }
547  CheckMarkedContentObject(start, nCount);
548  int startindex = 0;
549  PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
550  int startOffset = 0;
551  while (charinfo.m_Index == -1) {
552    startOffset++;
553    if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) {
554      return L"";
555    }
556    charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
557  }
558  startindex = charinfo.m_Index;
559  charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
560  int nCountOffset = 0;
561  while (charinfo.m_Index == -1) {
562    nCountOffset++;
563    if (nCountOffset >= nCount) {
564      return L"";
565    }
566    charinfo =
567        *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1);
568  }
569  nCount = start + nCount - nCountOffset - startindex;
570  if (nCount <= 0) {
571    return L"";
572  }
573  return m_TextBuf.GetWideString().Mid(startindex, nCount);
574}
575int CPDF_TextPage::CountRects(int start, int nCount) {
576  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed || start < 0)
577    return -1;
578
579  if (nCount == -1 || nCount + start > m_charList.GetSize()) {
580    nCount = m_charList.GetSize() - start;
581  }
582  m_SelRects.RemoveAll();
583  GetRectArray(start, nCount, m_SelRects);
584  return m_SelRects.GetSize();
585}
586void CPDF_TextPage::GetRect(int rectIndex,
587                            FX_FLOAT& left,
588                            FX_FLOAT& top,
589                            FX_FLOAT& right,
590                            FX_FLOAT& bottom) const {
591  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
592    return;
593
594  if (rectIndex < 0 || rectIndex >= m_SelRects.GetSize())
595    return;
596
597  left = m_SelRects.GetAt(rectIndex).left;
598  top = m_SelRects.GetAt(rectIndex).top;
599  right = m_SelRects.GetAt(rectIndex).right;
600  bottom = m_SelRects.GetAt(rectIndex).bottom;
601}
602
603FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) {
604  if (m_ParseOptions.m_bGetCharCodeOnly) {
605    return FALSE;
606  }
607  if (end == start) {
608    return FALSE;
609  }
610  FPDF_CHAR_INFO info_start;
611  FPDF_CHAR_INFO info_end;
612  GetCharInfo(start, &info_start);
613  GetCharInfo(end, &info_end);
614  while (info_end.m_CharBox.Width() == 0 || info_end.m_CharBox.Height() == 0) {
615    if (--end <= start)
616      return FALSE;
617
618    GetCharInfo(end, &info_end);
619  }
620  FX_FLOAT dx = (info_end.m_OriginX - info_start.m_OriginX);
621  FX_FLOAT dy = (info_end.m_OriginY - info_start.m_OriginY);
622  if (dx == 0) {
623    if (dy > 0) {
624      Rotate = 90;
625    } else if (dy < 0) {
626      Rotate = 270;
627    } else {
628      Rotate = 0;
629    }
630  } else {
631    float a = FXSYS_atan2(dy, dx);
632    Rotate = (int)(a * 180 / FX_PI + 0.5);
633  }
634  if (Rotate < 0) {
635    Rotate = -Rotate;
636  } else if (Rotate > 0) {
637    Rotate = 360 - Rotate;
638  }
639  return TRUE;
640}
641
642FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect,
643                                         int& Rotate) {
644  if (m_ParseOptions.m_bGetCharCodeOnly) {
645    return FALSE;
646  }
647  int start, end, count,
648      n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom,
649                               TRUE);
650  if (n < 1) {
651    return FALSE;
652  }
653  if (n > 1) {
654    GetBoundedSegment(n - 1, start, count);
655    end = start + count - 1;
656    GetBoundedSegment(0, start, count);
657  } else {
658    GetBoundedSegment(0, start, count);
659    end = start + count - 1;
660  }
661  return GetBaselineRotate(start, end, Rotate);
662}
663FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) {
664  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
665    return FALSE;
666
667  if (rectIndex < 0 || rectIndex > m_SelRects.GetSize())
668    return FALSE;
669
670  CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
671  return GetBaselineRotate(rect, Rotate);
672}
673int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left,
674                                        FX_FLOAT top,
675                                        FX_FLOAT right,
676                                        FX_FLOAT bottom,
677                                        FX_BOOL bContains) {
678  if (m_ParseOptions.m_bGetCharCodeOnly)
679    return -1;
680
681  m_Segment.RemoveAll();
682  if (!m_bIsParsed)
683    return -1;
684
685  CFX_FloatRect rect(left, bottom, right, top);
686  rect.Normalize();
687  int nCount = m_charList.GetSize();
688  int pos = 0;
689  FPDF_SEGMENT segment;
690  segment.m_Start = 0;
691  segment.m_nCount = 0;
692  int segmentStatus = 0;
693  FX_BOOL IsContainPreChar = FALSE;
694  while (pos < nCount) {
695    PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
696    if (bContains && rect.Contains(charinfo.m_CharBox)) {
697      if (segmentStatus == 0 || segmentStatus == 2) {
698        segment.m_Start = pos;
699        segment.m_nCount = 1;
700        segmentStatus = 1;
701      } else if (segmentStatus == 1) {
702        segment.m_nCount++;
703      }
704      IsContainPreChar = TRUE;
705    } else if (!bContains &&
706               (IsRectIntersect(rect, charinfo.m_CharBox) ||
707                rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
708      if (segmentStatus == 0 || segmentStatus == 2) {
709        segment.m_Start = pos;
710        segment.m_nCount = 1;
711        segmentStatus = 1;
712      } else if (segmentStatus == 1) {
713        segment.m_nCount++;
714      }
715      IsContainPreChar = TRUE;
716    } else if (charinfo.m_Unicode == 32) {
717      if (IsContainPreChar == TRUE) {
718        if (segmentStatus == 0 || segmentStatus == 2) {
719          segment.m_Start = pos;
720          segment.m_nCount = 1;
721          segmentStatus = 1;
722        } else if (segmentStatus == 1) {
723          segment.m_nCount++;
724        }
725        IsContainPreChar = FALSE;
726      } else {
727        if (segmentStatus == 1) {
728          segmentStatus = 2;
729          m_Segment.Add(segment);
730          segment.m_Start = 0;
731          segment.m_nCount = 0;
732        }
733      }
734    } else {
735      if (segmentStatus == 1) {
736        segmentStatus = 2;
737        m_Segment.Add(segment);
738        segment.m_Start = 0;
739        segment.m_nCount = 0;
740      }
741      IsContainPreChar = FALSE;
742    }
743    pos++;
744  }
745  if (segmentStatus == 1) {
746    segmentStatus = 2;
747    m_Segment.Add(segment);
748    segment.m_Start = 0;
749    segment.m_nCount = 0;
750  }
751  return m_Segment.GetSize();
752}
753void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const {
754  if (m_ParseOptions.m_bGetCharCodeOnly) {
755    return;
756  }
757  if (index < 0 || index >= m_Segment.GetSize()) {
758    return;
759  }
760  start = m_Segment.GetAt(index).m_Start;
761  count = m_Segment.GetAt(index).m_nCount;
762}
763int CPDF_TextPage::GetWordBreak(int index, int direction) const {
764  if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
765    return -1;
766
767  if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT)
768    return -1;
769
770  if (index < 0 || index >= m_charList.GetSize())
771    return -1;
772
773  PAGECHAR_INFO charinfo;
774  charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
775  if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
776    return index;
777  }
778  if (!IsLetter(charinfo.m_Unicode)) {
779    return index;
780  }
781  int breakPos = index;
782  if (direction == FPDFTEXT_LEFT) {
783    while (--breakPos > 0) {
784      charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
785      if (!IsLetter(charinfo.m_Unicode)) {
786        return breakPos;
787      }
788    }
789  } else if (direction == FPDFTEXT_RIGHT) {
790    while (++breakPos < m_charList.GetSize()) {
791      charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
792      if (!IsLetter(charinfo.m_Unicode)) {
793        return breakPos;
794      }
795    }
796  }
797  return breakPos;
798}
799int32_t CPDF_TextPage::FindTextlineFlowDirection() {
800  if (!m_pPage) {
801    return -1;
802  }
803  const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth();
804  const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight();
805  CFX_ByteArray nHorizontalMask;
806  if (!nHorizontalMask.SetSize(nPageWidth)) {
807    return -1;
808  }
809  uint8_t* pDataH = nHorizontalMask.GetData();
810  CFX_ByteArray nVerticalMask;
811  if (!nVerticalMask.SetSize(nPageHeight)) {
812    return -1;
813  }
814  uint8_t* pDataV = nVerticalMask.GetData();
815  int32_t index = 0;
816  FX_FLOAT fLineHeight = 0.0f;
817  CPDF_PageObject* pPageObj = NULL;
818  FX_POSITION pos = NULL;
819  pos = m_pPage->GetFirstObjectPosition();
820  if (!pos) {
821    return -1;
822  }
823  while (pos) {
824    pPageObj = m_pPage->GetNextObject(pos);
825    if (NULL == pPageObj) {
826      continue;
827    }
828    if (PDFPAGE_TEXT != pPageObj->m_Type) {
829      continue;
830    }
831    int32_t minH =
832        (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left;
833    int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth
834                       ? nPageWidth
835                       : (int32_t)pPageObj->m_Right;
836    int32_t minV =
837        (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom;
838    int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight
839                       ? nPageHeight
840                       : (int32_t)pPageObj->m_Top;
841    if (minH >= maxH || minV >= maxV) {
842      continue;
843    }
844    FXSYS_memset(pDataH + minH, 1, maxH - minH);
845    FXSYS_memset(pDataV + minV, 1, maxV - minV);
846    if (fLineHeight <= 0.0f) {
847      fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
848    }
849    pPageObj = NULL;
850  }
851  int32_t nStartH = 0;
852  int32_t nEndH = 0;
853  FX_FLOAT nSumH = 0.0f;
854  for (index = 0; index < nPageWidth; index++)
855    if (1 == nHorizontalMask[index]) {
856      break;
857    }
858  nStartH = index;
859  for (index = nPageWidth; index > 0; index--)
860    if (1 == nHorizontalMask[index - 1]) {
861      break;
862    }
863  nEndH = index;
864  for (index = nStartH; index < nEndH; index++) {
865    nSumH += nHorizontalMask[index];
866  }
867  nSumH /= nEndH - nStartH;
868  int32_t nStartV = 0;
869  int32_t nEndV = 0;
870  FX_FLOAT nSumV = 0.0f;
871  for (index = 0; index < nPageHeight; index++)
872    if (1 == nVerticalMask[index]) {
873      break;
874    }
875  nStartV = index;
876  for (index = nPageHeight; index > 0; index--)
877    if (1 == nVerticalMask[index - 1]) {
878      break;
879    }
880  nEndV = index;
881  for (index = nStartV; index < nEndV; index++) {
882    nSumV += nVerticalMask[index];
883  }
884  nSumV /= nEndV - nStartV;
885  if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) {
886    return 0;
887  }
888  if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) {
889    return 1;
890  }
891  if (nSumH > 0.8f) {
892    return 0;
893  }
894  if (nSumH - nSumV > 0.0f) {
895    return 0;
896  }
897  if (nSumV - nSumH > 0.0f) {
898    return 1;
899  }
900  return -1;
901}
902void CPDF_TextPage::ProcessObject() {
903  CPDF_PageObject* pPageObj = NULL;
904  if (!m_pPage) {
905    return;
906  }
907  FX_POSITION pos;
908  pos = m_pPage->GetFirstObjectPosition();
909  if (!pos) {
910    return;
911  }
912  m_TextlineDir = FindTextlineFlowDirection();
913  int nCount = 0;
914  while (pos) {
915    pPageObj = m_pPage->GetNextObject(pos);
916    if (pPageObj) {
917      if (pPageObj->m_Type == PDFPAGE_TEXT) {
918        CFX_Matrix matrix;
919        ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
920        nCount++;
921      } else if (pPageObj->m_Type == PDFPAGE_FORM) {
922        CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
923        ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
924      }
925    }
926    pPageObj = NULL;
927  }
928  int count = m_LineObj.GetSize();
929  for (int i = 0; i < count; i++) {
930    ProcessTextObject(m_LineObj.GetAt(i));
931  }
932  m_LineObj.RemoveAll();
933  CloseTempLine();
934}
935void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
936                                      const CFX_Matrix& formMatrix) {
937  CPDF_PageObject* pPageObj = NULL;
938  FX_POSITION pos;
939  if (!pFormObj) {
940    return;
941  }
942  pos = pFormObj->m_pForm->GetFirstObjectPosition();
943  if (!pos) {
944    return;
945  }
946  CFX_Matrix curFormMatrix;
947  curFormMatrix.Copy(pFormObj->m_FormMatrix);
948  curFormMatrix.Concat(formMatrix);
949  while (pos) {
950    pPageObj = pFormObj->m_pForm->GetNextObject(pos);
951    if (pPageObj) {
952      if (pPageObj->m_Type == PDFPAGE_TEXT) {
953        ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos);
954      } else if (pPageObj->m_Type == PDFPAGE_FORM) {
955        ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
956      }
957    }
958    pPageObj = NULL;
959  }
960}
961int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const {
962  if (charCode == -1) {
963    return 0;
964  }
965  int w = pFont->GetCharWidthF(charCode);
966  if (w == 0) {
967    CFX_ByteString str;
968    pFont->AppendChar(str, charCode);
969    w = pFont->GetStringWidth(str, 1);
970    if (w == 0) {
971      FX_RECT BBox;
972      pFont->GetCharBBox(charCode, BBox);
973      w = BBox.right - BBox.left;
974    }
975  }
976  return w;
977}
978void CPDF_TextPage::OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str) {
979  int32_t start, count;
980  CFX_BidiChar::Direction ret = pBidi->GetBidiInfo(&start, &count);
981  if (ret == CFX_BidiChar::RIGHT) {
982    for (int i = start + count - 1; i >= start; i--) {
983      m_TextBuf.AppendChar(str.GetAt(i));
984      m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
985    }
986  } else {
987    int end = start + count;
988    for (int i = start; i < end; i++) {
989      m_TextBuf.AppendChar(str.GetAt(i));
990      m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
991    }
992  }
993}
994void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) {
995  PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
996  FX_WCHAR wChar = str.GetAt(i);
997  if (!IsControlChar(Info)) {
998    Info.m_Index = m_TextBuf.GetLength();
999    if (wChar >= 0xFB00 && wChar <= 0xFB06) {
1000      FX_WCHAR* pDst = NULL;
1001      FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1002      if (nCount >= 1) {
1003        pDst = FX_Alloc(FX_WCHAR, nCount);
1004        FX_Unicode_GetNormalization(wChar, pDst);
1005        for (int nIndex = 0; nIndex < nCount; nIndex++) {
1006          PAGECHAR_INFO Info2 = Info;
1007          Info2.m_Unicode = pDst[nIndex];
1008          Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1009          m_TextBuf.AppendChar(Info2.m_Unicode);
1010          if (!m_ParseOptions.m_bGetCharCodeOnly) {
1011            m_charList.Add(Info2);
1012          }
1013        }
1014        FX_Free(pDst);
1015        return;
1016      }
1017    }
1018    m_TextBuf.AppendChar(wChar);
1019  } else {
1020    Info.m_Index = -1;
1021  }
1022  if (!m_ParseOptions.m_bGetCharCodeOnly) {
1023    m_charList.Add(Info);
1024  }
1025}
1026void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) {
1027  PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1028  if (!IsControlChar(Info)) {
1029    Info.m_Index = m_TextBuf.GetLength();
1030    FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
1031    FX_WCHAR* pDst = NULL;
1032    FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1033    if (nCount >= 1) {
1034      pDst = FX_Alloc(FX_WCHAR, nCount);
1035      FX_Unicode_GetNormalization(wChar, pDst);
1036      for (int nIndex = 0; nIndex < nCount; nIndex++) {
1037        PAGECHAR_INFO Info2 = Info;
1038        Info2.m_Unicode = pDst[nIndex];
1039        Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1040        m_TextBuf.AppendChar(Info2.m_Unicode);
1041        if (!m_ParseOptions.m_bGetCharCodeOnly) {
1042          m_charList.Add(Info2);
1043        }
1044      }
1045      FX_Free(pDst);
1046      return;
1047    }
1048    Info.m_Unicode = wChar;
1049    m_TextBuf.AppendChar(Info.m_Unicode);
1050  } else {
1051    Info.m_Index = -1;
1052  }
1053  if (!m_ParseOptions.m_bGetCharCodeOnly) {
1054    m_charList.Add(Info);
1055  }
1056}
1057void CPDF_TextPage::CloseTempLine() {
1058  int count1 = m_TempCharList.GetSize();
1059  if (count1 <= 0) {
1060    return;
1061  }
1062  std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
1063  CFX_WideString str = m_TempTextBuf.GetWideString();
1064  CFX_WordArray order;
1065  FX_BOOL bR2L = FALSE;
1066  int32_t start = 0, count = 0;
1067  int nR2L = 0, nL2R = 0;
1068  FX_BOOL bPrevSpace = FALSE;
1069  for (int i = 0; i < str.GetLength(); i++) {
1070    if (str.GetAt(i) == 32) {
1071      if (bPrevSpace) {
1072        m_TempTextBuf.Delete(i, 1);
1073        m_TempCharList.Delete(i);
1074        str.Delete(i);
1075        count1--;
1076        i--;
1077        continue;
1078      }
1079      bPrevSpace = TRUE;
1080    } else {
1081      bPrevSpace = FALSE;
1082    }
1083    if (pBidiChar->AppendChar(str.GetAt(i))) {
1084      CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1085      order.Add(start);
1086      order.Add(count);
1087      order.Add(ret);
1088      if (!bR2L) {
1089        if (ret == CFX_BidiChar::RIGHT) {
1090          nR2L++;
1091        } else if (ret == CFX_BidiChar::LEFT) {
1092          nL2R++;
1093        }
1094      }
1095    }
1096  }
1097  if (pBidiChar->EndChar()) {
1098    CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1099    order.Add(start);
1100    order.Add(count);
1101    order.Add(ret);
1102    if (!bR2L) {
1103      if (ret == CFX_BidiChar::RIGHT) {
1104        nR2L++;
1105      } else if (ret == CFX_BidiChar::LEFT) {
1106        nL2R++;
1107      }
1108    }
1109  }
1110  if (nR2L > 0 && nR2L >= nL2R) {
1111    bR2L = TRUE;
1112  }
1113  if (m_parserflag == FPDFTEXT_RLTB || bR2L) {
1114    int count = order.GetSize();
1115    for (int i = count - 1; i > 0; i -= 3) {
1116      int ret = order.GetAt(i);
1117      int start = order.GetAt(i - 2);
1118      int count1 = order.GetAt(i - 1);
1119      if (ret == 2 || ret == 0) {
1120        for (int j = start + count1 - 1; j >= start; j--) {
1121          AddCharInfoByRLDirection(str, j);
1122        }
1123      } else {
1124        int j = i;
1125        FX_BOOL bSymbol = FALSE;
1126        while (j > 0 && order.GetAt(j) != 2) {
1127          bSymbol = !order.GetAt(j);
1128          j -= 3;
1129        }
1130        int end = start + count1;
1131        int n = 0;
1132        if (bSymbol) {
1133          n = j + 6;
1134        } else {
1135          n = j + 3;
1136        }
1137        if (n >= i) {
1138          for (int m = start; m < end; m++) {
1139            AddCharInfoByLRDirection(str, m);
1140          }
1141        } else {
1142          j = i;
1143          i = n;
1144          for (; n <= j; n += 3) {
1145            int start = order.GetAt(n - 2);
1146            int count1 = order.GetAt(n - 1);
1147            int end = start + count1;
1148            for (int m = start; m < end; m++) {
1149              AddCharInfoByLRDirection(str, m);
1150            }
1151          }
1152        }
1153      }
1154    }
1155  } else {
1156    int count = order.GetSize();
1157    FX_BOOL bL2R = FALSE;
1158    for (int i = 0; i < count; i += 3) {
1159      int ret = order.GetAt(i + 2);
1160      int start = order.GetAt(i);
1161      int count1 = order.GetAt(i + 1);
1162      if (ret == 2 || (i == 0 && ret == 0 && !bL2R)) {
1163        int j = i + 3;
1164        while (bR2L && j < count) {
1165          if (order.GetAt(j + 2) == 1) {
1166            break;
1167          } else {
1168            j += 3;
1169          }
1170        }
1171        if (j == 3) {
1172          i = -3;
1173          bL2R = TRUE;
1174          continue;
1175        }
1176        int end = m_TempCharList.GetSize() - 1;
1177        if (j < count) {
1178          end = order.GetAt(j) - 1;
1179        }
1180        i = j - 3;
1181        for (int n = end; n >= start; n--) {
1182          AddCharInfoByRLDirection(str, n);
1183        }
1184      } else {
1185        int end = start + count1;
1186        for (int n = start; n < end; n++) {
1187          AddCharInfoByLRDirection(str, n);
1188        }
1189      }
1190    }
1191  }
1192  order.RemoveAll();
1193  m_TempCharList.RemoveAll();
1194  m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
1195}
1196void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj,
1197                                      const CFX_Matrix& formMatrix,
1198                                      FX_POSITION ObjPos) {
1199  CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right,
1200                   pTextObj->m_Top);
1201  if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1202    return;
1203  }
1204  int count = m_LineObj.GetSize();
1205  PDFTEXT_Obj Obj;
1206  Obj.m_pTextObj = pTextObj;
1207  Obj.m_formMatrix = formMatrix;
1208  if (count == 0) {
1209    m_LineObj.Add(Obj);
1210    return;
1211  }
1212  if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
1213    return;
1214  }
1215  PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
1216  CPDF_TextObjectItem item;
1217  int nItem = prev_Obj.m_pTextObj->CountItems();
1218  prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
1219  FX_FLOAT prev_width =
1220      GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
1221      prev_Obj.m_pTextObj->GetFontSize() / 1000;
1222  CFX_Matrix prev_matrix;
1223  prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1224  prev_width = FXSYS_fabs(prev_width);
1225  prev_matrix.Concat(prev_Obj.m_formMatrix);
1226  prev_width = prev_matrix.TransformDistance(prev_width);
1227  pTextObj->GetItemInfo(0, &item);
1228  FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
1229                        pTextObj->GetFontSize() / 1000;
1230  this_width = FXSYS_fabs(this_width);
1231  CFX_Matrix this_matrix;
1232  pTextObj->GetTextMatrix(&this_matrix);
1233  this_width = FXSYS_fabs(this_width);
1234  this_matrix.Concat(formMatrix);
1235  this_width = this_matrix.TransformDistance(this_width);
1236  FX_FLOAT threshold =
1237      prev_width > this_width ? prev_width / 4 : this_width / 4;
1238  FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(),
1239           prev_y = prev_Obj.m_pTextObj->GetPosY();
1240  prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
1241  m_DisplayMatrix.Transform(prev_x, prev_y);
1242  FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
1243  formMatrix.Transform(this_x, this_y);
1244  m_DisplayMatrix.Transform(this_x, this_y);
1245  if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
1246    for (int i = 0; i < count; i++) {
1247      ProcessTextObject(m_LineObj.GetAt(i));
1248    }
1249    m_LineObj.RemoveAll();
1250    m_LineObj.Add(Obj);
1251    return;
1252  }
1253  int i = 0;
1254  if (m_ParseOptions.m_bNormalizeObjs) {
1255    for (i = count - 1; i >= 0; i--) {
1256      PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
1257      CFX_Matrix prev_matrix;
1258      prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1259      FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(),
1260               Prev_y = prev_Obj.m_pTextObj->GetPosY();
1261      prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
1262      m_DisplayMatrix.Transform(Prev_x, Prev_y);
1263      if (this_x >= Prev_x) {
1264        if (i == count - 1) {
1265          m_LineObj.Add(Obj);
1266        } else {
1267          m_LineObj.InsertAt(i + 1, Obj);
1268        }
1269        break;
1270      }
1271    }
1272    if (i < 0) {
1273      m_LineObj.InsertAt(0, Obj);
1274    }
1275  } else {
1276    m_LineObj.Add(Obj);
1277  }
1278}
1279int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
1280  CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1281  CPDF_ContentMarkData* pMarkData =
1282      (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1283  if (!pMarkData) {
1284    return FPDFTEXT_MC_PASS;
1285  }
1286  int nContentMark = pMarkData->CountItems();
1287  if (nContentMark < 1) {
1288    return FPDFTEXT_MC_PASS;
1289  }
1290  CFX_WideString actText;
1291  FX_BOOL bExist = FALSE;
1292  CPDF_Dictionary* pDict = NULL;
1293  int n = 0;
1294  for (n = 0; n < nContentMark; n++) {
1295    CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1296    CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1297    pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
1298    CPDF_String* temp =
1299        ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
1300    if (temp) {
1301      bExist = TRUE;
1302      actText = temp->GetUnicodeText();
1303    }
1304  }
1305  if (!bExist) {
1306    return FPDFTEXT_MC_PASS;
1307  }
1308  if (m_pPreTextObj) {
1309    if (CPDF_ContentMarkData* pPreMarkData =
1310            (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) {
1311      if (pPreMarkData->CountItems() == n) {
1312        CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
1313        if (pDict == item.GetParam()) {
1314          return FPDFTEXT_MC_DONE;
1315        }
1316      }
1317    }
1318  }
1319  CPDF_Font* pFont = pTextObj->GetFont();
1320  FX_STRSIZE nItems = actText.GetLength();
1321  if (nItems < 1) {
1322    return FPDFTEXT_MC_PASS;
1323  }
1324  bExist = FALSE;
1325  for (FX_STRSIZE i = 0; i < nItems; i++) {
1326    FX_WCHAR wChar = actText.GetAt(i);
1327    if (-1 == pFont->CharCodeFromUnicode(wChar)) {
1328      continue;
1329    } else {
1330      bExist = TRUE;
1331      break;
1332    }
1333  }
1334  if (!bExist) {
1335    return FPDFTEXT_MC_PASS;
1336  }
1337  bExist = FALSE;
1338  for (FX_STRSIZE i = 0; i < nItems; i++) {
1339    FX_WCHAR wChar = actText.GetAt(i);
1340    if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
1341      bExist = TRUE;
1342      break;
1343    }
1344  }
1345  if (!bExist) {
1346    return FPDFTEXT_MC_DONE;
1347  }
1348  return FPDFTEXT_MC_DELAY;
1349}
1350void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
1351  CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1352  CPDF_ContentMarkData* pMarkData =
1353      (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1354  if (!pMarkData) {
1355    return;
1356  }
1357  int nContentMark = pMarkData->CountItems();
1358  if (nContentMark < 1) {
1359    return;
1360  }
1361  CFX_WideString actText;
1362  CPDF_Dictionary* pDict = NULL;
1363  int n = 0;
1364  for (n = 0; n < nContentMark; n++) {
1365    CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1366    CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1367    pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
1368    CPDF_String* temp =
1369        ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
1370    if (temp) {
1371      actText = temp->GetUnicodeText();
1372    }
1373  }
1374  FX_STRSIZE nItems = actText.GetLength();
1375  if (nItems < 1) {
1376    return;
1377  }
1378  CPDF_Font* pFont = pTextObj->GetFont();
1379  CFX_Matrix formMatrix = Obj.m_formMatrix;
1380  CFX_Matrix matrix;
1381  pTextObj->GetTextMatrix(&matrix);
1382  matrix.Concat(formMatrix);
1383  FX_FLOAT fPosX = pTextObj->GetPosX();
1384  FX_FLOAT fPosY = pTextObj->GetPosY();
1385  int nCharInfoIndex = m_TextBuf.GetLength();
1386  CFX_FloatRect charBox;
1387  charBox.top = pTextObj->m_Top;
1388  charBox.left = pTextObj->m_Left;
1389  charBox.right = pTextObj->m_Right;
1390  charBox.bottom = pTextObj->m_Bottom;
1391  for (FX_STRSIZE k = 0; k < nItems; k++) {
1392    FX_WCHAR wChar = actText.GetAt(k);
1393    if (wChar <= 0x80 && !isprint(wChar)) {
1394      wChar = 0x20;
1395    }
1396    if (wChar >= 0xFFFD) {
1397      continue;
1398    }
1399    PAGECHAR_INFO charinfo;
1400    charinfo.m_OriginX = fPosX;
1401    charinfo.m_OriginY = fPosY;
1402    charinfo.m_Index = nCharInfoIndex;
1403    charinfo.m_Unicode = wChar;
1404    charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
1405    charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
1406    charinfo.m_pTextObj = pTextObj;
1407    charinfo.m_CharBox.top = charBox.top;
1408    charinfo.m_CharBox.left = charBox.left;
1409    charinfo.m_CharBox.right = charBox.right;
1410    charinfo.m_CharBox.bottom = charBox.bottom;
1411    charinfo.m_Matrix.Copy(matrix);
1412    m_TempTextBuf.AppendChar(wChar);
1413    m_TempCharList.Add(charinfo);
1414  }
1415}
1416void CPDF_TextPage::FindPreviousTextObject(void) {
1417  if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
1418    return;
1419  }
1420  PAGECHAR_INFO preChar;
1421  if (m_TempCharList.GetSize() >= 1) {
1422    preChar =
1423        *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1424  } else {
1425    preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
1426  }
1427  if (preChar.m_pTextObj) {
1428    m_pPreTextObj = preChar.m_pTextObj;
1429  }
1430}
1431void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
1432                                    int32_t iBufStartAppend) {
1433  int32_t i, j;
1434  i = iCharListStartAppend;
1435  j = m_TempCharList.GetSize() - 1;
1436  for (; i < j; i++, j--) {
1437    std::swap(m_TempCharList[i], m_TempCharList[j]);
1438    std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
1439  }
1440  FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
1441  i = iBufStartAppend;
1442  j = m_TempTextBuf.GetLength() - 1;
1443  for (; i < j; i++, j--) {
1444    std::swap(pTempBuffer[i], pTempBuffer[j]);
1445  }
1446}
1447FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
1448                                     const CPDF_Font* pFont,
1449                                     int nItems) const {
1450  std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
1451  int32_t nR2L = 0;
1452  int32_t nL2R = 0;
1453  int32_t start = 0, count = 0;
1454  CPDF_TextObjectItem item;
1455  for (int32_t i = 0; i < nItems; i++) {
1456    pTextObj->GetItemInfo(i, &item);
1457    if (item.m_CharCode == (FX_DWORD)-1) {
1458      continue;
1459    }
1460    CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1461    FX_WCHAR wChar = wstrItem.GetAt(0);
1462    if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1463      wChar = (FX_WCHAR)item.m_CharCode;
1464    }
1465    if (!wChar) {
1466      continue;
1467    }
1468    if (pBidiChar->AppendChar(wChar)) {
1469      CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1470      if (ret == CFX_BidiChar::RIGHT) {
1471        nR2L++;
1472      } else if (ret == CFX_BidiChar::LEFT) {
1473        nL2R++;
1474      }
1475    }
1476  }
1477  if (pBidiChar->EndChar()) {
1478    CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
1479    if (ret == CFX_BidiChar::RIGHT) {
1480      nR2L++;
1481    } else if (ret == CFX_BidiChar::LEFT) {
1482      nL2R++;
1483    }
1484  }
1485  return (nR2L > 0 && nR2L >= nL2R);
1486}
1487void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
1488  CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1489  if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1490    return;
1491  }
1492  CFX_Matrix formMatrix = Obj.m_formMatrix;
1493  CPDF_Font* pFont = pTextObj->GetFont();
1494  CFX_Matrix matrix;
1495  pTextObj->GetTextMatrix(&matrix);
1496  matrix.Concat(formMatrix);
1497  int32_t bPreMKC = PreMarkedContent(Obj);
1498  if (FPDFTEXT_MC_DONE == bPreMKC) {
1499    m_pPreTextObj = pTextObj;
1500    m_perMatrix.Copy(formMatrix);
1501    return;
1502  }
1503  int result = 0;
1504  if (m_pPreTextObj) {
1505    result = ProcessInsertObject(pTextObj, formMatrix);
1506    if (2 == result) {
1507      m_CurlineRect =
1508          CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1509                        Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1510    } else {
1511      m_CurlineRect.Union(
1512          CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1513                        Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
1514    }
1515    PAGECHAR_INFO generateChar;
1516    if (result == 1) {
1517      if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
1518        if (!formMatrix.IsIdentity()) {
1519          generateChar.m_Matrix.Copy(formMatrix);
1520        }
1521        m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1522        m_TempCharList.Add(generateChar);
1523      }
1524    } else if (result == 2) {
1525      CloseTempLine();
1526      if (m_TextBuf.GetSize()) {
1527        if (m_ParseOptions.m_bGetCharCodeOnly) {
1528          m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1529          m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1530        } else {
1531          if (GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
1532            m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1533            if (!formMatrix.IsIdentity()) {
1534              generateChar.m_Matrix.Copy(formMatrix);
1535            }
1536            m_charList.Add(generateChar);
1537          }
1538          if (GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
1539            m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1540            if (!formMatrix.IsIdentity()) {
1541              generateChar.m_Matrix.Copy(formMatrix);
1542            }
1543            m_charList.Add(generateChar);
1544          }
1545        }
1546      }
1547    } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
1548      int32_t nChars = pTextObj->CountChars();
1549      if (nChars == 1) {
1550        CPDF_TextObjectItem item;
1551        pTextObj->GetCharInfo(0, &item);
1552        CFX_WideString wstrItem =
1553            pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1554        if (wstrItem.IsEmpty()) {
1555          wstrItem += (FX_WCHAR)item.m_CharCode;
1556        }
1557        FX_WCHAR curChar = wstrItem.GetAt(0);
1558        if (0x2D == curChar || 0xAD == curChar) {
1559          return;
1560        }
1561      }
1562      while (m_TempTextBuf.GetSize() > 0 &&
1563             m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() -
1564                                                 1) == 0x20) {
1565        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1566        m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1567      }
1568      PAGECHAR_INFO* cha =
1569          (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1570      m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1571      cha->m_Unicode = 0x2;
1572      cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1573      m_TempTextBuf.AppendChar(0xfffe);
1574    }
1575  } else {
1576    m_CurlineRect =
1577        CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1578                      Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1579  }
1580  if (FPDFTEXT_MC_DELAY == bPreMKC) {
1581    ProcessMarkedContent(Obj);
1582    m_pPreTextObj = pTextObj;
1583    m_perMatrix.Copy(formMatrix);
1584    return;
1585  }
1586  m_pPreTextObj = pTextObj;
1587  m_perMatrix.Copy(formMatrix);
1588  int nItems = pTextObj->CountItems();
1589  FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix);
1590
1591  const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1592  const FX_BOOL bIsBidiAndMirrorInverse =
1593      bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1594  int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1595  int32_t iCharListStartAppend = m_TempCharList.GetSize();
1596
1597  FX_FLOAT spacing = 0;
1598  for (int i = 0; i < nItems; i++) {
1599    CPDF_TextObjectItem item;
1600    PAGECHAR_INFO charinfo;
1601    charinfo.m_OriginX = 0;
1602    charinfo.m_OriginY = 0;
1603    pTextObj->GetItemInfo(i, &item);
1604    if (item.m_CharCode == (FX_DWORD)-1) {
1605      CFX_WideString str = m_TempTextBuf.GetWideString();
1606      if (str.IsEmpty()) {
1607        str = m_TextBuf.GetWideString();
1608      }
1609      if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1610        continue;
1611      }
1612      FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1613      spacing = -fontsize_h * item.m_OriginX / 1000;
1614      continue;
1615    }
1616    FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace;
1617    if (charSpace > 0.001) {
1618      spacing += matrix.TransformDistance(charSpace);
1619    } else if (charSpace < -0.001) {
1620      spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1621    }
1622    spacing -= baseSpace;
1623    if (spacing && i > 0) {
1624      int last_width = 0;
1625      FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1626      FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
1627      FX_FLOAT threshold = 0;
1628      if (space_charcode != -1) {
1629        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1630      }
1631      if (threshold > fontsize_h / 3) {
1632        threshold = 0;
1633      } else {
1634        threshold /= 2;
1635      }
1636      if (threshold == 0) {
1637        threshold = fontsize_h;
1638        int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1639        threshold = this_width > last_width ? (FX_FLOAT)this_width
1640                                            : (FX_FLOAT)last_width;
1641        threshold = _NormalizeThreshold(threshold);
1642        threshold = fontsize_h * threshold / 1000;
1643      }
1644      if (threshold && (spacing && spacing >= threshold)) {
1645        charinfo.m_Unicode = TEXT_BLANK_CHAR;
1646        charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1647        charinfo.m_pTextObj = pTextObj;
1648        charinfo.m_Index = m_TextBuf.GetLength();
1649        m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1650        charinfo.m_CharCode = -1;
1651        charinfo.m_Matrix.Copy(formMatrix);
1652        matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1653                         charinfo.m_OriginY);
1654        charinfo.m_CharBox =
1655            CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY,
1656                          charinfo.m_OriginX, charinfo.m_OriginY);
1657        m_TempCharList.Add(charinfo);
1658      }
1659      if (item.m_CharCode == (FX_DWORD)-1) {
1660        continue;
1661      }
1662    }
1663    spacing = 0;
1664    CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1665    FX_BOOL bNoUnicode = FALSE;
1666    FX_WCHAR wChar = wstrItem.GetAt(0);
1667    if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1668      if (wstrItem.IsEmpty()) {
1669        wstrItem += (FX_WCHAR)item.m_CharCode;
1670      } else {
1671        wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
1672      }
1673      bNoUnicode = TRUE;
1674    }
1675    charinfo.m_Index = -1;
1676    charinfo.m_CharCode = item.m_CharCode;
1677    if (bNoUnicode) {
1678      charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1679    } else {
1680      charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1681    }
1682    charinfo.m_pTextObj = pTextObj;
1683    charinfo.m_OriginX = 0, charinfo.m_OriginY = 0;
1684    matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1685                     charinfo.m_OriginY);
1686    FX_RECT rect(0, 0, 0, 0);
1687    rect.Intersect(0, 0, 0, 0);
1688    charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect);
1689    charinfo.m_CharBox.top =
1690        rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1691    charinfo.m_CharBox.left =
1692        rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1693    charinfo.m_CharBox.right =
1694        rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1695    charinfo.m_CharBox.bottom =
1696        rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1697    if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1698      charinfo.m_CharBox.top =
1699          charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1700    }
1701    if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1702      charinfo.m_CharBox.right =
1703          charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1704    }
1705    matrix.TransformRect(charinfo.m_CharBox);
1706    charinfo.m_Matrix.Copy(matrix);
1707    if (wstrItem.IsEmpty()) {
1708      charinfo.m_Unicode = 0;
1709      m_TempCharList.Add(charinfo);
1710      m_TempTextBuf.AppendChar(0xfffe);
1711      continue;
1712    } else {
1713      int nTotal = wstrItem.GetLength();
1714      FX_BOOL bDel = FALSE;
1715      const int count = std::min(m_TempCharList.GetSize(), 7);
1716      FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1717          (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1718      for (int n = m_TempCharList.GetSize();
1719           n > m_TempCharList.GetSize() - count; n--) {
1720        PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1);
1721        if (charinfo1->m_CharCode == charinfo.m_CharCode &&
1722            charinfo1->m_pTextObj->GetFont() ==
1723                charinfo.m_pTextObj->GetFont() &&
1724            FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold &&
1725            FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) {
1726          bDel = TRUE;
1727          break;
1728        }
1729      }
1730      if (!bDel) {
1731        for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1732          charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1733          if (charinfo.m_Unicode) {
1734            charinfo.m_Index = m_TextBuf.GetLength();
1735            m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1736          } else {
1737            m_TempTextBuf.AppendChar(0xfffe);
1738          }
1739          m_TempCharList.Add(charinfo);
1740        }
1741      } else if (i == 0) {
1742        CFX_WideString str = m_TempTextBuf.GetWideString();
1743        if (!str.IsEmpty() &&
1744            str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1745          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1746          m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1747        }
1748      }
1749    }
1750  }
1751  if (bIsBidiAndMirrorInverse) {
1752    SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1753  }
1754}
1755int32_t CPDF_TextPage::GetTextObjectWritingMode(
1756    const CPDF_TextObject* pTextObj) {
1757  int32_t nChars = pTextObj->CountChars();
1758  if (nChars == 1) {
1759    return m_TextlineDir;
1760  }
1761  CPDF_TextObjectItem first, last;
1762  pTextObj->GetCharInfo(0, &first);
1763  pTextObj->GetCharInfo(nChars - 1, &last);
1764  CFX_Matrix textMatrix;
1765  pTextObj->GetTextMatrix(&textMatrix);
1766  textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
1767  textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
1768  FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
1769  FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
1770  if (dX <= 0.0001f && dY <= 0.0001f) {
1771    return -1;
1772  }
1773  CFX_VectorF v;
1774  v.Set(dX, dY);
1775  v.Normalize();
1776  if (v.y <= 0.0872f) {
1777    return v.x <= 0.0872f ? m_TextlineDir : 0;
1778  }
1779  if (v.x <= 0.0872f) {
1780    return 1;
1781  }
1782  return m_TextlineDir;
1783}
1784FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1785  CFX_WideString strCurText = m_TempTextBuf.GetWideString();
1786  if (strCurText.GetLength() == 0) {
1787    strCurText = m_TextBuf.GetWideString();
1788  }
1789  FX_STRSIZE nCount = strCurText.GetLength();
1790  int nIndex = nCount - 1;
1791  FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1792  while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) {
1793    wcTmp = strCurText.GetAt(--nIndex);
1794  }
1795  if (0x2D == wcTmp || 0xAD == wcTmp) {
1796    if (--nIndex > 0) {
1797      FX_WCHAR preChar = strCurText.GetAt((nIndex));
1798      if (((preChar >= L'A' && preChar <= L'Z') ||
1799           (preChar >= L'a' && preChar <= L'z')) &&
1800          ((curChar >= L'A' && curChar <= L'Z') ||
1801           (curChar >= L'a' && curChar <= L'z'))) {
1802        return TRUE;
1803      }
1804    }
1805    int size = m_TempCharList.GetSize();
1806    PAGECHAR_INFO preChar;
1807    if (size) {
1808      preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
1809    } else {
1810      size = m_charList.GetSize();
1811      if (size == 0) {
1812        return FALSE;
1813      }
1814      preChar = (PAGECHAR_INFO)m_charList[size - 1];
1815    }
1816    if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag &&
1817        (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode)) {
1818      return TRUE;
1819    }
1820  }
1821  return FALSE;
1822}
// Classifies how |pObj| relates to the previously processed text object
// (m_pPreTextObj, after FindPreviousTextObject() has been called).
//
// Return values produced by this function:
//   0 - no separation detected.
//   1 - the horizontal gap between the end of the previous object's last item
//       and the start of |pObj| exceeds the computed threshold, and neither
//       adjoining character is a space.
//   2 - |pObj| is judged to start a new line relative to the previous object.
//   3 - a new line was detected (or |pObj| is a single hyphen character) and
//       IsHyphen() reports a hyphenated word break.
// NOTE(review): how callers act on these codes is not visible in this block.
//
// |formMatrix| is applied to the position of |pObj| before that position is
// mapped through the reverse of the previous object's text matrix.
// NOTE(review): m_pPreTextObj is dereferenced unconditionally; presumably the
// caller guarantees a previous text object exists - confirm.
int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj,
                                       const CFX_Matrix& formMatrix) {
  FindPreviousTextObject();
  FX_BOOL bNewline = FALSE;
  int WritingMode = GetTextObjectWritingMode(pObj);
  if (WritingMode == -1) {
    // Fall back to the writing mode of the previous object.
    WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
  }
  CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right,
                          pObj->m_Top);
  CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
                          m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
  // PrevItem: last item of the previous object; item: first item of |pObj|.
  CPDF_TextObjectItem PrevItem, item;
  int nItem = m_pPreTextObj->CountItems();
  m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
  pObj->GetItemInfo(0, &item);
  CFX_WideString wstrItem =
      pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
  if (wstrItem.IsEmpty()) {
    // No Unicode mapping: use the raw char code as the character.
    wstrItem += (FX_WCHAR)item.m_CharCode;
  }
  FX_WCHAR curChar = wstrItem.GetAt(0);
  if (WritingMode == 0) {
    // Mode 0: if the vertical extents of the two boxes do not overlap, treat
    // |pObj| as being on a new line.
    if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
      FX_FLOAT top =
          this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
      FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
                                                            : prev_rect.bottom;
      if (bottom >= top) {
        if (IsHyphen(curChar)) {
          return 3;
        }
        return 2;
      }
    }
  } else if (WritingMode == 1) {
    // Mode 1: if the horizontal extent of |pObj| does not overlap
    // m_CurlineRect, treat it as a new line.
    if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
        prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
      FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
                                                          : m_CurlineRect.left;
      FX_FLOAT right = this_rect.right < m_CurlineRect.right
                           ? this_rect.right
                           : m_CurlineRect.right;
      if (right <= left) {
        if (IsHyphen(curChar)) {
          return 3;
        }
        return 2;
      }
    }
  }
  // Widths of the two adjoining characters, scaled by font size / 1000.
  FX_FLOAT last_pos = PrevItem.m_OriginX;
  int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
  FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
  last_width = FXSYS_fabs(last_width);
  int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
  FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
  this_width = FXSYS_fabs(this_width);
  // Initial threshold: a quarter of the wider of the two characters.
  FX_FLOAT threshold =
      last_width > this_width ? last_width / 4 : this_width / 4;
  CFX_Matrix prev_matrix, prev_reverse;
  m_pPreTextObj->GetTextMatrix(&prev_matrix);
  prev_matrix.Concat(m_perMatrix);
  prev_reverse.SetReverse(prev_matrix);
  // (x, y): position of |pObj| transformed by |formMatrix| and then by
  // prev_reverse.
  FX_FLOAT x = pObj->GetPosX();
  FX_FLOAT y = pObj->GetPosY();
  formMatrix.Transform(x, y);
  prev_reverse.Transform(x, y);
  if (last_width < this_width) {
    threshold = prev_reverse.TransformDistance(threshold);
  }
  // rect1: horizontal span of the previous object with the vertical span of
  // |pObj|; rect2: the previous object's box; rect3: copy of rect1 taken
  // before the intersection.
  CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
                      m_pPreTextObj->m_Right, pObj->m_Top);
  CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
                      m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
  CFX_FloatRect rect3 = rect1;
  rect1.Intersect(rect2);
  if (WritingMode == 0) {
    if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
        ((y > threshold * 2 || y < threshold * -3) &&
         (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
      bNewline = TRUE;
      if (nItem > 1) {
        // The newline decision is withdrawn below when, under the matrix
        // conditions tested here, one object's position lies inside a
        // 0..1000 wide band spanning the other object's vertical extent.
        CPDF_TextObjectItem tempItem;
        m_pPreTextObj->GetItemInfo(0, &tempItem);
        CFX_Matrix m;
        m_pPreTextObj->GetTextMatrix(&m);
        if (PrevItem.m_OriginX > tempItem.m_OriginX &&
            m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
            m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
            m.c < 0.1) {
          CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
                           m_pPreTextObj->m_Top);
          if (re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
            bNewline = FALSE;
          } else {
            CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
            if (re.Contains(m_pPreTextObj->GetPosX(),
                            m_pPreTextObj->GetPosY())) {
              bNewline = FALSE;
            }
          }
        }
      }
    }
  }
  if (bNewline)
    return IsHyphen(curChar) ? 3 : 2;

  // A text object consisting of a single hyphen character.
  int32_t nChars = pObj->CountChars();
  if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
      IsHyphen(curChar)) {
    return 3;
  }
  CFX_WideString PrevStr =
      m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
  FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
  CFX_Matrix matrix;
  pObj->GetTextMatrix(&matrix);
  matrix.Concat(formMatrix);
  // Recompute the threshold from the wider glyph width: the divisor depends
  // on which width band the value falls in.
  threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
  threshold = threshold > 400
                  ? (threshold < 700
                         ? threshold / 4
                         : (threshold > 800 ? threshold / 6 : threshold / 5))
                  : (threshold / 2);
  if (nLastWidth >= nThisWidth) {
    threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
  } else {
    threshold *= FXSYS_fabs(pObj->GetFontSize());
    threshold = matrix.TransformDistance(threshold);
    threshold = prev_reverse.TransformDistance(threshold);
  }
  threshold /= 1000;
  // Two narrow threshold windows are enlarged by 50%.
  // NOTE(review): the rationale for these specific values is not visible here.
  if ((threshold < 1.4881 && threshold > 1.4879) ||
      (threshold < 1.39001 && threshold > 1.38999)) {
    threshold *= 1.5;
  }
  if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
      preChar != L' ') {
    // NOTE(review): this inner test repeats the space checks made just above.
    if (curChar != L' ' && preChar != L' ') {
      if ((x - last_pos - last_width) > threshold ||
          (last_pos - x - last_width) > threshold) {
        return 1;
      }
      if (x < 0 && (last_pos - x - last_width) > threshold) {
        return 1;
      }
      if ((x - last_pos - last_width) > this_width ||
          (x - last_pos - this_width) > last_width) {
        return 1;
      }
    }
  }
  return 0;
}
1979FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1980                                        CPDF_TextObject* pTextObj2) {
1981  if (!pTextObj1 || !pTextObj2) {
1982    return FALSE;
1983  }
1984  CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
1985                         pTextObj2->m_Right, pTextObj2->m_Top);
1986  CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
1987                         pTextObj1->m_Right, pTextObj1->m_Top);
1988  if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() &&
1989      !m_ParseOptions.m_bGetCharCodeOnly) {
1990    FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1991    int nCount = m_charList.GetSize();
1992    if (nCount >= 2) {
1993      PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
1994      FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1995      if (dbXdif > dbSpace) {
1996        return FALSE;
1997      }
1998    }
1999  }
2000  if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
2001    rcPreObj.Intersect(rcCurObj);
2002    if (rcPreObj.IsEmpty()) {
2003      return FALSE;
2004    }
2005    if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
2006        rcCurObj.Width() / 2) {
2007      return FALSE;
2008    }
2009    if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
2010      return FALSE;
2011    }
2012  }
2013  int nPreCount = pTextObj2->CountItems();
2014  int nCurCount = pTextObj1->CountItems();
2015  if (nPreCount != nCurCount) {
2016    return FALSE;
2017  }
2018  CPDF_TextObjectItem itemPer, itemCur;
2019  for (int i = 0; i < nPreCount; i++) {
2020    pTextObj2->GetItemInfo(i, &itemPer);
2021    pTextObj1->GetItemInfo(i, &itemCur);
2022    if (itemCur.m_CharCode != itemPer.m_CharCode) {
2023      return FALSE;
2024    }
2025  }
2026  if (FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) >
2027          GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) *
2028              pTextObj2->GetFontSize() / 1000 * 0.9 ||
2029      FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
2030          std::max(std::max(rcPreObj.Height(), rcPreObj.Width()),
2031                   pTextObj2->GetFontSize()) /
2032              8) {
2033    return FALSE;
2034  }
2035  return TRUE;
2036}
2037FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
2038                                             FX_POSITION ObjPos) {
2039  if (!pTextObj) {
2040    return FALSE;
2041  }
2042  int i = 0;
2043  if (!ObjPos) {
2044    ObjPos = m_pPage->GetLastObjectPosition();
2045  }
2046  CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
2047  while (i < 5 && ObjPos) {
2048    pObj = m_pPage->GetPrevObject(ObjPos);
2049    if (pObj == pTextObj) {
2050      continue;
2051    }
2052    if (pObj->m_Type != PDFPAGE_TEXT) {
2053      continue;
2054    }
2055    if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
2056      return TRUE;
2057    }
2058    i++;
2059  }
2060  return FALSE;
2061}
2062
2063FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
2064  int size = m_TempCharList.GetSize();
2065  PAGECHAR_INFO preChar;
2066  if (size) {
2067    preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
2068  } else {
2069    size = m_charList.GetSize();
2070    if (size == 0) {
2071      return FALSE;
2072    }
2073    preChar = (PAGECHAR_INFO)m_charList[size - 1];
2074  }
2075  info.m_Index = m_TextBuf.GetLength();
2076  info.m_Unicode = unicode;
2077  info.m_pTextObj = NULL;
2078  info.m_CharCode = -1;
2079  info.m_Flag = FPDFTEXT_CHAR_GENERATED;
2080  int preWidth = 0;
2081  if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD)-1)
2082    preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont());
2083
2084  FX_FLOAT fFontSize = preChar.m_pTextObj ? preChar.m_pTextObj->GetFontSize()
2085                                          : preChar.m_CharBox.Height();
2086  if (!fFontSize)
2087    fFontSize = kDefaultFontSize;
2088
2089  info.m_OriginX = preChar.m_OriginX + preWidth * (fFontSize) / 1000;
2090  info.m_OriginY = preChar.m_OriginY;
2091  info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX,
2092                                 info.m_OriginY);
2093  return TRUE;
2094}
2095
2096FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
2097                                       const CFX_FloatRect& rect2) {
2098  CFX_FloatRect rect = rect1;
2099  rect.Intersect(rect2);
2100  return !rect.IsEmpty();
2101}
2102FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) {
2103  if (unicode < L'A') {
2104    return FALSE;
2105  }
2106  if (unicode > L'Z' && unicode < L'a') {
2107    return FALSE;
2108  }
2109  if (unicode > L'z') {
2110    return FALSE;
2111  }
2112  return TRUE;
2113}
2114CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
2115    : m_pTextPage(pTextPage),
2116      m_flags(0),
2117      m_findNextStart(-1),
2118      m_findPreStart(-1),
2119      m_bMatchCase(FALSE),
2120      m_bMatchWholeWord(FALSE),
2121      m_resStart(0),
2122      m_resEnd(-1),
2123      m_IsFind(FALSE) {
2124  m_strText = m_pTextPage->GetPageText();
2125  int nCount = pTextPage->CountChars();
2126  if (nCount) {
2127    m_CharIndex.Add(0);
2128  }
2129  for (int i = 0; i < nCount; i++) {
2130    FPDF_CHAR_INFO info;
2131    pTextPage->GetCharInfo(i, &info);
2132    int indexSize = m_CharIndex.GetSize();
2133    if (info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) {
2134      if (indexSize % 2) {
2135        m_CharIndex.Add(1);
2136      } else {
2137        if (indexSize <= 0) {
2138          continue;
2139        }
2140        m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
2141      }
2142    } else {
2143      if (indexSize % 2) {
2144        if (indexSize <= 0) {
2145          continue;
2146        }
2147        m_CharIndex.SetAt(indexSize - 1, i + 1);
2148      } else {
2149        m_CharIndex.Add(i + 1);
2150      }
2151    }
2152  }
2153  int indexSize = m_CharIndex.GetSize();
2154  if (indexSize % 2) {
2155    m_CharIndex.RemoveAt(indexSize - 1);
2156  }
2157}
2158int CPDF_TextPageFind::GetCharIndex(int index) const {
2159  return m_pTextPage->CharIndexFromTextIndex(index);
2160  int indexSize = m_CharIndex.GetSize();
2161  int count = 0;
2162  for (int i = 0; i < indexSize; i += 2) {
2163    count += m_CharIndex.GetAt(i + 1);
2164    if (count > index) {
2165      return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
2166    }
2167  }
2168  return -1;
2169}
2170FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
2171                                     int flags,
2172                                     int startPos) {
2173  if (!m_pTextPage) {
2174    return FALSE;
2175  }
2176  if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
2177    m_strText = m_pTextPage->GetPageText();
2178  }
2179  CFX_WideString findwhatStr = findwhat;
2180  m_findWhat = findwhatStr;
2181  m_flags = flags;
2182  m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
2183  if (m_strText.IsEmpty()) {
2184    m_IsFind = FALSE;
2185    return TRUE;
2186  }
2187  FX_STRSIZE len = findwhatStr.GetLength();
2188  if (!m_bMatchCase) {
2189    findwhatStr.MakeLower();
2190    m_strText.MakeLower();
2191  }
2192  m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
2193  m_findNextStart = startPos;
2194  if (startPos == -1) {
2195    m_findPreStart = m_strText.GetLength() - 1;
2196  } else {
2197    m_findPreStart = startPos;
2198  }
2199  m_csFindWhatArray.RemoveAll();
2200  int i = 0;
2201  while (i < len) {
2202    if (findwhatStr.GetAt(i) != ' ') {
2203      break;
2204    }
2205    i++;
2206  }
2207  if (i < len) {
2208    ExtractFindWhat(findwhatStr);
2209  } else {
2210    m_csFindWhatArray.Add(findwhatStr);
2211  }
2212  if (m_csFindWhatArray.GetSize() <= 0) {
2213    return FALSE;
2214  }
2215  m_IsFind = TRUE;
2216  m_resStart = 0;
2217  m_resEnd = -1;
2218  return TRUE;
2219}
// Searches m_strText forward from m_findNextStart for the term sequence held
// in m_csFindWhatArray.
//
// On success the match is stored in m_resStart / m_resEnd (indices into
// m_strText), m_resArray receives the rectangles for the matched characters,
// m_findNextStart / m_findPreStart are advanced, and TRUE is returned.
// Returns FALSE when there is no text page, m_findNextStart is -1, the text
// is empty, the start lies beyond the text, or one of the terms cannot be
// found.
FX_BOOL CPDF_TextPageFind::FindNext() {
  if (!m_pTextPage) {
    return FALSE;
  }
  m_resArray.RemoveAll();
  if (m_findNextStart == -1) {
    return FALSE;
  }
  if (m_strText.IsEmpty()) {
    m_IsFind = FALSE;
    return m_IsFind;
  }
  int strLen = m_strText.GetLength();
  if (m_findNextStart > strLen - 1) {
    m_IsFind = FALSE;
    return m_IsFind;
  }
  int nCount = m_csFindWhatArray.GetSize();
  int nResultPos = 0;
  int nStartPos = 0;
  nStartPos = m_findNextStart;
  // Set when the first term is empty (and is not the only term).
  FX_BOOL bSpaceStart = FALSE;
  for (int iWord = 0; iWord < nCount; iWord++) {
    CFX_WideString csWord = m_csFindWhatArray[iWord];
    if (csWord.IsEmpty()) {
      if (iWord == nCount - 1) {
        // Trailing empty term: the character at nStartPos must be a line
        // feed, blank, carriage return or 160; otherwise start over.
        FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
        if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR ||
            strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
          nResultPos = nStartPos + 1;
          break;
        }
        // Restart with the first term (the loop increment makes it 0).
        iWord = -1;
      } else if (iWord == 0) {
        bSpaceStart = TRUE;
      }
      continue;
    }
    int endIndex;
    nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
    if (nResultPos == -1) {
      m_IsFind = FALSE;
      return m_IsFind;
    }
    endIndex = nResultPos + csWord.GetLength() - 1;
    if (iWord == 0) {
      m_resStart = nResultPos;
    }
    FX_BOOL bMatch = TRUE;
    if (iWord != 0 && !bSpaceStart) {
      int PreResEndPos = nStartPos;
      int curChar = csWord.GetAt(0);
      CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
      int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
      // Terms that touch directly are accepted only when the character on
      // one side of the junction satisfies _IsIgnoreSpaceCharacter().
      if (nStartPos == nResultPos &&
          !(_IsIgnoreSpaceCharacter(lastChar) ||
            _IsIgnoreSpaceCharacter(curChar))) {
        bMatch = FALSE;
      }
      // Everything between the previous term and this one must be a line
      // feed, blank, carriage return or 160.
      for (int d = PreResEndPos; d < nResultPos; d++) {
        FX_WCHAR strInsert = m_strText.GetAt(d);
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = FALSE;
          break;
        }
      }
    } else if (bSpaceStart) {
      // The character right before the hit must be one of the same four
      // characters; if so the match start is moved back onto it.
      if (nResultPos > 0) {
        FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = FALSE;
          m_resStart = nResultPos;
        } else {
          m_resStart = nResultPos - 1;
        }
      }
    }
    if (m_bMatchWholeWord && bMatch) {
      bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
    }
    nStartPos = endIndex + 1;
    if (!bMatch) {
      // Restart from the first term, resuming just past the position
      // recorded in m_resStart plus the length of the first non-empty term.
      iWord = -1;
      if (bSpaceStart) {
        nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
      } else {
        nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
      }
    }
  }
  m_resEnd = nResultPos +
             m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1;
  m_IsFind = TRUE;
  // Convert the text indices to character indices to fetch the rectangles.
  int resStart = GetCharIndex(m_resStart);
  int resEnd = GetCharIndex(m_resEnd);
  m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
  // With FPDFTEXT_CONSECUTIVE the next search starts one position after the
  // match start; otherwise it starts after the match end.
  if (m_flags & FPDFTEXT_CONSECUTIVE) {
    m_findNextStart = m_resStart + 1;
    m_findPreStart = m_resEnd - 1;
  } else {
    m_findNextStart = m_resEnd + 1;
    m_findPreStart = m_resStart - 1;
  }
  return m_IsFind;
}
2327FX_BOOL CPDF_TextPageFind::FindPrev() {
2328  if (!m_pTextPage) {
2329    return FALSE;
2330  }
2331  m_resArray.RemoveAll();
2332  if (m_strText.IsEmpty() || m_findPreStart < 0) {
2333    m_IsFind = FALSE;
2334    return m_IsFind;
2335  }
2336  CPDF_TextPageFind findEngine(m_pTextPage);
2337  FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
2338  if (!ret) {
2339    m_IsFind = FALSE;
2340    return m_IsFind;
2341  }
2342  int order = -1, MatchedCount = 0;
2343  while (ret) {
2344    ret = findEngine.FindNext();
2345    if (ret) {
2346      int order1 = findEngine.GetCurOrder();
2347      int MatchedCount1 = findEngine.GetMatchedCount();
2348      if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
2349        break;
2350      }
2351      order = order1;
2352      MatchedCount = MatchedCount1;
2353    }
2354  }
2355  if (order == -1) {
2356    m_IsFind = FALSE;
2357    return m_IsFind;
2358  }
2359  m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
2360  m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
2361  m_IsFind = TRUE;
2362  m_pTextPage->GetRectArray(order, MatchedCount, m_resArray);
2363  if (m_flags & FPDFTEXT_CONSECUTIVE) {
2364    m_findNextStart = m_resStart + 1;
2365    m_findPreStart = m_resEnd - 1;
2366  } else {
2367    m_findNextStart = m_resEnd + 1;
2368    m_findPreStart = m_resStart - 1;
2369  }
2370  return m_IsFind;
2371}
2372void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
2373  if (findwhat.IsEmpty()) {
2374    return;
2375  }
2376  int index = 0;
2377  while (1) {
2378    CFX_WideString csWord = TEXT_EMPTY;
2379    int ret =
2380        ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR);
2381    if (csWord.IsEmpty()) {
2382      if (ret) {
2383        m_csFindWhatArray.Add(CFX_WideString(L""));
2384        index++;
2385        continue;
2386      } else {
2387        break;
2388      }
2389    }
2390    int pos = 0;
2391    while (pos < csWord.GetLength()) {
2392      CFX_WideString curStr = csWord.Mid(pos, 1);
2393      FX_WCHAR curChar = csWord.GetAt(pos);
2394      if (_IsIgnoreSpaceCharacter(curChar)) {
2395        if (pos > 0 && curChar == 0x2019) {
2396          pos++;
2397          continue;
2398        }
2399        if (pos > 0) {
2400          CFX_WideString preStr = csWord.Mid(0, pos);
2401          m_csFindWhatArray.Add(preStr);
2402        }
2403        m_csFindWhatArray.Add(curStr);
2404        if (pos == csWord.GetLength() - 1) {
2405          csWord.Empty();
2406          break;
2407        }
2408        csWord = csWord.Right(csWord.GetLength() - pos - 1);
2409        pos = 0;
2410        continue;
2411      }
2412      pos++;
2413    }
2414    if (!csWord.IsEmpty()) {
2415      m_csFindWhatArray.Add(csWord);
2416    }
2417    index++;
2418  }
2419}
2420FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
2421                                            int startPos,
2422                                            int endPos) {
2423  FX_WCHAR char_left = 0;
2424  FX_WCHAR char_right = 0;
2425  int char_count = endPos - startPos + 1;
2426  if (char_count < 1) {
2427    return FALSE;
2428  }
2429  if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
2430    return TRUE;
2431  }
2432  if (startPos - 1 >= 0) {
2433    char_left = csPageText.GetAt(startPos - 1);
2434  }
2435  if (startPos + char_count < csPageText.GetLength()) {
2436    char_right = csPageText.GetAt(startPos + char_count);
2437  }
2438  if ((char_left > 'A' && char_left < 'a') ||
2439      (char_left > 'a' && char_left < 'z') ||
2440      (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
2441      (char_right > 'A' && char_right < 'a') ||
2442      (char_right > 'a' && char_right < 'z') ||
2443      (char_right > 0xfb00 && char_right < 0xfb06) ||
2444      std::iswdigit(char_right)) {
2445    return FALSE;
2446  }
2447  if (!(('A' > char_left || char_left > 'Z') &&
2448        ('a' > char_left || char_left > 'z') &&
2449        ('A' > char_right || char_right > 'Z') &&
2450        ('a' > char_right || char_right > 'z'))) {
2451    return FALSE;
2452  }
2453  if (char_count > 0) {
2454    if (csPageText.GetAt(startPos) >= L'0' &&
2455        csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
2456        char_left <= L'9') {
2457      return FALSE;
2458    }
2459    if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
2460        char_right >= L'0' && char_right <= L'9') {
2461      return FALSE;
2462    }
2463  }
2464  return TRUE;
2465}
2466FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
2467                                            const FX_WCHAR* lpszFullString,
2468                                            int iSubString,
2469                                            FX_WCHAR chSep) {
2470  if (!lpszFullString) {
2471    return FALSE;
2472  }
2473  while (iSubString--) {
2474    lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
2475    if (!lpszFullString) {
2476      rString.Empty();
2477      return FALSE;
2478    }
2479    lpszFullString++;
2480    while (*lpszFullString == chSep) {
2481      lpszFullString++;
2482    }
2483  }
2484  const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
2485  int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
2486                     : (int)FXSYS_wcslen(lpszFullString);
2487  ASSERT(nLen >= 0);
2488  FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
2489               nLen * sizeof(FX_WCHAR));
2490  rString.ReleaseBuffer();
2491  return TRUE;
2492}
2493CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
2494  CFX_WideString str2;
2495  str2.Empty();
2496  int nlen = str.GetLength();
2497  for (int i = nlen - 1; i >= 0; i--) {
2498    str2 += str.GetAt(i);
2499  }
2500  return str2;
2501}
// Copies the rectangles stored in m_resArray into |rects|.
void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const {
  rects.Copy(m_resArray);
}
2505int CPDF_TextPageFind::GetCurOrder() const {
2506  return GetCharIndex(m_resStart);
2507}
2508int CPDF_TextPageFind::GetMatchedCount() const {
2509  int resStart = GetCharIndex(m_resStart);
2510  int resEnd = GetCharIndex(m_resEnd);
2511  return resEnd - resStart + 1;
2512}
2513
// Creates an extractor with no text page attached and the parsed flag
// cleared.
CPDF_LinkExtract::CPDF_LinkExtract()
    : m_pTextPage(nullptr), m_bIsParsed(false) {
}
2517
// Releases every link record still held in m_LinkList.
CPDF_LinkExtract::~CPDF_LinkExtract() {
  DeleteLinkList();
}
2521
2522FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
2523  if (!pTextPage || !pTextPage->IsParsed())
2524    return FALSE;
2525
2526  m_pTextPage = (const CPDF_TextPage*)pTextPage;
2527  m_strPageText = m_pTextPage->GetPageText(0, -1);
2528  DeleteLinkList();
2529  if (m_strPageText.IsEmpty()) {
2530    return FALSE;
2531  }
2532  ParseLink();
2533  m_bIsParsed = true;
2534  return TRUE;
2535}
2536
2537void CPDF_LinkExtract::DeleteLinkList() {
2538  while (m_LinkList.GetSize()) {
2539    CPDF_LinkExt* linkinfo = NULL;
2540    linkinfo = m_LinkList.GetAt(0);
2541    m_LinkList.RemoveAt(0);
2542    delete linkinfo;
2543  }
2544  m_LinkList.RemoveAll();
2545}
2546int CPDF_LinkExtract::CountLinks() const {
2547  if (!m_bIsParsed) {
2548    return -1;
2549  }
2550  return m_LinkList.GetSize();
2551}
2552void CPDF_LinkExtract::ParseLink() {
2553  int start = 0, pos = 0;
2554  int TotalChar = m_pTextPage->CountChars();
2555  while (pos < TotalChar) {
2556    FPDF_CHAR_INFO pageChar;
2557    m_pTextPage->GetCharInfo(pos, &pageChar);
2558    if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 ||
2559        pos == TotalChar - 1) {
2560      int nCount = pos - start;
2561      if (pos == TotalChar - 1) {
2562        nCount++;
2563      }
2564      CFX_WideString strBeCheck;
2565      strBeCheck = m_pTextPage->GetPageText(start, nCount);
2566      if (strBeCheck.GetLength() > 5) {
2567        while (strBeCheck.GetLength() > 0) {
2568          FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2569          if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2570            strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2571            nCount--;
2572          } else {
2573            break;
2574          }
2575        }
2576        if (nCount > 5 &&
2577            (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
2578          AppendToLinkList(start, nCount, strBeCheck);
2579        }
2580      }
2581      start = ++pos;
2582    } else {
2583      pos++;
2584    }
2585  }
2586}
2587FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
2588  CFX_WideString str = strBeCheck;
2589  str.MakeLower();
2590  if (str.Find(L"http://www.") != -1) {
2591    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
2592    return TRUE;
2593  }
2594  if (str.Find(L"http://") != -1) {
2595    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2596    return TRUE;
2597  }
2598  if (str.Find(L"https://www.") != -1) {
2599    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
2600    return TRUE;
2601  }
2602  if (str.Find(L"https://") != -1) {
2603    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2604    return TRUE;
2605  }
2606  if (str.Find(L"www.") != -1) {
2607    strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2608    strBeCheck = L"http://" + strBeCheck;
2609    return TRUE;
2610  }
2611  return FALSE;
2612}
2613bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2614  int aPos = str.Find(L'@');
2615  // Invalid when no '@'.
2616  if (aPos < 1) {
2617    return FALSE;
2618  }
2619
2620  // Check the local part.
2621  int pPos = aPos;  // Used to track the position of '@' or '.'.
2622  for (int i = aPos - 1; i >= 0; i--) {
2623    FX_WCHAR ch = str.GetAt(i);
2624    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
2625      continue;
2626    }
2627    if (ch != L'.' || i == pPos - 1 || i == 0) {
2628      if (i == aPos - 1) {
2629        // There is '.' or invalid char before '@'.
2630        return FALSE;
2631      }
2632      // End extracting for other invalid chars, '.' at the beginning, or
2633      // consecutive '.'.
2634      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2635      str = str.Right(str.GetLength() - removed_len);
2636      break;
2637    }
2638    // Found a valid '.'.
2639    pPos = i;
2640  }
2641
2642  // Check the domain name part.
2643  aPos = str.Find(L'@');
2644  if (aPos < 1) {
2645    return FALSE;
2646  }
2647  str.TrimRight(L'.');
2648  // At least one '.' in domain name, but not at the beginning.
2649  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
2650  // Check whether we should remove this check.
2651  int ePos = str.Find(L'.', aPos + 1);
2652  if (ePos == -1 || ePos == aPos + 1) {
2653    return FALSE;
2654  }
2655  // Validate all other chars in domain name.
2656  int nLen = str.GetLength();
2657  pPos = 0;  // Used to track the position of '.'.
2658  for (int i = aPos + 1; i < nLen; i++) {
2659    FX_WCHAR wch = str.GetAt(i);
2660    if (wch == L'-' || FXSYS_iswalnum(wch)) {
2661      continue;
2662    }
2663    if (wch != L'.' || i == pPos + 1) {
2664      // Domain name should end before invalid char.
2665      int host_end = i == pPos + 1 ? i - 2 : i - 1;
2666      if (pPos > 0 && host_end - aPos >= 3) {
2667        // Trim the ending invalid chars if there is at least one '.' and name.
2668        str = str.Left(host_end + 1);
2669        break;
2670      }
2671      return FALSE;
2672    }
2673    pPos = i;
2674  }
2675
2676  if (str.Find(L"mailto:") == -1) {
2677    str = L"mailto:" + str;
2678  }
2679  return TRUE;
2680}
2681
2682void CPDF_LinkExtract::AppendToLinkList(int start,
2683                                        int count,
2684                                        const CFX_WideString& strUrl) {
2685  CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
2686  linkInfo->m_strUrl = strUrl;
2687  linkInfo->m_Start = start;
2688  linkInfo->m_Count = count;
2689  m_LinkList.Add(linkInfo);
2690}
2691
2692CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
2693  if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2694    return L"";
2695  }
2696  CPDF_LinkExt* link = NULL;
2697  link = m_LinkList.GetAt(index);
2698  if (!link) {
2699    return L"";
2700  }
2701  return link->m_strUrl;
2702}
2703void CPDF_LinkExtract::GetBoundedSegment(int index,
2704                                         int& start,
2705                                         int& count) const {
2706  if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2707    return;
2708  }
2709  CPDF_LinkExt* link = NULL;
2710  link = m_LinkList.GetAt(index);
2711  if (!link) {
2712    return;
2713  }
2714  start = link->m_Start;
2715  count = link->m_Count;
2716}
2717void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const {
2718  if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2719    return;
2720  }
2721  CPDF_LinkExt* link = NULL;
2722  link = m_LinkList.GetAt(index);
2723  if (!link) {
2724    return;
2725  }
2726  m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2727}
2728