cpdf_textpage.cpp revision 33357cad1fd1321a2b38d2963e2585f27ce980a2
1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdftext/cpdf_textpage.h"
8
9#include <algorithm>
10#include <utility>
11#include <vector>
12
13#include "core/fpdfapi/font/cpdf_font.h"
14#include "core/fpdfapi/page/cpdf_form.h"
15#include "core/fpdfapi/page/cpdf_formobject.h"
16#include "core/fpdfapi/page/cpdf_page.h"
17#include "core/fpdfapi/page/cpdf_pageobject.h"
18#include "core/fpdfapi/page/cpdf_textobject.h"
19#include "core/fpdfapi/parser/cpdf_dictionary.h"
20#include "core/fpdfapi/parser/cpdf_string.h"
21#include "core/fpdftext/unicodenormalizationdata.h"
22#include "core/fxcrt/fx_bidi.h"
23#include "core/fxcrt/fx_ext.h"
24#include "core/fxcrt/fx_ucd.h"
25#include "third_party/base/stl_util.h"
26
27namespace {
28
29const FX_FLOAT kDefaultFontSize = 1.0f;
30const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
31    nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
32    g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
33
34FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
35  if (threshold < 300)
36    return threshold / 2.0f;
37  if (threshold < 500)
38    return threshold / 4.0f;
39  if (threshold < 700)
40    return threshold / 5.0f;
41  return threshold / 6.0f;
42}
43
44FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj,
45                            const CFX_Matrix& matrix) {
46  FX_FLOAT baseSpace = 0.0;
47  const int nItems = pTextObj->CountItems();
48  if (pTextObj->m_TextState.GetCharSpace() && nItems >= 3) {
49    bool bAllChar = true;
50    FX_FLOAT spacing =
51        matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace());
52    baseSpace = spacing;
53    for (int i = 0; i < nItems; i++) {
54      CPDF_TextObjectItem item;
55      pTextObj->GetItemInfo(i, &item);
56      if (item.m_CharCode == static_cast<uint32_t>(-1)) {
57        FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
58        FX_FLOAT kerning = -fontsize_h * item.m_Origin.x / 1000;
59        baseSpace = std::min(baseSpace, kerning + spacing);
60        bAllChar = false;
61      }
62    }
63    if (baseSpace < 0.0 || (nItems == 3 && !bAllChar))
64      baseSpace = 0.0;
65  }
66  return baseSpace;
67}
68
69FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) {
70  wch = wch & 0xFFFF;
71  FX_WCHAR wFind = g_UnicodeData_Normalization[wch];
72  if (!wFind) {
73    if (pDst)
74      *pDst = wch;
75    return 1;
76  }
77  if (wFind >= 0x8000) {
78    wch = wFind - 0x8000;
79    wFind = 1;
80  } else {
81    wch = wFind & 0x0FFF;
82    wFind >>= 12;
83  }
84  const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind];
85  if (pMap == g_UnicodeData_Normalization_Map4) {
86    pMap = g_UnicodeData_Normalization_Map4 + wch;
87    wFind = (FX_WCHAR)(*pMap++);
88  } else {
89    pMap += wch;
90  }
91  if (pDst) {
92    FX_WCHAR n = wFind;
93    while (n--)
94      *pDst++ = *pMap++;
95  }
96  return (FX_STRSIZE)wFind;
97}
98
// Returns the fraction of set entries of |mask| in [start, end), or 0 when
// the range is empty or inverted.
float MaskPercentFilled(const std::vector<bool>& mask,
                        int32_t start,
                        int32_t end) {
  if (start >= end)
    return 0;

  int32_t nFilled = 0;
  for (int32_t i = start; i < end; ++i) {
    if (mask[i])
      ++nFilled;
  }
  return static_cast<float>(nFilled) / (end - start);
}
108
109}  // namespace
110
111FPDF_CHAR_INFO::FPDF_CHAR_INFO()
112    : m_Unicode(0),
113      m_Charcode(0),
114      m_Flag(0),
115      m_FontSize(0),
116      m_pTextObj(nullptr) {}
117
118FPDF_CHAR_INFO::~FPDF_CHAR_INFO() {}
119
120PAGECHAR_INFO::PAGECHAR_INFO()
121    : m_Index(0), m_CharCode(0), m_Unicode(0), m_Flag(0), m_pTextObj(nullptr) {}
122
123PAGECHAR_INFO::PAGECHAR_INFO(const PAGECHAR_INFO&) = default;
124
125PAGECHAR_INFO::~PAGECHAR_INFO() {}
126
127CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags)
128    : m_pPage(pPage),
129      m_parserflag(flags),
130      m_pPreTextObj(nullptr),
131      m_bIsParsed(false),
132      m_TextlineDir(TextOrientation::Unknown) {
133  m_TextBuf.EstimateSize(0, 10240);
134  m_DisplayMatrix =
135      pPage->GetDisplayMatrix(0, 0, static_cast<int>(pPage->GetPageWidth()),
136                              static_cast<int>(pPage->GetPageHeight()), 0);
137}
138
139CPDF_TextPage::~CPDF_TextPage() {}
140
141bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
142  switch (charInfo.m_Unicode) {
143    case 0x2:
144    case 0x3:
145    case 0x93:
146    case 0x94:
147    case 0x96:
148    case 0x97:
149    case 0x98:
150    case 0xfffe:
151      return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
152    default:
153      return false;
154  }
155}
156
157void CPDF_TextPage::ParseTextPage() {
158  m_bIsParsed = false;
159  m_TextBuf.Clear();
160  m_CharList.clear();
161  m_pPreTextObj = nullptr;
162  ProcessObject();
163
164  m_bIsParsed = true;
165  m_CharIndex.clear();
166  int nCount = pdfium::CollectionSize<int>(m_CharList);
167  if (nCount)
168    m_CharIndex.push_back(0);
169
170  for (int i = 0; i < nCount; i++) {
171    int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
172    const PAGECHAR_INFO& charinfo = m_CharList[i];
173    if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED ||
174        (charinfo.m_Unicode != 0 && !IsControlChar(charinfo))) {
175      if (indexSize % 2) {
176        m_CharIndex.push_back(1);
177      } else {
178        if (indexSize <= 0)
179          continue;
180        m_CharIndex[indexSize - 1] += 1;
181      }
182    } else {
183      if (indexSize % 2) {
184        if (indexSize <= 0)
185          continue;
186        m_CharIndex[indexSize - 1] = i + 1;
187      } else {
188        m_CharIndex.push_back(i + 1);
189      }
190    }
191  }
192  int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
193  if (indexSize % 2)
194    m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
195}
196
197int CPDF_TextPage::CountChars() const {
198  return pdfium::CollectionSize<int>(m_CharList);
199}
200
201int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
202  int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
203  int count = 0;
204  for (int i = 0; i < indexSize; i += 2) {
205    count += m_CharIndex[i + 1];
206    if (count > TextIndex)
207      return TextIndex - count + m_CharIndex[i + 1] + m_CharIndex[i];
208  }
209  return -1;
210}
211
212int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
213  int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
214  int count = 0;
215  for (int i = 0; i < indexSize; i += 2) {
216    count += m_CharIndex[i + 1];
217    if (m_CharIndex[i + 1] + m_CharIndex[i] > CharIndex) {
218      if (CharIndex - m_CharIndex[i] < 0)
219        return -1;
220
221      return CharIndex - m_CharIndex[i] + count - m_CharIndex[i + 1];
222    }
223  }
224  return -1;
225}
226
227std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start,
228                                                       int nCount) const {
229  if (start < 0 || nCount == 0 || !m_bIsParsed)
230    return std::vector<CFX_FloatRect>();
231
232  if (nCount + start > pdfium::CollectionSize<int>(m_CharList) ||
233      nCount == -1) {
234    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
235  }
236
237  std::vector<CFX_FloatRect> rectArray;
238  CPDF_TextObject* pCurObj = nullptr;
239  CFX_FloatRect rect;
240  int curPos = start;
241  bool bFlagNewRect = true;
242  while (nCount--) {
243    PAGECHAR_INFO info_curchar = m_CharList[curPos++];
244    if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED)
245      continue;
246    if (info_curchar.m_CharBox.Width() < 0.01 ||
247        info_curchar.m_CharBox.Height() < 0.01) {
248      continue;
249    }
250    if (!pCurObj)
251      pCurObj = info_curchar.m_pTextObj;
252    if (pCurObj != info_curchar.m_pTextObj) {
253      rectArray.push_back(rect);
254      pCurObj = info_curchar.m_pTextObj;
255      bFlagNewRect = true;
256    }
257    if (bFlagNewRect) {
258      CFX_Matrix matrix = info_curchar.m_pTextObj->GetTextMatrix();
259      matrix.Concat(info_curchar.m_Matrix);
260
261      CFX_Matrix matrix_reverse;
262      matrix_reverse.SetReverse(matrix);
263
264      CFX_PointF origin = matrix_reverse.Transform(info_curchar.m_Origin);
265      rect.left = info_curchar.m_CharBox.left;
266      rect.right = info_curchar.m_CharBox.right;
267      if (pCurObj->GetFont()->GetTypeDescent()) {
268        rect.bottom = origin.y +
269                      pCurObj->GetFont()->GetTypeDescent() *
270                          pCurObj->GetFontSize() / 1000;
271
272        rect.bottom = matrix.Transform(CFX_PointF(origin.x, rect.bottom)).y;
273      } else {
274        rect.bottom = info_curchar.m_CharBox.bottom;
275      }
276      if (pCurObj->GetFont()->GetTypeAscent()) {
277        rect.top =
278            origin.y +
279            pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
280        FX_FLOAT xPosTemp =
281            origin.x +
282            GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
283                pCurObj->GetFontSize() / 1000;
284        rect.top = matrix.Transform(CFX_PointF(xPosTemp, rect.top)).y;
285      } else {
286        rect.top = info_curchar.m_CharBox.top;
287      }
288      bFlagNewRect = false;
289      rect = info_curchar.m_CharBox;
290      rect.Normalize();
291    } else {
292      info_curchar.m_CharBox.Normalize();
293      rect.left = std::min(rect.left, info_curchar.m_CharBox.left);
294      rect.right = std::max(rect.right, info_curchar.m_CharBox.right);
295      rect.top = std::max(rect.top, info_curchar.m_CharBox.top);
296      rect.bottom = std::min(rect.bottom, info_curchar.m_CharBox.bottom);
297    }
298  }
299  rectArray.push_back(rect);
300  return rectArray;
301}
302
303int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
304                                 const CFX_SizeF& tolerance) const {
305  if (!m_bIsParsed)
306    return -3;
307
308  int pos = 0;
309  int NearPos = -1;
310  double xdif = 5000;
311  double ydif = 5000;
312  while (pos < pdfium::CollectionSize<int>(m_CharList)) {
313    PAGECHAR_INFO charinfo = m_CharList[pos];
314    CFX_FloatRect charrect = charinfo.m_CharBox;
315    if (charrect.Contains(point))
316      break;
317    if (tolerance.width > 0 || tolerance.height > 0) {
318      CFX_FloatRect charRectExt;
319      charrect.Normalize();
320      charRectExt.left = charrect.left - tolerance.width / 2;
321      charRectExt.right = charrect.right + tolerance.width / 2;
322      charRectExt.top = charrect.top + tolerance.height / 2;
323      charRectExt.bottom = charrect.bottom - tolerance.height / 2;
324      if (charRectExt.Contains(point)) {
325        double curXdif, curYdif;
326        curXdif = FXSYS_fabs(point.x - charrect.left) <
327                          FXSYS_fabs(point.x - charrect.right)
328                      ? FXSYS_fabs(point.x - charrect.left)
329                      : FXSYS_fabs(point.x - charrect.right);
330        curYdif = FXSYS_fabs(point.y - charrect.bottom) <
331                          FXSYS_fabs(point.y - charrect.top)
332                      ? FXSYS_fabs(point.y - charrect.bottom)
333                      : FXSYS_fabs(point.y - charrect.top);
334        if (curYdif + curXdif < xdif + ydif) {
335          ydif = curYdif;
336          xdif = curXdif;
337          NearPos = pos;
338        }
339      }
340    }
341    ++pos;
342  }
343  return pos < pdfium::CollectionSize<int>(m_CharList) ? pos : NearPos;
344}
345
346CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
347  if (!m_bIsParsed)
348    return CFX_WideString();
349
350  FX_FLOAT posy = 0;
351  bool IsContainPreChar = false;
352  bool IsAddLineFeed = false;
353  CFX_WideString strText;
354  for (const auto& charinfo : m_CharList) {
355    if (IsRectIntersect(rect, charinfo.m_CharBox)) {
356      if (FXSYS_fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
357          IsAddLineFeed) {
358        posy = charinfo.m_Origin.y;
359        if (!strText.IsEmpty())
360          strText += L"\r\n";
361      }
362      IsContainPreChar = true;
363      IsAddLineFeed = false;
364      if (charinfo.m_Unicode)
365        strText += charinfo.m_Unicode;
366    } else if (charinfo.m_Unicode == 32) {
367      if (IsContainPreChar && charinfo.m_Unicode) {
368        strText += charinfo.m_Unicode;
369        IsContainPreChar = false;
370        IsAddLineFeed = false;
371      }
372    } else {
373      IsContainPreChar = false;
374      IsAddLineFeed = true;
375    }
376  }
377  return strText;
378}
379
380void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
381  if (!m_bIsParsed)
382    return;
383
384  if (index < 0 || index >= pdfium::CollectionSize<int>(m_CharList))
385    return;
386
387  const PAGECHAR_INFO& charinfo = m_CharList[index];
388  info->m_Charcode = charinfo.m_CharCode;
389  info->m_Origin = charinfo.m_Origin;
390  info->m_Unicode = charinfo.m_Unicode;
391  info->m_Flag = charinfo.m_Flag;
392  info->m_CharBox = charinfo.m_CharBox;
393  info->m_pTextObj = charinfo.m_pTextObj;
394  if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont())
395    info->m_FontSize = charinfo.m_pTextObj->GetFontSize();
396  else
397    info->m_FontSize = kDefaultFontSize;
398  info->m_Matrix = charinfo.m_Matrix;
399}
400
401void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
402                                             int32_t& nCount) const {
403  PAGECHAR_INFO charinfo = m_CharList[start];
404  PAGECHAR_INFO charinfo2 = m_CharList[start + nCount - 1];
405  if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
406      FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
407    return;
408  }
409  if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
410    PAGECHAR_INFO charinfo1 = charinfo;
411    int startIndex = start;
412    while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
413           charinfo1.m_Index == charinfo.m_Index) {
414      startIndex--;
415      if (startIndex < 0)
416        break;
417      charinfo1 = m_CharList[startIndex];
418    }
419    startIndex++;
420    start = startIndex;
421  }
422  if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
423    PAGECHAR_INFO charinfo3 = charinfo2;
424    int endIndex = start + nCount - 1;
425    while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
426           charinfo3.m_Index == charinfo2.m_Index) {
427      endIndex++;
428      if (endIndex >= pdfium::CollectionSize<int>(m_CharList))
429        break;
430      charinfo3 = m_CharList[endIndex];
431    }
432    endIndex--;
433    nCount = endIndex - start + 1;
434  }
435}
436
437CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
438  if (!m_bIsParsed || nCount == 0)
439    return L"";
440
441  if (start < 0)
442    start = 0;
443
444  if (nCount == -1) {
445    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
446    return CFX_WideString(
447        m_TextBuf.AsStringC().Mid(start, m_TextBuf.AsStringC().GetLength()));
448  }
449  if (nCount <= 0 || m_CharList.empty())
450    return L"";
451  if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
452    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
453  if (nCount <= 0)
454    return L"";
455  CheckMarkedContentObject(start, nCount);
456  int startindex = 0;
457  PAGECHAR_INFO charinfo = m_CharList[start];
458  int startOffset = 0;
459  while (charinfo.m_Index == -1) {
460    startOffset++;
461    if (startOffset > nCount ||
462        start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
463      return L"";
464    }
465    charinfo = m_CharList[start + startOffset];
466  }
467  startindex = charinfo.m_Index;
468  charinfo = m_CharList[start + nCount - 1];
469  int nCountOffset = 0;
470  while (charinfo.m_Index == -1) {
471    nCountOffset++;
472    if (nCountOffset >= nCount)
473      return L"";
474    charinfo = m_CharList[start + nCount - nCountOffset - 1];
475  }
476  nCount = start + nCount - nCountOffset - startindex;
477  if (nCount <= 0)
478    return L"";
479  return CFX_WideString(m_TextBuf.AsStringC().Mid(startindex, nCount));
480}
481
482int CPDF_TextPage::CountRects(int start, int nCount) {
483  if (!m_bIsParsed || start < 0)
484    return -1;
485
486  if (nCount == -1 ||
487      nCount + start > pdfium::CollectionSize<int>(m_CharList)) {
488    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
489  }
490  m_SelRects = GetRectArray(start, nCount);
491  return pdfium::CollectionSize<int>(m_SelRects);
492}
493
494void CPDF_TextPage::GetRect(int rectIndex,
495                            FX_FLOAT& left,
496                            FX_FLOAT& top,
497                            FX_FLOAT& right,
498                            FX_FLOAT& bottom) const {
499  if (!m_bIsParsed)
500    return;
501
502  if (rectIndex < 0 || rectIndex >= pdfium::CollectionSize<int>(m_SelRects))
503    return;
504
505  left = m_SelRects[rectIndex].left;
506  top = m_SelRects[rectIndex].top;
507  right = m_SelRects[rectIndex].right;
508  bottom = m_SelRects[rectIndex].bottom;
509}
510
511CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
512    const {
513  if (m_pPage->GetPageObjectList()->empty())
514    return TextOrientation::Unknown;
515
516  const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
517  const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
518  if (nPageWidth <= 0 || nPageHeight <= 0)
519    return TextOrientation::Unknown;
520
521  std::vector<bool> nHorizontalMask(nPageWidth);
522  std::vector<bool> nVerticalMask(nPageHeight);
523  FX_FLOAT fLineHeight = 0.0f;
524  int32_t nStartH = nPageWidth;
525  int32_t nEndH = 0;
526  int32_t nStartV = nPageHeight;
527  int32_t nEndV = 0;
528  for (const auto& pPageObj : *m_pPage->GetPageObjectList()) {
529    if (!pPageObj->IsText())
530      continue;
531
532    int32_t minH = std::max(static_cast<int32_t>(pPageObj->m_Left), 0);
533    int32_t maxH =
534        std::min(static_cast<int32_t>(pPageObj->m_Right), nPageWidth);
535    int32_t minV = std::max(static_cast<int32_t>(pPageObj->m_Bottom), 0);
536    int32_t maxV = std::min(static_cast<int32_t>(pPageObj->m_Top), nPageHeight);
537    if (minH >= maxH || minV >= maxV)
538      continue;
539
540    for (int32_t i = minH; i < maxH; ++i)
541      nHorizontalMask[i] = true;
542    for (int32_t i = minV; i < maxV; ++i)
543      nVerticalMask[i] = true;
544
545    nStartH = std::min(nStartH, minH);
546    nEndH = std::max(nEndH, maxH);
547    nStartV = std::min(nStartV, minV);
548    nEndV = std::max(nEndV, maxV);
549
550    if (fLineHeight <= 0.0f)
551      fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
552  }
553  const int32_t nDoubleLineHeight = 2 * fLineHeight;
554  if ((nEndV - nStartV) < nDoubleLineHeight)
555    return TextOrientation::Horizontal;
556  if ((nEndH - nStartH) < nDoubleLineHeight)
557    return TextOrientation::Vertical;
558
559  const FX_FLOAT nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH);
560  if (nSumH > 0.8f)
561    return TextOrientation::Horizontal;
562
563  const FX_FLOAT nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV);
564  if (nSumH > nSumV)
565    return TextOrientation::Horizontal;
566  if (nSumH < nSumV)
567    return TextOrientation::Vertical;
568  return TextOrientation::Unknown;
569}
570
571void CPDF_TextPage::AppendGeneratedCharacter(FX_WCHAR unicode,
572                                             const CFX_Matrix& formMatrix) {
573  PAGECHAR_INFO generateChar;
574  if (!GenerateCharInfo(unicode, generateChar))
575    return;
576
577  m_TextBuf.AppendChar(unicode);
578  if (!formMatrix.IsIdentity())
579    generateChar.m_Matrix = formMatrix;
580  m_CharList.push_back(generateChar);
581}
582
583void CPDF_TextPage::ProcessObject() {
584  if (m_pPage->GetPageObjectList()->empty())
585    return;
586
587  m_TextlineDir = FindTextlineFlowOrientation();
588  const CPDF_PageObjectList* pObjList = m_pPage->GetPageObjectList();
589  for (auto it = pObjList->begin(); it != pObjList->end(); ++it) {
590    if (CPDF_PageObject* pObj = it->get()) {
591      if (pObj->IsText()) {
592        CFX_Matrix matrix;
593        ProcessTextObject(pObj->AsText(), matrix, pObjList, it);
594      } else if (pObj->IsForm()) {
595        CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
596        ProcessFormObject(pObj->AsForm(), formMatrix);
597      }
598    }
599  }
600  for (const auto& obj : m_LineObj)
601    ProcessTextObject(obj);
602
603  m_LineObj.clear();
604  CloseTempLine();
605}
606
607void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
608                                      const CFX_Matrix& formMatrix) {
609  CPDF_PageObjectList* pObjectList = pFormObj->m_pForm->GetPageObjectList();
610  if (pObjectList->empty())
611    return;
612
613  CFX_Matrix curFormMatrix;
614  curFormMatrix = pFormObj->m_FormMatrix;
615  curFormMatrix.Concat(formMatrix);
616
617  for (auto it = pObjectList->begin(); it != pObjectList->end(); ++it) {
618    if (CPDF_PageObject* pPageObj = it->get()) {
619      if (pPageObj->IsText())
620        ProcessTextObject(pPageObj->AsText(), curFormMatrix, pObjectList, it);
621      else if (pPageObj->IsForm())
622        ProcessFormObject(pPageObj->AsForm(), curFormMatrix);
623    }
624  }
625}
626
627int CPDF_TextPage::GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const {
628  if (charCode == CPDF_Font::kInvalidCharCode)
629    return 0;
630
631  if (int w = pFont->GetCharWidthF(charCode))
632    return w;
633
634  CFX_ByteString str;
635  pFont->AppendChar(str, charCode);
636  if (int w = pFont->GetStringWidth(str.c_str(), 1))
637    return w;
638
639  return pFont->GetCharBBox(charCode).Width();
640}
641
642void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar,
643                                             PAGECHAR_INFO info) {
644  if (IsControlChar(info)) {
645    info.m_Index = -1;
646    m_CharList.push_back(info);
647    return;
648  }
649
650  info.m_Index = m_TextBuf.GetLength();
651  if (wChar >= 0xFB00 && wChar <= 0xFB06) {
652    FX_WCHAR* pDst = nullptr;
653    FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
654    if (nCount >= 1) {
655      pDst = FX_Alloc(FX_WCHAR, nCount);
656      Unicode_GetNormalization(wChar, pDst);
657      for (int nIndex = 0; nIndex < nCount; nIndex++) {
658        PAGECHAR_INFO info2 = info;
659        info2.m_Unicode = pDst[nIndex];
660        info2.m_Flag = FPDFTEXT_CHAR_PIECE;
661        m_TextBuf.AppendChar(info2.m_Unicode);
662        m_CharList.push_back(info2);
663      }
664      FX_Free(pDst);
665      return;
666    }
667  }
668  m_TextBuf.AppendChar(wChar);
669  m_CharList.push_back(info);
670}
671
672void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar,
673                                             PAGECHAR_INFO info) {
674  if (IsControlChar(info)) {
675    info.m_Index = -1;
676    m_CharList.push_back(info);
677    return;
678  }
679
680  info.m_Index = m_TextBuf.GetLength();
681  wChar = FX_GetMirrorChar(wChar, true, false);
682  FX_WCHAR* pDst = nullptr;
683  FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
684  if (nCount >= 1) {
685    pDst = FX_Alloc(FX_WCHAR, nCount);
686    Unicode_GetNormalization(wChar, pDst);
687    for (int nIndex = 0; nIndex < nCount; nIndex++) {
688      PAGECHAR_INFO info2 = info;
689      info2.m_Unicode = pDst[nIndex];
690      info2.m_Flag = FPDFTEXT_CHAR_PIECE;
691      m_TextBuf.AppendChar(info2.m_Unicode);
692      m_CharList.push_back(info2);
693    }
694    FX_Free(pDst);
695    return;
696  }
697  info.m_Unicode = wChar;
698  m_TextBuf.AppendChar(info.m_Unicode);
699  m_CharList.push_back(info);
700}
701
702void CPDF_TextPage::CloseTempLine() {
703  if (m_TempCharList.empty())
704    return;
705
706  CFX_WideString str = m_TempTextBuf.MakeString();
707  bool bPrevSpace = false;
708  for (int i = 0; i < str.GetLength(); i++) {
709    if (str.GetAt(i) != ' ') {
710      bPrevSpace = false;
711      continue;
712    }
713    if (bPrevSpace) {
714      m_TempTextBuf.Delete(i, 1);
715      m_TempCharList.erase(m_TempCharList.begin() + i);
716      str.Delete(i);
717      i--;
718    }
719    bPrevSpace = true;
720  }
721  CFX_BidiString bidi(str);
722  if (m_parserflag == FPDFText_Direction::Right)
723    bidi.SetOverallDirectionRight();
724  CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection();
725  for (const auto& segment : bidi) {
726    if (segment.direction == CFX_BidiChar::RIGHT ||
727        (segment.direction == CFX_BidiChar::NEUTRAL &&
728         eCurrentDirection == CFX_BidiChar::RIGHT)) {
729      eCurrentDirection = CFX_BidiChar::RIGHT;
730      for (int m = segment.start + segment.count; m > segment.start; --m)
731        AddCharInfoByRLDirection(bidi.CharAt(m - 1), m_TempCharList[m - 1]);
732    } else {
733      eCurrentDirection = CFX_BidiChar::LEFT;
734      for (int m = segment.start; m < segment.start + segment.count; m++)
735        AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]);
736    }
737  }
738  m_TempCharList.clear();
739  m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
740}
741
742void CPDF_TextPage::ProcessTextObject(
743    CPDF_TextObject* pTextObj,
744    const CFX_Matrix& formMatrix,
745    const CPDF_PageObjectList* pObjList,
746    CPDF_PageObjectList::const_iterator ObjPos) {
747  if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
748    return;
749
750  size_t count = m_LineObj.size();
751  PDFTEXT_Obj Obj;
752  Obj.m_pTextObj = pTextObj;
753  Obj.m_formMatrix = formMatrix;
754  if (count == 0) {
755    m_LineObj.push_back(Obj);
756    return;
757  }
758  if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos))
759    return;
760
761  PDFTEXT_Obj prev_Obj = m_LineObj[count - 1];
762  CPDF_TextObjectItem item;
763  int nItem = prev_Obj.m_pTextObj->CountItems();
764  prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
765  FX_FLOAT prev_width =
766      GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
767      prev_Obj.m_pTextObj->GetFontSize() / 1000;
768
769  CFX_Matrix prev_matrix = prev_Obj.m_pTextObj->GetTextMatrix();
770  prev_width = FXSYS_fabs(prev_width);
771  prev_matrix.Concat(prev_Obj.m_formMatrix);
772  prev_width = prev_matrix.TransformDistance(prev_width);
773  pTextObj->GetItemInfo(0, &item);
774  FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
775                        pTextObj->GetFontSize() / 1000;
776  this_width = FXSYS_fabs(this_width);
777
778  CFX_Matrix this_matrix = pTextObj->GetTextMatrix();
779  this_width = FXSYS_fabs(this_width);
780  this_matrix.Concat(formMatrix);
781  this_width = this_matrix.TransformDistance(this_width);
782
783  FX_FLOAT threshold =
784      prev_width > this_width ? prev_width / 4 : this_width / 4;
785  CFX_PointF prev_pos = m_DisplayMatrix.Transform(
786      prev_Obj.m_formMatrix.Transform(prev_Obj.m_pTextObj->GetPos()));
787  CFX_PointF this_pos =
788      m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos()));
789  if (FXSYS_fabs(this_pos.y - prev_pos.y) > threshold * 2) {
790    for (size_t i = 0; i < count; i++)
791      ProcessTextObject(m_LineObj[i]);
792    m_LineObj.clear();
793    m_LineObj.push_back(Obj);
794    return;
795  }
796
797  for (size_t i = count; i > 0; --i) {
798    PDFTEXT_Obj prev_text_obj = m_LineObj[i - 1];
799    CFX_PointF new_prev_pos =
800        m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform(
801            prev_text_obj.m_pTextObj->GetPos()));
802    if (this_pos.x >= new_prev_pos.x) {
803      m_LineObj.insert(m_LineObj.begin() + i, Obj);
804      return;
805    }
806  }
807  m_LineObj.insert(m_LineObj.begin(), Obj);
808}
809
810FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
811  CPDF_TextObject* pTextObj = Obj.m_pTextObj;
812  if (!pTextObj->m_ContentMark)
813    return FPDFText_MarkedContent::Pass;
814
815  int nContentMark = pTextObj->m_ContentMark.CountItems();
816  if (nContentMark < 1)
817    return FPDFText_MarkedContent::Pass;
818
819  CFX_WideString actText;
820  bool bExist = false;
821  CPDF_Dictionary* pDict = nullptr;
822  int n = 0;
823  for (n = 0; n < nContentMark; n++) {
824    const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
825    pDict = item.GetParam();
826    if (!pDict)
827      continue;
828    CPDF_String* temp = ToString(pDict->GetObjectFor("ActualText"));
829    if (temp) {
830      bExist = true;
831      actText = temp->GetUnicodeText();
832    }
833  }
834  if (!bExist)
835    return FPDFText_MarkedContent::Pass;
836
837  if (m_pPreTextObj && m_pPreTextObj->m_ContentMark &&
838      m_pPreTextObj->m_ContentMark.CountItems() == n &&
839      pDict == m_pPreTextObj->m_ContentMark.GetItem(n - 1).GetParam()) {
840    return FPDFText_MarkedContent::Done;
841  }
842
843  FX_STRSIZE nItems = actText.GetLength();
844  if (nItems < 1)
845    return FPDFText_MarkedContent::Pass;
846
847  CPDF_Font* pFont = pTextObj->GetFont();
848  bExist = false;
849  for (FX_STRSIZE i = 0; i < nItems; i++) {
850    if (pFont->CharCodeFromUnicode(actText.GetAt(i)) !=
851        CPDF_Font::kInvalidCharCode) {
852      bExist = true;
853      break;
854    }
855  }
856  if (!bExist)
857    return FPDFText_MarkedContent::Pass;
858
859  bExist = false;
860  for (FX_STRSIZE i = 0; i < nItems; i++) {
861    FX_WCHAR wChar = actText.GetAt(i);
862    if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
863      bExist = true;
864      break;
865    }
866  }
867  if (!bExist)
868    return FPDFText_MarkedContent::Done;
869
870  return FPDFText_MarkedContent::Delay;
871}
872
873void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
874  CPDF_TextObject* pTextObj = Obj.m_pTextObj;
875  if (!pTextObj->m_ContentMark)
876    return;
877
878  int nContentMark = pTextObj->m_ContentMark.CountItems();
879  if (nContentMark < 1)
880    return;
881
882  CFX_WideString actText;
883  for (int n = 0; n < nContentMark; n++) {
884    const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n);
885    CPDF_Dictionary* pDict = item.GetParam();
886    if (pDict)
887      actText = pDict->GetUnicodeTextFor("ActualText");
888  }
889  FX_STRSIZE nItems = actText.GetLength();
890  if (nItems < 1)
891    return;
892
893  CPDF_Font* pFont = pTextObj->GetFont();
894  CFX_Matrix matrix = pTextObj->GetTextMatrix();
895  matrix.Concat(Obj.m_formMatrix);
896
897  for (FX_STRSIZE k = 0; k < nItems; k++) {
898    FX_WCHAR wChar = actText.GetAt(k);
899    if (wChar <= 0x80 && !isprint(wChar))
900      wChar = 0x20;
901    if (wChar >= 0xFFFD)
902      continue;
903
904    PAGECHAR_INFO charinfo;
905    charinfo.m_Origin = pTextObj->GetPos();
906    charinfo.m_Index = m_TextBuf.GetLength();
907    charinfo.m_Unicode = wChar;
908    charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
909    charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
910    charinfo.m_pTextObj = pTextObj;
911    charinfo.m_CharBox = pTextObj->GetRect();
912    charinfo.m_Matrix = matrix;
913    m_TempTextBuf.AppendChar(wChar);
914    m_TempCharList.push_back(charinfo);
915  }
916}
917
918void CPDF_TextPage::FindPreviousTextObject() {
919  if (m_TempCharList.empty() && m_CharList.empty())
920    return;
921
922  PAGECHAR_INFO preChar =
923      m_TempCharList.empty() ? m_CharList.back() : m_TempCharList.back();
924
925  if (preChar.m_pTextObj)
926    m_pPreTextObj = preChar.m_pTextObj;
927}
928
929void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
930                                    int32_t iBufStartAppend) {
931  int32_t i = iCharListStartAppend;
932  int32_t j = pdfium::CollectionSize<int32_t>(m_TempCharList) - 1;
933  for (; i < j; i++, j--) {
934    std::swap(m_TempCharList[i], m_TempCharList[j]);
935    std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
936  }
937  FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
938  i = iBufStartAppend;
939  j = m_TempTextBuf.GetLength() - 1;
940  for (; i < j; i++, j--)
941    std::swap(pTempBuffer[i], pTempBuffer[j]);
942}
943
944bool CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
945                                  const CPDF_Font* pFont,
946                                  int nItems) const {
947  CFX_WideString str;
948  for (int32_t i = 0; i < nItems; i++) {
949    CPDF_TextObjectItem item;
950    pTextObj->GetItemInfo(i, &item);
951    if (item.m_CharCode == static_cast<uint32_t>(-1))
952      continue;
953    CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
954    FX_WCHAR wChar = wstrItem.GetAt(0);
955    if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode)
956      wChar = (FX_WCHAR)item.m_CharCode;
957    if (wChar)
958      str += wChar;
959  }
960  return CFX_BidiString(str).OverallDirection() == CFX_BidiChar::RIGHT;
961}
962
963void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
964  CPDF_TextObject* pTextObj = Obj.m_pTextObj;
965  if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f)
966    return;
967  CFX_Matrix formMatrix = Obj.m_formMatrix;
968  CPDF_Font* pFont = pTextObj->GetFont();
969  CFX_Matrix matrix = pTextObj->GetTextMatrix();
970  matrix.Concat(formMatrix);
971
972  FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
973  if (ePreMKC == FPDFText_MarkedContent::Done) {
974    m_pPreTextObj = pTextObj;
975    m_perMatrix = formMatrix;
976    return;
977  }
978  GenerateCharacter result = GenerateCharacter::None;
979  if (m_pPreTextObj) {
980    result = ProcessInsertObject(pTextObj, formMatrix);
981    if (result == GenerateCharacter::LineBreak)
982      m_CurlineRect = Obj.m_pTextObj->GetRect();
983    else
984      m_CurlineRect.Union(Obj.m_pTextObj->GetRect());
985
986    switch (result) {
987      case GenerateCharacter::None:
988        break;
989      case GenerateCharacter::Space: {
990        PAGECHAR_INFO generateChar;
991        if (GenerateCharInfo(TEXT_SPACE_CHAR, generateChar)) {
992          if (!formMatrix.IsIdentity())
993            generateChar.m_Matrix = formMatrix;
994          m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
995          m_TempCharList.push_back(generateChar);
996        }
997        break;
998      }
999      case GenerateCharacter::LineBreak:
1000        CloseTempLine();
1001        if (m_TextBuf.GetSize()) {
1002          AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
1003          AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
1004        }
1005        break;
1006      case GenerateCharacter::Hyphen:
1007        if (pTextObj->CountChars() == 1) {
1008          CPDF_TextObjectItem item;
1009          pTextObj->GetCharInfo(0, &item);
1010          CFX_WideString wstrItem =
1011              pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1012          if (wstrItem.IsEmpty())
1013            wstrItem += (FX_WCHAR)item.m_CharCode;
1014          FX_WCHAR curChar = wstrItem.GetAt(0);
1015          if (curChar == 0x2D || curChar == 0xAD)
1016            return;
1017        }
1018        while (m_TempTextBuf.GetSize() > 0 &&
1019               m_TempTextBuf.AsStringC().GetAt(m_TempTextBuf.GetLength() - 1) ==
1020                   0x20) {
1021          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1022          m_TempCharList.pop_back();
1023        }
1024        PAGECHAR_INFO* charinfo = &m_TempCharList.back();
1025        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1026        charinfo->m_Unicode = 0x2;
1027        charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1028        m_TempTextBuf.AppendChar(0xfffe);
1029        break;
1030    }
1031  } else {
1032    m_CurlineRect = Obj.m_pTextObj->GetRect();
1033  }
1034
1035  if (ePreMKC == FPDFText_MarkedContent::Delay) {
1036    ProcessMarkedContent(Obj);
1037    m_pPreTextObj = pTextObj;
1038    m_perMatrix = formMatrix;
1039    return;
1040  }
1041  m_pPreTextObj = pTextObj;
1042  m_perMatrix = formMatrix;
1043  int nItems = pTextObj->CountItems();
1044  FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix);
1045
1046  const bool bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1047  const bool bIsBidiAndMirrorInverse =
1048      bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1049  int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1050  int32_t iCharListStartAppend =
1051      pdfium::CollectionSize<int32_t>(m_TempCharList);
1052
1053  FX_FLOAT spacing = 0;
1054  for (int i = 0; i < nItems; i++) {
1055    CPDF_TextObjectItem item;
1056    PAGECHAR_INFO charinfo;
1057    pTextObj->GetItemInfo(i, &item);
1058    if (item.m_CharCode == static_cast<uint32_t>(-1)) {
1059      CFX_WideString str = m_TempTextBuf.MakeString();
1060      if (str.IsEmpty())
1061        str = m_TextBuf.AsStringC();
1062      if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_SPACE_CHAR)
1063        continue;
1064
1065      FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1066      spacing = -fontsize_h * item.m_Origin.x / 1000;
1067      continue;
1068    }
1069    FX_FLOAT charSpace = pTextObj->m_TextState.GetCharSpace();
1070    if (charSpace > 0.001)
1071      spacing += matrix.TransformDistance(charSpace);
1072    else if (charSpace < -0.001)
1073      spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1074    spacing -= baseSpace;
1075    if (spacing && i > 0) {
1076      int last_width = 0;
1077      FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1078      uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
1079      FX_FLOAT threshold = 0;
1080      if (space_charcode != CPDF_Font::kInvalidCharCode)
1081        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1082      if (threshold > fontsize_h / 3)
1083        threshold = 0;
1084      else
1085        threshold /= 2;
1086      if (threshold == 0) {
1087        threshold = fontsize_h;
1088        int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1089        threshold = this_width > last_width ? (FX_FLOAT)this_width
1090                                            : (FX_FLOAT)last_width;
1091        threshold = NormalizeThreshold(threshold);
1092        threshold = fontsize_h * threshold / 1000;
1093      }
1094      if (threshold && (spacing && spacing >= threshold)) {
1095        charinfo.m_Unicode = TEXT_SPACE_CHAR;
1096        charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1097        charinfo.m_pTextObj = pTextObj;
1098        charinfo.m_Index = m_TextBuf.GetLength();
1099        m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
1100        charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
1101        charinfo.m_Matrix = formMatrix;
1102        charinfo.m_Origin = matrix.Transform(item.m_Origin);
1103        charinfo.m_CharBox =
1104            CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
1105                          charinfo.m_Origin.x, charinfo.m_Origin.y);
1106        m_TempCharList.push_back(charinfo);
1107      }
1108      if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
1109        continue;
1110    }
1111    spacing = 0;
1112    CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1113    bool bNoUnicode = false;
1114    if (wstrItem.IsEmpty() && item.m_CharCode) {
1115      wstrItem += static_cast<FX_WCHAR>(item.m_CharCode);
1116      bNoUnicode = true;
1117    }
1118    charinfo.m_Index = -1;
1119    charinfo.m_CharCode = item.m_CharCode;
1120    if (bNoUnicode)
1121      charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1122    else
1123      charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1124
1125    charinfo.m_pTextObj = pTextObj;
1126    charinfo.m_Origin = matrix.Transform(item.m_Origin);
1127
1128    FX_RECT rect =
1129        charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
1130    charinfo.m_CharBox.top =
1131        rect.top * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1132    charinfo.m_CharBox.left =
1133        rect.left * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1134    charinfo.m_CharBox.right =
1135        rect.right * pTextObj->GetFontSize() / 1000 + item.m_Origin.x;
1136    charinfo.m_CharBox.bottom =
1137        rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_Origin.y;
1138    if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1139      charinfo.m_CharBox.top =
1140          charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1141    }
1142    if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1143      charinfo.m_CharBox.right =
1144          charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1145    }
1146    matrix.TransformRect(charinfo.m_CharBox);
1147    charinfo.m_Matrix = matrix;
1148    if (wstrItem.IsEmpty()) {
1149      charinfo.m_Unicode = 0;
1150      m_TempCharList.push_back(charinfo);
1151      m_TempTextBuf.AppendChar(0xfffe);
1152      continue;
1153    } else {
1154      int nTotal = wstrItem.GetLength();
1155      bool bDel = false;
1156      const int count =
1157          std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
1158      FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1159          (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1160      for (int n = pdfium::CollectionSize<int>(m_TempCharList);
1161           n > pdfium::CollectionSize<int>(m_TempCharList) - count; n--) {
1162        const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
1163        CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
1164        if (charinfo1.m_CharCode == charinfo.m_CharCode &&
1165            charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
1166            FXSYS_fabs(diff.x) < threshold && FXSYS_fabs(diff.y) < threshold) {
1167          bDel = true;
1168          break;
1169        }
1170      }
1171      if (!bDel) {
1172        for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1173          charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1174          if (charinfo.m_Unicode) {
1175            charinfo.m_Index = m_TextBuf.GetLength();
1176            m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1177          } else {
1178            m_TempTextBuf.AppendChar(0xfffe);
1179          }
1180          m_TempCharList.push_back(charinfo);
1181        }
1182      } else if (i == 0) {
1183        CFX_WideString str = m_TempTextBuf.MakeString();
1184        if (!str.IsEmpty() &&
1185            str.GetAt(str.GetLength() - 1) == TEXT_SPACE_CHAR) {
1186          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1187          m_TempCharList.pop_back();
1188        }
1189      }
1190    }
1191  }
1192  if (bIsBidiAndMirrorInverse)
1193    SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1194}
1195
1196CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
1197    const CPDF_TextObject* pTextObj) const {
1198  int32_t nChars = pTextObj->CountChars();
1199  if (nChars == 1)
1200    return m_TextlineDir;
1201
1202  CPDF_TextObjectItem first, last;
1203  pTextObj->GetCharInfo(0, &first);
1204  pTextObj->GetCharInfo(nChars - 1, &last);
1205
1206  CFX_Matrix textMatrix = pTextObj->GetTextMatrix();
1207  first.m_Origin = textMatrix.Transform(first.m_Origin);
1208  last.m_Origin = textMatrix.Transform(last.m_Origin);
1209
1210  FX_FLOAT dX = FXSYS_fabs(last.m_Origin.x - first.m_Origin.x);
1211  FX_FLOAT dY = FXSYS_fabs(last.m_Origin.y - first.m_Origin.y);
1212  if (dX <= 0.0001f && dY <= 0.0001f)
1213    return TextOrientation::Unknown;
1214
1215  CFX_VectorF v(dX, dY);
1216  v.Normalize();
1217  if (v.y <= 0.0872f)
1218    return v.x <= 0.0872f ? m_TextlineDir : TextOrientation::Horizontal;
1219
1220  if (v.x <= 0.0872f)
1221    return TextOrientation::Vertical;
1222
1223  return m_TextlineDir;
1224}
1225
1226bool CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1227  CFX_WideString strCurText = m_TempTextBuf.MakeString();
1228  if (strCurText.IsEmpty())
1229    strCurText = m_TextBuf.AsStringC();
1230  FX_STRSIZE nCount = strCurText.GetLength();
1231  int nIndex = nCount - 1;
1232  FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1233  while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0)
1234    wcTmp = strCurText.GetAt(--nIndex);
1235  if (0x2D == wcTmp || 0xAD == wcTmp) {
1236    if (--nIndex > 0) {
1237      FX_WCHAR preChar = strCurText.GetAt((nIndex));
1238      if (((preChar >= L'A' && preChar <= L'Z') ||
1239           (preChar >= L'a' && preChar <= L'z')) &&
1240          ((curChar >= L'A' && curChar <= L'Z') ||
1241           (curChar >= L'a' && curChar <= L'z'))) {
1242        return true;
1243      }
1244    }
1245    const PAGECHAR_INFO* preInfo;
1246    if (!m_TempCharList.empty())
1247      preInfo = &m_TempCharList.back();
1248    else if (!m_CharList.empty())
1249      preInfo = &m_CharList.back();
1250    else
1251      return false;
1252    if (FPDFTEXT_CHAR_PIECE == preInfo->m_Flag &&
1253        (0xAD == preInfo->m_Unicode || 0x2D == preInfo->m_Unicode)) {
1254      return true;
1255    }
1256  }
1257  return false;
1258}
1259
1260CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
1261    const CPDF_TextObject* pObj,
1262    const CFX_Matrix& formMatrix) {
1263  FindPreviousTextObject();
1264  TextOrientation WritingMode = GetTextObjectWritingMode(pObj);
1265  if (WritingMode == TextOrientation::Unknown)
1266    WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1267
1268  CFX_FloatRect this_rect = pObj->GetRect();
1269  CFX_FloatRect prev_rect = m_pPreTextObj->GetRect();
1270  CPDF_TextObjectItem PrevItem;
1271  CPDF_TextObjectItem item;
1272  int nItem = m_pPreTextObj->CountItems();
1273  m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1274  pObj->GetItemInfo(0, &item);
1275  CFX_WideString wstrItem =
1276      pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1277  if (wstrItem.IsEmpty())
1278    wstrItem += static_cast<FX_WCHAR>(item.m_CharCode);
1279  FX_WCHAR curChar = wstrItem.GetAt(0);
1280  if (WritingMode == TextOrientation::Horizontal) {
1281    if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1282      FX_FLOAT top =
1283          this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1284      FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1285                                                            : prev_rect.bottom;
1286      if (bottom >= top) {
1287        return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1288                                 : GenerateCharacter::LineBreak;
1289      }
1290    }
1291  } else if (WritingMode == TextOrientation::Vertical) {
1292    if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1293        prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1294      FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
1295                                                          : m_CurlineRect.left;
1296      FX_FLOAT right = this_rect.right < m_CurlineRect.right
1297                           ? this_rect.right
1298                           : m_CurlineRect.right;
1299      if (right <= left) {
1300        return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1301                                 : GenerateCharacter::LineBreak;
1302      }
1303    }
1304  }
1305
1306  FX_FLOAT last_pos = PrevItem.m_Origin.x;
1307  int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1308  FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1309  last_width = FXSYS_fabs(last_width);
1310  int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1311  FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1312  this_width = FXSYS_fabs(this_width);
1313  FX_FLOAT threshold =
1314      last_width > this_width ? last_width / 4 : this_width / 4;
1315
1316  CFX_Matrix prev_matrix = m_pPreTextObj->GetTextMatrix();
1317  prev_matrix.Concat(m_perMatrix);
1318
1319  CFX_Matrix prev_reverse;
1320  prev_reverse.SetReverse(prev_matrix);
1321
1322  CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos()));
1323  if (last_width < this_width)
1324    threshold = prev_reverse.TransformDistance(threshold);
1325
1326  bool bNewline = false;
1327  if (WritingMode == TextOrientation::Horizontal) {
1328    CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1329                        m_pPreTextObj->m_Right, pObj->m_Top);
1330    CFX_FloatRect rect2 = m_pPreTextObj->GetRect();
1331    CFX_FloatRect rect3 = rect1;
1332    rect1.Intersect(rect2);
1333    if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1334        ((pos.y > threshold * 2 || pos.y < threshold * -3) &&
1335         (FXSYS_fabs(pos.y) < 1 ? FXSYS_fabs(pos.x) < FXSYS_fabs(pos.y)
1336                                : true))) {
1337      bNewline = true;
1338      if (nItem > 1) {
1339        CPDF_TextObjectItem tempItem;
1340        m_pPreTextObj->GetItemInfo(0, &tempItem);
1341        CFX_Matrix m = m_pPreTextObj->GetTextMatrix();
1342        if (PrevItem.m_Origin.x > tempItem.m_Origin.x &&
1343            m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1344            m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1345            m.c < 0.1) {
1346          CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1347                           m_pPreTextObj->m_Top);
1348          if (re.Contains(pObj->GetPos())) {
1349            bNewline = false;
1350          } else {
1351            CFX_FloatRect rect(0, pObj->m_Bottom, 1000, pObj->m_Top);
1352            if (rect.Contains(m_pPreTextObj->GetPos()))
1353              bNewline = false;
1354          }
1355        }
1356      }
1357    }
1358  }
1359  if (bNewline) {
1360    return IsHyphen(curChar) ? GenerateCharacter::Hyphen
1361                             : GenerateCharacter::LineBreak;
1362  }
1363
1364  int32_t nChars = pObj->CountChars();
1365  if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
1366      IsHyphen(curChar)) {
1367    return GenerateCharacter::Hyphen;
1368  }
1369  CFX_WideString PrevStr =
1370      m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1371  FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1372  CFX_Matrix matrix = pObj->GetTextMatrix();
1373  matrix.Concat(formMatrix);
1374
1375  threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1376  threshold = threshold > 400
1377                  ? (threshold < 700
1378                         ? threshold / 4
1379                         : (threshold > 800 ? threshold / 6 : threshold / 5))
1380                  : (threshold / 2);
1381  if (nLastWidth >= nThisWidth) {
1382    threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
1383  } else {
1384    threshold *= FXSYS_fabs(pObj->GetFontSize());
1385    threshold = matrix.TransformDistance(threshold);
1386    threshold = prev_reverse.TransformDistance(threshold);
1387  }
1388  threshold /= 1000;
1389  if ((threshold < 1.4881 && threshold > 1.4879) ||
1390      (threshold < 1.39001 && threshold > 1.38999)) {
1391    threshold *= 1.5;
1392  }
1393  if (FXSYS_fabs(last_pos + last_width - pos.x) > threshold &&
1394      curChar != L' ' && preChar != L' ') {
1395    if (curChar != L' ' && preChar != L' ') {
1396      if ((pos.x - last_pos - last_width) > threshold ||
1397          (last_pos - pos.x - last_width) > threshold) {
1398        return GenerateCharacter::Space;
1399      }
1400      if (pos.x < 0 && (last_pos - pos.x - last_width) > threshold)
1401        return GenerateCharacter::Space;
1402      if ((pos.x - last_pos - last_width) > this_width ||
1403          (pos.x - last_pos - this_width) > last_width) {
1404        return GenerateCharacter::Space;
1405      }
1406    }
1407  }
1408  return GenerateCharacter::None;
1409}
1410
1411bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1412                                     CPDF_TextObject* pTextObj2) {
1413  if (!pTextObj1 || !pTextObj2)
1414    return false;
1415
1416  CFX_FloatRect rcPreObj = pTextObj2->GetRect();
1417  CFX_FloatRect rcCurObj = pTextObj1->GetRect();
1418  if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
1419    FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1420    size_t nCount = m_CharList.size();
1421    if (nCount >= 2) {
1422      PAGECHAR_INFO perCharTemp = m_CharList[nCount - 2];
1423      FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1424      if (dbXdif > dbSpace)
1425        return false;
1426    }
1427  }
1428  if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1429    rcPreObj.Intersect(rcCurObj);
1430    if (rcPreObj.IsEmpty())
1431      return false;
1432    if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
1433        rcCurObj.Width() / 2) {
1434      return false;
1435    }
1436    if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize())
1437      return false;
1438  }
1439  int nPreCount = pTextObj2->CountItems();
1440  int nCurCount = pTextObj1->CountItems();
1441  if (nPreCount != nCurCount)
1442    return false;
1443  // If both objects have no items, consider them same.
1444  if (!nPreCount)
1445    return true;
1446
1447  CPDF_TextObjectItem itemPer;
1448  CPDF_TextObjectItem itemCur;
1449  for (int i = 0; i < nPreCount; i++) {
1450    pTextObj2->GetItemInfo(i, &itemPer);
1451    pTextObj1->GetItemInfo(i, &itemCur);
1452    if (itemCur.m_CharCode != itemPer.m_CharCode)
1453      return false;
1454  }
1455
1456  CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos();
1457  FX_FLOAT font_size = pTextObj2->GetFontSize();
1458  FX_FLOAT char_size = GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont());
1459  FX_FLOAT max_pre_size =
1460      std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size);
1461  if (FXSYS_fabs(diff.x) > char_size * font_size / 1000 * 0.9 ||
1462      FXSYS_fabs(diff.y) > max_pre_size / 8) {
1463    return false;
1464  }
1465  return true;
1466}
1467
1468bool CPDF_TextPage::IsSameAsPreTextObject(
1469    CPDF_TextObject* pTextObj,
1470    const CPDF_PageObjectList* pObjList,
1471    CPDF_PageObjectList::const_iterator iter) {
1472  int i = 0;
1473  while (i < 5 && iter != pObjList->begin()) {
1474    --iter;
1475    CPDF_PageObject* pOtherObj = iter->get();
1476    if (pOtherObj == pTextObj || !pOtherObj->IsText())
1477      continue;
1478    if (IsSameTextObject(pOtherObj->AsText(), pTextObj))
1479      return true;
1480    ++i;
1481  }
1482  return false;
1483}
1484
1485bool CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
1486  const PAGECHAR_INFO* preChar;
1487  if (!m_TempCharList.empty())
1488    preChar = &m_TempCharList.back();
1489  else if (!m_CharList.empty())
1490    preChar = &m_CharList.back();
1491  else
1492    return false;
1493
1494  info.m_Index = m_TextBuf.GetLength();
1495  info.m_Unicode = unicode;
1496  info.m_pTextObj = nullptr;
1497  info.m_CharCode = CPDF_Font::kInvalidCharCode;
1498  info.m_Flag = FPDFTEXT_CHAR_GENERATED;
1499
1500  int preWidth = 0;
1501  if (preChar->m_pTextObj && preChar->m_CharCode != -1) {
1502    preWidth =
1503        GetCharWidth(preChar->m_CharCode, preChar->m_pTextObj->GetFont());
1504  }
1505
1506  FX_FLOAT fFontSize = preChar->m_pTextObj ? preChar->m_pTextObj->GetFontSize()
1507                                           : preChar->m_CharBox.Height();
1508  if (!fFontSize)
1509    fFontSize = kDefaultFontSize;
1510
1511  info.m_Origin = CFX_PointF(
1512      preChar->m_Origin.x + preWidth * (fFontSize) / 1000, preChar->m_Origin.y);
1513  info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y,
1514                                 info.m_Origin.x, info.m_Origin.y);
1515  return true;
1516}
1517
1518bool CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
1519                                    const CFX_FloatRect& rect2) {
1520  CFX_FloatRect rect = rect1;
1521  rect.Intersect(rect2);
1522  return !rect.IsEmpty();
1523}
1524