// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

7#ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_
8#define CORE_SRC_FPDFTEXT_TEXT_INT_H_
9
10#include "core/include/fpdftext/fpdf_text.h"
11#include "core/include/fxcrt/fx_basic.h"
12
// Forward declarations: these types are only named by pointer/reference (or
// defined further down) in this header, so their full definitions are not
// needed here.
class CFX_BidiChar;
class CPDF_DocProgressiveSearch;
class CPDF_FormObject;
class CPDF_LinkExtract;
class CPDF_TextPageFind;

// Character classification codes.
// NOTE(review): presumably the values stored in PAGECHAR_INFO::m_Flag --
// confirm against the implementation file.
// The negative value is parenthesized so the macro expands as a single
// operand in any expression.
#define FPDFTEXT_CHAR_ERROR (-1)
#define FPDFTEXT_CHAR_NORMAL 0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN 3
#define FPDFTEXT_CHAR_PIECE 4

// Marked-content status codes.
// NOTE(review): presumably the results of CPDF_TextPage::PreMarkedContent()
// -- confirm against the implementation file.
#define FPDFTEXT_MC_PASS 0
#define FPDFTEXT_MC_DONE 1
#define FPDFTEXT_MC_DELAY 2

29typedef struct _PAGECHAR_INFO {
30  int m_CharCode;
31  FX_WCHAR m_Unicode;
32  FX_FLOAT m_OriginX;
33  FX_FLOAT m_OriginY;
34  int32_t m_Flag;
35  CFX_FloatRect m_CharBox;
36  CPDF_TextObject* m_pTextObj;
37  CFX_Matrix m_Matrix;
38  int m_Index;
39} PAGECHAR_INFO;
40typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;
// A run described by a start position and a length.
// NOTE(review): presumably indexes into the page's character list -- confirm.
typedef struct {
  int m_Start;
  int m_nCount;
} FPDF_SEGMENT;
45typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;
46typedef struct {
47  CPDF_TextObject* m_pTextObj;
48  CFX_Matrix m_formMatrix;
49} PDFTEXT_Obj;
50typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
51
52class CPDF_TextPage : public IPDF_TextPage {
53 public:
54  CPDF_TextPage(const CPDF_Page* pPage, int flags);
55  ~CPDF_TextPage() override {}
56
57  // IPDF_TextPage
58  FX_BOOL ParseTextPage() override;
59  void NormalizeObjects(FX_BOOL bNormalize) override;
60  bool IsParsed() const override { return m_bIsParsed; }
61  int CharIndexFromTextIndex(int TextIndex) const override;
62  int TextIndexFromCharIndex(int CharIndex) const override;
63  int CountChars() const override;
64  void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override;
65  void GetRectArray(int start,
66                    int nCount,
67                    CFX_RectArray& rectArray) const override;
68  int GetIndexAtPos(CPDF_Point point,
69                    FX_FLOAT xTolerance,
70                    FX_FLOAT yTolerance) const override;
71  int GetIndexAtPos(FX_FLOAT x,
72                    FX_FLOAT y,
73                    FX_FLOAT xTolerance,
74                    FX_FLOAT yTolerance) const override;
75  CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override;
76  void GetRectsArrayByRect(const CFX_FloatRect& rect,
77                           CFX_RectArray& resRectArray) const override;
78  CFX_WideString GetPageText(int start = 0, int nCount = -1) const override;
79  int CountRects(int start, int nCount) override;
80  void GetRect(int rectIndex,
81               FX_FLOAT& left,
82               FX_FLOAT& top,
83               FX_FLOAT& right,
84               FX_FLOAT& bottom) const override;
85  FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override;
86  FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override;
87  int CountBoundedSegments(FX_FLOAT left,
88                           FX_FLOAT top,
89                           FX_FLOAT right,
90                           FX_FLOAT bottom,
91                           FX_BOOL bContains = FALSE) override;
92  void GetBoundedSegment(int index, int& start, int& count) const override;
93  int GetWordBreak(int index, int direction) const override;
94
95  const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; }
96  static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1,
97                                 const CFX_FloatRect& rect2);
98  static FX_BOOL IsLetter(FX_WCHAR unicode);
99
100 private:
101  FX_BOOL IsHyphen(FX_WCHAR curChar);
102  bool IsControlChar(const PAGECHAR_INFO& charInfo);
103  FX_BOOL GetBaselineRotate(int start, int end, int& Rotate);
104  void ProcessObject();
105  void ProcessFormObject(CPDF_FormObject* pFormObj,
106                         const CFX_Matrix& formMatrix);
107  void ProcessTextObject(PDFTEXT_Obj pObj);
108  void ProcessTextObject(CPDF_TextObject* pTextObj,
109                         const CFX_Matrix& formMatrix,
110                         FX_POSITION ObjPos);
111  int ProcessInsertObject(const CPDF_TextObject* pObj,
112                          const CFX_Matrix& formMatrix);
113  FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
114  FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);
115  FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1,
116                           CPDF_TextObject* pTextObj2);
117  int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;
118  void CloseTempLine();
119  void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str);
120  int32_t PreMarkedContent(PDFTEXT_Obj pObj);
121  void ProcessMarkedContent(PDFTEXT_Obj pObj);
122  void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
123  void FindPreviousTextObject(void);
124  void AddCharInfoByLRDirection(CFX_WideString& str, int i);
125  void AddCharInfoByRLDirection(CFX_WideString& str, int i);
126  int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
127  int32_t FindTextlineFlowDirection();
128
129  void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
130  FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj,
131                        const CPDF_Font* pFont,
132                        int nItems) const;
133
134  CPDFText_ParseOptions m_ParseOptions;
135  CFX_WordArray m_CharIndex;
136  const CPDF_PageObjects* const m_pPage;
137  PAGECHAR_InfoArray m_charList;
138  CFX_WideTextBuf m_TextBuf;
139  PAGECHAR_InfoArray m_TempCharList;
140  CFX_WideTextBuf m_TempTextBuf;
141  const int m_parserflag;
142  CPDF_TextObject* m_pPreTextObj;
143  CFX_Matrix m_perMatrix;
144  bool m_bIsParsed;
145  CFX_Matrix m_DisplayMatrix;
146  SEGMENT_Array m_Segment;
147  CFX_RectArray m_SelRects;
148  LINEOBJ m_LineObj;
149  int32_t m_TextlineDir;
150  CFX_FloatRect m_CurlineRect;
151};
152
153class CPDF_TextPageFind : public IPDF_TextPageFind {
154 public:
155  explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
156  ~CPDF_TextPageFind() override {}
157
158  // IPDF_TextPageFind
159  FX_BOOL FindFirst(const CFX_WideString& findwhat,
160                    int flags,
161                    int startPos = 0) override;
162  FX_BOOL FindNext() override;
163  FX_BOOL FindPrev() override;
164  void GetRectArray(CFX_RectArray& rects) const override;
165  int GetCurOrder() const override;
166  int GetMatchedCount() const override;
167
168 protected:
169  void ExtractFindWhat(const CFX_WideString& findwhat);
170  FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText,
171                           int startPos,
172                           int endPos);
173  FX_BOOL ExtractSubString(CFX_WideString& rString,
174                           const FX_WCHAR* lpszFullString,
175                           int iSubString,
176                           FX_WCHAR chSep);
177  CFX_WideString MakeReverse(const CFX_WideString& str);
178  int ReverseFind(const CFX_WideString& csPageText,
179                  const CFX_WideString& csWord,
180                  int nStartPos,
181                  int& WordLength);
182  int GetCharIndex(int index) const;
183
184 private:
185  CFX_WordArray m_CharIndex;
186  const IPDF_TextPage* m_pTextPage;
187  CFX_WideString m_strText;
188  CFX_WideString m_findWhat;
189  int m_flags;
190  CFX_WideStringArray m_csFindWhatArray;
191  int m_findNextStart;
192  int m_findPreStart;
193  FX_BOOL m_bMatchCase;
194  FX_BOOL m_bMatchWholeWord;
195  int m_resStart;
196  int m_resEnd;
197  CFX_RectArray m_resArray;
198  FX_BOOL m_IsFind;
199};
200
201class CPDF_LinkExt {
202 public:
203  CPDF_LinkExt() {}
204  int m_Start;
205  int m_Count;
206  CFX_WideString m_strUrl;
207  virtual ~CPDF_LinkExt() {}
208};
209
210typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
211
212class CPDF_LinkExtract : public IPDF_LinkExtract {
213 public:
214  CPDF_LinkExtract();
215  ~CPDF_LinkExtract() override;
216
217  // IPDF_LinkExtract
218  FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override;
219  int CountLinks() const override;
220  CFX_WideString GetURL(int index) const override;
221  void GetBoundedSegment(int index, int& start, int& count) const override;
222  void GetRects(int index, CFX_RectArray& rects) const override;
223
224  FX_BOOL IsExtract() const { return m_bIsParsed; }
225
226 protected:
227  void ParseLink();
228  void DeleteLinkList();
229  FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
230  bool CheckMailLink(CFX_WideString& str);
231  void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
232
233 private:
234  LINK_InfoArray m_LinkList;
235  const CPDF_TextPage* m_pTextPage;
236  CFX_WideString m_strPageText;
237  bool m_bIsParsed;
238};
239
240FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst);
241void NormalizeString(CFX_WideString& str);
242void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
243void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
244                           CPDF_PageObjects* pPage,
245                           FX_BOOL bUseLF,
246                           CFX_PtrArray* pObjArray);
247
248#endif  // CORE_SRC_FPDFTEXT_TEXT_INT_H_
249