1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#ifndef _PDF_TEXT_INT_H_
8#define _PDF_TEXT_INT_H_
9class CPDF_TextParseOptions : public CFX_Object
10{
11public:
12    CPDF_TextParseOptions();
13    FX_BOOL			m_bCheckObjectOrder;
14    FX_BOOL			m_bCheckDirection;
15    int				m_nCheckSameObject;
16};
// Forward declarations so the names can be used as pointer/reference types
// before (or without) their full definitions being visible.
class CPDF_DocProgressiveSearch;
class CPDF_LinkExtract;
class CPDF_TextPage;
class CPDF_TextPageFind;
// Character-kind codes.
// NOTE(review): presumably these are the values stored in
// PAGECHAR_INFO::m_Flag -- confirm against the implementation file.
// The negative value is parenthesized so the macro always expands to a single
// primary expression regardless of the surrounding operators.
#define FPDFTEXT_CHAR_ERROR         (-1)
#define FPDFTEXT_CHAR_NORMAL        0
#define FPDFTEXT_CHAR_GENERATED     1
#define FPDFTEXT_CHAR_UNUNICODE     2
#define FPDFTEXT_CHAR_HYPHEN        3
#define FPDFTEXT_CHAR_PIECE         4

// Marked-content status codes.
// NOTE(review): presumably the result values of
// CPDF_TextPage::PreMarkedContent() -- confirm against the implementation.
#define FPDFTEXT_MC_PASS            0
#define FPDFTEXT_MC_DONE            1
#define FPDFTEXT_MC_DELAY           2
30typedef struct _PAGECHAR_INFO: public CFX_Object {
31    int					m_CharCode;
32    FX_WCHAR			m_Unicode;
33    FX_FLOAT			m_OriginX;
34    FX_FLOAT			m_OriginY;
35    FX_INT32			m_Flag;
36    CFX_FloatRect		m_CharBox;
37    CPDF_TextObject*	m_pTextObj;
38    CFX_AffineMatrix	m_Matrix;
39    int					m_Index;
40} PAGECHAR_INFO;
41typedef	CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;
// A (start, count) pair of integers describing one segment.
struct FPDF_SEGMENT {
    int m_Start;    // first index of the segment
    int m_nCount;   // number of items in the segment
};
46typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;
47typedef struct {
48    CPDF_TextObject*	m_pTextObj;
49    CFX_AffineMatrix	m_formMatrix;
50} PDFTEXT_Obj;
51typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
52class CPDF_TextPage: public IPDF_TextPage
53{
54public:
55    CPDF_TextPage(const CPDF_Page* pPage, int flags = 0);
56    CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0);
57    CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);
58    virtual FX_BOOL					ParseTextPage();
59    virtual void					NormalizeObjects(FX_BOOL bNormalize);
60    virtual	FX_BOOL					IsParsered() const
61    {
62        return m_IsParsered;
63    }
64    virtual ~CPDF_TextPage() {};
65public:
66    virtual int CharIndexFromTextIndex(int TextIndex)const ;
67    virtual int TextIndexFromCharIndex(int CharIndex)const;
68    virtual int						CountChars() const;
69    virtual	void					GetCharInfo(int index, FPDF_CHAR_INFO & info) const;
70    virtual void					GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const;
71    virtual int						GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const;
72    virtual int						GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance,
73            FX_FLOAT yTorelance) const;
74    virtual CFX_WideString			GetTextByRect(CFX_FloatRect rect) const;
75    virtual void					GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const;
76    virtual	int						GetOrderByDirection(int order, int direction) const;
77    virtual	CFX_WideString			GetPageText(int start = 0, int nCount = -1) const;
78
79    virtual int						CountRects(int start, int nCount);
80    virtual	void					GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top
81                                            , FX_FLOAT& right, FX_FLOAT &bottom) const;
82    virtual FX_BOOL					GetBaselineRotate(int rectIndex, int& Rotate);
83    virtual FX_BOOL					GetBaselineRotate(CFX_FloatRect rect, int& Rotate);
84    virtual	int						CountBoundedSegments(FX_FLOAT left, FX_FLOAT top,
85            FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE);
86    virtual	void					GetBoundedSegment(int index, int& start, int& count) const;
87    virtual int						GetWordBreak(int index, int direction) const;
88public:
89    const	PAGECHAR_InfoArray*		GetCharList() const
90    {
91        return &m_charList;
92    }
93    static	FX_BOOL					IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2);
94    static	FX_BOOL					IsLetter(FX_WCHAR unicode);
95private:
96    FX_BOOL							IsHyphen(FX_WCHAR curChar);
97    FX_BOOL							IsControlChar(PAGECHAR_INFO* pCharInfo);
98    FX_BOOL							GetBaselineRotate(int start, int end, int& Rotate);
99    void							ProcessObject();
100    void							ProcessFormObject(CPDF_FormObject*	pFormObj, CFX_AffineMatrix formMatrix);
101    void							ProcessTextObject(PDFTEXT_Obj pObj);
102    void							ProcessTextObject(CPDF_TextObject*	pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos);
103    int								ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix);
104    FX_BOOL							GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
105    FX_BOOL							IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);
106    FX_BOOL							IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
107    int								GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;
108    void							CloseTempLine();
109    void							OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str);
110    FX_INT32	PreMarkedContent(PDFTEXT_Obj pObj);
111    void		ProcessMarkedContent(PDFTEXT_Obj pObj);
112    void		CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const;
113    void		FindPreviousTextObject(void);
114    void		AddCharInfoByLRDirection(CFX_WideString& str, int i);
115    void		AddCharInfoByRLDirection(CFX_WideString& str, int i);
116    FX_INT32	GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
117    FX_INT32	FindTextlineFlowDirection();
118protected:
119    CPDFText_ParseOptions			m_ParseOptions;
120    CFX_WordArray					m_CharIndex;
121    const CPDF_PageObjects*			m_pPage;
122    PAGECHAR_InfoArray				m_charList;
123    CFX_WideTextBuf					m_TextBuf;
124    PAGECHAR_InfoArray				m_TempCharList;
125    CFX_WideTextBuf					m_TempTextBuf;
126    int								m_parserflag;
127    CPDF_TextObject*				m_pPreTextObj;
128    CFX_AffineMatrix				m_perMatrix;
129    FX_BOOL							m_IsParsered;
130    CFX_AffineMatrix				m_DisplayMatrix;
131
132    SEGMENT_Array					m_Segment;
133    CFX_RectArray					m_SelRects;
134    LINEOBJ							m_LineObj;
135    FX_BOOL							m_TextlineDir;
136    CFX_FloatRect					m_CurlineRect;
137};
138class CPDF_TextPageFind: public IPDF_TextPageFind
139{
140public:
141    CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
142    virtual							~CPDF_TextPageFind() {};
143public:
144    virtual	FX_BOOL					FindFirst(CFX_WideString findwhat, int flags, int startPos = 0);
145    virtual	FX_BOOL					FindNext();
146    virtual	FX_BOOL					FindPrev();
147
148    virtual void					GetRectArray(CFX_RectArray& rects) const;
149    virtual int						GetCurOrder() const;
150    virtual int						GetMatchedCount()const;
151protected:
152    void							ExtractFindWhat(CFX_WideString findwhat);
153    FX_BOOL							IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos);
154    FX_BOOL							ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString,
155            int iSubString, FX_WCHAR chSep);
156    CFX_WideString					MakeReverse(const CFX_WideString str);
157    int								ReverseFind(CFX_WideString csPageText, CFX_WideString csWord, int nStartPos, int& WordLength);
158    int								GetCharIndex(int index) const;
159private:
160    CFX_WordArray					m_CharIndex;
161    const IPDF_TextPage*			m_pTextPage;
162    CFX_WideString					m_strText;
163    CFX_WideString					m_findWhat;
164    int								m_flags;
165    CFX_WideStringArray				m_csFindWhatArray;
166    int								m_findNextStart;
167    int								m_findPreStart;
168    FX_BOOL							m_bMatchCase;
169    FX_BOOL							m_bMatchWholeWord;
170    int								m_resStart;
171    int								m_resEnd;
172    CFX_RectArray					m_resArray;
173    FX_BOOL							m_IsFind;
174};
175class CPDF_LinkExt: public CFX_Object
176{
177public:
178    CPDF_LinkExt() {};
179    int								m_Start;
180    int								m_Count;
181    CFX_WideString					m_strUrl;
182    virtual							~CPDF_LinkExt() {};
183};
184typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
185class CPDF_LinkExtract: public IPDF_LinkExtract
186{
187public:
188    CPDF_LinkExtract();
189    virtual							~CPDF_LinkExtract();
190    virtual FX_BOOL					ExtractLinks(const IPDF_TextPage* pTextPage);
191    virtual	FX_BOOL					IsExtract() const
192    {
193        return m_IsParserd;
194    }
195public:
196    virtual int						CountLinks() const;
197    virtual	CFX_WideString			GetURL(int index) const;
198    virtual	void					GetBoundedSegment(int index, int& start, int& count) const;
199    virtual	void					GetRects(int index, CFX_RectArray& rects)const;
200protected:
201    void							parserLink();
202    void							DeleteLinkList();
203    FX_BOOL							CheckWebLink(CFX_WideString& strBeCheck);
204    FX_BOOL							CheckMailLink(CFX_WideString& str);
205    FX_BOOL							AppendToLinkList(int start, int count, CFX_WideString strUrl);
206private:
207    LINK_InfoArray					m_LinkList;
208    const CPDF_TextPage*			m_pTextPage;
209    CFX_WideString					m_strPageText;
210    FX_BOOL							m_IsParserd;
211};
212FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst);
213void NormalizeString(CFX_WideString& str);
214void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
215#endif
216