1ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// Copyright 2014 PDFium Authors. All rights reserved.
2ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// Use of this source code is governed by a BSD-style license that can be
3ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// found in the LICENSE file.
4ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
5ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
7ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_TEXT_H_
8ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define _FPDF_TEXT_H_
9ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_PARSER_
10ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#include "../fpdfapi/fpdf_parser.h"
11ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif
12ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_PAGEOBJ_H_
13ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#include "../fpdfapi/fpdf_pageobj.h"
14ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif
15ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_PAGE_
16ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#include "../fpdfapi/fpdf_page.h"
17ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif
// Forward declaration only: this header refers to the type solely through
// pointers (see IPDF_TextPage::CreateTextPage).
class CPDF_PageObjects;

// Option bits for text extraction. Each value is a distinct power of two, so
// several options can be combined with bitwise OR.
// NOTE(review): presumably these are the values accepted by the FX_DWORD
// `flags` parameter of the PDF_GetPageText* functions below - confirm against
// the implementation.
#define PDF2TXT_AUTO_ROTATE 0x01
#define PDF2TXT_AUTO_WIDTH 0x02
#define PDF2TXT_KEEP_COLUMN 0x04
#define PDF2TXT_USE_OCR 0x08
#define PDF2TXT_INCLUDE_INVISIBLE 0x10
24ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovvoid PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
25ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov                     int iMinWidth, FX_DWORD flags);
26ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovvoid PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
27ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov                             int iMinWidth, FX_DWORD flags);
28ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovvoid PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
29ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov                               FX_DWORD flags);
30ee451cb395940862dad63c85adfe8f2fd55e864cSvet GanovCFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage);
// Forward declarations of the interfaces defined later in this header.
class IPDF_TextPage;
class IPDF_LinkExtract;
class IPDF_TextPageFind;

// Character classification codes.
// NOTE(review): presumably the values stored in FPDF_CHAR_INFO::m_Flag -
// confirm against the implementation.
// The negative value is parenthesized so the macro always expands to a single
// primary expression, whatever tokens surround the use site.
#define CHAR_ERROR (-1)
#define CHAR_NORMAL 0
#define CHAR_GENERATED 1
#define CHAR_UNUNICODE 2
38ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovtypedef struct {
39ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_WCHAR			m_Unicode;
40ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_WCHAR			m_Charcode;
41ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_INT32			m_Flag;
42ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_FLOAT			m_FontSize;
43ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_FLOAT			m_OriginX;
44ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_FLOAT			m_OriginY;
45ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    CFX_FloatRect		m_CharBox;
46ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    CPDF_TextObject*	m_pTextObj;
47ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    CFX_AffineMatrix	m_Matrix;
48ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov} FPDF_CHAR_INFO;
49ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovtypedef	CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray;
// Text order codes. By their names: left-to-right/top-to-bottom,
// right-to-left/top-to-bottom and top-to-bottom/right-to-left.
#define FPDFTEXT_LRTB 0
#define FPDFTEXT_RLTB 1
#define FPDFTEXT_TBRL 2

// Direction codes; opposite directions are each other's negation.
// NOTE(review): presumably the values for the `direction` argument of
// IPDF_TextPage::GetOrderByDirection()/GetWordBreak() - confirm.
// Negative values are parenthesized so each macro always expands to a single
// primary expression, whatever tokens surround the use site.
#define FPDFTEXT_LEFT (-1)
#define FPDFTEXT_RIGHT 1
#define FPDFTEXT_UP (-2)
#define FPDFTEXT_DOWN 2
// Forward declaration only: used through a pointer by
// IPDF_TextPage::CreateReflowTextPage().
class IPDF_ReflowedPage;

// Writing-mode codes. Note that these are numbered differently from the
// FPDFTEXT_LRTB/RLTB/TBRL codes, because 0 is taken by the "unknown" value.
// FPDFTEXT_WRITINGMODE_UNKNOW is the historical (misspelled) name and is kept
// for existing callers; new code should use the correctly spelled alias.
#define FPDFTEXT_WRITINGMODE_UNKNOW 0
#define FPDFTEXT_WRITINGMODE_UNKNOWN FPDFTEXT_WRITINGMODE_UNKNOW
#define FPDFTEXT_WRITINGMODE_LRTB 1
#define FPDFTEXT_WRITINGMODE_RLTB 2
#define FPDFTEXT_WRITINGMODE_TBRL 3
62ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass CPDFText_ParseOptions : public CFX_Object
63ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{
64ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
65ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
66ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    CPDFText_ParseOptions();
67ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_BOOL			m_bGetCharCodeOnly;
68ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_BOOL			m_bNormalizeObjs;
69ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    FX_BOOL			m_bOutputHyphen;
70ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov};
71ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_TextPage : public CFX_Object
72ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{
73ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
74ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
75ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual ~IPDF_TextPage() {}
76ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    static IPDF_TextPage*	CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);
77ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    static IPDF_TextPage*	CreateTextPage(const CPDF_Page* pPage, int flags = 0);
78ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    static IPDF_TextPage*	CreateTextPage(const CPDF_PageObjects* pObjs, int flags = 0);
79ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    static IPDF_TextPage*	CreateReflowTextPage(IPDF_ReflowedPage* pRefPage);
80ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
81ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual void			NormalizeObjects(FX_BOOL bNormalize) = 0;
82ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
83ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual FX_BOOL			ParseTextPage() = 0;
84ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
85ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
86ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual FX_BOOL			IsParsered() const = 0;
87ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
88ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
89ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int CharIndexFromTextIndex(int TextIndex) const = 0;
90ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
91ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int TextIndexFromCharIndex(int CharIndex) const = 0;
92ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
93ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
94ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int				CountChars() const = 0;
95ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
96ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	void			GetCharInfo(int index, FPDF_CHAR_INFO & info) const = 0;
97ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
98ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual void			GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const = 0;
99ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
100ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
101ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
102ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int				GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0;
103ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
104ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int				GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0;
105ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
106ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	int				GetOrderByDirection(int index, int direction) const = 0;
107ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
108ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual CFX_WideString	GetTextByRect(CFX_FloatRect rect) const = 0;
109ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
110ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual void			GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const = 0;
111ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
112ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
113ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int				CountRects(int start, int nCount) = 0;
114ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
115ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	void			GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0;
116ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
117ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual FX_BOOL			GetBaselineRotate(int rectIndex, int& Rotate) = 0;
118ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
119ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual FX_BOOL			GetBaselineRotate(CFX_FloatRect rect, int& Rotate) = 0;
120ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
121ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	int				CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) = 0;
122ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
123ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	void			GetBoundedSegment(int index, int& start, int& count) const = 0;
124ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
125ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
126ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int				GetWordBreak(int index, int direction) const = 0;
127ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
128ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual CFX_WideString	GetPageText(int start = 0, int nCount = -1 ) const = 0;
129ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov};
// Search option bits; each is a distinct power of two so they can be combined
// with bitwise OR.
// NOTE(review): presumably the values for the `flags` argument of
// IPDF_TextPageFind::FindFirst() - confirm.
#define FPDFTEXT_MATCHCASE (1 << 0)
#define FPDFTEXT_MATCHWHOLEWORD (1 << 1)
#define FPDFTEXT_CONSECUTIVE (1 << 2)
133ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_TextPageFind : public CFX_Object
134ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{
135ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
136ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
137ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	~IPDF_TextPageFind() {}
138ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
139ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    static	IPDF_TextPageFind*	CreatePageFind(const IPDF_TextPage* pTextPage);
140ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
141ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
142ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	FX_BOOL				FindFirst(CFX_WideString findwhat, int flags, int startPos = 0) = 0;
143ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
144ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	FX_BOOL				FindNext() = 0;
145ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
146ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	FX_BOOL				FindPrev() = 0;
147ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
148ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual void				GetRectArray(CFX_RectArray& rects) const = 0;
149ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
150ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int					GetCurOrder() const = 0;
151ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
152ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int					GetMatchedCount() const = 0;
153ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov};
154ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_LinkExtract : public CFX_Object
155ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{
156ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
157ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
158ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	~IPDF_LinkExtract() {}
159ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
160ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    static	IPDF_LinkExtract*	CreateLinkExtract();
161ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
162ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual FX_BOOL				ExtractLinks(const IPDF_TextPage* pTextPage) = 0;
163ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic:
164ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
165ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual int					CountLinks() const = 0;
166ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
167ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual CFX_WideString		GetURL(int index) const = 0;
168ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
169ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual	void				GetBoundedSegment(int index, int& start, int& count) const = 0;
170ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov
171ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov    virtual void				GetRects(int index, CFX_RectArray& rects) const = 0;
172ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov};
173ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif
174