1ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// Copyright 2014 PDFium Authors. All rights reserved. 2ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// Use of this source code is governed by a BSD-style license that can be 3ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// found in the LICENSE file. 4ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 5ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 7ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_TEXT_H_ 8ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define _FPDF_TEXT_H_ 9ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_PARSER_ 10ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#include "../fpdfapi/fpdf_parser.h" 11ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif 12ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_PAGEOBJ_H_ 13ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#include "../fpdfapi/fpdf_pageobj.h" 14ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif 15ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#ifndef _FPDF_PAGE_ 16ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#include "../fpdfapi/fpdf_page.h" 17ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif 18ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass CPDF_PageObjects; 19ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define PDF2TXT_AUTO_ROTATE 1 20ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define PDF2TXT_AUTO_WIDTH 2 21ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define PDF2TXT_KEEP_COLUMN 4 22ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define PDF2TXT_USE_OCR 8 23ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define PDF2TXT_INCLUDE_INVISIBLE 16 24ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovvoid PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 25ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov int iMinWidth, FX_DWORD flags); 26ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovvoid PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 27ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov int iMinWidth, FX_DWORD flags); 28ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovvoid PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 29ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_DWORD flags); 30ee451cb395940862dad63c85adfe8f2fd55e864cSvet GanovCFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage); 31ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_TextPage; 32ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_LinkExtract; 33ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_TextPageFind; 34ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define CHAR_ERROR -1 35ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define CHAR_NORMAL 0 36ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define CHAR_GENERATED 1 37ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define CHAR_UNUNICODE 2 38ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovtypedef struct { 39ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_WCHAR m_Unicode; 40ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_WCHAR m_Charcode; 41ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_INT32 m_Flag; 42ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_FLOAT m_FontSize; 43ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_FLOAT m_OriginX; 44ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_FLOAT m_OriginY; 45ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov CFX_FloatRect m_CharBox; 46ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov CPDF_TextObject* m_pTextObj; 47ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov CFX_AffineMatrix m_Matrix; 48ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov} FPDF_CHAR_INFO; 49ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovtypedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; 50ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_LRTB 0 51ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_RLTB 1 52ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_TBRL 2 53ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_LEFT -1 54ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_RIGHT 1 55ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_UP -2 56ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_DOWN 2 57ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_ReflowedPage; 58ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_WRITINGMODE_UNKNOW 0 59ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_WRITINGMODE_LRTB 1 60ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_WRITINGMODE_RLTB 2 61ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_WRITINGMODE_TBRL 3 62ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass CPDFText_ParseOptions : public CFX_Object 63ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{ 64ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 65ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 66ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov CPDFText_ParseOptions(); 67ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_BOOL m_bGetCharCodeOnly; 68ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_BOOL m_bNormalizeObjs; 69ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov FX_BOOL m_bOutputHyphen; 70ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov}; 71ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_TextPage : public CFX_Object 72ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{ 73ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 74ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 75ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual ~IPDF_TextPage() {} 76ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); 77ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); 78ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov static IPDF_TextPage* CreateTextPage(const CPDF_PageObjects* pObjs, int flags = 0); 79ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage); 80ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 81ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void NormalizeObjects(FX_BOOL bNormalize) = 0; 82ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 83ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL ParseTextPage() = 0; 84ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 85ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 86ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL IsParsered() const = 0; 87ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 88ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 89ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int CharIndexFromTextIndex(int TextIndex) const = 0; 90ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 91ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int TextIndexFromCharIndex(int CharIndex) const = 0; 92ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 93ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 94ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int CountChars() const = 0; 95ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 96ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const = 0; 97ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 98ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const = 0; 99ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 100ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 101ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 102ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; 103ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 104ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; 105ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 106ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int GetOrderByDirection(int index, int direction) const = 0; 107ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 108ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual CFX_WideString GetTextByRect(CFX_FloatRect rect) const = 0; 109ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 110ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const = 0; 111ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 112ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 113ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int CountRects(int start, int nCount) = 0; 114ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 115ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0; 116ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 117ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; 118ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 119ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL GetBaselineRotate(CFX_FloatRect rect, int& Rotate) = 0; 120ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 121ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) = 0; 122ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 123ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 124ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 125ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 126ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int GetWordBreak(int index, int direction) const = 0; 127ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 128ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual CFX_WideString GetPageText(int start = 0, int nCount = -1 ) const = 0; 129ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov}; 130ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_MATCHCASE 0x00000001 131ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_MATCHWHOLEWORD 0x00000002 132ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#define FPDFTEXT_CONSECUTIVE 0x00000004 133ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_TextPageFind : public CFX_Object 134ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{ 135ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 136ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 137ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual ~IPDF_TextPageFind() {} 138ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 139ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); 140ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 141ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 142ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL FindFirst(CFX_WideString findwhat, int flags, int startPos = 0) = 0; 143ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 144ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL FindNext() = 0; 145ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 146ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL FindPrev() = 0; 147ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 148ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetRectArray(CFX_RectArray& rects) const = 0; 149ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 150ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int GetCurOrder() const = 0; 151ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 152ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int GetMatchedCount() const = 0; 153ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov}; 154ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovclass IPDF_LinkExtract : public CFX_Object 155ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov{ 156ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 157ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 158ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual ~IPDF_LinkExtract() {} 159ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 160ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov static IPDF_LinkExtract* CreateLinkExtract(); 161ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 162ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; 163ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganovpublic: 164ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 165ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual int CountLinks() const = 0; 166ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 167ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual CFX_WideString GetURL(int index) const = 0; 168ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 169ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 170ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov 171ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov virtual void GetRects(int index, CFX_RectArray& rects) const = 0; 172ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov}; 173ee451cb395940862dad63c85adfe8f2fd55e864cSvet Ganov#endif 174