1// Copyright 2014 PDFium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7#ifndef PUBLIC_FPDF_TEXT_H_ 8#define PUBLIC_FPDF_TEXT_H_ 9 10// NOLINTNEXTLINE(build/include) 11#include "fpdfview.h" 12 13// Exported Functions 14#ifdef __cplusplus 15extern "C" { 16#endif 17 18// Function: FPDFText_LoadPage 19// Prepare information about all characters in a page. 20// Parameters: 21// page - Handle to the page. Returned by FPDF_LoadPage function 22// (in FPDFVIEW module). 23// Return value: 24// A handle to the text page information structure. 25// NULL if something goes wrong. 26// Comments: 27// Application must call FPDFText_ClosePage to release the text page 28// information. 29// 30FPDF_EXPORT FPDF_TEXTPAGE FPDF_CALLCONV FPDFText_LoadPage(FPDF_PAGE page); 31 32// Function: FPDFText_ClosePage 33// Release all resources allocated for a text page information 34// structure. 35// Parameters: 36// text_page - Handle to a text page information structure. 37// Returned by FPDFText_LoadPage function. 38// Return Value: 39// None. 40// 41FPDF_EXPORT void FPDF_CALLCONV FPDFText_ClosePage(FPDF_TEXTPAGE text_page); 42 43// Function: FPDFText_CountChars 44// Get number of characters in a page. 45// Parameters: 46// text_page - Handle to a text page information structure. 47// Returned by FPDFText_LoadPage function. 48// Return value: 49// Number of characters in the page. Return -1 for error. 50// Generated characters, like additional space characters, new line 51// characters, are also counted. 52// Comments: 53// Characters in a page form a "stream", inside the stream, each 54// character has an index. 55// We will use the index parameters in many of FPDFTEXT functions. The 56// first character in the page 57// has an index value of zero. 58// 59FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountChars(FPDF_TEXTPAGE text_page); 60 61// Function: FPDFText_GetUnicode 62// Get Unicode of a character in a page. 63// Parameters: 64// text_page - Handle to a text page information structure. 65// Returned by FPDFText_LoadPage function. 66// index - Zero-based index of the character. 67// Return value: 68// The Unicode of the particular character. 69// If a character is not encoded in Unicode and Foxit engine can't 70// convert to Unicode, 71// the return value will be zero. 72// 73FPDF_EXPORT unsigned int FPDF_CALLCONV 74FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index); 75 76// Function: FPDFText_GetFontSize 77// Get the font size of a particular character. 78// Parameters: 79// text_page - Handle to a text page information structure. 80// Returned by FPDFText_LoadPage function. 81// index - Zero-based index of the character. 82// Return value: 83// The font size of the particular character, measured in points (about 84// 1/72 inch). 85// This is the typographic size of the font (so called "em size"). 86// 87FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, 88 int index); 89 90// Function: FPDFText_GetCharBox 91// Get bounding box of a particular character. 92// Parameters: 93// text_page - Handle to a text page information structure. 94// Returned by FPDFText_LoadPage function. 95// index - Zero-based index of the character. 96// left - Pointer to a double number receiving left position 97// of the character box. 98// right - Pointer to a double number receiving right position 99// of the character box. 100// bottom - Pointer to a double number receiving bottom position 101// of the character box. 102// top - Pointer to a double number receiving top position of 103// the character box. 104// Return Value: 105// On success, return TRUE and fill in |left|, |right|, |bottom|, and 106// |top|. If |text_page| is invalid, or if |index| is out of bounds, 107// then return FALSE, and the out parameters remain unmodified. 108// Comments: 109// All positions are measured in PDF "user space". 110// 111FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetCharBox(FPDF_TEXTPAGE text_page, 112 int index, 113 double* left, 114 double* right, 115 double* bottom, 116 double* top); 117 118// Function: FPDFText_GetCharOrigin 119// Get origin of a particular character. 120// Parameters: 121// text_page - Handle to a text page information structure. 122// Returned by FPDFText_LoadPage function. 123// index - Zero-based index of the character. 124// x - Pointer to a double number receiving x coordinate of 125// the character origin. 126// y - Pointer to a double number receiving y coordinate of 127// the character origin. 128// Return Value: 129// Whether the call succeeded. If false, x and y are unchanged. 130// Comments: 131// All positions are measured in PDF "user space". 132// 133FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV 134FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page, 135 int index, 136 double* x, 137 double* y); 138 139// Function: FPDFText_GetCharIndexAtPos 140// Get the index of a character at or nearby a certain position on the 141// page. 142// Parameters: 143// text_page - Handle to a text page information structure. 144// Returned by FPDFText_LoadPage function. 145// x - X position in PDF "user space". 146// y - Y position in PDF "user space". 147// xTolerance - An x-axis tolerance value for character hit 148// detection, in point unit. 149// yTolerance - A y-axis tolerance value for character hit 150// detection, in point unit. 151// Return Value: 152// The zero-based index of the character at, or nearby the point (x,y). 153// If there is no character at or nearby the point, return value will 154// be -1. 155// If an error occurs, -3 will be returned. 156// 157FPDF_EXPORT int FPDF_CALLCONV 158FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page, 159 double x, 160 double y, 161 double xTolerance, 162 double yTolerance); 163 164// Function: FPDFText_GetText 165// Extract unicode text string from the page. 166// Parameters: 167// text_page - Handle to a text page information structure. 168// Returned by FPDFText_LoadPage function. 169// start_index - Index for the start characters. 170// count - Number of characters to be extracted. 171// result - A buffer (allocated by application) receiving the 172// extracted unicodes. 173// The size of the buffer must be able to hold the 174// number of characters plus a terminator. 175// Return Value: 176// Number of characters written into the result buffer, including the 177// trailing terminator. 178// Comments: 179// This function ignores characters without unicode information. 180// 181FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page, 182 int start_index, 183 int count, 184 unsigned short* result); 185 186// Function: FPDFText_CountRects 187// Count number of rectangular areas occupied by a segment of texts. 188// Parameters: 189// text_page - Handle to a text page information structure. 190// Returned by FPDFText_LoadPage function. 191// start_index - Index for the start characters. 192// count - Number of characters. 193// Return value: 194// Number of rectangles. Zero for error. 195// Comments: 196// This function, along with FPDFText_GetRect can be used by 197// applications to detect the position 198// on the page for a text segment, so proper areas can be highlighted 199// or something. 200// FPDFTEXT will automatically merge small character boxes into bigger 201// one if those characters 202// are on the same line and use same font settings. 203// 204FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page, 205 int start_index, 206 int count); 207 208// Function: FPDFText_GetRect 209// Get a rectangular area from the result generated by 210// FPDFText_CountRects. 211// Parameters: 212// text_page - Handle to a text page information structure. 213// Returned by FPDFText_LoadPage function. 214// rect_index - Zero-based index for the rectangle. 215// left - Pointer to a double value receiving the rectangle 216// left boundary. 217// top - Pointer to a double value receiving the rectangle 218// top boundary. 219// right - Pointer to a double value receiving the rectangle 220// right boundary. 221// bottom - Pointer to a double value receiving the rectangle 222// bottom boundary. 223// Return Value: 224// On success, return TRUE and fill in |left|, |top|, |right|, and 225// |bottom|. If |link_page| is invalid then return FALSE, and the out 226// parameters remain unmodified. If |link_page| is valid but 227// |link_index| is out of bounds, then return FALSE and set the out 228// parameters to 0. 229// 230FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetRect(FPDF_TEXTPAGE text_page, 231 int rect_index, 232 double* left, 233 double* top, 234 double* right, 235 double* bottom); 236 237// Function: FPDFText_GetBoundedText 238// Extract unicode text within a rectangular boundary on the page. 239// Parameters: 240// text_page - Handle to a text page information structure. 241// Returned by FPDFText_LoadPage function. 242// left - Left boundary. 243// top - Top boundary. 244// right - Right boundary. 245// bottom - Bottom boundary. 246// buffer - A unicode buffer. 247// buflen - Number of characters (not bytes) for the buffer, 248// excluding an additional terminator. 249// Return Value: 250// If buffer is NULL or buflen is zero, return number of characters 251// (not bytes) of text present within 252// the rectangle, excluding a terminating NUL. Generally you should 253// pass a buffer at least one larger 254// than this if you want a terminating NUL, which will be provided if 255// space is available. 256// Otherwise, return number of characters copied into the buffer, 257// including the terminating NUL 258// when space for it is available. 259// Comment: 260// If the buffer is too small, as much text as will fit is copied into 261// it. 262// 263FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page, 264 double left, 265 double top, 266 double right, 267 double bottom, 268 unsigned short* buffer, 269 int buflen); 270 271// Flags used by FPDFText_FindStart function. 272#define FPDF_MATCHCASE \ 273 0x00000001 // If not set, it will not match case by default. 274#define FPDF_MATCHWHOLEWORD \ 275 0x00000002 // If not set, it will not match the whole word by default. 276 277// Function: FPDFText_FindStart 278// Start a search. 279// Parameters: 280// text_page - Handle to a text page information structure. 281// Returned by FPDFText_LoadPage function. 282// findwhat - A unicode match pattern. 283// flags - Option flags. 284// start_index - Start from this character. -1 for end of the page. 285// Return Value: 286// A handle for the search context. FPDFText_FindClose must be called 287// to release this handle. 288// 289FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV 290FPDFText_FindStart(FPDF_TEXTPAGE text_page, 291 FPDF_WIDESTRING findwhat, 292 unsigned long flags, 293 int start_index); 294 295// Function: FPDFText_FindNext 296// Search in the direction from page start to end. 297// Parameters: 298// handle - A search context handle returned by 299// FPDFText_FindStart. 300// Return Value: 301// Whether a match is found. 302// 303FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindNext(FPDF_SCHHANDLE handle); 304 305// Function: FPDFText_FindPrev 306// Search in the direction from page end to start. 307// Parameters: 308// handle - A search context handle returned by 309// FPDFText_FindStart. 310// Return Value: 311// Whether a match is found. 312// 313FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindPrev(FPDF_SCHHANDLE handle); 314 315// Function: FPDFText_GetSchResultIndex 316// Get the starting character index of the search result. 317// Parameters: 318// handle - A search context handle returned by 319// FPDFText_FindStart. 320// Return Value: 321// Index for the starting character. 322// 323FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle); 324 325// Function: FPDFText_GetSchCount 326// Get the number of matched characters in the search result. 327// Parameters: 328// handle - A search context handle returned by 329// FPDFText_FindStart. 330// Return Value: 331// Number of matched characters. 332// 333FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchCount(FPDF_SCHHANDLE handle); 334 335// Function: FPDFText_FindClose 336// Release a search context. 337// Parameters: 338// handle - A search context handle returned by 339// FPDFText_FindStart. 340// Return Value: 341// None. 342// 343FPDF_EXPORT void FPDF_CALLCONV FPDFText_FindClose(FPDF_SCHHANDLE handle); 344 345// Function: FPDFLink_LoadWebLinks 346// Prepare information about weblinks in a page. 347// Parameters: 348// text_page - Handle to a text page information structure. 349// Returned by FPDFText_LoadPage function. 350// Return Value: 351// A handle to the page's links information structure. 352// NULL if something goes wrong. 353// Comments: 354// Weblinks are those links implicitly embedded in PDF pages. PDF also 355// has a type of 356// annotation called "link", FPDFTEXT doesn't deal with that kind of 357// link. 358// FPDFTEXT weblink feature is useful for automatically detecting links 359// in the page 360// contents. For example, things like "http://www.foxitsoftware.com" 361// will be detected, 362// so applications can allow user to click on those characters to 363// activate the link, 364// even the PDF doesn't come with link annotations. 365// 366// FPDFLink_CloseWebLinks must be called to release resources. 367// 368FPDF_EXPORT FPDF_PAGELINK FPDF_CALLCONV 369FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page); 370 371// Function: FPDFLink_CountWebLinks 372// Count number of detected web links. 373// Parameters: 374// link_page - Handle returned by FPDFLink_LoadWebLinks. 375// Return Value: 376// Number of detected web links. 377// 378FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountWebLinks(FPDF_PAGELINK link_page); 379 380// Function: FPDFLink_GetURL 381// Fetch the URL information for a detected web link. 382// Parameters: 383// link_page - Handle returned by FPDFLink_LoadWebLinks. 384// link_index - Zero-based index for the link. 385// buffer - A unicode buffer for the result. 386// buflen - Number of characters (not bytes) for the buffer, 387// including an additional terminator. 388// Return Value: 389// If |buffer| is NULL or |buflen| is zero, return the number of 390// characters (not bytes) needed to buffer the result (an additional 391// terminator is included in this count). 392// Otherwise, copy the result into |buffer|, truncating at |buflen| if 393// the result is too large to fit, and return the number of characters 394// actually copied into the buffer (the additional terminator is also 395// included in this count). 396// If |link_index| does not correspond to a valid link, then the result 397// is an empty string. 398// 399FPDF_EXPORT int FPDF_CALLCONV FPDFLink_GetURL(FPDF_PAGELINK link_page, 400 int link_index, 401 unsigned short* buffer, 402 int buflen); 403 404// Function: FPDFLink_CountRects 405// Count number of rectangular areas for the link. 406// Parameters: 407// link_page - Handle returned by FPDFLink_LoadWebLinks. 408// link_index - Zero-based index for the link. 409// Return Value: 410// Number of rectangular areas for the link. If |link_index| does 411// not correspond to a valid link, then 0 is returned. 412// 413FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountRects(FPDF_PAGELINK link_page, 414 int link_index); 415 416// Function: FPDFLink_GetRect 417// Fetch the boundaries of a rectangle for a link. 418// Parameters: 419// link_page - Handle returned by FPDFLink_LoadWebLinks. 420// link_index - Zero-based index for the link. 421// rect_index - Zero-based index for a rectangle. 422// left - Pointer to a double value receiving the rectangle 423// left boundary. 424// top - Pointer to a double value receiving the rectangle 425// top boundary. 426// right - Pointer to a double value receiving the rectangle 427// right boundary. 428// bottom - Pointer to a double value receiving the rectangle 429// bottom boundary. 430// Return Value: 431// On success, return TRUE and fill in |left|, |top|, |right|, and 432// |bottom|. If |link_page| is invalid or if |link_index| does not 433// correspond to a valid link, then return FALSE, and the out 434// parameters remain unmodified. 435// 436FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFLink_GetRect(FPDF_PAGELINK link_page, 437 int link_index, 438 int rect_index, 439 double* left, 440 double* top, 441 double* right, 442 double* bottom); 443 444// Function: FPDFLink_CloseWebLinks 445// Release resources used by weblink feature. 446// Parameters: 447// link_page - Handle returned by FPDFLink_LoadWebLinks. 448// Return Value: 449// None. 450// 451FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page); 452 453#ifdef __cplusplus 454} 455#endif 456 457#endif // PUBLIC_FPDF_TEXT_H_ 458