fpdf_text.h revision ac3d58cff7c80b0ef56bf55130d91da17cbaa3c4
1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#ifndef PUBLIC_FPDF_TEXT_H_
8#define PUBLIC_FPDF_TEXT_H_
9
10#include "fpdfview.h"
11
12// Exported Functions
13#ifdef __cplusplus
14extern "C" {
15#endif
16
17// Function: FPDFText_LoadPage
18//          Prepare information about all characters in a page.
19// Parameters:
20//          page    -   Handle to the page, as returned by the FPDF_LoadPage
21//                      function (in the FPDFVIEW module).
22// Return value:
23//          A handle to the text page information structure, or NULL if
24//          something goes wrong.
25// Comments:
26//          The application must call FPDFText_ClosePage to release the text
27//          page information.
28//
29DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page);
30
31// Function: FPDFText_ClosePage
32//          Release all resources allocated for a text page information
33//          structure.
34// Parameters:
35//          text_page   -   Handle to a text page information structure,
36//                          as returned by the FPDFText_LoadPage function.
37// Return value:
38//          None.
39//
40DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
41
42// Function: FPDFText_CountChars
43//          Get the number of characters in a page.
44// Parameters:
45//          text_page   -   Handle to a text page information structure,
46//                          as returned by the FPDFText_LoadPage function.
47// Return value:
48//          Number of characters in the page, or -1 on error.
49//          Generated characters, such as additional space characters and
50//          new line characters, are also counted.
51// Comments:
52//          Characters in a page form a "stream"; inside the stream, each
53//          character has an index.
54//          These index values are used as parameters by many of the
55//          FPDFTEXT functions. The first character in the page has an
56//          index value of zero.
57//
58DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page);
59
60// Function: FPDFText_GetUnicode
61//          Get the Unicode value of a character in a page.
62// Parameters:
63//          text_page   -   Handle to a text page information structure,
64//                          as returned by the FPDFText_LoadPage function.
65//          index       -   Zero-based index of the character.
66// Return value:
67//          The Unicode value of the specified character.
68//          If the character is not encoded in Unicode and the Foxit
69//          engine can't convert it to Unicode, the return value will
70//          be zero.
71//
72DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page,
73                                                   int index);
74
75// Function: FPDFText_GetFontSize
76//          Get the font size of a particular character.
77// Parameters:
78//          text_page   -   Handle to a text page information structure,
79//                          as returned by the FPDFText_LoadPage function.
80//          index       -   Zero-based index of the character.
81// Return value:
82//          The font size of the specified character, measured in points
83//          (about 1/72 inch).
84//          This is the typographic size of the font (so-called "em size").
85//
86DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
87                                              int index);
88
89// Function: FPDFText_GetCharBox
90//          Get the bounding box of a particular character.
91// Parameters:
92//          text_page   -   Handle to a text page information structure,
93//                          as returned by the FPDFText_LoadPage function.
94//          index       -   Zero-based index of the character.
95//          left        -   Pointer to a double receiving the left position
96//                          of the character box.
97//          right       -   Pointer to a double receiving the right position
98//                          of the character box.
99//          bottom      -   Pointer to a double receiving the bottom position
100//                          of the character box.
101//          top         -   Pointer to a double receiving the top position
102//                          of the character box.
103// Return value:
104//          None.
105// Comments:
106//          All positions are measured in PDF "user space".
107//
108DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
109                                           int index,
110                                           double* left,
111                                           double* right,
112                                           double* bottom,
113                                           double* top);
114
115// Function: FPDFText_GetCharIndexAtPos
116//          Get the index of a character at or near a certain position on
117//          the page.
118// Parameters:
119//          text_page   -   Handle to a text page information structure,
120//                          as returned by the FPDFText_LoadPage function.
121//          x           -   X position in PDF "user space".
122//          y           -   Y position in PDF "user space".
123//          xTolerance  -   An x-axis tolerance value for character hit
124//                          detection, in points.
125//          yTolerance  -   A y-axis tolerance value for character hit
126//                          detection, in points.
127// Return value:
128//          The zero-based index of the character at or near the point
129//          (x, y). If there is no character at or near the point, the
130//          return value will be -1.
131//          If an error occurs, -3 will be returned.
132//
133DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
134                                                 double x,
135                                                 double y,
136                                                 double xTolerance,
137                                                 double yTolerance);
138
139// Function: FPDFText_GetText
140//          Extract a Unicode text string from the page.
141// Parameters:
142//          text_page   -   Handle to a text page information structure,
143//                          as returned by the FPDFText_LoadPage function.
144//          start_index -   Index of the starting character.
145//          count       -   Number of characters to be extracted.
146//          result      -   A buffer (allocated by the application) receiving
147//                          the extracted Unicode characters.
148//                          The size of the buffer must be able to hold the
149//                          number of characters plus a terminator.
150// Return value:
151//          Number of characters written into the result buffer, including
152//          the trailing terminator.
153// Comments:
154//          This function ignores characters without Unicode information.
155//
156DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page,
157                                       int start_index,
158                                       int count,
159                                       unsigned short* result);
160
161// Function: FPDFText_CountRects
162//          Count the rectangular areas occupied by a segment of text.
163// Parameters:
164//          text_page   -   Handle to a text page information structure,
165//                          as returned by the FPDFText_LoadPage function.
166//          start_index -   Index of the starting character.
167//          count       -   Number of characters.
168// Return value:
169//          Number of rectangles. Zero for error.
170// Comments:
171//          This function, along with FPDFText_GetRect, can be used by
172//          applications to detect the position of a text segment on
173//          the page, so that the proper areas can be highlighted, for
174//          example.
175//          FPDFTEXT will automatically merge small character boxes
176//          into a bigger one if those characters are on the same line
177//          and use the same font settings.
178//
179DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page,
180                                          int start_index,
181                                          int count);
182
183// Function: FPDFText_GetRect
184//          Get a rectangular area from the result generated by
185//          FPDFText_CountRects.
186// Parameters:
187//          text_page   -   Handle to a text page information structure,
188//                          as returned by the FPDFText_LoadPage function.
189//          rect_index  -   Zero-based index of the rectangle.
190//          left        -   Pointer to a double value receiving the left
191//                          boundary of the rectangle.
192//          top         -   Pointer to a double value receiving the top
193//                          boundary of the rectangle.
194//          right       -   Pointer to a double value receiving the right
195//                          boundary of the rectangle.
196//          bottom      -   Pointer to a double value receiving the bottom
197//                          boundary of the rectangle.
198// Return value:
199//          None.
200//
201DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page,
202                                        int rect_index,
203                                        double* left,
204                                        double* top,
205                                        double* right,
206                                        double* bottom);
207
208// Function: FPDFText_GetBoundedText
209//          Extract Unicode text within a rectangular boundary on the page.
210// Parameters:
211//          text_page   -   Handle to a text page information structure,
212//                          as returned by the FPDFText_LoadPage function.
213//          left        -   Left boundary.
214//          top         -   Top boundary.
215//          right       -   Right boundary.
216//          bottom      -   Bottom boundary.
217//          buffer      -   A Unicode buffer.
218//          buflen      -   Number of characters (not bytes) for the buffer,
219//                          excluding an additional terminator.
220// Return value:
221//          If buffer is NULL or buflen is zero, return the number of
222//          characters (not bytes) of text present within the rectangle,
223//          excluding a terminating NUL. Generally you should pass a
224//          buffer at least one character larger than this if you want a
225//          terminating NUL, which will be provided if space is
226//          available.
227//          Otherwise, return the number of characters copied into the
228//          buffer, including the terminating NUL when space for it is
229//          available.
230// Comments:
231//          If the buffer is too small, as much text as will fit is copied
232//          into it.
233//
234DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
235                                              double left,
236                                              double top,
237                                              double right,
238                                              double bottom,
239                                              unsigned short* buffer,
240                                              int buflen);
241
242// Flags used by the FPDFText_FindStart function.
243#define FPDF_MATCHCASE \
244  0x00000001  // If not set, the search will not match case by default.
245#define FPDF_MATCHWHOLEWORD \
246  0x00000002  // If not set, the search will not match whole words by default.
247
248// Function: FPDFText_FindStart
249//          Start a search.
250// Parameters:
251//          text_page   -   Handle to a text page information structure,
252//                          as returned by the FPDFText_LoadPage function.
253//          findwhat    -   A Unicode match pattern.
254//          flags       -   Option flags (FPDF_MATCHCASE, FPDF_MATCHWHOLEWORD).
255//          start_index -   Start from this character. -1 for end of the page.
256// Return value:
257//          A handle for the search context. FPDFText_FindClose must be called
258//          to release this handle.
259//
260DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page,
261                                                    FPDF_WIDESTRING findwhat,
262                                                    unsigned long flags,
263                                                    int start_index);
264
265// Function: FPDFText_FindNext
266//          Search in the direction from the start of the page to its end.
267// Parameters:
268//          handle      -   A search context handle returned by
269//                          FPDFText_FindStart.
270// Return value:
271//          Whether a match is found.
272//
273DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle);
274
275// Function: FPDFText_FindPrev
276//          Search in the direction from the end of the page to its start.
277// Parameters:
278//          handle      -   A search context handle returned by
279//                          FPDFText_FindStart.
280// Return value:
281//          Whether a match is found.
282//
283DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle);
284
285// Function: FPDFText_GetSchResultIndex
286//          Get the starting character index of the search result.
287// Parameters:
288//          handle      -   A search context handle returned by
289//                          FPDFText_FindStart.
290// Return value:
291//          Index of the starting character.
292//
293DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
294
295// Function: FPDFText_GetSchCount
296//          Get the number of matched characters in the search result.
297// Parameters:
298//          handle      -   A search context handle returned by
299//                          FPDFText_FindStart.
300// Return value:
301//          Number of matched characters.
302//
303DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
304
305// Function: FPDFText_FindClose
306//          Release a search context.
307// Parameters:
308//          handle      -   A search context handle returned by
309//                          FPDFText_FindStart.
310// Return value:
311//          None.
312//
313DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle);
314
315// Function: FPDFLink_LoadWebLinks
316//          Prepare information about weblinks in a page.
317// Parameters:
318//          text_page   -   Handle to a text page information structure,
319//                          as returned by the FPDFText_LoadPage function.
320// Return value:
321//          A handle to the page's links information structure, or NULL if
322//          something goes wrong.
323// Comments:
324//          Weblinks are those links implicitly embedded in PDF pages. PDF
325//          also has a type of annotation called "link"; FPDFTEXT doesn't
326//          deal with that kind of link.
327//
328//          The FPDFTEXT weblink feature is useful for automatically
329//          detecting links in the page contents. For example, things like
330//          "http://www.foxitsoftware.com" will be detected, so
331//          applications can allow the user to click on those characters
332//          to activate the link, even if the PDF doesn't come with link
333//          annotations.
334//
335//          FPDFLink_CloseWebLinks must be called to release the resources
336//          used by the weblink feature.
337//
338DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
339
340// Function: FPDFLink_CountWebLinks
341//          Count the number of detected web links.
342// Parameters:
343//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
344// Return value:
345//          Number of detected web links.
346//
347DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
348
349// Function: FPDFLink_GetURL
350//          Fetch the URL information for a detected web link.
351// Parameters:
352//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
353//          link_index  -   Zero-based index of the link.
354//          buffer      -   A Unicode buffer.
355//          buflen      -   Number of characters (not bytes) for the buffer,
356//                          including an additional terminator.
357// Return value:
358//          If buffer is NULL or buflen is zero, return the number of characters
359//          needed (not bytes; an additional terminator is also counted).
360//          Otherwise, return the number of characters copied into the buffer.
361//
362DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page,
363                                      int link_index,
364                                      unsigned short* buffer,
365                                      int buflen);
366
367// Function: FPDFLink_CountRects
368//          Count the number of rectangular areas for the link.
369// Parameters:
370//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
371//          link_index  -   Zero-based index of the link.
372// Return value:
373//          Number of rectangular areas for the link.
374//
375DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page,
376                                          int link_index);
377
378// Function: FPDFLink_GetRect
379//          Fetch the boundaries of a rectangle for a link.
380// Parameters:
381//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
382//          link_index  -   Zero-based index of the link.
383//          rect_index  -   Zero-based index of the rectangle.
384//          left        -   Pointer to a double value receiving the left
385//                          boundary of the rectangle.
386//          top         -   Pointer to a double value receiving the top
387//                          boundary of the rectangle.
388//          right       -   Pointer to a double value receiving the right
389//                          boundary of the rectangle.
390//          bottom      -   Pointer to a double value receiving the bottom
391//                          boundary of the rectangle.
392// Return value:
393//          None.
394//
395DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page,
396                                        int link_index,
397                                        int rect_index,
398                                        double* left,
399                                        double* top,
400                                        double* right,
401                                        double* bottom);
402
403// Function: FPDFLink_CloseWebLinks
404//          Release the resources used by the weblink feature.
405// Parameters:
406//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
407// Return value:
408//          None.
409//
410DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
411
412#ifdef __cplusplus
413}
414#endif
415
416#endif  // PUBLIC_FPDF_TEXT_H_
417