1// Copyright 2014 PDFium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#ifndef PUBLIC_FPDF_TEXT_H_
8#define PUBLIC_FPDF_TEXT_H_
9
10// NOLINTNEXTLINE(build/include)
11#include "fpdfview.h"
12
13// Exported Functions
14#ifdef __cplusplus
15extern "C" {
16#endif
17
18// Function: FPDFText_LoadPage
19//          Prepare information about all characters in a page.
20// Parameters:
21//          page    -   Handle to the page. Returned by the FPDF_LoadPage
22//                      function (in the FPDFVIEW module).
23// Return value:
24//          A handle to the text page information structure.
25//          NULL if something goes wrong.
26// Comments:
27//          The application must call FPDFText_ClosePage to release the text
28//          page information.
29//
30FPDF_EXPORT FPDF_TEXTPAGE FPDF_CALLCONV FPDFText_LoadPage(FPDF_PAGE page);
31
32// Function: FPDFText_ClosePage
33//          Release all resources allocated for a text page information
34//          structure.
35// Parameters:
36//          text_page   -   Handle to a text page information structure.
37//                          Returned by FPDFText_LoadPage function.
38// Return Value:
39//          None.
40//
41FPDF_EXPORT void FPDF_CALLCONV FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
42
43// Function: FPDFText_CountChars
44//          Get the number of characters in a page.
45// Parameters:
46//          text_page   -   Handle to a text page information structure.
47//                          Returned by FPDFText_LoadPage function.
48// Return value:
49//          Number of characters in the page. Returns -1 on error.
50//          Generated characters, such as additional space characters and
51//          new line characters, are also counted.
52// Comments:
53//          Characters in a page form a "stream"; inside the stream, each
54//          character has an index.
55//          These indices are used as parameters by many of the FPDFTEXT
56//          functions. The first character in the page has an index value
57//          of zero.
58//
59FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountChars(FPDF_TEXTPAGE text_page);
60
61// Function: FPDFText_GetUnicode
62//          Get the Unicode value of a character in a page.
63// Parameters:
64//          text_page   -   Handle to a text page information structure.
65//                          Returned by FPDFText_LoadPage function.
66//          index       -   Zero-based index of the character.
67// Return value:
68//          The Unicode value of the particular character.
69//          If the character is not encoded in Unicode and the Foxit
70//          engine can't convert it to Unicode, the return value will
71//          be zero.
72//
73FPDF_EXPORT unsigned int FPDF_CALLCONV
74FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index);
75
76// Function: FPDFText_GetFontSize
77//          Get the font size of a particular character.
78// Parameters:
79//          text_page   -   Handle to a text page information structure.
80//                          Returned by FPDFText_LoadPage function.
81//          index       -   Zero-based index of the character.
82// Return value:
83//          The font size of the particular character, measured in points (about
84//          1/72 inch).
85//          This is the typographic size of the font (the so-called "em size").
86//
87FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
88                                                      int index);
89
90// Function: FPDFText_GetCharBox
91//          Get the bounding box of a particular character.
92// Parameters:
93//          text_page   -   Handle to a text page information structure.
94//                          Returned by FPDFText_LoadPage function.
95//          index       -   Zero-based index of the character.
96//          left        -   Pointer to a double number receiving left position
97//                          of the character box.
98//          right       -   Pointer to a double number receiving right position
99//                          of the character box.
100//          bottom      -   Pointer to a double number receiving bottom position
101//                          of the character box.
102//          top         -   Pointer to a double number receiving top position of
103//                          the character box.
104// Return Value:
105//          On success, return TRUE and fill in |left|, |right|, |bottom|, and
106//          |top|. If |text_page| is invalid, or if |index| is out of bounds,
107//          then return FALSE, and the out parameters remain unmodified.
108// Comments:
109//          All positions are measured in PDF "user space".
110//
111FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
112                                                        int index,
113                                                        double* left,
114                                                        double* right,
115                                                        double* bottom,
116                                                        double* top);
117
118// Function: FPDFText_GetCharOrigin
119//          Get the origin of a particular character.
120// Parameters:
121//          text_page   -   Handle to a text page information structure.
122//                          Returned by FPDFText_LoadPage function.
123//          index       -   Zero-based index of the character.
124//          x           -   Pointer to a double number receiving x coordinate of
125//                          the character origin.
126//          y           -   Pointer to a double number receiving y coordinate of
127//                          the character origin.
128// Return Value:
129//          Whether the call succeeded. If false, |x| and |y| are unchanged.
130// Comments:
131//          All positions are measured in PDF "user space".
132//
133FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
134FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page,
135                       int index,
136                       double* x,
137                       double* y);
138
139// Function: FPDFText_GetCharIndexAtPos
140//          Get the index of a character at or nearby a certain position on the
141//          page.
142// Parameters:
143//          text_page   -   Handle to a text page information structure.
144//                          Returned by FPDFText_LoadPage function.
145//          x           -   X position in PDF "user space".
146//          y           -   Y position in PDF "user space".
147//          xTolerance  -   An x-axis tolerance value for character hit
148//                          detection, in point units.
149//          yTolerance  -   A y-axis tolerance value for character hit
150//                          detection, in point units.
151// Return Value:
152//          The zero-based index of the character at or nearby the point (x,y).
153//          If there is no character at or nearby the point, the return value
154//          will be -1.
155//          If an error occurs, -3 will be returned.
156//
157FPDF_EXPORT int FPDF_CALLCONV
158FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
159                           double x,
160                           double y,
161                           double xTolerance,
162                           double yTolerance);
163
164// Function: FPDFText_GetText
165//          Extract a unicode text string from the page.
166// Parameters:
167//          text_page   -   Handle to a text page information structure.
168//                          Returned by FPDFText_LoadPage function.
169//          start_index -   Index of the starting character.
170//          count       -   Number of characters to be extracted.
171//          result      -   A buffer (allocated by application) receiving the
172//                          extracted unicodes.
173//                          The size of the buffer must be able to hold the
174//                          number of characters plus a terminator.
175// Return Value:
176//          Number of characters written into the result buffer, including the
177//          trailing terminator.
178// Comments:
179//          This function ignores characters without unicode information.
180//
181FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page,
182                                               int start_index,
183                                               int count,
184                                               unsigned short* result);
185
186// Function: FPDFText_CountRects
187//          Count the number of rectangular areas occupied by a segment of text.
188// Parameters:
189//          text_page   -   Handle to a text page information structure.
190//                          Returned by FPDFText_LoadPage function.
191//          start_index -   Index of the starting character.
192//          count       -   Number of characters.
193// Return value:
194//          Number of rectangles. Zero for error.
195// Comments:
196//          This function, along with FPDFText_GetRect, can be used by
197//          applications to detect the position on the page of a text
198//          segment, so that the proper areas can be highlighted or
199//          otherwise processed.
200//          FPDFTEXT will automatically merge small character boxes into
201//          bigger ones if those characters are on the same line and use
202//          the same font settings.
203//
204FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page,
205                                                  int start_index,
206                                                  int count);
207
208// Function: FPDFText_GetRect
209//          Get a rectangular area from the result generated by
210//          FPDFText_CountRects.
211// Parameters:
212//          text_page   -   Handle to a text page information structure.
213//                          Returned by FPDFText_LoadPage function.
214//          rect_index  -   Zero-based index for the rectangle.
215//          left        -   Pointer to a double value receiving the rectangle
216//                          left boundary.
217//          top         -   Pointer to a double value receiving the rectangle
218//                          top boundary.
219//          right       -   Pointer to a double value receiving the rectangle
220//                          right boundary.
221//          bottom      -   Pointer to a double value receiving the rectangle
222//                          bottom boundary.
223// Return Value:
224//          On success, return TRUE and fill in |left|, |top|, |right|, and
225//          |bottom|. If |text_page| is invalid then return FALSE, and the out
226//          parameters remain unmodified. If |text_page| is valid but
227//          |rect_index| is out of bounds, then return FALSE and set the out
228//          parameters to 0.
229//
230FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetRect(FPDF_TEXTPAGE text_page,
231                                                     int rect_index,
232                                                     double* left,
233                                                     double* top,
234                                                     double* right,
235                                                     double* bottom);
236
237// Function: FPDFText_GetBoundedText
238//          Extract unicode text within a rectangular boundary on the page.
239// Parameters:
240//          text_page   -   Handle to a text page information structure.
241//                          Returned by FPDFText_LoadPage function.
242//          left        -   Left boundary.
243//          top         -   Top boundary.
244//          right       -   Right boundary.
245//          bottom      -   Bottom boundary.
246//          buffer      -   A unicode buffer.
247//          buflen      -   Number of characters (not bytes) for the buffer,
248//                          excluding an additional terminator.
249// Return Value:
250//          If |buffer| is NULL or |buflen| is zero, return the number of
251//          characters (not bytes) of text present within the rectangle,
252//          excluding a terminating NUL. Generally you should pass a buffer
253//          at least one character larger than this if you want a
254//          terminating NUL, which will be provided if space is available.
255//
256//          Otherwise, return the number of characters copied into the
257//          buffer, including the terminating NUL when space for it is
258//          available.
259// Comments:
260//          If the buffer is too small, as much text as will fit is copied into
261//          it.
262//
263FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
264                                                      double left,
265                                                      double top,
266                                                      double right,
267                                                      double bottom,
268                                                      unsigned short* buffer,
269                                                      int buflen);
270
271// Flags used by the FPDFText_FindStart function (see its |flags| parameter).
272#define FPDF_MATCHCASE \
273  0x00000001  // If not set, the search will not match case by default.
274#define FPDF_MATCHWHOLEWORD \
275  0x00000002  // If not set, the search will not match the whole word by default.
276
277// Function: FPDFText_FindStart
278//          Start a search.
279// Parameters:
280//          text_page   -   Handle to a text page information structure.
281//                          Returned by FPDFText_LoadPage function.
282//          findwhat    -   A unicode match pattern.
283//          flags       -   Option flags (FPDF_MATCHCASE, FPDF_MATCHWHOLEWORD).
284//          start_index -   Start from this character. -1 for end of the page.
285// Return Value:
286//          A handle for the search context. FPDFText_FindClose must be called
287//          to release this handle.
288//
289FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV
290FPDFText_FindStart(FPDF_TEXTPAGE text_page,
291                   FPDF_WIDESTRING findwhat,
292                   unsigned long flags,
293                   int start_index);
294
295// Function: FPDFText_FindNext
296//          Search in the direction from page start to end.
297// Parameters:
298//          handle      -   A search context handle returned by
299//                          FPDFText_FindStart.
300// Return Value:
301//          Whether a match is found.
302//
303FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindNext(FPDF_SCHHANDLE handle);
304
305// Function: FPDFText_FindPrev
306//          Search in the direction from page end to start.
307// Parameters:
308//          handle      -   A search context handle returned by
309//                          FPDFText_FindStart.
310// Return Value:
311//          Whether a match is found.
312//
313FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindPrev(FPDF_SCHHANDLE handle);
314
315// Function: FPDFText_GetSchResultIndex
316//          Get the starting character index of the search result.
317// Parameters:
318//          handle      -   A search context handle returned by
319//                          FPDFText_FindStart.
320// Return Value:
321//          Index of the starting character.
322//
323FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
324
325// Function: FPDFText_GetSchCount
326//          Get the number of matched characters in the search result.
327// Parameters:
328//          handle      -   A search context handle returned by
329//                          FPDFText_FindStart.
330// Return Value:
331//          Number of matched characters.
332//
333FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
334
335// Function: FPDFText_FindClose
336//          Release a search context.
337// Parameters:
338//          handle      -   A search context handle returned by
339//                          FPDFText_FindStart.
340// Return Value:
341//          None.
342//
343FPDF_EXPORT void FPDF_CALLCONV FPDFText_FindClose(FPDF_SCHHANDLE handle);
344
345// Function: FPDFLink_LoadWebLinks
346//          Prepare information about weblinks in a page.
347// Parameters:
348//          text_page   -   Handle to a text page information structure.
349//                          Returned by FPDFText_LoadPage function.
350// Return Value:
351//          A handle to the page's links information structure.
352//          NULL if something goes wrong.
353// Comments:
354//          Weblinks are those links implicitly embedded in PDF pages.
355//          PDF also has a type of annotation called "link"; FPDFTEXT
356//          doesn't deal with that kind of link.
357//
358//          The FPDFTEXT weblink feature is useful for automatically
359//          detecting links in the page contents.
360//
361//          For example, things like "http://www.foxitsoftware.com" will be
362//          detected, so applications can allow the user to click on those
363//          characters to activate the link, even if the PDF doesn't come
364//          with link annotations.
365//
366//          FPDFLink_CloseWebLinks must be called to release resources.
367//
368FPDF_EXPORT FPDF_PAGELINK FPDF_CALLCONV
369FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
370
371// Function: FPDFLink_CountWebLinks
372//          Count the number of detected web links.
373// Parameters:
374//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
375// Return Value:
376//          Number of detected web links.
377//
378FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
379
380// Function: FPDFLink_GetURL
381//          Fetch the URL information for a detected web link.
382// Parameters:
383//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
384//          link_index  -   Zero-based index for the link.
385//          buffer      -   A unicode buffer for the result.
386//          buflen      -   Number of characters (not bytes) for the buffer,
387//                          including an additional terminator.
388// Return Value:
389//          If |buffer| is NULL or |buflen| is zero, return the number of
390//          characters (not bytes) needed to hold the result (an additional
391//          terminator is included in this count).
392//          Otherwise, copy the result into |buffer|, truncating at |buflen| if
393//          the result is too large to fit, and return the number of characters
394//          actually copied into the buffer (the additional terminator is also
395//          included in this count).
396//          If |link_index| does not correspond to a valid link, then the result
397//          is an empty string.
398//
399FPDF_EXPORT int FPDF_CALLCONV FPDFLink_GetURL(FPDF_PAGELINK link_page,
400                                              int link_index,
401                                              unsigned short* buffer,
402                                              int buflen);
403
404// Function: FPDFLink_CountRects
405//          Count the number of rectangular areas for the link.
406// Parameters:
407//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
408//          link_index  -   Zero-based index for the link.
409// Return Value:
410//          Number of rectangular areas for the link.  If |link_index| does
411//          not correspond to a valid link, then 0 is returned.
412//
413FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountRects(FPDF_PAGELINK link_page,
414                                                  int link_index);
415
416// Function: FPDFLink_GetRect
417//          Fetch the boundaries of one rectangle of a link.
418// Parameters:
419//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
420//          link_index  -   Zero-based index for the link.
421//          rect_index  -   Zero-based index for the rectangle.
422//          left        -   Pointer to a double value receiving the rectangle
423//                          left boundary.
424//          top         -   Pointer to a double value receiving the rectangle
425//                          top boundary.
426//          right       -   Pointer to a double value receiving the rectangle
427//                          right boundary.
428//          bottom      -   Pointer to a double value receiving the rectangle
429//                          bottom boundary.
430// Return Value:
431//          On success, return TRUE and fill in |left|, |top|, |right|, and
432//          |bottom|. If |link_page| is invalid or if |link_index| does not
433//          correspond to a valid link, then return FALSE, and the out
434//          parameters remain unmodified.
435//
436FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFLink_GetRect(FPDF_PAGELINK link_page,
437                                                     int link_index,
438                                                     int rect_index,
439                                                     double* left,
440                                                     double* top,
441                                                     double* right,
442                                                     double* bottom);
443
444// Function: FPDFLink_CloseWebLinks
445//          Release resources used by the weblink feature.
446// Parameters:
447//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
448// Return Value:
449//          None.
450//
451FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
452
453#ifdef __cplusplus
454}
455#endif
456
457#endif  // PUBLIC_FPDF_TEXT_H_
458