1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef PUBLIC_FPDF_TEXT_H_
8 #define PUBLIC_FPDF_TEXT_H_
9 
10 // NOLINTNEXTLINE(build/include)
11 #include "fpdfview.h"
12 
13 // Exported Functions
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
18 // Function: FPDFText_LoadPage
19 //          Prepare information about all characters in a page.
20 // Parameters:
21 //          page    -   Handle to the page. Returned by FPDF_LoadPage function
22 //                      (in FPDFVIEW module).
23 // Return value:
24 //          A handle to the text page information structure.
25 //          NULL if something goes wrong.
26 // Comments:
27 //          The application must call FPDFText_ClosePage to release the text
28 //          page information.
29 //
30 FPDF_EXPORT FPDF_TEXTPAGE FPDF_CALLCONV FPDFText_LoadPage(FPDF_PAGE page);
31 
32 // Function: FPDFText_ClosePage
33 //          Release all resources allocated for a text page information
34 //          structure.
35 // Parameters:
36 //          text_page   -   Handle to a text page information structure.
37 //                          Returned by FPDFText_LoadPage function.
38 // Return Value:
39 //          None.
40 //
41 FPDF_EXPORT void FPDF_CALLCONV FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
42 
43 // Function: FPDFText_CountChars
44 //          Get the number of characters in a page.
45 // Parameters:
46 //          text_page   -   Handle to a text page information structure.
47 //                          Returned by FPDFText_LoadPage function.
48 // Return value:
49 //          Number of characters in the page. Returns -1 on error.
50 //          Generated characters, such as additional space characters and
51 //          new line characters, are also counted.
52 // Comments:
53 //          Characters in a page form a "stream"; inside the stream, each
54 //          character has an index.
55 //          The index parameters of many FPDFTEXT functions refer to
56 //          this character index. The first character in the page has
57 //          an index value of zero.
58 //
59 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountChars(FPDF_TEXTPAGE text_page);
60 
61 // Function: FPDFText_GetUnicode
62 //          Get the Unicode value of a character in a page.
63 // Parameters:
64 //          text_page   -   Handle to a text page information structure.
65 //                          Returned by FPDFText_LoadPage function.
66 //          index       -   Zero-based index of the character.
67 // Return value:
68 //          The Unicode value of the particular character.
69 //          If a character is not encoded in Unicode and the Foxit engine
70 //          cannot convert it to Unicode, the return value will be
71 //          zero.
72 //
73 FPDF_EXPORT unsigned int FPDF_CALLCONV
74 FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index);
75 
76 // Function: FPDFText_GetFontSize
77 //          Get the font size of a particular character.
78 // Parameters:
79 //          text_page   -   Handle to a text page information structure.
80 //                          Returned by FPDFText_LoadPage function.
81 //          index       -   Zero-based index of the character.
82 // Return value:
83 //          The font size of the particular character, measured in points
84 //          (about 1/72 inch).
85 //          This is the typographic size of the font (so-called "em size").
86 //
87 FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
88                                                       int index);
89 
90 // Function: FPDFText_GetCharBox
91 //          Get the bounding box of a particular character.
92 // Parameters:
93 //          text_page   -   Handle to a text page information structure.
94 //                          Returned by FPDFText_LoadPage function.
95 //          index       -   Zero-based index of the character.
96 //          left        -   Pointer to a double receiving the left position
97 //                          of the character box.
98 //          right       -   Pointer to a double receiving the right position
99 //                          of the character box.
100 //          bottom      -   Pointer to a double receiving the bottom position
101 //                          of the character box.
102 //          top         -   Pointer to a double receiving the top position
103 //                          of the character box.
104 // Return Value:
105 //          On success, return TRUE and fill in |left|, |right|, |bottom|, and
106 //          |top|. If |text_page| is invalid, or if |index| is out of bounds,
107 //          then return FALSE, and the out parameters remain unmodified.
108 // Comments:
109 //          All positions are measured in PDF "user space".
110 //
111 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
112                                                         int index,
113                                                         double* left,
114                                                         double* right,
115                                                         double* bottom,
116                                                         double* top);
117 
118 // Function: FPDFText_GetCharOrigin
119 //          Get the origin of a particular character.
120 // Parameters:
121 //          text_page   -   Handle to a text page information structure.
122 //                          Returned by FPDFText_LoadPage function.
123 //          index       -   Zero-based index of the character.
124 //          x           -   Pointer to a double receiving the x coordinate of
125 //                          the character origin.
126 //          y           -   Pointer to a double receiving the y coordinate of
127 //                          the character origin.
128 // Return Value:
129 //          Whether the call succeeded. If FALSE, |x| and |y| are unchanged.
130 // Comments:
131 //          All positions are measured in PDF "user space".
132 //
133 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
134 FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page,
135                        int index,
136                        double* x,
137                        double* y);
138 
139 // Function: FPDFText_GetCharIndexAtPos
140 //          Get the index of a character at or nearby a certain position on the
141 //          page.
142 // Parameters:
143 //          text_page   -   Handle to a text page information structure.
144 //                          Returned by FPDFText_LoadPage function.
145 //          x           -   X position in PDF "user space".
146 //          y           -   Y position in PDF "user space".
147 //          xTolerance  -   An x-axis tolerance value for character hit
148 //                          detection, in points.
149 //          yTolerance  -   A y-axis tolerance value for character hit
150 //                          detection, in points.
151 // Return Value:
152 //          The zero-based index of the character at or near the point (x,y).
153 //          If there is no character at or near the point, the return value
154 //          will be -1.
155 //          If an error occurs, -3 will be returned.
156 //
157 FPDF_EXPORT int FPDF_CALLCONV
158 FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
159                            double x,
160                            double y,
161                            double xTolerance,
162                            double yTolerance);
163 
164 // Function: FPDFText_GetText
165 //          Extract a unicode text string from the page.
166 // Parameters:
167 //          text_page   -   Handle to a text page information structure.
168 //                          Returned by FPDFText_LoadPage function.
169 //          start_index -   Index of the first character to extract.
170 //          count       -   Number of characters to be extracted.
171 //          result      -   A buffer (allocated by the application) receiving
172 //                          the extracted unicodes.
173 //                          The size of the buffer must be able to hold the
174 //                          number of characters plus a terminator.
175 // Return Value:
176 //          Number of characters written into the result buffer, including the
177 //          trailing terminator.
178 // Comments:
179 //          This function ignores characters without unicode information.
180 //
181 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page,
182                                                int start_index,
183                                                int count,
184                                                unsigned short* result);
185 
186 // Function: FPDFText_CountRects
187 //          Count number of rectangular areas occupied by a segment of text.
188 // Parameters:
189 //          text_page   -   Handle to a text page information structure.
190 //                          Returned by FPDFText_LoadPage function.
191 //          start_index -   Index of the first character of the segment.
192 //          count       -   Number of characters.
193 // Return value:
194 //          Number of rectangles. Zero for error.
195 // Comments:
196 //          This function, along with FPDFText_GetRect, can be used by
197 //          applications to detect the position on the page of a text
198 //          segment, so that the proper areas can be highlighted, for
199 //          example.
200 //          FPDFTEXT will automatically merge small character boxes into a
201 //          bigger one if those characters are on the same line and use the
202 //          same font settings.
203 //
204 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page,
205                                                   int start_index,
206                                                   int count);
207 
208 // Function: FPDFText_GetRect
209 //          Get a rectangular area from the result generated by
210 //          FPDFText_CountRects.
211 // Parameters:
212 //          text_page   -   Handle to a text page information structure.
213 //                          Returned by FPDFText_LoadPage function.
214 //          rect_index  -   Zero-based index for the rectangle.
215 //          left        -   Pointer to a double value receiving the rectangle
216 //                          left boundary.
217 //          top         -   Pointer to a double value receiving the rectangle
218 //                          top boundary.
219 //          right       -   Pointer to a double value receiving the rectangle
220 //                          right boundary.
221 //          bottom      -   Pointer to a double value receiving the rectangle
222 //                          bottom boundary.
223 // Return Value:
224 //          On success, return TRUE and fill in |left|, |top|, |right|, and
225 //          |bottom|. If |text_page| is invalid then return FALSE, and the out
226 //          parameters remain unmodified. If |text_page| is valid but
227 //          |rect_index| is out of bounds, then return FALSE and set the out
228 //          parameters to 0.
229 //
230 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetRect(FPDF_TEXTPAGE text_page,
231                                                      int rect_index,
232                                                      double* left,
233                                                      double* top,
234                                                      double* right,
235                                                      double* bottom);
236 
237 // Function: FPDFText_GetBoundedText
238 //          Extract unicode text within a rectangular boundary on the page.
239 // Parameters:
240 //          text_page   -   Handle to a text page information structure.
241 //                          Returned by FPDFText_LoadPage function.
242 //          left        -   Left boundary.
243 //          top         -   Top boundary.
244 //          right       -   Right boundary.
245 //          bottom      -   Bottom boundary.
246 //          buffer      -   A unicode buffer.
247 //          buflen      -   Number of characters (not bytes) for the buffer,
248 //                          excluding an additional terminator.
249 // Return Value:
250 //          If |buffer| is NULL or |buflen| is zero, return the number of
251 //          characters (not bytes) of text present within the rectangle,
252 //          excluding a terminating NUL.
253 //          Generally you should pass a buffer at least one larger than
254 //          this if you want a terminating NUL, which will be provided if
255 //          space is available.
256 //          Otherwise, return the number of characters copied into the
257 //          buffer, including the terminating NUL when space for it is
258 //          available.
259 // Comments:
260 //          If the buffer is too small, as much text as will fit is copied into
261 //          it.
262 //
263 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
264                                                       double left,
265                                                       double top,
266                                                       double right,
267                                                       double bottom,
268                                                       unsigned short* buffer,
269                                                       int buflen);
270 
271 // Flags used by the FPDFText_FindStart function.
272 #define FPDF_MATCHCASE \
273   0x00000001  // If not set, case matching is off (the default).
274 #define FPDF_MATCHWHOLEWORD \
275   0x00000002  // If not set, whole-word matching is off (the default).
276 
277 // Function: FPDFText_FindStart
278 //          Start a search.
279 // Parameters:
280 //          text_page   -   Handle to a text page information structure.
281 //                          Returned by FPDFText_LoadPage function.
282 //          findwhat    -   A unicode match pattern.
283 //          flags       -   Option flags: FPDF_MATCHCASE, FPDF_MATCHWHOLEWORD.
284 //          start_index -   Start from this character. -1 for end of the page.
285 // Return Value:
286 //          A handle for the search context. FPDFText_FindClose must be called
287 //          to release this handle.
288 //
289 FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV
290 FPDFText_FindStart(FPDF_TEXTPAGE text_page,
291                    FPDF_WIDESTRING findwhat,
292                    unsigned long flags,
293                    int start_index);
294 
295 // Function: FPDFText_FindNext
296 //          Search in the direction from page start to end.
297 // Parameters:
298 //          handle      -   A search context handle returned by
299 //                          FPDFText_FindStart.
300 // Return Value:
301 //          Whether a match is found.
302 //
303 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindNext(FPDF_SCHHANDLE handle);
304 
305 // Function: FPDFText_FindPrev
306 //          Search in the direction from page end to start.
307 // Parameters:
308 //          handle      -   A search context handle returned by
309 //                          FPDFText_FindStart.
310 // Return Value:
311 //          Whether a match is found.
312 //
313 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindPrev(FPDF_SCHHANDLE handle);
314 
315 // Function: FPDFText_GetSchResultIndex
316 //          Get the starting character index of the search result.
317 // Parameters:
318 //          handle      -   A search context handle returned by
319 //                          FPDFText_FindStart.
320 // Return Value:
321 //          Index of the starting character.
322 //
323 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
324 
325 // Function: FPDFText_GetSchCount
326 //          Get the number of matched characters in the search result.
327 // Parameters:
328 //          handle      -   A search context handle returned by
329 //                          FPDFText_FindStart.
330 // Return Value:
331 //          Number of matched characters.
332 //
333 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
334 
335 // Function: FPDFText_FindClose
336 //          Release a search context.
337 // Parameters:
338 //          handle      -   A search context handle returned by
339 //                          FPDFText_FindStart.
340 // Return Value:
341 //          None.
342 //
343 FPDF_EXPORT void FPDF_CALLCONV FPDFText_FindClose(FPDF_SCHHANDLE handle);
344 
345 // Function: FPDFLink_LoadWebLinks
346 //          Prepare information about weblinks in a page.
347 // Parameters:
348 //          text_page   -   Handle to a text page information structure.
349 //                          Returned by FPDFText_LoadPage function.
350 // Return Value:
351 //          A handle to the page's links information structure.
352 //          NULL if something goes wrong.
353 // Comments:
354 //          Weblinks are those links implicitly embedded in PDF pages.
355 //          PDF also has a type of annotation called "link"; FPDFTEXT
356 //          doesn't deal with that kind of link.
357 //
358 //          The FPDFTEXT weblink feature is useful for automatically
359 //          detecting links in the page contents.
360 //          For example, things like "http://www.foxitsoftware.com"
361 //          will be detected, so applications can allow the user
362 //          to click on those characters to activate the link,
363 //          even if the PDF doesn't come with link annotations.
364 //
365 //          FPDFLink_CloseWebLinks must be called to release the
366 //          resources.
367 //
368 FPDF_EXPORT FPDF_PAGELINK FPDF_CALLCONV
369 FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
370 
371 // Function: FPDFLink_CountWebLinks
372 //          Count the number of detected web links.
373 // Parameters:
374 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
375 // Return Value:
376 //          Number of detected web links.
377 //
378 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
379 
380 // Function: FPDFLink_GetURL
381 //          Fetch the URL information for a detected web link.
382 // Parameters:
383 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
384 //          link_index  -   Zero-based index for the link.
385 //          buffer      -   A unicode buffer for the result.
386 //          buflen      -   Number of characters (not bytes) for the buffer,
387 //                          including an additional terminator.
388 // Return Value:
389 //          If |buffer| is NULL or |buflen| is zero, return the number of
390 //          characters (not bytes) needed to hold the result (an additional
391 //          terminator is included in this count).
392 //          Otherwise, copy the result into |buffer|, truncating at |buflen| if
393 //          the result is too large to fit, and return the number of characters
394 //          actually copied into the buffer (the additional terminator is also
395 //          included in this count).
396 //          If |link_index| does not correspond to a valid link, then the result
397 //          is an empty string.
398 //
399 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_GetURL(FPDF_PAGELINK link_page,
400                                               int link_index,
401                                               unsigned short* buffer,
402                                               int buflen);
403 
404 // Function: FPDFLink_CountRects
405 //          Count the number of rectangular areas for the link.
406 // Parameters:
407 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
408 //          link_index  -   Zero-based index for the link.
409 // Return Value:
410 //          Number of rectangular areas for the link. If |link_index| does
411 //          not correspond to a valid link, then 0 is returned.
412 //
413 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountRects(FPDF_PAGELINK link_page,
414                                                   int link_index);
415 
416 // Function: FPDFLink_GetRect
417 //          Fetch the boundaries of a rectangle for a link.
418 // Parameters:
419 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
420 //          link_index  -   Zero-based index for the link.
421 //          rect_index  -   Zero-based index for a rectangle of that link.
422 //          left        -   Pointer to a double value receiving the rectangle
423 //                          left boundary.
424 //          top         -   Pointer to a double value receiving the rectangle
425 //                          top boundary.
426 //          right       -   Pointer to a double value receiving the rectangle
427 //                          right boundary.
428 //          bottom      -   Pointer to a double value receiving the rectangle
429 //                          bottom boundary.
430 // Return Value:
431 //          On success, return TRUE and fill in |left|, |top|, |right|, and
432 //          |bottom|. If |link_page| is invalid or if |link_index| does not
433 //          correspond to a valid link, then return FALSE, and the out
434 //          parameters remain unmodified.
435 //
436 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFLink_GetRect(FPDF_PAGELINK link_page,
437                                                      int link_index,
438                                                      int rect_index,
439                                                      double* left,
440                                                      double* top,
441                                                      double* right,
442                                                      double* bottom);
443 
444 // Function: FPDFLink_CloseWebLinks
445 //          Release resources used by the weblink feature.
446 // Parameters:
447 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
448 // Return Value:
449 //          None.
450 //
451 FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
452 
453 #ifdef __cplusplus
454 }
455 #endif
456 
457 #endif  // PUBLIC_FPDF_TEXT_H_
458