1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef PUBLIC_FPDF_TEXT_H_
8 #define PUBLIC_FPDF_TEXT_H_
9 
10 #include "fpdfview.h"
11 
12 // Exported Functions
13 #ifdef __cplusplus
14 extern "C" {
15 #endif
16 
17 // Function: FPDFText_LoadPage
18 //          Prepare information about all characters in a page.
19 // Parameters:
20 //          page    -   Handle to the page, as returned by the FPDF_LoadPage
21 //                      function (in the FPDFVIEW module).
22 // Return value:
23 //          A handle to the text page information structure, or
24 //          NULL if something goes wrong.
25 // Comments:
26 //          The application must call FPDFText_ClosePage to release the text
27 //          page information.
28 //
29 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page);
30 
31 // Function: FPDFText_ClosePage
32 //          Release all resources allocated for a text page information
33 //          structure.
34 // Parameters:
35 //          text_page   -   Handle to a text page information structure, as
36 //                          returned by the FPDFText_LoadPage function.
37 // Return value:
38 //          None.
39 //
40 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
41 
42 // Function: FPDFText_CountChars
43 //          Get the number of characters in a page.
44 // Parameters:
45 //          text_page   -   Handle to a text page information structure, as
46 //                          returned by the FPDFText_LoadPage function.
47 // Return value:
48 //          Number of characters in the page, or -1 on error.
49 //          Generated characters, such as additional space characters and
50 //          new line characters, are also counted.
51 // Comments:
52 //          The characters in a page form a "stream"; inside the stream, each
53 //          character has an index.
54 //          Many of the FPDFTEXT functions take such an index as a
55 //          parameter.
56 //          The first character in the page has an index value of zero.
57 //
58 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page);
59 
60 // Function: FPDFText_GetUnicode
61 //          Get the Unicode value of a character in a page.
62 // Parameters:
63 //          text_page   -   Handle to a text page information structure, as
64 //                          returned by the FPDFText_LoadPage function.
65 //          index       -   Zero-based index of the character.
66 // Return value:
67 //          The Unicode value of the specified character.
68 //          The return value will be zero if the character is not encoded
69 //          in Unicode and the Foxit engine cannot convert it to
70 //          Unicode.
71 //
72 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page,
73                                                    int index);
74 
75 // Function: FPDFText_GetFontSize
76 //          Get the font size of a particular character.
77 // Parameters:
78 //          text_page   -   Handle to a text page information structure, as
79 //                          returned by the FPDFText_LoadPage function.
80 //          index       -   Zero-based index of the character.
81 // Return value:
82 //          The font size of the specified character, measured in points
83 //          (about 1/72 inch).
84 //          This is the typographic size of the font (the so-called "em size").
85 //
86 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
87                                               int index);
88 
89 // Function: FPDFText_GetCharBox
90 //          Get the bounding box of a particular character.
91 // Parameters:
92 //          text_page   -   Handle to a text page information structure, as
93 //                          returned by the FPDFText_LoadPage function.
94 //          index       -   Zero-based index of the character.
95 //          left        -   Pointer to a double receiving the left position
96 //                          of the character box.
97 //          right       -   Pointer to a double receiving the right position
98 //                          of the character box.
99 //          bottom      -   Pointer to a double receiving the bottom position
100 //                          of the character box.
101 //          top         -   Pointer to a double receiving the top position
102 //                          of the character box.
103 // Return value:
104 //          None.
105 // Comments:
106 //          All positions are measured in PDF "user space". The output
107 //          order is left, right, bottom, top (FPDFText_GetRect differs).
108 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
109                                            int index,
110                                            double* left,
111                                            double* right,
112                                            double* bottom,
113                                            double* top);
114 
115 // Function: FPDFText_GetCharIndexAtPos
116 //          Get the index of a character at or near a certain position on the
117 //          page.
118 // Parameters:
119 //          text_page   -   Handle to a text page information structure, as
120 //                          returned by the FPDFText_LoadPage function.
121 //          x           -   X position in PDF "user space".
122 //          y           -   Y position in PDF "user space".
123 //          xTolerance  -   An x-axis tolerance value for character hit
124 //                          detection, in points.
125 //          yTolerance  -   A y-axis tolerance value for character hit
126 //                          detection, in points.
127 // Return value:
128 //          The zero-based index of the character at, or near, the point
129 //          (x,y). If there is no character at or near the point, the return
130 //          value will be -1.
131 //          If an error occurs, -3 will be returned.
132 //
133 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
134                                                  double x,
135                                                  double y,
136                                                  double xTolerance,
137                                                  double yTolerance);
138 
139 // Function: FPDFText_GetText
140 //          Extract a Unicode text string from the page.
141 // Parameters:
142 //          text_page   -   Handle to a text page information structure, as
143 //                          returned by the FPDFText_LoadPage function.
144 //          start_index -   Index of the first character to extract.
145 //          count       -   Number of characters to be extracted.
146 //          result      -   A buffer (allocated by the application) receiving
147 //                          the extracted Unicode characters.
148 //                          The buffer must be large enough to hold the
149 //                          number of characters plus a terminator.
150 // Return value:
151 //          Number of characters written into the result buffer, including the
152 //          trailing terminator.
153 // Comments:
154 //          This function ignores characters without Unicode information.
155 //
156 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page,
157                                        int start_index,
158                                        int count,
159                                        unsigned short* result);
160 
161 // Function: FPDFText_CountRects
162 //          Count the number of rectangular areas occupied by a segment of text.
163 // Parameters:
164 //          text_page   -   Handle to a text page information structure, as
165 //                          returned by the FPDFText_LoadPage function.
166 //          start_index -   Index of the first character of the segment.
167 //          count       -   Number of characters.
168 // Return value:
169 //          Number of rectangles. Zero for error.
170 // Comments:
171 //          This function, along with FPDFText_GetRect, can be used by
172 //          applications to detect the position on the page of a text
173 //          segment, so that the proper areas can be highlighted, for
174 //          example.
175 //          FPDFTEXT will automatically merge small character boxes into
176 //          bigger ones if those characters are on the same line and use the
177 //          same font settings.
178 //
179 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page,
180                                           int start_index,
181                                           int count);
182 
183 // Function: FPDFText_GetRect
184 //          Get a rectangular area from the result generated by
185 //          FPDFText_CountRects.
186 // Parameters:
187 //          text_page   -   Handle to a text page information structure, as
188 //                          returned by the FPDFText_LoadPage function.
189 //          rect_index  -   Zero-based index of the rectangle.
190 //          left        -   Pointer to a double value receiving the rectangle's
191 //                          left boundary.
192 //          top         -   Pointer to a double value receiving the rectangle's
193 //                          top boundary.
194 //          right       -   Pointer to a double value receiving the rectangle's
195 //                          right boundary.
196 //          bottom      -   Pointer to a double value receiving the rectangle's
197 //                          bottom boundary.
198 // Return value:
199 //          None. Note that the output order is left, top, right, bottom,
200 //          which differs from that of FPDFText_GetCharBox.
201 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page,
202                                         int rect_index,
203                                         double* left,
204                                         double* top,
205                                         double* right,
206                                         double* bottom);
207 
208 // Function: FPDFText_GetBoundedText
209 //          Extract Unicode text within a rectangular boundary on the page.
210 // Parameters:
211 //          text_page   -   Handle to a text page information structure, as
212 //                          returned by the FPDFText_LoadPage function.
213 //          left        -   Left boundary.
214 //          top         -   Top boundary.
215 //          right       -   Right boundary.
216 //          bottom      -   Bottom boundary.
217 //          buffer      -   A Unicode buffer.
218 //          buflen      -   Number of characters (not bytes) for the buffer,
219 //                          excluding an additional terminator.
220 // Return value:
221 //          If buffer is NULL or buflen is zero, return the number of
222 //          characters (not bytes) of text present within the rectangle,
223 //          excluding a terminating NUL.
224 //          Generally you should pass a buffer at least one larger than this if
225 //          you want a terminating NUL, which will be provided if space is
226 //          available.
227 //          Otherwise, return the number of characters copied into the
228 //          buffer, including the terminating NUL when space for it is
229 //          available.
230 // Comments:
231 //          If the buffer is too small, as much text as will fit is copied into
232 //          it.
233 //
234 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
235                                               double left,
236                                               double top,
237                                               double right,
238                                               double bottom,
239                                               unsigned short* buffer,
240                                               int buflen);
241 
242 // Flags used by the FPDFText_FindStart function (its `flags` parameter).
243 #define FPDF_MATCHCASE \
244   0x00000001  // If not set, the search does not match case by default.
245 #define FPDF_MATCHWHOLEWORD \
246   0x00000002  // If not set, the search does not match whole words by default.
247 
248 // Function: FPDFText_FindStart
249 //          Start a search.
250 // Parameters:
251 //          text_page   -   Handle to a text page information structure, as
252 //                          returned by the FPDFText_LoadPage function.
253 //          findwhat    -   A Unicode match pattern.
254 //          flags       -   Option flags (FPDF_MATCHCASE, FPDF_MATCHWHOLEWORD).
255 //          start_index -   Start from this character. -1 for end of the page.
256 // Return value:
257 //          A handle for the search context. FPDFText_FindClose must be called
258 //          to release this handle.
259 //
260 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page,
261                                                     FPDF_WIDESTRING findwhat,
262                                                     unsigned long flags,
263                                                     int start_index);
264 
265 // Function: FPDFText_FindNext
266 //          Search in the direction from page start to page end.
267 // Parameters:
268 //          handle      -   A search context handle returned by
269 //                          FPDFText_FindStart.
270 // Return value:
271 //          Whether a match is found.
272 //
273 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle);
274 
275 // Function: FPDFText_FindPrev
276 //          Search in the direction from page end to page start.
277 // Parameters:
278 //          handle      -   A search context handle returned by
279 //                          FPDFText_FindStart.
280 // Return value:
281 //          Whether a match is found.
282 //
283 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle);
284 
285 // Function: FPDFText_GetSchResultIndex
286 //          Get the starting character index of the search result.
287 // Parameters:
288 //          handle      -   A search context handle returned by
289 //                          FPDFText_FindStart.
290 // Return value:
291 //          Index of the starting character of the search result.
292 //
293 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
294 
295 // Function: FPDFText_GetSchCount
296 //          Get the number of matched characters in the search result.
297 // Parameters:
298 //          handle      -   A search context handle returned by
299 //                          FPDFText_FindStart.
300 // Return value:
301 //          Number of matched characters.
302 //
303 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
304 
305 // Function: FPDFText_FindClose
306 //          Release a search context.
307 // Parameters:
308 //          handle      -   A search context handle returned by
309 //                          FPDFText_FindStart.
310 // Return value:
311 //          None.
312 //
313 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle);
314 
315 // Function: FPDFLink_LoadWebLinks
316 //          Prepare information about weblinks in a page.
317 // Parameters:
318 //          text_page   -   Handle to a text page information structure, as
319 //                          returned by the FPDFText_LoadPage function.
320 // Return value:
321 //          A handle to the page's links information structure, or
322 //          NULL if something goes wrong.
323 // Comments:
324 //          Weblinks are those links implicitly embedded in PDF pages.
325 //          PDF also has a type of annotation called "link"; FPDFTEXT
326 //          doesn't deal with that kind of link.
327 //
328 //          The FPDFTEXT weblink feature is useful for automatically
329 //          detecting links in the page contents.
330 //          For example, things like "http://www.foxitsoftware.com" will be
331 //          detected.
332 //          Applications can therefore allow the user to click on those
333 //          characters to activate the link, even if the PDF doesn't come with
334 //          link annotations.
335 //
336 //          FPDFLink_CloseWebLinks must be called to release resources.
337 //
338 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
339 
340 // Function: FPDFLink_CountWebLinks
341 //          Count the number of detected web links.
342 // Parameters:
343 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
344 // Return value:
345 //          Number of detected web links.
346 //
347 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
348 
349 // Function: FPDFLink_GetURL
350 //          Fetch the URL information for a detected web link.
351 // Parameters:
352 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
353 //          link_index  -   Zero-based index of the link.
354 //          buffer      -   A Unicode buffer.
355 //          buflen      -   Number of characters (not bytes) for the buffer,
356 //                          including an additional terminator.
357 // Return value:
358 //          If buffer is NULL or buflen is zero, return the number of characters
359 //          needed (not bytes; an additional terminator is also counted);
360 //          otherwise, return the number of characters copied into the buffer.
361 //
362 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page,
363                                       int link_index,
364                                       unsigned short* buffer,
365                                       int buflen);
366 
367 // Function: FPDFLink_CountRects
368 //          Count the number of rectangular areas for the link.
369 // Parameters:
370 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
371 //          link_index  -   Zero-based index of the link.
372 // Return value:
373 //          Number of rectangular areas for the link.
374 //
375 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page,
376                                           int link_index);
377 
378 // Function: FPDFLink_GetRect
379 //          Fetch the boundaries of a rectangle for a link.
380 // Parameters:
381 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
382 //          link_index  -   Zero-based index of the link.
383 //          rect_index  -   Zero-based index of the rectangle.
384 //          left        -   Pointer to a double value receiving the rectangle's
385 //                          left boundary.
386 //          top         -   Pointer to a double value receiving the rectangle's
387 //                          top boundary.
388 //          right       -   Pointer to a double value receiving the rectangle's
389 //                          right boundary.
390 //          bottom      -   Pointer to a double value receiving the rectangle's
391 //                          bottom boundary.
392 // Return value:
393 //          None.
394 //
395 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page,
396                                         int link_index,
397                                         int rect_index,
398                                         double* left,
399                                         double* top,
400                                         double* right,
401                                         double* bottom);
402 
403 // Function: FPDFLink_CloseWebLinks
404 //          Release resources used by the weblink feature.
405 // Parameters:
406 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
407 // Return value:
408 //          None.
409 //
410 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
411 
412 #ifdef __cplusplus
413 }
414 #endif
415 
416 #endif  // PUBLIC_FPDF_TEXT_H_
417