1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef PUBLIC_FPDF_TEXT_H_ 8 #define PUBLIC_FPDF_TEXT_H_ 9 10 #include "fpdfview.h" 11 12 // Exported Functions 13 #ifdef __cplusplus 14 extern "C" { 15 #endif 16 17 // Function: FPDFText_LoadPage 18 // Prepare information about all characters in a page. 19 // Parameters: 20 // page - Handle to the page. Returned by FPDF_LoadPage function 21 // (in FPDFVIEW module). 22 // Return value: 23 // A handle to the text page information structure. 24 // NULL if something goes wrong. 25 // Comments: 26 // Application must call FPDFText_ClosePage to release the text page 27 // information. 28 // 29 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page); 30 31 // Function: FPDFText_ClosePage 32 // Release all resources allocated for a text page information 33 // structure. 34 // Parameters: 35 // text_page - Handle to a text page information structure. 36 // Returned by FPDFText_LoadPage function. 37 // Return Value: 38 // None. 39 // 40 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page); 41 42 // Function: FPDFText_CountChars 43 // Get number of characters in a page. 44 // Parameters: 45 // text_page - Handle to a text page information structure. 46 // Returned by FPDFText_LoadPage function. 47 // Return value: 48 // Number of characters in the page. Return -1 for error. 49 // Generated characters, like additional space characters, new line 50 // characters, are also counted. 51 // Comments: 52 // Characters in a page form a "stream", inside the stream, each 53 // character has an index. 54 // We will use the index parameters in many of FPDFTEXT functions. The 55 // first character in the page 56 // has an index value of zero. 57 // 58 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page); 59 60 // Function: FPDFText_GetUnicode 61 // Get Unicode of a character in a page. 62 // Parameters: 63 // text_page - Handle to a text page information structure. 64 // Returned by FPDFText_LoadPage function. 65 // index - Zero-based index of the character. 66 // Return value: 67 // The Unicode of the particular character. 68 // If a character is not encoded in Unicode and Foxit engine can't 69 // convert to Unicode, 70 // the return value will be zero. 71 // 72 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, 73 int index); 74 75 // Function: FPDFText_GetFontSize 76 // Get the font size of a particular character. 77 // Parameters: 78 // text_page - Handle to a text page information structure. 79 // Returned by FPDFText_LoadPage function. 80 // index - Zero-based index of the character. 81 // Return value: 82 // The font size of the particular character, measured in points (about 83 // 1/72 inch). 84 // This is the typographic size of the font (so called "em size"). 85 // 86 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, 87 int index); 88 89 // Function: FPDFText_GetCharBox 90 // Get bounding box of a particular character. 91 // Parameters: 92 // text_page - Handle to a text page information structure. 93 // Returned by FPDFText_LoadPage function. 94 // index - Zero-based index of the character. 95 // left - Pointer to a double number receiving left position 96 // of the character box. 97 // right - Pointer to a double number receiving right position 98 // of the character box. 99 // bottom - Pointer to a double number receiving bottom position 100 // of the character box. 101 // top - Pointer to a double number receiving top position of 102 // the character box. 103 // Return Value: 104 // None. 105 // Comments: 106 // All positions are measured in PDF "user space". 107 // 108 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page, 109 int index, 110 double* left, 111 double* right, 112 double* bottom, 113 double* top); 114 115 // Function: FPDFText_GetCharIndexAtPos 116 // Get the index of a character at or nearby a certain position on the 117 // page. 118 // Parameters: 119 // text_page - Handle to a text page information structure. 120 // Returned by FPDFText_LoadPage function. 121 // x - X position in PDF "user space". 122 // y - Y position in PDF "user space". 123 // xTolerance - An x-axis tolerance value for character hit 124 // detection, in point unit. 125 // yTolerance - A y-axis tolerance value for character hit 126 // detection, in point unit. 127 // Return Value: 128 // The zero-based index of the character at, or nearby the point (x,y). 129 // If there is no character at or nearby the point, return value will 130 // be -1. 131 // If an error occurs, -3 will be returned. 132 // 133 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page, 134 double x, 135 double y, 136 double xTolerance, 137 double yTolerance); 138 139 // Function: FPDFText_GetText 140 // Extract unicode text string from the page. 141 // Parameters: 142 // text_page - Handle to a text page information structure. 143 // Returned by FPDFText_LoadPage function. 144 // start_index - Index for the start characters. 145 // count - Number of characters to be extracted. 146 // result - A buffer (allocated by application) receiving the 147 // extracted unicodes. 148 // The size of the buffer must be able to hold the 149 // number of characters plus a terminator. 150 // Return Value: 151 // Number of characters written into the result buffer, including the 152 // trailing terminator. 153 // Comments: 154 // This function ignores characters without unicode information. 155 // 156 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page, 157 int start_index, 158 int count, 159 unsigned short* result); 160 161 // Function: FPDFText_CountRects 162 // Count number of rectangular areas occupied by a segment of texts. 163 // Parameters: 164 // text_page - Handle to a text page information structure. 165 // Returned by FPDFText_LoadPage function. 166 // start_index - Index for the start characters. 167 // count - Number of characters. 168 // Return value: 169 // Number of rectangles. Zero for error. 170 // Comments: 171 // This function, along with FPDFText_GetRect can be used by 172 // applications to detect the position 173 // on the page for a text segment, so proper areas can be highlighted 174 // or something. 175 // FPDFTEXT will automatically merge small character boxes into bigger 176 // one if those characters 177 // are on the same line and use same font settings. 178 // 179 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page, 180 int start_index, 181 int count); 182 183 // Function: FPDFText_GetRect 184 // Get a rectangular area from the result generated by 185 // FPDFText_CountRects. 186 // Parameters: 187 // text_page - Handle to a text page information structure. 188 // Returned by FPDFText_LoadPage function. 189 // rect_index - Zero-based index for the rectangle. 190 // left - Pointer to a double value receiving the rectangle 191 // left boundary. 192 // top - Pointer to a double value receiving the rectangle 193 // top boundary. 194 // right - Pointer to a double value receiving the rectangle 195 // right boundary. 196 // bottom - Pointer to a double value receiving the rectangle 197 // bottom boundary. 198 // Return Value: 199 // None. 200 // 201 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page, 202 int rect_index, 203 double* left, 204 double* top, 205 double* right, 206 double* bottom); 207 208 // Function: FPDFText_GetBoundedText 209 // Extract unicode text within a rectangular boundary on the page. 210 // Parameters: 211 // text_page - Handle to a text page information structure. 212 // Returned by FPDFText_LoadPage function. 213 // left - Left boundary. 214 // top - Top boundary. 215 // right - Right boundary. 216 // bottom - Bottom boundary. 217 // buffer - A unicode buffer. 218 // buflen - Number of characters (not bytes) for the buffer, 219 // excluding an additional terminator. 220 // Return Value: 221 // If buffer is NULL or buflen is zero, return number of characters 222 // (not bytes) of text present within 223 // the rectangle, excluding a terminating NUL. Generally you should 224 // pass a buffer at least one larger 225 // than this if you want a terminating NUL, which will be provided if 226 // space is available. 227 // Otherwise, return number of characters copied into the buffer, 228 // including the terminating NUL 229 // when space for it is available. 230 // Comment: 231 // If the buffer is too small, as much text as will fit is copied into 232 // it. 233 // 234 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page, 235 double left, 236 double top, 237 double right, 238 double bottom, 239 unsigned short* buffer, 240 int buflen); 241 242 // Flags used by FPDFText_FindStart function. 243 #define FPDF_MATCHCASE \ 244 0x00000001 // If not set, it will not match case by default. 245 #define FPDF_MATCHWHOLEWORD \ 246 0x00000002 // If not set, it will not match the whole word by default. 247 248 // Function: FPDFText_FindStart 249 // Start a search. 250 // Parameters: 251 // text_page - Handle to a text page information structure. 252 // Returned by FPDFText_LoadPage function. 253 // findwhat - A unicode match pattern. 254 // flags - Option flags. 255 // start_index - Start from this character. -1 for end of the page. 256 // Return Value: 257 // A handle for the search context. FPDFText_FindClose must be called 258 // to release this handle. 259 // 260 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page, 261 FPDF_WIDESTRING findwhat, 262 unsigned long flags, 263 int start_index); 264 265 // Function: FPDFText_FindNext 266 // Search in the direction from page start to end. 267 // Parameters: 268 // handle - A search context handle returned by 269 // FPDFText_FindStart. 270 // Return Value: 271 // Whether a match is found. 272 // 273 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle); 274 275 // Function: FPDFText_FindPrev 276 // Search in the direction from page end to start. 277 // Parameters: 278 // handle - A search context handle returned by 279 // FPDFText_FindStart. 280 // Return Value: 281 // Whether a match is found. 282 // 283 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle); 284 285 // Function: FPDFText_GetSchResultIndex 286 // Get the starting character index of the search result. 287 // Parameters: 288 // handle - A search context handle returned by 289 // FPDFText_FindStart. 290 // Return Value: 291 // Index for the starting character. 292 // 293 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle); 294 295 // Function: FPDFText_GetSchCount 296 // Get the number of matched characters in the search result. 297 // Parameters: 298 // handle - A search context handle returned by 299 // FPDFText_FindStart. 300 // Return Value: 301 // Number of matched characters. 302 // 303 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle); 304 305 // Function: FPDFText_FindClose 306 // Release a search context. 307 // Parameters: 308 // handle - A search context handle returned by 309 // FPDFText_FindStart. 310 // Return Value: 311 // None. 312 // 313 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle); 314 315 // Function: FPDFLink_LoadWebLinks 316 // Prepare information about weblinks in a page. 317 // Parameters: 318 // text_page - Handle to a text page information structure. 319 // Returned by FPDFText_LoadPage function. 320 // Return Value: 321 // A handle to the page's links information structure. 322 // NULL if something goes wrong. 323 // Comments: 324 // Weblinks are those links implicitly embedded in PDF pages. PDF also 325 // has a type of 326 // annotation called "link", FPDFTEXT doesn't deal with that kind of 327 // link. 328 // FPDFTEXT weblink feature is useful for automatically detecting links 329 // in the page 330 // contents. For example, things like "http://www.foxitsoftware.com" 331 // will be detected, 332 // so applications can allow user to click on those characters to 333 // activate the link, 334 // even the PDF doesn't come with link annotations. 335 // 336 // FPDFLink_CloseWebLinks must be called to release resources. 337 // 338 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page); 339 340 // Function: FPDFLink_CountWebLinks 341 // Count number of detected web links. 342 // Parameters: 343 // link_page - Handle returned by FPDFLink_LoadWebLinks. 344 // Return Value: 345 // Number of detected web links. 346 // 347 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page); 348 349 // Function: FPDFLink_GetURL 350 // Fetch the URL information for a detected web link. 351 // Parameters: 352 // link_page - Handle returned by FPDFLink_LoadWebLinks. 353 // link_index - Zero-based index for the link. 354 // buffer - A unicode buffer. 355 // buflen - Number of characters (not bytes) for the buffer, 356 // including an additional terminator. 357 // Return Value: 358 // If buffer is NULL or buflen is zero, return number of characters 359 // (not bytes and an additional terminator is also counted) needed, 360 // otherwise, return number of characters copied into the buffer. 361 // 362 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page, 363 int link_index, 364 unsigned short* buffer, 365 int buflen); 366 367 // Function: FPDFLink_CountRects 368 // Count number of rectangular areas for the link. 369 // Parameters: 370 // link_page - Handle returned by FPDFLink_LoadWebLinks. 371 // link_index - Zero-based index for the link. 372 // Return Value: 373 // Number of rectangular areas for the link. 374 // 375 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page, 376 int link_index); 377 378 // Function: FPDFLink_GetRect 379 // Fetch the boundaries of a rectangle for a link. 380 // Parameters: 381 // link_page - Handle returned by FPDFLink_LoadWebLinks. 382 // link_index - Zero-based index for the link. 383 // rect_index - Zero-based index for a rectangle. 384 // left - Pointer to a double value receiving the rectangle 385 // left boundary. 386 // top - Pointer to a double value receiving the rectangle 387 // top boundary. 388 // right - Pointer to a double value receiving the rectangle 389 // right boundary. 390 // bottom - Pointer to a double value receiving the rectangle 391 // bottom boundary. 392 // Return Value: 393 // None. 394 // 395 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page, 396 int link_index, 397 int rect_index, 398 double* left, 399 double* top, 400 double* right, 401 double* bottom); 402 403 // Function: FPDFLink_CloseWebLinks 404 // Release resources used by weblink feature. 405 // Parameters: 406 // link_page - Handle returned by FPDFLink_LoadWebLinks. 407 // Return Value: 408 // None. 409 // 410 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page); 411 412 #ifdef __cplusplus 413 } 414 #endif 415 416 #endif // PUBLIC_FPDF_TEXT_H_ 417