1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef PUBLIC_FPDF_TEXT_H_ 8 #define PUBLIC_FPDF_TEXT_H_ 9 10 // NOLINTNEXTLINE(build/include) 11 #include "fpdfview.h" 12 13 // Exported Functions 14 #ifdef __cplusplus 15 extern "C" { 16 #endif 17 18 // Function: FPDFText_LoadPage 19 // Prepare information about all characters in a page. 20 // Parameters: 21 // page - Handle to the page. Returned by FPDF_LoadPage function 22 // (in FPDFVIEW module). 23 // Return value: 24 // A handle to the text page information structure. 25 // NULL if something goes wrong. 26 // Comments: 27 // Application must call FPDFText_ClosePage to release the text page 28 // information. 29 // 30 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page); 31 32 // Function: FPDFText_ClosePage 33 // Release all resources allocated for a text page information 34 // structure. 35 // Parameters: 36 // text_page - Handle to a text page information structure. 37 // Returned by FPDFText_LoadPage function. 38 // Return Value: 39 // None. 40 // 41 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page); 42 43 // Function: FPDFText_CountChars 44 // Get number of characters in a page. 45 // Parameters: 46 // text_page - Handle to a text page information structure. 47 // Returned by FPDFText_LoadPage function. 48 // Return value: 49 // Number of characters in the page. Return -1 for error. 50 // Generated characters, like additional space characters, new line 51 // characters, are also counted. 52 // Comments: 53 // Characters in a page form a "stream", inside the stream, each 54 // character has an index. 55 // We will use the index parameters in many of FPDFTEXT functions. The 56 // first character in the page 57 // has an index value of zero. 58 // 59 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page); 60 61 // Function: FPDFText_GetUnicode 62 // Get Unicode of a character in a page. 63 // Parameters: 64 // text_page - Handle to a text page information structure. 65 // Returned by FPDFText_LoadPage function. 66 // index - Zero-based index of the character. 67 // Return value: 68 // The Unicode of the particular character. 69 // If a character is not encoded in Unicode and Foxit engine can't 70 // convert to Unicode, 71 // the return value will be zero. 72 // 73 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, 74 int index); 75 76 // Function: FPDFText_GetFontSize 77 // Get the font size of a particular character. 78 // Parameters: 79 // text_page - Handle to a text page information structure. 80 // Returned by FPDFText_LoadPage function. 81 // index - Zero-based index of the character. 82 // Return value: 83 // The font size of the particular character, measured in points (about 84 // 1/72 inch). 85 // This is the typographic size of the font (so called "em size"). 86 // 87 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, 88 int index); 89 90 // Function: FPDFText_GetCharBox 91 // Get bounding box of a particular character. 92 // Parameters: 93 // text_page - Handle to a text page information structure. 94 // Returned by FPDFText_LoadPage function. 95 // index - Zero-based index of the character. 96 // left - Pointer to a double number receiving left position 97 // of the character box. 98 // right - Pointer to a double number receiving right position 99 // of the character box. 100 // bottom - Pointer to a double number receiving bottom position 101 // of the character box. 102 // top - Pointer to a double number receiving top position of 103 // the character box. 104 // Return Value: 105 // None. 106 // Comments: 107 // All positions are measured in PDF "user space". 108 // 109 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page, 110 int index, 111 double* left, 112 double* right, 113 double* bottom, 114 double* top); 115 116 // Function: FPDFText_GetCharIndexAtPos 117 // Get the index of a character at or nearby a certain position on the 118 // page. 119 // Parameters: 120 // text_page - Handle to a text page information structure. 121 // Returned by FPDFText_LoadPage function. 122 // x - X position in PDF "user space". 123 // y - Y position in PDF "user space". 124 // xTolerance - An x-axis tolerance value for character hit 125 // detection, in point unit. 126 // yTolerance - A y-axis tolerance value for character hit 127 // detection, in point unit. 128 // Return Value: 129 // The zero-based index of the character at, or nearby the point (x,y). 130 // If there is no character at or nearby the point, return value will 131 // be -1. 132 // If an error occurs, -3 will be returned. 133 // 134 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page, 135 double x, 136 double y, 137 double xTolerance, 138 double yTolerance); 139 140 // Function: FPDFText_GetText 141 // Extract unicode text string from the page. 142 // Parameters: 143 // text_page - Handle to a text page information structure. 144 // Returned by FPDFText_LoadPage function. 145 // start_index - Index for the start characters. 146 // count - Number of characters to be extracted. 147 // result - A buffer (allocated by application) receiving the 148 // extracted unicodes. 149 // The size of the buffer must be able to hold the 150 // number of characters plus a terminator. 151 // Return Value: 152 // Number of characters written into the result buffer, including the 153 // trailing terminator. 154 // Comments: 155 // This function ignores characters without unicode information. 156 // 157 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page, 158 int start_index, 159 int count, 160 unsigned short* result); 161 162 // Function: FPDFText_CountRects 163 // Count number of rectangular areas occupied by a segment of texts. 164 // Parameters: 165 // text_page - Handle to a text page information structure. 166 // Returned by FPDFText_LoadPage function. 167 // start_index - Index for the start characters. 168 // count - Number of characters. 169 // Return value: 170 // Number of rectangles. Zero for error. 171 // Comments: 172 // This function, along with FPDFText_GetRect can be used by 173 // applications to detect the position 174 // on the page for a text segment, so proper areas can be highlighted 175 // or something. 176 // FPDFTEXT will automatically merge small character boxes into bigger 177 // one if those characters 178 // are on the same line and use same font settings. 179 // 180 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page, 181 int start_index, 182 int count); 183 184 // Function: FPDFText_GetRect 185 // Get a rectangular area from the result generated by 186 // FPDFText_CountRects. 187 // Parameters: 188 // text_page - Handle to a text page information structure. 189 // Returned by FPDFText_LoadPage function. 190 // rect_index - Zero-based index for the rectangle. 191 // left - Pointer to a double value receiving the rectangle 192 // left boundary. 193 // top - Pointer to a double value receiving the rectangle 194 // top boundary. 195 // right - Pointer to a double value receiving the rectangle 196 // right boundary. 197 // bottom - Pointer to a double value receiving the rectangle 198 // bottom boundary. 199 // Return Value: 200 // None. 201 // 202 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page, 203 int rect_index, 204 double* left, 205 double* top, 206 double* right, 207 double* bottom); 208 209 // Function: FPDFText_GetBoundedText 210 // Extract unicode text within a rectangular boundary on the page. 211 // Parameters: 212 // text_page - Handle to a text page information structure. 213 // Returned by FPDFText_LoadPage function. 214 // left - Left boundary. 215 // top - Top boundary. 216 // right - Right boundary. 217 // bottom - Bottom boundary. 218 // buffer - A unicode buffer. 219 // buflen - Number of characters (not bytes) for the buffer, 220 // excluding an additional terminator. 221 // Return Value: 222 // If buffer is NULL or buflen is zero, return number of characters 223 // (not bytes) of text present within 224 // the rectangle, excluding a terminating NUL. Generally you should 225 // pass a buffer at least one larger 226 // than this if you want a terminating NUL, which will be provided if 227 // space is available. 228 // Otherwise, return number of characters copied into the buffer, 229 // including the terminating NUL 230 // when space for it is available. 231 // Comment: 232 // If the buffer is too small, as much text as will fit is copied into 233 // it. 234 // 235 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page, 236 double left, 237 double top, 238 double right, 239 double bottom, 240 unsigned short* buffer, 241 int buflen); 242 243 // Flags used by FPDFText_FindStart function. 244 #define FPDF_MATCHCASE \ 245 0x00000001 // If not set, it will not match case by default. 246 #define FPDF_MATCHWHOLEWORD \ 247 0x00000002 // If not set, it will not match the whole word by default. 248 249 // Function: FPDFText_FindStart 250 // Start a search. 251 // Parameters: 252 // text_page - Handle to a text page information structure. 253 // Returned by FPDFText_LoadPage function. 254 // findwhat - A unicode match pattern. 255 // flags - Option flags. 256 // start_index - Start from this character. -1 for end of the page. 257 // Return Value: 258 // A handle for the search context. FPDFText_FindClose must be called 259 // to release this handle. 260 // 261 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page, 262 FPDF_WIDESTRING findwhat, 263 unsigned long flags, 264 int start_index); 265 266 // Function: FPDFText_FindNext 267 // Search in the direction from page start to end. 268 // Parameters: 269 // handle - A search context handle returned by 270 // FPDFText_FindStart. 271 // Return Value: 272 // Whether a match is found. 273 // 274 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle); 275 276 // Function: FPDFText_FindPrev 277 // Search in the direction from page end to start. 278 // Parameters: 279 // handle - A search context handle returned by 280 // FPDFText_FindStart. 281 // Return Value: 282 // Whether a match is found. 283 // 284 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle); 285 286 // Function: FPDFText_GetSchResultIndex 287 // Get the starting character index of the search result. 288 // Parameters: 289 // handle - A search context handle returned by 290 // FPDFText_FindStart. 291 // Return Value: 292 // Index for the starting character. 293 // 294 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle); 295 296 // Function: FPDFText_GetSchCount 297 // Get the number of matched characters in the search result. 298 // Parameters: 299 // handle - A search context handle returned by 300 // FPDFText_FindStart. 301 // Return Value: 302 // Number of matched characters. 303 // 304 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle); 305 306 // Function: FPDFText_FindClose 307 // Release a search context. 308 // Parameters: 309 // handle - A search context handle returned by 310 // FPDFText_FindStart. 311 // Return Value: 312 // None. 313 // 314 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle); 315 316 // Function: FPDFLink_LoadWebLinks 317 // Prepare information about weblinks in a page. 318 // Parameters: 319 // text_page - Handle to a text page information structure. 320 // Returned by FPDFText_LoadPage function. 321 // Return Value: 322 // A handle to the page's links information structure. 323 // NULL if something goes wrong. 324 // Comments: 325 // Weblinks are those links implicitly embedded in PDF pages. PDF also 326 // has a type of 327 // annotation called "link", FPDFTEXT doesn't deal with that kind of 328 // link. 329 // FPDFTEXT weblink feature is useful for automatically detecting links 330 // in the page 331 // contents. For example, things like "http://www.foxitsoftware.com" 332 // will be detected, 333 // so applications can allow user to click on those characters to 334 // activate the link, 335 // even the PDF doesn't come with link annotations. 336 // 337 // FPDFLink_CloseWebLinks must be called to release resources. 338 // 339 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page); 340 341 // Function: FPDFLink_CountWebLinks 342 // Count number of detected web links. 343 // Parameters: 344 // link_page - Handle returned by FPDFLink_LoadWebLinks. 345 // Return Value: 346 // Number of detected web links. 347 // 348 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page); 349 350 // Function: FPDFLink_GetURL 351 // Fetch the URL information for a detected web link. 352 // Parameters: 353 // link_page - Handle returned by FPDFLink_LoadWebLinks. 354 // link_index - Zero-based index for the link. 355 // buffer - A unicode buffer for the result. 356 // buflen - Number of characters (not bytes) for the buffer, 357 // including an additional terminator. 358 // Return Value: 359 // If |buffer| is NULL or |buflen| is zero, return the number of 360 // characters (not bytes) needed to buffer the result (an additional 361 // terminator is included in this count). 362 // Otherwise, copy the result into |buffer|, truncating at |buflen| if 363 // the result is too large to fit, and return the number of characters 364 // actually copied into the buffer (the additional terminator is also 365 // included in this count). 366 // If |link_index| does not correspond to a valid link, then the result 367 // is an empty string. 368 // 369 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page, 370 int link_index, 371 unsigned short* buffer, 372 int buflen); 373 374 // Function: FPDFLink_CountRects 375 // Count number of rectangular areas for the link. 376 // Parameters: 377 // link_page - Handle returned by FPDFLink_LoadWebLinks. 378 // link_index - Zero-based index for the link. 379 // Return Value: 380 // Number of rectangular areas for the link. If |link_index| does 381 // not correspond to a valid link, then 0 is returned. 382 // 383 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page, 384 int link_index); 385 386 // Function: FPDFLink_GetRect 387 // Fetch the boundaries of a rectangle for a link. 388 // Parameters: 389 // link_page - Handle returned by FPDFLink_LoadWebLinks. 390 // link_index - Zero-based index for the link. 391 // rect_index - Zero-based index for a rectangle. 392 // left - Pointer to a double value receiving the rectangle 393 // left boundary. 394 // top - Pointer to a double value receiving the rectangle 395 // top boundary. 396 // right - Pointer to a double value receiving the rectangle 397 // right boundary. 398 // bottom - Pointer to a double value receiving the rectangle 399 // bottom boundary. 400 // Return Value: 401 // None. If |link_index| does not correspond to a valid link, then 402 // |left|, |top|, |right|, and |bottom| remain unmodified. 403 // 404 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page, 405 int link_index, 406 int rect_index, 407 double* left, 408 double* top, 409 double* right, 410 double* bottom); 411 412 // Function: FPDFLink_CloseWebLinks 413 // Release resources used by weblink feature. 414 // Parameters: 415 // link_page - Handle returned by FPDFLink_LoadWebLinks. 416 // Return Value: 417 // None. 418 // 419 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page); 420 421 #ifdef __cplusplus 422 } 423 #endif 424 425 #endif // PUBLIC_FPDF_TEXT_H_ 426