1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2004-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: uregex.h 9 * encoding: UTF-8 10 * indentation:4 11 * 12 * created on: 2004mar09 13 * created by: Andy Heninger 14 * 15 * ICU Regular Expressions, API for C 16 */ 17 18 /** 19 * \file 20 * \brief C API: Regular Expressions 21 * 22 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 23 */ 24 25 #ifndef UREGEX_H 26 #define UREGEX_H 27 28 #include "unicode/utext.h" 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 32 33 #include "unicode/localpointer.h" 34 #include "unicode/parseerr.h" 35 36 struct URegularExpression; 37 /** 38 * Structure representing a compiled regular expression, plus the results 39 * of a match operation. 40 * @stable ICU 3.0 41 */ 42 typedef struct URegularExpression URegularExpression; 43 44 45 /** 46 * Constants for Regular Expression Match Modes. 47 * @stable ICU 2.4 48 */ 49 typedef enum URegexpFlag{ 50 51 #ifndef U_HIDE_DRAFT_API 52 /** Forces normalization of pattern and strings. 53 Not implemented yet, just a placeholder, hence draft. 54 @draft ICU 2.4 */ 55 UREGEX_CANON_EQ = 128, 56 #endif /* U_HIDE_DRAFT_API */ 57 /** Enable case insensitive matching. @stable ICU 2.4 */ 58 UREGEX_CASE_INSENSITIVE = 2, 59 60 /** Allow white space and comments within patterns @stable ICU 2.4 */ 61 UREGEX_COMMENTS = 4, 62 63 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 64 * @stable ICU 2.4 */ 65 UREGEX_DOTALL = 32, 66 67 /** If set, treat the entire pattern as a literal string. 68 * Metacharacters or escape sequences in the input sequence will be given 69 * no special meaning. 
70 * 71 * The flag UREGEX_CASE_INSENSITIVE retains its impact 72 * on matching when used in conjunction with this flag. 73 * The other flags become superfluous. 74 * 75 * @stable ICU 4.0 76 */ 77 UREGEX_LITERAL = 16, 78 79 /** Control behavior of "$" and "^" 80 * If set, recognize line terminators within string, 81 * otherwise, match only at start and end of input string. 82 * @stable ICU 2.4 */ 83 UREGEX_MULTILINE = 8, 84 85 /** Unix-only line endings. 86 * When this mode is enabled, only \\u000a is recognized as a line ending 87 * in the behavior of ., ^, and $. 88 * @stable ICU 4.0 89 */ 90 UREGEX_UNIX_LINES = 1, 91 92 /** Unicode word boundaries. 93 * If set, \b uses the Unicode TR 29 definition of word boundaries. 94 * Warning: Unicode word boundaries are quite different from 95 * traditional regular expression word boundaries. See 96 * http://unicode.org/reports/tr29/#Word_Boundaries 97 * @stable ICU 2.8 98 */ 99 UREGEX_UWORD = 256, 100 101 /** Error on Unrecognized backslash escapes. 102 * If set, fail with an error on patterns that contain 103 * backslash-escaped ASCII letters without a known special 104 * meaning. If this flag is not set, these 105 * escaped letters represent themselves. 106 * @stable ICU 4.0 107 */ 108 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 109 110 } URegexpFlag; 111 112 /** 113 * Open (compile) an ICU regular expression. Compiles the regular expression in 114 * string form into an internal representation using the specified match mode flags. 115 * The resulting regular expression handle can then be used to perform various 116 * matching operations. 117 * 118 * 119 * @param pattern The Regular Expression pattern to be compiled. 120 * @param patternLength The length of the pattern, or -1 if the pattern is 121 * NUL terminated. 122 * @param flags Flags that alter the default matching behavior for 123 * the regular expression, UREGEX_CASE_INSENSITIVE, for 124 * example. For default behavior, set this parameter to zero. 
125 * See <code>enum URegexpFlag</code>. All desired flags 126 * are bitwise-ORed together. 127 * @param pe Receives the position (line and column numbers) of any syntax 128 * error within the source regular expression string. If this 129 * information is not wanted, pass NULL for this parameter. 130 * @param status Receives error detected by this function. 131 * @stable ICU 3.0 132 * 133 */ 134 U_STABLE URegularExpression * U_EXPORT2 135 uregex_open( const UChar *pattern, 136 int32_t patternLength, 137 uint32_t flags, 138 UParseError *pe, 139 UErrorCode *status); 140 141 /** 142 * Open (compile) an ICU regular expression. Compiles the regular expression in 143 * string form into an internal representation using the specified match mode flags. 144 * The resulting regular expression handle can then be used to perform various 145 * matching operations. 146 * <p> 147 * The contents of the pattern UText will be extracted and saved. Ownership of the 148 * UText struct itself remains with the caller. This is to match the behavior of 149 * uregex_open(). 150 * 151 * @param pattern The Regular Expression pattern to be compiled. 152 * @param flags Flags that alter the default matching behavior for 153 * the regular expression, UREGEX_CASE_INSENSITIVE, for 154 * example. For default behavior, set this parameter to zero. 155 * See <code>enum URegexpFlag</code>. All desired flags 156 * are bitwise-ORed together. 157 * @param pe Receives the position (line and column numbers) of any syntax 158 * error within the source regular expression string. If this 159 * information is not wanted, pass NULL for this parameter. 160 * @param status Receives error detected by this function. 161 * 162 * @stable ICU 4.6 163 */ 164 U_STABLE URegularExpression * U_EXPORT2 165 uregex_openUText(UText *pattern, 166 uint32_t flags, 167 UParseError *pe, 168 UErrorCode *status); 169 170 #if !UCONFIG_NO_CONVERSION 171 /** 172 * Open (compile) an ICU regular expression. 
The resulting regular expression 173 * handle can then be used to perform various matching operations. 174 * <p> 175 * This function is the same as uregex_open, except that the pattern 176 * is supplied as an 8 bit char * string in the default code page. 177 * 178 * @param pattern The Regular Expression pattern to be compiled, 179 * NUL terminated. 180 * @param flags Flags that alter the default matching behavior for 181 * the regular expression, UREGEX_CASE_INSENSITIVE, for 182 * example. For default behavior, set this parameter to zero. 183 * See <code>enum URegexpFlag</code>. All desired flags 184 * are bitwise-ORed together. 185 * @param pe Receives the position (line and column numbers) of any syntax 186 * error within the source regular expression string. If this 187 * information is not wanted, pass NULL for this parameter. 188 * @param status Receives errors detected by this function. 189 * @return The URegularExpression object representing the compiled 190 * pattern. 191 * 192 * @stable ICU 3.0 193 */ 194 U_STABLE URegularExpression * U_EXPORT2 195 uregex_openC( const char *pattern, 196 uint32_t flags, 197 UParseError *pe, 198 UErrorCode *status); 199 #endif 200 201 202 203 /** 204 * Close the regular expression, recovering all resources (memory) it 205 * was holding. 206 * 207 * @param regexp The regular expression to be closed. 208 * @stable ICU 3.0 209 */ 210 U_STABLE void U_EXPORT2 211 uregex_close(URegularExpression *regexp); 212 213 #if U_SHOW_CPLUSPLUS_API 214 215 U_NAMESPACE_BEGIN 216 217 /** 218 * \class LocalURegularExpressionPointer 219 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 220 * For most methods see the LocalPointerBase base class. 
221 * 222 * @see LocalPointerBase 223 * @see LocalPointer 224 * @stable ICU 4.4 225 */ 226 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 227 228 U_NAMESPACE_END 229 230 #endif 231 232 /** 233 * Make a copy of a compiled regular expression. Cloning a regular 234 * expression is faster than opening a second instance from the source 235 * form of the expression, and requires less memory. 236 * <p> 237 * Note that the current input string and the position of any matched text 238 * within it are not cloned; only the pattern itself and the 239 * match mode flags are copied. 240 * <p> 241 * Cloning can be particularly useful to threaded applications that perform 242 * multiple match operations in parallel. Each concurrent RE 243 * operation requires its own instance of a URegularExpression. 244 * 245 * @param regexp The compiled regular expression to be cloned. 246 * @param status Receives indication of any errors encountered 247 * @return the cloned copy of the compiled regular expression. 248 * @stable ICU 3.0 249 */ 250 U_STABLE URegularExpression * U_EXPORT2 251 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 252 253 /** 254 * Returns a pointer to the source form of the pattern for this regular expression. 255 * This function will work even if the pattern was originally specified as a UText. 256 * 257 * @param regexp The compiled regular expression. 258 * @param patLength This output parameter will be set to the length of the 259 * pattern string. A NULL pointer may be used here if the 260 * pattern length is not needed, as would be the case if 261 * the pattern is known in advance to be a NUL terminated 262 * string. 263 * @param status Receives errors detected by this function. 264 * @return a pointer to the pattern string. The storage for the string is 265 * owned by the regular expression object, and must not be 266 * altered or deleted by the application. 
The returned string 267 * will remain valid until the regular expression is closed. 268 * @stable ICU 3.0 269 */ 270 U_STABLE const UChar * U_EXPORT2 271 uregex_pattern(const URegularExpression *regexp, 272 int32_t *patLength, 273 UErrorCode *status); 274 275 /** 276 * Returns the source text of the pattern for this regular expression. 277 * This function will work even if the pattern was originally specified as a UChar string. 278 * 279 * @param regexp The compiled regular expression. 280 * @param status Receives errors detected by this function. 281 * @return the pattern text. The storage for the text is owned by the regular expression 282 * object, and must not be altered or deleted. 283 * 284 * @stable ICU 4.6 285 */ 286 U_STABLE UText * U_EXPORT2 287 uregex_patternUText(const URegularExpression *regexp, 288 UErrorCode *status); 289 290 /** 291 * Get the match mode flags that were specified when compiling this regular expression. 292 * @param status Receives errors detected by this function. 293 * @param regexp The compiled regular expression. 294 * @return The match mode flags 295 * @see URegexpFlag 296 * @stable ICU 3.0 297 */ 298 U_STABLE int32_t U_EXPORT2 299 uregex_flags(const URegularExpression *regexp, 300 UErrorCode *status); 301 302 303 /** 304 * Set the subject text string upon which the regular expression will look for matches. 305 * This function may be called any number of times, allowing the regular 306 * expression pattern to be applied to different strings. 307 * <p> 308 * Regular expression matching operations work directly on the application's 309 * string data. No copy is made. The subject string data must not be 310 * altered after calling this function until after all regular expression 311 * operations involving this string data are completed. 312 * <p> 313 * Zero length strings are permitted. In this case, no subsequent match 314 * operation will dereference the text string pointer. 
315 * 316 * @param regexp The compiled regular expression. 317 * @param text The subject text string. 318 * @param textLength The length of the subject text, or -1 if the string 319 * is NUL terminated. 320 * @param status Receives errors detected by this function. 321 * @stable ICU 3.0 322 */ 323 U_STABLE void U_EXPORT2 324 uregex_setText(URegularExpression *regexp, 325 const UChar *text, 326 int32_t textLength, 327 UErrorCode *status); 328 329 330 /** 331 * Set the subject text string upon which the regular expression will look for matches. 332 * This function may be called any number of times, allowing the regular 333 * expression pattern to be applied to different strings. 334 * <p> 335 * Regular expression matching operations work directly on the application's 336 * string data; only a shallow clone is made. The subject string data must not be 337 * altered after calling this function until after all regular expression 338 * operations involving this string data are completed. 339 * 340 * @param regexp The compiled regular expression. 341 * @param text The subject text string. 342 * @param status Receives errors detected by this function. 343 * 344 * @stable ICU 4.6 345 */ 346 U_STABLE void U_EXPORT2 347 uregex_setUText(URegularExpression *regexp, 348 UText *text, 349 UErrorCode *status); 350 351 /** 352 * Get the subject text that is currently associated with this 353 * regular expression object. If the input was supplied using uregex_setText(), 354 * that pointer will be returned. Otherwise, the characters in the input will 355 * be extracted to a buffer and returned. In either case, ownership remains 356 * with the regular expression object. 357 * 358 * This function will work even if the input was originally specified as a UText. 359 * 360 * @param regexp The compiled regular expression. 361 * @param textLength The length of the string is returned in this output parameter. 
362 * A NULL pointer may be used here if the 363 * text length is not needed, as would be the case if 364 * the text is known in advance to be a NUL terminated 365 * string. 366 * @param status Receives errors detected by this function. 367 * @return Pointer to the subject text string currently associated with 368 * this regular expression. 369 * @stable ICU 3.0 370 */ 371 U_STABLE const UChar * U_EXPORT2 372 uregex_getText(URegularExpression *regexp, 373 int32_t *textLength, 374 UErrorCode *status); 375 376 /** 377 * Get the subject text that is currently associated with this 378 * regular expression object. 379 * 380 * This function will work even if the input was originally specified as a UChar string. 381 * 382 * @param regexp The compiled regular expression. 383 * @param dest A mutable UText in which to store the current input. 384 * If NULL, a new UText will be created as an immutable shallow clone 385 * of the actual input string. 386 * @param status Receives errors detected by this function. 387 * @return The subject text currently associated with this regular expression. 388 * If a pre-allocated UText was provided, it will always be used and returned. 389 * 390 * @stable ICU 4.6 391 */ 392 U_STABLE UText * U_EXPORT2 393 uregex_getUText(URegularExpression *regexp, 394 UText *dest, 395 UErrorCode *status); 396 397 /** 398 * Set the subject text string upon which the regular expression is looking for matches 399 * without changing any other aspect of the matching state. 400 * The new and previous text strings must have the same content. 401 * 402 * This function is intended for use in environments where ICU is operating on 403 * strings that may move around in memory. It provides a mechanism for notifying 404 * ICU that the string has been relocated, and providing a new UText to access the 405 * string in its new position. 
406 * 407 * Note that the regular expression implementation never copies the underlying text 408 * of a string being matched, but always operates directly on the original text 409 * provided by the user. Refreshing simply drops the references to the old text 410 * and replaces them with references to the new. 411 * 412 * Caution: this function is normally used only by very specialized 413 * system-level code. One example use case is with garbage collection 414 * that moves the text in memory. 415 * 416 * @param regexp The compiled regular expression. 417 * @param text The new (moved) text string. 418 * @param status Receives errors detected by this function. 419 * 420 * @stable ICU 4.8 421 */ 422 U_STABLE void U_EXPORT2 423 uregex_refreshUText(URegularExpression *regexp, 424 UText *text, 425 UErrorCode *status); 426 427 /** 428 * Attempts to match the input string against the pattern. 429 * To succeed, the match must extend to the end of the string, 430 * or cover the complete match region. 431 * 432 * If startIndex >= zero the match operation starts at the specified 433 * index and must extend to the end of the input string. Any region 434 * that has been specified is reset. 435 * 436 * If startIndex == -1 the match must cover the input region, or the entire 437 * input string if no region has been set. This directly corresponds to 438 * Matcher.matches() in Java 439 * 440 * @param regexp The compiled regular expression. 441 * @param startIndex The input string (native) index at which to begin matching, or -1 442 * to match the input Region. 443 * @param status Receives errors detected by this function. 444 * @return TRUE if there is a match 445 * @stable ICU 3.0 446 */ 447 U_STABLE UBool U_EXPORT2 448 uregex_matches(URegularExpression *regexp, 449 int32_t startIndex, 450 UErrorCode *status); 451 452 /** 453 * 64bit version of uregex_matches. 454 * Attempts to match the input string against the pattern. 
455 * To succeed, the match must extend to the end of the string, 456 * or cover the complete match region. 457 * 458 * If startIndex >= zero the match operation starts at the specified 459 * index and must extend to the end of the input string. Any region 460 * that has been specified is reset. 461 * 462 * If startIndex == -1 the match must cover the input region, or the entire 463 * input string if no region has been set. This directly corresponds to 464 * Matcher.matches() in Java 465 * 466 * @param regexp The compiled regular expression. 467 * @param startIndex The input string (native) index at which to begin matching, or -1 468 * to match the input Region. 469 * @param status Receives errors detected by this function. 470 * @return TRUE if there is a match 471 * @stable ICU 4.6 472 */ 473 U_STABLE UBool U_EXPORT2 474 uregex_matches64(URegularExpression *regexp, 475 int64_t startIndex, 476 UErrorCode *status); 477 478 /** 479 * Attempts to match the input string, starting from the specified index, against the pattern. 480 * The match may be of any length, and is not required to extend to the end 481 * of the input string. Contrast with uregex_matches(). 482 * 483 * <p>If startIndex is >= 0 any input region that was set for this 484 * URegularExpression is reset before the operation begins. 485 * 486 * <p>If the specified starting index == -1 the match begins at the start of the input 487 * region, or at the start of the full string if no region has been specified. 488 * This corresponds directly with Matcher.lookingAt() in Java. 489 * 490 * <p>If the match succeeds then more information can be obtained via the 491 * <code>uregexp_start()</code>, <code>uregexp_end()</code>, 492 * and <code>uregex_group()</code> functions.</p> 493 * 494 * @param regexp The compiled regular expression. 
495 * @param startIndex The input string (native) index at which to begin matching, or 496 * -1 to match the Input Region 497 * @param status A reference to a UErrorCode to receive any errors. 498 * @return TRUE if there is a match. 499 * @stable ICU 3.0 500 */ 501 U_STABLE UBool U_EXPORT2 502 uregex_lookingAt(URegularExpression *regexp, 503 int32_t startIndex, 504 UErrorCode *status); 505 506 /** 507 * 64bit version of uregex_lookingAt. 508 * Attempts to match the input string, starting from the specified index, against the pattern. 509 * The match may be of any length, and is not required to extend to the end 510 * of the input string. Contrast with uregex_matches(). 511 * 512 * <p>If startIndex is >= 0 any input region that was set for this 513 * URegularExpression is reset before the operation begins. 514 * 515 * <p>If the specified starting index == -1 the match begins at the start of the input 516 * region, or at the start of the full string if no region has been specified. 517 * This corresponds directly with Matcher.lookingAt() in Java. 518 * 519 * <p>If the match succeeds then more information can be obtained via the 520 * <code>uregexp_start()</code>, <code>uregexp_end()</code>, 521 * and <code>uregex_group()</code> functions.</p> 522 * 523 * @param regexp The compiled regular expression. 524 * @param startIndex The input string (native) index at which to begin matching, or 525 * -1 to match the Input Region 526 * @param status A reference to a UErrorCode to receive any errors. 527 * @return TRUE if there is a match. 528 * @stable ICU 4.6 529 */ 530 U_STABLE UBool U_EXPORT2 531 uregex_lookingAt64(URegularExpression *regexp, 532 int64_t startIndex, 533 UErrorCode *status); 534 535 /** 536 * Find the first matching substring of the input string that matches the pattern. 537 * If startIndex is >= zero the search for a match begins at the specified index, 538 * and any match region is reset. 
This corresponds directly with 539 * Matcher.find(startIndex) in Java. 540 * 541 * If startIndex == -1 the search begins at the start of the input region, 542 * or at the start of the full string if no region has been specified. 543 * 544 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 545 * <code>uregex_group()</code> will provide more information regarding the match. 546 * 547 * @param regexp The compiled regular expression. 548 * @param startIndex The position (native) in the input string to begin the search, or 549 * -1 to search within the Input Region. 550 * @param status A reference to a UErrorCode to receive any errors. 551 * @return TRUE if a match is found. 552 * @stable ICU 3.0 553 */ 554 U_STABLE UBool U_EXPORT2 555 uregex_find(URegularExpression *regexp, 556 int32_t startIndex, 557 UErrorCode *status); 558 559 /** 560 * 64bit version of uregex_find. 561 * Find the first matching substring of the input string that matches the pattern. 562 * If startIndex is >= zero the search for a match begins at the specified index, 563 * and any match region is reset. This corresponds directly with 564 * Matcher.find(startIndex) in Java. 565 * 566 * If startIndex == -1 the search begins at the start of the input region, 567 * or at the start of the full string if no region has been specified. 568 * 569 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 570 * <code>uregex_group()</code> will provide more information regarding the match. 571 * 572 * @param regexp The compiled regular expression. 573 * @param startIndex The position (native) in the input string to begin the search, or 574 * -1 to search within the Input Region. 575 * @param status A reference to a UErrorCode to receive any errors. 576 * @return TRUE if a match is found. 
577 * @stable ICU 4.6 578 */ 579 U_STABLE UBool U_EXPORT2 580 uregex_find64(URegularExpression *regexp, 581 int64_t startIndex, 582 UErrorCode *status); 583 584 /** 585 * Find the next pattern match in the input string. Begin searching 586 * the input at the location following the end of he previous match, 587 * or at the start of the string (or region) if there is no 588 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 589 * <code>uregex_group()</code> will provide more information regarding the match. 590 * 591 * @param regexp The compiled regular expression. 592 * @param status A reference to a UErrorCode to receive any errors. 593 * @return TRUE if a match is found. 594 * @see uregex_reset 595 * @stable ICU 3.0 596 */ 597 U_STABLE UBool U_EXPORT2 598 uregex_findNext(URegularExpression *regexp, 599 UErrorCode *status); 600 601 /** 602 * Get the number of capturing groups in this regular expression's pattern. 603 * @param regexp The compiled regular expression. 604 * @param status A reference to a UErrorCode to receive any errors. 605 * @return the number of capture groups 606 * @stable ICU 3.0 607 */ 608 U_STABLE int32_t U_EXPORT2 609 uregex_groupCount(URegularExpression *regexp, 610 UErrorCode *status); 611 612 /** 613 * Get the group number corresponding to a named capture group. 614 * The returned number can be used with any function that access 615 * capture groups by number. 616 * 617 * The function returns an error status if the specified name does not 618 * appear in the pattern. 619 * 620 * @param regexp The compiled regular expression. 621 * @param groupName The capture group name. 622 * @param nameLength The length of the name, or -1 if the name is a 623 * nul-terminated string. 624 * @param status A pointer to a UErrorCode to receive any errors. 
625 * 626 * @stable ICU 55 627 */ 628 U_STABLE int32_t U_EXPORT2 629 uregex_groupNumberFromName(URegularExpression *regexp, 630 const UChar *groupName, 631 int32_t nameLength, 632 UErrorCode *status); 633 634 635 /** 636 * Get the group number corresponding to a named capture group. 637 * The returned number can be used with any function that access 638 * capture groups by number. 639 * 640 * The function returns an error status if the specified name does not 641 * appear in the pattern. 642 * 643 * @param regexp The compiled regular expression. 644 * @param groupName The capture group name, 645 * platform invariant characters only. 646 * @param nameLength The length of the name, or -1 if the name is 647 * nul-terminated. 648 * @param status A pointer to a UErrorCode to receive any errors. 649 * 650 * @stable ICU 55 651 */ 652 U_STABLE int32_t U_EXPORT2 653 uregex_groupNumberFromCName(URegularExpression *regexp, 654 const char *groupName, 655 int32_t nameLength, 656 UErrorCode *status); 657 658 /** Extract the string for the specified matching expression or subexpression. 659 * Group #0 is the complete string of matched text. 660 * Group #1 is the text matched by the first set of capturing parentheses. 661 * 662 * @param regexp The compiled regular expression. 663 * @param groupNum The capture group to extract. Group 0 is the complete 664 * match. The value of this parameter must be 665 * less than or equal to the number of capture groups in 666 * the pattern. 667 * @param dest Buffer to receive the matching string data 668 * @param destCapacity Capacity of the dest buffer. 669 * @param status A reference to a UErrorCode to receive any errors. 670 * @return Length of matching data, 671 * or -1 if no applicable match. 
672 * @stable ICU 3.0 673 */ 674 U_STABLE int32_t U_EXPORT2 675 uregex_group(URegularExpression *regexp, 676 int32_t groupNum, 677 UChar *dest, 678 int32_t destCapacity, 679 UErrorCode *status); 680 681 /** Returns a shallow immutable clone of the entire input string with the current index set 682 * to the beginning of the requested capture group. The capture group length is also 683 * returned via groupLength. 684 * Group #0 is the complete string of matched text. 685 * Group #1 is the text matched by the first set of capturing parentheses. 686 * 687 * @param regexp The compiled regular expression. 688 * @param groupNum The capture group to extract. Group 0 is the complete 689 * match. The value of this parameter must be 690 * less than or equal to the number of capture groups in 691 * the pattern. 692 * @param dest A mutable UText in which to store the current input. 693 * If NULL, a new UText will be created as an immutable shallow clone 694 * of the entire input string. 695 * @param groupLength The group length of the desired capture group. Output parameter. 696 * @param status A reference to a UErrorCode to receive any errors. 697 * @return The subject text currently associated with this regular expression. 698 * If a pre-allocated UText was provided, it will always be used and returned. 699 700 * 701 * @stable ICU 4.6 702 */ 703 U_STABLE UText * U_EXPORT2 704 uregex_groupUText(URegularExpression *regexp, 705 int32_t groupNum, 706 UText *dest, 707 int64_t *groupLength, 708 UErrorCode *status); 709 710 /** 711 * Returns the index in the input string of the start of the text matched by the 712 * specified capture group during the previous match operation. Return -1 if 713 * the capture group was not part of the last match. 714 * Group #0 refers to the complete range of matched text. 715 * Group #1 refers to the text matched by the first set of capturing parentheses. 716 * 717 * @param regexp The compiled regular expression. 
718 * @param groupNum The capture group number 719 * @param status A reference to a UErrorCode to receive any errors. 720 * @return the starting (native) position in the input of the text matched 721 * by the specified group. 722 * @stable ICU 3.0 723 */ 724 U_STABLE int32_t U_EXPORT2 725 uregex_start(URegularExpression *regexp, 726 int32_t groupNum, 727 UErrorCode *status); 728 729 /** 730 * 64bit version of uregex_start. 731 * Returns the index in the input string of the start of the text matched by the 732 * specified capture group during the previous match operation. Returns -1 if 733 * the capture group was not part of the last match. 734 * Group #0 refers to the complete range of matched text. 735 * Group #1 refers to the text matched by the first set of capturing parentheses. 736 * 737 * @param regexp The compiled regular expression. 738 * @param groupNum The capture group number. 739 * @param status A reference to a UErrorCode to receive any errors. 740 * @return the starting (native) position in the input of the text matched 741 * by the specified group. 742 * @stable ICU 4.6 743 */ 744 U_STABLE int64_t U_EXPORT2 745 uregex_start64(URegularExpression *regexp, 746 int32_t groupNum, 747 UErrorCode *status); 748 749 /** 750 * Returns the index in the input string of the position following the end 751 * of the text matched by the specified capture group. 752 * Returns -1 if the capture group was not part of the last match. 753 * Group #0 refers to the complete range of matched text. 754 * Group #1 refers to the text matched by the first set of capturing parentheses. 755 * 756 * @param regexp The compiled regular expression. 757 * @param groupNum The capture group number. 758 * @param status A reference to a UErrorCode to receive any errors. 759 * @return the (native) index of the position following the last matched character.
760 * @stable ICU 3.0 761 */ 762 U_STABLE int32_t U_EXPORT2 763 uregex_end(URegularExpression *regexp, 764 int32_t groupNum, 765 UErrorCode *status); 766 767 /** 768 * 64bit version of uregex_end. 769 * Returns the index in the input string of the position following the end 770 * of the text matched by the specified capture group. 771 * Returns -1 if the capture group was not part of the last match. 772 * Group #0 refers to the complete range of matched text. 773 * Group #1 refers to the text matched by the first set of capturing parentheses. 774 * 775 * @param regexp The compiled regular expression. 776 * @param groupNum The capture group number. 777 * @param status A reference to a UErrorCode to receive any errors. 778 * @return the (native) index of the position following the last matched character. 779 * @stable ICU 4.6 780 */ 781 U_STABLE int64_t U_EXPORT2 782 uregex_end64(URegularExpression *regexp, 783 int32_t groupNum, 784 UErrorCode *status); 785 786 /** 787 * Reset any saved state from the previous match. Has the effect of 788 * causing uregex_findNext() to begin at the specified index, and causing 789 * uregex_start(), uregex_end() and uregex_group() to return an error 790 * indicating that there is no match information available. Clears any 791 * match region that may have been set. 792 * 793 * @param regexp The compiled regular expression. 794 * @param index The position (native) in the text at which a 795 * uregex_findNext() should begin searching. 796 * @param status A reference to a UErrorCode to receive any errors. 797 * @stable ICU 3.0 798 */ 799 U_STABLE void U_EXPORT2 800 uregex_reset(URegularExpression *regexp, 801 int32_t index, 802 UErrorCode *status); 803 804 /** 805 * 64bit version of uregex_reset. 806 * Reset any saved state from the previous match.
Has the effect of 807 * causing uregex_findNext() to begin at the specified index, and causing 808 * uregex_start(), uregex_end() and uregex_group() to return an error 809 * indicating that there is no match information available. Clears any 810 * match region that may have been set. 811 * 812 * @param regexp The compiled regular expression. 813 * @param index The position (native) in the text at which a 814 * uregex_findNext() should begin searching. 815 * @param status A reference to a UErrorCode to receive any errors. 816 * @stable ICU 4.6 817 */ 818 U_STABLE void U_EXPORT2 819 uregex_reset64(URegularExpression *regexp, 820 int64_t index, 821 UErrorCode *status); 822 823 /** 824 * Sets the limits of the matching region for this URegularExpression. 825 * The region is the part of the input string that will be considered when matching. 826 * Invoking this method resets any saved state from the previous match, 827 * then sets the region to start at the index specified by the start parameter 828 * and end at the index specified by the end parameter. 829 * 830 * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds() 831 * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently 832 * at or around the boundaries of the region. 833 * 834 * The function will fail if start is greater than limit, or if either index 835 * is less than zero or greater than the length of the string being matched. 836 * 837 * @param regexp The compiled regular expression. 838 * @param regionStart The (native) index to begin searches at. 839 * @param regionLimit The (native) index to end searches at (exclusive). 840 * @param status A pointer to a UErrorCode to receive any errors. 841 * @stable ICU 4.0 842 */ 843 U_STABLE void U_EXPORT2 844 uregex_setRegion(URegularExpression *regexp, 845 int32_t regionStart, 846 int32_t regionLimit, 847 UErrorCode *status); 848 849 /** 850 * 64bit version of uregex_setRegion.
851 * Sets the limits of the matching region for this URegularExpression. 852 * The region is the part of the input string that will be considered when matching. 853 * Invoking this method resets any saved state from the previous match, 854 * then sets the region to start at the index specified by the start parameter 855 * and end at the index specified by the end parameter. 856 * 857 * Depending on the transparency and anchoring being used (see useTransparentBounds 858 * and useAnchoringBounds), certain constructs such as anchors may behave differently 859 * at or around the boundaries of the region 860 * 861 * The function will fail if start is greater than limit, or if either index 862 * is less than zero or greater than the length of the string being matched. 863 * 864 * @param regexp The compiled regular expression. 865 * @param regionStart The (native) index to begin searches at. 866 * @param regionLimit The (native) index to end searches at (exclusive). 867 * @param status A pointer to a UErrorCode to receive any errors. 868 * @stable ICU 4.6 869 */ 870 U_STABLE void U_EXPORT2 871 uregex_setRegion64(URegularExpression *regexp, 872 int64_t regionStart, 873 int64_t regionLimit, 874 UErrorCode *status); 875 876 /** 877 * Set the matching region and the starting index for subsequent matches 878 * in a single operation. 879 * This is useful because the usual function for setting the starting 880 * index, uregex_reset(), also resets any region limits. 881 * 882 * @param regexp The compiled regular expression. 883 * @param regionStart The (native) index to begin searches at. 884 * @param regionLimit The (native) index to end searches at (exclusive). 885 * @param startIndex The index in the input text at which the next 886 * match operation should begin. 887 * @param status A pointer to a UErrorCode to receive any errors.
888 * @stable ICU 4.6 889 */ 890 U_STABLE void U_EXPORT2 891 uregex_setRegionAndStart(URegularExpression *regexp, 892 int64_t regionStart, 893 int64_t regionLimit, 894 int64_t startIndex, 895 UErrorCode *status); 896 897 /** 898 * Reports the start index of the matching region. Any matches found are limited to 899 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 900 * 901 * @param regexp The compiled regular expression. 902 * @param status A pointer to a UErrorCode to receive any errors. 903 * @return The starting (native) index of this matcher's region. 904 * @stable ICU 4.0 905 */ 906 U_STABLE int32_t U_EXPORT2 907 uregex_regionStart(const URegularExpression *regexp, 908 UErrorCode *status); 909 910 /** 911 * 64bit version of uregex_regionStart. 912 * Reports the start index of the matching region. Any matches found are limited to 913 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 914 * 915 * @param regexp The compiled regular expression. 916 * @param status A pointer to a UErrorCode to receive any errors. 917 * @return The starting (native) index of this matcher's region. 918 * @stable ICU 4.6 919 */ 920 U_STABLE int64_t U_EXPORT2 921 uregex_regionStart64(const URegularExpression *regexp, 922 UErrorCode *status); 923 924 /** 925 * Reports the end index (exclusive) of the matching region for this URegularExpression. 926 * Any matches found are limited to the region bounded by regionStart (inclusive) 927 * and regionEnd (exclusive). 928 * 929 * @param regexp The compiled regular expression. 930 * @param status A pointer to a UErrorCode to receive any errors. 931 * @return The ending point (native) of this matcher's region. 932 * @stable ICU 4.0 933 */ 934 U_STABLE int32_t U_EXPORT2 935 uregex_regionEnd(const URegularExpression *regexp, 936 UErrorCode *status); 937 938 /** 939 * 64bit version of uregex_regionEnd. 940 * Reports the end index (exclusive) of the matching region for this URegularExpression.
941 * Any matches found are limited to the region bounded by regionStart (inclusive) 942 * and regionEnd (exclusive). 943 * 944 * @param regexp The compiled regular expression. 945 * @param status A pointer to a UErrorCode to receive any errors. 946 * @return The ending point (native) of this matcher's region. 947 * @stable ICU 4.6 948 */ 949 U_STABLE int64_t U_EXPORT2 950 uregex_regionEnd64(const URegularExpression *regexp, 951 UErrorCode *status); 952 953 /** 954 * Queries the transparency of region bounds for this URegularExpression. 955 * See uregex_useTransparentBounds() for a description of transparent and opaque bounds. 956 * By default, matching boundaries are opaque. 957 * 958 * @param regexp The compiled regular expression. 959 * @param status A pointer to a UErrorCode to receive any errors. 960 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. 961 * @stable ICU 4.0 962 */ 963 U_STABLE UBool U_EXPORT2 964 uregex_hasTransparentBounds(const URegularExpression *regexp, 965 UErrorCode *status); 966 967 968 /** 969 * Sets the transparency of region bounds for this URegularExpression. 970 * Invoking this function with an argument of TRUE will set matches to use transparent bounds. 971 * If the boolean argument is FALSE, then opaque bounds will be used. 972 * 973 * Using transparent bounds, the boundaries of the matching region are transparent 974 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 975 * see text beyond the boundaries of the region while checking for a match. 976 * 977 * With opaque bounds, no text outside of the matching region is visible to lookahead, 978 * lookbehind, and boundary matching constructs. 979 * 980 * By default, opaque bounds are used. 981 * 982 * @param regexp The compiled regular expression. 983 * @param b TRUE for transparent bounds; FALSE for opaque bounds 984 * @param status A pointer to a UErrorCode to receive any errors.
985 * @stable ICU 4.0 986 **/ 987 U_STABLE void U_EXPORT2 988 uregex_useTransparentBounds(URegularExpression *regexp, 989 UBool b, 990 UErrorCode *status); 991 992 993 /** 994 * Return TRUE if this URegularExpression is using anchoring bounds. 995 * By default, anchoring region bounds are used. 996 * 997 * @param regexp The compiled regular expression. 998 * @param status A pointer to a UErrorCode to receive any errors. 999 * @return TRUE if this matcher is using anchoring bounds. 1000 * @stable ICU 4.0 1001 */ 1002 U_STABLE UBool U_EXPORT2 1003 uregex_hasAnchoringBounds(const URegularExpression *regexp, 1004 UErrorCode *status); 1005 1006 1007 /** 1008 * Set whether this URegularExpression is using Anchoring Bounds for its region. 1009 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 1010 * and end of the region. Without Anchoring Bounds, anchors will only match at 1011 * the positions they would in the complete text. 1012 * 1013 * Anchoring Bounds are the default for regions. 1014 * 1015 * @param regexp The compiled regular expression. 1016 * @param b TRUE to enable anchoring bounds; FALSE to disable them. 1017 * @param status A pointer to a UErrorCode to receive any errors. 1018 * @stable ICU 4.0 1019 */ 1020 U_STABLE void U_EXPORT2 1021 uregex_useAnchoringBounds(URegularExpression *regexp, 1022 UBool b, 1023 UErrorCode *status); 1024 1025 /** 1026 * Return TRUE if the most recent matching operation touched the 1027 * end of the text being processed. In this case, additional input text could 1028 * change the results of that match. 1029 * 1030 * @param regexp The compiled regular expression. 1031 * @param status A pointer to a UErrorCode to receive any errors.
1032 * @return TRUE if the most recent match hit the end of input 1033 * @stable ICU 4.0 1034 */ 1035 U_STABLE UBool U_EXPORT2 1036 uregex_hitEnd(const URegularExpression *regexp, 1037 UErrorCode *status); 1038 1039 /** 1040 * Return TRUE if the most recent match succeeded and additional input could cause 1041 * it to fail. If this function returns FALSE and a match was found, then more input 1042 * might change the match but the match won't be lost. If a match was not found, 1043 * then uregex_requireEnd() has no meaning. 1044 * 1045 * @param regexp The compiled regular expression. 1046 * @param status A pointer to a UErrorCode to receive any errors. 1047 * @return TRUE if more input could cause the most recent match to no longer match. 1048 * @stable ICU 4.0 1049 */ 1050 U_STABLE UBool U_EXPORT2 1051 uregex_requireEnd(const URegularExpression *regexp, 1052 UErrorCode *status); 1053 1054 1055 1056 1057 1058 /** 1059 * Replaces every substring of the input that matches the pattern 1060 * with the given replacement string. This is a convenience function that 1061 * provides a complete find-and-replace-all operation. 1062 * 1063 * This method scans the input string looking for matches of the pattern. 1064 * Input that is not part of any match is copied unchanged to the 1065 * destination buffer. Matched regions are replaced in the output 1066 * buffer by the replacement string. The replacement string may contain 1067 * references to capture groups; these take the form of $1, $2, etc. 1068 * 1069 * @param regexp The compiled regular expression. 1070 * @param replacementText A string containing the replacement text. 1071 * @param replacementLength The length of the replacement string, or 1072 * -1 if it is NUL terminated. 1073 * @param destBuf A (UChar *) buffer that will receive the result. 1074 * @param destCapacity The capacity of the destination buffer. 1075 * @param status A reference to a UErrorCode to receive any errors.
1076 * @return The length of the string resulting from the find 1077 * and replace operation. In the event that the 1078 * destination capacity is inadequate, the return value 1079 * is still the full length of the untruncated string. 1080 * @stable ICU 3.0 1081 */ 1082 U_STABLE int32_t U_EXPORT2 1083 uregex_replaceAll(URegularExpression *regexp, 1084 const UChar *replacementText, 1085 int32_t replacementLength, 1086 UChar *destBuf, 1087 int32_t destCapacity, 1088 UErrorCode *status); 1089 1090 /** 1091 * Replaces every substring of the input that matches the pattern 1092 * with the given replacement string. This is a convenience function that 1093 * provides a complete find-and-replace-all operation. 1094 * 1095 * This method scans the input string looking for matches of the pattern. 1096 * Input that is not part of any match is copied unchanged to the 1097 * destination buffer. Matched regions are replaced in the output 1098 * buffer by the replacement string. The replacement string may contain 1099 * references to capture groups; these take the form of $1, $2, etc. 1100 * 1101 * @param regexp The compiled regular expression. 1102 * @param replacement A string containing the replacement text. 1103 * @param dest A mutable UText that will receive the result. 1104 * If NULL, a new UText will be created (which may not be mutable). 1105 * @param status A reference to a UErrorCode to receive any errors. 1106 * @return A UText containing the results of the find and replace. 1107 * If a pre-allocated UText was provided, it will always be used and returned. 1108 * 1109 * @stable ICU 4.6 1110 */ 1111 U_STABLE UText * U_EXPORT2 1112 uregex_replaceAllUText(URegularExpression *regexp, 1113 UText *replacement, 1114 UText *dest, 1115 UErrorCode *status); 1116 1117 /** 1118 * Replaces the first substring of the input that matches the pattern 1119 * with the given replacement string. This is a convenience function that 1120 * provides a complete find-and-replace operation.
1121 * 1122 * This method scans the input string looking for a match of the pattern. 1123 * All input that is not part of the match is copied unchanged to the 1124 * destination buffer. The matched region is replaced in the output 1125 * buffer by the replacement string. The replacement string may contain 1126 * references to capture groups; these take the form of $1, $2, etc. 1127 * 1128 * @param regexp The compiled regular expression. 1129 * @param replacementText A string containing the replacement text. 1130 * @param replacementLength The length of the replacement string, or 1131 * -1 if it is NUL terminated. 1132 * @param destBuf A (UChar *) buffer that will receive the result. 1133 * @param destCapacity The capacity of the destination buffer. 1134 * @param status A reference to a UErrorCode to receive any errors. 1135 * @return The length of the string resulting from the find 1136 * and replace operation. In the event that the 1137 * destination capacity is inadequate, the return value 1138 * is still the full length of the untruncated string. 1139 * @stable ICU 3.0 1140 */ 1141 U_STABLE int32_t U_EXPORT2 1142 uregex_replaceFirst(URegularExpression *regexp, 1143 const UChar *replacementText, 1144 int32_t replacementLength, 1145 UChar *destBuf, 1146 int32_t destCapacity, 1147 UErrorCode *status); 1148 1149 /** 1150 * Replaces the first substring of the input that matches the pattern 1151 * with the given replacement string. This is a convenience function that 1152 * provides a complete find-and-replace operation. 1153 * 1154 * This method scans the input string looking for a match of the pattern. 1155 * All input that is not part of the match is copied unchanged to the 1156 * destination buffer. The matched region is replaced in the output 1157 * buffer by the replacement string. The replacement string may contain 1158 * references to capture groups; these take the form of $1, $2, etc. 1159 * 1160 * @param regexp The compiled regular expression.
1161 * @param replacement A string containing the replacement text. 1162 * @param dest A mutable UText that will receive the result. 1163 * If NULL, a new UText will be created (which may not be mutable). 1164 * @param status A reference to a UErrorCode to receive any errors. 1165 * @return A UText containing the results of the find and replace. 1166 * If a pre-allocated UText was provided, it will always be used and returned. 1167 * 1168 * @stable ICU 4.6 1169 */ 1170 U_STABLE UText * U_EXPORT2 1171 uregex_replaceFirstUText(URegularExpression *regexp, 1172 UText *replacement, 1173 UText *dest, 1174 UErrorCode *status); 1175 1176 /** 1177 * Implements a replace operation intended to be used as part of an 1178 * incremental find-and-replace. 1179 * 1180 * <p>The input string, starting from the end of the previous match and ending at 1181 * the start of the current match, is appended to the destination string. Then the 1182 * replacement string is appended to the output string, 1183 * including handling any substitutions of captured text.</p> 1184 * 1185 * <p>A note on preflight computation of buffersize and error handling: 1186 * Calls to uregex_appendReplacement() and uregex_appendTail() are 1187 * designed to be chained, one after another, with the destination 1188 * buffer pointer and buffer capacity updated after each in preparation 1189 * for the next. If the destination buffer is exhausted partway through such a 1190 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 1191 * ICU conventions are for a function to perform no action if it is 1192 * called with an error status, but for this one case, uregex_appendReplacement() 1193 * will operate normally so that buffer size computations will complete 1194 * correctly.</p> 1195 * 1196 * <p>For simple, prepackaged, non-incremental find-and-replace 1197 * operations, see uregex_replaceFirst() or uregex_replaceAll().</p> 1198 * 1199 * @param regexp The regular expression object.
1200 * @param replacementText The string that will replace the matched portion of the 1201 * input string as it is copied to the destination buffer. 1202 * The replacement text may contain references ($1, for 1203 * example) to capture groups from the match. 1204 * @param replacementLength The length of the replacement text string, 1205 * or -1 if the string is NUL terminated. 1206 * @param destBuf The buffer into which the results of the 1207 * find-and-replace are placed. On return, this pointer 1208 * will be updated to refer to the beginning of the 1209 * unused portion of buffer, leaving it in position for 1210 * a subsequent call to this function. 1211 * @param destCapacity The size of the output buffer. On return, this 1212 * parameter will be updated to reflect the space remaining 1213 * unused in the output buffer. 1214 * @param status A reference to a UErrorCode to receive any errors. 1215 * @return The length of the result string. In the event that 1216 * destCapacity is inadequate, the full length of the 1217 * untruncated output string is returned. 1218 * 1219 * @stable ICU 3.0 1220 * 1221 */ 1222 U_STABLE int32_t U_EXPORT2 1223 uregex_appendReplacement(URegularExpression *regexp, 1224 const UChar *replacementText, 1225 int32_t replacementLength, 1226 UChar **destBuf, 1227 int32_t *destCapacity, 1228 UErrorCode *status); 1229 1230 /** 1231 * Implements a replace operation intended to be used as part of an 1232 * incremental find-and-replace. 1233 * 1234 * <p>The input string, starting from the end of the previous match and ending at 1235 * the start of the current match, is appended to the destination string. Then the 1236 * replacement string is appended to the output string, 1237 * including handling any substitutions of captured text.</p> 1238 * 1239 * <p>For simple, prepackaged, non-incremental find-and-replace 1240 * operations, see replaceFirst() or replaceAll().</p> 1241 * 1242 * @param regexp The regular expression object.
1243 * @param replacementText The string that will replace the matched portion of the 1244 * input string as it is copied to the destination buffer. 1245 * The replacement text may contain references ($1, for 1246 * example) to capture groups from the match. 1247 * @param dest A mutable UText that will receive the result. Must not be NULL. 1248 * @param status A reference to a UErrorCode to receive any errors. 1249 * 1250 * @stable ICU 4.6 1251 */ 1252 U_STABLE void U_EXPORT2 1253 uregex_appendReplacementUText(URegularExpression *regexp, 1254 UText *replacementText, 1255 UText *dest, 1256 UErrorCode *status); 1257 1258 /** 1259 * As the final step in a find-and-replace operation, append the remainder 1260 * of the input string, starting at the position following the last match, 1261 * to the destination string. <code>uregex_appendTail()</code> is intended 1262 * to be invoked after one or more invocations of the 1263 * <code>uregex_appendReplacement()</code> function. 1264 * 1265 * @param regexp The regular expression object. This is needed to 1266 * obtain the input string along with the position 1267 * of the last match within it. 1268 * @param destBuf The buffer in which the results of the 1269 * find-and-replace are placed. On return, the pointer 1270 * will be updated to refer to the beginning of the 1271 * unused portion of buffer. 1272 * @param destCapacity The size of the output buffer. On return, this 1273 * value will be updated to reflect the space remaining 1274 * unused in the output buffer. 1275 * @param status A reference to a UErrorCode to receive any errors. 1276 * @return The length of the result string. In the event that 1277 * destCapacity is inadequate, the full length of the 1278 * untruncated output string is returned.
1279 * 1280 * @stable ICU 3.0 1281 */ 1282 U_STABLE int32_t U_EXPORT2 1283 uregex_appendTail(URegularExpression *regexp, 1284 UChar **destBuf, 1285 int32_t *destCapacity, 1286 UErrorCode *status); 1287 1288 /** 1289 * As the final step in a find-and-replace operation, append the remainder 1290 * of the input string, starting at the position following the last match, 1291 * to the destination string. <code>uregex_appendTailUText()</code> is intended 1292 * to be invoked after one or more invocations of the 1293 * <code>uregex_appendReplacementUText()</code> function. 1294 * 1295 * @param regexp The regular expression object. This is needed to 1296 * obtain the input string along with the position 1297 * of the last match within it. 1298 * @param dest A mutable UText that will receive the result. Must not be NULL. 1299 * 1300 * @param status Error code 1301 * 1302 * @return The destination UText. 1303 * 1304 * @stable ICU 4.6 1305 */ 1306 U_STABLE UText * U_EXPORT2 1307 uregex_appendTailUText(URegularExpression *regexp, 1308 UText *dest, 1309 UErrorCode *status); 1310 1311 /** 1312 * Split a string into fields. Somewhat like split() from Perl. 1313 * The pattern matches identify delimiters that separate the input 1314 * into fields. The input data between the matches becomes the 1315 * fields themselves. 1316 * 1317 * Each of the fields is copied from the input string to the destination 1318 * buffer, and NUL terminated. The position of each field within 1319 * the destination buffer is returned in the destFields array. 1320 * 1321 * If the delimiter pattern includes capture groups, the captured text will 1322 * also appear in the destination array of output strings, interspersed 1323 * with the fields. This is similar to Perl, but differs from Java, 1324 * which ignores the presence of capture groups in the pattern. 1325 * 1326 * Trailing empty fields will always be returned, assuming sufficient 1327 * destination capacity.
This differs from the default behavior for Java 1328 * and Perl where trailing empty fields are not returned. 1329 * 1330 * The number of strings produced by the split operation is returned. 1331 * This count includes the strings from capture groups in the delimiter pattern. 1332 * This behavior differs from Java, which ignores capture groups. 1333 * 1334 * @param regexp The compiled regular expression. 1335 * @param destBuf A (UChar *) buffer to receive the fields that 1336 * are extracted from the input string. These 1337 * field pointers will refer to positions within the 1338 * destination buffer supplied by the caller. Any 1339 * extra positions within the destFields array will be 1340 * set to NULL. 1341 * @param destCapacity The capacity of the destBuf. 1342 * @param requiredCapacity The actual capacity required of the destBuf. 1343 * If destCapacity is too small, requiredCapacity will return 1344 * the total capacity required to hold all of the output, and 1345 * a U_BUFFER_OVERFLOW_ERROR will be returned. 1346 * @param destFields An array to be filled with the position of each 1347 * of the extracted fields within destBuf. 1348 * @param destFieldsCapacity The number of elements in the destFields array. 1349 * If the number of fields found is less than destFieldsCapacity, 1350 * the extra destFields elements are set to zero. 1351 * If destFieldsCapacity is too small, the trailing part of the 1352 * input, including any field delimiters, is treated as if it 1353 * were the last field - it is copied to the destBuf, and 1354 * its position in the destBuf is stored in the last element 1355 * of destFields. This behavior mimics that of Perl. It is not 1356 * an error condition, and no error status is returned when all destField 1357 * positions are used. 1358 * @param status A reference to a UErrorCode to receive any errors. 1359 * @return The number of fields into which the input string was split.
1360 * @stable ICU 3.0 1361 */ 1362 U_STABLE int32_t U_EXPORT2 1363 uregex_split( URegularExpression *regexp, 1364 UChar *destBuf, 1365 int32_t destCapacity, 1366 int32_t *requiredCapacity, 1367 UChar *destFields[], 1368 int32_t destFieldsCapacity, 1369 UErrorCode *status); 1370 1371 /** 1372 * Split a string into fields. Somewhat like split() from Perl. 1373 * The pattern matches identify delimiters that separate the input 1374 * into fields. The input data between the matches becomes the 1375 * fields themselves. 1376 * <p> 1377 * The behavior of this function is not very closely aligned with uregex_split(); 1378 * instead, it is based on (and implemented directly on top of) the C++ split method. 1379 * 1380 * @param regexp The compiled regular expression. 1381 * @param destFields An array of mutable UText structs to receive the results of the split. 1382 * If a field is NULL, a new UText is allocated to contain the results for 1383 * that field. This new UText is not guaranteed to be mutable. 1384 * @param destFieldsCapacity The number of elements in the destination array. 1385 * If the number of fields found is less than destFieldsCapacity, the 1386 * extra strings in the destination array are not altered. 1387 * If the number of destination strings is less than the number 1388 * of fields, the trailing part of the input string, including any 1389 * field delimiters, is placed in the last destination string. 1390 * This behavior mimics that of Perl. It is not an error condition, and no 1391 * error status is returned when all destField positions are used. 1392 * @param status A reference to a UErrorCode to receive any errors. 1393 * @return The number of fields into which the input string was split.
1394 * 1395 * @stable ICU 4.6 1396 */ 1397 U_STABLE int32_t U_EXPORT2 1398 uregex_splitUText(URegularExpression *regexp, 1399 UText *destFields[], 1400 int32_t destFieldsCapacity, 1401 UErrorCode *status); 1402 1403 /** 1404 * Set a processing time limit for match operations with this URegularExpression. 1405 * 1406 * Some patterns, when matching certain strings, can run in exponential time. 1407 * For practical purposes, the match operation may appear to be in an 1408 * infinite loop. 1409 * When a limit is set a match operation will fail with an error if the 1410 * limit is exceeded. 1411 * <p> 1412 * The units of the limit are steps of the match engine. 1413 * Correspondence with actual processor time will depend on the speed 1414 * of the processor and the details of the specific pattern, but will 1415 * typically be on the order of milliseconds. 1416 * <p> 1417 * By default, the matching time is not limited. 1418 * <p> 1419 * 1420 * @param regexp The compiled regular expression. 1421 * @param limit The limit value, or 0 for no limit. 1422 * @param status A reference to a UErrorCode to receive any errors. 1423 * @stable ICU 4.0 1424 */ 1425 U_STABLE void U_EXPORT2 1426 uregex_setTimeLimit(URegularExpression *regexp, 1427 int32_t limit, 1428 UErrorCode *status); 1429 1430 /** 1431 * Get the time limit for matches with this URegularExpression. 1432 * A return value of zero indicates that there is no limit. 1433 * 1434 * @param regexp The compiled regular expression. 1435 * @param status A reference to a UErrorCode to receive any errors. 1436 * @return the maximum allowed time for a match, in units of processing steps. 1437 * @stable ICU 4.0 1438 */ 1439 U_STABLE int32_t U_EXPORT2 1440 uregex_getTimeLimit(const URegularExpression *regexp, 1441 UErrorCode *status); 1442 1443 /** 1444 * Set the amount of heap storage available for use by the match backtracking stack.
1445 * <p> 1446 * ICU uses a backtracking regular expression engine, with the backtrack stack 1447 * maintained on the heap. This function sets the limit to the amount of memory 1448 * that can be used for this purpose. A backtracking stack overflow will 1449 * result in an error from the match operation that caused it. 1450 * <p> 1451 * A limit is desirable because a malicious or poorly designed pattern can use 1452 * excessive memory, potentially crashing the process. A limit is enabled 1453 * by default. 1454 * <p> 1455 * @param regexp The compiled regular expression. 1456 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1457 * A value of zero means no limit. 1458 * The limit must be greater than or equal to zero. 1459 * @param status A reference to a UErrorCode to receive any errors. 1460 * 1461 * @stable ICU 4.0 1462 */ 1463 U_STABLE void U_EXPORT2 1464 uregex_setStackLimit(URegularExpression *regexp, 1465 int32_t limit, 1466 UErrorCode *status); 1467 1468 /** 1469 * Get the size of the heap storage available for use by the backtracking stack. 1470 * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. 1471 * @return the maximum backtracking stack size, in bytes, or zero if the 1472 * stack size is unlimited. 1473 * @stable ICU 4.0 1474 */ 1475 U_STABLE int32_t U_EXPORT2 1476 uregex_getStackLimit(const URegularExpression *regexp, 1477 UErrorCode *status); 1478 1479 1480 /** 1481 * Function pointer for a regular expression matching callback function. 1482 * When set, a callback function will be called periodically during matching 1483 * operations. If the callback function returns FALSE, the matching 1484 * operation will be terminated early. 1485 * 1486 * Note: the callback function must not call other functions on this 1487 * URegularExpression. 1488 * 1489 * @param context context pointer. The callback function will be invoked 1490 * with the context specified at the time that 1491 * uregex_setMatchCallback() is called.
1492 * @param steps the accumulated processing time, in match steps, 1493 * for this matching operation. 1494 * @return TRUE to continue the matching operation. 1495 * FALSE to terminate the matching operation. 1496 * @stable ICU 4.0 1497 */ 1498 U_CDECL_BEGIN 1499 typedef UBool U_CALLCONV URegexMatchCallback ( 1500 const void *context, 1501 int32_t steps); 1502 U_CDECL_END 1503 1504 /** 1505 * Set a callback function for this URegularExpression. 1506 * During matching operations the function will be called periodically, 1507 * giving the application the opportunity to terminate a long-running 1508 * match. 1509 * 1510 * @param regexp The compiled regular expression. 1511 * @param callback A pointer to the user-supplied callback function. 1512 * @param context User context pointer. The value supplied at the 1513 * time the callback function is set will be saved 1514 * and passed to the callback each time that it is called. 1515 * @param status A reference to a UErrorCode to receive any errors. 1516 * @stable ICU 4.0 1517 */ 1518 U_STABLE void U_EXPORT2 1519 uregex_setMatchCallback(URegularExpression *regexp, 1520 URegexMatchCallback *callback, 1521 const void *context, 1522 UErrorCode *status); 1523 1524 1525 /** 1526 * Get the callback function for this URegularExpression. 1527 * 1528 * @param regexp The compiled regular expression. 1529 * @param callback Out parameter, receives a pointer to the user-supplied 1530 * callback function. 1531 * @param context Out parameter, receives the user context pointer that 1532 * was set when uregex_setMatchCallback() was called. 1533 * @param status A reference to a UErrorCode to receive any errors. 1534 * @stable ICU 4.0 1535 */ 1536 U_STABLE void U_EXPORT2 1537 uregex_getMatchCallback(const URegularExpression *regexp, 1538 URegexMatchCallback **callback, 1539 const void **context, 1540 UErrorCode *status); 1541 1542 /** 1543 * Function pointer for a regular expression find callback function.
1544 * 1545 * When set, a callback function will be called during a find operation 1546 * and for operations that depend on find, such as findNext, split and some replace 1547 * operations like replaceFirst. 1548 * The callback will usually be called after each attempt at a match, but this is not a 1549 * guarantee that the callback will be invoked at each character. For finds where the 1550 * match engine is invoked at each character, this may be close to true, but less likely 1551 * for more optimized loops where the pattern is known to only start, and the match 1552 * engine invoked, at certain characters. 1553 * When invoked, this callback will specify the index at which a match operation is about 1554 * to be attempted, giving the application the opportunity to terminate a long-running 1555 * find operation. 1556 * 1557 * If the callback function returns FALSE, the find operation will be terminated early. 1558 * 1559 * Note: the callback function must not call other functions on this 1560 * URegularExpression. 1561 * 1562 * @param context context pointer. The callback function will be invoked 1563 * with the context specified at the time that 1564 * uregex_setFindProgressCallback() is called. 1565 * @param matchIndex the next index at which a match will be attempted for this 1566 * find operation. If this callback interrupts the search, this is the 1567 * index at which a find/findNext operation may be re-initiated. 1568 * @return TRUE to continue the matching operation. 1569 * FALSE to terminate the matching operation. 1570 * @stable ICU 4.6 1571 */ 1572 U_CDECL_BEGIN 1573 typedef UBool U_CALLCONV URegexFindProgressCallback ( 1574 const void *context, 1575 int64_t matchIndex); 1576 U_CDECL_END 1577 1578 1579 /** 1580 * Set the find progress callback function for this URegularExpression. 1581 * 1582 * @param regexp The compiled regular expression. 1583 * @param callback A pointer to the user-supplied callback function.
1584 * @param context User context pointer. The value supplied at the 1585 * time the callback function is set will be saved 1586 * and passed to the callback each time that it is called. 1587 * @param status A reference to a UErrorCode to receive any errors. 1588 * @stable ICU 4.6 1589 */ 1590 U_STABLE void U_EXPORT2 1591 uregex_setFindProgressCallback(URegularExpression *regexp, 1592 URegexFindProgressCallback *callback, 1593 const void *context, 1594 UErrorCode *status); 1595 1596 /** 1597 * Get the find progress callback function for this URegularExpression. 1598 * 1599 * @param regexp The compiled regular expression. 1600 * @param callback Out parameter, receives a pointer to the user-supplied 1601 * callback function. 1602 * @param context Out parameter, receives the user context pointer that 1603 * was set when uregex_setFindProgressCallback() was called. 1604 * @param status A reference to a UErrorCode to receive any errors. 1605 * @stable ICU 4.6 1606 */ 1607 U_STABLE void U_EXPORT2 1608 uregex_getFindProgressCallback(const URegularExpression *regexp, 1609 URegexFindProgressCallback **callback, 1610 const void **context, 1611 UErrorCode *status); 1612 1613 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1614 #endif /* UREGEX_H */ 1615