1 /* 2 ********************************************************************** 3 * Copyright (C) 2001-2011,2014 IBM and others. All rights reserved. 4 ********************************************************************** 5 * Date Name Description 6 * 06/28/2001 synwee Creation. 7 ********************************************************************** 8 */ 9 #ifndef USEARCH_H 10 #define USEARCH_H 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 15 16 #include "unicode/localpointer.h" 17 #include "unicode/ucol.h" 18 #include "unicode/ucoleitr.h" 19 #include "unicode/ubrk.h" 20 21 /** 22 * \file 23 * \brief C API: StringSearch 24 * 25 * C Apis for an engine that provides language-sensitive text searching based 26 * on the comparison rules defined in a <tt>UCollator</tt> data struct, 27 * see <tt>ucol.h</tt>. This ensures that language eccentricity can be 28 * handled, e.g. for the German collator, characters ß and SS will be matched 29 * if case is chosen to be ignored. 30 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 31 * "ICU Collation Design Document"</a> for more information. 32 * <p> 33 * The implementation may use a linear search or a modified form of the Boyer-Moore 34 * search; for more information on the latter see 35 * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html"> 36 * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i> 37 * in February, 1999. 38 * <p> 39 * There are 2 match options for selection:<br> 40 * Let S' be the sub-string of a text string S between the offsets start and 41 * end <start, end>. 42 * <br> 43 * A pattern string P matches a text string S at the offsets <start, end> 44 * if 45 * <pre> 46 * option 1. Some canonical equivalent of P matches some canonical equivalent 47 * of S' 48 * option 2. P matches S' and if P starts or ends with a combining mark, 49 * there exists no non-ignorable combining mark before or after S' 50 * in S respectively. 51 * </pre> 52 * Option 2. will be the default. 53 * <p> 54 * This search has APIs similar to that of other text iteration mechanisms 55 * such as the break iterators in <tt>ubrk.h</tt>. Using these 56 * APIs, it is easy to scan through text looking for all occurances of 57 * a given pattern. This search iterator allows changing of direction by 58 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. 59 * Though a direction change can occur without calling <tt>reset</tt> first, 60 * this operation comes with some speed penalty. 61 * Generally, match results in the forward direction will match the result 62 * matches in the backwards direction in the reverse order 63 * <p> 64 * <tt>usearch.h</tt> provides APIs to specify the starting position 65 * within the text string to be searched, e.g. <tt>usearch_setOffset</tt>, 66 * <tt>usearch_preceding</tt> and <tt>usearch_following</tt>. Since the 67 * starting position will be set as it is specified, please take note that 68 * there are some dangerous positions which the search may render incorrect 69 * results: 70 * <ul> 71 * <li> The midst of a substring that requires normalization. 72 * <li> If the following match is to be found, the position should not be the 73 * second character which requires to be swapped with the preceding 74 * character. Vice versa, if the preceding match is to be found, 75 * position to search from should not be the first character which 76 * requires to be swapped with the next character. E.g certain Thai and 77 * Lao characters require swapping. 78 * <li> If a following pattern match is to be found, any position within a 79 * contracting sequence except the first will fail. Vice versa if a 80 * preceding pattern match is to be found, a invalid starting point 81 * would be any character within a contracting sequence except the last. 82 * </ul> 83 * <p> 84 * A breakiterator can be used if only matches at logical breaks are desired. 85 * Using a breakiterator will only give you results that exactly matches the 86 * boundaries given by the breakiterator. For instance the pattern "e" will 87 * not be found in the string "\u00e9" if a character break iterator is used. 88 * <p> 89 * Options are provided to handle overlapping matches. 90 * E.g. In English, overlapping matches produces the result 0 and 2 91 * for the pattern "abab" in the text "ababab", where else mutually 92 * exclusive matches only produce the result of 0. 93 * <p> 94 * Options are also provided to implement "asymmetric search" as described in 95 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 96 * UTS #10 Unicode Collation Algorithm</a>, specifically the USearchAttribute 97 * USEARCH_ELEMENT_COMPARISON and its values. 98 * <p> 99 * Though collator attributes will be taken into consideration while 100 * performing matches, there are no APIs here for setting and getting the 101 * attributes. These attributes can be set by getting the collator 102 * from <tt>usearch_getCollator</tt> and using the APIs in <tt>ucol.h</tt>. 103 * Lastly to update String Search to the new collator attributes, 104 * usearch_reset() has to be called. 105 * <p> 106 * Restriction: <br> 107 * Currently there are no composite characters that consists of a 108 * character with combining class > 0 before a character with combining 109 * class == 0. However, if such a character exists in the future, the 110 * search mechanism does not guarantee the results for option 1. 111 * 112 * <p> 113 * Example of use:<br> 114 * <pre><code> 115 * char *tgtstr = "The quick brown fox jumped over the lazy fox"; 116 * char *patstr = "fox"; 117 * UChar target[64]; 118 * UChar pattern[16]; 119 * UErrorCode status = U_ZERO_ERROR; 120 * u_uastrcpy(target, tgtstr); 121 * u_uastrcpy(pattern, patstr); 122 * 123 * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US", 124 * NULL, &status); 125 * if (U_SUCCESS(status)) { 126 * for (int pos = usearch_first(search, &status); 127 * pos != USEARCH_DONE; 128 * pos = usearch_next(search, &status)) 129 * { 130 * printf("Found match at %d pos, length is %d\n", pos, 131 * usearch_getMatchLength(search)); 132 * } 133 * } 134 * 135 * usearch_close(search); 136 * </code></pre> 137 * @stable ICU 2.4 138 */ 139 140 /** 141 * DONE is returned by previous() and next() after all valid matches have 142 * been returned, and by first() and last() if there are no matches at all. 143 * @stable ICU 2.4 144 */ 145 #define USEARCH_DONE -1 146 147 /** 148 * Data structure for searching 149 * @stable ICU 2.4 150 */ 151 struct UStringSearch; 152 /** 153 * Data structure for searching 154 * @stable ICU 2.4 155 */ 156 typedef struct UStringSearch UStringSearch; 157 158 /** 159 * @stable ICU 2.4 160 */ 161 typedef enum { 162 /** 163 * Option for overlapping matches 164 * @stable ICU 2.4 165 */ 166 USEARCH_OVERLAP = 0, 167 #ifndef U_HIDE_DEPRECATED_API 168 /** 169 * Option for canonical matches; option 1 in header documentation. 170 * The default value will be USEARCH_OFF. 171 * Note: Setting this option to USEARCH_ON currently has no effect on 172 * search behavior, and this option is deprecated. Instead, to control 173 * canonical match behavior, you must set UCOL_NORMALIZATION_MODE 174 * appropriately (to UCOL_OFF or UCOL_ON) in the UCollator used by 175 * the UStringSearch object. 176 * @see usearch_openFromCollator 177 * @see usearch_getCollator 178 * @see usearch_setCollator 179 * @see ucol_getAttribute 180 * @deprecated ICU 53 181 */ 182 USEARCH_CANONICAL_MATCH = 1, 183 #endif /* U_HIDE_DEPRECATED_API */ 184 /** 185 * Option to control how collation elements are compared. 186 * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. 187 * @stable ICU 4.4 188 */ 189 USEARCH_ELEMENT_COMPARISON = 2, 190 191 /** 192 * Count of attribute types 193 * @stable ICU 2.4 194 */ 195 USEARCH_ATTRIBUTE_COUNT = 3 196 } USearchAttribute; 197 198 /** 199 * @stable ICU 2.4 200 */ 201 typedef enum { 202 /** 203 * Default value for any USearchAttribute 204 * @stable ICU 2.4 205 */ 206 USEARCH_DEFAULT = -1, 207 /** 208 * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH 209 * @stable ICU 2.4 210 */ 211 USEARCH_OFF, 212 /** 213 * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH 214 * @stable ICU 2.4 215 */ 216 USEARCH_ON, 217 /** 218 * Value (default) for USEARCH_ELEMENT_COMPARISON; 219 * standard collation element comparison at the specified collator 220 * strength. 221 * @stable ICU 4.4 222 */ 223 USEARCH_STANDARD_ELEMENT_COMPARISON, 224 /** 225 * Value for USEARCH_ELEMENT_COMPARISON; 226 * collation element comparison is modified to effectively provide 227 * behavior between the specified strength and strength - 1. Collation 228 * elements in the pattern that have the base weight for the specified 229 * strength are treated as "wildcards" that match an element with any 230 * other weight at that collation level in the searched text. For 231 * example, with a secondary-strength English collator, a plain 'e' in 232 * the pattern will match a plain e or an e with any diacritic in the 233 * searched text, but an e with diacritic in the pattern will only 234 * match an e with the same diacritic in the searched text. 235 * 236 * This supports "asymmetric search" as described in 237 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 238 * UTS #10 Unicode Collation Algorithm</a>. 239 * 240 * @stable ICU 4.4 241 */ 242 USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, 243 /** 244 * Value for USEARCH_ELEMENT_COMPARISON. 245 * collation element comparison is modified to effectively provide 246 * behavior between the specified strength and strength - 1. Collation 247 * elements in either the pattern or the searched text that have the 248 * base weight for the specified strength are treated as "wildcards" 249 * that match an element with any other weight at that collation level. 250 * For example, with a secondary-strength English collator, a plain 'e' 251 * in the pattern will match a plain e or an e with any diacritic in the 252 * searched text, but an e with diacritic in the pattern will only 253 * match an e with the same diacritic or a plain e in the searched text. 254 * 255 * This option is similar to "asymmetric search" as described in 256 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 257 * UTS #10 Unicode Collation Algorithm</a, but also allows unmarked 258 * characters in the searched text to match marked or unmarked versions of 259 * that character in the pattern. 260 * 261 * @stable ICU 4.4 262 */ 263 USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, 264 265 /** 266 * Count of attribute values 267 * @stable ICU 2.4 268 */ 269 USEARCH_ATTRIBUTE_VALUE_COUNT 270 } USearchAttributeValue; 271 272 /* open and close ------------------------------------------------------ */ 273 274 /** 275 * Creating a search iterator data struct using the argument locale language 276 * rule set. A collator will be created in the process, which will be owned by 277 * this search and will be deleted in <tt>usearch_close</tt>. 278 * @param pattern for matching 279 * @param patternlength length of the pattern, -1 for null-termination 280 * @param text text string 281 * @param textlength length of the text string, -1 for null-termination 282 * @param locale name of locale for the rules to be used 283 * @param breakiter A BreakIterator that will be used to restrict the points 284 * at which matches are detected. If a match is found, but 285 * the match's start or end index is not a boundary as 286 * determined by the <tt>BreakIterator</tt>, the match will 287 * be rejected and another will be searched for. 288 * If this parameter is <tt>NULL</tt>, no break detection is 289 * attempted. 290 * @param status for errors if it occurs. If pattern or text is NULL, or if 291 * patternlength or textlength is 0 then an 292 * U_ILLEGAL_ARGUMENT_ERROR is returned. 293 * @return search iterator data structure, or NULL if there is an error. 294 * @stable ICU 2.4 295 */ 296 U_STABLE UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 297 int32_t patternlength, 298 const UChar *text, 299 int32_t textlength, 300 const char *locale, 301 UBreakIterator *breakiter, 302 UErrorCode *status); 303 304 /** 305 * Creating a search iterator data struct using the argument collator language 306 * rule set. Note, user retains the ownership of this collator, thus the 307 * responsibility of deletion lies with the user. 308 * NOTE: string search cannot be instantiated from a collator that has 309 * collate digits as numbers (CODAN) turned on. 310 * @param pattern for matching 311 * @param patternlength length of the pattern, -1 for null-termination 312 * @param text text string 313 * @param textlength length of the text string, -1 for null-termination 314 * @param collator used for the language rules 315 * @param breakiter A BreakIterator that will be used to restrict the points 316 * at which matches are detected. If a match is found, but 317 * the match's start or end index is not a boundary as 318 * determined by the <tt>BreakIterator</tt>, the match will 319 * be rejected and another will be searched for. 320 * If this parameter is <tt>NULL</tt>, no break detection is 321 * attempted. 322 * @param status for errors if it occurs. If collator, pattern or text is NULL, 323 * or if patternlength or textlength is 0 then an 324 * U_ILLEGAL_ARGUMENT_ERROR is returned. 325 * @return search iterator data structure, or NULL if there is an error. 326 * @stable ICU 2.4 327 */ 328 U_STABLE UStringSearch * U_EXPORT2 usearch_openFromCollator( 329 const UChar *pattern, 330 int32_t patternlength, 331 const UChar *text, 332 int32_t textlength, 333 const UCollator *collator, 334 UBreakIterator *breakiter, 335 UErrorCode *status); 336 337 /** 338 * Destroying and cleaning up the search iterator data struct. 339 * If a collator is created in <tt>usearch_open</tt>, it will be destroyed here. 340 * @param searchiter data struct to clean up 341 * @stable ICU 2.4 342 */ 343 U_STABLE void U_EXPORT2 usearch_close(UStringSearch *searchiter); 344 345 #if U_SHOW_CPLUSPLUS_API 346 347 U_NAMESPACE_BEGIN 348 349 /** 350 * \class LocalUStringSearchPointer 351 * "Smart pointer" class, closes a UStringSearch via usearch_close(). 352 * For most methods see the LocalPointerBase base class. 353 * 354 * @see LocalPointerBase 355 * @see LocalPointer 356 * @stable ICU 4.4 357 */ 358 U_DEFINE_LOCAL_OPEN_POINTER(LocalUStringSearchPointer, UStringSearch, usearch_close); 359 360 U_NAMESPACE_END 361 362 #endif 363 364 /* get and set methods -------------------------------------------------- */ 365 366 /** 367 * Sets the current position in the text string which the next search will 368 * start from. Clears previous states. 369 * This method takes the argument index and sets the position in the text 370 * string accordingly without checking if the index is pointing to a 371 * valid starting point to begin searching. 372 * Search positions that may render incorrect results are highlighted in the 373 * header comments 374 * @param strsrch search iterator data struct 375 * @param position position to start next search from. If position is less 376 * than or greater than the text range for searching, 377 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 378 * @param status error status if any. 379 * @stable ICU 2.4 380 */ 381 U_STABLE void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 382 int32_t position, 383 UErrorCode *status); 384 385 /** 386 * Return the current index in the string text being searched. 387 * If the iteration has gone past the end of the text (or past the beginning 388 * for a backwards search), <tt>USEARCH_DONE</tt> is returned. 389 * @param strsrch search iterator data struct 390 * @see #USEARCH_DONE 391 * @stable ICU 2.4 392 */ 393 U_STABLE int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch); 394 395 /** 396 * Sets the text searching attributes located in the enum USearchAttribute 397 * with values from the enum USearchAttributeValue. 398 * <tt>USEARCH_DEFAULT</tt> can be used for all attributes for resetting. 399 * @param strsrch search iterator data struct 400 * @param attribute text attribute to be set 401 * @param value text attribute value 402 * @param status for errors if it occurs 403 * @see #usearch_getAttribute 404 * @stable ICU 2.4 405 */ 406 U_STABLE void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 407 USearchAttribute attribute, 408 USearchAttributeValue value, 409 UErrorCode *status); 410 411 /** 412 * Gets the text searching attributes. 413 * @param strsrch search iterator data struct 414 * @param attribute text attribute to be retrieve 415 * @return text attribute value 416 * @see #usearch_setAttribute 417 * @stable ICU 2.4 418 */ 419 U_STABLE USearchAttributeValue U_EXPORT2 usearch_getAttribute( 420 const UStringSearch *strsrch, 421 USearchAttribute attribute); 422 423 /** 424 * Returns the index to the match in the text string that was searched. 425 * This call returns a valid result only after a successful call to 426 * <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, 427 * or <tt>usearch_last</tt>. 428 * Just after construction, or after a searching method returns 429 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. 430 * <p> 431 * Use <tt>usearch_getMatchedLength</tt> to get the matched string length. 432 * @param strsrch search iterator data struct 433 * @return index to a substring within the text string that is being 434 * searched. 435 * @see #usearch_first 436 * @see #usearch_next 437 * @see #usearch_previous 438 * @see #usearch_last 439 * @see #USEARCH_DONE 440 * @stable ICU 2.4 441 */ 442 U_STABLE int32_t U_EXPORT2 usearch_getMatchedStart( 443 const UStringSearch *strsrch); 444 445 /** 446 * Returns the length of text in the string which matches the search pattern. 447 * This call returns a valid result only after a successful call to 448 * <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, 449 * or <tt>usearch_last</tt>. 450 * Just after construction, or after a searching method returns 451 * <tt>USEARCH_DONE</tt>, this method will return 0. 452 * @param strsrch search iterator data struct 453 * @return The length of the match in the string text, or 0 if there is no 454 * match currently. 455 * @see #usearch_first 456 * @see #usearch_next 457 * @see #usearch_previous 458 * @see #usearch_last 459 * @see #USEARCH_DONE 460 * @stable ICU 2.4 461 */ 462 U_STABLE int32_t U_EXPORT2 usearch_getMatchedLength( 463 const UStringSearch *strsrch); 464 465 /** 466 * Returns the text that was matched by the most recent call to 467 * <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, 468 * or <tt>usearch_last</tt>. 469 * If the iterator is not pointing at a valid match (e.g. just after 470 * construction or after <tt>USEARCH_DONE</tt> has been returned, returns 471 * an empty string. If result is not large enough to store the matched text, 472 * result will be filled with the partial text and an U_BUFFER_OVERFLOW_ERROR 473 * will be returned in status. result will be null-terminated whenever 474 * possible. If the buffer fits the matched text exactly, a null-termination 475 * is not possible, then a U_STRING_NOT_TERMINATED_ERROR set in status. 476 * Pre-flighting can be either done with length = 0 or the API 477 * <tt>usearch_getMatchLength</tt>. 478 * @param strsrch search iterator data struct 479 * @param result UChar buffer to store the matched string 480 * @param resultCapacity length of the result buffer 481 * @param status error returned if result is not large enough 482 * @return exact length of the matched text, not counting the null-termination 483 * @see #usearch_first 484 * @see #usearch_next 485 * @see #usearch_previous 486 * @see #usearch_last 487 * @see #USEARCH_DONE 488 * @stable ICU 2.4 489 */ 490 U_STABLE int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 491 UChar *result, 492 int32_t resultCapacity, 493 UErrorCode *status); 494 495 #if !UCONFIG_NO_BREAK_ITERATION 496 497 /** 498 * Set the BreakIterator that will be used to restrict the points at which 499 * matches are detected. 500 * @param strsrch search iterator data struct 501 * @param breakiter A BreakIterator that will be used to restrict the points 502 * at which matches are detected. If a match is found, but 503 * the match's start or end index is not a boundary as 504 * determined by the <tt>BreakIterator</tt>, the match will 505 * be rejected and another will be searched for. 506 * If this parameter is <tt>NULL</tt>, no break detection is 507 * attempted. 508 * @param status for errors if it occurs 509 * @see #usearch_getBreakIterator 510 * @stable ICU 2.4 511 */ 512 U_STABLE void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 513 UBreakIterator *breakiter, 514 UErrorCode *status); 515 516 /** 517 * Returns the BreakIterator that is used to restrict the points at which 518 * matches are detected. This will be the same object that was passed to the 519 * constructor or to <tt>usearch_setBreakIterator</tt>. Note that 520 * <tt>NULL</tt> 521 * is a legal value; it means that break detection should not be attempted. 522 * @param strsrch search iterator data struct 523 * @return break iterator used 524 * @see #usearch_setBreakIterator 525 * @stable ICU 2.4 526 */ 527 U_STABLE const UBreakIterator * U_EXPORT2 usearch_getBreakIterator( 528 const UStringSearch *strsrch); 529 530 #endif 531 532 /** 533 * Set the string text to be searched. Text iteration will hence begin at the 534 * start of the text string. This method is useful if you want to re-use an 535 * iterator to search for the same pattern within a different body of text. 536 * @param strsrch search iterator data struct 537 * @param text new string to look for match 538 * @param textlength length of the new string, -1 for null-termination 539 * @param status for errors if it occurs. If text is NULL, or textlength is 0 540 * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change 541 * done to strsrch. 542 * @see #usearch_getText 543 * @stable ICU 2.4 544 */ 545 U_STABLE void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 546 const UChar *text, 547 int32_t textlength, 548 UErrorCode *status); 549 550 /** 551 * Return the string text to be searched. 552 * @param strsrch search iterator data struct 553 * @param length returned string text length 554 * @return string text 555 * @see #usearch_setText 556 * @stable ICU 2.4 557 */ 558 U_STABLE const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 559 int32_t *length); 560 561 /** 562 * Gets the collator used for the language rules. 563 * <p> 564 * Deleting the returned <tt>UCollator</tt> before calling 565 * <tt>usearch_close</tt> would cause the string search to fail. 566 * <tt>usearch_close</tt> will delete the collator if this search owns it. 567 * @param strsrch search iterator data struct 568 * @return collator 569 * @stable ICU 2.4 570 */ 571 U_STABLE UCollator * U_EXPORT2 usearch_getCollator( 572 const UStringSearch *strsrch); 573 574 /** 575 * Sets the collator used for the language rules. User retains the ownership 576 * of this collator, thus the responsibility of deletion lies with the user. 577 * This method causes internal data such as Boyer-Moore shift tables to 578 * be recalculated, but the iterator's position is unchanged. 579 * @param strsrch search iterator data struct 580 * @param collator to be used 581 * @param status for errors if it occurs 582 * @stable ICU 2.4 583 */ 584 U_STABLE void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 585 const UCollator *collator, 586 UErrorCode *status); 587 588 /** 589 * Sets the pattern used for matching. 590 * Internal data like the Boyer Moore table will be recalculated, but the 591 * iterator's position is unchanged. 592 * @param strsrch search iterator data struct 593 * @param pattern string 594 * @param patternlength pattern length, -1 for null-terminated string 595 * @param status for errors if it occurs. If text is NULL, or textlength is 0 596 * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change 597 * done to strsrch. 598 * @stable ICU 2.4 599 */ 600 U_STABLE void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 601 const UChar *pattern, 602 int32_t patternlength, 603 UErrorCode *status); 604 605 /** 606 * Gets the search pattern 607 * @param strsrch search iterator data struct 608 * @param length return length of the pattern, -1 indicates that the pattern 609 * is null-terminated 610 * @return pattern string 611 * @stable ICU 2.4 612 */ 613 U_STABLE const UChar * U_EXPORT2 usearch_getPattern( 614 const UStringSearch *strsrch, 615 int32_t *length); 616 617 /* methods ------------------------------------------------------------- */ 618 619 /** 620 * Returns the first index at which the string text matches the search 621 * pattern. 622 * The iterator is adjusted so that its current index (as returned by 623 * <tt>usearch_getOffset</tt>) is the match position if one was found. 624 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 625 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 626 * @param strsrch search iterator data struct 627 * @param status for errors if it occurs 628 * @return The character index of the first match, or 629 * <tt>USEARCH_DONE</tt> if there are no matches. 630 * @see #usearch_getOffset 631 * @see #USEARCH_DONE 632 * @stable ICU 2.4 633 */ 634 U_STABLE int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 635 UErrorCode *status); 636 637 /** 638 * Returns the first index equal or greater than <tt>position</tt> at which 639 * the string text 640 * matches the search pattern. The iterator is adjusted so that its current 641 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if 642 * one was found. 643 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 644 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> 645 * <p> 646 * Search positions that may render incorrect results are highlighted in the 647 * header comments. If position is less than or greater than the text range 648 * for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned 649 * @param strsrch search iterator data struct 650 * @param position to start the search at 651 * @param status for errors if it occurs 652 * @return The character index of the first match following <tt>pos</tt>, 653 * or <tt>USEARCH_DONE</tt> if there are no matches. 654 * @see #usearch_getOffset 655 * @see #USEARCH_DONE 656 * @stable ICU 2.4 657 */ 658 U_STABLE int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 659 int32_t position, 660 UErrorCode *status); 661 662 /** 663 * Returns the last index in the target text at which it matches the search 664 * pattern. The iterator is adjusted so that its current 665 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if 666 * one was found. 667 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 668 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 669 * @param strsrch search iterator data struct 670 * @param status for errors if it occurs 671 * @return The index of the first match, or <tt>USEARCH_DONE</tt> if there 672 * are no matches. 673 * @see #usearch_getOffset 674 * @see #USEARCH_DONE 675 * @stable ICU 2.4 676 */ 677 U_STABLE int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 678 UErrorCode *status); 679 680 /** 681 * Returns the first index less than <tt>position</tt> at which the string text 682 * matches the search pattern. The iterator is adjusted so that its current 683 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if 684 * one was found. 685 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 686 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> 687 * <p> 688 * Search positions that may render incorrect results are highlighted in the 689 * header comments. If position is less than or greater than the text range 690 * for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned. 691 * <p> 692 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the 693 * result match is always less than <tt>position</tt>. 694 * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across 695 * <tt>position</tt>. 696 * @param strsrch search iterator data struct 697 * @param position index position the search is to begin at 698 * @param status for errors if it occurs 699 * @return The character index of the first match preceding <tt>pos</tt>, 700 * or <tt>USEARCH_DONE</tt> if there are no matches. 701 * @see #usearch_getOffset 702 * @see #USEARCH_DONE 703 * @stable ICU 2.4 704 */ 705 U_STABLE int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 706 int32_t position, 707 UErrorCode *status); 708 709 /** 710 * Returns the index of the next point at which the string text matches the 711 * search pattern, starting from the current position. 712 * The iterator is adjusted so that its current 713 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if 714 * one was found. 715 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 716 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> 717 * @param strsrch search iterator data struct 718 * @param status for errors if it occurs 719 * @return The index of the next match after the current position, or 720 * <tt>USEARCH_DONE</tt> if there are no more matches. 721 * @see #usearch_first 722 * @see #usearch_getOffset 723 * @see #USEARCH_DONE 724 * @stable ICU 2.4 725 */ 726 U_STABLE int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 727 UErrorCode *status); 728 729 /** 730 * Returns the index of the previous point at which the string text matches 731 * the search pattern, starting at the current position. 732 * The iterator is adjusted so that its current 733 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if 734 * one was found. 735 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 736 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> 737 * @param strsrch search iterator data struct 738 * @param status for errors if it occurs 739 * @return The index of the previous match before the current position, 740 * or <tt>USEARCH_DONE</tt> if there are no more matches. 741 * @see #usearch_last 742 * @see #usearch_getOffset 743 * @see #USEARCH_DONE 744 * @stable ICU 2.4 745 */ 746 U_STABLE int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 747 UErrorCode *status); 748 749 /** 750 * Reset the iteration. 751 * Search will begin at the start of the text string if a forward iteration 752 * is initiated before a backwards iteration. Otherwise if a backwards 753 * iteration is initiated before a forwards iteration, the search will begin 754 * at the end of the text string. 755 * @param strsrch search iterator data struct 756 * @see #usearch_first 757 * @stable ICU 2.4 758 */ 759 U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch); 760 761 #ifndef U_HIDE_INTERNAL_API 762 /** 763 * Simple forward search for the pattern, starting at a specified index, 764 * and using using a default set search options. 765 * 766 * This is an experimental function, and is not an official part of the 767 * ICU API. 768 * 769 * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZTION, are honored. 770 * 771 * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and 772 * any Break Iterator are ignored. 773 * 774 * Matches obey the following constraints: 775 * 776 * Characters at the start or end positions of a match that are ignorable 777 * for collation are not included as part of the match, unless they 778 * are part of a combining sequence, as described below. 779 * 780 * A match will not include a partial combining sequence. Combining 781 * character sequences are considered to be inseperable units, 782 * and either match the pattern completely, or are considered to not match 783 * at all. Thus, for example, an A followed a combining accent mark will 784 * not be found when searching for a plain (unaccented) A. (unless 785 * the collation strength has been set to ignore all accents). 786 * 787 * When beginning a search, the initial starting position, startIdx, 788 * is assumed to be an acceptable match boundary with respect to 789 * combining characters. A combining sequence that spans across the 790 * starting point will not supress a match beginning at startIdx. 791 * 792 * Characters that expand to multiple collation elements 793 * (German sharp-S becoming 'ss', or the composed forms of accented 794 * characters, for example) also must match completely. 795 * Searching for a single 's' in a string containing only a sharp-s will 796 * find no match. 797 * 798 * 799 * @param strsrch the UStringSearch struct, which references both 800 * the text to be searched and the pattern being sought. 801 * @param startIdx The index into the text to begin the search. 802 * @param matchStart An out parameter, the starting index of the matched text. 803 * This parameter may be NULL. 804 * A value of -1 will be returned if no match was found. 805 * @param matchLimit Out parameter, the index of the first position following the matched text. 806 * The matchLimit will be at a suitable position for beginning a subsequent search 807 * in the input text. 808 * This parameter may be NULL. 809 * A value of -1 will be returned if no match was found. 810 * 811 * @param status Report any errors. Note that no match found is not an error. 812 * @return TRUE if a match was found, FALSE otherwise. 813 * 814 * @internal 815 */ 816 U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 817 int32_t startIdx, 818 int32_t *matchStart, 819 int32_t *matchLimit, 820 UErrorCode *status); 821 822 /** 823 * Simple backwards search for the pattern, starting at a specified index, 824 * and using using a default set search options. 825 * 826 * This is an experimental function, and is not an official part of the 827 * ICU API. 828 * 829 * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZTION, are honored. 830 * 831 * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and 832 * any Break Iterator are ignored. 833 * 834 * Matches obey the following constraints: 835 * 836 * Characters at the start or end positions of a match that are ignorable 837 * for collation are not included as part of the match, unless they 838 * are part of a combining sequence, as described below. 839 * 840 * A match will not include a partial combining sequence. Combining 841 * character sequences are considered to be inseperable units, 842 * and either match the pattern completely, or are considered to not match 843 * at all. Thus, for example, an A followed a combining accent mark will 844 * not be found when searching for a plain (unaccented) A. (unless 845 * the collation strength has been set to ignore all accents). 846 * 847 * When beginning a search, the initial starting position, startIdx, 848 * is assumed to be an acceptable match boundary with respect to 849 * combining characters. A combining sequence that spans across the 850 * starting point will not supress a match beginning at startIdx. 851 * 852 * Characters that expand to multiple collation elements 853 * (German sharp-S becoming 'ss', or the composed forms of accented 854 * characters, for example) also must match completely. 855 * Searching for a single 's' in a string containing only a sharp-s will 856 * find no match. 857 * 858 * 859 * @param strsrch the UStringSearch struct, which references both 860 * the text to be searched and the pattern being sought. 861 * @param startIdx The index into the text to begin the search. 862 * @param matchStart An out parameter, the starting index of the matched text. 863 * This parameter may be NULL. 864 * A value of -1 will be returned if no match was found. 865 * @param matchLimit Out parameter, the index of the first position following the matched text. 866 * The matchLimit will be at a suitable position for beginning a subsequent search 867 * in the input text. 868 * This parameter may be NULL. 869 * A value of -1 will be returned if no match was found. 870 * 871 * @param status Report any errors. Note that no match found is not an error. 872 * @return TRUE if a match was found, FALSE otherwise. 873 * 874 * @internal 875 */ 876 U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 877 int32_t startIdx, 878 int32_t *matchStart, 879 int32_t *matchLimit, 880 UErrorCode *status); 881 #endif /* U_HIDE_INTERNAL_API */ 882 883 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ 884 885 #endif 886