1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ****************************************************************************** 8 */ 9 10 /** 11 * \file 12 * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class. 13 */ 14 15 /** 16 * File tblcoll.h 17 * 18 * Created by: Helena Shih 19 * 20 * Modification History: 21 * 22 * Date Name Description 23 * 2/5/97 aliu Added streamIn and streamOut methods. Added 24 * constructor which reads RuleBasedCollator object from 25 * a binary file. Added writeToFile method which streams 26 * RuleBasedCollator out to a binary file. The streamIn 27 * and streamOut methods use istream and ostream objects 28 * in binary mode. 29 * 2/12/97 aliu Modified to use TableCollationData sub-object to 30 * hold invariant data. 31 * 2/13/97 aliu Moved several methods into this class from Collation. 32 * Added a private RuleBasedCollator(Locale&) constructor, 33 * to be used by Collator::createDefault(). General 34 * clean up. 35 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 36 * constructor and getDynamicClassID. 37 * 3/5/97 aliu Modified constructFromFile() to add parameter 38 * specifying whether or not binary loading is to be 39 * attempted. This is required for dynamic rule loading. 40 * 05/07/97 helena Added memory allocation error detection. 41 * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to 42 * use MergeCollation::getPattern. 43 * 6/20/97 helena Java class name change. 44 * 8/18/97 helena Added internal API documentation. 45 * 09/03/97 helena Added createCollationKeyValues(). 46 * 02/10/98 damiba Added compare with "length" parameter 47 * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java 48 * 04/23/99 stephen Removed EDecompositionMode, merged with 49 * Normalizer::EMode 50 * 06/14/99 stephen Removed kResourceBundleSuffix 51 * 11/02/99 helena Collator performance enhancements. Eliminates the 52 * UnicodeString construction and special case for NO_OP. 53 * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator 54 * internal state management. 55 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 56 * to implementation file. 57 * 01/29/01 synwee Modified into a C++ wrapper which calls C API 58 * (ucol.h) 59 * 2012-2014 markus Rewritten in C++ again. 60 */ 61 62 #ifndef TBLCOLL_H 63 #define TBLCOLL_H 64 65 #include "unicode/utypes.h" 66 67 #if !UCONFIG_NO_COLLATION 68 69 #include "unicode/coll.h" 70 #include "unicode/locid.h" 71 #include "unicode/uiter.h" 72 #include "unicode/ucol.h" 73 74 U_NAMESPACE_BEGIN 75 76 struct CollationCacheEntry; 77 struct CollationData; 78 struct CollationSettings; 79 struct CollationTailoring; 80 /** 81 * @stable ICU 2.0 82 */ 83 class StringSearch; 84 /** 85 * @stable ICU 2.0 86 */ 87 class CollationElementIterator; 88 class CollationKey; 89 class SortKeyByteSink; 90 class UnicodeSet; 91 class UnicodeString; 92 class UVector64; 93 94 /** 95 * The RuleBasedCollator class provides the implementation of 96 * Collator, using data-driven tables. The user can create a customized 97 * table-based collation. 98 * <p> 99 * For more information about the collation service see 100 * <a href="http://userguide.icu-project.org/collation">the User Guide</a>. 101 * <p> 102 * Collation service provides correct sorting orders for most locales supported in ICU. 103 * If specific data for a locale is not available, the orders eventually falls back 104 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>. 105 * <p> 106 * Sort ordering may be customized by providing your own set of rules. For more on 107 * this subject see the <a href="http://userguide.icu-project.org/collation/customization"> 108 * Collation Customization</a> section of the User Guide. 109 * <p> 110 * Note, RuleBasedCollator is not to be subclassed. 111 * @see Collator 112 */ 113 class U_I18N_API RuleBasedCollator : public Collator { 114 public: 115 /** 116 * RuleBasedCollator constructor. This takes the table rules and builds a 117 * collation table out of them. Please see RuleBasedCollator class 118 * description for more details on the collation rule syntax. 119 * @param rules the collation rules to build the collation table from. 120 * @param status reporting a success or an error. 121 * @stable ICU 2.0 122 */ 123 RuleBasedCollator(const UnicodeString& rules, UErrorCode& status); 124 125 /** 126 * RuleBasedCollator constructor. This takes the table rules and builds a 127 * collation table out of them. Please see RuleBasedCollator class 128 * description for more details on the collation rule syntax. 129 * @param rules the collation rules to build the collation table from. 130 * @param collationStrength strength for comparison 131 * @param status reporting a success or an error. 132 * @stable ICU 2.0 133 */ 134 RuleBasedCollator(const UnicodeString& rules, 135 ECollationStrength collationStrength, 136 UErrorCode& status); 137 138 /** 139 * RuleBasedCollator constructor. This takes the table rules and builds a 140 * collation table out of them. Please see RuleBasedCollator class 141 * description for more details on the collation rule syntax. 142 * @param rules the collation rules to build the collation table from. 143 * @param decompositionMode the normalisation mode 144 * @param status reporting a success or an error. 145 * @stable ICU 2.0 146 */ 147 RuleBasedCollator(const UnicodeString& rules, 148 UColAttributeValue decompositionMode, 149 UErrorCode& status); 150 151 /** 152 * RuleBasedCollator constructor. This takes the table rules and builds a 153 * collation table out of them. Please see RuleBasedCollator class 154 * description for more details on the collation rule syntax. 155 * @param rules the collation rules to build the collation table from. 156 * @param collationStrength strength for comparison 157 * @param decompositionMode the normalisation mode 158 * @param status reporting a success or an error. 159 * @stable ICU 2.0 160 */ 161 RuleBasedCollator(const UnicodeString& rules, 162 ECollationStrength collationStrength, 163 UColAttributeValue decompositionMode, 164 UErrorCode& status); 165 166 #ifndef U_HIDE_INTERNAL_API 167 /** 168 * TODO: document & propose as public API 169 * @internal 170 */ 171 RuleBasedCollator(const UnicodeString &rules, 172 UParseError &parseError, UnicodeString &reason, 173 UErrorCode &errorCode); 174 #endif /* U_HIDE_INTERNAL_API */ 175 176 /** 177 * Copy constructor. 178 * @param other the RuleBasedCollator object to be copied 179 * @stable ICU 2.0 180 */ 181 RuleBasedCollator(const RuleBasedCollator& other); 182 183 184 /** Opens a collator from a collator binary image created using 185 * cloneBinary. Binary image used in instantiation of the 186 * collator remains owned by the user and should stay around for 187 * the lifetime of the collator. The API also takes a base collator 188 * which must be the root collator. 189 * @param bin binary image owned by the user and required through the 190 * lifetime of the collator 191 * @param length size of the image. If negative, the API will try to 192 * figure out the length of the image 193 * @param base Base collator, for lookup of untailored characters. 194 * Must be the root collator, must not be NULL. 195 * The base is required to be present through the lifetime of the collator. 196 * @param status for catching errors 197 * @return newly created collator 198 * @see cloneBinary 199 * @stable ICU 3.4 200 */ 201 RuleBasedCollator(const uint8_t *bin, int32_t length, 202 const RuleBasedCollator *base, 203 UErrorCode &status); 204 205 /** 206 * Destructor. 207 * @stable ICU 2.0 208 */ 209 virtual ~RuleBasedCollator(); 210 211 /** 212 * Assignment operator. 213 * @param other other RuleBasedCollator object to copy from. 214 * @stable ICU 2.0 215 */ 216 RuleBasedCollator& operator=(const RuleBasedCollator& other); 217 218 /** 219 * Returns true if argument is the same as this object. 220 * @param other Collator object to be compared. 221 * @return true if arguments is the same as this object. 222 * @stable ICU 2.0 223 */ 224 virtual UBool operator==(const Collator& other) const; 225 226 /** 227 * Makes a copy of this object. 228 * @return a copy of this object, owned by the caller 229 * @stable ICU 2.0 230 */ 231 virtual Collator* clone(void) const; 232 233 /** 234 * Creates a collation element iterator for the source string. The caller of 235 * this method is responsible for the memory management of the return 236 * pointer. 237 * @param source the string over which the CollationElementIterator will 238 * iterate. 239 * @return the collation element iterator of the source string using this as 240 * the based Collator. 241 * @stable ICU 2.2 242 */ 243 virtual CollationElementIterator* createCollationElementIterator( 244 const UnicodeString& source) const; 245 246 /** 247 * Creates a collation element iterator for the source. The caller of this 248 * method is responsible for the memory management of the returned pointer. 249 * @param source the CharacterIterator which produces the characters over 250 * which the CollationElementItgerator will iterate. 251 * @return the collation element iterator of the source using this as the 252 * based Collator. 253 * @stable ICU 2.2 254 */ 255 virtual CollationElementIterator* createCollationElementIterator( 256 const CharacterIterator& source) const; 257 258 // Make deprecated versions of Collator::compare() visible. 259 using Collator::compare; 260 261 /** 262 * The comparison function compares the character data stored in two 263 * different strings. Returns information about whether a string is less 264 * than, greater than or equal to another string. 265 * @param source the source string to be compared with. 266 * @param target the string that is to be compared with the source string. 267 * @param status possible error code 268 * @return Returns an enum value. UCOL_GREATER if source is greater 269 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 270 * than target 271 * @stable ICU 2.6 272 **/ 273 virtual UCollationResult compare(const UnicodeString& source, 274 const UnicodeString& target, 275 UErrorCode &status) const; 276 277 /** 278 * Does the same thing as compare but limits the comparison to a specified 279 * length 280 * @param source the source string to be compared with. 281 * @param target the string that is to be compared with the source string. 282 * @param length the length the comparison is limited to 283 * @param status possible error code 284 * @return Returns an enum value. UCOL_GREATER if source (up to the specified 285 * length) is greater than target; UCOL_EQUAL if source (up to specified 286 * length) is equal to target; UCOL_LESS if source (up to the specified 287 * length) is less than target. 288 * @stable ICU 2.6 289 */ 290 virtual UCollationResult compare(const UnicodeString& source, 291 const UnicodeString& target, 292 int32_t length, 293 UErrorCode &status) const; 294 295 /** 296 * The comparison function compares the character data stored in two 297 * different string arrays. Returns information about whether a string array 298 * is less than, greater than or equal to another string array. 299 * @param source the source string array to be compared with. 300 * @param sourceLength the length of the source string array. If this value 301 * is equal to -1, the string array is null-terminated. 302 * @param target the string that is to be compared with the source string. 303 * @param targetLength the length of the target string array. If this value 304 * is equal to -1, the string array is null-terminated. 305 * @param status possible error code 306 * @return Returns an enum value. UCOL_GREATER if source is greater 307 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 308 * than target 309 * @stable ICU 2.6 310 */ 311 virtual UCollationResult compare(const UChar* source, int32_t sourceLength, 312 const UChar* target, int32_t targetLength, 313 UErrorCode &status) const; 314 315 /** 316 * Compares two strings using the Collator. 317 * Returns whether the first one compares less than/equal to/greater than 318 * the second one. 319 * This version takes UCharIterator input. 320 * @param sIter the first ("source") string iterator 321 * @param tIter the second ("target") string iterator 322 * @param status ICU status 323 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 324 * @stable ICU 4.2 325 */ 326 virtual UCollationResult compare(UCharIterator &sIter, 327 UCharIterator &tIter, 328 UErrorCode &status) const; 329 330 /** 331 * Compares two UTF-8 strings using the Collator. 332 * Returns whether the first one compares less than/equal to/greater than 333 * the second one. 334 * This version takes UTF-8 input. 335 * Note that a StringPiece can be implicitly constructed 336 * from a std::string or a NUL-terminated const char * string. 337 * @param source the first UTF-8 string 338 * @param target the second UTF-8 string 339 * @param status ICU status 340 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 341 * @stable ICU 51 342 */ 343 virtual UCollationResult compareUTF8(const StringPiece &source, 344 const StringPiece &target, 345 UErrorCode &status) const; 346 347 /** 348 * Transforms the string into a series of characters 349 * that can be compared with CollationKey.compare(). 350 * 351 * Note that sort keys are often less efficient than simply doing comparison. 352 * For more details, see the ICU User Guide. 353 * 354 * @param source the source string. 355 * @param key the transformed key of the source string. 356 * @param status the error code status. 357 * @return the transformed key. 358 * @see CollationKey 359 * @stable ICU 2.0 360 */ 361 virtual CollationKey& getCollationKey(const UnicodeString& source, 362 CollationKey& key, 363 UErrorCode& status) const; 364 365 /** 366 * Transforms a specified region of the string into a series of characters 367 * that can be compared with CollationKey.compare. 368 * 369 * Note that sort keys are often less efficient than simply doing comparison. 370 * For more details, see the ICU User Guide. 371 * 372 * @param source the source string. 373 * @param sourceLength the length of the source string. 374 * @param key the transformed key of the source string. 375 * @param status the error code status. 376 * @return the transformed key. 377 * @see CollationKey 378 * @stable ICU 2.0 379 */ 380 virtual CollationKey& getCollationKey(const UChar *source, 381 int32_t sourceLength, 382 CollationKey& key, 383 UErrorCode& status) const; 384 385 /** 386 * Generates the hash code for the rule-based collation object. 387 * @return the hash code. 388 * @stable ICU 2.0 389 */ 390 virtual int32_t hashCode() const; 391 392 /** 393 * Gets the locale of the Collator 394 * @param type can be either requested, valid or actual locale. For more 395 * information see the definition of ULocDataLocaleType in 396 * uloc.h 397 * @param status the error code status. 398 * @return locale where the collation data lives. If the collator 399 * was instantiated from rules, locale is empty. 400 * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback 401 */ 402 virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const; 403 404 /** 405 * Gets the tailoring rules for this collator. 406 * @return the collation tailoring from which this collator was created 407 * @stable ICU 2.0 408 */ 409 const UnicodeString& getRules() const; 410 411 /** 412 * Gets the version information for a Collator. 413 * @param info the version # information, the result will be filled in 414 * @stable ICU 2.0 415 */ 416 virtual void getVersion(UVersionInfo info) const; 417 418 #ifndef U_HIDE_DEPRECATED_API 419 /** 420 * Returns the maximum length of any expansion sequences that end with the 421 * specified comparison order. 422 * 423 * This is specific to the kind of collation element values and sequences 424 * returned by the CollationElementIterator. 425 * Call CollationElementIterator::getMaxExpansion() instead. 426 * 427 * @param order a collation order returned by CollationElementIterator::previous 428 * or CollationElementIterator::next. 429 * @return maximum size of the expansion sequences ending with the collation 430 * element, or 1 if the collation element does not occur at the end of 431 * any expansion sequence 432 * @see CollationElementIterator#getMaxExpansion 433 * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead. 434 */ 435 int32_t getMaxExpansion(int32_t order) const; 436 #endif /* U_HIDE_DEPRECATED_API */ 437 438 /** 439 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This 440 * method is to implement a simple version of RTTI, since not all C++ 441 * compilers support genuine RTTI. Polymorphic operator==() and clone() 442 * methods call this method. 443 * @return The class ID for this object. All objects of a given class have 444 * the same class ID. Objects of other classes have different class 445 * IDs. 446 * @stable ICU 2.0 447 */ 448 virtual UClassID getDynamicClassID(void) const; 449 450 /** 451 * Returns the class ID for this class. This is useful only for comparing to 452 * a return value from getDynamicClassID(). For example: 453 * <pre> 454 * Base* polymorphic_pointer = createPolymorphicObject(); 455 * if (polymorphic_pointer->getDynamicClassID() == 456 * Derived::getStaticClassID()) ... 457 * </pre> 458 * @return The class ID for all objects of this class. 459 * @stable ICU 2.0 460 */ 461 static UClassID U_EXPORT2 getStaticClassID(void); 462 463 #ifndef U_HIDE_DEPRECATED_API 464 /** 465 * Do not use this method: The caller and the ICU library might use different heaps. 466 * Use cloneBinary() instead which writes to caller-provided memory. 467 * 468 * Returns a binary format of this collator. 469 * @param length Returns the length of the data, in bytes 470 * @param status the error code status. 471 * @return memory, owned by the caller, of size 'length' bytes. 472 * @deprecated ICU 52. Use cloneBinary() instead. 473 */ 474 uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const; 475 #endif /* U_HIDE_DEPRECATED_API */ 476 477 /** Creates a binary image of a collator. This binary image can be stored and 478 * later used to instantiate a collator using ucol_openBinary. 479 * This API supports preflighting. 480 * @param buffer a fill-in buffer to receive the binary image 481 * @param capacity capacity of the destination buffer 482 * @param status for catching errors 483 * @return size of the image 484 * @see ucol_openBinary 485 * @stable ICU 3.4 486 */ 487 int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const; 488 489 /** 490 * Returns current rules. Delta defines whether full rules are returned or 491 * just the tailoring. 492 * 493 * getRules(void) should normally be used instead. 494 * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales 495 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. 496 * @param buffer UnicodeString to store the result rules 497 * @stable ICU 2.2 498 * @see UCOL_FULL_RULES 499 */ 500 void getRules(UColRuleOption delta, UnicodeString &buffer) const; 501 502 /** 503 * Universal attribute setter 504 * @param attr attribute type 505 * @param value attribute value 506 * @param status to indicate whether the operation went on smoothly or there were errors 507 * @stable ICU 2.2 508 */ 509 virtual void setAttribute(UColAttribute attr, UColAttributeValue value, 510 UErrorCode &status); 511 512 /** 513 * Universal attribute getter. 514 * @param attr attribute type 515 * @param status to indicate whether the operation went on smoothly or there were errors 516 * @return attribute value 517 * @stable ICU 2.2 518 */ 519 virtual UColAttributeValue getAttribute(UColAttribute attr, 520 UErrorCode &status) const; 521 522 /** 523 * Sets the variable top to the top of the specified reordering group. 524 * The variable top determines the highest-sorting character 525 * which is affected by UCOL_ALTERNATE_HANDLING. 526 * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. 527 * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, 528 * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; 529 * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group 530 * @param errorCode Standard ICU error code. Its input value must 531 * pass the U_SUCCESS() test, or else the function returns 532 * immediately. Check for U_FAILURE() on output or use with 533 * function chaining. (See User Guide for details.) 534 * @return *this 535 * @see getMaxVariable 536 * @stable ICU 53 537 */ 538 virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode); 539 540 /** 541 * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. 542 * @return the maximum variable reordering group. 543 * @see setMaxVariable 544 * @stable ICU 53 545 */ 546 virtual UColReorderCode getMaxVariable() const; 547 548 /** 549 * Sets the variable top to the primary weight of the specified string. 550 * 551 * Beginning with ICU 53, the variable top is pinned to 552 * the top of one of the supported reordering groups, 553 * and it must not be beyond the last of those groups. 554 * See setMaxVariable(). 555 * @param varTop one or more (if contraction) UChars to which the variable top should be set 556 * @param len length of variable top string. If -1 it is considered to be zero terminated. 557 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 558 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 559 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 560 * the last reordering group supported by setMaxVariable() 561 * @return variable top primary weight 562 * @deprecated ICU 53 Call setMaxVariable() instead. 563 */ 564 virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status); 565 566 /** 567 * Sets the variable top to the primary weight of the specified string. 568 * 569 * Beginning with ICU 53, the variable top is pinned to 570 * the top of one of the supported reordering groups, 571 * and it must not be beyond the last of those groups. 572 * See setMaxVariable(). 573 * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set 574 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 575 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 576 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 577 * the last reordering group supported by setMaxVariable() 578 * @return variable top primary weight 579 * @deprecated ICU 53 Call setMaxVariable() instead. 580 */ 581 virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status); 582 583 /** 584 * Sets the variable top to the specified primary weight. 585 * 586 * Beginning with ICU 53, the variable top is pinned to 587 * the top of one of the supported reordering groups, 588 * and it must not be beyond the last of those groups. 589 * See setMaxVariable(). 590 * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop 591 * @param status error code 592 * @deprecated ICU 53 Call setMaxVariable() instead. 593 */ 594 virtual void setVariableTop(uint32_t varTop, UErrorCode &status); 595 596 /** 597 * Gets the variable top value of a Collator. 598 * @param status error code (not changed by function). If error code is set, the return value is undefined. 599 * @return the variable top primary weight 600 * @see getMaxVariable 601 * @stable ICU 2.0 602 */ 603 virtual uint32_t getVariableTop(UErrorCode &status) const; 604 605 /** 606 * Get a UnicodeSet that contains all the characters and sequences tailored in 607 * this collator. 608 * @param status error code of the operation 609 * @return a pointer to a UnicodeSet object containing all the 610 * code points and sequences that may sort differently than 611 * in the root collator. The object must be disposed of by using delete 612 * @stable ICU 2.4 613 */ 614 virtual UnicodeSet *getTailoredSet(UErrorCode &status) const; 615 616 /** 617 * Get the sort key as an array of bytes from a UnicodeString. 618 * 619 * Note that sort keys are often less efficient than simply doing comparison. 620 * For more details, see the ICU User Guide. 621 * 622 * @param source string to be processed. 623 * @param result buffer to store result in. If NULL, number of bytes needed 624 * will be returned. 625 * @param resultLength length of the result buffer. If if not enough the 626 * buffer will be filled to capacity. 627 * @return Number of bytes needed for storing the sort key 628 * @stable ICU 2.0 629 */ 630 virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result, 631 int32_t resultLength) const; 632 633 /** 634 * Get the sort key as an array of bytes from a UChar buffer. 635 * 636 * Note that sort keys are often less efficient than simply doing comparison. 637 * For more details, see the ICU User Guide. 638 * 639 * @param source string to be processed. 640 * @param sourceLength length of string to be processed. If -1, the string 641 * is 0 terminated and length will be decided by the function. 642 * @param result buffer to store result in. If NULL, number of bytes needed 643 * will be returned. 644 * @param resultLength length of the result buffer. If if not enough the 645 * buffer will be filled to capacity. 646 * @return Number of bytes needed for storing the sort key 647 * @stable ICU 2.2 648 */ 649 virtual int32_t getSortKey(const UChar *source, int32_t sourceLength, 650 uint8_t *result, int32_t resultLength) const; 651 652 /** 653 * Retrieves the reordering codes for this collator. 654 * @param dest The array to fill with the script ordering. 655 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function 656 * will only return the length of the result without writing any codes (pre-flighting). 657 * @param status A reference to an error code value, which must not indicate 658 * a failure before the function call. 659 * @return The length of the script ordering array. 660 * @see ucol_setReorderCodes 661 * @see Collator#getEquivalentReorderCodes 662 * @see Collator#setReorderCodes 663 * @stable ICU 4.8 664 */ 665 virtual int32_t getReorderCodes(int32_t *dest, 666 int32_t destCapacity, 667 UErrorCode& status) const; 668 669 /** 670 * Sets the ordering of scripts for this collator. 671 * @param reorderCodes An array of script codes in the new order. This can be NULL if the 672 * length is also set to 0. An empty array will clear any reordering codes on the collator. 673 * @param reorderCodesLength The length of reorderCodes. 674 * @param status error code 675 * @see ucol_setReorderCodes 676 * @see Collator#getReorderCodes 677 * @see Collator#getEquivalentReorderCodes 678 * @stable ICU 4.8 679 */ 680 virtual void setReorderCodes(const int32_t* reorderCodes, 681 int32_t reorderCodesLength, 682 UErrorCode& status) ; 683 684 /** 685 * Implements ucol_strcollUTF8(). 686 * @internal 687 */ 688 virtual UCollationResult internalCompareUTF8( 689 const char *left, int32_t leftLength, 690 const char *right, int32_t rightLength, 691 UErrorCode &errorCode) const; 692 693 /** Get the short definition string for a collator. This internal API harvests the collator's 694 * locale and the attribute set and produces a string that can be used for opening 695 * a collator with the same attributes using the ucol_openFromShortString API. 696 * This string will be normalized. 697 * The structure and the syntax of the string is defined in the "Naming collators" 698 * section of the users guide: 699 * http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme 700 * This function supports preflighting. 701 * 702 * This is internal, and intended to be used with delegate converters. 703 * 704 * @param locale a locale that will appear as a collators locale in the resulting 705 * short string definition. If NULL, the locale will be harvested 706 * from the collator. 707 * @param buffer space to hold the resulting string 708 * @param capacity capacity of the buffer 709 * @param status for returning errors. All the preflighting errors are featured 710 * @return length of the resulting string 711 * @see ucol_openFromShortString 712 * @see ucol_normalizeShortDefinitionString 713 * @see ucol_getShortDefinitionString 714 * @internal 715 */ 716 virtual int32_t internalGetShortDefinitionString(const char *locale, 717 char *buffer, 718 int32_t capacity, 719 UErrorCode &status) const; 720 721 /** 722 * Implements ucol_nextSortKeyPart(). 723 * @internal 724 */ 725 virtual int32_t internalNextSortKeyPart( 726 UCharIterator *iter, uint32_t state[2], 727 uint8_t *dest, int32_t count, UErrorCode &errorCode) const; 728 729 // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API 730 /** 731 * Only for use in ucol_openRules(). 732 * @internal 733 */ 734 RuleBasedCollator(); 735 736 #ifndef U_HIDE_INTERNAL_API 737 /** 738 * Implements ucol_getLocaleByType(). 739 * Needed because the lifetime of the locale ID string must match that of the collator. 740 * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper. 741 * @internal 742 */ 743 const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const; 744 745 /** 746 * Implements ucol_getContractionsAndExpansions(). 747 * Gets this collator's sets of contraction strings and/or 748 * characters and strings that map to multiple collation elements (expansions). 749 * If addPrefixes is TRUE, then contractions that are expressed as 750 * prefix/pre-context rules are included. 751 * @param contractions if not NULL, the set to hold the contractions 752 * @param expansions if not NULL, the set to hold the expansions 753 * @param addPrefixes include prefix contextual mappings 754 * @param errorCode in/out ICU error code 755 * @internal 756 */ 757 void internalGetContractionsAndExpansions( 758 UnicodeSet *contractions, UnicodeSet *expansions, 759 UBool addPrefixes, UErrorCode &errorCode) const; 760 761 /** 762 * Adds the contractions that start with character c to the set. 763 * Ignores prefixes. Used by AlphabeticIndex. 764 * @internal 765 */ 766 void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const; 767 768 /** 769 * Implements from-rule constructors, and ucol_openRules(). 770 * @internal 771 */ 772 void internalBuildTailoring( 773 const UnicodeString &rules, 774 int32_t strength, 775 UColAttributeValue decompositionMode, 776 UParseError *outParseError, UnicodeString *outReason, 777 UErrorCode &errorCode); 778 779 /** @internal */ rbcFromUCollator(UCollator * uc)780 static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) { 781 return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc)); 782 } 783 /** @internal */ rbcFromUCollator(const UCollator * uc)784 static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) { 785 return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc)); 786 } 787 788 /** 789 * Appends the CEs for the string to the vector. 790 * @internal for tests & tools 791 */ 792 void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const; 793 #endif // U_HIDE_INTERNAL_API 794 795 protected: 796 /** 797 * Used internally by registration to define the requested and valid locales. 798 * @param requestedLocale the requested locale 799 * @param validLocale the valid locale 800 * @param actualLocale the actual locale 801 * @internal 802 */ 803 virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale); 804 805 private: 806 friend class CollationElementIterator; 807 friend class Collator; 808 809 RuleBasedCollator(const CollationCacheEntry *entry); 810 811 /** 812 * Enumeration of attributes that are relevant for short definition strings 813 * (e.g., ucol_getShortDefinitionString()). 814 * Effectively extends UColAttribute. 815 */ 816 enum Attributes { 817 ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT, 818 ATTR_LIMIT 819 }; 820 821 void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode); 822 823 // Both lengths must be <0 or else both must be >=0. 824 UCollationResult doCompare(const UChar *left, int32_t leftLength, 825 const UChar *right, int32_t rightLength, 826 UErrorCode &errorCode) const; 827 UCollationResult doCompare(const uint8_t *left, int32_t leftLength, 828 const uint8_t *right, int32_t rightLength, 829 UErrorCode &errorCode) const; 830 831 void writeSortKey(const UChar *s, int32_t length, 832 SortKeyByteSink &sink, UErrorCode &errorCode) const; 833 834 void writeIdenticalLevel(const UChar *s, const UChar *limit, 835 SortKeyByteSink &sink, UErrorCode &errorCode) const; 836 837 const CollationSettings &getDefaultSettings() const; 838 setAttributeDefault(int32_t attribute)839 void setAttributeDefault(int32_t attribute) { 840 explicitlySetAttributes &= ~((uint32_t)1 << attribute); 841 } setAttributeExplicitly(int32_t attribute)842 void setAttributeExplicitly(int32_t attribute) { 843 explicitlySetAttributes |= (uint32_t)1 << attribute; 844 } attributeHasBeenSetExplicitly(int32_t attribute)845 UBool attributeHasBeenSetExplicitly(int32_t attribute) const { 846 // assert(0 <= attribute < ATTR_LIMIT); 847 return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0); 848 } 849 850 /** 851 * Tests whether a character is "unsafe" for use as a collation starting point. 852 * 853 * @param c code point or code unit 854 * @return TRUE if c is unsafe 855 * @see CollationElementIterator#setOffset(int) 856 */ 857 UBool isUnsafe(UChar32 c) const; 858 859 static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode); 860 UBool initMaxExpansions(UErrorCode &errorCode) const; 861 862 void setFastLatinOptions(CollationSettings &ownedSettings) const; 863 864 const CollationData *data; 865 const CollationSettings *settings; // reference-counted 866 const CollationTailoring *tailoring; // alias of cacheEntry->tailoring 867 const CollationCacheEntry *cacheEntry; // reference-counted 868 Locale validLocale; 869 uint32_t explicitlySetAttributes; 870 871 UBool actualLocaleIsSameAsValid; 872 }; 873 874 U_NAMESPACE_END 875 876 #endif // !UCONFIG_NO_COLLATION 877 #endif // TBLCOLL_H 878