1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 */
9 
10 /**
11  * \file
12  * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
13  */
14 
15 /**
16 * File tblcoll.h
17 *
18 * Created by: Helena Shih
19 *
20 * Modification History:
21 *
22 *  Date        Name        Description
23 *  2/5/97      aliu        Added streamIn and streamOut methods.  Added
24 *                          constructor which reads RuleBasedCollator object from
25 *                          a binary file.  Added writeToFile method which streams
26 *                          RuleBasedCollator out to a binary file.  The streamIn
27 *                          and streamOut methods use istream and ostream objects
28 *                          in binary mode.
29 *  2/12/97     aliu        Modified to use TableCollationData sub-object to
30 *                          hold invariant data.
31 *  2/13/97     aliu        Moved several methods into this class from Collation.
32 *                          Added a private RuleBasedCollator(Locale&) constructor,
33 *                          to be used by Collator::createDefault().  General
34 *                          clean up.
35 *  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
36 *                          constructor and getDynamicClassID.
37 *  3/5/97      aliu        Modified constructFromFile() to add parameter
38 *                          specifying whether or not binary loading is to be
39 *                          attempted.  This is required for dynamic rule loading.
40 * 05/07/97     helena      Added memory allocation error detection.
41 *  6/17/97     helena      Added IDENTICAL strength for compare, changed getRules to
42 *                          use MergeCollation::getPattern.
43 *  6/20/97     helena      Java class name change.
44 *  8/18/97     helena      Added internal API documentation.
45 * 09/03/97     helena      Added createCollationKeyValues().
46 * 02/10/98     damiba      Added compare with "length" parameter
47 * 08/05/98     erm         Synched with 1.2 version of RuleBasedCollator.java
48 * 04/23/99     stephen     Removed EDecompositionMode, merged with
49 *                          Normalizer::EMode
50 * 06/14/99     stephen     Removed kResourceBundleSuffix
51 * 11/02/99     helena      Collator performance enhancements.  Eliminates the
52 *                          UnicodeString construction and special case for NO_OP.
53 * 11/23/99     srl         More performance enhancements. Updates to NormalizerIterator
54 *                          internal state management.
55 * 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
56 *                          to implementation file.
57 * 01/29/01     synwee      Modified into a C++ wrapper which calls C API
58 *                          (ucol.h)
59 * 2012-2014    markus      Rewritten in C++ again.
60 */
61 
62 #ifndef TBLCOLL_H
63 #define TBLCOLL_H
64 
65 #include "unicode/utypes.h"
66 
67 #if !UCONFIG_NO_COLLATION
68 
69 #include "unicode/coll.h"
70 #include "unicode/locid.h"
71 #include "unicode/uiter.h"
72 #include "unicode/ucol.h"
73 
74 U_NAMESPACE_BEGIN
75 
76 struct CollationCacheEntry;
77 struct CollationData;
78 struct CollationSettings;
79 struct CollationTailoring;
80 /**
81 * @stable ICU 2.0
82 */
83 class StringSearch;
84 /**
85 * @stable ICU 2.0
86 */
87 class CollationElementIterator;
88 class CollationKey;
89 class SortKeyByteSink;
90 class UnicodeSet;
91 class UnicodeString;
92 class UVector64;
93 
94 /**
95  * The RuleBasedCollator class provides the implementation of
96  * Collator, using data-driven tables. The user can create a customized
97  * table-based collation.
98  * <p>
99  * For more information about the collation service see
100  * <a href="http://userguide.icu-project.org/collation">the User Guide</a>.
101  * <p>
102  * Collation service provides correct sorting orders for most locales supported in ICU.
103  * If specific data for a locale is not available, the order eventually falls back
104  * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
105  * <p>
106  * Sort ordering may be customized by providing your own set of rules. For more on
107  * this subject see the <a href="http://userguide.icu-project.org/collation/customization">
108  * Collation Customization</a> section of the User Guide.
109  * <p>
110  * Note, RuleBasedCollator is not to be subclassed.
111  * @see        Collator
112  */
113 class U_I18N_API RuleBasedCollator : public Collator {
114 public:
115     /**
116      * RuleBasedCollator constructor. This takes the table rules and builds a
117      * collation table out of them. Please see RuleBasedCollator class
118      * description for more details on the collation rule syntax.
119      * @param rules the collation rules to build the collation table from.
120      * @param status reporting a success or an error.
121      * @stable ICU 2.0
122      */
123     RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
124 
125     /**
126      * RuleBasedCollator constructor. This takes the table rules and builds a
127      * collation table out of them. Please see RuleBasedCollator class
128      * description for more details on the collation rule syntax.
129      * @param rules the collation rules to build the collation table from.
130      * @param collationStrength strength for comparison
131      * @param status reporting a success or an error.
132      * @stable ICU 2.0
133      */
134     RuleBasedCollator(const UnicodeString& rules,
135                        ECollationStrength collationStrength,
136                        UErrorCode& status);
137 
138     /**
139      * RuleBasedCollator constructor. This takes the table rules and builds a
140      * collation table out of them. Please see RuleBasedCollator class
141      * description for more details on the collation rule syntax.
142      * @param rules the collation rules to build the collation table from.
143      * @param decompositionMode the normalisation mode
144      * @param status reporting a success or an error.
145      * @stable ICU 2.0
146      */
147     RuleBasedCollator(const UnicodeString& rules,
148                     UColAttributeValue decompositionMode,
149                     UErrorCode& status);
150 
151     /**
152      * RuleBasedCollator constructor. This takes the table rules and builds a
153      * collation table out of them. Please see RuleBasedCollator class
154      * description for more details on the collation rule syntax.
155      * @param rules the collation rules to build the collation table from.
156      * @param collationStrength strength for comparison
157      * @param decompositionMode the normalisation mode
158      * @param status reporting a success or an error.
159      * @stable ICU 2.0
160      */
161     RuleBasedCollator(const UnicodeString& rules,
162                     ECollationStrength collationStrength,
163                     UColAttributeValue decompositionMode,
164                     UErrorCode& status);
165 
166 #ifndef U_HIDE_INTERNAL_API
167     /**
168      * TODO: document & propose as public API
169      * @internal
170      */
171     RuleBasedCollator(const UnicodeString &rules,
172                       UParseError &parseError, UnicodeString &reason,
173                       UErrorCode &errorCode);
174 #endif  /* U_HIDE_INTERNAL_API */
175 
176     /**
177      * Copy constructor.
178      * @param other the RuleBasedCollator object to be copied
179      * @stable ICU 2.0
180      */
181     RuleBasedCollator(const RuleBasedCollator& other);
182 
183 
184     /** Opens a collator from a collator binary image created using
185     *  cloneBinary. Binary image used in instantiation of the
186     *  collator remains owned by the user and should stay around for
187     *  the lifetime of the collator. The API also takes a base collator
188     *  which must be the root collator.
189     *  @param bin binary image owned by the user and required through the
190     *             lifetime of the collator
191     *  @param length size of the image. If negative, the API will try to
192     *                figure out the length of the image
193     *  @param base Base collator, for lookup of untailored characters.
194     *              Must be the root collator, must not be NULL.
195     *              The base is required to be present through the lifetime of the collator.
196     *  @param status for catching errors
197     *  @return newly created collator
198     *  @see cloneBinary
199     *  @stable ICU 3.4
200     */
201     RuleBasedCollator(const uint8_t *bin, int32_t length,
202                     const RuleBasedCollator *base,
203                     UErrorCode &status);
204 
205     /**
206      * Destructor.
207      * @stable ICU 2.0
208      */
209     virtual ~RuleBasedCollator();
210 
211     /**
212      * Assignment operator.
213      * @param other other RuleBasedCollator object to copy from.
214      * @stable ICU 2.0
215      */
216     RuleBasedCollator& operator=(const RuleBasedCollator& other);
217 
218     /**
219      * Returns true if the argument is the same as this object.
220      * @param other Collator object to be compared.
221      * @return true if the argument is the same as this object.
222      * @stable ICU 2.0
223      */
224     virtual UBool operator==(const Collator& other) const;
225 
226     /**
227      * Makes a copy of this object.
228      * @return a copy of this object, owned by the caller
229      * @stable ICU 2.0
230      */
231     virtual Collator* clone(void) const;
232 
233     /**
234      * Creates a collation element iterator for the source string. The caller of
235      * this method is responsible for the memory management of the returned
236      * pointer.
237      * @param source the string over which the CollationElementIterator will
238      *        iterate.
239      * @return the collation element iterator of the source string using this as
240      *         the base Collator.
241      * @stable ICU 2.2
242      */
243     virtual CollationElementIterator* createCollationElementIterator(
244                                            const UnicodeString& source) const;
245 
246     /**
247      * Creates a collation element iterator for the source. The caller of this
248      * method is responsible for the memory management of the returned pointer.
249      * @param source the CharacterIterator which produces the characters over
250      *        which the CollationElementIterator will iterate.
251      * @return the collation element iterator of the source using this as the
252      *         base Collator.
253      * @stable ICU 2.2
254      */
255     virtual CollationElementIterator* createCollationElementIterator(
256                                          const CharacterIterator& source) const;
257 
258     // Make deprecated versions of Collator::compare() visible.
259     using Collator::compare;
260 
261     /**
262     * The comparison function compares the character data stored in two
263     * different strings. Returns information about whether a string is less
264     * than, greater than or equal to another string.
265     * @param source the source string to be compared with.
266     * @param target the string that is to be compared with the source string.
267     * @param status possible error code
268     * @return Returns an enum value. UCOL_GREATER if source is greater
269     * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
270     * than target
271     * @stable ICU 2.6
272     **/
273     virtual UCollationResult compare(const UnicodeString& source,
274                                      const UnicodeString& target,
275                                      UErrorCode &status) const;
276 
277     /**
278     * Does the same thing as compare but limits the comparison to a specified
279     * length.
280     * @param source the source string to be compared with.
281     * @param target the string that is to be compared with the source string.
282     * @param length the length the comparison is limited to
283     * @param status possible error code
284     * @return Returns an enum value. UCOL_GREATER if source (up to the specified
285     *         length) is greater than target; UCOL_EQUAL if source (up to the specified
286     *         length) is equal to target; UCOL_LESS if source (up to the specified
287     *         length) is less than target.
288     * @stable ICU 2.6
289     */
290     virtual UCollationResult compare(const UnicodeString& source,
291                                      const UnicodeString& target,
292                                      int32_t length,
293                                      UErrorCode &status) const;
294 
295     /**
296     * The comparison function compares the character data stored in two
297     * different string arrays. Returns information about whether a string array
298     * is less than, greater than or equal to another string array.
299     * @param source the source string array to be compared with.
300     * @param sourceLength the length of the source string array.  If this value
301     *        is equal to -1, the string array is null-terminated.
302     * @param target the string that is to be compared with the source string.
303     * @param targetLength the length of the target string array.  If this value
304     *        is equal to -1, the string array is null-terminated.
305     * @param status possible error code
306     * @return Returns an enum value. UCOL_GREATER if source is greater
307     * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
308     * than target
309     * @stable ICU 2.6
310     */
311     virtual UCollationResult compare(const UChar* source, int32_t sourceLength,
312                                      const UChar* target, int32_t targetLength,
313                                      UErrorCode &status) const;
314 
315     /**
316      * Compares two strings using the Collator.
317      * Returns whether the first one compares less than/equal to/greater than
318      * the second one.
319      * This version takes UCharIterator input.
320      * @param sIter the first ("source") string iterator
321      * @param tIter the second ("target") string iterator
322      * @param status ICU status
323      * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
324      * @stable ICU 4.2
325      */
326     virtual UCollationResult compare(UCharIterator &sIter,
327                                      UCharIterator &tIter,
328                                      UErrorCode &status) const;
329 
330     /**
331      * Compares two UTF-8 strings using the Collator.
332      * Returns whether the first one compares less than/equal to/greater than
333      * the second one.
334      * This version takes UTF-8 input.
335      * Note that a StringPiece can be implicitly constructed
336      * from a std::string or a NUL-terminated const char * string.
337      * @param source the first UTF-8 string
338      * @param target the second UTF-8 string
339      * @param status ICU status
340      * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
341      * @stable ICU 51
342      */
343     virtual UCollationResult compareUTF8(const StringPiece &source,
344                                          const StringPiece &target,
345                                          UErrorCode &status) const;
346 
347     /**
348      * Transforms the string into a series of characters
349      * that can be compared with CollationKey.compare().
350      *
351      * Note that sort keys are often less efficient than simply doing comparison.
352      * For more details, see the ICU User Guide.
353      *
354      * @param source the source string.
355      * @param key the transformed key of the source string.
356      * @param status the error code status.
357      * @return the transformed key.
358      * @see CollationKey
359      * @stable ICU 2.0
360      */
361     virtual CollationKey& getCollationKey(const UnicodeString& source,
362                                           CollationKey& key,
363                                           UErrorCode& status) const;
364 
365     /**
366      * Transforms a specified region of the string into a series of characters
367      * that can be compared with CollationKey.compare.
368      *
369      * Note that sort keys are often less efficient than simply doing comparison.
370      * For more details, see the ICU User Guide.
371      *
372      * @param source the source string.
373      * @param sourceLength the length of the source string.
374      * @param key the transformed key of the source string.
375      * @param status the error code status.
376      * @return the transformed key.
377      * @see CollationKey
378      * @stable ICU 2.0
379      */
380     virtual CollationKey& getCollationKey(const UChar *source,
381                                           int32_t sourceLength,
382                                           CollationKey& key,
383                                           UErrorCode& status) const;
384 
385     /**
386      * Generates the hash code for the rule-based collation object.
387      * @return the hash code.
388      * @stable ICU 2.0
389      */
390     virtual int32_t hashCode() const;
391 
392     /**
393     * Gets the locale of the Collator
394     * @param type can be either requested, valid or actual locale. For more
395     *             information see the definition of ULocDataLocaleType in
396     *             uloc.h
397     * @param status the error code status.
398     * @return locale where the collation data lives. If the collator
399     *         was instantiated from rules, locale is empty.
400     * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
401     */
402     virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
403 
404     /**
405      * Gets the tailoring rules for this collator.
406      * @return the collation tailoring from which this collator was created
407      * @stable ICU 2.0
408      */
409     const UnicodeString& getRules() const;
410 
411     /**
412      * Gets the version information for a Collator.
413      * @param info the version # information, the result will be filled in
414      * @stable ICU 2.0
415      */
416     virtual void getVersion(UVersionInfo info) const;
417 
418 #ifndef U_HIDE_DEPRECATED_API
419     /**
420      * Returns the maximum length of any expansion sequences that end with the
421      * specified comparison order.
422      *
423      * This is specific to the kind of collation element values and sequences
424      * returned by the CollationElementIterator.
425      * Call CollationElementIterator::getMaxExpansion() instead.
426      *
427      * @param order a collation order returned by CollationElementIterator::previous
428      *              or CollationElementIterator::next.
429      * @return maximum size of the expansion sequences ending with the collation
430      *         element, or 1 if the collation element does not occur at the end of
431      *         any expansion sequence
432      * @see CollationElementIterator#getMaxExpansion
433      * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
434      */
435     int32_t getMaxExpansion(int32_t order) const;
436 #endif  /* U_HIDE_DEPRECATED_API */
437 
438     /**
439      * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
440      * method is to implement a simple version of RTTI, since not all C++
441      * compilers support genuine RTTI. Polymorphic operator==() and clone()
442      * methods call this method.
443      * @return The class ID for this object. All objects of a given class have
444      *         the same class ID. Objects of other classes have different class
445      *         IDs.
446      * @stable ICU 2.0
447      */
448     virtual UClassID getDynamicClassID(void) const;
449 
450     /**
451      * Returns the class ID for this class. This is useful only for comparing to
452      * a return value from getDynamicClassID(). For example:
453      * <pre>
454      * Base* polymorphic_pointer = createPolymorphicObject();
455      * if (polymorphic_pointer->getDynamicClassID() ==
456      *                                          Derived::getStaticClassID()) ...
457      * </pre>
458      * @return The class ID for all objects of this class.
459      * @stable ICU 2.0
460      */
461     static UClassID U_EXPORT2 getStaticClassID(void);
462 
463 #ifndef U_HIDE_DEPRECATED_API
464     /**
465      * Do not use this method: The caller and the ICU library might use different heaps.
466      * Use cloneBinary() instead which writes to caller-provided memory.
467      *
468      * Returns a binary format of this collator.
469      * @param length Returns the length of the data, in bytes
470      * @param status the error code status.
471      * @return memory, owned by the caller, of size 'length' bytes.
472      * @deprecated ICU 52. Use cloneBinary() instead.
473      */
474     uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
475 #endif  /* U_HIDE_DEPRECATED_API */
476 
477     /** Creates a binary image of a collator. This binary image can be stored and
478     *  later used to instantiate a collator using ucol_openBinary.
479     *  This API supports preflighting.
480     *  @param buffer a fill-in buffer to receive the binary image
481     *  @param capacity capacity of the destination buffer
482     *  @param status for catching errors
483     *  @return size of the image
484     *  @see ucol_openBinary
485     *  @stable ICU 3.4
486     */
487     int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
488 
489     /**
490      * Returns current rules. Delta defines whether full rules are returned or
491      * just the tailoring.
492      *
493      * getRules(void) should normally be used instead.
494      * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales
495      * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
496      * @param buffer UnicodeString to store the result rules
497      * @stable ICU 2.2
498      * @see UCOL_FULL_RULES
499      */
500     void getRules(UColRuleOption delta, UnicodeString &buffer) const;
501 
502     /**
503      * Universal attribute setter
504      * @param attr attribute type
505      * @param value attribute value
506      * @param status to indicate whether the operation went on smoothly or there were errors
507      * @stable ICU 2.2
508      */
509     virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
510                               UErrorCode &status);
511 
512     /**
513      * Universal attribute getter.
514      * @param attr attribute type
515      * @param status to indicate whether the operation went on smoothly or there were errors
516      * @return attribute value
517      * @stable ICU 2.2
518      */
519     virtual UColAttributeValue getAttribute(UColAttribute attr,
520                                             UErrorCode &status) const;
521 
522     /**
523      * Sets the variable top to the top of the specified reordering group.
524      * The variable top determines the highest-sorting character
525      * which is affected by UCOL_ALTERNATE_HANDLING.
526      * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
527      * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
528      *              UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
529      *              or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
530      * @param errorCode Standard ICU error code. Its input value must
531      *                  pass the U_SUCCESS() test, or else the function returns
532      *                  immediately. Check for U_FAILURE() on output or use with
533      *                  function chaining. (See User Guide for details.)
534      * @return *this
535      * @see getMaxVariable
536      * @stable ICU 53
537      */
538     virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode);
539 
540     /**
541      * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
542      * @return the maximum variable reordering group.
543      * @see setMaxVariable
544      * @stable ICU 53
545      */
546     virtual UColReorderCode getMaxVariable() const;
547 
548     /**
549      * Sets the variable top to the primary weight of the specified string.
550      *
551      * Beginning with ICU 53, the variable top is pinned to
552      * the top of one of the supported reordering groups,
553      * and it must not be beyond the last of those groups.
554      * See setMaxVariable().
555      * @param varTop one or more (if contraction) UChars to which the variable top should be set
556      * @param len length of variable top string. If -1 it is considered to be zero terminated.
557      * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
558      *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
559      *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
560      *    the last reordering group supported by setMaxVariable()
561      * @return variable top primary weight
562      * @deprecated ICU 53 Call setMaxVariable() instead.
563      */
564     virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status);
565 
566     /**
567      * Sets the variable top to the primary weight of the specified string.
568      *
569      * Beginning with ICU 53, the variable top is pinned to
570      * the top of one of the supported reordering groups,
571      * and it must not be beyond the last of those groups.
572      * See setMaxVariable().
573      * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set
574      * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
575      *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
576      *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
577      *    the last reordering group supported by setMaxVariable()
578      * @return variable top primary weight
579      * @deprecated ICU 53 Call setMaxVariable() instead.
580      */
581     virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status);
582 
583     /**
584      * Sets the variable top to the specified primary weight.
585      *
586      * Beginning with ICU 53, the variable top is pinned to
587      * the top of one of the supported reordering groups,
588      * and it must not be beyond the last of those groups.
589      * See setMaxVariable().
590      * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
591      * @param status error code
592      * @deprecated ICU 53 Call setMaxVariable() instead.
593      */
594     virtual void setVariableTop(uint32_t varTop, UErrorCode &status);
595 
596     /**
597      * Gets the variable top value of a Collator.
598      * @param status error code (not changed by function). If error code is set, the return value is undefined.
599      * @return the variable top primary weight
600      * @see getMaxVariable
601      * @stable ICU 2.0
602      */
603     virtual uint32_t getVariableTop(UErrorCode &status) const;
604 
605     /**
606      * Get a UnicodeSet that contains all the characters and sequences tailored in
607      * this collator.
608      * @param status      error code of the operation
609      * @return a pointer to a UnicodeSet object containing all the
610      *         code points and sequences that may sort differently than
611      *         in the root collator. The object must be disposed of by using delete
612      * @stable ICU 2.4
613      */
614     virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
615 
616     /**
617      * Get the sort key as an array of bytes from a UnicodeString.
618      *
619      * Note that sort keys are often less efficient than simply doing comparison.
620      * For more details, see the ICU User Guide.
621      *
622      * @param source string to be processed.
623      * @param result buffer to store result in. If NULL, number of bytes needed
624      *        will be returned.
625      * @param resultLength length of the result buffer. If it is not enough, the
626      *        buffer will be filled to capacity.
627      * @return Number of bytes needed for storing the sort key
628      * @stable ICU 2.0
629      */
630     virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
631                                int32_t resultLength) const;
632 
633     /**
634      * Get the sort key as an array of bytes from a UChar buffer.
635      *
636      * Note that sort keys are often less efficient than simply doing comparison.
637      * For more details, see the ICU User Guide.
638      *
639      * @param source string to be processed.
640      * @param sourceLength length of string to be processed. If -1, the string
641      *        is 0 terminated and length will be decided by the function.
642      * @param result buffer to store result in. If NULL, number of bytes needed
643      *        will be returned.
644      * @param resultLength length of the result buffer. If it is not enough, the
645      *        buffer will be filled to capacity.
646      * @return Number of bytes needed for storing the sort key
647      * @stable ICU 2.2
648      */
649     virtual int32_t getSortKey(const UChar *source, int32_t sourceLength,
650                                uint8_t *result, int32_t resultLength) const;
651 
652     /**
653      * Retrieves the reordering codes for this collator.
654      * @param dest The array to fill with the script ordering.
655      * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
656      *  will only return the length of the result without writing any codes (pre-flighting).
657      * @param status A reference to an error code value, which must not indicate
658      * a failure before the function call.
659      * @return The length of the script ordering array.
660      * @see ucol_setReorderCodes
661      * @see Collator#getEquivalentReorderCodes
662      * @see Collator#setReorderCodes
663      * @stable ICU 4.8
664      */
665      virtual int32_t getReorderCodes(int32_t *dest,
666                                      int32_t destCapacity,
667                                      UErrorCode& status) const;
668 
    /**
     * Sets the ordering of scripts for this collator.
     * @param reorderCodes An array of script codes in the new order. This can be NULL if the
     * length is also set to 0. An empty array will clear any reordering codes on the collator.
     * @param reorderCodesLength The length of reorderCodes.
     * @param status A reference to an error code value.
     * @see ucol_setReorderCodes
     * @see Collator#getReorderCodes
     * @see Collator#getEquivalentReorderCodes
     * @stable ICU 4.8
     */
     virtual void setReorderCodes(const int32_t* reorderCodes,
                                  int32_t reorderCodesLength,
                                  UErrorCode& status) ;
683 
    /**
     * Compares two strings that are passed as char arrays with explicit lengths.
     * Implements ucol_strcollUTF8().
     * NOTE(review): the name indicates UTF-8 input; the length conventions
     * (e.g. a negative length for NUL-terminated input) are not visible in this
     * header -- confirm against ucol_strcollUTF8().
     * @param left the left string
     * @param leftLength the length of left
     * @param right the right string
     * @param rightLength the length of right
     * @param errorCode ICU error code
     * @return the comparison result as a UCollationResult
     * @internal
     */
    virtual UCollationResult internalCompareUTF8(
            const char *left, int32_t leftLength,
            const char *right, int32_t rightLength,
            UErrorCode &errorCode) const;
692 
    /** Get the short definition string for a collator. This internal API harvests the collator's
     *  locale and the attribute set and produces a string that can be used for opening
     *  a collator with the same attributes using the ucol_openFromShortString API.
     *  This string will be normalized.
     *  The structure and the syntax of the string are defined in the "Naming collators"
     *  section of the users guide:
     *  http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme
     *  This function supports preflighting.
     *
     *  This is internal, and intended to be used with delegate converters.
     *
     *  @param locale a locale that will appear as the collator's locale in the resulting
     *                short string definition. If NULL, the locale will be harvested
     *                from the collator.
     *  @param buffer space to hold the resulting string
     *  @param capacity capacity of the buffer
     *  @param status for returning errors, including all preflighting errors
     *  @return length of the resulting string
     *  @see ucol_openFromShortString
     *  @see ucol_normalizeShortDefinitionString
     *  @see ucol_getShortDefinitionString
     *  @internal
     */
    virtual int32_t internalGetShortDefinitionString(const char *locale,
                                                     char *buffer,
                                                     int32_t capacity,
                                                     UErrorCode &status) const;
720 
    /**
     * Implements ucol_nextSortKeyPart().
     * NOTE(review): the parameter semantics below are inferred from the
     * signature only -- confirm against the ucol_nextSortKeyPart() documentation.
     * @param iter iterator over the input text
     * @param state two-element state array; presumably carries the position
     *        between successive calls
     * @param dest output buffer for sort key bytes
     * @param count presumably the number of bytes that dest can hold
     * @param errorCode ICU error code
     * @return an int32_t; presumably the number of bytes written to dest
     * @internal
     */
    virtual int32_t internalNextSortKeyPart(
            UCharIterator *iter, uint32_t state[2],
            uint8_t *dest, int32_t count, UErrorCode &errorCode) const;
728 
    // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API:
    // it must stay declared even when the other internal API below is hidden.
    /**
     * Default constructor.
     * Only for use in ucol_openRules().
     * @internal
     */
    RuleBasedCollator();
735 
736 #ifndef U_HIDE_INTERNAL_API
    /**
     * Implements ucol_getLocaleByType().
     * Needed because the lifetime of the locale ID string must match that of the collator.
     * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
     * @param type selects which of the collator's locales is requested
     * @param errorCode ICU error code
     * @return the locale ID string; its lifetime matches that of this collator
     * @internal
     */
    const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
744 
    /**
     * Implements ucol_getContractionsAndExpansions().
     * Gets this collator's sets of contraction strings and/or
     * characters and strings that map to multiple collation elements (expansions).
     * Either output set may be omitted by passing NULL for it.
     * If addPrefixes is TRUE, then contractions that are expressed as
     * prefix/pre-context rules are included.
     * @param contractions if not NULL, the set to hold the contractions
     * @param expansions if not NULL, the set to hold the expansions
     * @param addPrefixes include prefix contextual mappings
     * @param errorCode in/out ICU error code
     * @internal
     */
    void internalGetContractionsAndExpansions(
            UnicodeSet *contractions, UnicodeSet *expansions,
            UBool addPrefixes, UErrorCode &errorCode) const;
760 
    /**
     * Adds the contractions that start with character c to the set.
     * Ignores prefixes. Used by AlphabeticIndex.
     * @param c the character with which the contractions must start
     * @param set the set that receives the contractions
     * @param errorCode ICU error code
     * @internal
     */
    void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
767 
    /**
     * Builds this collator's tailoring from a rule string.
     * Implements from-rule constructors, and ucol_openRules().
     * NOTE(review): the parameter semantics below are inferred from the
     * signature only -- confirm against ucol_openRules() and the implementation.
     * @param rules the collation rule string
     * @param strength the collation strength, passed as an int32_t
     * @param decompositionMode the decomposition (normalization) mode
     * @param outParseError output parameter for parse error details;
     *        presumably may be NULL
     * @param outReason output parameter for an error reason string;
     *        presumably may be NULL
     * @param errorCode ICU error code
     * @internal
     */
    void internalBuildTailoring(
            const UnicodeString &rules,
            int32_t strength,
            UColAttributeValue decompositionMode,
            UParseError *outParseError, UnicodeString *outReason,
            UErrorCode &errorCode);
778 
779     /** @internal */
rbcFromUCollator(UCollator * uc)780     static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
781         return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc));
782     }
783     /** @internal */
rbcFromUCollator(const UCollator * uc)784     static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
785         return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc));
786     }
787 
    /**
     * Appends the CEs (collation elements) for the string to the vector.
     * @param str the input string
     * @param ces the vector to which the CEs are appended
     * @param errorCode ICU error code
     * @internal for tests & tools
     */
    void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
793 #endif  // U_HIDE_INTERNAL_API
794 
795 protected:
   /**
    * Sets the requested, valid and actual locales of this collator.
    * Used internally by registration to define the requested and valid locales.
    * @param requestedLocale the requested locale
    * @param validLocale the valid locale
    * @param actualLocale the actual locale
    * @internal
    */
    virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale);
804 
805 private:
806     friend class CollationElementIterator;
807     friend class Collator;
808 
    // Private constructor: creates a collator from a collation cache entry.
    // NOTE(review): the cacheEntry member is documented as reference-counted,
    // so this presumably takes a reference on entry -- confirm in the implementation.
    RuleBasedCollator(const CollationCacheEntry *entry);
810 
    /**
     * Enumeration of attributes that are relevant for short definition strings
     * (e.g., ucol_getShortDefinitionString()).
     * Effectively extends UColAttribute: the values start at UCOL_ATTRIBUTE_COUNT,
     * directly after the regular attribute range.
     */
    enum Attributes {
        // First value past the regular UColAttribute values.
        ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT,
        // One past the last attribute; exclusive upper bound for attribute indexes.
        ATTR_LIMIT
    };
820 
    // Installs the tailoring t in this collator.
    // NOTE(review): "adopt" suggests that ownership of t is transferred to this
    // object, even on failure -- confirm in the implementation.
    void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode);
822 
    // Compares two strings and returns the result as a UCollationResult.
    // There is one overload for UChar strings and one for uint8_t strings
    // (NOTE(review): presumably UTF-16 and UTF-8 respectively -- confirm).
    // Both lengths must be <0 or else both must be >=0.
    UCollationResult doCompare(const UChar *left, int32_t leftLength,
                               const UChar *right, int32_t rightLength,
                               UErrorCode &errorCode) const;
    UCollationResult doCompare(const uint8_t *left, int32_t leftLength,
                               const uint8_t *right, int32_t rightLength,
                               UErrorCode &errorCode) const;
830 
    // Writes the sort key for the string s of the given length into sink.
    // NOTE(review): whether a negative length means NUL-terminated input is
    // not visible here -- confirm in the implementation.
    void writeSortKey(const UChar *s, int32_t length,
                      SortKeyByteSink &sink, UErrorCode &errorCode) const;

    // Writes the identical-level portion of the sort key for the text in the
    // range [s, limit) into sink.
    // NOTE(review): "limit" is presumably an exclusive end pointer -- confirm.
    void writeIdenticalLevel(const UChar *s, const UChar *limit,
                             SortKeyByteSink &sink, UErrorCode &errorCode) const;
836 
    // Returns a reference to the default CollationSettings for this collator.
    // NOTE(review): presumably the settings that came with the tailoring, as
    // opposed to the possibly modified `settings` member -- confirm.
    const CollationSettings &getDefaultSettings() const;
838 
setAttributeDefault(int32_t attribute)839     void setAttributeDefault(int32_t attribute) {
840         explicitlySetAttributes &= ~((uint32_t)1 << attribute);
841     }
setAttributeExplicitly(int32_t attribute)842     void setAttributeExplicitly(int32_t attribute) {
843         explicitlySetAttributes |= (uint32_t)1 << attribute;
844     }
attributeHasBeenSetExplicitly(int32_t attribute)845     UBool attributeHasBeenSetExplicitly(int32_t attribute) const {
846         // assert(0 <= attribute < ATTR_LIMIT);
847         return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0);
848     }
849 
    /**
     * Tests whether a character is "unsafe" for use as a collation starting point.
     * The argument may be either a code point or a single code unit.
     *
     * @param c code point or code unit
     * @return TRUE if c is unsafe
     * @see CollationElementIterator#setOffset(int)
     */
    UBool isUnsafe(UChar32 c) const;
858 
    // Computes the maximum-expansions data for the tailoring t.
    // NOTE(review): static with U_CALLCONV, so presumably used as a C-style
    // callback (e.g. for one-time initialization) -- confirm at the call site.
    static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode);
    // Makes sure that the maximum-expansions data is available.
    // NOTE(review): presumably returns TRUE on success -- confirm.
    UBool initMaxExpansions(UErrorCode &errorCode) const;
861 
    // Updates the fast-Latin options stored in ownedSettings.
    // NOTE(review): the parameter name suggests the caller must pass a settings
    // object that it owns and may modify -- confirm in the implementation.
    void setFastLatinOptions(CollationSettings &ownedSettings) const;
863 
    // Collation data used by this collator.
    // NOTE(review): ownership is not visible here -- presumably owned via the tailoring; confirm.
    const CollationData *data;
    // Settings currently in effect for this collator.
    const CollationSettings *settings;  // reference-counted
    const CollationTailoring *tailoring;  // alias of cacheEntry->tailoring
    const CollationCacheEntry *cacheEntry;  // reference-counted
    // The valid locale of this collator.
    Locale validLocale;
    // Bit set indexed by attribute number: bit i is 1 when attribute i has been
    // set explicitly (see setAttributeExplicitly() and setAttributeDefault()).
    uint32_t explicitlySetAttributes;

    // NOTE(review): presumably TRUE when the actual locale equals validLocale,
    // so that no separate actual locale needs to be stored -- confirm.
    UBool actualLocaleIsSameAsValid;
872 };
873 
874 U_NAMESPACE_END
875 
876 #endif  // !UCONFIG_NO_COLLATION
877 #endif  // TBLCOLL_H
878