1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationsettings.h
9 *
10 * created on: 2013feb07
11 * created by: Markus W. Scherer
12 */
13 
14 #ifndef __COLLATIONSETTINGS_H__
15 #define __COLLATIONSETTINGS_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/ucol.h"
22 #include "collation.h"
23 #include "sharedobject.h"
24 #include "umutex.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 struct CollationData;
29 
30 /**
31  * Collation settings/options/attributes.
32  * These are the values that can be changed via API.
33  */
34 struct U_I18N_API CollationSettings : public SharedObject {
35     /**
36      * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
37      */
38     static const int32_t CHECK_FCD = 1;
39     /**
40      * Options bit 1: Numeric collation.
41      * Also known as CODAN = COllate Digits As Numbers.
42      *
43      * Treat digit sequences as numbers with CE sequences in numeric order,
44      * rather than returning a normal CE for each digit.
45      */
46     static const int32_t NUMERIC = 2;
47     /**
48      * "Shifted" alternate handling, see ALTERNATE_MASK.
49      */
50     static const int32_t SHIFTED = 4;
51     /**
52      * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
53      * Reserve values 8 and 0xc for shift-trimmed and blanked.
54      */
55     static const int32_t ALTERNATE_MASK = 0xc;
56     /**
57      * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
58      */
59     static const int32_t MAX_VARIABLE_SHIFT = 4;
60     /** maxVariable options bit mask before shifting. */
61     static const int32_t MAX_VARIABLE_MASK = 0x70;
62     /** Options bit 7: Reserved/unused/0. */
63     /**
64      * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
65      */
66     static const int32_t UPPER_FIRST = 0x100;
67     /**
68      * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
69      * unless case level is on (when they are *moved* into the separate case level).
70      * By default, the case bits are removed from the tertiary weight (ignored).
71      *
72      * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
73      * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
74      */
75     static const int32_t CASE_FIRST = 0x200;
76     /**
77      * Options bit mask for caseFirst and upperFirst, before shifting.
78      * Same value as caseFirst==upperFirst.
79      */
80     static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
81     /**
82      * Options bit 10: Insert the case level between the secondary and tertiary levels.
83      */
84     static const int32_t CASE_LEVEL = 0x400;
85     /**
86      * Options bit 11: Compare secondary weights backwards. ("French secondary")
87      */
88     static const int32_t BACKWARD_SECONDARY = 0x800;
89     /**
90      * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
91      * It is the top used bit field in the options. (No need to mask after shifting.)
92      */
93     static const int32_t STRENGTH_SHIFT = 12;
94     /** Strength options bit mask before shifting. */
95     static const int32_t STRENGTH_MASK = 0xf000;
96 
97     /** maxVariable values */
98     enum MaxVariable {
99         MAX_VAR_SPACE,
100         MAX_VAR_PUNCT,
101         MAX_VAR_SYMBOL,
102         MAX_VAR_CURRENCY
103     };
104 
CollationSettingsCollationSettings105     CollationSettings()
106             : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
107                       (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
108               variableTop(0),
109               reorderTable(NULL),
110               minHighNoReorder(0),
111               reorderRanges(NULL), reorderRangesLength(0),
112               reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
113               fastLatinOptions(-1) {}
114 
115     CollationSettings(const CollationSettings &other);
116     virtual ~CollationSettings();
117 
118     UBool operator==(const CollationSettings &other) const;
119 
120     inline UBool operator!=(const CollationSettings &other) const {
121         return !operator==(other);
122     }
123 
124     int32_t hashCode() const;
125 
126     void resetReordering();
127     void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
128                          const uint32_t *ranges, int32_t rangesLength,
129                          const uint8_t *table, UErrorCode &errorCode);
130     void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
131                        UErrorCode &errorCode);
132     void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
133 
hasReorderingCollationSettings134     inline UBool hasReordering() const { return reorderTable != NULL; }
135     static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
reorderCollationSettings136     inline uint32_t reorder(uint32_t p) const {
137         uint8_t b = reorderTable[p >> 24];
138         if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
139             return ((uint32_t)b << 24) | (p & 0xffffff);
140         } else {
141             return reorderEx(p);
142         }
143     }
144 
145     void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
146 
getStrengthCollationSettings147     static int32_t getStrength(int32_t options) {
148         return options >> STRENGTH_SHIFT;
149     }
150 
getStrengthCollationSettings151     int32_t getStrength() const {
152         return getStrength(options);
153     }
154 
155     /** Sets the options bit for an on/off attribute. */
156     void setFlag(int32_t bit, UColAttributeValue value,
157                  int32_t defaultOptions, UErrorCode &errorCode);
158 
getFlagCollationSettings159     UColAttributeValue getFlag(int32_t bit) const {
160         return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
161     }
162 
163     void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
164 
getCaseFirstCollationSettings165     UColAttributeValue getCaseFirst() const {
166         int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
167         return (option == 0) ? UCOL_OFF :
168                 (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
169     }
170 
171     void setAlternateHandling(UColAttributeValue value,
172                               int32_t defaultOptions, UErrorCode &errorCode);
173 
getAlternateHandlingCollationSettings174     UColAttributeValue getAlternateHandling() const {
175         return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
176     }
177 
178     void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
179 
getMaxVariableCollationSettings180     MaxVariable getMaxVariable() const {
181         return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
182     }
183 
184     /**
185      * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
186      */
isTertiaryWithCaseBitsCollationSettings187     static inline UBool isTertiaryWithCaseBits(int32_t options) {
188         return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
189     }
getTertiaryMaskCollationSettings190     static uint32_t getTertiaryMask(int32_t options) {
191         // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
192         return isTertiaryWithCaseBits(options) ?
193                 Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
194     }
195 
sortsTertiaryUpperCaseFirstCollationSettings196     static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
197         // On tertiary level, consider case bits and sort uppercase first
198         // if caseLevel is off and caseFirst==upperFirst.
199         return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
200     }
201 
dontCheckFCDCollationSettings202     inline UBool dontCheckFCD() const {
203         return (options & CHECK_FCD) == 0;
204     }
205 
hasBackwardSecondaryCollationSettings206     inline UBool hasBackwardSecondary() const {
207         return (options & BACKWARD_SECONDARY) != 0;
208     }
209 
isNumericCollationSettings210     inline UBool isNumeric() const {
211         return (options & NUMERIC) != 0;
212     }
213 
214     /** CHECK_FCD etc. */
215     int32_t options;
216     /** Variable-top primary weight. */
217     uint32_t variableTop;
218     /**
219      * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
220      * A 0 entry at a non-zero index means that the primary lead byte is "split"
221      * (there are different offsets for primaries that share that lead byte)
222      * and the reordering offset must be determined via the reorderRanges.
223      */
224     const uint8_t *reorderTable;
225     /** Limit of last reordered range. 0 if no reordering or no split bytes. */
226     uint32_t minHighNoReorder;
227     /**
228      * Primary-weight ranges for script reordering,
229      * to be used by reorder(p) for split-reordered primary lead bytes.
230      *
231      * Each entry is a (limit, offset) pair.
232      * The upper 16 bits of the entry are the upper 16 bits of the
233      * exclusive primary limit of a range.
234      * Primaries between the previous limit and this one have their lead bytes
235      * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
236      *
237      * CollationData::makeReorderRanges() writes a full list where the first range
238      * (at least for terminators and separators) has a 0 offset.
239      * The last range has a non-zero offset.
240      * minHighNoReorder is set to the limit of that last range.
241      *
242      * In the settings object, the initial ranges before the first split lead byte
243      * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
244      * If there are no split-reordered lead bytes, then no ranges are needed.
245      */
246     const uint32_t *reorderRanges;
247     int32_t reorderRangesLength;
248     /** Array of reorder codes; ignored if reorderCodesLength == 0. */
249     const int32_t *reorderCodes;
250     /** Number of reorder codes; 0 if no reordering. */
251     int32_t reorderCodesLength;
252     /**
253      * Capacity of reorderCodes.
254      * If 0, then the codes, the ranges, and the table are aliases.
255      * Otherwise, this object owns the memory via the reorderCodes pointer;
256      * the codes, the ranges, and the table are in the same memory block, in that order.
257      */
258     int32_t reorderCodesCapacity;
259 
260     /** Options for CollationFastLatin. Negative if disabled. */
261     int32_t fastLatinOptions;
262     uint16_t fastLatinPrimaries[0x180];
263 
264 private:
265     void setReorderArrays(const int32_t *codes, int32_t codesLength,
266                           const uint32_t *ranges, int32_t rangesLength,
267                           const uint8_t *table, UErrorCode &errorCode);
268     uint32_t reorderEx(uint32_t p) const;
269 };
270 
271 U_NAMESPACE_END
272 
273 #endif  // !UCONFIG_NO_COLLATION
274 #endif  // __COLLATIONSETTINGS_H__
275