1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationsettings.h
7 *
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
10 */
11 
12 #ifndef __COLLATIONSETTINGS_H__
13 #define __COLLATIONSETTINGS_H__
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_COLLATION
18 
19 #include "unicode/ucol.h"
20 #include "collation.h"
21 #include "sharedobject.h"
22 #include "umutex.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 struct CollationData;
27 
28 /**
29  * Collation settings/options/attributes.
30  * These are the values that can be changed via API.
31  */
32 struct U_I18N_API CollationSettings : public SharedObject {
33     /**
34      * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
35      */
36     static const int32_t CHECK_FCD = 1;
37     /**
38      * Options bit 1: Numeric collation.
39      * Also known as CODAN = COllate Digits As Numbers.
40      *
41      * Treat digit sequences as numbers with CE sequences in numeric order,
42      * rather than returning a normal CE for each digit.
43      */
44     static const int32_t NUMERIC = 2;
45     /**
46      * "Shifted" alternate handling, see ALTERNATE_MASK.
47      */
48     static const int32_t SHIFTED = 4;
49     /**
50      * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
51      * Reserve values 8 and 0xc for shift-trimmed and blanked.
52      */
53     static const int32_t ALTERNATE_MASK = 0xc;
54     /**
55      * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
56      */
57     static const int32_t MAX_VARIABLE_SHIFT = 4;
58     /** maxVariable options bit mask before shifting. */
59     static const int32_t MAX_VARIABLE_MASK = 0x70;
60     /** Options bit 7: Reserved/unused/0. */
61     /**
62      * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
63      */
64     static const int32_t UPPER_FIRST = 0x100;
65     /**
66      * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
67      * unless case level is on (when they are *moved* into the separate case level).
68      * By default, the case bits are removed from the tertiary weight (ignored).
69      *
70      * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
71      * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
72      */
73     static const int32_t CASE_FIRST = 0x200;
74     /**
75      * Options bit mask for caseFirst and upperFirst, before shifting.
76      * Same value as caseFirst==upperFirst.
77      */
78     static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
79     /**
80      * Options bit 10: Insert the case level between the secondary and tertiary levels.
81      */
82     static const int32_t CASE_LEVEL = 0x400;
83     /**
84      * Options bit 11: Compare secondary weights backwards. ("French secondary")
85      */
86     static const int32_t BACKWARD_SECONDARY = 0x800;
87     /**
88      * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
89      * It is the top used bit field in the options. (No need to mask after shifting.)
90      */
91     static const int32_t STRENGTH_SHIFT = 12;
92     /** Strength options bit mask before shifting. */
93     static const int32_t STRENGTH_MASK = 0xf000;
94 
95     /** maxVariable values */
96     enum MaxVariable {
97         MAX_VAR_SPACE,
98         MAX_VAR_PUNCT,
99         MAX_VAR_SYMBOL,
100         MAX_VAR_CURRENCY
101     };
102 
CollationSettingsCollationSettings103     CollationSettings()
104             : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
105                       (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
106               variableTop(0),
107               reorderTable(NULL),
108               minHighNoReorder(0),
109               reorderRanges(NULL), reorderRangesLength(0),
110               reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
111               fastLatinOptions(-1) {}
112 
113     CollationSettings(const CollationSettings &other);
114     virtual ~CollationSettings();
115 
116     UBool operator==(const CollationSettings &other) const;
117 
118     inline UBool operator!=(const CollationSettings &other) const {
119         return !operator==(other);
120     }
121 
122     int32_t hashCode() const;
123 
124     void resetReordering();
125     void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
126                          const uint32_t *ranges, int32_t rangesLength,
127                          const uint8_t *table, UErrorCode &errorCode);
128     void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
129                        UErrorCode &errorCode);
130     void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
131 
hasReorderingCollationSettings132     inline UBool hasReordering() const { return reorderTable != NULL; }
133     static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
reorderCollationSettings134     inline uint32_t reorder(uint32_t p) const {
135         uint8_t b = reorderTable[p >> 24];
136         if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
137             return ((uint32_t)b << 24) | (p & 0xffffff);
138         } else {
139             return reorderEx(p);
140         }
141     }
142 
143     void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
144 
getStrengthCollationSettings145     static int32_t getStrength(int32_t options) {
146         return options >> STRENGTH_SHIFT;
147     }
148 
getStrengthCollationSettings149     int32_t getStrength() const {
150         return getStrength(options);
151     }
152 
153     /** Sets the options bit for an on/off attribute. */
154     void setFlag(int32_t bit, UColAttributeValue value,
155                  int32_t defaultOptions, UErrorCode &errorCode);
156 
getFlagCollationSettings157     UColAttributeValue getFlag(int32_t bit) const {
158         return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
159     }
160 
161     void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
162 
getCaseFirstCollationSettings163     UColAttributeValue getCaseFirst() const {
164         int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
165         return (option == 0) ? UCOL_OFF :
166                 (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
167     }
168 
169     void setAlternateHandling(UColAttributeValue value,
170                               int32_t defaultOptions, UErrorCode &errorCode);
171 
getAlternateHandlingCollationSettings172     UColAttributeValue getAlternateHandling() const {
173         return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
174     }
175 
176     void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
177 
getMaxVariableCollationSettings178     MaxVariable getMaxVariable() const {
179         return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
180     }
181 
182     /**
183      * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
184      */
isTertiaryWithCaseBitsCollationSettings185     static inline UBool isTertiaryWithCaseBits(int32_t options) {
186         return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
187     }
getTertiaryMaskCollationSettings188     static uint32_t getTertiaryMask(int32_t options) {
189         // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
190         return isTertiaryWithCaseBits(options) ?
191                 Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
192     }
193 
sortsTertiaryUpperCaseFirstCollationSettings194     static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
195         // On tertiary level, consider case bits and sort uppercase first
196         // if caseLevel is off and caseFirst==upperFirst.
197         return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
198     }
199 
dontCheckFCDCollationSettings200     inline UBool dontCheckFCD() const {
201         return (options & CHECK_FCD) == 0;
202     }
203 
hasBackwardSecondaryCollationSettings204     inline UBool hasBackwardSecondary() const {
205         return (options & BACKWARD_SECONDARY) != 0;
206     }
207 
isNumericCollationSettings208     inline UBool isNumeric() const {
209         return (options & NUMERIC) != 0;
210     }
211 
212     /** CHECK_FCD etc. */
213     int32_t options;
214     /** Variable-top primary weight. */
215     uint32_t variableTop;
216     /**
217      * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
218      * A 0 entry at a non-zero index means that the primary lead byte is "split"
219      * (there are different offsets for primaries that share that lead byte)
220      * and the reordering offset must be determined via the reorderRanges.
221      */
222     const uint8_t *reorderTable;
223     /** Limit of last reordered range. 0 if no reordering or no split bytes. */
224     uint32_t minHighNoReorder;
225     /**
226      * Primary-weight ranges for script reordering,
227      * to be used by reorder(p) for split-reordered primary lead bytes.
228      *
229      * Each entry is a (limit, offset) pair.
230      * The upper 16 bits of the entry are the upper 16 bits of the
231      * exclusive primary limit of a range.
232      * Primaries between the previous limit and this one have their lead bytes
233      * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
234      *
235      * CollationData::makeReorderRanges() writes a full list where the first range
236      * (at least for terminators and separators) has a 0 offset.
237      * The last range has a non-zero offset.
238      * minHighNoReorder is set to the limit of that last range.
239      *
240      * In the settings object, the initial ranges before the first split lead byte
241      * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
242      * If there are no split-reordered lead bytes, then no ranges are needed.
243      */
244     const uint32_t *reorderRanges;
245     int32_t reorderRangesLength;
246     /** Array of reorder codes; ignored if reorderCodesLength == 0. */
247     const int32_t *reorderCodes;
248     /** Number of reorder codes; 0 if no reordering. */
249     int32_t reorderCodesLength;
250     /**
251      * Capacity of reorderCodes.
252      * If 0, then the codes, the ranges, and the table are aliases.
253      * Otherwise, this object owns the memory via the reorderCodes pointer;
254      * the codes, the ranges, and the table are in the same memory block, in that order.
255      */
256     int32_t reorderCodesCapacity;
257 
258     /** Options for CollationFastLatin. Negative if disabled. */
259     int32_t fastLatinOptions;
260     uint16_t fastLatinPrimaries[0x180];
261 
262 private:
263     void setReorderArrays(const int32_t *codes, int32_t codesLength,
264                           const uint32_t *ranges, int32_t rangesLength,
265                           const uint8_t *table, UErrorCode &errorCode);
266     uint32_t reorderEx(uint32_t p) const;
267 };
268 
269 U_NAMESPACE_END
270 
271 #endif  // !UCONFIG_NO_COLLATION
272 #endif  // __COLLATIONSETTINGS_H__
273