// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdata.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

14 #ifndef __COLLATIONDATA_H__
15 #define __COLLATIONDATA_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/ucol.h"
22 #include "unicode/uniset.h"
23 #include "collation.h"
24 #include "normalizer2impl.h"
25 #include "utrie2.h"
26 
27 struct UDataMemory;
28 
29 U_NAMESPACE_BEGIN
30 
31 class UVector32;
32 
33 /**
34  * Collation data container.
35  * Immutable data created by a CollationDataBuilder, or loaded from a file,
36  * or deserialized from API-provided binary data.
37  *
38  * Includes data for the collation base (root/default), aliased if this is not the base.
39  */
40 struct U_I18N_API CollationData : public UMemory {
41     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
42     // parallel with the ranges, and resetting ranges that are indexed.
43     // The reordering builder code could clone the resulting template array.
44     enum {
45         REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
46         REORDER_RESERVED_AFTER_LATIN
47     };
48 
49     enum {
50         MAX_NUM_SPECIAL_REORDER_CODES = 8,
51         /** C++ only, data reader check scriptStartsLength. */
52         MAX_NUM_SCRIPT_RANGES = 256
53     };
54 
CollationDataCollationData55     CollationData(const Normalizer2Impl &nfc)
56             : trie(NULL),
57               ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
58               jamoCE32s(NULL),
59               nfcImpl(nfc),
60               numericPrimary(0x12000000),
61               ce32sLength(0), cesLength(0), contextsLength(0),
62               compressibleBytes(NULL),
63               unsafeBackwardSet(NULL),
64               fastLatinTable(NULL), fastLatinTableLength(0),
65               numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
66               rootElements(NULL), rootElementsLength(0) {}
67 
getCE32CollationData68     uint32_t getCE32(UChar32 c) const {
69         return UTRIE2_GET32(trie, c);
70     }
71 
getCE32FromSupplementaryCollationData72     uint32_t getCE32FromSupplementary(UChar32 c) const {
73         return UTRIE2_GET32_FROM_SUPP(trie, c);
74     }
75 
isDigitCollationData76     UBool isDigit(UChar32 c) const {
77         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
78                 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
79     }
80 
isUnsafeBackwardCollationData81     UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
82         return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
83     }
84 
isCompressibleLeadByteCollationData85     UBool isCompressibleLeadByte(uint32_t b) const {
86         return compressibleBytes[b];
87     }
88 
isCompressiblePrimaryCollationData89     inline UBool isCompressiblePrimary(uint32_t p) const {
90         return isCompressibleLeadByte(p >> 24);
91     }
92 
93     /**
94      * Returns the CE32 from two contexts words.
95      * Access to the defaultCE32 for contraction and prefix matching.
96      */
readCE32CollationData97     static uint32_t readCE32(const UChar *p) {
98         return ((uint32_t)p[0] << 16) | p[1];
99     }
100 
101     /**
102      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
103      * Requires that ce32 is special.
104      */
105     uint32_t getIndirectCE32(uint32_t ce32) const;
106     /**
107      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
108      * if ce32 is special.
109      */
110     uint32_t getFinalCE32(uint32_t ce32) const;
111 
112     /**
113      * Computes a CE from c's ce32 which has the OFFSET_TAG.
114      */
getCEFromOffsetCE32CollationData115     int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
116         int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
117         return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
118     }
119 
120     /**
121      * Returns the single CE that c maps to.
122      * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
123      */
124     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
125 
126     /**
127      * Returns the FCD16 value for code point c. c must be >= 0.
128      */
getFCD16CollationData129     uint16_t getFCD16(UChar32 c) const {
130         return nfcImpl.getFCD16(c);
131     }
132 
133     /**
134      * Returns the first primary for the script's reordering group.
135      * @return the primary with only the first primary lead byte of the group
136      *         (not necessarily an actual root collator primary weight),
137      *         or 0 if the script is unknown
138      */
139     uint32_t getFirstPrimaryForGroup(int32_t script) const;
140 
141     /**
142      * Returns the last primary for the script's reordering group.
143      * @return the last primary of the group
144      *         (not an actual root collator primary weight),
145      *         or 0 if the script is unknown
146      */
147     uint32_t getLastPrimaryForGroup(int32_t script) const;
148 
149     /**
150      * Finds the reordering group which contains the primary weight.
151      * @return the first script of the group, or -1 if the weight is beyond the last group
152      */
153     int32_t getGroupForPrimary(uint32_t p) const;
154 
155     int32_t getEquivalentScripts(int32_t script,
156                                  int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
157 
158     /**
159      * Writes the permutation of primary-weight ranges
160      * for the given reordering of scripts and groups.
161      * The caller checks for illegal arguments and
162      * takes care of [DEFAULT] and memory allocation.
163      *
164      * Each list element will be a (limit, offset) pair as described
165      * for the CollationSettings::reorderRanges.
166      * The list will be empty if no ranges are reordered.
167      */
168     void makeReorderRanges(const int32_t *reorder, int32_t length,
169                            UVector32 &ranges, UErrorCode &errorCode) const;
170 
171     /** @see jamoCE32s */
172     static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
173 
174     /** Main lookup trie. */
175     const UTrie2 *trie;
176     /**
177      * Array of CE32 values.
178      * At index 0 there must be CE32(U+0000)
179      * to support U+0000's special-tag for NUL-termination handling.
180      */
181     const uint32_t *ce32s;
182     /** Array of CE values for expansions and OFFSET_TAG. */
183     const int64_t *ces;
184     /** Array of prefix and contraction-suffix matching data. */
185     const UChar *contexts;
186     /** Base collation data, or NULL if this data itself is a base. */
187     const CollationData *base;
188     /**
189      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
190      * They are normally simple CE32s, rarely expansions.
191      * For fast handling of HANGUL_TAG.
192      */
193     const uint32_t *jamoCE32s;
194     const Normalizer2Impl &nfcImpl;
195     /** The single-byte primary weight (xx000000) for numeric collation. */
196     uint32_t numericPrimary;
197 
198     int32_t ce32sLength;
199     int32_t cesLength;
200     int32_t contextsLength;
201 
202     /** 256 flags for which primary-weight lead bytes are compressible. */
203     const UBool *compressibleBytes;
204     /**
205      * Set of code points that are unsafe for starting string comparison after an identical prefix,
206      * or in backwards CE iteration.
207      */
208     const UnicodeSet *unsafeBackwardSet;
209 
210     /**
211      * Fast Latin table for common-Latin-text string comparisons.
212      * Data structure see class CollationFastLatin.
213      */
214     const uint16_t *fastLatinTable;
215     int32_t fastLatinTableLength;
216 
217     /**
218      * Data for scripts and reordering groups.
219      * Uses include building a reordering permutation table and
220      * providing script boundaries to AlphabeticIndex.
221      */
222     int32_t numScripts;
223     /**
224      * The length of scriptsIndex is numScripts+16.
225      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
226      * 16 special reorder codes (not all used) are mapped starting at numScripts.
227      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
228      * There are special codes at the end for reorder-reserved primary ranges.
229      *
230      * Multiple scripts may share a range and index, for example Hira & Kana.
231      */
232     const uint16_t *scriptsIndex;
233     /**
234      * Start primary weight (top 16 bits only) for a group/script/reserved range
235      * indexed by scriptsIndex.
236      * The first range (separators & terminators) and the last range (trailing weights)
237      * are not reorderable, and no scriptsIndex entry points to them.
238      */
239     const uint16_t *scriptStarts;
240     int32_t scriptStartsLength;
241 
242     /**
243      * Collation elements in the root collator.
244      * Used by the CollationRootElements class. The data structure is described there.
245      * NULL in a tailoring.
246      */
247     const uint32_t *rootElements;
248     int32_t rootElementsLength;
249 
250 private:
251     int32_t getScriptIndex(int32_t script) const;
252     void makeReorderRanges(const int32_t *reorder, int32_t length,
253                            UBool latinMustMove,
254                            UVector32 &ranges, UErrorCode &errorCode) const;
255     int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
256     int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
257 };
258 
259 U_NAMESPACE_END
260 
261 #endif  // !UCONFIG_NO_COLLATION
262 #endif  // __COLLATIONDATA_H__
263