1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationdatabuilder.h
7 *
8 * created on: 2012apr01
9 * created by: Markus W. Scherer
10 */
11 
12 #ifndef __COLLATIONDATABUILDER_H__
13 #define __COLLATIONDATABUILDER_H__
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_COLLATION
18 
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/uversion.h"
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "collationsettings.h"
25 #include "normalizer2impl.h"
26 #include "utrie2.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "uvector.h"
30 
31 U_NAMESPACE_BEGIN
32 
33 struct ConditionalCE32;
34 
35 class CollationFastLatinBuilder;
36 class CopyHelper;
37 class DataBuilderCollationIterator;
38 class UCharsTrieBuilder;
39 
40 /**
41  * Low-level CollationData builder.
42  * Takes (character, CE) pairs and builds them into runtime data structures.
43  * Supports characters with context prefixes and contraction suffixes.
44  */
45 class U_I18N_API CollationDataBuilder : public UObject {
46 public:
47     /**
48      * Collation element modifier. Interface class for a modifier
49      * that changes a tailoring builder's temporary CEs to final CEs.
50      * Called for every non-special CE32 and every expansion CE.
51      */
52     class CEModifier : public UObject {
53     public:
54         virtual ~CEModifier();
55         /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
56         virtual int64_t modifyCE32(uint32_t ce32) const = 0;
57         /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
58         virtual int64_t modifyCE(int64_t ce) const = 0;
59     };
60 
61     CollationDataBuilder(UErrorCode &errorCode);
62 
63     virtual ~CollationDataBuilder();
64 
65     void initForTailoring(const CollationData *b, UErrorCode &errorCode);
66 
67     virtual UBool isCompressibleLeadByte(uint32_t b) const;
68 
isCompressiblePrimary(uint32_t p)69     inline UBool isCompressiblePrimary(uint32_t p) const {
70         return isCompressibleLeadByte(p >> 24);
71     }
72 
73     /**
74      * @return TRUE if this builder has mappings (e.g., add() has been called)
75      */
hasMappings()76     UBool hasMappings() const { return modified; }
77 
78     /**
79      * @return TRUE if c has CEs in this builder
80      */
81     UBool isAssigned(UChar32 c) const;
82 
83     /**
84      * @return the three-byte primary if c maps to a single such CE and has no context data,
85      * otherwise returns 0.
86      */
87     uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
88 
89     /**
90      * @return the single CE for c.
91      * Sets an error code if c does not have a single CE.
92      */
93     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
94 
95     void add(const UnicodeString &prefix, const UnicodeString &s,
96              const int64_t ces[], int32_t cesLength,
97              UErrorCode &errorCode);
98 
99     /**
100      * Encodes the ces as either the returned ce32 by itself,
101      * or by storing an expansion, with the returned ce32 referring to that.
102      *
103      * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
104      */
105     virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
106     void addCE32(const UnicodeString &prefix, const UnicodeString &s,
107                  uint32_t ce32, UErrorCode &errorCode);
108 
109     /**
110      * Sets three-byte-primary CEs for a range of code points in code point order,
111      * if it is worth doing; otherwise no change is made.
112      * None of the code points in the range should have complex mappings so far
113      * (expansions/contractions/prefixes).
114      * @param start first code point
115      * @param end last code point (inclusive)
116      * @param primary primary weight for 'start'
117      * @param step per-code point primary-weight increment
118      * @param errorCode ICU in/out error code
119      * @return TRUE if an OFFSET_TAG range was used for start..end
120      */
121     UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
122                                uint32_t primary, int32_t step,
123                                UErrorCode &errorCode);
124 
125     /**
126      * Sets three-byte-primary CEs for a range of code points in code point order.
127      * Sets range values if that is worth doing, or else individual values.
128      * None of the code points in the range should have complex mappings so far
129      * (expansions/contractions/prefixes).
130      * @param start first code point
131      * @param end last code point (inclusive)
132      * @param primary primary weight for 'start'
133      * @param step per-code point primary-weight increment
134      * @param errorCode ICU in/out error code
135      * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
136      */
137     uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
138                                           uint32_t primary, int32_t step,
139                                           UErrorCode &errorCode);
140 
141     /**
142      * Copies all mappings from the src builder, with modifications.
143      * This builder here must not be built yet, and should be empty.
144      */
145     void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
146                   UErrorCode &errorCode);
147 
148     void optimize(const UnicodeSet &set, UErrorCode &errorCode);
149     void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
150 
enableFastLatin()151     void enableFastLatin() { fastLatinEnabled = TRUE; }
152     virtual void build(CollationData &data, UErrorCode &errorCode);
153 
154     /**
155      * Looks up CEs for s and appends them to the ces array.
156      * Does not handle normalization: s should be in FCD form.
157      *
158      * Does not write completely ignorable CEs.
159      * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
160      *
161      * @return incremented cesLength
162      */
163     int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
164     int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
165                    int64_t ces[], int32_t cesLength);
166 
167 protected:
168     friend class CopyHelper;
169     friend class DataBuilderCollationIterator;
170 
171     uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
172 
173     int32_t addCE(int64_t ce, UErrorCode &errorCode);
174     int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
175     int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
176 
getConditionalCE32(int32_t index)177     inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
178         return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
179     }
getConditionalCE32ForCE32(uint32_t ce32)180     inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
181         return getConditionalCE32(Collation::indexFromCE32(ce32));
182     }
183 
makeBuilderContextCE32(int32_t index)184     static uint32_t makeBuilderContextCE32(int32_t index) {
185         return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
186     }
isBuilderContextCE32(uint32_t ce32)187     static inline UBool isBuilderContextCE32(uint32_t ce32) {
188         return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
189     }
190 
191     static uint32_t encodeOneCEAsCE32(int64_t ce);
192     uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
193     uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
194     uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
195 
196     uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
197     /**
198      * Copies base contractions to a list of ConditionalCE32.
199      * Sets cond->next to the index of the first new item
200      * and returns the index of the last new item.
201      */
202     int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
203                                          ConditionalCE32 *cond, UErrorCode &errorCode);
204 
205     UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
206     void setDigitTags(UErrorCode &errorCode);
207     void setLeadSurrogates(UErrorCode &errorCode);
208 
209     void buildMappings(CollationData &data, UErrorCode &errorCode);
210 
211     void clearContexts();
212     void buildContexts(UErrorCode &errorCode);
213     uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
214     int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
215                            UErrorCode &errorCode);
216 
217     void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
218 
219     int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
220 
jamoCpFromIndex(int32_t i)221     static UChar32 jamoCpFromIndex(int32_t i) {
222         // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
223         if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
224         i -= Hangul::JAMO_L_COUNT;
225         if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
226         i -= Hangul::JAMO_V_COUNT;
227         // i < 27
228         return Hangul::JAMO_T_BASE + 1 + i;
229     }
230 
231     /** @see Collation::BUILDER_DATA_TAG */
232     static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
233 
234     const Normalizer2Impl &nfcImpl;
235     const CollationData *base;
236     const CollationSettings *baseSettings;
237     UTrie2 *trie;
238     UVector32 ce32s;
239     UVector64 ce64s;
240     UVector conditionalCE32s;  // vector of ConditionalCE32
241     // Characters that have context (prefixes or contraction suffixes).
242     UnicodeSet contextChars;
243     // Serialized UCharsTrie structures for finalized contexts.
244     UnicodeString contexts;
245     UnicodeSet unsafeBackwardSet;
246     UBool modified;
247 
248     UBool fastLatinEnabled;
249     CollationFastLatinBuilder *fastLatinBuilder;
250 
251     DataBuilderCollationIterator *collIter;
252 };
253 
254 U_NAMESPACE_END
255 
256 #endif  // !UCONFIG_NO_COLLATION
257 #endif  // __COLLATIONDATABUILDER_H__
258