1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucharstriebuilder.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010nov14
12 * created by: Markus W. Scherer
13 */
14
15 #include "unicode/utypes.h"
16 #include "unicode/ucharstrie.h"
17 #include "unicode/ucharstriebuilder.h"
18 #include "unicode/unistr.h"
19 #include "unicode/ustring.h"
20 #include "cmemory.h"
21 #include "uarrsort.h"
22 #include "uassert.h"
23 #include "uhash.h"
24 #include "ustr_imp.h"
25
26 U_NAMESPACE_BEGIN
27
28 /*
29 * Note: This builder implementation stores (string, value) pairs with full copies
30 * of the 16-bit-unit sequences, until the UCharsTrie is built.
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32 */
33
34 class UCharsTrieElement : public UMemory {
35 public:
36 // Use compiler's default constructor, initializes nothing.
37
38 void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
39
getString(const UnicodeString & strings) const40 UnicodeString getString(const UnicodeString &strings) const {
41 int32_t length=strings[stringOffset];
42 return strings.tempSubString(stringOffset+1, length);
43 }
getStringLength(const UnicodeString & strings) const44 int32_t getStringLength(const UnicodeString &strings) const {
45 return strings[stringOffset];
46 }
47
charAt(int32_t index,const UnicodeString & strings) const48 UChar charAt(int32_t index, const UnicodeString &strings) const {
49 return strings[stringOffset+1+index];
50 }
51
getValue() const52 int32_t getValue() const { return value; }
53
54 int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
55
56 private:
57 // The first strings unit contains the string length.
58 // (Compared with a stringLength field here, this saves 2 bytes per string.)
59 int32_t stringOffset;
60 int32_t value;
61 };
62
63 void
setTo(const UnicodeString & s,int32_t val,UnicodeString & strings,UErrorCode & errorCode)64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
65 UnicodeString &strings, UErrorCode &errorCode) {
66 if(U_FAILURE(errorCode)) {
67 return;
68 }
69 int32_t length=s.length();
70 if(length>0xffff) {
71 // Too long: We store the length in 1 unit.
72 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
73 return;
74 }
75 stringOffset=strings.length();
76 strings.append((UChar)length);
77 value=val;
78 strings.append(s);
79 }
80
81 int32_t
compareStringTo(const UCharsTrieElement & other,const UnicodeString & strings) const82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
83 return getString(strings).compare(other.getString(strings));
84 }
85
UCharsTrieBuilder(UErrorCode &)86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
87 : elements(NULL), elementsCapacity(0), elementsLength(0),
88 uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
89
~UCharsTrieBuilder()90 UCharsTrieBuilder::~UCharsTrieBuilder() {
91 delete[] elements;
92 uprv_free(uchars);
93 }
94
95 UCharsTrieBuilder &
add(const UnicodeString & s,int32_t value,UErrorCode & errorCode)96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
97 if(U_FAILURE(errorCode)) {
98 return *this;
99 }
100 if(ucharsLength>0) {
101 // Cannot add elements after building.
102 errorCode=U_NO_WRITE_PERMISSION;
103 return *this;
104 }
105 if(elementsLength==elementsCapacity) {
106 int32_t newCapacity;
107 if(elementsCapacity==0) {
108 newCapacity=1024;
109 } else {
110 newCapacity=4*elementsCapacity;
111 }
112 UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
113 if(newElements==NULL) {
114 errorCode=U_MEMORY_ALLOCATION_ERROR;
115 return *this;
116 }
117 if(elementsLength>0) {
118 uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
119 }
120 delete[] elements;
121 elements=newElements;
122 elementsCapacity=newCapacity;
123 }
124 elements[elementsLength++].setTo(s, value, strings, errorCode);
125 if(U_SUCCESS(errorCode) && strings.isBogus()) {
126 errorCode=U_MEMORY_ALLOCATION_ERROR;
127 }
128 return *this;
129 }
130
131 U_CDECL_BEGIN
132
133 static int32_t U_CALLCONV
compareElementStrings(const void * context,const void * left,const void * right)134 compareElementStrings(const void *context, const void *left, const void *right) {
135 const UnicodeString *strings=static_cast<const UnicodeString *>(context);
136 const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
137 const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
138 return leftElement->compareStringTo(*rightElement, *strings);
139 }
140
141 U_CDECL_END
142
143 UCharsTrie *
build(UStringTrieBuildOption buildOption,UErrorCode & errorCode)144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
145 buildUChars(buildOption, errorCode);
146 UCharsTrie *newTrie=NULL;
147 if(U_SUCCESS(errorCode)) {
148 newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
149 if(newTrie==NULL) {
150 errorCode=U_MEMORY_ALLOCATION_ERROR;
151 } else {
152 uchars=NULL; // The new trie now owns the array.
153 ucharsCapacity=0;
154 }
155 }
156 return newTrie;
157 }
158
159 UnicodeString &
buildUnicodeString(UStringTrieBuildOption buildOption,UnicodeString & result,UErrorCode & errorCode)160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
161 UErrorCode &errorCode) {
162 buildUChars(buildOption, errorCode);
163 if(U_SUCCESS(errorCode)) {
164 result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
165 }
166 return result;
167 }
168
169 void
buildUChars(UStringTrieBuildOption buildOption,UErrorCode & errorCode)170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
171 if(U_FAILURE(errorCode)) {
172 return;
173 }
174 if(uchars!=NULL && ucharsLength>0) {
175 // Already built.
176 return;
177 }
178 if(ucharsLength==0) {
179 if(elementsLength==0) {
180 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
181 return;
182 }
183 if(strings.isBogus()) {
184 errorCode=U_MEMORY_ALLOCATION_ERROR;
185 return;
186 }
187 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
188 compareElementStrings, &strings,
189 FALSE, // need not be a stable sort
190 &errorCode);
191 if(U_FAILURE(errorCode)) {
192 return;
193 }
194 // Duplicate strings are not allowed.
195 UnicodeString prev=elements[0].getString(strings);
196 for(int32_t i=1; i<elementsLength; ++i) {
197 UnicodeString current=elements[i].getString(strings);
198 if(prev==current) {
199 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
200 return;
201 }
202 prev.fastCopyFrom(current);
203 }
204 }
205 // Create and UChar-serialize the trie for the elements.
206 ucharsLength=0;
207 int32_t capacity=strings.length();
208 if(capacity<1024) {
209 capacity=1024;
210 }
211 if(ucharsCapacity<capacity) {
212 uprv_free(uchars);
213 uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
214 if(uchars==NULL) {
215 errorCode=U_MEMORY_ALLOCATION_ERROR;
216 ucharsCapacity=0;
217 return;
218 }
219 ucharsCapacity=capacity;
220 }
221 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
222 if(uchars==NULL) {
223 errorCode=U_MEMORY_ALLOCATION_ERROR;
224 }
225 }
226
227 int32_t
getElementStringLength(int32_t i) const228 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
229 return elements[i].getStringLength(strings);
230 }
231
232 UChar
getElementUnit(int32_t i,int32_t unitIndex) const233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
234 return elements[i].charAt(unitIndex, strings);
235 }
236
237 int32_t
getElementValue(int32_t i) const238 UCharsTrieBuilder::getElementValue(int32_t i) const {
239 return elements[i].getValue();
240 }
241
242 int32_t
getLimitOfLinearMatch(int32_t first,int32_t last,int32_t unitIndex) const243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
244 const UCharsTrieElement &firstElement=elements[first];
245 const UCharsTrieElement &lastElement=elements[last];
246 int32_t minStringLength=firstElement.getStringLength(strings);
247 while(++unitIndex<minStringLength &&
248 firstElement.charAt(unitIndex, strings)==
249 lastElement.charAt(unitIndex, strings)) {}
250 return unitIndex;
251 }
252
253 int32_t
countElementUnits(int32_t start,int32_t limit,int32_t unitIndex) const254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
255 int32_t length=0; // Number of different units at unitIndex.
256 int32_t i=start;
257 do {
258 UChar unit=elements[i++].charAt(unitIndex, strings);
259 while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
260 ++i;
261 }
262 ++length;
263 } while(i<limit);
264 return length;
265 }
266
267 int32_t
skipElementsBySomeUnits(int32_t i,int32_t unitIndex,int32_t count) const268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
269 do {
270 UChar unit=elements[i++].charAt(unitIndex, strings);
271 while(unit==elements[i].charAt(unitIndex, strings)) {
272 ++i;
273 }
274 } while(--count>0);
275 return i;
276 }
277
278 int32_t
indexOfElementWithNextUnit(int32_t i,int32_t unitIndex,UChar unit) const279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
280 while(unit==elements[i].charAt(unitIndex, strings)) {
281 ++i;
282 }
283 return i;
284 }
285
UCTLinearMatchNode(const UChar * units,int32_t len,Node * nextNode)286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
287 : LinearMatchNode(len, nextNode), s(units) {
288 hash=hash*37+ustr_hashUCharsN(units, len);
289 }
290
291 UBool
operator ==(const Node & other) const292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
293 if(this==&other) {
294 return TRUE;
295 }
296 if(!LinearMatchNode::operator==(other)) {
297 return FALSE;
298 }
299 const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
300 return 0==u_memcmp(s, o.s, length);
301 }
302
303 void
write(StringTrieBuilder & builder)304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
305 UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
306 next->write(builder);
307 b.write(s, length);
308 offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
309 }
310
311 StringTrieBuilder::Node *
createLinearMatchNode(int32_t i,int32_t unitIndex,int32_t length,Node * nextNode) const312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
313 Node *nextNode) const {
314 return new UCTLinearMatchNode(
315 elements[i].getString(strings).getBuffer()+unitIndex,
316 length,
317 nextNode);
318 }
319
320 UBool
ensureCapacity(int32_t length)321 UCharsTrieBuilder::ensureCapacity(int32_t length) {
322 if(uchars==NULL) {
323 return FALSE; // previous memory allocation had failed
324 }
325 if(length>ucharsCapacity) {
326 int32_t newCapacity=ucharsCapacity;
327 do {
328 newCapacity*=2;
329 } while(newCapacity<=length);
330 UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
331 if(newUChars==NULL) {
332 // unable to allocate memory
333 uprv_free(uchars);
334 uchars=NULL;
335 ucharsCapacity=0;
336 return FALSE;
337 }
338 u_memcpy(newUChars+(newCapacity-ucharsLength),
339 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
340 uprv_free(uchars);
341 uchars=newUChars;
342 ucharsCapacity=newCapacity;
343 }
344 return TRUE;
345 }
346
347 int32_t
write(int32_t unit)348 UCharsTrieBuilder::write(int32_t unit) {
349 int32_t newLength=ucharsLength+1;
350 if(ensureCapacity(newLength)) {
351 ucharsLength=newLength;
352 uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
353 }
354 return ucharsLength;
355 }
356
357 int32_t
write(const UChar * s,int32_t length)358 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
359 int32_t newLength=ucharsLength+length;
360 if(ensureCapacity(newLength)) {
361 ucharsLength=newLength;
362 u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
363 }
364 return ucharsLength;
365 }
366
367 int32_t
writeElementUnits(int32_t i,int32_t unitIndex,int32_t length)368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
369 return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
370 }
371
372 int32_t
writeValueAndFinal(int32_t i,UBool isFinal)373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
374 if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
375 return write(i|(isFinal<<15));
376 }
377 UChar intUnits[3];
378 int32_t length;
379 if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
380 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
381 intUnits[1]=(UChar)((uint32_t)i>>16);
382 intUnits[2]=(UChar)i;
383 length=3;
384 // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
385 // intUnits[0]=(UChar)(i);
386 // length=1;
387 } else {
388 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
389 intUnits[1]=(UChar)i;
390 length=2;
391 }
392 intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
393 return write(intUnits, length);
394 }
395
396 int32_t
writeValueAndType(UBool hasValue,int32_t value,int32_t node)397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
398 if(!hasValue) {
399 return write(node);
400 }
401 UChar intUnits[3];
402 int32_t length;
403 if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
404 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
405 intUnits[1]=(UChar)((uint32_t)value>>16);
406 intUnits[2]=(UChar)value;
407 length=3;
408 } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
409 intUnits[0]=(UChar)((value+1)<<6);
410 length=1;
411 } else {
412 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
413 intUnits[1]=(UChar)value;
414 length=2;
415 }
416 intUnits[0]|=(UChar)node;
417 return write(intUnits, length);
418 }
419
420 int32_t
writeDeltaTo(int32_t jumpTarget)421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
422 int32_t i=ucharsLength-jumpTarget;
423 U_ASSERT(i>=0);
424 if(i<=UCharsTrie::kMaxOneUnitDelta) {
425 return write(i);
426 }
427 UChar intUnits[3];
428 int32_t length;
429 if(i<=UCharsTrie::kMaxTwoUnitDelta) {
430 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
431 length=1;
432 } else {
433 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
434 intUnits[1]=(UChar)(i>>16);
435 length=2;
436 }
437 intUnits[length++]=(UChar)i;
438 return write(intUnits, length);
439 }
440
441 U_NAMESPACE_END
442