1 /*
2 *******************************************************************************
3 *   Copyright (C) 2010-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  ucharstriebuilder.h
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2010nov14
12 *   created by: Markus W. Scherer
13 */
14 
15 #include "unicode/utypes.h"
16 #include "unicode/ucharstrie.h"
17 #include "unicode/ucharstriebuilder.h"
18 #include "unicode/unistr.h"
19 #include "unicode/ustring.h"
20 #include "cmemory.h"
21 #include "uarrsort.h"
22 #include "uassert.h"
23 #include "uhash.h"
24 #include "ustr_imp.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 /*
29  * Note: This builder implementation stores (string, value) pairs with full copies
30  * of the 16-bit-unit sequences, until the UCharsTrie is built.
31  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32  */
33 
34 class UCharsTrieElement : public UMemory {
35 public:
36     // Use compiler's default constructor, initializes nothing.
37 
38     void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
39 
getString(const UnicodeString & strings) const40     UnicodeString getString(const UnicodeString &strings) const {
41         int32_t length=strings[stringOffset];
42         return strings.tempSubString(stringOffset+1, length);
43     }
getStringLength(const UnicodeString & strings) const44     int32_t getStringLength(const UnicodeString &strings) const {
45         return strings[stringOffset];
46     }
47 
charAt(int32_t index,const UnicodeString & strings) const48     UChar charAt(int32_t index, const UnicodeString &strings) const {
49         return strings[stringOffset+1+index];
50     }
51 
getValue() const52     int32_t getValue() const { return value; }
53 
54     int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
55 
56 private:
57     // The first strings unit contains the string length.
58     // (Compared with a stringLength field here, this saves 2 bytes per string.)
59     int32_t stringOffset;
60     int32_t value;
61 };
62 
63 void
setTo(const UnicodeString & s,int32_t val,UnicodeString & strings,UErrorCode & errorCode)64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
65                          UnicodeString &strings, UErrorCode &errorCode) {
66     if(U_FAILURE(errorCode)) {
67         return;
68     }
69     int32_t length=s.length();
70     if(length>0xffff) {
71         // Too long: We store the length in 1 unit.
72         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
73         return;
74     }
75     stringOffset=strings.length();
76     strings.append((UChar)length);
77     value=val;
78     strings.append(s);
79 }
80 
81 int32_t
compareStringTo(const UCharsTrieElement & other,const UnicodeString & strings) const82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
83     return getString(strings).compare(other.getString(strings));
84 }
85 
UCharsTrieBuilder(UErrorCode &)86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
87         : elements(NULL), elementsCapacity(0), elementsLength(0),
88           uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
89 
~UCharsTrieBuilder()90 UCharsTrieBuilder::~UCharsTrieBuilder() {
91     delete[] elements;
92     uprv_free(uchars);
93 }
94 
95 UCharsTrieBuilder &
add(const UnicodeString & s,int32_t value,UErrorCode & errorCode)96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
97     if(U_FAILURE(errorCode)) {
98         return *this;
99     }
100     if(ucharsLength>0) {
101         // Cannot add elements after building.
102         errorCode=U_NO_WRITE_PERMISSION;
103         return *this;
104     }
105     if(elementsLength==elementsCapacity) {
106         int32_t newCapacity;
107         if(elementsCapacity==0) {
108             newCapacity=1024;
109         } else {
110             newCapacity=4*elementsCapacity;
111         }
112         UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
113         if(newElements==NULL) {
114             errorCode=U_MEMORY_ALLOCATION_ERROR;
115             return *this;
116         }
117         if(elementsLength>0) {
118             uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
119         }
120         delete[] elements;
121         elements=newElements;
122         elementsCapacity=newCapacity;
123     }
124     elements[elementsLength++].setTo(s, value, strings, errorCode);
125     if(U_SUCCESS(errorCode) && strings.isBogus()) {
126         errorCode=U_MEMORY_ALLOCATION_ERROR;
127     }
128     return *this;
129 }
130 
131 U_CDECL_BEGIN
132 
133 static int32_t U_CALLCONV
compareElementStrings(const void * context,const void * left,const void * right)134 compareElementStrings(const void *context, const void *left, const void *right) {
135     const UnicodeString *strings=static_cast<const UnicodeString *>(context);
136     const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
137     const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
138     return leftElement->compareStringTo(*rightElement, *strings);
139 }
140 
141 U_CDECL_END
142 
143 UCharsTrie *
build(UStringTrieBuildOption buildOption,UErrorCode & errorCode)144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
145     buildUChars(buildOption, errorCode);
146     UCharsTrie *newTrie=NULL;
147     if(U_SUCCESS(errorCode)) {
148         newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
149         if(newTrie==NULL) {
150             errorCode=U_MEMORY_ALLOCATION_ERROR;
151         } else {
152             uchars=NULL;  // The new trie now owns the array.
153             ucharsCapacity=0;
154         }
155     }
156     return newTrie;
157 }
158 
159 UnicodeString &
buildUnicodeString(UStringTrieBuildOption buildOption,UnicodeString & result,UErrorCode & errorCode)160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
161                                       UErrorCode &errorCode) {
162     buildUChars(buildOption, errorCode);
163     if(U_SUCCESS(errorCode)) {
164         result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
165     }
166     return result;
167 }
168 
169 void
buildUChars(UStringTrieBuildOption buildOption,UErrorCode & errorCode)170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
171     if(U_FAILURE(errorCode)) {
172         return;
173     }
174     if(uchars!=NULL && ucharsLength>0) {
175         // Already built.
176         return;
177     }
178     if(ucharsLength==0) {
179         if(elementsLength==0) {
180             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
181             return;
182         }
183         if(strings.isBogus()) {
184             errorCode=U_MEMORY_ALLOCATION_ERROR;
185             return;
186         }
187         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
188                       compareElementStrings, &strings,
189                       FALSE,  // need not be a stable sort
190                       &errorCode);
191         if(U_FAILURE(errorCode)) {
192             return;
193         }
194         // Duplicate strings are not allowed.
195         UnicodeString prev=elements[0].getString(strings);
196         for(int32_t i=1; i<elementsLength; ++i) {
197             UnicodeString current=elements[i].getString(strings);
198             if(prev==current) {
199                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
200                 return;
201             }
202             prev.fastCopyFrom(current);
203         }
204     }
205     // Create and UChar-serialize the trie for the elements.
206     ucharsLength=0;
207     int32_t capacity=strings.length();
208     if(capacity<1024) {
209         capacity=1024;
210     }
211     if(ucharsCapacity<capacity) {
212         uprv_free(uchars);
213         uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
214         if(uchars==NULL) {
215             errorCode=U_MEMORY_ALLOCATION_ERROR;
216             ucharsCapacity=0;
217             return;
218         }
219         ucharsCapacity=capacity;
220     }
221     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
222     if(uchars==NULL) {
223         errorCode=U_MEMORY_ALLOCATION_ERROR;
224     }
225 }
226 
227 int32_t
getElementStringLength(int32_t i) const228 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
229     return elements[i].getStringLength(strings);
230 }
231 
232 UChar
getElementUnit(int32_t i,int32_t unitIndex) const233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
234     return elements[i].charAt(unitIndex, strings);
235 }
236 
237 int32_t
getElementValue(int32_t i) const238 UCharsTrieBuilder::getElementValue(int32_t i) const {
239     return elements[i].getValue();
240 }
241 
242 int32_t
getLimitOfLinearMatch(int32_t first,int32_t last,int32_t unitIndex) const243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
244     const UCharsTrieElement &firstElement=elements[first];
245     const UCharsTrieElement &lastElement=elements[last];
246     int32_t minStringLength=firstElement.getStringLength(strings);
247     while(++unitIndex<minStringLength &&
248             firstElement.charAt(unitIndex, strings)==
249             lastElement.charAt(unitIndex, strings)) {}
250     return unitIndex;
251 }
252 
253 int32_t
countElementUnits(int32_t start,int32_t limit,int32_t unitIndex) const254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
255     int32_t length=0;  // Number of different units at unitIndex.
256     int32_t i=start;
257     do {
258         UChar unit=elements[i++].charAt(unitIndex, strings);
259         while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
260             ++i;
261         }
262         ++length;
263     } while(i<limit);
264     return length;
265 }
266 
267 int32_t
skipElementsBySomeUnits(int32_t i,int32_t unitIndex,int32_t count) const268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
269     do {
270         UChar unit=elements[i++].charAt(unitIndex, strings);
271         while(unit==elements[i].charAt(unitIndex, strings)) {
272             ++i;
273         }
274     } while(--count>0);
275     return i;
276 }
277 
278 int32_t
indexOfElementWithNextUnit(int32_t i,int32_t unitIndex,UChar unit) const279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
280     while(unit==elements[i].charAt(unitIndex, strings)) {
281         ++i;
282     }
283     return i;
284 }
285 
UCTLinearMatchNode(const UChar * units,int32_t len,Node * nextNode)286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
287         : LinearMatchNode(len, nextNode), s(units) {
288     hash=hash*37+ustr_hashUCharsN(units, len);
289 }
290 
291 UBool
operator ==(const Node & other) const292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
293     if(this==&other) {
294         return TRUE;
295     }
296     if(!LinearMatchNode::operator==(other)) {
297         return FALSE;
298     }
299     const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
300     return 0==u_memcmp(s, o.s, length);
301 }
302 
303 void
write(StringTrieBuilder & builder)304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
305     UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
306     next->write(builder);
307     b.write(s, length);
308     offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
309 }
310 
311 StringTrieBuilder::Node *
createLinearMatchNode(int32_t i,int32_t unitIndex,int32_t length,Node * nextNode) const312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
313                                          Node *nextNode) const {
314     return new UCTLinearMatchNode(
315             elements[i].getString(strings).getBuffer()+unitIndex,
316             length,
317             nextNode);
318 }
319 
320 UBool
ensureCapacity(int32_t length)321 UCharsTrieBuilder::ensureCapacity(int32_t length) {
322     if(uchars==NULL) {
323         return FALSE;  // previous memory allocation had failed
324     }
325     if(length>ucharsCapacity) {
326         int32_t newCapacity=ucharsCapacity;
327         do {
328             newCapacity*=2;
329         } while(newCapacity<=length);
330         UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
331         if(newUChars==NULL) {
332             // unable to allocate memory
333             uprv_free(uchars);
334             uchars=NULL;
335             ucharsCapacity=0;
336             return FALSE;
337         }
338         u_memcpy(newUChars+(newCapacity-ucharsLength),
339                  uchars+(ucharsCapacity-ucharsLength), ucharsLength);
340         uprv_free(uchars);
341         uchars=newUChars;
342         ucharsCapacity=newCapacity;
343     }
344     return TRUE;
345 }
346 
347 int32_t
write(int32_t unit)348 UCharsTrieBuilder::write(int32_t unit) {
349     int32_t newLength=ucharsLength+1;
350     if(ensureCapacity(newLength)) {
351         ucharsLength=newLength;
352         uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
353     }
354     return ucharsLength;
355 }
356 
357 int32_t
write(const UChar * s,int32_t length)358 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
359     int32_t newLength=ucharsLength+length;
360     if(ensureCapacity(newLength)) {
361         ucharsLength=newLength;
362         u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
363     }
364     return ucharsLength;
365 }
366 
367 int32_t
writeElementUnits(int32_t i,int32_t unitIndex,int32_t length)368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
369     return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
370 }
371 
372 int32_t
writeValueAndFinal(int32_t i,UBool isFinal)373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
374     if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
375         return write(i|(isFinal<<15));
376     }
377     UChar intUnits[3];
378     int32_t length;
379     if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
380         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
381         intUnits[1]=(UChar)((uint32_t)i>>16);
382         intUnits[2]=(UChar)i;
383         length=3;
384     // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
385     //     intUnits[0]=(UChar)(i);
386     //     length=1;
387     } else {
388         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
389         intUnits[1]=(UChar)i;
390         length=2;
391     }
392     intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
393     return write(intUnits, length);
394 }
395 
396 int32_t
writeValueAndType(UBool hasValue,int32_t value,int32_t node)397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
398     if(!hasValue) {
399         return write(node);
400     }
401     UChar intUnits[3];
402     int32_t length;
403     if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
404         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
405         intUnits[1]=(UChar)((uint32_t)value>>16);
406         intUnits[2]=(UChar)value;
407         length=3;
408     } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
409         intUnits[0]=(UChar)((value+1)<<6);
410         length=1;
411     } else {
412         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
413         intUnits[1]=(UChar)value;
414         length=2;
415     }
416     intUnits[0]|=(UChar)node;
417     return write(intUnits, length);
418 }
419 
420 int32_t
writeDeltaTo(int32_t jumpTarget)421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
422     int32_t i=ucharsLength-jumpTarget;
423     U_ASSERT(i>=0);
424     if(i<=UCharsTrie::kMaxOneUnitDelta) {
425         return write(i);
426     }
427     UChar intUnits[3];
428     int32_t length;
429     if(i<=UCharsTrie::kMaxTwoUnitDelta) {
430         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
431         length=1;
432     } else {
433         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
434         intUnits[1]=(UChar)(i>>16);
435         length=2;
436     }
437     intUnits[length++]=(UChar)i;
438     return write(intUnits, length);
439 }
440 
441 U_NAMESPACE_END
442