1 /*
2 *******************************************************************************
3 *   Copyright (C) 2010-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  bytestriebuilder.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2010sep25
12 *   created by: Markus W. Scherer
13 */
14 
15 #include "unicode/utypes.h"
16 #include "unicode/bytestrie.h"
17 #include "unicode/bytestriebuilder.h"
18 #include "unicode/stringpiece.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "uhash.h"
22 #include "uarrsort.h"
23 #include "uassert.h"
24 #include "ustr_imp.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 /*
29  * Note: This builder implementation stores (bytes, value) pairs with full copies
30  * of the byte sequences, until the BytesTrie is built.
31  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32  */
33 
34 class BytesTrieElement : public UMemory {
35 public:
36     // Use compiler's default constructor, initializes nothing.
37 
38     void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
39 
getString(const CharString & strings) const40     StringPiece getString(const CharString &strings) const {
41         int32_t offset=stringOffset;
42         int32_t length;
43         if(offset>=0) {
44             length=(uint8_t)strings[offset++];
45         } else {
46             offset=~offset;
47             length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
48             offset+=2;
49         }
50         return StringPiece(strings.data()+offset, length);
51     }
getStringLength(const CharString & strings) const52     int32_t getStringLength(const CharString &strings) const {
53         int32_t offset=stringOffset;
54         if(offset>=0) {
55             return (uint8_t)strings[offset];
56         } else {
57             offset=~offset;
58             return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
59         }
60     }
61 
charAt(int32_t index,const CharString & strings) const62     char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
63 
getValue() const64     int32_t getValue() const { return value; }
65 
66     int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
67 
68 private:
data(const CharString & strings) const69     const char *data(const CharString &strings) const {
70         int32_t offset=stringOffset;
71         if(offset>=0) {
72             ++offset;
73         } else {
74             offset=~offset+2;
75         }
76         return strings.data()+offset;
77     }
78 
79     // If the stringOffset is non-negative, then the first strings byte contains
80     // the string length.
81     // If the stringOffset is negative, then the first two strings bytes contain
82     // the string length (big-endian), and the offset needs to be bit-inverted.
83     // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
84     int32_t stringOffset;
85     int32_t value;
86 };
87 
88 void
setTo(const StringPiece & s,int32_t val,CharString & strings,UErrorCode & errorCode)89 BytesTrieElement::setTo(const StringPiece &s, int32_t val,
90                         CharString &strings, UErrorCode &errorCode) {
91     if(U_FAILURE(errorCode)) {
92         return;
93     }
94     int32_t length=s.length();
95     if(length>0xffff) {
96         // Too long: We store the length in 1 or 2 bytes.
97         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
98         return;
99     }
100     int32_t offset=strings.length();
101     if(length>0xff) {
102         offset=~offset;
103         strings.append((char)(length>>8), errorCode);
104     }
105     strings.append((char)length, errorCode);
106     stringOffset=offset;
107     value=val;
108     strings.append(s, errorCode);
109 }
110 
111 int32_t
compareStringTo(const BytesTrieElement & other,const CharString & strings) const112 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
113     // TODO: add StringPiece::compare(), see ticket #8187
114     StringPiece thisString=getString(strings);
115     StringPiece otherString=other.getString(strings);
116     int32_t lengthDiff=thisString.length()-otherString.length();
117     int32_t commonLength;
118     if(lengthDiff<=0) {
119         commonLength=thisString.length();
120     } else {
121         commonLength=otherString.length();
122     }
123     int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
124     return diff!=0 ? diff : lengthDiff;
125 }
126 
BytesTrieBuilder(UErrorCode & errorCode)127 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
128         : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
129           bytes(NULL), bytesCapacity(0), bytesLength(0) {
130     if(U_FAILURE(errorCode)) {
131         return;
132     }
133     strings=new CharString();
134     if(strings==NULL) {
135         errorCode=U_MEMORY_ALLOCATION_ERROR;
136     }
137 }
138 
~BytesTrieBuilder()139 BytesTrieBuilder::~BytesTrieBuilder() {
140     delete strings;
141     delete[] elements;
142     uprv_free(bytes);
143 }
144 
145 BytesTrieBuilder &
add(const StringPiece & s,int32_t value,UErrorCode & errorCode)146 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
147     if(U_FAILURE(errorCode)) {
148         return *this;
149     }
150     if(bytesLength>0) {
151         // Cannot add elements after building.
152         errorCode=U_NO_WRITE_PERMISSION;
153         return *this;
154     }
155     if(elementsLength==elementsCapacity) {
156         int32_t newCapacity;
157         if(elementsCapacity==0) {
158             newCapacity=1024;
159         } else {
160             newCapacity=4*elementsCapacity;
161         }
162         BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
163         if(newElements==NULL) {
164             errorCode=U_MEMORY_ALLOCATION_ERROR;
165             return *this; // error instead of dereferencing null
166         }
167         if(elementsLength>0) {
168             uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
169         }
170         delete[] elements;
171         elements=newElements;
172         elementsCapacity=newCapacity;
173     }
174     elements[elementsLength++].setTo(s, value, *strings, errorCode);
175     return *this;
176 }
177 
178 U_CDECL_BEGIN
179 
180 static int32_t U_CALLCONV
compareElementStrings(const void * context,const void * left,const void * right)181 compareElementStrings(const void *context, const void *left, const void *right) {
182     const CharString *strings=static_cast<const CharString *>(context);
183     const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
184     const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
185     return leftElement->compareStringTo(*rightElement, *strings);
186 }
187 
188 U_CDECL_END
189 
190 BytesTrie *
build(UStringTrieBuildOption buildOption,UErrorCode & errorCode)191 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
192     buildBytes(buildOption, errorCode);
193     BytesTrie *newTrie=NULL;
194     if(U_SUCCESS(errorCode)) {
195         newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
196         if(newTrie==NULL) {
197             errorCode=U_MEMORY_ALLOCATION_ERROR;
198         } else {
199             bytes=NULL;  // The new trie now owns the array.
200             bytesCapacity=0;
201         }
202     }
203     return newTrie;
204 }
205 
206 StringPiece
buildStringPiece(UStringTrieBuildOption buildOption,UErrorCode & errorCode)207 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
208     buildBytes(buildOption, errorCode);
209     StringPiece result;
210     if(U_SUCCESS(errorCode)) {
211         result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
212     }
213     return result;
214 }
215 
216 void
buildBytes(UStringTrieBuildOption buildOption,UErrorCode & errorCode)217 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
218     if(U_FAILURE(errorCode)) {
219         return;
220     }
221     if(bytes!=NULL && bytesLength>0) {
222         // Already built.
223         return;
224     }
225     if(bytesLength==0) {
226         if(elementsLength==0) {
227             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
228             return;
229         }
230         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
231                       compareElementStrings, strings,
232                       FALSE,  // need not be a stable sort
233                       &errorCode);
234         if(U_FAILURE(errorCode)) {
235             return;
236         }
237         // Duplicate strings are not allowed.
238         StringPiece prev=elements[0].getString(*strings);
239         for(int32_t i=1; i<elementsLength; ++i) {
240             StringPiece current=elements[i].getString(*strings);
241             if(prev==current) {
242                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
243                 return;
244             }
245             prev=current;
246         }
247     }
248     // Create and byte-serialize the trie for the elements.
249     bytesLength=0;
250     int32_t capacity=strings->length();
251     if(capacity<1024) {
252         capacity=1024;
253     }
254     if(bytesCapacity<capacity) {
255         uprv_free(bytes);
256         bytes=static_cast<char *>(uprv_malloc(capacity));
257         if(bytes==NULL) {
258             errorCode=U_MEMORY_ALLOCATION_ERROR;
259             bytesCapacity=0;
260             return;
261         }
262         bytesCapacity=capacity;
263     }
264     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
265     if(bytes==NULL) {
266         errorCode=U_MEMORY_ALLOCATION_ERROR;
267     }
268 }
269 
270 BytesTrieBuilder &
clear()271 BytesTrieBuilder::clear() {
272     strings->clear();
273     elementsLength=0;
274     bytesLength=0;
275     return *this;
276 }
277 
278 int32_t
getElementStringLength(int32_t i) const279 BytesTrieBuilder::getElementStringLength(int32_t i) const {
280     return elements[i].getStringLength(*strings);
281 }
282 
283 UChar
getElementUnit(int32_t i,int32_t byteIndex) const284 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
285     return (uint8_t)elements[i].charAt(byteIndex, *strings);
286 }
287 
288 int32_t
getElementValue(int32_t i) const289 BytesTrieBuilder::getElementValue(int32_t i) const {
290     return elements[i].getValue();
291 }
292 
293 int32_t
getLimitOfLinearMatch(int32_t first,int32_t last,int32_t byteIndex) const294 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
295     const BytesTrieElement &firstElement=elements[first];
296     const BytesTrieElement &lastElement=elements[last];
297     int32_t minStringLength=firstElement.getStringLength(*strings);
298     while(++byteIndex<minStringLength &&
299             firstElement.charAt(byteIndex, *strings)==
300             lastElement.charAt(byteIndex, *strings)) {}
301     return byteIndex;
302 }
303 
304 int32_t
countElementUnits(int32_t start,int32_t limit,int32_t byteIndex) const305 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
306     int32_t length=0;  // Number of different bytes at byteIndex.
307     int32_t i=start;
308     do {
309         char byte=elements[i++].charAt(byteIndex, *strings);
310         while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
311             ++i;
312         }
313         ++length;
314     } while(i<limit);
315     return length;
316 }
317 
318 int32_t
skipElementsBySomeUnits(int32_t i,int32_t byteIndex,int32_t count) const319 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
320     do {
321         char byte=elements[i++].charAt(byteIndex, *strings);
322         while(byte==elements[i].charAt(byteIndex, *strings)) {
323             ++i;
324         }
325     } while(--count>0);
326     return i;
327 }
328 
329 int32_t
indexOfElementWithNextUnit(int32_t i,int32_t byteIndex,UChar byte) const330 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
331     char b=(char)byte;
332     while(b==elements[i].charAt(byteIndex, *strings)) {
333         ++i;
334     }
335     return i;
336 }
337 
BTLinearMatchNode(const char * bytes,int32_t len,Node * nextNode)338 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
339         : LinearMatchNode(len, nextNode), s(bytes) {
340     hash=hash*37+ustr_hashCharsN(bytes, len);
341 }
342 
343 UBool
operator ==(const Node & other) const344 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
345     if(this==&other) {
346         return TRUE;
347     }
348     if(!LinearMatchNode::operator==(other)) {
349         return FALSE;
350     }
351     const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
352     return 0==uprv_memcmp(s, o.s, length);
353 }
354 
355 void
write(StringTrieBuilder & builder)356 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
357     BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
358     next->write(builder);
359     b.write(s, length);
360     offset=b.write(b.getMinLinearMatch()+length-1);
361 }
362 
363 StringTrieBuilder::Node *
createLinearMatchNode(int32_t i,int32_t byteIndex,int32_t length,Node * nextNode) const364 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
365                                         Node *nextNode) const {
366     return new BTLinearMatchNode(
367             elements[i].getString(*strings).data()+byteIndex,
368             length,
369             nextNode);
370 }
371 
372 UBool
ensureCapacity(int32_t length)373 BytesTrieBuilder::ensureCapacity(int32_t length) {
374     if(bytes==NULL) {
375         return FALSE;  // previous memory allocation had failed
376     }
377     if(length>bytesCapacity) {
378         int32_t newCapacity=bytesCapacity;
379         do {
380             newCapacity*=2;
381         } while(newCapacity<=length);
382         char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
383         if(newBytes==NULL) {
384             // unable to allocate memory
385             uprv_free(bytes);
386             bytes=NULL;
387             bytesCapacity=0;
388             return FALSE;
389         }
390         uprv_memcpy(newBytes+(newCapacity-bytesLength),
391                     bytes+(bytesCapacity-bytesLength), bytesLength);
392         uprv_free(bytes);
393         bytes=newBytes;
394         bytesCapacity=newCapacity;
395     }
396     return TRUE;
397 }
398 
399 int32_t
write(int32_t byte)400 BytesTrieBuilder::write(int32_t byte) {
401     int32_t newLength=bytesLength+1;
402     if(ensureCapacity(newLength)) {
403         bytesLength=newLength;
404         bytes[bytesCapacity-bytesLength]=(char)byte;
405     }
406     return bytesLength;
407 }
408 
409 int32_t
write(const char * b,int32_t length)410 BytesTrieBuilder::write(const char *b, int32_t length) {
411     int32_t newLength=bytesLength+length;
412     if(ensureCapacity(newLength)) {
413         bytesLength=newLength;
414         uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
415     }
416     return bytesLength;
417 }
418 
419 int32_t
writeElementUnits(int32_t i,int32_t byteIndex,int32_t length)420 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
421     return write(elements[i].getString(*strings).data()+byteIndex, length);
422 }
423 
424 int32_t
writeValueAndFinal(int32_t i,UBool isFinal)425 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
426     if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
427         return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
428     }
429     char intBytes[5];
430     int32_t length=1;
431     if(i<0 || i>0xffffff) {
432         intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
433         intBytes[1]=(char)((uint32_t)i>>24);
434         intBytes[2]=(char)((uint32_t)i>>16);
435         intBytes[3]=(char)((uint32_t)i>>8);
436         intBytes[4]=(char)i;
437         length=5;
438     // } else if(i<=BytesTrie::kMaxOneByteValue) {
439     //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
440     } else {
441         if(i<=BytesTrie::kMaxTwoByteValue) {
442             intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
443         } else {
444             if(i<=BytesTrie::kMaxThreeByteValue) {
445                 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
446             } else {
447                 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
448                 intBytes[1]=(char)(i>>16);
449                 length=2;
450             }
451             intBytes[length++]=(char)(i>>8);
452         }
453         intBytes[length++]=(char)i;
454     }
455     intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
456     return write(intBytes, length);
457 }
458 
459 int32_t
writeValueAndType(UBool hasValue,int32_t value,int32_t node)460 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
461     int32_t offset=write(node);
462     if(hasValue) {
463         offset=writeValueAndFinal(value, FALSE);
464     }
465     return offset;
466 }
467 
468 int32_t
writeDeltaTo(int32_t jumpTarget)469 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
470     int32_t i=bytesLength-jumpTarget;
471     U_ASSERT(i>=0);
472     if(i<=BytesTrie::kMaxOneByteDelta) {
473         return write(i);
474     }
475     char intBytes[5];
476     int32_t length;
477     if(i<=BytesTrie::kMaxTwoByteDelta) {
478         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
479         length=1;
480     } else {
481         if(i<=BytesTrie::kMaxThreeByteDelta) {
482             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
483             length=2;
484         } else {
485             if(i<=0xffffff) {
486                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
487                 length=3;
488             } else {
489                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
490                 intBytes[1]=(char)(i>>24);
491                 length=4;
492             }
493             intBytes[1]=(char)(i>>16);
494         }
495         intBytes[1]=(char)(i>>8);
496     }
497     intBytes[length++]=(char)i;
498     return write(intBytes, length);
499 }
500 
501 U_NAMESPACE_END
502