1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7 
8 /*
9 * File coleitr.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 *  Date      Name        Description
16 *
17 *  6/23/97   helena      Adding comments to make code more readable.
18 * 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
19 * 12/10/99   aliu        Ported Thai collation support from Java.
20 * 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
21 * 02/19/01   swquek      Removed CollationElementIterator() since it is
22 *                        private constructor and no calls are made to it
23 * 2012-2014  markus      Rewritten in C++ again.
24 */
25 
26 #include "unicode/utypes.h"
27 
28 #if !UCONFIG_NO_COLLATION
29 
30 #include "unicode/coleitr.h"
31 #include "unicode/tblcoll.h"
32 #include "unicode/ustring.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationiterator.h"
37 #include "collationsets.h"
38 #include "collationtailoring.h"
39 #include "uassert.h"
40 #include "uhash.h"
41 #include "utf16collationiterator.h"
42 #include "uvectr32.h"
43 
44 /* Constants --------------------------------------------------------------- */
45 
46 U_NAMESPACE_BEGIN
47 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
49 
50 /* CollationElementIterator public constructor/destructor ------------------ */
51 
52 CollationElementIterator::CollationElementIterator(
53                                          const CollationElementIterator& other)
54         : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
55     *this = other;
56 }
57 
~CollationElementIterator()58 CollationElementIterator::~CollationElementIterator()
59 {
60     delete iter_;
61     delete offsets_;
62 }
63 
64 /* CollationElementIterator public methods --------------------------------- */
65 
66 namespace {
67 
getFirstHalf(uint32_t p,uint32_t lower32)68 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
69     return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
70 }
getSecondHalf(uint32_t p,uint32_t lower32)71 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
72     return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
73 }
ceNeedsTwoParts(int64_t ce)74 UBool ceNeedsTwoParts(int64_t ce) {
75     return (ce & INT64_C(0xffff00ff003f)) != 0;
76 }
77 
78 }  // namespace
79 
getOffset() const80 int32_t CollationElementIterator::getOffset() const
81 {
82     if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
83         // CollationIterator::previousCE() decrements the CEs length
84         // while it pops CEs from its internal buffer.
85         int32_t i = iter_->getCEsLength();
86         if (otherHalf_ != 0) {
87             // Return the trailing CE offset while we are in the middle of a 64-bit CE.
88             ++i;
89         }
90         U_ASSERT(i < offsets_->size());
91         return offsets_->elementAti(i);
92     }
93     return iter_->getOffset();
94 }
95 
96 /**
97 * Get the ordering priority of the next character in the string.
98 * @return the next character's ordering. Returns NULLORDER if an error has
99 *         occured or if the end of string has been reached
100 */
next(UErrorCode & status)101 int32_t CollationElementIterator::next(UErrorCode& status)
102 {
103     if (U_FAILURE(status)) { return NULLORDER; }
104     if (dir_ > 1) {
105         // Continue forward iteration. Test this first.
106         if (otherHalf_ != 0) {
107             uint32_t oh = otherHalf_;
108             otherHalf_ = 0;
109             return oh;
110         }
111     } else if (dir_ == 1) {
112         // next() after setOffset()
113         dir_ = 2;
114     } else if (dir_ == 0) {
115         // The iter_ is already reset to the start of the text.
116         dir_ = 2;
117     } else /* dir_ < 0 */ {
118         // illegal change of direction
119         status = U_INVALID_STATE_ERROR;
120         return NULLORDER;
121     }
122     // No need to keep all CEs in the buffer when we iterate.
123     iter_->clearCEsIfNoneRemaining();
124     int64_t ce = iter_->nextCE(status);
125     if (ce == Collation::NO_CE) { return NULLORDER; }
126     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
127     uint32_t p = (uint32_t)(ce >> 32);
128     uint32_t lower32 = (uint32_t)ce;
129     uint32_t firstHalf = getFirstHalf(p, lower32);
130     uint32_t secondHalf = getSecondHalf(p, lower32);
131     if (secondHalf != 0) {
132         otherHalf_ = secondHalf | 0xc0;  // continuation CE
133     }
134     return firstHalf;
135 }
136 
operator !=(const CollationElementIterator & other) const137 UBool CollationElementIterator::operator!=(
138                                   const CollationElementIterator& other) const
139 {
140     return !(*this == other);
141 }
142 
operator ==(const CollationElementIterator & that) const143 UBool CollationElementIterator::operator==(
144                                     const CollationElementIterator& that) const
145 {
146     if (this == &that) {
147         return TRUE;
148     }
149 
150     return
151         (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
152         otherHalf_ == that.otherHalf_ &&
153         normalizeDir() == that.normalizeDir() &&
154         string_ == that.string_ &&
155         *iter_ == *that.iter_;
156 }
157 
158 /**
159 * Get the ordering priority of the previous collation element in the string.
160 * @param status the error code status.
161 * @return the previous element's ordering. Returns NULLORDER if an error has
162 *         occured or if the start of string has been reached.
163 */
previous(UErrorCode & status)164 int32_t CollationElementIterator::previous(UErrorCode& status)
165 {
166     if (U_FAILURE(status)) { return NULLORDER; }
167     if (dir_ < 0) {
168         // Continue backwards iteration. Test this first.
169         if (otherHalf_ != 0) {
170             uint32_t oh = otherHalf_;
171             otherHalf_ = 0;
172             return oh;
173         }
174     } else if (dir_ == 0) {
175         iter_->resetToOffset(string_.length());
176         dir_ = -1;
177     } else if (dir_ == 1) {
178         // previous() after setOffset()
179         dir_ = -1;
180     } else /* dir_ > 1 */ {
181         // illegal change of direction
182         status = U_INVALID_STATE_ERROR;
183         return NULLORDER;
184     }
185     if (offsets_ == NULL) {
186         offsets_ = new UVector32(status);
187         if (offsets_ == NULL) {
188             status = U_MEMORY_ALLOCATION_ERROR;
189             return NULLORDER;
190         }
191     }
192     // If we already have expansion CEs, then we also have offsets.
193     // Otherwise remember the trailing offset in case we need to
194     // write offsets for an artificial expansion.
195     int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
196     int64_t ce = iter_->previousCE(*offsets_, status);
197     if (ce == Collation::NO_CE) { return NULLORDER; }
198     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
199     uint32_t p = (uint32_t)(ce >> 32);
200     uint32_t lower32 = (uint32_t)ce;
201     uint32_t firstHalf = getFirstHalf(p, lower32);
202     uint32_t secondHalf = getSecondHalf(p, lower32);
203     if (secondHalf != 0) {
204         if (offsets_->isEmpty()) {
205             // When we convert a single 64-bit CE into two 32-bit CEs,
206             // we need to make this artificial expansion behave like a normal expansion.
207             // See CollationIterator::previousCE().
208             offsets_->addElement(iter_->getOffset(), status);
209             offsets_->addElement(limitOffset, status);
210         }
211         otherHalf_ = firstHalf;
212         return secondHalf | 0xc0;  // continuation CE
213     }
214     return firstHalf;
215 }
216 
217 /**
218 * Resets the cursor to the beginning of the string.
219 */
reset()220 void CollationElementIterator::reset()
221 {
222     iter_ ->resetToOffset(0);
223     otherHalf_ = 0;
224     dir_ = 0;
225 }
226 
setOffset(int32_t newOffset,UErrorCode & status)227 void CollationElementIterator::setOffset(int32_t newOffset,
228                                          UErrorCode& status)
229 {
230     if (U_FAILURE(status)) { return; }
231     if (0 < newOffset && newOffset < string_.length()) {
232         int32_t offset = newOffset;
233         do {
234             UChar c = string_.charAt(offset);
235             if (!rbc_->isUnsafe(c) ||
236                     (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
237                 break;
238             }
239             // Back up to before this unsafe character.
240             --offset;
241         } while (offset > 0);
242         if (offset < newOffset) {
243             // We might have backed up more than necessary.
244             // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
245             // but for text "chu" setOffset(2) should remain at 2
246             // although we initially back up to offset 0.
247             // Find the last safe offset no greater than newOffset by iterating forward.
248             int32_t lastSafeOffset = offset;
249             do {
250                 iter_->resetToOffset(lastSafeOffset);
251                 do {
252                     iter_->nextCE(status);
253                     if (U_FAILURE(status)) { return; }
254                 } while ((offset = iter_->getOffset()) == lastSafeOffset);
255                 if (offset <= newOffset) {
256                     lastSafeOffset = offset;
257                 }
258             } while (offset < newOffset);
259             newOffset = lastSafeOffset;
260         }
261     }
262     iter_->resetToOffset(newOffset);
263     otherHalf_ = 0;
264     dir_ = 1;
265 }
266 
267 /**
268 * Sets the source to the new source string.
269 */
setText(const UnicodeString & source,UErrorCode & status)270 void CollationElementIterator::setText(const UnicodeString& source,
271                                        UErrorCode& status)
272 {
273     if (U_FAILURE(status)) {
274         return;
275     }
276 
277     string_ = source;
278     const UChar *s = string_.getBuffer();
279     CollationIterator *newIter;
280     UBool numeric = rbc_->settings->isNumeric();
281     if (rbc_->settings->dontCheckFCD()) {
282         newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
283     } else {
284         newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
285     }
286     if (newIter == NULL) {
287         status = U_MEMORY_ALLOCATION_ERROR;
288         return;
289     }
290     delete iter_;
291     iter_ = newIter;
292     otherHalf_ = 0;
293     dir_ = 0;
294 }
295 
296 // Sets the source to the new character iterator.
setText(CharacterIterator & source,UErrorCode & status)297 void CollationElementIterator::setText(CharacterIterator& source,
298                                        UErrorCode& status)
299 {
300     if (U_FAILURE(status))
301         return;
302 
303     source.getText(string_);
304     setText(string_, status);
305 }
306 
strengthOrder(int32_t order) const307 int32_t CollationElementIterator::strengthOrder(int32_t order) const
308 {
309     UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
310     // Mask off the unwanted differences.
311     if (s == UCOL_PRIMARY) {
312         order &= 0xffff0000;
313     }
314     else if (s == UCOL_SECONDARY) {
315         order &= 0xffffff00;
316     }
317 
318     return order;
319 }
320 
321 /* CollationElementIterator private constructors/destructors --------------- */
322 
323 /**
324 * This is the "real" constructor for this class; it constructs an iterator
325 * over the source text using the specified collator
326 */
CollationElementIterator(const UnicodeString & source,const RuleBasedCollator * coll,UErrorCode & status)327 CollationElementIterator::CollationElementIterator(
328                                                const UnicodeString &source,
329                                                const RuleBasedCollator *coll,
330                                                UErrorCode &status)
331         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
332     setText(source, status);
333 }
334 
335 /**
336 * This is the "real" constructor for this class; it constructs an iterator over
337 * the source text using the specified collator
338 */
CollationElementIterator(const CharacterIterator & source,const RuleBasedCollator * coll,UErrorCode & status)339 CollationElementIterator::CollationElementIterator(
340                                            const CharacterIterator &source,
341                                            const RuleBasedCollator *coll,
342                                            UErrorCode &status)
343         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
344     // We only call source.getText() which should be const anyway.
345     setText(const_cast<CharacterIterator &>(source), status);
346 }
347 
348 /* CollationElementIterator private methods -------------------------------- */
349 
operator =(const CollationElementIterator & other)350 const CollationElementIterator& CollationElementIterator::operator=(
351                                          const CollationElementIterator& other)
352 {
353     if (this == &other) {
354         return *this;
355     }
356 
357     CollationIterator *newIter;
358     const FCDUTF16CollationIterator *otherFCDIter =
359             dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
360     if(otherFCDIter != NULL) {
361         newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
362     } else {
363         const UTF16CollationIterator *otherIter =
364                 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
365         if(otherIter != NULL) {
366             newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
367         } else {
368             newIter = NULL;
369         }
370     }
371     if(newIter != NULL) {
372         delete iter_;
373         iter_ = newIter;
374         rbc_ = other.rbc_;
375         otherHalf_ = other.otherHalf_;
376         dir_ = other.dir_;
377 
378         string_ = other.string_;
379     }
380     if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
381         UErrorCode errorCode = U_ZERO_ERROR;
382         if(offsets_ == NULL) {
383             offsets_ = new UVector32(other.offsets_->size(), errorCode);
384         }
385         if(offsets_ != NULL) {
386             offsets_->assign(*other.offsets_, errorCode);
387         }
388     }
389     return *this;
390 }
391 
392 namespace {
393 
394 class MaxExpSink : public ContractionsAndExpansions::CESink {
395 public:
MaxExpSink(UHashtable * h,UErrorCode & ec)396     MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
397     virtual ~MaxExpSink();
handleCE(int64_t)398     virtual void handleCE(int64_t /*ce*/) {}
handleExpansion(const int64_t ces[],int32_t length)399     virtual void handleExpansion(const int64_t ces[], int32_t length) {
400         if (length <= 1) {
401             // We do not need to add single CEs into the map.
402             return;
403         }
404         int32_t count = 0;  // number of CE "halves"
405         for (int32_t i = 0; i < length; ++i) {
406             count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
407         }
408         // last "half" of the last CE
409         int64_t ce = ces[length - 1];
410         uint32_t p = (uint32_t)(ce >> 32);
411         uint32_t lower32 = (uint32_t)ce;
412         uint32_t lastHalf = getSecondHalf(p, lower32);
413         if (lastHalf == 0) {
414             lastHalf = getFirstHalf(p, lower32);
415             U_ASSERT(lastHalf != 0);
416         } else {
417             lastHalf |= 0xc0;  // old-style continuation CE
418         }
419         if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
420             uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
421         }
422     }
423 
424 private:
425     UHashtable *maxExpansions;
426     UErrorCode &errorCode;
427 };
428 
~MaxExpSink()429 MaxExpSink::~MaxExpSink() {}
430 
431 }  // namespace
432 
433 UHashtable *
computeMaxExpansions(const CollationData * data,UErrorCode & errorCode)434 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
435     if (U_FAILURE(errorCode)) { return NULL; }
436     UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
437                                            uhash_compareLong, &errorCode);
438     if (U_FAILURE(errorCode)) { return NULL; }
439     MaxExpSink sink(maxExpansions, errorCode);
440     ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
441     if (U_FAILURE(errorCode)) {
442         uhash_close(maxExpansions);
443         return NULL;
444     }
445     return maxExpansions;
446 }
447 
448 int32_t
getMaxExpansion(int32_t order) const449 CollationElementIterator::getMaxExpansion(int32_t order) const {
450     return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
451 }
452 
453 int32_t
getMaxExpansion(const UHashtable * maxExpansions,int32_t order)454 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
455     if (order == 0) { return 1; }
456     int32_t max;
457     if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
458         return max;
459     }
460     if ((order & 0xc0) == 0xc0) {
461         // old-style continuation CE
462         return 2;
463     } else {
464         return 1;
465     }
466 }
467 
468 U_NAMESPACE_END
469 
470 #endif /* #if !UCONFIG_NO_COLLATION */
471