1 /*
2 ******************************************************************************
3 * Copyright (C) 2001-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 * 2012-2014 markus Rewritten in C++ again.
15 ******************************************************************************/
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_COLLATION
20
21 #include "unicode/coleitr.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/ucoleitr.h"
24 #include "unicode/ustring.h"
25 #include "unicode/sortkey.h"
26 #include "unicode/uobject.h"
27 #include "cmemory.h"
28 #include "usrchimp.h"
29
30 U_NAMESPACE_USE
31
32 #define BUFFER_LENGTH 100
33
34 #define DEFAULT_BUFFER_SIZE 16
35 #define BUFFER_GROW 8
36
37 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
38
39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
40
41 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
42
43 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
44
45 #define DELETE_ARRAY(array) uprv_free((void *) (array))
46
47 struct RCEI
48 {
49 uint32_t ce;
50 int32_t low;
51 int32_t high;
52 };
53
54 U_NAMESPACE_BEGIN
55
56 struct RCEBuffer
57 {
58 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
59 RCEI *buffer;
60 int32_t bufferIndex;
61 int32_t bufferSize;
62
63 RCEBuffer();
64 ~RCEBuffer();
65
66 UBool empty() const;
67 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
68 const RCEI *get();
69 };
70
RCEBuffer()71 RCEBuffer::RCEBuffer()
72 {
73 buffer = defaultBuffer;
74 bufferIndex = 0;
75 bufferSize = UPRV_LENGTHOF(defaultBuffer);
76 }
77
~RCEBuffer()78 RCEBuffer::~RCEBuffer()
79 {
80 if (buffer != defaultBuffer) {
81 DELETE_ARRAY(buffer);
82 }
83 }
84
empty() const85 UBool RCEBuffer::empty() const
86 {
87 return bufferIndex <= 0;
88 }
89
put(uint32_t ce,int32_t ixLow,int32_t ixHigh)90 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
91 {
92 if (bufferIndex >= bufferSize) {
93 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
94
95 ARRAY_COPY(newBuffer, buffer, bufferSize);
96
97 if (buffer != defaultBuffer) {
98 DELETE_ARRAY(buffer);
99 }
100
101 buffer = newBuffer;
102 bufferSize += BUFFER_GROW;
103 }
104
105 buffer[bufferIndex].ce = ce;
106 buffer[bufferIndex].low = ixLow;
107 buffer[bufferIndex].high = ixHigh;
108
109 bufferIndex += 1;
110 }
111
get()112 const RCEI *RCEBuffer::get()
113 {
114 if (bufferIndex > 0) {
115 return &buffer[--bufferIndex];
116 }
117
118 return NULL;
119 }
120
PCEBuffer()121 PCEBuffer::PCEBuffer()
122 {
123 buffer = defaultBuffer;
124 bufferIndex = 0;
125 bufferSize = UPRV_LENGTHOF(defaultBuffer);
126 }
127
~PCEBuffer()128 PCEBuffer::~PCEBuffer()
129 {
130 if (buffer != defaultBuffer) {
131 DELETE_ARRAY(buffer);
132 }
133 }
134
reset()135 void PCEBuffer::reset()
136 {
137 bufferIndex = 0;
138 }
139
empty() const140 UBool PCEBuffer::empty() const
141 {
142 return bufferIndex <= 0;
143 }
144
put(uint64_t ce,int32_t ixLow,int32_t ixHigh)145 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
146 {
147 if (bufferIndex >= bufferSize) {
148 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
149
150 ARRAY_COPY(newBuffer, buffer, bufferSize);
151
152 if (buffer != defaultBuffer) {
153 DELETE_ARRAY(buffer);
154 }
155
156 buffer = newBuffer;
157 bufferSize += BUFFER_GROW;
158 }
159
160 buffer[bufferIndex].ce = ce;
161 buffer[bufferIndex].low = ixLow;
162 buffer[bufferIndex].high = ixHigh;
163
164 bufferIndex += 1;
165 }
166
get()167 const PCEI *PCEBuffer::get()
168 {
169 if (bufferIndex > 0) {
170 return &buffer[--bufferIndex];
171 }
172
173 return NULL;
174 }
175
UCollationPCE(UCollationElements * elems)176 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
177
UCollationPCE(CollationElementIterator * iter)178 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
179
init(UCollationElements * elems)180 void UCollationPCE::init(UCollationElements *elems) {
181 init(CollationElementIterator::fromUCollationElements(elems));
182 }
183
init(CollationElementIterator * iter)184 void UCollationPCE::init(CollationElementIterator *iter)
185 {
186 cei = iter;
187 init(*iter->rbc_);
188 }
189
init(const Collator & coll)190 void UCollationPCE::init(const Collator &coll)
191 {
192 UErrorCode status = U_ZERO_ERROR;
193
194 strength = coll.getAttribute(UCOL_STRENGTH, status);
195 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
196 isShifted = FALSE;
197 variableTop = coll.getVariableTop(status);
198 }
199
~UCollationPCE()200 UCollationPCE::~UCollationPCE()
201 {
202 // nothing to do
203 }
204
processCE(uint32_t ce)205 uint64_t UCollationPCE::processCE(uint32_t ce)
206 {
207 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
208
209 // This is clean, but somewhat slow...
210 // We could apply the mask to ce and then
211 // just get all three orders...
212 switch(strength) {
213 default:
214 tertiary = ucol_tertiaryOrder(ce);
215 /* note fall-through */
216
217 case UCOL_SECONDARY:
218 secondary = ucol_secondaryOrder(ce);
219 /* note fall-through */
220
221 case UCOL_PRIMARY:
222 primary = ucol_primaryOrder(ce);
223 }
224
225 // **** This should probably handle continuations too. ****
226 // **** That means that we need 24 bits for the primary ****
227 // **** instead of the 16 that we're currently using. ****
228 // **** So we can lay out the 64 bits as: 24.12.12.16. ****
229 // **** Another complication with continuations is that ****
230 // **** the *second* CE is marked as a continuation, so ****
231 // **** we always have to peek ahead to know how long ****
232 // **** the primary is... ****
233 if ((toShift && variableTop > ce && primary != 0)
234 || (isShifted && primary == 0)) {
235
236 if (primary == 0) {
237 return UCOL_IGNORABLE;
238 }
239
240 if (strength >= UCOL_QUATERNARY) {
241 quaternary = primary;
242 }
243
244 primary = secondary = tertiary = 0;
245 isShifted = TRUE;
246 } else {
247 if (strength >= UCOL_QUATERNARY) {
248 quaternary = 0xFFFF;
249 }
250
251 isShifted = FALSE;
252 }
253
254 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
255 }
256
257 U_NAMESPACE_END
258
259 /* public methods ---------------------------------------------------- */
260
261 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)262 ucol_openElements(const UCollator *coll,
263 const UChar *text,
264 int32_t textLength,
265 UErrorCode *status)
266 {
267 if (U_FAILURE(*status)) {
268 return NULL;
269 }
270 if (coll == NULL || (text == NULL && textLength != 0)) {
271 *status = U_ILLEGAL_ARGUMENT_ERROR;
272 return NULL;
273 }
274 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
275 if (rbc == NULL) {
276 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator
277 return NULL;
278 }
279
280 UnicodeString s((UBool)(textLength < 0), text, textLength);
281 CollationElementIterator *cei = rbc->createCollationElementIterator(s);
282 if (cei == NULL) {
283 *status = U_MEMORY_ALLOCATION_ERROR;
284 return NULL;
285 }
286
287 return cei->toUCollationElements();
288 }
289
290
291 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)292 ucol_closeElements(UCollationElements *elems)
293 {
294 delete CollationElementIterator::fromUCollationElements(elems);
295 }
296
297 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)298 ucol_reset(UCollationElements *elems)
299 {
300 CollationElementIterator::fromUCollationElements(elems)->reset();
301 }
302
303 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)304 ucol_next(UCollationElements *elems,
305 UErrorCode *status)
306 {
307 if (U_FAILURE(*status)) {
308 return UCOL_NULLORDER;
309 }
310
311 return CollationElementIterator::fromUCollationElements(elems)->next(*status);
312 }
313
314 U_NAMESPACE_BEGIN
315
316 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)317 UCollationPCE::nextProcessed(
318 int32_t *ixLow,
319 int32_t *ixHigh,
320 UErrorCode *status)
321 {
322 int64_t result = UCOL_IGNORABLE;
323 uint32_t low = 0, high = 0;
324
325 if (U_FAILURE(*status)) {
326 return UCOL_PROCESSED_NULLORDER;
327 }
328
329 pceBuffer.reset();
330
331 do {
332 low = cei->getOffset();
333 int32_t ce = cei->next(*status);
334 high = cei->getOffset();
335
336 if (ce == UCOL_NULLORDER) {
337 result = UCOL_PROCESSED_NULLORDER;
338 break;
339 }
340
341 result = processCE((uint32_t)ce);
342 } while (result == UCOL_IGNORABLE);
343
344 if (ixLow != NULL) {
345 *ixLow = low;
346 }
347
348 if (ixHigh != NULL) {
349 *ixHigh = high;
350 }
351
352 return result;
353 }
354
355 U_NAMESPACE_END
356
357 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)358 ucol_previous(UCollationElements *elems,
359 UErrorCode *status)
360 {
361 if(U_FAILURE(*status)) {
362 return UCOL_NULLORDER;
363 }
364 return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
365 }
366
367 U_NAMESPACE_BEGIN
368
369 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)370 UCollationPCE::previousProcessed(
371 int32_t *ixLow,
372 int32_t *ixHigh,
373 UErrorCode *status)
374 {
375 int64_t result = UCOL_IGNORABLE;
376 int32_t low = 0, high = 0;
377
378 if (U_FAILURE(*status)) {
379 return UCOL_PROCESSED_NULLORDER;
380 }
381
382 // pceBuffer.reset();
383
384 while (pceBuffer.empty()) {
385 // buffer raw CEs up to non-ignorable primary
386 RCEBuffer rceb;
387 int32_t ce;
388
389 // **** do we need to reset rceb, or will it always be empty at this point ****
390 do {
391 high = cei->getOffset();
392 ce = cei->previous(*status);
393 low = cei->getOffset();
394
395 if (ce == UCOL_NULLORDER) {
396 if (! rceb.empty()) {
397 break;
398 }
399
400 goto finish;
401 }
402
403 rceb.put((uint32_t)ce, low, high);
404 } while ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce));
405
406 // process the raw CEs
407 while (! rceb.empty()) {
408 const RCEI *rcei = rceb.get();
409
410 result = processCE(rcei->ce);
411
412 if (result != UCOL_IGNORABLE) {
413 pceBuffer.put(result, rcei->low, rcei->high);
414 }
415 }
416 }
417
418 finish:
419 if (pceBuffer.empty()) {
420 // **** Is -1 the right value for ixLow, ixHigh? ****
421 if (ixLow != NULL) {
422 *ixLow = -1;
423 }
424
425 if (ixHigh != NULL) {
426 *ixHigh = -1
427 ;
428 }
429 return UCOL_PROCESSED_NULLORDER;
430 }
431
432 const PCEI *pcei = pceBuffer.get();
433
434 if (ixLow != NULL) {
435 *ixLow = pcei->low;
436 }
437
438 if (ixHigh != NULL) {
439 *ixHigh = pcei->high;
440 }
441
442 return pcei->ce;
443 }
444
445 U_NAMESPACE_END
446
447 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)448 ucol_getMaxExpansion(const UCollationElements *elems,
449 int32_t order)
450 {
451 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
452
453 // TODO: The old code masked the order according to strength and then did a binary search.
454 // However this was probably at least partially broken because of the following comment.
455 // Still, it might have found a match when this version may not.
456
457 // FIXME: with a masked search, there might be more than one hit,
458 // so we need to look forward and backward from the match to find all
459 // of the hits...
460 }
461
462 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)463 ucol_setText( UCollationElements *elems,
464 const UChar *text,
465 int32_t textLength,
466 UErrorCode *status)
467 {
468 if (U_FAILURE(*status)) {
469 return;
470 }
471
472 if ((text == NULL && textLength != 0)) {
473 *status = U_ILLEGAL_ARGUMENT_ERROR;
474 return;
475 }
476 UnicodeString s((UBool)(textLength < 0), text, textLength);
477 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
478 }
479
480 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)481 ucol_getOffset(const UCollationElements *elems)
482 {
483 return CollationElementIterator::fromUCollationElements(elems)->getOffset();
484 }
485
486 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)487 ucol_setOffset(UCollationElements *elems,
488 int32_t offset,
489 UErrorCode *status)
490 {
491 if (U_FAILURE(*status)) {
492 return;
493 }
494
495 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
496 }
497
498 U_CAPI int32_t U_EXPORT2
ucol_primaryOrder(int32_t order)499 ucol_primaryOrder (int32_t order)
500 {
501 return (order >> 16) & 0xffff;
502 }
503
504 U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder(int32_t order)505 ucol_secondaryOrder (int32_t order)
506 {
507 return (order >> 8) & 0xff;
508 }
509
510 U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder(int32_t order)511 ucol_tertiaryOrder (int32_t order)
512 {
513 return order & 0xff;
514 }
515
516 #endif /* #if !UCONFIG_NO_COLLATION */
517