1 /*
2 ******************************************************************************
3 * Copyright (C) 2001-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 * 2012-2014 markus Rewritten in C++ again.
15 ******************************************************************************/
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_COLLATION
20
21 #include "unicode/coleitr.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/ucoleitr.h"
24 #include "unicode/ustring.h"
25 #include "unicode/sortkey.h"
26 #include "unicode/uobject.h"
27 #include "cmemory.h"
28 #include "usrchimp.h"
29
30 U_NAMESPACE_USE
31
/* Size constant; not referenced by the code visible in this file. */
#define BUFFER_LENGTH 100

/* Number of entries in the inline defaultBuffer array of a CE buffer. */
#define DEFAULT_BUFFER_SIZE 16
/* Number of entries added each time a full CE buffer is reallocated. */
#define BUFFER_GROW 8

/* Copies `count` elements from `src` to `dst`; the element size is taken from `src`. */
#define ARRAY_COPY(dst, src, count) \
    uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof((src)[0]))

/* Requests storage for `count` objects of `type` from uprv_malloc(); no constructors are run. */
#define NEW_ARRAY(type, count) \
    (type *) uprv_malloc((count) * sizeof(type))

/* Hands `array` back to uprv_free(). */
#define DELETE_ARRAY(array) \
    uprv_free((void *) (array))
42
/**
 * One raw collation element together with the two iterator offsets
 * recorded around the call that produced it.
 */
struct RCEI
{
    uint32_t ce;    /* raw 32-bit collation element */
    int32_t  low;   /* value stored from the ixLow argument of RCEBuffer::put() */
    int32_t  high;  /* value stored from the ixHigh argument of RCEBuffer::put() */
};
49
50 U_NAMESPACE_BEGIN
51
52 struct RCEBuffer
53 {
54 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
55 RCEI *buffer;
56 int32_t bufferIndex;
57 int32_t bufferSize;
58
59 RCEBuffer();
60 ~RCEBuffer();
61
62 UBool isEmpty() const;
63 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
64 const RCEI *get();
65 };
66
RCEBuffer()67 RCEBuffer::RCEBuffer()
68 {
69 buffer = defaultBuffer;
70 bufferIndex = 0;
71 bufferSize = UPRV_LENGTHOF(defaultBuffer);
72 }
73
~RCEBuffer()74 RCEBuffer::~RCEBuffer()
75 {
76 if (buffer != defaultBuffer) {
77 DELETE_ARRAY(buffer);
78 }
79 }
80
isEmpty() const81 UBool RCEBuffer::isEmpty() const
82 {
83 return bufferIndex <= 0;
84 }
85
put(uint32_t ce,int32_t ixLow,int32_t ixHigh,UErrorCode & errorCode)86 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
87 {
88 if (U_FAILURE(errorCode)) {
89 return;
90 }
91 if (bufferIndex >= bufferSize) {
92 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
93 if (newBuffer == NULL) {
94 errorCode = U_MEMORY_ALLOCATION_ERROR;
95 return;
96 }
97
98 ARRAY_COPY(newBuffer, buffer, bufferSize);
99
100 if (buffer != defaultBuffer) {
101 DELETE_ARRAY(buffer);
102 }
103
104 buffer = newBuffer;
105 bufferSize += BUFFER_GROW;
106 }
107
108 buffer[bufferIndex].ce = ce;
109 buffer[bufferIndex].low = ixLow;
110 buffer[bufferIndex].high = ixHigh;
111
112 bufferIndex += 1;
113 }
114
get()115 const RCEI *RCEBuffer::get()
116 {
117 if (bufferIndex > 0) {
118 return &buffer[--bufferIndex];
119 }
120
121 return NULL;
122 }
123
PCEBuffer()124 PCEBuffer::PCEBuffer()
125 {
126 buffer = defaultBuffer;
127 bufferIndex = 0;
128 bufferSize = UPRV_LENGTHOF(defaultBuffer);
129 }
130
~PCEBuffer()131 PCEBuffer::~PCEBuffer()
132 {
133 if (buffer != defaultBuffer) {
134 DELETE_ARRAY(buffer);
135 }
136 }
137
reset()138 void PCEBuffer::reset()
139 {
140 bufferIndex = 0;
141 }
142
isEmpty() const143 UBool PCEBuffer::isEmpty() const
144 {
145 return bufferIndex <= 0;
146 }
147
put(uint64_t ce,int32_t ixLow,int32_t ixHigh,UErrorCode & errorCode)148 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
149 {
150 if (U_FAILURE(errorCode)) {
151 return;
152 }
153 if (bufferIndex >= bufferSize) {
154 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
155 if (newBuffer == NULL) {
156 errorCode = U_MEMORY_ALLOCATION_ERROR;
157 return;
158 }
159
160 ARRAY_COPY(newBuffer, buffer, bufferSize);
161
162 if (buffer != defaultBuffer) {
163 DELETE_ARRAY(buffer);
164 }
165
166 buffer = newBuffer;
167 bufferSize += BUFFER_GROW;
168 }
169
170 buffer[bufferIndex].ce = ce;
171 buffer[bufferIndex].low = ixLow;
172 buffer[bufferIndex].high = ixHigh;
173
174 bufferIndex += 1;
175 }
176
get()177 const PCEI *PCEBuffer::get()
178 {
179 if (bufferIndex > 0) {
180 return &buffer[--bufferIndex];
181 }
182
183 return NULL;
184 }
185
UCollationPCE(UCollationElements * elems)186 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
187
UCollationPCE(CollationElementIterator * iter)188 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
189
init(UCollationElements * elems)190 void UCollationPCE::init(UCollationElements *elems) {
191 init(CollationElementIterator::fromUCollationElements(elems));
192 }
193
init(CollationElementIterator * iter)194 void UCollationPCE::init(CollationElementIterator *iter)
195 {
196 cei = iter;
197 init(*iter->rbc_);
198 }
199
init(const Collator & coll)200 void UCollationPCE::init(const Collator &coll)
201 {
202 UErrorCode status = U_ZERO_ERROR;
203
204 strength = coll.getAttribute(UCOL_STRENGTH, status);
205 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
206 isShifted = FALSE;
207 variableTop = coll.getVariableTop(status);
208 }
209
~UCollationPCE()210 UCollationPCE::~UCollationPCE()
211 {
212 // nothing to do
213 }
214
processCE(uint32_t ce)215 uint64_t UCollationPCE::processCE(uint32_t ce)
216 {
217 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
218
219 // This is clean, but somewhat slow...
220 // We could apply the mask to ce and then
221 // just get all three orders...
222 switch(strength) {
223 default:
224 tertiary = ucol_tertiaryOrder(ce);
225 /* note fall-through */
226
227 case UCOL_SECONDARY:
228 secondary = ucol_secondaryOrder(ce);
229 /* note fall-through */
230
231 case UCOL_PRIMARY:
232 primary = ucol_primaryOrder(ce);
233 }
234
235 // **** This should probably handle continuations too. ****
236 // **** That means that we need 24 bits for the primary ****
237 // **** instead of the 16 that we're currently using. ****
238 // **** So we can lay out the 64 bits as: 24.12.12.16. ****
239 // **** Another complication with continuations is that ****
240 // **** the *second* CE is marked as a continuation, so ****
241 // **** we always have to peek ahead to know how long ****
242 // **** the primary is... ****
243 if ((toShift && variableTop > ce && primary != 0)
244 || (isShifted && primary == 0)) {
245
246 if (primary == 0) {
247 return UCOL_IGNORABLE;
248 }
249
250 if (strength >= UCOL_QUATERNARY) {
251 quaternary = primary;
252 }
253
254 primary = secondary = tertiary = 0;
255 isShifted = TRUE;
256 } else {
257 if (strength >= UCOL_QUATERNARY) {
258 quaternary = 0xFFFF;
259 }
260
261 isShifted = FALSE;
262 }
263
264 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
265 }
266
267 U_NAMESPACE_END
268
269 /* public methods ---------------------------------------------------- */
270
271 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)272 ucol_openElements(const UCollator *coll,
273 const UChar *text,
274 int32_t textLength,
275 UErrorCode *status)
276 {
277 if (U_FAILURE(*status)) {
278 return NULL;
279 }
280 if (coll == NULL || (text == NULL && textLength != 0)) {
281 *status = U_ILLEGAL_ARGUMENT_ERROR;
282 return NULL;
283 }
284 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
285 if (rbc == NULL) {
286 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator
287 return NULL;
288 }
289
290 UnicodeString s((UBool)(textLength < 0), text, textLength);
291 CollationElementIterator *cei = rbc->createCollationElementIterator(s);
292 if (cei == NULL) {
293 *status = U_MEMORY_ALLOCATION_ERROR;
294 return NULL;
295 }
296
297 return cei->toUCollationElements();
298 }
299
300
301 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)302 ucol_closeElements(UCollationElements *elems)
303 {
304 delete CollationElementIterator::fromUCollationElements(elems);
305 }
306
307 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)308 ucol_reset(UCollationElements *elems)
309 {
310 CollationElementIterator::fromUCollationElements(elems)->reset();
311 }
312
313 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)314 ucol_next(UCollationElements *elems,
315 UErrorCode *status)
316 {
317 if (U_FAILURE(*status)) {
318 return UCOL_NULLORDER;
319 }
320
321 return CollationElementIterator::fromUCollationElements(elems)->next(*status);
322 }
323
324 U_NAMESPACE_BEGIN
325
326 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)327 UCollationPCE::nextProcessed(
328 int32_t *ixLow,
329 int32_t *ixHigh,
330 UErrorCode *status)
331 {
332 int64_t result = UCOL_IGNORABLE;
333 uint32_t low = 0, high = 0;
334
335 if (U_FAILURE(*status)) {
336 return UCOL_PROCESSED_NULLORDER;
337 }
338
339 pceBuffer.reset();
340
341 do {
342 low = cei->getOffset();
343 int32_t ce = cei->next(*status);
344 high = cei->getOffset();
345
346 if (ce == UCOL_NULLORDER) {
347 result = UCOL_PROCESSED_NULLORDER;
348 break;
349 }
350
351 result = processCE((uint32_t)ce);
352 } while (result == UCOL_IGNORABLE);
353
354 if (ixLow != NULL) {
355 *ixLow = low;
356 }
357
358 if (ixHigh != NULL) {
359 *ixHigh = high;
360 }
361
362 return result;
363 }
364
365 U_NAMESPACE_END
366
367 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)368 ucol_previous(UCollationElements *elems,
369 UErrorCode *status)
370 {
371 if(U_FAILURE(*status)) {
372 return UCOL_NULLORDER;
373 }
374 return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
375 }
376
377 U_NAMESPACE_BEGIN
378
379 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)380 UCollationPCE::previousProcessed(
381 int32_t *ixLow,
382 int32_t *ixHigh,
383 UErrorCode *status)
384 {
385 int64_t result = UCOL_IGNORABLE;
386 int32_t low = 0, high = 0;
387
388 if (U_FAILURE(*status)) {
389 return UCOL_PROCESSED_NULLORDER;
390 }
391
392 // pceBuffer.reset();
393
394 while (pceBuffer.isEmpty()) {
395 // buffer raw CEs up to non-ignorable primary
396 RCEBuffer rceb;
397 int32_t ce;
398
399 // **** do we need to reset rceb, or will it always be empty at this point ****
400 do {
401 high = cei->getOffset();
402 ce = cei->previous(*status);
403 low = cei->getOffset();
404
405 if (ce == UCOL_NULLORDER) {
406 if (!rceb.isEmpty()) {
407 break;
408 }
409
410 goto finish;
411 }
412
413 rceb.put((uint32_t)ce, low, high, *status);
414 } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
415
416 // process the raw CEs
417 while (U_SUCCESS(*status) && !rceb.isEmpty()) {
418 const RCEI *rcei = rceb.get();
419
420 result = processCE(rcei->ce);
421
422 if (result != UCOL_IGNORABLE) {
423 pceBuffer.put(result, rcei->low, rcei->high, *status);
424 }
425 }
426 if (U_FAILURE(*status)) {
427 return UCOL_PROCESSED_NULLORDER;
428 }
429 }
430
431 finish:
432 if (pceBuffer.isEmpty()) {
433 // **** Is -1 the right value for ixLow, ixHigh? ****
434 if (ixLow != NULL) {
435 *ixLow = -1;
436 }
437
438 if (ixHigh != NULL) {
439 *ixHigh = -1
440 ;
441 }
442 return UCOL_PROCESSED_NULLORDER;
443 }
444
445 const PCEI *pcei = pceBuffer.get();
446
447 if (ixLow != NULL) {
448 *ixLow = pcei->low;
449 }
450
451 if (ixHigh != NULL) {
452 *ixHigh = pcei->high;
453 }
454
455 return pcei->ce;
456 }
457
458 U_NAMESPACE_END
459
460 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)461 ucol_getMaxExpansion(const UCollationElements *elems,
462 int32_t order)
463 {
464 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
465
466 // TODO: The old code masked the order according to strength and then did a binary search.
467 // However this was probably at least partially broken because of the following comment.
468 // Still, it might have found a match when this version may not.
469
470 // FIXME: with a masked search, there might be more than one hit,
471 // so we need to look forward and backward from the match to find all
472 // of the hits...
473 }
474
475 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)476 ucol_setText( UCollationElements *elems,
477 const UChar *text,
478 int32_t textLength,
479 UErrorCode *status)
480 {
481 if (U_FAILURE(*status)) {
482 return;
483 }
484
485 if ((text == NULL && textLength != 0)) {
486 *status = U_ILLEGAL_ARGUMENT_ERROR;
487 return;
488 }
489 UnicodeString s((UBool)(textLength < 0), text, textLength);
490 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
491 }
492
493 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)494 ucol_getOffset(const UCollationElements *elems)
495 {
496 return CollationElementIterator::fromUCollationElements(elems)->getOffset();
497 }
498
499 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)500 ucol_setOffset(UCollationElements *elems,
501 int32_t offset,
502 UErrorCode *status)
503 {
504 if (U_FAILURE(*status)) {
505 return;
506 }
507
508 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
509 }
510
/**
 * Extracts the primary weight of a collation element.
 *
 * @param order the 32-bit collation element
 * @return bits 16..31 of `order`, as a value in the range 0..0xffff
 */
U_CAPI int32_t U_EXPORT2
ucol_primaryOrder (int32_t order)
{
    const int32_t upperHalf = order >> 16;

    return upperHalf & 0xffff;
}
516
/**
 * Extracts the secondary weight of a collation element.
 *
 * @param order the 32-bit collation element
 * @return bits 8..15 of `order`, as a value in the range 0..0xff
 */
U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder (int32_t order)
{
    const int32_t withoutLowByte = order >> 8;

    return withoutLowByte & 0xff;
}
522
/**
 * Extracts the tertiary weight of a collation element.
 *
 * @param order the 32-bit collation element
 * @return bits 0..7 of `order`, as a value in the range 0..0xff
 */
U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder (int32_t order)
{
    const int32_t lowByteMask = 0xff;

    return order & lowByteMask;
}
528
529 #endif /* #if !UCONFIG_NO_COLLATION */
530