1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2008-2011, International Business Machines
7 * Corporation, Google and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 */
11 // Author : eldawy@google.com (Mohamed Eldawy)
12 // ucnvsel.cpp
13 //
14 // Purpose: To generate a list of encodings capable of handling
15 // a given Unicode text
16 //
17 // Started 09-April-2008
18
19 /**
20 * \file
21 *
22 * This is an implementation of an encoding selector.
23 * The goal is, given a unicode string, find the encodings
24 * this string can be mapped to. To make processing faster
25 * a trie is built when you call ucnvsel_open() that
26 * stores all encodings a codepoint can map to
27 */
28
29 #include "unicode/ucnvsel.h"
30
31 #if !UCONFIG_NO_CONVERSION
32
33 #include <string.h>
34
35 #include "unicode/uchar.h"
36 #include "unicode/uniset.h"
37 #include "unicode/ucnv.h"
38 #include "unicode/ustring.h"
39 #include "unicode/uchriter.h"
40 #include "utrie2.h"
41 #include "propsvec.h"
42 #include "uassert.h"
43 #include "ucmndata.h"
44 #include "uenumimp.h"
45 #include "cmemory.h"
46 #include "cstring.h"
47
48 U_NAMESPACE_USE
49
// State of an encoding selector. Instances are created by ucnvsel_open() or
// ucnvsel_openFromSerialized() and released by ucnvsel_close().
struct UConverterSelector {
  UTrie2 *trie;              // 16 bit trie containing offsets into pv
  uint32_t* pv;              // table of bits! One bit per encoding,
                             // (encodingsCount+31)/32 words per row
  int32_t pvCount;           // number of uint32_t words in pv
  char** encodings;          // which encodings did user ask to use?
                             // encodings[0] is the start of the block holding all names
  int32_t encodingsCount;    // number of entries in encodings
  int32_t encodingStrLength; // byte length of all names, including NULs and padding
  uint8_t* swapped;          // swapped copy of the serialized data made by
                             // ucnvsel_openFromSerialized(), or NULL; freed on close
  UBool ownPv, ownEncodingStrings;  // whether ucnvsel_close() frees pv / encodings[0]
};
60
generateSelectorData(UConverterSelector * result,UPropsVectors * upvec,const USet * excludedCodePoints,const UConverterUnicodeSet whichSet,UErrorCode * status)61 static void generateSelectorData(UConverterSelector* result,
62 UPropsVectors *upvec,
63 const USet* excludedCodePoints,
64 const UConverterUnicodeSet whichSet,
65 UErrorCode* status) {
66 if (U_FAILURE(*status)) {
67 return;
68 }
69
70 int32_t columns = (result->encodingsCount+31)/32;
71
72 // set errorValue to all-ones
73 for (int32_t col = 0; col < columns; col++) {
74 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
75 col, ~0, ~0, status);
76 }
77
78 for (int32_t i = 0; i < result->encodingsCount; ++i) {
79 uint32_t mask;
80 uint32_t column;
81 int32_t item_count;
82 int32_t j;
83 UConverter* test_converter = ucnv_open(result->encodings[i], status);
84 if (U_FAILURE(*status)) {
85 return;
86 }
87 USet* unicode_point_set;
88 unicode_point_set = uset_open(1, 0); // empty set
89
90 ucnv_getUnicodeSet(test_converter, unicode_point_set,
91 whichSet, status);
92 if (U_FAILURE(*status)) {
93 ucnv_close(test_converter);
94 return;
95 }
96
97 column = i / 32;
98 mask = 1 << (i%32);
99 // now iterate over intervals on set i!
100 item_count = uset_getItemCount(unicode_point_set);
101
102 for (j = 0; j < item_count; ++j) {
103 UChar32 start_char;
104 UChar32 end_char;
105 UErrorCode smallStatus = U_ZERO_ERROR;
106 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
107 &smallStatus);
108 if (U_FAILURE(smallStatus)) {
109 // this will be reached for the converters that fill the set with
110 // strings. Those should be ignored by our system
111 } else {
112 upvec_setValue(upvec, start_char, end_char, column, ~0, mask,
113 status);
114 }
115 }
116 ucnv_close(test_converter);
117 uset_close(unicode_point_set);
118 if (U_FAILURE(*status)) {
119 return;
120 }
121 }
122
123 // handle excluded encodings! Simply set their values to all 1's in the upvec
124 if (excludedCodePoints) {
125 int32_t item_count = uset_getItemCount(excludedCodePoints);
126 for (int32_t j = 0; j < item_count; ++j) {
127 UChar32 start_char;
128 UChar32 end_char;
129
130 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0,
131 status);
132 for (int32_t col = 0; col < columns; col++) {
133 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0,
134 status);
135 }
136 }
137 }
138
139 // alright. Now, let's put things in the same exact form you'd get when you
140 // unserialize things.
141 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
142 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status);
143 result->pvCount *= columns; // number of uint32_t = rows * columns
144 result->ownPv = TRUE;
145 }
146
147 /* open a selector. If converterListSize is 0, build for all converters.
148 If excludedCodePoints is NULL, don't exclude any codepoints */
149 U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_open(const char * const * converterList,int32_t converterListSize,const USet * excludedCodePoints,const UConverterUnicodeSet whichSet,UErrorCode * status)150 ucnvsel_open(const char* const* converterList, int32_t converterListSize,
151 const USet* excludedCodePoints,
152 const UConverterUnicodeSet whichSet, UErrorCode* status) {
153 // check if already failed
154 if (U_FAILURE(*status)) {
155 return NULL;
156 }
157 // ensure args make sense!
158 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) {
159 *status = U_ILLEGAL_ARGUMENT_ERROR;
160 return NULL;
161 }
162
163 // allocate a new converter
164 LocalUConverterSelectorPointer newSelector(
165 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)));
166 if (newSelector.isNull()) {
167 *status = U_MEMORY_ALLOCATION_ERROR;
168 return NULL;
169 }
170 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector));
171
172 if (converterListSize == 0) {
173 converterList = NULL;
174 converterListSize = ucnv_countAvailable();
175 }
176 newSelector->encodings =
177 (char**)uprv_malloc(converterListSize * sizeof(char*));
178 if (!newSelector->encodings) {
179 *status = U_MEMORY_ALLOCATION_ERROR;
180 return NULL;
181 }
182 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close()
183
184 // make a backup copy of the list of converters
185 int32_t totalSize = 0;
186 int32_t i;
187 for (i = 0; i < converterListSize; i++) {
188 totalSize +=
189 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1;
190 }
191 // 4-align the totalSize to 4-align the size of the serialized form
192 int32_t encodingStrPadding = totalSize & 3;
193 if (encodingStrPadding != 0) {
194 encodingStrPadding = 4 - encodingStrPadding;
195 }
196 newSelector->encodingStrLength = totalSize += encodingStrPadding;
197 char* allStrings = (char*) uprv_malloc(totalSize);
198 if (!allStrings) {
199 *status = U_MEMORY_ALLOCATION_ERROR;
200 return NULL;
201 }
202
203 for (i = 0; i < converterListSize; i++) {
204 newSelector->encodings[i] = allStrings;
205 uprv_strcpy(newSelector->encodings[i],
206 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i));
207 allStrings += uprv_strlen(newSelector->encodings[i]) + 1;
208 }
209 while (encodingStrPadding > 0) {
210 *allStrings++ = 0;
211 --encodingStrPadding;
212 }
213
214 newSelector->ownEncodingStrings = TRUE;
215 newSelector->encodingsCount = converterListSize;
216 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status);
217 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status);
218 upvec_close(upvec);
219
220 if (U_FAILURE(*status)) {
221 return NULL;
222 }
223
224 return newSelector.orphan();
225 }
226
227 /* close opened selector */
228 U_CAPI void U_EXPORT2
ucnvsel_close(UConverterSelector * sel)229 ucnvsel_close(UConverterSelector *sel) {
230 if (!sel) {
231 return;
232 }
233 if (sel->ownEncodingStrings) {
234 uprv_free(sel->encodings[0]);
235 }
236 uprv_free(sel->encodings);
237 if (sel->ownPv) {
238 uprv_free(sel->pv);
239 }
240 utrie2_close(sel->trie);
241 uprv_free(sel->swapped);
242 uprv_free(sel);
243 }
244
// UDataInfo copied into the header by ucnvsel_serialize(). The dataFormat and
// formatVersion[0] values are the ones ucnvsel_swap() and
// ucnvsel_openFromSerialized() check for; isBigEndian and charsetFamily
// describe the platform that wrote the data and decide whether swapping is
// needed on load.
static const UDataInfo dataInfo = {
  sizeof(UDataInfo),            /* size */
  0,                            /* reservedWord */

  U_IS_BIG_ENDIAN,              /* isBigEndian */
  U_CHARSET_FAMILY,             /* charsetFamily */
  U_SIZEOF_UCHAR,               /* sizeofUChar */
  0,                            /* reservedByte */

  { 0x43, 0x53, 0x65, 0x6c },   /* dataFormat="CSel" */
  { 1, 0, 0, 0 },               /* formatVersion */
  { 0, 0, 0, 0 }                /* dataVersion */
};
258
// Positions in the int32_t indexes[] array that follows the DataHeader in the
// serialized form. ucnvsel_serialize() fills the first four entries and
// UCNVSEL_INDEX_SIZE; the entries in between are written as zero.
enum {
  UCNVSEL_INDEX_TRIE_SIZE,      // trie size in bytes
  UCNVSEL_INDEX_PV_COUNT,       // number of uint32_t in the bit vectors
  UCNVSEL_INDEX_NAMES_COUNT,    // number of encoding names
  UCNVSEL_INDEX_NAMES_LENGTH,   // number of encoding name bytes including padding
  UCNVSEL_INDEX_SIZE = 15,      // bytes following the DataHeader
  UCNVSEL_INDEX_COUNT = 16      // number of entries in indexes[]
};
267
268 /*
269 * Serialized form of a UConverterSelector, formatVersion 1:
270 *
271 * The serialized form begins with a standard ICU DataHeader with a UDataInfo
272 * as the template above.
273 * This is followed by:
274 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above
275 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes
276 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors
 *   char encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]];  // NUL-terminated strings + padding
278 */
279
280 /* serialize a selector */
281 U_CAPI int32_t U_EXPORT2
ucnvsel_serialize(const UConverterSelector * sel,void * buffer,int32_t bufferCapacity,UErrorCode * status)282 ucnvsel_serialize(const UConverterSelector* sel,
283 void* buffer, int32_t bufferCapacity, UErrorCode* status) {
284 // check if already failed
285 if (U_FAILURE(*status)) {
286 return 0;
287 }
288 // ensure args make sense!
289 uint8_t *p = (uint8_t *)buffer;
290 if (bufferCapacity < 0 ||
291 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
292 ) {
293 *status = U_ILLEGAL_ARGUMENT_ERROR;
294 return 0;
295 }
296 // add up the size of the serialized form
297 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status);
298 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
299 return 0;
300 }
301 *status = U_ZERO_ERROR;
302
303 DataHeader header;
304 uprv_memset(&header, 0, sizeof(header));
305 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
306 header.dataHeader.magic1 = 0xda;
307 header.dataHeader.magic2 = 0x27;
308 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));
309
310 int32_t indexes[UCNVSEL_INDEX_COUNT] = {
311 serializedTrieSize,
312 sel->pvCount,
313 sel->encodingsCount,
314 sel->encodingStrLength
315 };
316
317 int32_t totalSize =
318 header.dataHeader.headerSize +
319 (int32_t)sizeof(indexes) +
320 serializedTrieSize +
321 sel->pvCount * 4 +
322 sel->encodingStrLength;
323 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize;
324 if (totalSize > bufferCapacity) {
325 *status = U_BUFFER_OVERFLOW_ERROR;
326 return totalSize;
327 }
328 // ok, save!
329 int32_t length = header.dataHeader.headerSize;
330 uprv_memcpy(p, &header, sizeof(header));
331 uprv_memset(p + sizeof(header), 0, length - sizeof(header));
332 p += length;
333
334 length = (int32_t)sizeof(indexes);
335 uprv_memcpy(p, indexes, length);
336 p += length;
337
338 utrie2_serialize(sel->trie, p, serializedTrieSize, status);
339 p += serializedTrieSize;
340
341 length = sel->pvCount * 4;
342 uprv_memcpy(p, sel->pv, length);
343 p += length;
344
345 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength);
346 p += sel->encodingStrLength;
347
348 return totalSize;
349 }
350
/**
 * swap a selector into the desired Endianness and Asciiness of
 * the system. Just as FYI, selectors are always saved in the format
 * of the system that created them. They are only converted if used
 * on another system. In other words, selectors created on different
 * system can be different even if the params are identical (endianness
 * and Asciiness differences only)
 *
 * When length is negative nothing beyond the header call is swapped; the
 * function only reads the indexes and returns the total size (preflighting).
 *
 * @param ds pointer to data swapper containing swapping info
 * @param inData pointer to incoming data
 * @param length length of inData in bytes, or negative to preflight
 * @param outData pointer to output data. Capacity should
 *                be at least equal to capacity of inData
 * @param status an in/out ICU UErrorCode
 * @return 0 on failure, number of bytes swapped on success
 *         number of bytes swapped can be smaller than length
 */
static int32_t
ucnvsel_swap(const UDataSwapper *ds,
             const void *inData, int32_t length,
             void *outData, UErrorCode *status) {
    /* udata_swapDataHeader checks the arguments */
    int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status);
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* check data format and format version */
    /* the UDataInfo is read at byte offset 4 of the input data */
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
    if(!(
        pInfo->dataFormat[0] == 0x43 &&  /* dataFormat="CSel" */
        pInfo->dataFormat[1] == 0x53 &&
        pInfo->dataFormat[2] == 0x65 &&
        pInfo->dataFormat[3] == 0x6c
    )) {
        udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3]);
        *status = U_INVALID_FORMAT_ERROR;
        return 0;
    }
    if(pInfo->formatVersion[0] != 1) {
        udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n",
                         pInfo->formatVersion[0]);
        *status = U_UNSUPPORTED_ERROR;
        return 0;
    }

    /* from here on, a non-negative length counts the bytes after the header */
    if(length >= 0) {
        length -= headerSize;
        if(length < 16*4) {
            udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n",
                             length);
            *status = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
    uint8_t *outBytes = (uint8_t *)outData + headerSize;

    /* read the indexes */
    /* (converted to platform byte order for local use; the output copy is swapped further below) */
    const int32_t *inIndexes = (const int32_t *)inBytes;
    int32_t indexes[16];
    int32_t i;
    for(i = 0; i < 16; ++i) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }

    /* get the total length of the data */
    int32_t size = indexes[UCNVSEL_INDEX_SIZE];
    if(length >= 0) {
        if(length < size) {
            udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n",
                             length);
            *status = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* copy the data for inaccessible bytes */
        /* (skipped when swapping in place) */
        if(inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        /* offset walks through the sections in their serialized order */
        /* NOTE(review): the individual section sizes come from the data and are only
           cross-checked against size by the U_ASSERT at the end - confirm callers pass trusted data */
        int32_t offset = 0, count;

        /* swap the int32_t indexes[] */
        count = UCNVSEL_INDEX_COUNT*4;
        ds->swapArray32(ds, inBytes, count, outBytes, status);
        offset += count;

        /* swap the UTrie2 */
        count = indexes[UCNVSEL_INDEX_TRIE_SIZE];
        utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status);
        offset += count;

        /* swap the uint32_t pv[] */
        count = indexes[UCNVSEL_INDEX_PV_COUNT]*4;
        ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status);
        offset += count;

        /* swap the encoding names */
        count = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
        ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status);
        offset += count;

        U_ASSERT(offset == size);
    }

    /* status set by the section swaps above is not inspected here; the caller checks it */
    return headerSize + size;
}
462
463 /* unserialize a selector */
464 U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_openFromSerialized(const void * buffer,int32_t length,UErrorCode * status)465 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) {
466 // check if already failed
467 if (U_FAILURE(*status)) {
468 return NULL;
469 }
470 // ensure args make sense!
471 const uint8_t *p = (const uint8_t *)buffer;
472 if (length <= 0 ||
473 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
474 ) {
475 *status = U_ILLEGAL_ARGUMENT_ERROR;
476 return NULL;
477 }
478 // header
479 if (length < 32) {
480 // not even enough space for a minimal header
481 *status = U_INDEX_OUTOFBOUNDS_ERROR;
482 return NULL;
483 }
484 const DataHeader *pHeader = (const DataHeader *)p;
485 if (!(
486 pHeader->dataHeader.magic1==0xda &&
487 pHeader->dataHeader.magic2==0x27 &&
488 pHeader->info.dataFormat[0] == 0x43 &&
489 pHeader->info.dataFormat[1] == 0x53 &&
490 pHeader->info.dataFormat[2] == 0x65 &&
491 pHeader->info.dataFormat[3] == 0x6c
492 )) {
493 /* header not valid or dataFormat not recognized */
494 *status = U_INVALID_FORMAT_ERROR;
495 return NULL;
496 }
497 if (pHeader->info.formatVersion[0] != 1) {
498 *status = U_UNSUPPORTED_ERROR;
499 return NULL;
500 }
501 uint8_t* swapped = NULL;
502 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN ||
503 pHeader->info.charsetFamily != U_CHARSET_FAMILY
504 ) {
505 // swap the data
506 UDataSwapper *ds =
507 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status);
508 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status);
509 if (U_FAILURE(*status)) {
510 udata_closeSwapper(ds);
511 return NULL;
512 }
513 if (length < totalSize) {
514 udata_closeSwapper(ds);
515 *status = U_INDEX_OUTOFBOUNDS_ERROR;
516 return NULL;
517 }
518 swapped = (uint8_t*)uprv_malloc(totalSize);
519 if (swapped == NULL) {
520 udata_closeSwapper(ds);
521 *status = U_MEMORY_ALLOCATION_ERROR;
522 return NULL;
523 }
524 ucnvsel_swap(ds, p, length, swapped, status);
525 udata_closeSwapper(ds);
526 if (U_FAILURE(*status)) {
527 uprv_free(swapped);
528 return NULL;
529 }
530 p = swapped;
531 pHeader = (const DataHeader *)p;
532 }
533 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) {
534 // not even enough space for the header and the indexes
535 uprv_free(swapped);
536 *status = U_INDEX_OUTOFBOUNDS_ERROR;
537 return NULL;
538 }
539 p += pHeader->dataHeader.headerSize;
540 length -= pHeader->dataHeader.headerSize;
541 // indexes
542 const int32_t *indexes = (const int32_t *)p;
543 if (length < indexes[UCNVSEL_INDEX_SIZE]) {
544 uprv_free(swapped);
545 *status = U_INDEX_OUTOFBOUNDS_ERROR;
546 return NULL;
547 }
548 p += UCNVSEL_INDEX_COUNT * 4;
549 // create and populate the selector object
550 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
551 char **encodings =
552 (char **)uprv_malloc(
553 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *));
554 if (sel == NULL || encodings == NULL) {
555 uprv_free(swapped);
556 uprv_free(sel);
557 uprv_free(encodings);
558 *status = U_MEMORY_ALLOCATION_ERROR;
559 return NULL;
560 }
561 uprv_memset(sel, 0, sizeof(UConverterSelector));
562 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT];
563 sel->encodings = encodings;
564 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT];
565 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
566 sel->swapped = swapped;
567 // trie
568 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
569 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL,
570 status);
571 p += indexes[UCNVSEL_INDEX_TRIE_SIZE];
572 if (U_FAILURE(*status)) {
573 ucnvsel_close(sel);
574 return NULL;
575 }
576 // bit vectors
577 sel->pv = (uint32_t *)p;
578 p += sel->pvCount * 4;
579 // encoding names
580 char* s = (char*)p;
581 for (int32_t i = 0; i < sel->encodingsCount; ++i) {
582 sel->encodings[i] = s;
583 s += uprv_strlen(s) + 1;
584 }
585 p += sel->encodingStrLength;
586
587 return sel;
588 }
589
// a bunch of functions for the enumeration thingie! Nothing fancy here. Just
// iterate over the selected encodings
//
// Enumerator is the context object stored in UEnumeration.context by
// selectForMask().
struct Enumerator {
  int16_t* index;   // positions in sel->encodings of the selected encodings;
                    // NULL when nothing was selected
  int16_t length;   // number of entries in index
  int16_t cur;      // next entry of index to be returned
  const UConverterSelector* sel;  // selector that owns the encoding names (not owned here)
};
598
599 U_CDECL_BEGIN
600
601 static void U_CALLCONV
ucnvsel_close_selector_iterator(UEnumeration * enumerator)602 ucnvsel_close_selector_iterator(UEnumeration *enumerator) {
603 uprv_free(((Enumerator*)(enumerator->context))->index);
604 uprv_free(enumerator->context);
605 uprv_free(enumerator);
606 }
607
608
609 static int32_t U_CALLCONV
ucnvsel_count_encodings(UEnumeration * enumerator,UErrorCode * status)610 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) {
611 // check if already failed
612 if (U_FAILURE(*status)) {
613 return 0;
614 }
615 return ((Enumerator*)(enumerator->context))->length;
616 }
617
618
ucnvsel_next_encoding(UEnumeration * enumerator,int32_t * resultLength,UErrorCode * status)619 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator,
620 int32_t* resultLength,
621 UErrorCode* status) {
622 // check if already failed
623 if (U_FAILURE(*status)) {
624 return NULL;
625 }
626
627 int16_t cur = ((Enumerator*)(enumerator->context))->cur;
628 const UConverterSelector* sel;
629 const char* result;
630 if (cur >= ((Enumerator*)(enumerator->context))->length) {
631 return NULL;
632 }
633 sel = ((Enumerator*)(enumerator->context))->sel;
634 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ];
635 ((Enumerator*)(enumerator->context))->cur++;
636 if (resultLength) {
637 *resultLength = (int32_t)uprv_strlen(result);
638 }
639 return result;
640 }
641
ucnvsel_reset_iterator(UEnumeration * enumerator,UErrorCode * status)642 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator,
643 UErrorCode* status) {
644 // check if already failed
645 if (U_FAILURE(*status)) {
646 return ;
647 }
648 ((Enumerator*)(enumerator->context))->cur = 0;
649 }
650
651 U_CDECL_END
652
653
// Template UEnumeration that selectForMask() copies into every enumeration it
// returns; the copy's context is then pointed at a per-enumeration Enumerator.
static const UEnumeration defaultEncodings = {
  NULL,                             // baseContext
    NULL,                           // context, filled in by selectForMask()
    ucnvsel_close_selector_iterator,  // close
    ucnvsel_count_encodings,          // count
    uenum_unextDefault,               // uNext
    ucnvsel_next_encoding,            // next
    ucnvsel_reset_iterator            // reset
};
663
664
665 // internal fn to intersect two sets of masks
666 // returns whether the mask has reduced to all zeros
intersectMasks(uint32_t * dest,const uint32_t * source1,int32_t len)667 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) {
668 int32_t i;
669 uint32_t oredDest = 0;
670 for (i = 0 ; i < len ; ++i) {
671 oredDest |= (dest[i] &= source1[i]);
672 }
673 return oredDest == 0;
674 }
675
// Returns the total number of set bits in the len words starting at mask.
// Each pass of the inner loop removes the lowest set bit of the word, so it
// runs once per set bit
// (see http://graphics.stanford.edu/~seander/bithacks.html).
static int16_t countOnes(uint32_t* mask, int32_t len) {
  int32_t bits = 0;
  for (int32_t word = 0; word < len; ++word) {
    uint32_t remaining = mask[word];
    while (remaining != 0) {
      remaining &= remaining - 1;
      ++bits;
    }
  }
  return bits;
}
689
690
691 /* internal function! */
selectForMask(const UConverterSelector * sel,uint32_t * mask,UErrorCode * status)692 static UEnumeration *selectForMask(const UConverterSelector* sel,
693 uint32_t *mask, UErrorCode *status) {
694 // this is the context we will use. Store a table of indices to which
695 // encodings are legit.
696 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator));
697 if (result == NULL) {
698 uprv_free(mask);
699 *status = U_MEMORY_ALLOCATION_ERROR;
700 return NULL;
701 }
702 result->index = NULL; // this will be allocated later!
703 result->length = result->cur = 0;
704 result->sel = sel;
705
706 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
707 if (en == NULL) {
708 // TODO(markus): Combine Enumerator and UEnumeration into one struct.
709 uprv_free(mask);
710 uprv_free(result);
711 *status = U_MEMORY_ALLOCATION_ERROR;
712 return NULL;
713 }
714 memcpy(en, &defaultEncodings, sizeof(UEnumeration));
715 en->context = result;
716
717 int32_t columns = (sel->encodingsCount+31)/32;
718 int16_t numOnes = countOnes(mask, columns);
719 // now, we know the exact space we need for index
720 if (numOnes > 0) {
721 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t));
722
723 int32_t i, j;
724 int16_t k = 0;
725 for (j = 0 ; j < columns; j++) {
726 uint32_t v = mask[j];
727 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) {
728 if ((v & 1) != 0) {
729 result->index[result->length++] = k;
730 }
731 v >>= 1;
732 }
733 }
734 } //otherwise, index will remain NULL (and will never be touched by
735 //the enumerator code anyway)
736 uprv_free(mask);
737 return en;
738 }
739
740 /* check a string against the selector - UTF16 version */
741 U_CAPI UEnumeration * U_EXPORT2
ucnvsel_selectForString(const UConverterSelector * sel,const UChar * s,int32_t length,UErrorCode * status)742 ucnvsel_selectForString(const UConverterSelector* sel,
743 const UChar *s, int32_t length, UErrorCode *status) {
744 // check if already failed
745 if (U_FAILURE(*status)) {
746 return NULL;
747 }
748 // ensure args make sense!
749 if (sel == NULL || (s == NULL && length != 0)) {
750 *status = U_ILLEGAL_ARGUMENT_ERROR;
751 return NULL;
752 }
753
754 int32_t columns = (sel->encodingsCount+31)/32;
755 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
756 if (mask == NULL) {
757 *status = U_MEMORY_ALLOCATION_ERROR;
758 return NULL;
759 }
760 uprv_memset(mask, ~0, columns *4);
761
762 if(s!=NULL) {
763 const UChar *limit;
764 if (length >= 0) {
765 limit = s + length;
766 } else {
767 limit = NULL;
768 }
769
770 while (limit == NULL ? *s != 0 : s != limit) {
771 UChar32 c;
772 uint16_t pvIndex;
773 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex);
774 if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
775 break;
776 }
777 }
778 }
779 return selectForMask(sel, mask, status);
780 }
781
782 /* check a string against the selector - UTF8 version */
783 U_CAPI UEnumeration * U_EXPORT2
ucnvsel_selectForUTF8(const UConverterSelector * sel,const char * s,int32_t length,UErrorCode * status)784 ucnvsel_selectForUTF8(const UConverterSelector* sel,
785 const char *s, int32_t length, UErrorCode *status) {
786 // check if already failed
787 if (U_FAILURE(*status)) {
788 return NULL;
789 }
790 // ensure args make sense!
791 if (sel == NULL || (s == NULL && length != 0)) {
792 *status = U_ILLEGAL_ARGUMENT_ERROR;
793 return NULL;
794 }
795
796 int32_t columns = (sel->encodingsCount+31)/32;
797 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4);
798 if (mask == NULL) {
799 *status = U_MEMORY_ALLOCATION_ERROR;
800 return NULL;
801 }
802 uprv_memset(mask, ~0, columns *4);
803
804 if (length < 0) {
805 length = (int32_t)uprv_strlen(s);
806 }
807
808 if(s!=NULL) {
809 const char *limit = s + length;
810
811 while (s != limit) {
812 uint16_t pvIndex;
813 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex);
814 if (intersectMasks(mask, sel->pv+pvIndex, columns)) {
815 break;
816 }
817 }
818 }
819 return selectForMask(sel, mask, status);
820 }
821
822 #endif // !UCONFIG_NO_CONVERSION
823