1 /*
2 **********************************************************************
3 * Copyright (C) 2008-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/uchar.h"
11 #include "unicode/uniset.h"
12 #include "unicode/utf16.h"
13 #include "utrie2.h"
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "identifier_info.h"
17 #include "scriptset.h"
18 #include "umutex.h"
19 #include "udataswp.h"
20 #include "uassert.h"
21 #include "uspoof_impl.h"
22
23 #if !UCONFIG_NO_NORMALIZATION
24
25
26 U_NAMESPACE_BEGIN
27
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
29
30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
31 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
32 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
33 if (U_FAILURE(status)) {
34 return;
35 }
36 fSpoofData = data;
37 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
38
39 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
40 allowedCharsSet->freeze();
41 fAllowedCharsSet = allowedCharsSet;
42 fAllowedLocales = uprv_strdup("");
43 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
44 status = U_MEMORY_ALLOCATION_ERROR;
45 return;
46 }
47 fMagic = USPOOF_MAGIC;
48 }
49
50
SpoofImpl()51 SpoofImpl::SpoofImpl() :
52 fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
53 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
54 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
55 allowedCharsSet->freeze();
56 fAllowedCharsSet = allowedCharsSet;
57 fAllowedLocales = uprv_strdup("");
58 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
59 }
60
61
62 // Copy Constructor, used by the user level clone() function.
SpoofImpl(const SpoofImpl & src,UErrorCode & status)63 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
64 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
65 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
66 if (U_FAILURE(status)) {
67 return;
68 }
69 fMagic = src.fMagic;
70 fChecks = src.fChecks;
71 if (src.fSpoofData != NULL) {
72 fSpoofData = src.fSpoofData->addReference();
73 }
74 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
75 if (fAllowedCharsSet == NULL) {
76 status = U_MEMORY_ALLOCATION_ERROR;
77 }
78 fAllowedLocales = uprv_strdup(src.fAllowedLocales);
79 fRestrictionLevel = src.fRestrictionLevel;
80 }
81
~SpoofImpl()82 SpoofImpl::~SpoofImpl() {
83 fMagic = 0; // head off application errors by preventing use of
84 // of deleted objects.
85 if (fSpoofData != NULL) {
86 fSpoofData->removeReference(); // Will delete if refCount goes to zero.
87 }
88 delete fAllowedCharsSet;
89 uprv_free((void *)fAllowedLocales);
90 delete fCachedIdentifierInfo;
91 }
92
93 //
94 // Incoming parameter check on Status and the SpoofChecker object
95 // received from the C API.
96 //
validateThis(const USpoofChecker * sc,UErrorCode & status)97 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
98 if (U_FAILURE(status)) {
99 return NULL;
100 }
101 if (sc == NULL) {
102 status = U_ILLEGAL_ARGUMENT_ERROR;
103 return NULL;
104 }
105 SpoofImpl *This = (SpoofImpl *)sc;
106 if (This->fMagic != USPOOF_MAGIC ||
107 This->fSpoofData == NULL) {
108 status = U_INVALID_FORMAT_ERROR;
109 return NULL;
110 }
111 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
112 return NULL;
113 }
114 return This;
115 }
116
validateThis(USpoofChecker * sc,UErrorCode & status)117 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
118 return const_cast<SpoofImpl *>
119 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
120 }
121
122
123
124 //--------------------------------------------------------------------------------------
125 //
126 // confusableLookup() This is the heart of the confusable skeleton generation
127 // implementation.
128 //
129 // Given a source character, produce the corresponding
130 // replacement character(s), appending them to the dest string.
131 //
132 //---------------------------------------------------------------------------------------
confusableLookup(UChar32 inChar,int32_t tableMask,UnicodeString & dest) const133 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
134
135 // Binary search the spoof data key table for the inChar
136 int32_t *low = fSpoofData->fCFUKeys;
137 int32_t *mid = NULL;
138 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
139 UChar32 midc;
140 do {
141 int32_t delta = ((int32_t)(limit-low))/2;
142 mid = low + delta;
143 midc = *mid & 0x1fffff;
144 if (inChar == midc) {
145 goto foundChar;
146 } else if (inChar < midc) {
147 limit = mid;
148 } else {
149 low = mid;
150 }
151 } while (low < limit-1);
152 mid = low;
153 midc = *mid & 0x1fffff;
154 if (inChar != midc) {
155 // Char not found. It maps to itself.
156 int i = 0;
157 dest.append(inChar);
158 return i;
159 }
160 foundChar:
161 int32_t keyFlags = *mid & 0xff000000;
162 if ((keyFlags & tableMask) == 0) {
163 // We found the right key char, but the entry doesn't pertain to the
164 // table we need. See if there is an adjacent key that does
165 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
166 int32_t *altMid;
167 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
168 keyFlags = *altMid & 0xff000000;
169 if (keyFlags & tableMask) {
170 mid = altMid;
171 goto foundKey;
172 }
173 }
174 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
175 keyFlags = *altMid & 0xff000000;
176 if (keyFlags & tableMask) {
177 mid = altMid;
178 goto foundKey;
179 }
180 }
181 }
182 // No key entry for this char & table.
183 // The input char maps to itself.
184 int i = 0;
185 dest.append(inChar);
186 return i;
187 }
188
189 foundKey:
190 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
191 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
192
193 // Value is either a UChar (for strings of length 1) or
194 // an index into the string table (for longer strings)
195 uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
196 if (stringLen == 1) {
197 dest.append((UChar)value);
198 return 1;
199 }
200
201 // String length of 4 from the above lookup is used for all strings of length >= 4.
202 // For these, get the real length from the string lengths table,
203 // which maps string table indexes to lengths.
204 // All strings of the same length are stored contiguously in the string table.
205 // 'value' from the lookup above is the starting index for the desired string.
206
207 int32_t ix;
208 if (stringLen == 4) {
209 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
210 for (ix = 0; ix < stringLengthsLimit; ix++) {
211 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
212 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
213 break;
214 }
215 }
216 U_ASSERT(ix < stringLengthsLimit);
217 }
218
219 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
220 UChar *src = &fSpoofData->fCFUStrings[value];
221 dest.append(src, stringLen);
222 return stringLen;
223 }
224
225
226 //---------------------------------------------------------------------------------------
227 //
228 // wholeScriptCheck()
229 //
230 // Input text is already normalized to NFD
231 // Return the set of scripts, each of which can represent something that is
232 // confusable with the input text. The script of the input text
233 // is included; input consisting of characters from a single script will
234 // always produce a result consisting of a set containing that script.
235 //
236 //---------------------------------------------------------------------------------------
wholeScriptCheck(const UnicodeString & text,ScriptSet * result,UErrorCode & status) const237 void SpoofImpl::wholeScriptCheck(
238 const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
239
240 UTrie2 *table =
241 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
242 result->setAll();
243 int32_t length = text.length();
244 for (int32_t inputIdx=0; inputIdx < length;) {
245 UChar32 c = text.char32At(inputIdx);
246 inputIdx += U16_LENGTH(c);
247 uint32_t index = utrie2_get32(table, c);
248 if (index == 0) {
249 // No confusables in another script for this char.
250 // TODO: we should change the data to have sets with just the single script
251 // bit for the script of this char. Gets rid of this special case.
252 // Until then, grab the script from the char and intersect it with the set.
253 UScriptCode cpScript = uscript_getScript(c, &status);
254 U_ASSERT(cpScript > USCRIPT_INHERITED);
255 result->intersect(cpScript, status);
256 } else if (index == 1) {
257 // Script == Common or Inherited. Nothing to do.
258 } else {
259 result->intersect(fSpoofData->fScriptSets[index]);
260 }
261 }
262 }
263
264
setAllowedLocales(const char * localesList,UErrorCode & status)265 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
266 UnicodeSet allowedChars;
267 UnicodeSet *tmpSet = NULL;
268 const char *locStart = localesList;
269 const char *locEnd = NULL;
270 const char *localesListEnd = localesList + uprv_strlen(localesList);
271 int32_t localeListCount = 0; // Number of locales provided by caller.
272
273 // Loop runs once per locale from the localesList, a comma separated list of locales.
274 do {
275 locEnd = uprv_strchr(locStart, ',');
276 if (locEnd == NULL) {
277 locEnd = localesListEnd;
278 }
279 while (*locStart == ' ') {
280 locStart++;
281 }
282 const char *trimmedEnd = locEnd-1;
283 while (trimmedEnd > locStart && *trimmedEnd == ' ') {
284 trimmedEnd--;
285 }
286 if (trimmedEnd <= locStart) {
287 break;
288 }
289 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
290 localeListCount++;
291
292 // We have one locale from the locales list.
293 // Add the script chars for this locale to the accumulating set of allowed chars.
294 // If the locale is no good, we will be notified back via status.
295 addScriptChars(locale, &allowedChars, status);
296 uprv_free((void *)locale);
297 if (U_FAILURE(status)) {
298 break;
299 }
300 locStart = locEnd + 1;
301 } while (locStart < localesListEnd);
302
303 // If our caller provided an empty list of locales, we disable the allowed characters checking
304 if (localeListCount == 0) {
305 uprv_free((void *)fAllowedLocales);
306 fAllowedLocales = uprv_strdup("");
307 tmpSet = new UnicodeSet(0, 0x10ffff);
308 if (fAllowedLocales == NULL || tmpSet == NULL) {
309 status = U_MEMORY_ALLOCATION_ERROR;
310 return;
311 }
312 tmpSet->freeze();
313 delete fAllowedCharsSet;
314 fAllowedCharsSet = tmpSet;
315 fChecks &= ~USPOOF_CHAR_LIMIT;
316 return;
317 }
318
319
320 // Add all common and inherited characters to the set of allowed chars.
321 UnicodeSet tempSet;
322 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
323 allowedChars.addAll(tempSet);
324 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
325 allowedChars.addAll(tempSet);
326
327 // If anything went wrong, we bail out without changing
328 // the state of the spoof checker.
329 if (U_FAILURE(status)) {
330 return;
331 }
332
333 // Store the updated spoof checker state.
334 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
335 const char *tmpLocalesList = uprv_strdup(localesList);
336 if (tmpSet == NULL || tmpLocalesList == NULL) {
337 status = U_MEMORY_ALLOCATION_ERROR;
338 return;
339 }
340 uprv_free((void *)fAllowedLocales);
341 fAllowedLocales = tmpLocalesList;
342 tmpSet->freeze();
343 delete fAllowedCharsSet;
344 fAllowedCharsSet = tmpSet;
345 fChecks |= USPOOF_CHAR_LIMIT;
346 }
347
348
getAllowedLocales(UErrorCode &)349 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
350 return fAllowedLocales;
351 }
352
353
354 // Given a locale (a language), add all the characters from all of the scripts used with that language
355 // to the allowedChars UnicodeSet
356
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)357 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
358 UScriptCode scripts[30];
359
360 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
361 if (U_FAILURE(status)) {
362 return;
363 }
364 if (status == U_USING_DEFAULT_WARNING) {
365 status = U_ILLEGAL_ARGUMENT_ERROR;
366 return;
367 }
368 UnicodeSet tmpSet;
369 int32_t i;
370 for (i=0; i<numScripts; i++) {
371 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
372 allowedChars->addAll(tmpSet);
373 }
374 }
375
376
377 // Convert a text format hex number. Utility function used by builder code. Static.
378 // Input: UChar *string text. Output: a UChar32
379 // Input has been pre-checked, and will have no non-hex chars.
380 // The number must fall in the code point range of 0..0x10ffff
381 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)382 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
383 if (U_FAILURE(status)) {
384 return 0;
385 }
386 U_ASSERT(limit-start > 0);
387 uint32_t val = 0;
388 int i;
389 for (i=start; i<limit; i++) {
390 int digitVal = s[i] - 0x30;
391 if (digitVal>9) {
392 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
393 }
394 if (digitVal>15) {
395 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
396 }
397 U_ASSERT(digitVal <= 0xf);
398 val <<= 4;
399 val += digitVal;
400 }
401 if (val > 0x10ffff) {
402 status = U_PARSE_ERROR;
403 val = 0;
404 }
405 return (UChar32)val;
406 }
407
408 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
409 // Maintain a one-element cache, which is sufficient to avoid repeatedly
410 // creating new ones unless we get multi-thread concurrency in spoof
411 // check operations, which should be statistically uncommon.
412
413 // These functions are used in place of new & delete of an IdentifierInfo.
414 // They will recycle the IdentifierInfo when possible.
415 // They are logically const, and used within const functions that must be thread safe.
getIdentifierInfo(UErrorCode & status) const416 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
417 IdentifierInfo *returnIdInfo = NULL;
418 if (U_FAILURE(status)) {
419 return returnIdInfo;
420 }
421 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
422 {
423 Mutex m;
424 returnIdInfo = nonConstThis->fCachedIdentifierInfo;
425 nonConstThis->fCachedIdentifierInfo = NULL;
426 }
427 if (returnIdInfo == NULL) {
428 returnIdInfo = new IdentifierInfo(status);
429 if (U_SUCCESS(status) && returnIdInfo == NULL) {
430 status = U_MEMORY_ALLOCATION_ERROR;
431 }
432 if (U_FAILURE(status) && returnIdInfo != NULL) {
433 delete returnIdInfo;
434 returnIdInfo = NULL;
435 }
436 }
437 return returnIdInfo;
438 }
439
440
releaseIdentifierInfo(IdentifierInfo * idInfo) const441 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
442 if (idInfo != NULL) {
443 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
444 {
445 Mutex m;
446 if (nonConstThis->fCachedIdentifierInfo == NULL) {
447 nonConstThis->fCachedIdentifierInfo = idInfo;
448 idInfo = NULL;
449 }
450 }
451 delete idInfo;
452 }
453 }
454
455
456
457
458 //----------------------------------------------------------------------------------------------
459 //
460 // class SpoofData Implementation
461 //
462 //----------------------------------------------------------------------------------------------
463
464
validateDataVersion(const SpoofDataHeader * rawData,UErrorCode & status)465 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
466 if (U_FAILURE(status) ||
467 rawData == NULL ||
468 rawData->fMagic != USPOOF_MAGIC ||
469 rawData->fFormatVersion[0] > 1 ||
470 rawData->fFormatVersion[1] > 0) {
471 status = U_INVALID_FORMAT_ERROR;
472 return FALSE;
473 }
474 return TRUE;
475 }
476
477 static UBool U_CALLCONV
spoofDataIsAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)478 spoofDataIsAcceptable(void *context,
479 const char * /* type */, const char * /*name*/,
480 const UDataInfo *pInfo) {
481 if(
482 pInfo->size >= 20 &&
483 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
484 pInfo->charsetFamily == U_CHARSET_FAMILY &&
485 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
486 pInfo->dataFormat[1] == 0x66 &&
487 pInfo->dataFormat[2] == 0x75 &&
488 pInfo->dataFormat[3] == 0x20 &&
489 pInfo->formatVersion[0] == 1
490 ) {
491 UVersionInfo *version = static_cast<UVersionInfo *>(context);
492 if(version != NULL) {
493 uprv_memcpy(version, pInfo->dataVersion, 4);
494 }
495 return TRUE;
496 } else {
497 return FALSE;
498 }
499 }
500
501 //
502 // SpoofData::getDefault() - return a wrapper around the spoof data that is
503 // baked into the default ICU data.
504 //
getDefault(UErrorCode & status)505 SpoofData *SpoofData::getDefault(UErrorCode &status) {
506 // TODO: Cache it. Lazy create, keep until cleanup.
507
508 UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
509 spoofDataIsAcceptable,
510 NULL, // context, would receive dataVersion if supplied.
511 &status);
512 if (U_FAILURE(status)) {
513 return NULL;
514 }
515 SpoofData *This = new SpoofData(udm, status);
516 if (U_FAILURE(status)) {
517 delete This;
518 return NULL;
519 }
520 if (This == NULL) {
521 status = U_MEMORY_ALLOCATION_ERROR;
522 }
523 return This;
524 }
525
SpoofData(UDataMemory * udm,UErrorCode & status)526 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
527 {
528 reset();
529 if (U_FAILURE(status)) {
530 return;
531 }
532 fUDM = udm;
533 // fRawData is non-const because it may be constructed by the data builder.
534 fRawData = reinterpret_cast<SpoofDataHeader *>(
535 const_cast<void *>(udata_getMemory(udm)));
536 validateDataVersion(fRawData, status);
537 initPtrs(status);
538 }
539
540
SpoofData(const void * data,int32_t length,UErrorCode & status)541 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
542 {
543 reset();
544 if (U_FAILURE(status)) {
545 return;
546 }
547 if ((size_t)length < sizeof(SpoofDataHeader)) {
548 status = U_INVALID_FORMAT_ERROR;
549 return;
550 }
551 void *ncData = const_cast<void *>(data);
552 fRawData = static_cast<SpoofDataHeader *>(ncData);
553 if (length < fRawData->fLength) {
554 status = U_INVALID_FORMAT_ERROR;
555 return;
556 }
557 validateDataVersion(fRawData, status);
558 initPtrs(status);
559 }
560
561
562 // Spoof Data constructor for use from data builder.
563 // Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)564 SpoofData::SpoofData(UErrorCode &status) {
565 reset();
566 if (U_FAILURE(status)) {
567 return;
568 }
569 fDataOwned = true;
570 fRefCount = 1;
571
572 // The spoof header should already be sized to be a multiple of 16 bytes.
573 // Just in case it's not, round it up.
574 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
575 U_ASSERT(initialSize == sizeof(SpoofDataHeader));
576
577 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
578 fMemLimit = initialSize;
579 if (fRawData == NULL) {
580 status = U_MEMORY_ALLOCATION_ERROR;
581 return;
582 }
583 uprv_memset(fRawData, 0, initialSize);
584
585 fRawData->fMagic = USPOOF_MAGIC;
586 fRawData->fFormatVersion[0] = 1;
587 fRawData->fFormatVersion[1] = 0;
588 fRawData->fFormatVersion[2] = 0;
589 fRawData->fFormatVersion[3] = 0;
590 initPtrs(status);
591 }
592
593 // reset() - initialize all fields.
594 // Should be updated if any new fields are added.
595 // Called by constructors to put things in a known initial state.
reset()596 void SpoofData::reset() {
597 fRawData = NULL;
598 fDataOwned = FALSE;
599 fUDM = NULL;
600 fMemLimit = 0;
601 fRefCount = 1;
602 fCFUKeys = NULL;
603 fCFUValues = NULL;
604 fCFUStringLengths = NULL;
605 fCFUStrings = NULL;
606 fAnyCaseTrie = NULL;
607 fLowerCaseTrie = NULL;
608 fScriptSets = NULL;
609 }
610
611
612 // SpoofData::initPtrs()
613 // Initialize the pointers to the various sections of the raw data.
614 //
615 // This function is used both during the Trie building process (multiple
616 // times, as the individual data sections are added), and
617 // during the opening of a Spoof Checker from prebuilt data.
618 //
619 // The pointers for non-existent data sections (identified by an offset of 0)
620 // are set to NULL.
621 //
622 // Note: During building the data, adding each new data section
623 // reallocs the raw data area, which likely relocates it, which
624 // in turn requires reinitializing all of the pointers into it, hence
625 // multiple calls to this function during building.
626 //
initPtrs(UErrorCode & status)627 void SpoofData::initPtrs(UErrorCode &status) {
628 fCFUKeys = NULL;
629 fCFUValues = NULL;
630 fCFUStringLengths = NULL;
631 fCFUStrings = NULL;
632 if (U_FAILURE(status)) {
633 return;
634 }
635 if (fRawData->fCFUKeys != 0) {
636 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
637 }
638 if (fRawData->fCFUStringIndex != 0) {
639 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
640 }
641 if (fRawData->fCFUStringLengths != 0) {
642 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
643 }
644 if (fRawData->fCFUStringTable != 0) {
645 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
646 }
647
648 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
649 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
650 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
651 }
652 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
653 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
654 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
655 }
656
657 if (fRawData->fScriptSets != 0) {
658 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
659 }
660 }
661
662
~SpoofData()663 SpoofData::~SpoofData() {
664 utrie2_close(fAnyCaseTrie);
665 fAnyCaseTrie = NULL;
666 utrie2_close(fLowerCaseTrie);
667 fLowerCaseTrie = NULL;
668 if (fDataOwned) {
669 uprv_free(fRawData);
670 }
671 fRawData = NULL;
672 if (fUDM != NULL) {
673 udata_close(fUDM);
674 }
675 fUDM = NULL;
676 }
677
678
removeReference()679 void SpoofData::removeReference() {
680 if (umtx_atomic_dec(&fRefCount) == 0) {
681 delete this;
682 }
683 }
684
685
addReference()686 SpoofData *SpoofData::addReference() {
687 umtx_atomic_inc(&fRefCount);
688 return this;
689 }
690
691
reserveSpace(int32_t numBytes,UErrorCode & status)692 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
693 if (U_FAILURE(status)) {
694 return NULL;
695 }
696 if (!fDataOwned) {
697 U_ASSERT(FALSE);
698 status = U_INTERNAL_PROGRAM_ERROR;
699 return NULL;
700 }
701
702 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
703 uint32_t returnOffset = fMemLimit;
704 fMemLimit += numBytes;
705 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
706 fRawData->fLength = fMemLimit;
707 uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
708 initPtrs(status);
709 return (char *)fRawData + returnOffset;
710 }
711
712
713 U_NAMESPACE_END
714
715 U_NAMESPACE_USE
716
717 //-----------------------------------------------------------------------------
718 //
719 // uspoof_swap - byte swap and char encoding swap of spoof data
720 //
721 //-----------------------------------------------------------------------------
722 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)723 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
724 UErrorCode *status) {
725
726 if (status == NULL || U_FAILURE(*status)) {
727 return 0;
728 }
729 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
730 *status=U_ILLEGAL_ARGUMENT_ERROR;
731 return 0;
732 }
733
734 //
735 // Check that the data header is for spoof data.
736 // (Header contents are defined in gencfu.cpp)
737 //
738 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
739 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
740 pInfo->dataFormat[1]==0x66 &&
741 pInfo->dataFormat[2]==0x75 &&
742 pInfo->dataFormat[3]==0x20 &&
743 pInfo->formatVersion[0]==1 )) {
744 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
745 "(format version %02x %02x %02x %02x) is not recognized\n",
746 pInfo->dataFormat[0], pInfo->dataFormat[1],
747 pInfo->dataFormat[2], pInfo->dataFormat[3],
748 pInfo->formatVersion[0], pInfo->formatVersion[1],
749 pInfo->formatVersion[2], pInfo->formatVersion[3]);
750 *status=U_UNSUPPORTED_ERROR;
751 return 0;
752 }
753
754 //
755 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
756 // header). This swap also conveniently gets us
757 // the size of the ICU d.h., which lets us locate the start
758 // of the uspoof specific data.
759 //
760 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
761
762
763 //
764 // Get the Spoof Data Header, and check that it appears to be OK.
765 //
766 //
767 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
768 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
769 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
770 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
771 {
772 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
773 *status=U_UNSUPPORTED_ERROR;
774 return 0;
775 }
776
777 //
778 // Prefight operation? Just return the size
779 //
780 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
781 int32_t totalSize = headerSize + spoofDataLength;
782 if (length < 0) {
783 return totalSize;
784 }
785
786 //
787 // Check that length passed in is consistent with length from Spoof data header.
788 //
789 if (length < totalSize) {
790 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
791 spoofDataLength);
792 *status=U_INDEX_OUTOFBOUNDS_ERROR;
793 return 0;
794 }
795
796
797 //
798 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
799 // we need to reference the header to locate the data, and an
800 // inplace swap of the header leaves it unusable.
801 //
802 uint8_t *outBytes = (uint8_t *)outData + headerSize;
803 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
804
805 int32_t sectionStart;
806 int32_t sectionLength;
807
808 //
809 // If not swapping in place, zero out the output buffer before starting.
810 // Gaps may exist between the individual sections, and these must be zeroed in
811 // the output buffer. The simplest way to do that is to just zero the whole thing.
812 //
813 if (inBytes != outBytes) {
814 uprv_memset(outBytes, 0, spoofDataLength);
815 }
816
817 // Confusables Keys Section (fCFUKeys)
818 sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
819 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
820 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
821
822 // String Index Section
823 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
824 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
825 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
826
827 // String Table Section
828 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
829 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
830 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
831
832 // String Lengths Section
833 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
834 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
835 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
836
837 // Any Case Trie
838 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
839 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
840 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
841
842 // Lower Case Trie
843 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
844 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
845 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
846
847 // Script Sets. The data is an array of int32_t
848 sectionStart = ds->readUInt32(spoofDH->fScriptSets);
849 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
850 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
851
852 // And, last, swap the header itself.
853 // int32_t fMagic // swap this
854 // uint8_t fFormatVersion[4] // Do not swap this, just copy
855 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
856 //
857 uint32_t magic = ds->readUInt32(spoofDH->fMagic);
858 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
859
860 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
861 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
862 }
863 // swap starting at fLength
864 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
865
866 return totalSize;
867 }
868
869 #endif
870
871
872