1 /*
2 **********************************************************************
3 * Copyright (C) 2008-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/uchar.h"
11 #include "unicode/uniset.h"
12 #include "unicode/utf16.h"
13 #include "utrie2.h"
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "identifier_info.h"
17 #include "scriptset.h"
18 #include "umutex.h"
19 #include "udataswp.h"
20 #include "uassert.h"
21 #include "uspoof_impl.h"
22
23 #if !UCONFIG_NO_NORMALIZATION
24
25
26 U_NAMESPACE_BEGIN
27
// ICU RTTI boilerplate for SpoofImpl.
// NOTE(review): presumably expands to the class-ID member definitions declared
// in the class header; confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
29
30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
31 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
32 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
33 if (U_FAILURE(status)) {
34 return;
35 }
36 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
37
38 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
39 allowedCharsSet->freeze();
40 fAllowedCharsSet = allowedCharsSet;
41 fAllowedLocales = uprv_strdup("");
42 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
43 status = U_MEMORY_ALLOCATION_ERROR;
44 return;
45 }
46 fMagic = USPOOF_MAGIC;
47 }
48
49
SpoofImpl()50 SpoofImpl::SpoofImpl() :
51 fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
52 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
53 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
54 allowedCharsSet->freeze();
55 fAllowedCharsSet = allowedCharsSet;
56 fAllowedLocales = uprv_strdup("");
57 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
58 }
59
60
61 // Copy Constructor, used by the user level clone() function.
SpoofImpl(const SpoofImpl & src,UErrorCode & status)62 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
63 fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
64 fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
65 if (U_FAILURE(status)) {
66 return;
67 }
68 fMagic = src.fMagic;
69 fChecks = src.fChecks;
70 if (src.fSpoofData != NULL) {
71 fSpoofData = src.fSpoofData->addReference();
72 }
73 fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
74 if (fAllowedCharsSet == NULL) {
75 status = U_MEMORY_ALLOCATION_ERROR;
76 }
77 fAllowedLocales = uprv_strdup(src.fAllowedLocales);
78 fRestrictionLevel = src.fRestrictionLevel;
79 }
80
~SpoofImpl()81 SpoofImpl::~SpoofImpl() {
82 fMagic = 0; // head off application errors by preventing use of
83 // of deleted objects.
84 if (fSpoofData != NULL) {
85 fSpoofData->removeReference(); // Will delete if refCount goes to zero.
86 }
87 delete fAllowedCharsSet;
88 uprv_free((void *)fAllowedLocales);
89 delete fCachedIdentifierInfo;
90 }
91
92 //
93 // Incoming parameter check on Status and the SpoofChecker object
94 // received from the C API.
95 //
validateThis(const USpoofChecker * sc,UErrorCode & status)96 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
97 if (U_FAILURE(status)) {
98 return NULL;
99 }
100 if (sc == NULL) {
101 status = U_ILLEGAL_ARGUMENT_ERROR;
102 return NULL;
103 }
104 SpoofImpl *This = (SpoofImpl *)sc;
105 if (This->fMagic != USPOOF_MAGIC ||
106 This->fSpoofData == NULL) {
107 status = U_INVALID_FORMAT_ERROR;
108 return NULL;
109 }
110 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
111 return NULL;
112 }
113 return This;
114 }
115
validateThis(USpoofChecker * sc,UErrorCode & status)116 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
117 return const_cast<SpoofImpl *>
118 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
119 }
120
121
122
123 //--------------------------------------------------------------------------------------
124 //
125 // confusableLookup() This is the heart of the confusable skeleton generation
126 // implementation.
127 //
128 // Given a source character, produce the corresponding
129 // replacement character(s), appending them to the dest string.
130 //
131 //---------------------------------------------------------------------------------------
confusableLookup(UChar32 inChar,int32_t tableMask,UnicodeString & dest) const132 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
133
134 // Binary search the spoof data key table for the inChar
135 int32_t *low = fSpoofData->fCFUKeys;
136 int32_t *mid = NULL;
137 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
138 UChar32 midc;
139 do {
140 int32_t delta = ((int32_t)(limit-low))/2;
141 mid = low + delta;
142 midc = *mid & 0x1fffff;
143 if (inChar == midc) {
144 goto foundChar;
145 } else if (inChar < midc) {
146 limit = mid;
147 } else {
148 low = mid;
149 }
150 } while (low < limit-1);
151 mid = low;
152 midc = *mid & 0x1fffff;
153 if (inChar != midc) {
154 // Char not found. It maps to itself.
155 int i = 0;
156 dest.append(inChar);
157 return i;
158 }
159 foundChar:
160 int32_t keyFlags = *mid & 0xff000000;
161 if ((keyFlags & tableMask) == 0) {
162 // We found the right key char, but the entry doesn't pertain to the
163 // table we need. See if there is an adjacent key that does
164 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
165 int32_t *altMid;
166 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
167 keyFlags = *altMid & 0xff000000;
168 if (keyFlags & tableMask) {
169 mid = altMid;
170 goto foundKey;
171 }
172 }
173 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
174 keyFlags = *altMid & 0xff000000;
175 if (keyFlags & tableMask) {
176 mid = altMid;
177 goto foundKey;
178 }
179 }
180 }
181 // No key entry for this char & table.
182 // The input char maps to itself.
183 int i = 0;
184 dest.append(inChar);
185 return i;
186 }
187
188 foundKey:
189 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
190 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
191
192 // Value is either a UChar (for strings of length 1) or
193 // an index into the string table (for longer strings)
194 uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
195 if (stringLen == 1) {
196 dest.append((UChar)value);
197 return 1;
198 }
199
200 // String length of 4 from the above lookup is used for all strings of length >= 4.
201 // For these, get the real length from the string lengths table,
202 // which maps string table indexes to lengths.
203 // All strings of the same length are stored contiguously in the string table.
204 // 'value' from the lookup above is the starting index for the desired string.
205
206 int32_t ix;
207 if (stringLen == 4) {
208 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
209 for (ix = 0; ix < stringLengthsLimit; ix++) {
210 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
211 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
212 break;
213 }
214 }
215 U_ASSERT(ix < stringLengthsLimit);
216 }
217
218 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
219 UChar *src = &fSpoofData->fCFUStrings[value];
220 dest.append(src, stringLen);
221 return stringLen;
222 }
223
224
225 //---------------------------------------------------------------------------------------
226 //
227 // wholeScriptCheck()
228 //
229 // Input text is already normalized to NFD
230 // Return the set of scripts, each of which can represent something that is
231 // confusable with the input text. The script of the input text
232 // is included; input consisting of characters from a single script will
233 // always produce a result consisting of a set containing that script.
234 //
235 //---------------------------------------------------------------------------------------
wholeScriptCheck(const UnicodeString & text,ScriptSet * result,UErrorCode & status) const236 void SpoofImpl::wholeScriptCheck(
237 const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
238
239 UTrie2 *table =
240 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
241 result->setAll();
242 int32_t length = text.length();
243 for (int32_t inputIdx=0; inputIdx < length;) {
244 UChar32 c = text.char32At(inputIdx);
245 inputIdx += U16_LENGTH(c);
246 uint32_t index = utrie2_get32(table, c);
247 if (index == 0) {
248 // No confusables in another script for this char.
249 // TODO: we should change the data to have sets with just the single script
250 // bit for the script of this char. Gets rid of this special case.
251 // Until then, grab the script from the char and intersect it with the set.
252 UScriptCode cpScript = uscript_getScript(c, &status);
253 U_ASSERT(cpScript > USCRIPT_INHERITED);
254 result->intersect(cpScript, status);
255 } else if (index == 1) {
256 // Script == Common or Inherited. Nothing to do.
257 } else {
258 result->intersect(fSpoofData->fScriptSets[index]);
259 }
260 }
261 }
262
263
setAllowedLocales(const char * localesList,UErrorCode & status)264 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
265 UnicodeSet allowedChars;
266 UnicodeSet *tmpSet = NULL;
267 const char *locStart = localesList;
268 const char *locEnd = NULL;
269 const char *localesListEnd = localesList + uprv_strlen(localesList);
270 int32_t localeListCount = 0; // Number of locales provided by caller.
271
272 // Loop runs once per locale from the localesList, a comma separated list of locales.
273 do {
274 locEnd = uprv_strchr(locStart, ',');
275 if (locEnd == NULL) {
276 locEnd = localesListEnd;
277 }
278 while (*locStart == ' ') {
279 locStart++;
280 }
281 const char *trimmedEnd = locEnd-1;
282 while (trimmedEnd > locStart && *trimmedEnd == ' ') {
283 trimmedEnd--;
284 }
285 if (trimmedEnd <= locStart) {
286 break;
287 }
288 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
289 localeListCount++;
290
291 // We have one locale from the locales list.
292 // Add the script chars for this locale to the accumulating set of allowed chars.
293 // If the locale is no good, we will be notified back via status.
294 addScriptChars(locale, &allowedChars, status);
295 uprv_free((void *)locale);
296 if (U_FAILURE(status)) {
297 break;
298 }
299 locStart = locEnd + 1;
300 } while (locStart < localesListEnd);
301
302 // If our caller provided an empty list of locales, we disable the allowed characters checking
303 if (localeListCount == 0) {
304 uprv_free((void *)fAllowedLocales);
305 fAllowedLocales = uprv_strdup("");
306 tmpSet = new UnicodeSet(0, 0x10ffff);
307 if (fAllowedLocales == NULL || tmpSet == NULL) {
308 status = U_MEMORY_ALLOCATION_ERROR;
309 return;
310 }
311 tmpSet->freeze();
312 delete fAllowedCharsSet;
313 fAllowedCharsSet = tmpSet;
314 fChecks &= ~USPOOF_CHAR_LIMIT;
315 return;
316 }
317
318
319 // Add all common and inherited characters to the set of allowed chars.
320 UnicodeSet tempSet;
321 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
322 allowedChars.addAll(tempSet);
323 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
324 allowedChars.addAll(tempSet);
325
326 // If anything went wrong, we bail out without changing
327 // the state of the spoof checker.
328 if (U_FAILURE(status)) {
329 return;
330 }
331
332 // Store the updated spoof checker state.
333 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
334 const char *tmpLocalesList = uprv_strdup(localesList);
335 if (tmpSet == NULL || tmpLocalesList == NULL) {
336 status = U_MEMORY_ALLOCATION_ERROR;
337 return;
338 }
339 uprv_free((void *)fAllowedLocales);
340 fAllowedLocales = tmpLocalesList;
341 tmpSet->freeze();
342 delete fAllowedCharsSet;
343 fAllowedCharsSet = tmpSet;
344 fChecks |= USPOOF_CHAR_LIMIT;
345 }
346
347
getAllowedLocales(UErrorCode &)348 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
349 return fAllowedLocales;
350 }
351
352
353 // Given a locale (a language), add all the characters from all of the scripts used with that language
354 // to the allowedChars UnicodeSet
355
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)356 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
357 UScriptCode scripts[30];
358
359 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
360 if (U_FAILURE(status)) {
361 return;
362 }
363 if (status == U_USING_DEFAULT_WARNING) {
364 status = U_ILLEGAL_ARGUMENT_ERROR;
365 return;
366 }
367 UnicodeSet tmpSet;
368 int32_t i;
369 for (i=0; i<numScripts; i++) {
370 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
371 allowedChars->addAll(tmpSet);
372 }
373 }
374
375
376 // Convert a text format hex number. Utility function used by builder code. Static.
377 // Input: UChar *string text. Output: a UChar32
378 // Input has been pre-checked, and will have no non-hex chars.
379 // The number must fall in the code point range of 0..0x10ffff
380 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)381 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
382 if (U_FAILURE(status)) {
383 return 0;
384 }
385 U_ASSERT(limit-start > 0);
386 uint32_t val = 0;
387 int i;
388 for (i=start; i<limit; i++) {
389 int digitVal = s[i] - 0x30;
390 if (digitVal>9) {
391 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
392 }
393 if (digitVal>15) {
394 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
395 }
396 U_ASSERT(digitVal <= 0xf);
397 val <<= 4;
398 val += digitVal;
399 }
400 if (val > 0x10ffff) {
401 status = U_PARSE_ERROR;
402 val = 0;
403 }
404 return (UChar32)val;
405 }
406
407 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
408 // Maintain a one-element cache, which is sufficient to avoid repeatedly
409 // creating new ones unless we get multi-thread concurrency in spoof
410 // check operations, which should be statistically uncommon.
411
412 // These functions are used in place of new & delete of an IdentifierInfo.
413 // They will recycle the IdentifierInfo when possible.
414 // They are logically const, and used within const functions that must be thread safe.
getIdentifierInfo(UErrorCode & status) const415 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
416 IdentifierInfo *returnIdInfo = NULL;
417 if (U_FAILURE(status)) {
418 return returnIdInfo;
419 }
420 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
421 {
422 Mutex m;
423 returnIdInfo = nonConstThis->fCachedIdentifierInfo;
424 nonConstThis->fCachedIdentifierInfo = NULL;
425 }
426 if (returnIdInfo == NULL) {
427 returnIdInfo = new IdentifierInfo(status);
428 if (U_SUCCESS(status) && returnIdInfo == NULL) {
429 status = U_MEMORY_ALLOCATION_ERROR;
430 }
431 if (U_FAILURE(status) && returnIdInfo != NULL) {
432 delete returnIdInfo;
433 returnIdInfo = NULL;
434 }
435 }
436 return returnIdInfo;
437 }
438
439
releaseIdentifierInfo(IdentifierInfo * idInfo) const440 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
441 if (idInfo != NULL) {
442 SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
443 {
444 Mutex m;
445 if (nonConstThis->fCachedIdentifierInfo == NULL) {
446 nonConstThis->fCachedIdentifierInfo = idInfo;
447 idInfo = NULL;
448 }
449 }
450 delete idInfo;
451 }
452 }
453
454
455
456
457 //----------------------------------------------------------------------------------------------
458 //
459 // class SpoofData Implementation
460 //
461 //----------------------------------------------------------------------------------------------
462
463
validateDataVersion(const SpoofDataHeader * rawData,UErrorCode & status)464 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
465 if (U_FAILURE(status) ||
466 rawData == NULL ||
467 rawData->fMagic != USPOOF_MAGIC ||
468 rawData->fFormatVersion[0] > 1 ||
469 rawData->fFormatVersion[1] > 0) {
470 status = U_INVALID_FORMAT_ERROR;
471 return FALSE;
472 }
473 return TRUE;
474 }
475
476 static UBool U_CALLCONV
spoofDataIsAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)477 spoofDataIsAcceptable(void *context,
478 const char * /* type */, const char * /*name*/,
479 const UDataInfo *pInfo) {
480 if(
481 pInfo->size >= 20 &&
482 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
483 pInfo->charsetFamily == U_CHARSET_FAMILY &&
484 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
485 pInfo->dataFormat[1] == 0x66 &&
486 pInfo->dataFormat[2] == 0x75 &&
487 pInfo->dataFormat[3] == 0x20 &&
488 pInfo->formatVersion[0] == 1
489 ) {
490 UVersionInfo *version = static_cast<UVersionInfo *>(context);
491 if(version != NULL) {
492 uprv_memcpy(version, pInfo->dataVersion, 4);
493 }
494 return TRUE;
495 } else {
496 return FALSE;
497 }
498 }
499
500 //
501 // SpoofData::getDefault() - return a wrapper around the spoof data that is
502 // baked into the default ICU data.
503 //
504 // Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
505 // SpoofData is shared by all spoof checkers using the default data.
506 //
getDefault(UErrorCode & status)507 SpoofData *SpoofData::getDefault(UErrorCode &status) {
508 UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
509 spoofDataIsAcceptable,
510 NULL, // context, would receive dataVersion if supplied.
511 &status);
512 if (U_FAILURE(status)) {
513 return NULL;
514 }
515 SpoofData *This = new SpoofData(udm, status);
516 if (U_FAILURE(status)) {
517 delete This;
518 return NULL;
519 }
520 if (This == NULL) {
521 status = U_MEMORY_ALLOCATION_ERROR;
522 }
523 return This;
524 }
525
SpoofData(UDataMemory * udm,UErrorCode & status)526 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
527 {
528 reset();
529 if (U_FAILURE(status)) {
530 return;
531 }
532 fUDM = udm;
533 // fRawData is non-const because it may be constructed by the data builder.
534 fRawData = reinterpret_cast<SpoofDataHeader *>(
535 const_cast<void *>(udata_getMemory(udm)));
536 validateDataVersion(fRawData, status);
537 initPtrs(status);
538 }
539
540
SpoofData(const void * data,int32_t length,UErrorCode & status)541 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
542 {
543 reset();
544 if (U_FAILURE(status)) {
545 return;
546 }
547 if ((size_t)length < sizeof(SpoofDataHeader)) {
548 status = U_INVALID_FORMAT_ERROR;
549 return;
550 }
551 void *ncData = const_cast<void *>(data);
552 fRawData = static_cast<SpoofDataHeader *>(ncData);
553 if (length < fRawData->fLength) {
554 status = U_INVALID_FORMAT_ERROR;
555 return;
556 }
557 validateDataVersion(fRawData, status);
558 initPtrs(status);
559 }
560
561
562 // Spoof Data constructor for use from data builder.
563 // Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)564 SpoofData::SpoofData(UErrorCode &status) {
565 reset();
566 if (U_FAILURE(status)) {
567 return;
568 }
569 fDataOwned = true;
570
571 // The spoof header should already be sized to be a multiple of 16 bytes.
572 // Just in case it's not, round it up.
573 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
574 U_ASSERT(initialSize == sizeof(SpoofDataHeader));
575
576 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
577 fMemLimit = initialSize;
578 if (fRawData == NULL) {
579 status = U_MEMORY_ALLOCATION_ERROR;
580 return;
581 }
582 uprv_memset(fRawData, 0, initialSize);
583
584 fRawData->fMagic = USPOOF_MAGIC;
585 fRawData->fFormatVersion[0] = 1;
586 fRawData->fFormatVersion[1] = 0;
587 fRawData->fFormatVersion[2] = 0;
588 fRawData->fFormatVersion[3] = 0;
589 initPtrs(status);
590 }
591
592 // reset() - initialize all fields.
593 // Should be updated if any new fields are added.
594 // Called by constructors to put things in a known initial state.
reset()595 void SpoofData::reset() {
596 fRawData = NULL;
597 fDataOwned = FALSE;
598 fUDM = NULL;
599 fMemLimit = 0;
600 fRefCount = 1;
601 fCFUKeys = NULL;
602 fCFUValues = NULL;
603 fCFUStringLengths = NULL;
604 fCFUStrings = NULL;
605 fAnyCaseTrie = NULL;
606 fLowerCaseTrie = NULL;
607 fScriptSets = NULL;
608 }
609
610
611 // SpoofData::initPtrs()
612 // Initialize the pointers to the various sections of the raw data.
613 //
614 // This function is used both during the Trie building process (multiple
615 // times, as the individual data sections are added), and
616 // during the opening of a Spoof Checker from prebuilt data.
617 //
618 // The pointers for non-existent data sections (identified by an offset of 0)
619 // are set to NULL.
620 //
621 // Note: During building the data, adding each new data section
622 // reallocs the raw data area, which likely relocates it, which
623 // in turn requires reinitializing all of the pointers into it, hence
624 // multiple calls to this function during building.
625 //
initPtrs(UErrorCode & status)626 void SpoofData::initPtrs(UErrorCode &status) {
627 fCFUKeys = NULL;
628 fCFUValues = NULL;
629 fCFUStringLengths = NULL;
630 fCFUStrings = NULL;
631 if (U_FAILURE(status)) {
632 return;
633 }
634 if (fRawData->fCFUKeys != 0) {
635 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
636 }
637 if (fRawData->fCFUStringIndex != 0) {
638 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
639 }
640 if (fRawData->fCFUStringLengths != 0) {
641 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
642 }
643 if (fRawData->fCFUStringTable != 0) {
644 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
645 }
646
647 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
648 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
649 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
650 }
651 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
652 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
653 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
654 }
655
656 if (fRawData->fScriptSets != 0) {
657 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
658 }
659 }
660
661
~SpoofData()662 SpoofData::~SpoofData() {
663 utrie2_close(fAnyCaseTrie);
664 fAnyCaseTrie = NULL;
665 utrie2_close(fLowerCaseTrie);
666 fLowerCaseTrie = NULL;
667 if (fDataOwned) {
668 uprv_free(fRawData);
669 }
670 fRawData = NULL;
671 if (fUDM != NULL) {
672 udata_close(fUDM);
673 }
674 fUDM = NULL;
675 }
676
677
removeReference()678 void SpoofData::removeReference() {
679 if (umtx_atomic_dec(&fRefCount) == 0) {
680 delete this;
681 }
682 }
683
684
addReference()685 SpoofData *SpoofData::addReference() {
686 umtx_atomic_inc(&fRefCount);
687 return this;
688 }
689
690
reserveSpace(int32_t numBytes,UErrorCode & status)691 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
692 if (U_FAILURE(status)) {
693 return NULL;
694 }
695 if (!fDataOwned) {
696 U_ASSERT(FALSE);
697 status = U_INTERNAL_PROGRAM_ERROR;
698 return NULL;
699 }
700
701 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
702 uint32_t returnOffset = fMemLimit;
703 fMemLimit += numBytes;
704 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
705 fRawData->fLength = fMemLimit;
706 uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
707 initPtrs(status);
708 return (char *)fRawData + returnOffset;
709 }
710
711
712 U_NAMESPACE_END
713
714 U_NAMESPACE_USE
715
716 //-----------------------------------------------------------------------------
717 //
718 // uspoof_swap - byte swap and char encoding swap of spoof data
719 //
720 //-----------------------------------------------------------------------------
721 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)722 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
723 UErrorCode *status) {
724
725 if (status == NULL || U_FAILURE(*status)) {
726 return 0;
727 }
728 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
729 *status=U_ILLEGAL_ARGUMENT_ERROR;
730 return 0;
731 }
732
733 //
734 // Check that the data header is for spoof data.
735 // (Header contents are defined in gencfu.cpp)
736 //
737 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
738 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
739 pInfo->dataFormat[1]==0x66 &&
740 pInfo->dataFormat[2]==0x75 &&
741 pInfo->dataFormat[3]==0x20 &&
742 pInfo->formatVersion[0]==1 )) {
743 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
744 "(format version %02x %02x %02x %02x) is not recognized\n",
745 pInfo->dataFormat[0], pInfo->dataFormat[1],
746 pInfo->dataFormat[2], pInfo->dataFormat[3],
747 pInfo->formatVersion[0], pInfo->formatVersion[1],
748 pInfo->formatVersion[2], pInfo->formatVersion[3]);
749 *status=U_UNSUPPORTED_ERROR;
750 return 0;
751 }
752
753 //
754 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
755 // header). This swap also conveniently gets us
756 // the size of the ICU d.h., which lets us locate the start
757 // of the uspoof specific data.
758 //
759 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
760
761
762 //
763 // Get the Spoof Data Header, and check that it appears to be OK.
764 //
765 //
766 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
767 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
768 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
769 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
770 {
771 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
772 *status=U_UNSUPPORTED_ERROR;
773 return 0;
774 }
775
776 //
777 // Prefight operation? Just return the size
778 //
779 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
780 int32_t totalSize = headerSize + spoofDataLength;
781 if (length < 0) {
782 return totalSize;
783 }
784
785 //
786 // Check that length passed in is consistent with length from Spoof data header.
787 //
788 if (length < totalSize) {
789 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
790 spoofDataLength);
791 *status=U_INDEX_OUTOFBOUNDS_ERROR;
792 return 0;
793 }
794
795
796 //
797 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
798 // we need to reference the header to locate the data, and an
799 // inplace swap of the header leaves it unusable.
800 //
801 uint8_t *outBytes = (uint8_t *)outData + headerSize;
802 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
803
804 int32_t sectionStart;
805 int32_t sectionLength;
806
807 //
808 // If not swapping in place, zero out the output buffer before starting.
809 // Gaps may exist between the individual sections, and these must be zeroed in
810 // the output buffer. The simplest way to do that is to just zero the whole thing.
811 //
812 if (inBytes != outBytes) {
813 uprv_memset(outBytes, 0, spoofDataLength);
814 }
815
816 // Confusables Keys Section (fCFUKeys)
817 sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
818 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
819 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
820
821 // String Index Section
822 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
823 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
824 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
825
826 // String Table Section
827 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
828 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
829 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
830
831 // String Lengths Section
832 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
833 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
834 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
835
836 // Any Case Trie
837 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
838 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
839 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
840
841 // Lower Case Trie
842 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
843 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
844 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
845
846 // Script Sets. The data is an array of int32_t
847 sectionStart = ds->readUInt32(spoofDH->fScriptSets);
848 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
849 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
850
851 // And, last, swap the header itself.
852 // int32_t fMagic // swap this
853 // uint8_t fFormatVersion[4] // Do not swap this, just copy
854 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
855 //
856 uint32_t magic = ds->readUInt32(spoofDH->fMagic);
857 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
858
859 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
860 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
861 }
862 // swap starting at fLength
863 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
864
865 return totalSize;
866 }
867
868 #endif
869
870
871