1 /*
2 **********************************************************************
3 *   Copyright (C) 2008-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/uchar.h"
11 #include "unicode/uniset.h"
12 #include "unicode/utf16.h"
13 #include "utrie2.h"
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "identifier_info.h"
17 #include "scriptset.h"
18 #include "umutex.h"
19 #include "udataswp.h"
20 #include "uassert.h"
21 #include "uspoof_impl.h"
22 
23 #if !UCONFIG_NO_NORMALIZATION
24 
25 
26 U_NAMESPACE_BEGIN
27 
// ICU RTTI boilerplate for SpoofImpl.  The macro is defined outside this file;
// NOTE(review): presumably it supplies the class-ID functions expected of
// UObject subclasses -- confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
29 
30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
31         fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
32         fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
33     if (U_FAILURE(status)) {
34         return;
35     }
36     fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
37 
38     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
39     allowedCharsSet->freeze();
40     fAllowedCharsSet = allowedCharsSet;
41     fAllowedLocales  = uprv_strdup("");
42     if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
43         status = U_MEMORY_ALLOCATION_ERROR;
44         return;
45     }
46     fMagic = USPOOF_MAGIC;
47 }
48 
49 
SpoofImpl()50 SpoofImpl::SpoofImpl() :
51         fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
52         fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
53     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
54     allowedCharsSet->freeze();
55     fAllowedCharsSet = allowedCharsSet;
56     fAllowedLocales  = uprv_strdup("");
57     fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
58 }
59 
60 
61 // Copy Constructor, used by the user level clone() function.
SpoofImpl(const SpoofImpl & src,UErrorCode & status)62 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
63         fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
64         fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
65     if (U_FAILURE(status)) {
66         return;
67     }
68     fMagic = src.fMagic;
69     fChecks = src.fChecks;
70     if (src.fSpoofData != NULL) {
71         fSpoofData = src.fSpoofData->addReference();
72     }
73     fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
74     if (fAllowedCharsSet == NULL) {
75         status = U_MEMORY_ALLOCATION_ERROR;
76     }
77     fAllowedLocales = uprv_strdup(src.fAllowedLocales);
78     fRestrictionLevel = src.fRestrictionLevel;
79 }
80 
~SpoofImpl()81 SpoofImpl::~SpoofImpl() {
82     fMagic = 0;                // head off application errors by preventing use of
83                                //    of deleted objects.
84     if (fSpoofData != NULL) {
85         fSpoofData->removeReference();   // Will delete if refCount goes to zero.
86     }
87     delete fAllowedCharsSet;
88     uprv_free((void *)fAllowedLocales);
89     delete fCachedIdentifierInfo;
90 }
91 
92 //
93 //  Incoming parameter check on Status and the SpoofChecker object
94 //    received from the C API.
95 //
validateThis(const USpoofChecker * sc,UErrorCode & status)96 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
97     if (U_FAILURE(status)) {
98         return NULL;
99     }
100     if (sc == NULL) {
101         status = U_ILLEGAL_ARGUMENT_ERROR;
102         return NULL;
103     }
104     SpoofImpl *This = (SpoofImpl *)sc;
105     if (This->fMagic != USPOOF_MAGIC ||
106         This->fSpoofData == NULL) {
107         status = U_INVALID_FORMAT_ERROR;
108         return NULL;
109     }
110     if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
111         return NULL;
112     }
113     return This;
114 }
115 
validateThis(USpoofChecker * sc,UErrorCode & status)116 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
117     return const_cast<SpoofImpl *>
118         (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
119 }
120 
121 
122 
123 //--------------------------------------------------------------------------------------
124 //
125 //  confusableLookup()    This is the heart of the confusable skeleton generation
126 //                        implementation.
127 //
128 //                        Given a source character, produce the corresponding
129 //                        replacement character(s), appending them to the dest string.
130 //
131 //---------------------------------------------------------------------------------------
confusableLookup(UChar32 inChar,int32_t tableMask,UnicodeString & dest) const132 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
133 
134     // Binary search the spoof data key table for the inChar
135     int32_t  *low   = fSpoofData->fCFUKeys;
136     int32_t  *mid   = NULL;
137     int32_t  *limit = low + fSpoofData->fRawData->fCFUKeysSize;
138     UChar32   midc;
139     do {
140         int32_t delta = ((int32_t)(limit-low))/2;
141         mid = low + delta;
142         midc = *mid & 0x1fffff;
143         if (inChar == midc) {
144             goto foundChar;
145         } else if (inChar < midc) {
146             limit = mid;
147         } else {
148             low = mid;
149         }
150     } while (low < limit-1);
151     mid = low;
152     midc = *mid & 0x1fffff;
153     if (inChar != midc) {
154         // Char not found.  It maps to itself.
155         int i = 0;
156         dest.append(inChar);
157         return i;
158     }
159   foundChar:
160     int32_t keyFlags = *mid & 0xff000000;
161     if ((keyFlags & tableMask) == 0) {
162         // We found the right key char, but the entry doesn't pertain to the
163         //  table we need.  See if there is an adjacent key that does
164         if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
165             int32_t *altMid;
166             for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
167                 keyFlags = *altMid & 0xff000000;
168                 if (keyFlags & tableMask) {
169                     mid = altMid;
170                     goto foundKey;
171                 }
172             }
173             for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
174                 keyFlags = *altMid & 0xff000000;
175                 if (keyFlags & tableMask) {
176                     mid = altMid;
177                     goto foundKey;
178                 }
179             }
180         }
181         // No key entry for this char & table.
182         // The input char maps to itself.
183         int i = 0;
184         dest.append(inChar);
185         return i;
186     }
187 
188   foundKey:
189     int32_t  stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
190     int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
191 
192     // Value is either a UChar  (for strings of length 1) or
193     //                 an index into the string table (for longer strings)
194     uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
195     if (stringLen == 1) {
196         dest.append((UChar)value);
197         return 1;
198     }
199 
200     // String length of 4 from the above lookup is used for all strings of length >= 4.
201     // For these, get the real length from the string lengths table,
202     //   which maps string table indexes to lengths.
203     //   All strings of the same length are stored contiguously in the string table.
204     //   'value' from the lookup above is the starting index for the desired string.
205 
206     int32_t ix;
207     if (stringLen == 4) {
208         int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
209         for (ix = 0; ix < stringLengthsLimit; ix++) {
210             if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
211                 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
212                 break;
213             }
214         }
215         U_ASSERT(ix < stringLengthsLimit);
216     }
217 
218     U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
219     UChar *src = &fSpoofData->fCFUStrings[value];
220     dest.append(src, stringLen);
221     return stringLen;
222 }
223 
224 
225 //---------------------------------------------------------------------------------------
226 //
227 //  wholeScriptCheck()
228 //
229 //      Input text is already normalized to NFD
230 //      Return the set of scripts, each of which can represent something that is
231 //             confusable with the input text.  The script of the input text
232 //             is included; input consisting of characters from a single script will
233 //             always produce a result consisting of a set containing that script.
234 //
235 //---------------------------------------------------------------------------------------
wholeScriptCheck(const UnicodeString & text,ScriptSet * result,UErrorCode & status) const236 void SpoofImpl::wholeScriptCheck(
237         const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
238 
239     UTrie2 *table =
240         (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
241     result->setAll();
242     int32_t length = text.length();
243     for (int32_t inputIdx=0; inputIdx < length;) {
244         UChar32 c = text.char32At(inputIdx);
245         inputIdx += U16_LENGTH(c);
246         uint32_t index = utrie2_get32(table, c);
247         if (index == 0) {
248             // No confusables in another script for this char.
249             // TODO:  we should change the data to have sets with just the single script
250             //        bit for the script of this char.  Gets rid of this special case.
251             //        Until then, grab the script from the char and intersect it with the set.
252             UScriptCode cpScript = uscript_getScript(c, &status);
253             U_ASSERT(cpScript > USCRIPT_INHERITED);
254             result->intersect(cpScript, status);
255         } else if (index == 1) {
256             // Script == Common or Inherited.  Nothing to do.
257         } else {
258             result->intersect(fSpoofData->fScriptSets[index]);
259         }
260     }
261 }
262 
263 
setAllowedLocales(const char * localesList,UErrorCode & status)264 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
265     UnicodeSet    allowedChars;
266     UnicodeSet    *tmpSet = NULL;
267     const char    *locStart = localesList;
268     const char    *locEnd = NULL;
269     const char    *localesListEnd = localesList + uprv_strlen(localesList);
270     int32_t        localeListCount = 0;   // Number of locales provided by caller.
271 
272     // Loop runs once per locale from the localesList, a comma separated list of locales.
273     do {
274         locEnd = uprv_strchr(locStart, ',');
275         if (locEnd == NULL) {
276             locEnd = localesListEnd;
277         }
278         while (*locStart == ' ') {
279             locStart++;
280         }
281         const char *trimmedEnd = locEnd-1;
282         while (trimmedEnd > locStart && *trimmedEnd == ' ') {
283             trimmedEnd--;
284         }
285         if (trimmedEnd <= locStart) {
286             break;
287         }
288         const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
289         localeListCount++;
290 
291         // We have one locale from the locales list.
292         // Add the script chars for this locale to the accumulating set of allowed chars.
293         // If the locale is no good, we will be notified back via status.
294         addScriptChars(locale, &allowedChars, status);
295         uprv_free((void *)locale);
296         if (U_FAILURE(status)) {
297             break;
298         }
299         locStart = locEnd + 1;
300     } while (locStart < localesListEnd);
301 
302     // If our caller provided an empty list of locales, we disable the allowed characters checking
303     if (localeListCount == 0) {
304         uprv_free((void *)fAllowedLocales);
305         fAllowedLocales = uprv_strdup("");
306         tmpSet = new UnicodeSet(0, 0x10ffff);
307         if (fAllowedLocales == NULL || tmpSet == NULL) {
308             status = U_MEMORY_ALLOCATION_ERROR;
309             return;
310         }
311         tmpSet->freeze();
312         delete fAllowedCharsSet;
313         fAllowedCharsSet = tmpSet;
314         fChecks &= ~USPOOF_CHAR_LIMIT;
315         return;
316     }
317 
318 
319     // Add all common and inherited characters to the set of allowed chars.
320     UnicodeSet tempSet;
321     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
322     allowedChars.addAll(tempSet);
323     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
324     allowedChars.addAll(tempSet);
325 
326     // If anything went wrong, we bail out without changing
327     // the state of the spoof checker.
328     if (U_FAILURE(status)) {
329         return;
330     }
331 
332     // Store the updated spoof checker state.
333     tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
334     const char *tmpLocalesList = uprv_strdup(localesList);
335     if (tmpSet == NULL || tmpLocalesList == NULL) {
336         status = U_MEMORY_ALLOCATION_ERROR;
337         return;
338     }
339     uprv_free((void *)fAllowedLocales);
340     fAllowedLocales = tmpLocalesList;
341     tmpSet->freeze();
342     delete fAllowedCharsSet;
343     fAllowedCharsSet = tmpSet;
344     fChecks |= USPOOF_CHAR_LIMIT;
345 }
346 
347 
getAllowedLocales(UErrorCode &)348 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
349     return fAllowedLocales;
350 }
351 
352 
353 // Given a locale (a language), add all the characters from all of the scripts used with that language
354 // to the allowedChars UnicodeSet
355 
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)356 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
357     UScriptCode scripts[30];
358 
359     int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
360     if (U_FAILURE(status)) {
361         return;
362     }
363     if (status == U_USING_DEFAULT_WARNING) {
364         status = U_ILLEGAL_ARGUMENT_ERROR;
365         return;
366     }
367     UnicodeSet tmpSet;
368     int32_t    i;
369     for (i=0; i<numScripts; i++) {
370         tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
371         allowedChars->addAll(tmpSet);
372     }
373 }
374 
375 
376 // Convert a text format hex number.  Utility function used by builder code.  Static.
377 // Input: UChar *string text.  Output: a UChar32
378 // Input has been pre-checked, and will have no non-hex chars.
379 // The number must fall in the code point range of 0..0x10ffff
380 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)381 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
382     if (U_FAILURE(status)) {
383         return 0;
384     }
385     U_ASSERT(limit-start > 0);
386     uint32_t val = 0;
387     int i;
388     for (i=start; i<limit; i++) {
389         int digitVal = s[i] - 0x30;
390         if (digitVal>9) {
391             digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
392         }
393         if (digitVal>15) {
394             digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
395         }
396         U_ASSERT(digitVal <= 0xf);
397         val <<= 4;
398         val += digitVal;
399     }
400     if (val > 0x10ffff) {
401         status = U_PARSE_ERROR;
402         val = 0;
403     }
404     return (UChar32)val;
405 }
406 
407 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
408 //                       Maintain a one-element cache, which is sufficient to avoid repeatedly
409 //                       creating new ones unless we get multi-thread concurrency in spoof
410 //                       check operations, which should be statistically uncommon.
411 
412 // These functions are used in place of new & delete of an IdentifierInfo.
413 // They will recycle the IdentifierInfo when possible.
414 // They are logically const, and used within const functions that must be thread safe.
getIdentifierInfo(UErrorCode & status) const415 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
416     IdentifierInfo *returnIdInfo = NULL;
417     if (U_FAILURE(status)) {
418         return returnIdInfo;
419     }
420     SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
421     {
422         Mutex m;
423         returnIdInfo = nonConstThis->fCachedIdentifierInfo;
424         nonConstThis->fCachedIdentifierInfo = NULL;
425     }
426     if (returnIdInfo == NULL) {
427         returnIdInfo = new IdentifierInfo(status);
428         if (U_SUCCESS(status) && returnIdInfo == NULL) {
429             status = U_MEMORY_ALLOCATION_ERROR;
430         }
431         if (U_FAILURE(status) && returnIdInfo != NULL) {
432             delete returnIdInfo;
433             returnIdInfo = NULL;
434         }
435     }
436     return returnIdInfo;
437 }
438 
439 
releaseIdentifierInfo(IdentifierInfo * idInfo) const440 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
441     if (idInfo != NULL) {
442         SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
443         {
444             Mutex m;
445             if (nonConstThis->fCachedIdentifierInfo == NULL) {
446                 nonConstThis->fCachedIdentifierInfo = idInfo;
447                 idInfo = NULL;
448             }
449         }
450         delete idInfo;
451     }
452 }
453 
454 
455 
456 
457 //----------------------------------------------------------------------------------------------
458 //
459 //   class SpoofData Implementation
460 //
461 //----------------------------------------------------------------------------------------------
462 
463 
validateDataVersion(const SpoofDataHeader * rawData,UErrorCode & status)464 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
465     if (U_FAILURE(status) ||
466         rawData == NULL ||
467         rawData->fMagic != USPOOF_MAGIC ||
468         rawData->fFormatVersion[0] > 1 ||
469         rawData->fFormatVersion[1] > 0) {
470             status = U_INVALID_FORMAT_ERROR;
471             return FALSE;
472     }
473     return TRUE;
474 }
475 
476 static UBool U_CALLCONV
spoofDataIsAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)477 spoofDataIsAcceptable(void *context,
478                         const char * /* type */, const char * /*name*/,
479                         const UDataInfo *pInfo) {
480     if(
481         pInfo->size >= 20 &&
482         pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
483         pInfo->charsetFamily == U_CHARSET_FAMILY &&
484         pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
485         pInfo->dataFormat[1] == 0x66 &&
486         pInfo->dataFormat[2] == 0x75 &&
487         pInfo->dataFormat[3] == 0x20 &&
488         pInfo->formatVersion[0] == 1
489     ) {
490         UVersionInfo *version = static_cast<UVersionInfo *>(context);
491         if(version != NULL) {
492             uprv_memcpy(version, pInfo->dataVersion, 4);
493         }
494         return TRUE;
495     } else {
496         return FALSE;
497     }
498 }
499 
500 //
501 //  SpoofData::getDefault() - return a wrapper around the spoof data that is
502 //                            baked into the default ICU data.
503 //
504 //               Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
505 //               SpoofData is shared by all spoof checkers using the default data.
506 //
getDefault(UErrorCode & status)507 SpoofData *SpoofData::getDefault(UErrorCode &status) {
508     UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
509                                         spoofDataIsAcceptable,
510                                         NULL,       // context, would receive dataVersion if supplied.
511                                         &status);
512     if (U_FAILURE(status)) {
513         return NULL;
514     }
515     SpoofData *This = new SpoofData(udm, status);
516     if (U_FAILURE(status)) {
517         delete This;
518         return NULL;
519     }
520     if (This == NULL) {
521         status = U_MEMORY_ALLOCATION_ERROR;
522     }
523     return This;
524 }
525 
SpoofData(UDataMemory * udm,UErrorCode & status)526 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
527 {
528     reset();
529     if (U_FAILURE(status)) {
530         return;
531     }
532     fUDM = udm;
533     // fRawData is non-const because it may be constructed by the data builder.
534     fRawData = reinterpret_cast<SpoofDataHeader *>(
535             const_cast<void *>(udata_getMemory(udm)));
536     validateDataVersion(fRawData, status);
537     initPtrs(status);
538 }
539 
540 
SpoofData(const void * data,int32_t length,UErrorCode & status)541 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
542 {
543     reset();
544     if (U_FAILURE(status)) {
545         return;
546     }
547     if ((size_t)length < sizeof(SpoofDataHeader)) {
548         status = U_INVALID_FORMAT_ERROR;
549         return;
550     }
551     void *ncData = const_cast<void *>(data);
552     fRawData = static_cast<SpoofDataHeader *>(ncData);
553     if (length < fRawData->fLength) {
554         status = U_INVALID_FORMAT_ERROR;
555         return;
556     }
557     validateDataVersion(fRawData, status);
558     initPtrs(status);
559 }
560 
561 
562 // Spoof Data constructor for use from data builder.
563 //   Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)564 SpoofData::SpoofData(UErrorCode &status) {
565     reset();
566     if (U_FAILURE(status)) {
567         return;
568     }
569     fDataOwned = true;
570 
571     // The spoof header should already be sized to be a multiple of 16 bytes.
572     // Just in case it's not, round it up.
573     uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
574     U_ASSERT(initialSize == sizeof(SpoofDataHeader));
575 
576     fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
577     fMemLimit = initialSize;
578     if (fRawData == NULL) {
579         status = U_MEMORY_ALLOCATION_ERROR;
580         return;
581     }
582     uprv_memset(fRawData, 0, initialSize);
583 
584     fRawData->fMagic = USPOOF_MAGIC;
585     fRawData->fFormatVersion[0] = 1;
586     fRawData->fFormatVersion[1] = 0;
587     fRawData->fFormatVersion[2] = 0;
588     fRawData->fFormatVersion[3] = 0;
589     initPtrs(status);
590 }
591 
592 // reset() - initialize all fields.
593 //           Should be updated if any new fields are added.
594 //           Called by constructors to put things in a known initial state.
reset()595 void SpoofData::reset() {
596    fRawData = NULL;
597    fDataOwned = FALSE;
598    fUDM      = NULL;
599    fMemLimit = 0;
600    fRefCount = 1;
601    fCFUKeys = NULL;
602    fCFUValues = NULL;
603    fCFUStringLengths = NULL;
604    fCFUStrings = NULL;
605    fAnyCaseTrie = NULL;
606    fLowerCaseTrie = NULL;
607    fScriptSets = NULL;
608 }
609 
610 
611 //  SpoofData::initPtrs()
612 //            Initialize the pointers to the various sections of the raw data.
613 //
614 //            This function is used both during the Trie building process (multiple
615 //            times, as the individual data sections are added), and
616 //            during the opening of a Spoof Checker from prebuilt data.
617 //
618 //            The pointers for non-existent data sections (identified by an offset of 0)
619 //            are set to NULL.
620 //
621 //            Note:  During building the data, adding each new data section
622 //            reallocs the raw data area, which likely relocates it, which
623 //            in turn requires reinitializing all of the pointers into it, hence
624 //            multiple calls to this function during building.
625 //
initPtrs(UErrorCode & status)626 void SpoofData::initPtrs(UErrorCode &status) {
627     fCFUKeys = NULL;
628     fCFUValues = NULL;
629     fCFUStringLengths = NULL;
630     fCFUStrings = NULL;
631     if (U_FAILURE(status)) {
632         return;
633     }
634     if (fRawData->fCFUKeys != 0) {
635         fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
636     }
637     if (fRawData->fCFUStringIndex != 0) {
638         fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
639     }
640     if (fRawData->fCFUStringLengths != 0) {
641         fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
642     }
643     if (fRawData->fCFUStringTable != 0) {
644         fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
645     }
646 
647     if (fAnyCaseTrie ==  NULL && fRawData->fAnyCaseTrie != 0) {
648         fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
649             (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
650     }
651     if (fLowerCaseTrie ==  NULL && fRawData->fLowerCaseTrie != 0) {
652         fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
653             (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
654     }
655 
656     if (fRawData->fScriptSets != 0) {
657         fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
658     }
659 }
660 
661 
~SpoofData()662 SpoofData::~SpoofData() {
663     utrie2_close(fAnyCaseTrie);
664     fAnyCaseTrie = NULL;
665     utrie2_close(fLowerCaseTrie);
666     fLowerCaseTrie = NULL;
667     if (fDataOwned) {
668         uprv_free(fRawData);
669     }
670     fRawData = NULL;
671     if (fUDM != NULL) {
672         udata_close(fUDM);
673     }
674     fUDM = NULL;
675 }
676 
677 
removeReference()678 void SpoofData::removeReference() {
679     if (umtx_atomic_dec(&fRefCount) == 0) {
680         delete this;
681     }
682 }
683 
684 
addReference()685 SpoofData *SpoofData::addReference() {
686     umtx_atomic_inc(&fRefCount);
687     return this;
688 }
689 
690 
reserveSpace(int32_t numBytes,UErrorCode & status)691 void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
692     if (U_FAILURE(status)) {
693         return NULL;
694     }
695     if (!fDataOwned) {
696         U_ASSERT(FALSE);
697         status = U_INTERNAL_PROGRAM_ERROR;
698         return NULL;
699     }
700 
701     numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
702     uint32_t returnOffset = fMemLimit;
703     fMemLimit += numBytes;
704     fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
705     fRawData->fLength = fMemLimit;
706     uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
707     initPtrs(status);
708     return (char *)fRawData + returnOffset;
709 }
710 
711 
712 U_NAMESPACE_END
713 
714 U_NAMESPACE_USE
715 
716 //-----------------------------------------------------------------------------
717 //
718 //  uspoof_swap   -  byte swap and char encoding swap of spoof data
719 //
720 //-----------------------------------------------------------------------------
721 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)722 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
723            UErrorCode *status) {
724 
725     if (status == NULL || U_FAILURE(*status)) {
726         return 0;
727     }
728     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
729         *status=U_ILLEGAL_ARGUMENT_ERROR;
730         return 0;
731     }
732 
733     //
734     //  Check that the data header is for spoof data.
735     //    (Header contents are defined in gencfu.cpp)
736     //
737     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
738     if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
739            pInfo->dataFormat[1]==0x66 &&
740            pInfo->dataFormat[2]==0x75 &&
741            pInfo->dataFormat[3]==0x20 &&
742            pInfo->formatVersion[0]==1  )) {
743         udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
744                              "(format version %02x %02x %02x %02x) is not recognized\n",
745                          pInfo->dataFormat[0], pInfo->dataFormat[1],
746                          pInfo->dataFormat[2], pInfo->dataFormat[3],
747                          pInfo->formatVersion[0], pInfo->formatVersion[1],
748                          pInfo->formatVersion[2], pInfo->formatVersion[3]);
749         *status=U_UNSUPPORTED_ERROR;
750         return 0;
751     }
752 
753     //
754     // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
755     //                         header).  This swap also conveniently gets us
756     //                         the size of the ICU d.h., which lets us locate the start
757     //                         of the uspoof specific data.
758     //
759     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
760 
761 
762     //
763     // Get the Spoof Data Header, and check that it appears to be OK.
764     //
765     //
766     const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
767     SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
768     if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
769         ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
770     {
771         udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
772         *status=U_UNSUPPORTED_ERROR;
773         return 0;
774     }
775 
776     //
777     // Prefight operation?  Just return the size
778     //
779     int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
780     int32_t totalSize = headerSize + spoofDataLength;
781     if (length < 0) {
782         return totalSize;
783     }
784 
785     //
786     // Check that length passed in is consistent with length from Spoof data header.
787     //
788     if (length < totalSize) {
789         udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
790                             spoofDataLength);
791         *status=U_INDEX_OUTOFBOUNDS_ERROR;
792         return 0;
793         }
794 
795 
796     //
797     // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
798     //                 we need to reference the header to locate the data, and an
799     //                 inplace swap of the header leaves it unusable.
800     //
801     uint8_t          *outBytes = (uint8_t *)outData + headerSize;
802     SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
803 
804     int32_t   sectionStart;
805     int32_t   sectionLength;
806 
807     //
808     // If not swapping in place, zero out the output buffer before starting.
809     //    Gaps may exist between the individual sections, and these must be zeroed in
810     //    the output buffer.  The simplest way to do that is to just zero the whole thing.
811     //
812     if (inBytes != outBytes) {
813         uprv_memset(outBytes, 0, spoofDataLength);
814     }
815 
816     // Confusables Keys Section   (fCFUKeys)
817     sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
818     sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
819     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
820 
821     // String Index Section
822     sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
823     sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
824     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
825 
826     // String Table Section
827     sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
828     sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
829     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
830 
831     // String Lengths Section
832     sectionStart  = ds->readUInt32(spoofDH->fCFUStringLengths);
833     sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
834     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
835 
836     // Any Case Trie
837     sectionStart  = ds->readUInt32(spoofDH->fAnyCaseTrie);
838     sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
839     utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
840 
841     // Lower Case Trie
842     sectionStart  = ds->readUInt32(spoofDH->fLowerCaseTrie);
843     sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
844     utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
845 
846     // Script Sets.  The data is an array of int32_t
847     sectionStart  = ds->readUInt32(spoofDH->fScriptSets);
848     sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
849     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
850 
851     // And, last, swap the header itself.
852     //   int32_t   fMagic             // swap this
853     //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
854     //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
855     //
856     uint32_t magic = ds->readUInt32(spoofDH->fMagic);
857     ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
858 
859     if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
860         uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
861     }
862     // swap starting at fLength
863     ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
864 
865     return totalSize;
866 }
867 
868 #endif
869 
870 
871