1 /*
2 **********************************************************************
3 *   Copyright (C) 2008-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/uchar.h"
11 #include "unicode/uniset.h"
12 #include "unicode/utf16.h"
13 #include "utrie2.h"
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "identifier_info.h"
17 #include "scriptset.h"
18 #include "umutex.h"
19 #include "udataswp.h"
20 #include "uassert.h"
21 #include "uspoof_impl.h"
22 
23 #if !UCONFIG_NO_NORMALIZATION
24 
25 
26 U_NAMESPACE_BEGIN
27 
// ICU run-time type identification boilerplate for class SpoofImpl.
// NOTE(review): presumably defines the class-ID functions declared for SpoofImpl
//               in uspoof_impl.h; confirm against that header.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
29 
30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
31         fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
32         fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
33     if (U_FAILURE(status)) {
34         return;
35     }
36     fSpoofData = data;
37     fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
38 
39     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
40     allowedCharsSet->freeze();
41     fAllowedCharsSet = allowedCharsSet;
42     fAllowedLocales  = uprv_strdup("");
43     if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
44         status = U_MEMORY_ALLOCATION_ERROR;
45         return;
46     }
47     fMagic = USPOOF_MAGIC;
48 }
49 
50 
SpoofImpl()51 SpoofImpl::SpoofImpl() :
52         fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
53         fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
54     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
55     allowedCharsSet->freeze();
56     fAllowedCharsSet = allowedCharsSet;
57     fAllowedLocales  = uprv_strdup("");
58     fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
59 }
60 
61 
62 // Copy Constructor, used by the user level clone() function.
SpoofImpl(const SpoofImpl & src,UErrorCode & status)63 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
64         fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
65         fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
66     if (U_FAILURE(status)) {
67         return;
68     }
69     fMagic = src.fMagic;
70     fChecks = src.fChecks;
71     if (src.fSpoofData != NULL) {
72         fSpoofData = src.fSpoofData->addReference();
73     }
74     fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
75     if (fAllowedCharsSet == NULL) {
76         status = U_MEMORY_ALLOCATION_ERROR;
77     }
78     fAllowedLocales = uprv_strdup(src.fAllowedLocales);
79     fRestrictionLevel = src.fRestrictionLevel;
80 }
81 
~SpoofImpl()82 SpoofImpl::~SpoofImpl() {
83     fMagic = 0;                // head off application errors by preventing use of
84                                //    of deleted objects.
85     if (fSpoofData != NULL) {
86         fSpoofData->removeReference();   // Will delete if refCount goes to zero.
87     }
88     delete fAllowedCharsSet;
89     uprv_free((void *)fAllowedLocales);
90     delete fCachedIdentifierInfo;
91 }
92 
93 //
94 //  Incoming parameter check on Status and the SpoofChecker object
95 //    received from the C API.
96 //
validateThis(const USpoofChecker * sc,UErrorCode & status)97 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
98     if (U_FAILURE(status)) {
99         return NULL;
100     }
101     if (sc == NULL) {
102         status = U_ILLEGAL_ARGUMENT_ERROR;
103         return NULL;
104     }
105     SpoofImpl *This = (SpoofImpl *)sc;
106     if (This->fMagic != USPOOF_MAGIC ||
107         This->fSpoofData == NULL) {
108         status = U_INVALID_FORMAT_ERROR;
109         return NULL;
110     }
111     if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
112         return NULL;
113     }
114     return This;
115 }
116 
validateThis(USpoofChecker * sc,UErrorCode & status)117 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
118     return const_cast<SpoofImpl *>
119         (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
120 }
121 
122 
123 
124 //--------------------------------------------------------------------------------------
125 //
126 //  confusableLookup()    This is the heart of the confusable skeleton generation
127 //                        implementation.
128 //
129 //                        Given a source character, produce the corresponding
130 //                        replacement character(s), appending them to the dest string.
131 //
132 //---------------------------------------------------------------------------------------
confusableLookup(UChar32 inChar,int32_t tableMask,UnicodeString & dest) const133 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
134 
135     // Binary search the spoof data key table for the inChar
136     int32_t  *low   = fSpoofData->fCFUKeys;
137     int32_t  *mid   = NULL;
138     int32_t  *limit = low + fSpoofData->fRawData->fCFUKeysSize;
139     UChar32   midc;
140     do {
141         int32_t delta = ((int32_t)(limit-low))/2;
142         mid = low + delta;
143         midc = *mid & 0x1fffff;
144         if (inChar == midc) {
145             goto foundChar;
146         } else if (inChar < midc) {
147             limit = mid;
148         } else {
149             low = mid;
150         }
151     } while (low < limit-1);
152     mid = low;
153     midc = *mid & 0x1fffff;
154     if (inChar != midc) {
155         // Char not found.  It maps to itself.
156         int i = 0;
157         dest.append(inChar);
158         return i;
159     }
160   foundChar:
161     int32_t keyFlags = *mid & 0xff000000;
162     if ((keyFlags & tableMask) == 0) {
163         // We found the right key char, but the entry doesn't pertain to the
164         //  table we need.  See if there is an adjacent key that does
165         if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
166             int32_t *altMid;
167             for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
168                 keyFlags = *altMid & 0xff000000;
169                 if (keyFlags & tableMask) {
170                     mid = altMid;
171                     goto foundKey;
172                 }
173             }
174             for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
175                 keyFlags = *altMid & 0xff000000;
176                 if (keyFlags & tableMask) {
177                     mid = altMid;
178                     goto foundKey;
179                 }
180             }
181         }
182         // No key entry for this char & table.
183         // The input char maps to itself.
184         int i = 0;
185         dest.append(inChar);
186         return i;
187     }
188 
189   foundKey:
190     int32_t  stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
191     int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
192 
193     // Value is either a UChar  (for strings of length 1) or
194     //                 an index into the string table (for longer strings)
195     uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
196     if (stringLen == 1) {
197         dest.append((UChar)value);
198         return 1;
199     }
200 
201     // String length of 4 from the above lookup is used for all strings of length >= 4.
202     // For these, get the real length from the string lengths table,
203     //   which maps string table indexes to lengths.
204     //   All strings of the same length are stored contiguously in the string table.
205     //   'value' from the lookup above is the starting index for the desired string.
206 
207     int32_t ix;
208     if (stringLen == 4) {
209         int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
210         for (ix = 0; ix < stringLengthsLimit; ix++) {
211             if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
212                 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
213                 break;
214             }
215         }
216         U_ASSERT(ix < stringLengthsLimit);
217     }
218 
219     U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
220     UChar *src = &fSpoofData->fCFUStrings[value];
221     dest.append(src, stringLen);
222     return stringLen;
223 }
224 
225 
226 //---------------------------------------------------------------------------------------
227 //
228 //  wholeScriptCheck()
229 //
230 //      Input text is already normalized to NFD
231 //      Return the set of scripts, each of which can represent something that is
232 //             confusable with the input text.  The script of the input text
233 //             is included; input consisting of characters from a single script will
234 //             always produce a result consisting of a set containing that script.
235 //
236 //---------------------------------------------------------------------------------------
wholeScriptCheck(const UnicodeString & text,ScriptSet * result,UErrorCode & status) const237 void SpoofImpl::wholeScriptCheck(
238         const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
239 
240     UTrie2 *table =
241         (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
242     result->setAll();
243     int32_t length = text.length();
244     for (int32_t inputIdx=0; inputIdx < length;) {
245         UChar32 c = text.char32At(inputIdx);
246         inputIdx += U16_LENGTH(c);
247         uint32_t index = utrie2_get32(table, c);
248         if (index == 0) {
249             // No confusables in another script for this char.
250             // TODO:  we should change the data to have sets with just the single script
251             //        bit for the script of this char.  Gets rid of this special case.
252             //        Until then, grab the script from the char and intersect it with the set.
253             UScriptCode cpScript = uscript_getScript(c, &status);
254             U_ASSERT(cpScript > USCRIPT_INHERITED);
255             result->intersect(cpScript, status);
256         } else if (index == 1) {
257             // Script == Common or Inherited.  Nothing to do.
258         } else {
259             result->intersect(fSpoofData->fScriptSets[index]);
260         }
261     }
262 }
263 
264 
setAllowedLocales(const char * localesList,UErrorCode & status)265 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
266     UnicodeSet    allowedChars;
267     UnicodeSet    *tmpSet = NULL;
268     const char    *locStart = localesList;
269     const char    *locEnd = NULL;
270     const char    *localesListEnd = localesList + uprv_strlen(localesList);
271     int32_t        localeListCount = 0;   // Number of locales provided by caller.
272 
273     // Loop runs once per locale from the localesList, a comma separated list of locales.
274     do {
275         locEnd = uprv_strchr(locStart, ',');
276         if (locEnd == NULL) {
277             locEnd = localesListEnd;
278         }
279         while (*locStart == ' ') {
280             locStart++;
281         }
282         const char *trimmedEnd = locEnd-1;
283         while (trimmedEnd > locStart && *trimmedEnd == ' ') {
284             trimmedEnd--;
285         }
286         if (trimmedEnd <= locStart) {
287             break;
288         }
289         const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
290         localeListCount++;
291 
292         // We have one locale from the locales list.
293         // Add the script chars for this locale to the accumulating set of allowed chars.
294         // If the locale is no good, we will be notified back via status.
295         addScriptChars(locale, &allowedChars, status);
296         uprv_free((void *)locale);
297         if (U_FAILURE(status)) {
298             break;
299         }
300         locStart = locEnd + 1;
301     } while (locStart < localesListEnd);
302 
303     // If our caller provided an empty list of locales, we disable the allowed characters checking
304     if (localeListCount == 0) {
305         uprv_free((void *)fAllowedLocales);
306         fAllowedLocales = uprv_strdup("");
307         tmpSet = new UnicodeSet(0, 0x10ffff);
308         if (fAllowedLocales == NULL || tmpSet == NULL) {
309             status = U_MEMORY_ALLOCATION_ERROR;
310             return;
311         }
312         tmpSet->freeze();
313         delete fAllowedCharsSet;
314         fAllowedCharsSet = tmpSet;
315         fChecks &= ~USPOOF_CHAR_LIMIT;
316         return;
317     }
318 
319 
320     // Add all common and inherited characters to the set of allowed chars.
321     UnicodeSet tempSet;
322     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
323     allowedChars.addAll(tempSet);
324     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
325     allowedChars.addAll(tempSet);
326 
327     // If anything went wrong, we bail out without changing
328     // the state of the spoof checker.
329     if (U_FAILURE(status)) {
330         return;
331     }
332 
333     // Store the updated spoof checker state.
334     tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
335     const char *tmpLocalesList = uprv_strdup(localesList);
336     if (tmpSet == NULL || tmpLocalesList == NULL) {
337         status = U_MEMORY_ALLOCATION_ERROR;
338         return;
339     }
340     uprv_free((void *)fAllowedLocales);
341     fAllowedLocales = tmpLocalesList;
342     tmpSet->freeze();
343     delete fAllowedCharsSet;
344     fAllowedCharsSet = tmpSet;
345     fChecks |= USPOOF_CHAR_LIMIT;
346 }
347 
348 
getAllowedLocales(UErrorCode &)349 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
350     return fAllowedLocales;
351 }
352 
353 
354 // Given a locale (a language), add all the characters from all of the scripts used with that language
355 // to the allowedChars UnicodeSet
356 
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)357 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
358     UScriptCode scripts[30];
359 
360     int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
361     if (U_FAILURE(status)) {
362         return;
363     }
364     if (status == U_USING_DEFAULT_WARNING) {
365         status = U_ILLEGAL_ARGUMENT_ERROR;
366         return;
367     }
368     UnicodeSet tmpSet;
369     int32_t    i;
370     for (i=0; i<numScripts; i++) {
371         tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
372         allowedChars->addAll(tmpSet);
373     }
374 }
375 
376 
377 // Convert a text format hex number.  Utility function used by builder code.  Static.
378 // Input: UChar *string text.  Output: a UChar32
379 // Input has been pre-checked, and will have no non-hex chars.
380 // The number must fall in the code point range of 0..0x10ffff
381 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)382 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
383     if (U_FAILURE(status)) {
384         return 0;
385     }
386     U_ASSERT(limit-start > 0);
387     uint32_t val = 0;
388     int i;
389     for (i=start; i<limit; i++) {
390         int digitVal = s[i] - 0x30;
391         if (digitVal>9) {
392             digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
393         }
394         if (digitVal>15) {
395             digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
396         }
397         U_ASSERT(digitVal <= 0xf);
398         val <<= 4;
399         val += digitVal;
400     }
401     if (val > 0x10ffff) {
402         status = U_PARSE_ERROR;
403         val = 0;
404     }
405     return (UChar32)val;
406 }
407 
408 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
409 //                       Maintain a one-element cache, which is sufficient to avoid repeatedly
410 //                       creating new ones unless we get multi-thread concurrency in spoof
411 //                       check operations, which should be statistically uncommon.
412 
413 // These functions are used in place of new & delete of an IdentifierInfo.
414 // They will recycle the IdentifierInfo when possible.
415 // They are logically const, and used within const functions that must be thread safe.
getIdentifierInfo(UErrorCode & status) const416 IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
417     IdentifierInfo *returnIdInfo = NULL;
418     if (U_FAILURE(status)) {
419         return returnIdInfo;
420     }
421     SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
422     {
423         Mutex m;
424         returnIdInfo = nonConstThis->fCachedIdentifierInfo;
425         nonConstThis->fCachedIdentifierInfo = NULL;
426     }
427     if (returnIdInfo == NULL) {
428         returnIdInfo = new IdentifierInfo(status);
429         if (U_SUCCESS(status) && returnIdInfo == NULL) {
430             status = U_MEMORY_ALLOCATION_ERROR;
431         }
432         if (U_FAILURE(status) && returnIdInfo != NULL) {
433             delete returnIdInfo;
434             returnIdInfo = NULL;
435         }
436     }
437     return returnIdInfo;
438 }
439 
440 
releaseIdentifierInfo(IdentifierInfo * idInfo) const441 void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
442     if (idInfo != NULL) {
443         SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
444         {
445             Mutex m;
446             if (nonConstThis->fCachedIdentifierInfo == NULL) {
447                 nonConstThis->fCachedIdentifierInfo = idInfo;
448                 idInfo = NULL;
449             }
450         }
451         delete idInfo;
452     }
453 }
454 
455 
456 
457 
458 //----------------------------------------------------------------------------------------------
459 //
460 //   class SpoofData Implementation
461 //
462 //----------------------------------------------------------------------------------------------
463 
464 
validateDataVersion(const SpoofDataHeader * rawData,UErrorCode & status)465 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
466     if (U_FAILURE(status) ||
467         rawData == NULL ||
468         rawData->fMagic != USPOOF_MAGIC ||
469         rawData->fFormatVersion[0] > 1 ||
470         rawData->fFormatVersion[1] > 0) {
471             status = U_INVALID_FORMAT_ERROR;
472             return FALSE;
473     }
474     return TRUE;
475 }
476 
477 static UBool U_CALLCONV
spoofDataIsAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)478 spoofDataIsAcceptable(void *context,
479                         const char * /* type */, const char * /*name*/,
480                         const UDataInfo *pInfo) {
481     if(
482         pInfo->size >= 20 &&
483         pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
484         pInfo->charsetFamily == U_CHARSET_FAMILY &&
485         pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
486         pInfo->dataFormat[1] == 0x66 &&
487         pInfo->dataFormat[2] == 0x75 &&
488         pInfo->dataFormat[3] == 0x20 &&
489         pInfo->formatVersion[0] == 1
490     ) {
491         UVersionInfo *version = static_cast<UVersionInfo *>(context);
492         if(version != NULL) {
493             uprv_memcpy(version, pInfo->dataVersion, 4);
494         }
495         return TRUE;
496     } else {
497         return FALSE;
498     }
499 }
500 
501 //
502 //  SpoofData::getDefault() - return a wrapper around the spoof data that is
503 //                           baked into the default ICU data.
504 //
getDefault(UErrorCode & status)505 SpoofData *SpoofData::getDefault(UErrorCode &status) {
506     // TODO:  Cache it.  Lazy create, keep until cleanup.
507 
508     UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
509                                         spoofDataIsAcceptable,
510                                         NULL,       // context, would receive dataVersion if supplied.
511                                         &status);
512     if (U_FAILURE(status)) {
513         return NULL;
514     }
515     SpoofData *This = new SpoofData(udm, status);
516     if (U_FAILURE(status)) {
517         delete This;
518         return NULL;
519     }
520     if (This == NULL) {
521         status = U_MEMORY_ALLOCATION_ERROR;
522     }
523     return This;
524 }
525 
SpoofData(UDataMemory * udm,UErrorCode & status)526 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
527 {
528     reset();
529     if (U_FAILURE(status)) {
530         return;
531     }
532     fUDM = udm;
533     // fRawData is non-const because it may be constructed by the data builder.
534     fRawData = reinterpret_cast<SpoofDataHeader *>(
535             const_cast<void *>(udata_getMemory(udm)));
536     validateDataVersion(fRawData, status);
537     initPtrs(status);
538 }
539 
540 
SpoofData(const void * data,int32_t length,UErrorCode & status)541 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
542 {
543     reset();
544     if (U_FAILURE(status)) {
545         return;
546     }
547     if ((size_t)length < sizeof(SpoofDataHeader)) {
548         status = U_INVALID_FORMAT_ERROR;
549         return;
550     }
551     void *ncData = const_cast<void *>(data);
552     fRawData = static_cast<SpoofDataHeader *>(ncData);
553     if (length < fRawData->fLength) {
554         status = U_INVALID_FORMAT_ERROR;
555         return;
556     }
557     validateDataVersion(fRawData, status);
558     initPtrs(status);
559 }
560 
561 
562 // Spoof Data constructor for use from data builder.
563 //   Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)564 SpoofData::SpoofData(UErrorCode &status) {
565     reset();
566     if (U_FAILURE(status)) {
567         return;
568     }
569     fDataOwned = true;
570     fRefCount = 1;
571 
572     // The spoof header should already be sized to be a multiple of 16 bytes.
573     // Just in case it's not, round it up.
574     uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
575     U_ASSERT(initialSize == sizeof(SpoofDataHeader));
576 
577     fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
578     fMemLimit = initialSize;
579     if (fRawData == NULL) {
580         status = U_MEMORY_ALLOCATION_ERROR;
581         return;
582     }
583     uprv_memset(fRawData, 0, initialSize);
584 
585     fRawData->fMagic = USPOOF_MAGIC;
586     fRawData->fFormatVersion[0] = 1;
587     fRawData->fFormatVersion[1] = 0;
588     fRawData->fFormatVersion[2] = 0;
589     fRawData->fFormatVersion[3] = 0;
590     initPtrs(status);
591 }
592 
593 // reset() - initialize all fields.
594 //           Should be updated if any new fields are added.
595 //           Called by constructors to put things in a known initial state.
reset()596 void SpoofData::reset() {
597    fRawData = NULL;
598    fDataOwned = FALSE;
599    fUDM      = NULL;
600    fMemLimit = 0;
601    fRefCount = 1;
602    fCFUKeys = NULL;
603    fCFUValues = NULL;
604    fCFUStringLengths = NULL;
605    fCFUStrings = NULL;
606    fAnyCaseTrie = NULL;
607    fLowerCaseTrie = NULL;
608    fScriptSets = NULL;
609 }
610 
611 
612 //  SpoofData::initPtrs()
613 //            Initialize the pointers to the various sections of the raw data.
614 //
615 //            This function is used both during the Trie building process (multiple
616 //            times, as the individual data sections are added), and
617 //            during the opening of a Spoof Checker from prebuilt data.
618 //
619 //            The pointers for non-existent data sections (identified by an offset of 0)
620 //            are set to NULL.
621 //
622 //            Note:  During building the data, adding each new data section
623 //            reallocs the raw data area, which likely relocates it, which
624 //            in turn requires reinitializing all of the pointers into it, hence
625 //            multiple calls to this function during building.
626 //
initPtrs(UErrorCode & status)627 void SpoofData::initPtrs(UErrorCode &status) {
628     fCFUKeys = NULL;
629     fCFUValues = NULL;
630     fCFUStringLengths = NULL;
631     fCFUStrings = NULL;
632     if (U_FAILURE(status)) {
633         return;
634     }
635     if (fRawData->fCFUKeys != 0) {
636         fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
637     }
638     if (fRawData->fCFUStringIndex != 0) {
639         fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
640     }
641     if (fRawData->fCFUStringLengths != 0) {
642         fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
643     }
644     if (fRawData->fCFUStringTable != 0) {
645         fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
646     }
647 
648     if (fAnyCaseTrie ==  NULL && fRawData->fAnyCaseTrie != 0) {
649         fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
650             (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
651     }
652     if (fLowerCaseTrie ==  NULL && fRawData->fLowerCaseTrie != 0) {
653         fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
654             (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
655     }
656 
657     if (fRawData->fScriptSets != 0) {
658         fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
659     }
660 }
661 
662 
~SpoofData()663 SpoofData::~SpoofData() {
664     utrie2_close(fAnyCaseTrie);
665     fAnyCaseTrie = NULL;
666     utrie2_close(fLowerCaseTrie);
667     fLowerCaseTrie = NULL;
668     if (fDataOwned) {
669         uprv_free(fRawData);
670     }
671     fRawData = NULL;
672     if (fUDM != NULL) {
673         udata_close(fUDM);
674     }
675     fUDM = NULL;
676 }
677 
678 
removeReference()679 void SpoofData::removeReference() {
680     if (umtx_atomic_dec(&fRefCount) == 0) {
681         delete this;
682     }
683 }
684 
685 
addReference()686 SpoofData *SpoofData::addReference() {
687     umtx_atomic_inc(&fRefCount);
688     return this;
689 }
690 
691 
reserveSpace(int32_t numBytes,UErrorCode & status)692 void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
693     if (U_FAILURE(status)) {
694         return NULL;
695     }
696     if (!fDataOwned) {
697         U_ASSERT(FALSE);
698         status = U_INTERNAL_PROGRAM_ERROR;
699         return NULL;
700     }
701 
702     numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
703     uint32_t returnOffset = fMemLimit;
704     fMemLimit += numBytes;
705     fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
706     fRawData->fLength = fMemLimit;
707     uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
708     initPtrs(status);
709     return (char *)fRawData + returnOffset;
710 }
711 
712 
713 U_NAMESPACE_END
714 
715 U_NAMESPACE_USE
716 
717 //-----------------------------------------------------------------------------
718 //
719 //  uspoof_swap   -  byte swap and char encoding swap of spoof data
720 //
721 //-----------------------------------------------------------------------------
722 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)723 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
724            UErrorCode *status) {
725 
726     if (status == NULL || U_FAILURE(*status)) {
727         return 0;
728     }
729     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
730         *status=U_ILLEGAL_ARGUMENT_ERROR;
731         return 0;
732     }
733 
734     //
735     //  Check that the data header is for spoof data.
736     //    (Header contents are defined in gencfu.cpp)
737     //
738     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
739     if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
740            pInfo->dataFormat[1]==0x66 &&
741            pInfo->dataFormat[2]==0x75 &&
742            pInfo->dataFormat[3]==0x20 &&
743            pInfo->formatVersion[0]==1  )) {
744         udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
745                              "(format version %02x %02x %02x %02x) is not recognized\n",
746                          pInfo->dataFormat[0], pInfo->dataFormat[1],
747                          pInfo->dataFormat[2], pInfo->dataFormat[3],
748                          pInfo->formatVersion[0], pInfo->formatVersion[1],
749                          pInfo->formatVersion[2], pInfo->formatVersion[3]);
750         *status=U_UNSUPPORTED_ERROR;
751         return 0;
752     }
753 
754     //
755     // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
756     //                         header).  This swap also conveniently gets us
757     //                         the size of the ICU d.h., which lets us locate the start
758     //                         of the uspoof specific data.
759     //
760     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
761 
762 
763     //
764     // Get the Spoof Data Header, and check that it appears to be OK.
765     //
766     //
767     const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
768     SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
769     if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
770         ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
771     {
772         udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
773         *status=U_UNSUPPORTED_ERROR;
774         return 0;
775     }
776 
777     //
778     // Prefight operation?  Just return the size
779     //
780     int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
781     int32_t totalSize = headerSize + spoofDataLength;
782     if (length < 0) {
783         return totalSize;
784     }
785 
786     //
787     // Check that length passed in is consistent with length from Spoof data header.
788     //
789     if (length < totalSize) {
790         udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
791                             spoofDataLength);
792         *status=U_INDEX_OUTOFBOUNDS_ERROR;
793         return 0;
794         }
795 
796 
797     //
798     // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
799     //                 we need to reference the header to locate the data, and an
800     //                 inplace swap of the header leaves it unusable.
801     //
802     uint8_t          *outBytes = (uint8_t *)outData + headerSize;
803     SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
804 
805     int32_t   sectionStart;
806     int32_t   sectionLength;
807 
808     //
809     // If not swapping in place, zero out the output buffer before starting.
810     //    Gaps may exist between the individual sections, and these must be zeroed in
811     //    the output buffer.  The simplest way to do that is to just zero the whole thing.
812     //
813     if (inBytes != outBytes) {
814         uprv_memset(outBytes, 0, spoofDataLength);
815     }
816 
817     // Confusables Keys Section   (fCFUKeys)
818     sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
819     sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
820     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
821 
822     // String Index Section
823     sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
824     sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
825     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
826 
827     // String Table Section
828     sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
829     sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
830     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
831 
832     // String Lengths Section
833     sectionStart  = ds->readUInt32(spoofDH->fCFUStringLengths);
834     sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
835     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
836 
837     // Any Case Trie
838     sectionStart  = ds->readUInt32(spoofDH->fAnyCaseTrie);
839     sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
840     utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
841 
842     // Lower Case Trie
843     sectionStart  = ds->readUInt32(spoofDH->fLowerCaseTrie);
844     sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
845     utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
846 
847     // Script Sets.  The data is an array of int32_t
848     sectionStart  = ds->readUInt32(spoofDH->fScriptSets);
849     sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
850     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
851 
852     // And, last, swap the header itself.
853     //   int32_t   fMagic             // swap this
854     //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
855     //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
856     //
857     uint32_t magic = ds->readUInt32(spoofDH->fMagic);
858     ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
859 
860     if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
861         uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
862     }
863     // swap starting at fLength
864     ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
865 
866     return totalSize;
867 }
868 
869 #endif
870 
871 
872