1 /*
2 **********************************************************************
3 *   Copyright (C) 2012-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 
10 #include "unicode/uchar.h"
11 #include "unicode/utf16.h"
12 
13 #include "identifier_info.h"
14 #include "mutex.h"
15 #include "scriptset.h"
16 #include "ucln_in.h"
17 #include "uvector.h"
18 
19 U_NAMESPACE_BEGIN
20 
21 static UnicodeSet *ASCII;
22 static ScriptSet *JAPANESE;
23 static ScriptSet *CHINESE;
24 static ScriptSet *KOREAN;
25 static ScriptSet *CONFUSABLE_WITH_LATIN;
26 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
27 
28 
29 U_CDECL_BEGIN
30 static UBool U_CALLCONV
IdentifierInfo_cleanup(void)31 IdentifierInfo_cleanup(void) {
32     delete ASCII;
33     ASCII = NULL;
34     delete JAPANESE;
35     JAPANESE = NULL;
36     delete CHINESE;
37     CHINESE = NULL;
38     delete KOREAN;
39     KOREAN = NULL;
40     delete CONFUSABLE_WITH_LATIN;
41     CONFUSABLE_WITH_LATIN = NULL;
42     gIdentifierInfoInitOnce.reset();
43     return TRUE;
44 }
45 
46 static void U_CALLCONV
IdentifierInfo_init(UErrorCode & status)47 IdentifierInfo_init(UErrorCode &status) {
48     ASCII    = new UnicodeSet(0, 0x7f);
49     JAPANESE = new ScriptSet();
50     CHINESE  = new ScriptSet();
51     KOREAN   = new ScriptSet();
52     CONFUSABLE_WITH_LATIN = new ScriptSet();
53     if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
54             || CONFUSABLE_WITH_LATIN == NULL) {
55         status = U_MEMORY_ALLOCATION_ERROR;
56         return;
57     }
58     ASCII->freeze();
59     JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
60              .set(USCRIPT_KATAKANA, status);
61     CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
62     KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
63     CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
64               .set(USCRIPT_CHEROKEE, status);
65     ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
66 }
67 U_CDECL_END
68 
69 
IdentifierInfo(UErrorCode & status)70 IdentifierInfo::IdentifierInfo(UErrorCode &status):
71          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
72          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
73     umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
74     if (U_FAILURE(status)) {
75         return;
76     }
77 
78     fIdentifier = new UnicodeString();
79     fRequiredScripts = new ScriptSet();
80     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
81     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
82     fCommonAmongAlternates = new ScriptSet();
83     fNumerics = new UnicodeSet();
84     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
85 
86     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
87                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
88         status = U_MEMORY_ALLOCATION_ERROR;
89     }
90 }
91 
~IdentifierInfo()92 IdentifierInfo::~IdentifierInfo() {
93     delete fIdentifier;
94     delete fRequiredScripts;
95     uhash_close(fScriptSetSet);
96     delete fCommonAmongAlternates;
97     delete fNumerics;
98     delete fIdentifierProfile;
99 }
100 
101 
clear()102 IdentifierInfo &IdentifierInfo::clear() {
103     fRequiredScripts->resetAll();
104     uhash_removeAll(fScriptSetSet);
105     fNumerics->clear();
106     fCommonAmongAlternates->resetAll();
107     return *this;
108 }
109 
110 
setIdentifierProfile(const UnicodeSet & identifierProfile)111 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
112     *fIdentifierProfile = identifierProfile;
113     return *this;
114 }
115 
116 
getIdentifierProfile() const117 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
118     return *fIdentifierProfile;
119 }
120 
121 
setIdentifier(const UnicodeString & identifier,UErrorCode & status)122 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
123     if (U_FAILURE(status)) {
124         return *this;
125     }
126     *fIdentifier = identifier;
127     clear();
128     ScriptSet scriptsForCP;
129     UChar32 cp;
130     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
131         cp = identifier.char32At(i);
132         // Store a representative character for each kind of decimal digit
133         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
134             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
135             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
136         }
137         UScriptCode extensions[500];
138         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
139         if (U_FAILURE(status)) {
140             return *this;
141         }
142         scriptsForCP.resetAll();
143         for (int32_t j=0; j<extensionsCount; j++) {
144             scriptsForCP.set(extensions[j], status);
145         }
146         scriptsForCP.reset(USCRIPT_COMMON, status);
147         scriptsForCP.reset(USCRIPT_INHERITED, status);
148         switch (scriptsForCP.countMembers()) {
149           case 0: break;
150           case 1:
151             // Single script, record it.
152             fRequiredScripts->Union(scriptsForCP);
153             break;
154           default:
155             if (!fRequiredScripts->intersects(scriptsForCP)
156                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
157                 // If the set hasn't been added already, add it
158                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
159                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
160             }
161             break;
162         }
163     }
164     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
165     // [Kana], [Kana Hira] => [Kana]
166     // This is relatively infrequent, so doesn't have to be optimized.
167     // We also compute any commonalities among the alternates.
168     if (uhash_count(fScriptSetSet) > 0) {
169         fCommonAmongAlternates->setAll();
170         for (int32_t it = UHASH_FIRST;;) {
171             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
172             if (nextHashEl == NULL) {
173                 break;
174             }
175             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
176             // [Kana], [Kana Hira] => [Kana]
177             if (fRequiredScripts->intersects(*next)) {
178                 uhash_removeElement(fScriptSetSet, nextHashEl);
179             } else {
180                 fCommonAmongAlternates->intersect(*next);
181                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
182                 for (int32_t otherIt = UHASH_FIRST;;) {
183                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
184                     if (otherHashEl == NULL) {
185                         break;
186                     }
187                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
188                     if (next != other && next->contains(*other)) {
189                         uhash_removeElement(fScriptSetSet, nextHashEl);
190                         break;
191                     }
192                 }
193             }
194         }
195     }
196     if (uhash_count(fScriptSetSet) == 0) {
197         fCommonAmongAlternates->resetAll();
198     }
199     return *this;
200 }
201 
202 
getIdentifier() const203 const UnicodeString *IdentifierInfo::getIdentifier() const {
204     return fIdentifier;
205 }
206 
getScripts() const207 const ScriptSet *IdentifierInfo::getScripts() const {
208     return fRequiredScripts;
209 }
210 
getAlternates() const211 const UHashtable *IdentifierInfo::getAlternates() const {
212     return fScriptSetSet;
213 }
214 
215 
getNumerics() const216 const UnicodeSet *IdentifierInfo::getNumerics() const {
217     return fNumerics;
218 }
219 
getCommonAmongAlternates() const220 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
221     return fCommonAmongAlternates;
222 }
223 
224 #if !UCONFIG_NO_NORMALIZATION
225 
getRestrictionLevel(UErrorCode & status) const226 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
227     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
228         return USPOOF_UNRESTRICTIVE;
229     }
230     if (ASCII->containsAll(*fIdentifier)) {
231         return USPOOF_ASCII;
232     }
233     // This is a bit tricky. We look at a number of factors.
234     // The number of scripts in the text.
235     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
236     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
237 
238     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
239     //       time it is created, in setIdentifier().
240     int32_t cardinalityPlus = fRequiredScripts->countMembers() +
241             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
242     if (cardinalityPlus < 2) {
243         return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
244     }
245     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
246             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
247         return USPOOF_HIGHLY_RESTRICTIVE;
248     }
249     if (cardinalityPlus == 2 &&
250             fRequiredScripts->test(USCRIPT_LATIN, status) &&
251             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
252         return USPOOF_MODERATELY_RESTRICTIVE;
253     }
254     return USPOOF_MINIMALLY_RESTRICTIVE;
255 }
256 
257 #endif /* !UCONFIG_NO_NORMALIZATION */
258 
getScriptCount() const259 int32_t IdentifierInfo::getScriptCount() const {
260     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
261     int32_t count = fRequiredScripts->countMembers() +
262             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
263     return count;
264 }
265 
266 
267 
containsWithAlternates(const ScriptSet & container,const ScriptSet & containee) const268 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
269     if (!container.contains(containee)) {
270         return FALSE;
271     }
272     for (int32_t iter = UHASH_FIRST; ;) {
273         const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
274         if (hashEl == NULL) {
275             break;
276         }
277         ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
278         if (!container.intersects(*alternatives)) {
279             return false;
280         }
281     }
282     return true;
283 }
284 
displayAlternates(UnicodeString & dest,const UHashtable * alternates,UErrorCode & status)285 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
286     UVector sorted(status);
287     if (U_FAILURE(status)) {
288         return dest;
289     }
290     for (int32_t pos = UHASH_FIRST; ;) {
291         const UHashElement *el = uhash_nextElement(alternates, &pos);
292         if (el == NULL) {
293             break;
294         }
295         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
296         sorted.addElement(ss, status);
297     }
298     sorted.sort(uhash_compareScriptSet, status);
299     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
300     for (int32_t i=0; i<sorted.size(); i++) {
301         if (i>0) {
302             dest.append(separator);
303         }
304         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
305         ss->displayScripts(dest);
306     }
307     return dest;
308 }
309 
310 U_NAMESPACE_END
311 
312