1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 1997-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8 *
9 * File locid.cpp
10 *
11 * Created by: Richard Gillam
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   02/11/97    aliu        Changed gLocPath to fgDataDirectory and added
17 *                           methods to get and set it.
18 *   04/02/97    aliu        Made operator!= inline; fixed return value
19 *                           of getName().
20 *   04/15/97    aliu        Cleanup for AIX/Win32.
21 *   04/24/97    aliu        Numerous changes per code review.
22 *   08/18/98    stephen     Changed getDisplayName()
23 *                           Added SIMPLIFIED_CHINESE, TRADITIONAL_CHINESE
24 *                           Added getISOCountries(), getISOLanguages(),
25 *                           getLanguagesForCountry()
26 *   03/16/99    bertrand    rehaul.
27 *   07/21/99    stephen     Added U_CFUNC setDefault
28 *   11/09/99    weiv        Added const char * getName() const;
29 *   04/12/00    srl         removing unicodestring api's and cached hash code
30 *   08/10/01    grhoten     Change the static Locales to accessor functions
31 ******************************************************************************
32 */
33 
34 #include <utility>
35 
36 #include "unicode/bytestream.h"
37 #include "unicode/locid.h"
38 #include "unicode/localebuilder.h"
39 #include "unicode/strenum.h"
40 #include "unicode/stringpiece.h"
41 #include "unicode/uloc.h"
42 #include "unicode/ures.h"
43 
44 #include "bytesinkutil.h"
45 #include "charstr.h"
46 #include "charstrmap.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "mutex.h"
50 #include "putilimp.h"
51 #include "uassert.h"
52 #include "ucln_cmn.h"
53 #include "uhash.h"
54 #include "ulocimp.h"
55 #include "umutex.h"
56 #include "uniquecharstr.h"
57 #include "ustr_imp.h"
58 #include "uvector.h"
59 
60 U_CDECL_BEGIN
61 static UBool U_CALLCONV locale_cleanup(void);
62 U_CDECL_END
63 
64 U_NAMESPACE_BEGIN
65 
66 static Locale   *gLocaleCache = NULL;
67 static UInitOnce gLocaleCacheInitOnce = U_INITONCE_INITIALIZER;
68 
69 // gDefaultLocaleMutex protects all access to gDefaultLocalesHashT and gDefaultLocale.
70 static UMutex gDefaultLocaleMutex;
71 static UHashtable *gDefaultLocalesHashT = NULL;
72 static Locale *gDefaultLocale = NULL;
73 
74 /**
75  * \def ULOC_STRING_LIMIT
76  * strings beyond this value crash in CharString
77  */
78 #define ULOC_STRING_LIMIT 357913941
79 
80 U_NAMESPACE_END
81 
/*
 * Slot indices into gLocaleCache for the predefined Locale constants.
 * eMAX_LOCALES is used as the size of that array, so it must remain the
 * last enumerator.
 */
typedef enum ELocalePos {
    /* Language-only locales. */
    eENGLISH, eFRENCH, eGERMAN, eITALIAN, eJAPANESE, eKOREAN, eCHINESE,

    /* Language + country locales. */
    eFRANCE, eGERMANY, eITALY, eJAPAN, eKOREA,
    eCHINA,      /* Alias for PRC */
    eTAIWAN, eUK, eUS, eCANADA, eCANADA_FRENCH,

    /* Root locale. */
    eROOT,

    /* eDEFAULT, */
    eMAX_LOCALES
} ELocalePos;
108 
109 U_CDECL_BEGIN
110 //
// Deleter function for Locales owned by the default Locale hash table.
112 //
113 static void U_CALLCONV
deleteLocale(void * obj)114 deleteLocale(void *obj) {
115     delete (icu::Locale *) obj;
116 }
117 
locale_cleanup(void)118 static UBool U_CALLCONV locale_cleanup(void)
119 {
120     U_NAMESPACE_USE
121 
122     delete [] gLocaleCache;
123     gLocaleCache = NULL;
124     gLocaleCacheInitOnce.reset();
125 
126     if (gDefaultLocalesHashT) {
127         uhash_close(gDefaultLocalesHashT);   // Automatically deletes all elements, using deleter func.
128         gDefaultLocalesHashT = NULL;
129     }
130     gDefaultLocale = NULL;
131     return TRUE;
132 }
133 
134 
locale_init(UErrorCode & status)135 static void U_CALLCONV locale_init(UErrorCode &status) {
136     U_NAMESPACE_USE
137 
138     U_ASSERT(gLocaleCache == NULL);
139     gLocaleCache = new Locale[(int)eMAX_LOCALES];
140     if (gLocaleCache == NULL) {
141         status = U_MEMORY_ALLOCATION_ERROR;
142         return;
143     }
144     ucln_common_registerCleanup(UCLN_COMMON_LOCALE, locale_cleanup);
145     gLocaleCache[eROOT]          = Locale("");
146     gLocaleCache[eENGLISH]       = Locale("en");
147     gLocaleCache[eFRENCH]        = Locale("fr");
148     gLocaleCache[eGERMAN]        = Locale("de");
149     gLocaleCache[eITALIAN]       = Locale("it");
150     gLocaleCache[eJAPANESE]      = Locale("ja");
151     gLocaleCache[eKOREAN]        = Locale("ko");
152     gLocaleCache[eCHINESE]       = Locale("zh");
153     gLocaleCache[eFRANCE]        = Locale("fr", "FR");
154     gLocaleCache[eGERMANY]       = Locale("de", "DE");
155     gLocaleCache[eITALY]         = Locale("it", "IT");
156     gLocaleCache[eJAPAN]         = Locale("ja", "JP");
157     gLocaleCache[eKOREA]         = Locale("ko", "KR");
158     gLocaleCache[eCHINA]         = Locale("zh", "CN");
159     gLocaleCache[eTAIWAN]        = Locale("zh", "TW");
160     gLocaleCache[eUK]            = Locale("en", "GB");
161     gLocaleCache[eUS]            = Locale("en", "US");
162     gLocaleCache[eCANADA]        = Locale("en", "CA");
163     gLocaleCache[eCANADA_FRENCH] = Locale("fr", "CA");
164 }
165 
166 U_CDECL_END
167 
168 U_NAMESPACE_BEGIN
169 
locale_set_default_internal(const char * id,UErrorCode & status)170 Locale *locale_set_default_internal(const char *id, UErrorCode& status) {
171     // Synchronize this entire function.
172     Mutex lock(&gDefaultLocaleMutex);
173 
174     UBool canonicalize = FALSE;
175 
176     // If given a NULL string for the locale id, grab the default
177     //   name from the system.
178     //   (Different from most other locale APIs, where a null name means use
179     //    the current ICU default locale.)
180     if (id == NULL) {
181         id = uprv_getDefaultLocaleID();   // This function not thread safe? TODO: verify.
182         canonicalize = TRUE; // always canonicalize host ID
183     }
184 
185     CharString localeNameBuf;
186     {
187         CharStringByteSink sink(&localeNameBuf);
188         if (canonicalize) {
189             ulocimp_canonicalize(id, sink, &status);
190         } else {
191             ulocimp_getName(id, sink, &status);
192         }
193     }
194 
195     if (U_FAILURE(status)) {
196         return gDefaultLocale;
197     }
198 
199     if (gDefaultLocalesHashT == NULL) {
200         gDefaultLocalesHashT = uhash_open(uhash_hashChars, uhash_compareChars, NULL, &status);
201         if (U_FAILURE(status)) {
202             return gDefaultLocale;
203         }
204         uhash_setValueDeleter(gDefaultLocalesHashT, deleteLocale);
205         ucln_common_registerCleanup(UCLN_COMMON_LOCALE, locale_cleanup);
206     }
207 
208     Locale *newDefault = (Locale *)uhash_get(gDefaultLocalesHashT, localeNameBuf.data());
209     if (newDefault == NULL) {
210         newDefault = new Locale(Locale::eBOGUS);
211         if (newDefault == NULL) {
212             status = U_MEMORY_ALLOCATION_ERROR;
213             return gDefaultLocale;
214         }
215         newDefault->init(localeNameBuf.data(), FALSE);
216         uhash_put(gDefaultLocalesHashT, (char*) newDefault->getName(), newDefault, &status);
217         if (U_FAILURE(status)) {
218             return gDefaultLocale;
219         }
220     }
221     gDefaultLocale = newDefault;
222     return gDefaultLocale;
223 }
224 
225 U_NAMESPACE_END
226 
227 /* sfb 07/21/99 */
228 U_CFUNC void
locale_set_default(const char * id)229 locale_set_default(const char *id)
230 {
231     U_NAMESPACE_USE
232     UErrorCode status = U_ZERO_ERROR;
233     locale_set_default_internal(id, status);
234 }
235 /* end */
236 
237 U_CFUNC const char *
locale_get_default(void)238 locale_get_default(void)
239 {
240     U_NAMESPACE_USE
241     return Locale::getDefault().getName();
242 }
243 
244 
245 U_NAMESPACE_BEGIN
246 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)247 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)
248 
249 /*Character separating the posix id fields*/
250 // '_'
251 // In the platform codepage.
252 #define SEP_CHAR '_'
253 #define NULL_CHAR '\0'
254 
255 Locale::~Locale()
256 {
257     if (baseName != fullName) {
258         uprv_free(baseName);
259     }
260     baseName = NULL;
261     /*if fullName is on the heap, we free it*/
262     if (fullName != fullNameBuffer)
263     {
264         uprv_free(fullName);
265         fullName = NULL;
266     }
267 }
268 
Locale()269 Locale::Locale()
270     : UObject(), fullName(fullNameBuffer), baseName(NULL)
271 {
272     init(NULL, FALSE);
273 }
274 
275 /*
276  * Internal constructor to allow construction of a locale object with
277  *   NO side effects.   (Default constructor tries to get
278  *   the default locale.)
279  */
Locale(Locale::ELocaleType)280 Locale::Locale(Locale::ELocaleType)
281     : UObject(), fullName(fullNameBuffer), baseName(NULL)
282 {
283     setToBogus();
284 }
285 
286 
Locale(const char * newLanguage,const char * newCountry,const char * newVariant,const char * newKeywords)287 Locale::Locale( const   char * newLanguage,
288                 const   char * newCountry,
289                 const   char * newVariant,
290                 const   char * newKeywords)
291     : UObject(), fullName(fullNameBuffer), baseName(NULL)
292 {
293     if( (newLanguage==NULL) && (newCountry == NULL) && (newVariant == NULL) )
294     {
295         init(NULL, FALSE); /* shortcut */
296     }
297     else
298     {
299         UErrorCode status = U_ZERO_ERROR;
300         int32_t size = 0;
301         int32_t lsize = 0;
302         int32_t csize = 0;
303         int32_t vsize = 0;
304         int32_t ksize = 0;
305 
306         // Calculate the size of the resulting string.
307 
308         // Language
309         if ( newLanguage != NULL )
310         {
311             lsize = (int32_t)uprv_strlen(newLanguage);
312             if ( lsize < 0 || lsize > ULOC_STRING_LIMIT ) { // int32 wrap
313                 setToBogus();
314                 return;
315             }
316             size = lsize;
317         }
318 
319         CharString togo(newLanguage, lsize, status); // start with newLanguage
320 
321         // _Country
322         if ( newCountry != NULL )
323         {
324             csize = (int32_t)uprv_strlen(newCountry);
325             if ( csize < 0 || csize > ULOC_STRING_LIMIT ) { // int32 wrap
326                 setToBogus();
327                 return;
328             }
329             size += csize;
330         }
331 
332         // _Variant
333         if ( newVariant != NULL )
334         {
335             // remove leading _'s
336             while(newVariant[0] == SEP_CHAR)
337             {
338                 newVariant++;
339             }
340 
341             // remove trailing _'s
342             vsize = (int32_t)uprv_strlen(newVariant);
343             if ( vsize < 0 || vsize > ULOC_STRING_LIMIT ) { // int32 wrap
344                 setToBogus();
345                 return;
346             }
347             while( (vsize>1) && (newVariant[vsize-1] == SEP_CHAR) )
348             {
349                 vsize--;
350             }
351         }
352 
353         if( vsize > 0 )
354         {
355             size += vsize;
356         }
357 
358         // Separator rules:
359         if ( vsize > 0 )
360         {
361             size += 2;  // at least: __v
362         }
363         else if ( csize > 0 )
364         {
365             size += 1;  // at least: _v
366         }
367 
368         if ( newKeywords != NULL)
369         {
370             ksize = (int32_t)uprv_strlen(newKeywords);
371             if ( ksize < 0 || ksize > ULOC_STRING_LIMIT ) {
372               setToBogus();
373               return;
374             }
375             size += ksize + 1;
376         }
377 
378         //  NOW we have the full locale string..
379         // Now, copy it back.
380 
381         // newLanguage is already copied
382 
383         if ( ( vsize != 0 ) || (csize != 0) )  // at least:  __v
384         {                                      //            ^
385             togo.append(SEP_CHAR, status);
386         }
387 
388         if ( csize != 0 )
389         {
390             togo.append(newCountry, status);
391         }
392 
393         if ( vsize != 0)
394         {
395             togo.append(SEP_CHAR, status)
396                 .append(newVariant, vsize, status);
397         }
398 
399         if ( ksize != 0)
400         {
401             if (uprv_strchr(newKeywords, '=')) {
402                 togo.append('@', status); /* keyword parsing */
403             }
404             else {
405                 togo.append('_', status); /* Variant parsing with a script */
406                 if ( vsize == 0) {
407                     togo.append('_', status); /* No country found */
408                 }
409             }
410             togo.append(newKeywords, status);
411         }
412 
413         if (U_FAILURE(status)) {
414             // Something went wrong with appending, etc.
415             setToBogus();
416             return;
417         }
418         // Parse it, because for example 'language' might really be a complete
419         // string.
420         init(togo.data(), FALSE);
421     }
422 }
423 
Locale::Locale(const Locale &other)
    : UObject(other), fullName(fullNameBuffer), baseName(nullptr)
{
    // Start with no heap storage and let copy assignment duplicate `other`.
    operator=(other);
}
429 
Locale(Locale && other)430 Locale::Locale(Locale&& other) U_NOEXCEPT
431     : UObject(other), fullName(fullNameBuffer), baseName(fullName) {
432   *this = std::move(other);
433 }
434 
/*
 * Copy assignment. The object is first reset with setToBogus(); if
 * duplicating one of the names fails, the function returns early without
 * copying the remaining fields.
 */
Locale& Locale::operator=(const Locale& other) {
    if (this == &other) {
        return *this;
    }

    setToBogus();

    // Full name: copy into the inline buffer when the source uses its own
    // inline buffer, otherwise duplicate the heap string (or mirror a null).
    const bool sourceIsInline = (other.fullName == other.fullNameBuffer);
    if (sourceIsInline) {
        uprv_strcpy(fullNameBuffer, other.fullNameBuffer);
    } else if (other.fullName != nullptr) {
        fullName = uprv_strdup(other.fullName);
        if (fullName == nullptr) {
            return *this;
        }
    } else {
        fullName = nullptr;
    }

    // Base name: keep the aliasing relationship when the source's base name
    // is its full name; otherwise duplicate it if there is one.
    if (other.baseName == other.fullName) {
        baseName = fullName;
    } else if (other.baseName != nullptr) {
        baseName = uprv_strdup(other.baseName);
        if (baseName == nullptr) {
            return *this;
        }
    }

    variantBegin = other.variantBegin;
    fIsBogus = other.fIsBogus;

    uprv_strcpy(language, other.language);
    uprv_strcpy(script, other.script);
    uprv_strcpy(country, other.country);

    return *this;
}
467 
operator =(Locale && other)468 Locale& Locale::operator=(Locale&& other) U_NOEXCEPT {
469     if (baseName != fullName) uprv_free(baseName);
470     if (fullName != fullNameBuffer) uprv_free(fullName);
471 
472     if (other.fullName == other.fullNameBuffer) {
473         uprv_strcpy(fullNameBuffer, other.fullNameBuffer);
474         fullName = fullNameBuffer;
475     } else {
476         fullName = other.fullName;
477     }
478 
479     if (other.baseName == other.fullName) {
480         baseName = fullName;
481     } else {
482         baseName = other.baseName;
483     }
484 
485     uprv_strcpy(language, other.language);
486     uprv_strcpy(script, other.script);
487     uprv_strcpy(country, other.country);
488 
489     variantBegin = other.variantBegin;
490     fIsBogus = other.fIsBogus;
491 
492     other.baseName = other.fullName = other.fullNameBuffer;
493 
494     return *this;
495 }
496 
497 Locale *
clone() const498 Locale::clone() const {
499     return new Locale(*this);
500 }
501 
502 UBool
operator ==(const Locale & other) const503 Locale::operator==( const   Locale& other) const
504 {
505     return (uprv_strcmp(other.fullName, fullName) == 0);
506 }
507 
508 namespace {
509 
510 UInitOnce gKnownCanonicalizedInitOnce = U_INITONCE_INITIALIZER;
511 UHashtable *gKnownCanonicalized = nullptr;
512 
/* Commonly used locale IDs that are known to be canonical already.
 * Entries are grouped by initial letter below. */
static const char* const KNOWN_CANONICALIZED[] = {
    "c",
    /* a */
    "af", "af_ZA", "am", "am_ET", "ar", "ar_001", "as", "as_IN", "az", "az_AZ",
    /* b */
    "be", "be_BY", "bg", "bg_BG", "bn", "bn_IN", "bs", "bs_BA",
    /* c */
    "ca", "ca_ES", "cs", "cs_CZ", "cy", "cy_GB",
    /* d */
    "da", "da_DK", "de", "de_DE",
    /* e */
    "el", "el_GR", "en", "en_GB", "en_US", "es", "es_419", "es_ES",
    "et", "et_EE", "eu", "eu_ES",
    /* f */
    "fa", "fa_IR", "fi", "fi_FI", "fil", "fil_PH", "fr", "fr_FR",
    /* g */
    "ga", "ga_IE", "gl", "gl_ES", "gu", "gu_IN",
    /* h */
    "he", "he_IL", "hi", "hi_IN", "hr", "hr_HR", "hu", "hu_HU", "hy", "hy_AM",
    /* i */
    "id", "id_ID", "is", "is_IS", "it", "it_IT",
    /* j */
    "ja", "ja_JP", "jv", "jv_ID",
    /* k */
    "ka", "ka_GE", "kk", "kk_KZ", "km", "km_KH", "kn", "kn_IN",
    "ko", "ko_KR", "ky", "ky_KG",
    /* l */
    "lo", "lo_LA", "lt", "lt_LT", "lv", "lv_LV",
    /* m */
    "mk", "mk_MK", "ml", "ml_IN", "mn", "mn_MN", "mr", "mr_IN",
    "ms", "ms_MY", "my", "my_MM",
    /* n */
    "nb", "nb_NO", "ne", "ne_NP", "nl", "nl_NL",
    /* o */
    "or", "or_IN",
    /* p */
    "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF", "pt", "pt_BR", "pt_PT",
    /* r */
    "ro", "ro_RO", "ru", "ru_RU",
    /* s */
    "sd", "sd_IN", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI",
    "so", "so_SO", "sq", "sq_AL", "sr", "sr_Cyrl_RS", "sr_Latn", "sr_RS",
    "sv", "sv_SE", "sw", "sw_TZ",
    /* t */
    "ta", "ta_IN", "te", "te_IN", "th", "th_TH", "tk", "tk_TM", "tr", "tr_TR",
    /* u */
    "uk", "uk_UA", "ur", "ur_PK", "uz", "uz_UZ",
    /* v */
    "vi", "vi_VN",
    /* y */
    "yue", "yue_Hant", "yue_Hant_HK", "yue_HK",
    /* z */
    "zh", "zh_CN", "zh_Hans", "zh_Hans_CN", "zh_Hant", "zh_Hant_TW", "zh_TW",
    "zu", "zu_ZA"
};
536 
cleanupKnownCanonicalized()537 static UBool U_CALLCONV cleanupKnownCanonicalized() {
538     gKnownCanonicalizedInitOnce.reset();
539     if (gKnownCanonicalized) { uhash_close(gKnownCanonicalized); }
540     return TRUE;
541 }
542 
loadKnownCanonicalized(UErrorCode & status)543 static void U_CALLCONV loadKnownCanonicalized(UErrorCode &status) {
544     ucln_common_registerCleanup(UCLN_COMMON_LOCALE_KNOWN_CANONICALIZED,
545                                 cleanupKnownCanonicalized);
546     LocalUHashtablePointer newKnownCanonicalizedMap(
547         uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &status));
548     for (int32_t i = 0;
549             U_SUCCESS(status) && i < UPRV_LENGTHOF(KNOWN_CANONICALIZED);
550             i++) {
551         uhash_puti(newKnownCanonicalizedMap.getAlias(),
552                    (void*)KNOWN_CANONICALIZED[i],
553                    1, &status);
554     }
555     if (U_FAILURE(status)) {
556         return;
557     }
558 
559     gKnownCanonicalized = newKnownCanonicalizedMap.orphan();
560 }
561 
562 class AliasData;
563 
564 /**
565  * A Builder class to build the alias data.
566  */
567 class AliasDataBuilder {
568 public:
AliasDataBuilder()569     AliasDataBuilder() {
570     }
571 
572     // Build the AliasData from resource.
573     AliasData* build(UErrorCode &status);
574 
575 private:
576     void readAlias(UResourceBundle* alias,
577                    UniqueCharStrings* strings,
578                    LocalMemory<const char*>& types,
579                    LocalMemory<int32_t>& replacementIndexes,
580                    int32_t &length,
581                    void (*checkType)(const char* type),
582                    void (*checkReplacement)(const UnicodeString& replacement),
583                    UErrorCode &status);
584 
585     // Read the languageAlias data from alias to
586     // strings+types+replacementIndexes
587     // The number of record will be stored into length.
588     // Allocate length items for types, to store the type field.
589     // Allocate length items for replacementIndexes,
590     // to store the index in the strings for the replacement script.
591     void readLanguageAlias(UResourceBundle* alias,
592                            UniqueCharStrings* strings,
593                            LocalMemory<const char*>& types,
594                            LocalMemory<int32_t>& replacementIndexes,
595                            int32_t &length,
596                            UErrorCode &status);
597 
598     // Read the scriptAlias data from alias to
599     // strings+types+replacementIndexes
600     // Allocate length items for types, to store the type field.
601     // Allocate length items for replacementIndexes,
602     // to store the index in the strings for the replacement script.
603     void readScriptAlias(UResourceBundle* alias,
604                          UniqueCharStrings* strings,
605                          LocalMemory<const char*>& types,
606                          LocalMemory<int32_t>& replacementIndexes,
607                          int32_t &length, UErrorCode &status);
608 
609     // Read the territoryAlias data from alias to
610     // strings+types+replacementIndexes
611     // Allocate length items for types, to store the type field.
612     // Allocate length items for replacementIndexes,
613     // to store the index in the strings for the replacement script.
614     void readTerritoryAlias(UResourceBundle* alias,
615                             UniqueCharStrings* strings,
616                             LocalMemory<const char*>& types,
617                             LocalMemory<int32_t>& replacementIndexes,
618                             int32_t &length, UErrorCode &status);
619 
620     // Read the variantAlias data from alias to
621     // strings+types+replacementIndexes
622     // Allocate length items for types, to store the type field.
623     // Allocate length items for replacementIndexes,
624     // to store the index in the strings for the replacement variant.
625     void readVariantAlias(UResourceBundle* alias,
626                           UniqueCharStrings* strings,
627                           LocalMemory<const char*>& types,
628                           LocalMemory<int32_t>& replacementIndexes,
629                           int32_t &length, UErrorCode &status);
630 };
631 
632 /**
633  * A class to hold the Alias Data.
634  */
635 class AliasData : public UMemory {
636 public:
singleton(UErrorCode & status)637     static const AliasData* singleton(UErrorCode& status) {
638         if (U_FAILURE(status)) {
639             // Do not get into loadData if the status already has error.
640             return nullptr;
641         }
642         umtx_initOnce(AliasData::gInitOnce, &AliasData::loadData, status);
643         return gSingleton;
644     }
645 
languageMap() const646     const CharStringMap& languageMap() const { return language; }
scriptMap() const647     const CharStringMap& scriptMap() const { return script; }
territoryMap() const648     const CharStringMap& territoryMap() const { return territory; }
variantMap() const649     const CharStringMap& variantMap() const { return variant; }
650 
651     static void U_CALLCONV loadData(UErrorCode &status);
652     static UBool U_CALLCONV cleanup();
653 
654     static UInitOnce gInitOnce;
655 
656 private:
AliasData(CharStringMap languageMap,CharStringMap scriptMap,CharStringMap territoryMap,CharStringMap variantMap,CharString * strings)657     AliasData(CharStringMap languageMap,
658               CharStringMap scriptMap,
659               CharStringMap territoryMap,
660               CharStringMap variantMap,
661               CharString* strings)
662         : language(std::move(languageMap)),
663           script(std::move(scriptMap)),
664           territory(std::move(territoryMap)),
665           variant(std::move(variantMap)),
666           strings(strings) {
667     }
668 
~AliasData()669     ~AliasData() {
670         delete strings;
671     }
672 
673     static const AliasData* gSingleton;
674 
675     CharStringMap language;
676     CharStringMap script;
677     CharStringMap territory;
678     CharStringMap variant;
679     CharString* strings;
680 
681     friend class AliasDataBuilder;
682 };
683 
684 
685 const AliasData* AliasData::gSingleton = nullptr;
686 UInitOnce AliasData::gInitOnce = U_INITONCE_INITIALIZER;
687 
688 UBool U_CALLCONV
cleanup()689 AliasData::cleanup()
690 {
691     gInitOnce.reset();
692     delete gSingleton;
693     return TRUE;
694 }
695 
696 void
readAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,void (* checkType)(const char * type),void (* checkReplacement)(const UnicodeString & replacement),UErrorCode & status)697 AliasDataBuilder::readAlias(
698         UResourceBundle* alias,
699         UniqueCharStrings* strings,
700         LocalMemory<const char*>& types,
701         LocalMemory<int32_t>& replacementIndexes,
702         int32_t &length,
703         void (*checkType)(const char* type),
704         void (*checkReplacement)(const UnicodeString& replacement),
705         UErrorCode &status) {
706     if (U_FAILURE(status)) {
707         return;
708     }
709     length = ures_getSize(alias);
710     const char** rawTypes = types.allocateInsteadAndCopy(length);
711     if (rawTypes == nullptr) {
712         status = U_MEMORY_ALLOCATION_ERROR;
713         return;
714     }
715     int32_t* rawIndexes = replacementIndexes.allocateInsteadAndCopy(length);
716     if (rawIndexes == nullptr) {
717         status = U_MEMORY_ALLOCATION_ERROR;
718         return;
719     }
720     int i = 0;
721     while (ures_hasNext(alias)) {
722         LocalUResourceBundlePointer res(
723             ures_getNextResource(alias, nullptr, &status));
724         const char* aliasFrom = ures_getKey(res.getAlias());
725         UnicodeString aliasTo =
726             ures_getUnicodeStringByKey(res.getAlias(), "replacement", &status);
727 
728         checkType(aliasFrom);
729         checkReplacement(aliasTo);
730 
731         rawTypes[i] = aliasFrom;
732         rawIndexes[i] = strings->add(aliasTo, status);
733         i++;
734     }
735 }
736 
737 /**
738  * Read the languageAlias data from alias to strings+types+replacementIndexes.
739  * Allocate length items for types, to store the type field. Allocate length
740  * items for replacementIndexes, to store the index in the strings for the
741  * replacement language.
742  */
743 void
readLanguageAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)744 AliasDataBuilder::readLanguageAlias(
745         UResourceBundle* alias,
746         UniqueCharStrings* strings,
747         LocalMemory<const char*>& types,
748         LocalMemory<int32_t>& replacementIndexes,
749         int32_t &length,
750         UErrorCode &status)
751 {
752     return readAlias(
753         alias, strings, types, replacementIndexes, length,
754 #if U_DEBUG
755         [](const char* type) {
756             // Assert the aliasFrom only contains the following possibilties
757             // language_REGION_variant
758             // language_REGION
759             // language_variant
760             // language
761             // und_variant
762             Locale test(type);
763             // Assert no script in aliasFrom
764             U_ASSERT(test.getScript()[0] == '\0');
765             // Assert when language is und, no REGION in aliasFrom.
766             U_ASSERT(test.getLanguage()[0] != '\0' || test.getCountry()[0] == '\0');
767         },
768 #else
769         [](const char*) {},
770 #endif
771         [](const UnicodeString&) {}, status);
772 }
773 
774 /**
775  * Read the scriptAlias data from alias to strings+types+replacementIndexes.
776  * Allocate length items for types, to store the type field. Allocate length
777  * items for replacementIndexes, to store the index in the strings for the
778  * replacement script.
779  */
780 void
readScriptAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)781 AliasDataBuilder::readScriptAlias(
782         UResourceBundle* alias,
783         UniqueCharStrings* strings,
784         LocalMemory<const char*>& types,
785         LocalMemory<int32_t>& replacementIndexes,
786         int32_t &length,
787         UErrorCode &status)
788 {
789     return readAlias(
790         alias, strings, types, replacementIndexes, length,
791 #if U_DEBUG
792         [](const char* type) {
793             U_ASSERT(uprv_strlen(type) == 4);
794         },
795         [](const UnicodeString& replacement) {
796             U_ASSERT(replacement.length() == 4);
797         },
798 #else
799         [](const char*) {},
800         [](const UnicodeString&) { },
801 #endif
802         status);
803 }
804 
805 /**
806  * Read the territoryAlias data from alias to strings+types+replacementIndexes.
807  * Allocate length items for types, to store the type field. Allocate length
808  * items for replacementIndexes, to store the index in the strings for the
809  * replacement regions.
810  */
811 void
readTerritoryAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)812 AliasDataBuilder::readTerritoryAlias(
813         UResourceBundle* alias,
814         UniqueCharStrings* strings,
815         LocalMemory<const char*>& types,
816         LocalMemory<int32_t>& replacementIndexes,
817         int32_t &length,
818         UErrorCode &status)
819 {
820     return readAlias(
821         alias, strings, types, replacementIndexes, length,
822 #if U_DEBUG
823         [](const char* type) {
824             U_ASSERT(uprv_strlen(type) == 2 || uprv_strlen(type) == 3);
825         },
826 #else
827         [](const char*) {},
828 #endif
829         [](const UnicodeString&) { },
830         status);
831 }
832 
833 /**
834  * Read the variantAlias data from alias to strings+types+replacementIndexes.
835  * Allocate length items for types, to store the type field. Allocate length
836  * items for replacementIndexes, to store the index in the strings for the
837  * replacement variant.
838  */
839 void
readVariantAlias(UResourceBundle * alias,UniqueCharStrings * strings,LocalMemory<const char * > & types,LocalMemory<int32_t> & replacementIndexes,int32_t & length,UErrorCode & status)840 AliasDataBuilder::readVariantAlias(
841         UResourceBundle* alias,
842         UniqueCharStrings* strings,
843         LocalMemory<const char*>& types,
844         LocalMemory<int32_t>& replacementIndexes,
845         int32_t &length,
846         UErrorCode &status)
847 {
848     return readAlias(
849         alias, strings, types, replacementIndexes, length,
850 #if U_DEBUG
851         [](const char* type) {
852             U_ASSERT(uprv_strlen(type) >= 4 && uprv_strlen(type) <= 8);
853             U_ASSERT(uprv_strlen(type) != 4 ||
854                      (type[0] >= '0' && type[0] <= '9'));
855         },
856         [](const UnicodeString& replacement) {
857             U_ASSERT(replacement.length() >= 4 && replacement.length() <= 8);
858             U_ASSERT(replacement.length() != 4 ||
859                      (replacement.charAt(0) >= u'0' &&
860                       replacement.charAt(0) <= u'9'));
861         },
862 #else
863         [](const char*) {},
864         [](const UnicodeString&) { },
865 #endif
866         status);
867 }
868 
869 /**
870  * Initializes the alias data from the ICU resource bundles. The alias data
871  * contains alias of language, country, script and variants.
872  *
873  * If the alias data has already loaded, then this method simply returns without
874  * doing anything meaningful.
875  */
876 void U_CALLCONV
loadData(UErrorCode & status)877 AliasData::loadData(UErrorCode &status)
878 {
879 #ifdef LOCALE_CANONICALIZATION_DEBUG
880     UDate start = uprv_getRawUTCtime();
881 #endif  // LOCALE_CANONICALIZATION_DEBUG
882     ucln_common_registerCleanup(UCLN_COMMON_LOCALE_ALIAS, cleanup);
883     AliasDataBuilder builder;
884     gSingleton = builder.build(status);
885 #ifdef LOCALE_CANONICALIZATION_DEBUG
886     UDate end = uprv_getRawUTCtime();
887     printf("AliasData::loadData took total %f ms\n", end - start);
888 #endif  // LOCALE_CANONICALIZATION_DEBUG
889 }
890 
891 /**
892  * Build the alias data from resources.
893  */
894 AliasData*
build(UErrorCode & status)895 AliasDataBuilder::build(UErrorCode &status) {
896     LocalUResourceBundlePointer metadata(
897         ures_openDirect(nullptr, "metadata", &status));
898     LocalUResourceBundlePointer metadataAlias(
899         ures_getByKey(metadata.getAlias(), "alias", nullptr, &status));
900     LocalUResourceBundlePointer languageAlias(
901         ures_getByKey(metadataAlias.getAlias(), "language", nullptr, &status));
902     LocalUResourceBundlePointer scriptAlias(
903         ures_getByKey(metadataAlias.getAlias(), "script", nullptr, &status));
904     LocalUResourceBundlePointer territoryAlias(
905         ures_getByKey(metadataAlias.getAlias(), "territory", nullptr, &status));
906     LocalUResourceBundlePointer variantAlias(
907         ures_getByKey(metadataAlias.getAlias(), "variant", nullptr, &status));
908 
909     if (U_FAILURE(status)) {
910         return nullptr;
911     }
912     int32_t languagesLength = 0, scriptLength = 0, territoryLength = 0,
913             variantLength = 0;
914 
915     // Read the languageAlias into languageTypes, languageReplacementIndexes
916     // and strings
917     UniqueCharStrings strings(status);
918     LocalMemory<const char*> languageTypes;
919     LocalMemory<int32_t> languageReplacementIndexes;
920     readLanguageAlias(languageAlias.getAlias(),
921                       &strings,
922                       languageTypes,
923                       languageReplacementIndexes,
924                       languagesLength,
925                       status);
926 
927     // Read the scriptAlias into scriptTypes, scriptReplacementIndexes
928     // and strings
929     LocalMemory<const char*> scriptTypes;
930     LocalMemory<int32_t> scriptReplacementIndexes;
931     readScriptAlias(scriptAlias.getAlias(),
932                     &strings,
933                     scriptTypes,
934                     scriptReplacementIndexes,
935                     scriptLength,
936                     status);
937 
938     // Read the territoryAlias into territoryTypes, territoryReplacementIndexes
939     // and strings
940     LocalMemory<const char*> territoryTypes;
941     LocalMemory<int32_t> territoryReplacementIndexes;
942     readTerritoryAlias(territoryAlias.getAlias(),
943                        &strings,
944                        territoryTypes,
945                        territoryReplacementIndexes,
946                        territoryLength, status);
947 
948     // Read the variantAlias into variantTypes, variantReplacementIndexes
949     // and strings
950     LocalMemory<const char*> variantTypes;
951     LocalMemory<int32_t> variantReplacementIndexes;
952     readVariantAlias(variantAlias.getAlias(),
953                      &strings,
954                      variantTypes,
955                      variantReplacementIndexes,
956                      variantLength, status);
957 
958     if (U_FAILURE(status)) {
959         return nullptr;
960     }
961 
962     // We can only use strings after freeze it.
963     strings.freeze();
964 
965     // Build the languageMap from languageTypes & languageReplacementIndexes
966     CharStringMap languageMap(490, status);
967     for (int32_t i = 0; U_SUCCESS(status) && i < languagesLength; i++) {
968         languageMap.put(languageTypes[i],
969                         strings.get(languageReplacementIndexes[i]),
970                         status);
971     }
972 
973     // Build the scriptMap from scriptTypes & scriptReplacementIndexes
974     CharStringMap scriptMap(1, status);
975     for (int32_t i = 0; U_SUCCESS(status) && i < scriptLength; i++) {
976         scriptMap.put(scriptTypes[i],
977                       strings.get(scriptReplacementIndexes[i]),
978                       status);
979     }
980 
981     // Build the territoryMap from territoryTypes & territoryReplacementIndexes
982     CharStringMap territoryMap(650, status);
983     for (int32_t i = 0; U_SUCCESS(status) && i < territoryLength; i++) {
984         territoryMap.put(territoryTypes[i],
985                          strings.get(territoryReplacementIndexes[i]),
986                          status);
987     }
988 
989     // Build the variantMap from variantTypes & variantReplacementIndexes.
990     CharStringMap variantMap(2, status);
991     for (int32_t i = 0; U_SUCCESS(status) && i < variantLength; i++) {
992         variantMap.put(variantTypes[i],
993                        strings.get(variantReplacementIndexes[i]),
994                        status);
995     }
996 
997     if (U_FAILURE(status)) {
998         return nullptr;
999     }
1000 
1001     // copy hashtables
1002     auto *data = new AliasData(
1003         std::move(languageMap),
1004         std::move(scriptMap),
1005         std::move(territoryMap),
1006         std::move(variantMap),
1007         strings.orphanCharStrings());
1008 
1009     if (data == nullptr) {
1010         status = U_MEMORY_ALLOCATION_ERROR;
1011     }
1012     return data;
1013 }
1014 
1015 /**
1016  * A class that find the replacement values of locale fields by using AliasData.
1017  */
1018 class AliasReplacer {
1019 public:
AliasReplacer(UErrorCode status)1020     AliasReplacer(UErrorCode status) :
1021             language(nullptr), script(nullptr), region(nullptr),
1022             extensions(nullptr), variants(status),
1023             data(nullptr) {
1024     }
~AliasReplacer()1025     ~AliasReplacer() {
1026     }
1027 
1028     // Check the fields inside locale, if need to replace fields,
1029     // place the the replaced locale ID in out and return true.
1030     // Otherwise return false for no replacement or error.
1031     bool replace(
1032         const Locale& locale, CharString& out, UErrorCode& status);
1033 
1034 private:
1035     const char* language;
1036     const char* script;
1037     const char* region;
1038     const char* extensions;
1039     UVector variants;
1040 
1041     const AliasData* data;
1042 
notEmpty(const char * str)1043     inline bool notEmpty(const char* str) {
1044         return str && str[0] != NULL_CHAR;
1045     }
1046 
1047     /**
1048      * If replacement is neither null nor empty and input is either null or empty,
1049      * return replacement.
1050      * If replacement is neither null nor empty but input is not empty, return input.
1051      * If replacement is either null or empty and type is either null or empty,
1052      * return input.
1053      * Otherwise return null.
1054      *   replacement     input      type        return
1055      *    AAA             nullptr    *           AAA
1056      *    AAA             BBB        *           BBB
1057      *    nullptr || ""   CCC        nullptr     CCC
1058      *    nullptr || ""   *          DDD         nullptr
1059      */
deleteOrReplace(const char * input,const char * type,const char * replacement)1060     inline const char* deleteOrReplace(
1061             const char* input, const char* type, const char* replacement) {
1062         return notEmpty(replacement) ?
1063             ((input == nullptr) ?  replacement : input) :
1064             ((type == nullptr) ? input  : nullptr);
1065     }
1066 
same(const char * a,const char * b)1067     inline bool same(const char* a, const char* b) {
1068         if (a == nullptr && b == nullptr) {
1069             return true;
1070         }
1071         if ((a == nullptr && b != nullptr) ||
1072             (a != nullptr && b == nullptr)) {
1073           return false;
1074         }
1075         return uprv_strcmp(a, b) == 0;
1076     }
1077 
1078     // Gather fields and generate locale ID into out.
1079     CharString& outputToString(CharString& out, UErrorCode status);
1080 
1081     // Generate the lookup key.
1082     CharString& generateKey(const char* language, const char* region,
1083                             const char* variant, CharString& out,
1084                             UErrorCode status);
1085 
1086     void parseLanguageReplacement(const char* replacement,
1087                                   const char*& replaceLanguage,
1088                                   const char*& replaceScript,
1089                                   const char*& replaceRegion,
1090                                   const char*& replaceVariant,
1091                                   const char*& replaceExtensions,
1092                                   UVector& toBeFreed,
1093                                   UErrorCode& status);
1094 
1095     // Replace by using languageAlias.
1096     bool replaceLanguage(bool checkLanguage, bool checkRegion,
1097                          bool checkVariants, UVector& toBeFreed,
1098                          UErrorCode& status);
1099 
1100     // Replace by using territoryAlias.
1101     bool replaceTerritory(UVector& toBeFreed, UErrorCode& status);
1102 
1103     // Replace by using scriptAlias.
1104     bool replaceScript(UErrorCode& status);
1105 
1106     // Replace by using variantAlias.
1107     bool replaceVariant(UErrorCode& status);
1108 };
1109 
1110 CharString&
generateKey(const char * language,const char * region,const char * variant,CharString & out,UErrorCode status)1111 AliasReplacer::generateKey(
1112         const char* language, const char* region, const char* variant,
1113         CharString& out, UErrorCode status)
1114 {
1115     out.append(language, status);
1116     if (notEmpty(region)) {
1117         out.append(SEP_CHAR, status)
1118             .append(region, status);
1119     }
1120     if (notEmpty(variant)) {
1121        out.append(SEP_CHAR, status)
1122            .append(variant, status);
1123     }
1124     return out;
1125 }
1126 
1127 void
parseLanguageReplacement(const char * replacement,const char * & replacedLanguage,const char * & replacedScript,const char * & replacedRegion,const char * & replacedVariant,const char * & replacedExtensions,UVector & toBeFreed,UErrorCode & status)1128 AliasReplacer::parseLanguageReplacement(
1129     const char* replacement,
1130     const char*& replacedLanguage,
1131     const char*& replacedScript,
1132     const char*& replacedRegion,
1133     const char*& replacedVariant,
1134     const char*& replacedExtensions,
1135     UVector& toBeFreed,
1136     UErrorCode& status)
1137 {
1138     if (U_FAILURE(status)) {
1139         return;
1140     }
1141     replacedScript = replacedRegion = replacedVariant
1142         = replacedExtensions = nullptr;
1143     if (uprv_strchr(replacement, '_') == nullptr) {
1144         replacedLanguage = replacement;
1145         // reach the end, just return it.
1146         return;
1147     }
1148     // We have multiple field so we have to allocate and parse
1149     CharString* str = new CharString(
1150         replacement, (int32_t)uprv_strlen(replacement), status);
1151     if (U_FAILURE(status)) {
1152         return;
1153     }
1154     if (str == nullptr) {
1155         status = U_MEMORY_ALLOCATION_ERROR;
1156         return;
1157     }
1158     toBeFreed.addElement(str, status);
1159     char* data = str->data();
1160     replacedLanguage = (const char*) data;
1161     char* endOfField = uprv_strchr(data, '_');
1162     *endOfField = '\0'; // null terminiate it.
1163     endOfField++;
1164     const char* start = endOfField;
1165     endOfField = (char*) uprv_strchr(start, '_');
1166     size_t len = 0;
1167     if (endOfField == nullptr) {
1168         len = uprv_strlen(start);
1169     } else {
1170         len = endOfField - start;
1171         *endOfField = '\0'; // null terminiate it.
1172     }
1173     if (len == 4 && uprv_isASCIILetter(*start)) {
1174         // Got a script
1175         replacedScript = start;
1176         if (endOfField == nullptr) {
1177             return;
1178         }
1179         start = endOfField++;
1180         endOfField = (char*)uprv_strchr(start, '_');
1181         if (endOfField == nullptr) {
1182             len = uprv_strlen(start);
1183         } else {
1184             len = endOfField - start;
1185             *endOfField = '\0'; // null terminiate it.
1186         }
1187     }
1188     if (len >= 2 && len <= 3) {
1189         // Got a region
1190         replacedRegion = start;
1191         if (endOfField == nullptr) {
1192             return;
1193         }
1194         start = endOfField++;
1195         endOfField = (char*)uprv_strchr(start, '_');
1196         if (endOfField == nullptr) {
1197             len = uprv_strlen(start);
1198         } else {
1199             len = endOfField - start;
1200             *endOfField = '\0'; // null terminiate it.
1201         }
1202     }
1203     if (len >= 4) {
1204         // Got a variant
1205         replacedVariant = start;
1206         if (endOfField == nullptr) {
1207             return;
1208         }
1209         start = endOfField++;
1210     }
1211     replacedExtensions = start;
1212 }
1213 
1214 bool
replaceLanguage(bool checkLanguage,bool checkRegion,bool checkVariants,UVector & toBeFreed,UErrorCode & status)1215 AliasReplacer::replaceLanguage(
1216         bool checkLanguage, bool checkRegion,
1217         bool checkVariants, UVector& toBeFreed, UErrorCode& status)
1218 {
1219     if (U_FAILURE(status)) {
1220         return false;
1221     }
1222     if (    (checkRegion && region == nullptr) ||
1223             (checkVariants && variants.size() == 0)) {
1224         // Nothing to search.
1225         return false;
1226     }
1227     int32_t variant_size = checkVariants ? variants.size() : 1;
1228     // Since we may have more than one variant, we need to loop through them.
1229     const char* searchLanguage = checkLanguage ? language : "und";
1230     const char* searchRegion = checkRegion ? region : nullptr;
1231     const char* searchVariant = nullptr;
1232     for (int32_t variant_index = 0;
1233             variant_index < variant_size;
1234             variant_index++) {
1235         if (checkVariants) {
1236             U_ASSERT(variant_index < variant_size);
1237             searchVariant = (const char*)(variants.elementAt(variant_index));
1238         }
1239 
1240         if (searchVariant != nullptr && uprv_strlen(searchVariant) < 4) {
1241             // Do not consider  ill-formed variant subtag.
1242             searchVariant = nullptr;
1243         }
1244         CharString typeKey;
1245         generateKey(searchLanguage, searchRegion, searchVariant, typeKey,
1246                     status);
1247         if (U_FAILURE(status)) {
1248             return false;
1249         }
1250         const char *replacement = data->languageMap().get(typeKey.data());
1251         if (replacement == nullptr) {
1252             // Found no replacement data.
1253             continue;
1254         }
1255 
1256         const char* replacedLanguage = nullptr;
1257         const char* replacedScript = nullptr;
1258         const char* replacedRegion = nullptr;
1259         const char* replacedVariant = nullptr;
1260         const char* replacedExtensions = nullptr;
1261         parseLanguageReplacement(replacement,
1262                                  replacedLanguage,
1263                                  replacedScript,
1264                                  replacedRegion,
1265                                  replacedVariant,
1266                                  replacedExtensions,
1267                                  toBeFreed,
1268                                  status);
1269         replacedLanguage =
1270             (replacedLanguage != nullptr && uprv_strcmp(replacedLanguage, "und") == 0) ?
1271             language : replacedLanguage;
1272         replacedScript = deleteOrReplace(script, nullptr, replacedScript);
1273         replacedRegion = deleteOrReplace(region, searchRegion, replacedRegion);
1274         replacedVariant = deleteOrReplace(
1275             searchVariant, searchVariant, replacedVariant);
1276 
1277         if (    same(language, replacedLanguage) &&
1278                 same(script, replacedScript) &&
1279                 same(region, replacedRegion) &&
1280                 same(searchVariant, replacedVariant) &&
1281                 replacedExtensions == nullptr) {
1282             // Replacement produce no changes.
1283             continue;
1284         }
1285 
1286         language = replacedLanguage;
1287         region = replacedRegion;
1288         script = replacedScript;
1289         if (searchVariant != nullptr) {
1290             if (notEmpty(replacedVariant)) {
1291                 variants.setElementAt((void*)replacedVariant, variant_index);
1292             } else {
1293                 variants.removeElementAt(variant_index);
1294             }
1295         }
1296         if (replacedExtensions != nullptr) {
1297             // TODO(ICU-21292)
1298             // DO NOTHING
1299             // UTS35 does not specifiy what should we do if we have extensions in the
1300             // replacement. Currently we know only the following 4 "BCP47 LegacyRules" have
1301             // extensions in them languageAlias:
1302             //  i_default => en_x_i_default
1303             //  i_enochian => und_x_i_enochian
1304             //  i_mingo => see_x_i_mingo
1305             //  zh_min => nan_x_zh_min
1306             // But all of them are already changed by code inside ultag_parse() before
1307             // hitting this code.
1308         }
1309 
1310         // Something changed by language alias data.
1311         return true;
1312     }
1313     // Nothing changed by language alias data.
1314     return false;
1315 }
1316 
1317 bool
replaceTerritory(UVector & toBeFreed,UErrorCode & status)1318 AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status)
1319 {
1320     if (U_FAILURE(status)) {
1321         return false;
1322     }
1323     if (region == nullptr) {
1324         // No region to search.
1325         return false;
1326     }
1327     const char *replacement = data->territoryMap().get(region);
1328     if (replacement == nullptr) {
1329         // Found no replacement data for this region.
1330         return false;
1331     }
1332     const char* replacedRegion = replacement;
1333     const char* firstSpace = uprv_strchr(replacement, ' ');
1334     if (firstSpace != nullptr) {
1335         // If there are are more than one region in the replacement.
1336         // We need to check which one match based on the language.
1337         // Cannot use nullptr for language because that will construct
1338         // the default locale, in that case, use "und" to get the correct
1339         // locale.
1340         Locale l = LocaleBuilder()
1341             .setLanguage(language == nullptr ? "und" : language)
1342             .setScript(script)
1343             .build(status);
1344         l.addLikelySubtags(status);
1345         const char* likelyRegion = l.getCountry();
1346         LocalPointer<CharString> item;
1347         if (likelyRegion != nullptr && uprv_strlen(likelyRegion) > 0) {
1348             size_t len = uprv_strlen(likelyRegion);
1349             const char* foundInReplacement = uprv_strstr(replacement,
1350                                                          likelyRegion);
1351             if (foundInReplacement != nullptr) {
1352                 // Assuming the case there are no three letter region code in
1353                 // the replacement of territoryAlias
1354                 U_ASSERT(foundInReplacement == replacement ||
1355                          *(foundInReplacement-1) == ' ');
1356                 U_ASSERT(foundInReplacement[len] == ' ' ||
1357                          foundInReplacement[len] == '\0');
1358                 item.adoptInsteadAndCheckErrorCode(
1359                     new CharString(foundInReplacement, (int32_t)len, status), status);
1360             }
1361         }
1362         if (item.isNull() && U_SUCCESS(status)) {
1363             item.adoptInsteadAndCheckErrorCode(
1364                 new CharString(replacement,
1365                                (int32_t)(firstSpace - replacement), status), status);
1366         }
1367         if (U_FAILURE(status)) { return false; }
1368         if (item.isNull()) {
1369             status = U_MEMORY_ALLOCATION_ERROR;
1370             return false;
1371         }
1372         replacedRegion = item->data();
1373         toBeFreed.addElement(item.orphan(), status);
1374     }
1375     U_ASSERT(!same(region, replacedRegion));
1376     region = replacedRegion;
1377     // The region is changed by data in territory alias.
1378     return true;
1379 }
1380 
1381 bool
replaceScript(UErrorCode & status)1382 AliasReplacer::replaceScript(UErrorCode& status)
1383 {
1384     if (U_FAILURE(status)) {
1385         return false;
1386     }
1387     if (script == nullptr) {
1388         // No script to search.
1389         return false;
1390     }
1391     const char *replacement = data->scriptMap().get(script);
1392     if (replacement == nullptr) {
1393         // Found no replacement data for this script.
1394         return false;
1395     }
1396     U_ASSERT(!same(script, replacement));
1397     script = replacement;
1398     // The script is changed by data in script alias.
1399     return true;
1400 }
1401 
1402 bool
replaceVariant(UErrorCode & status)1403 AliasReplacer::replaceVariant(UErrorCode& status)
1404 {
1405     if (U_FAILURE(status)) {
1406         return false;
1407     }
1408     // Since we may have more than one variant, we need to loop through them.
1409     for (int32_t i = 0; i < variants.size(); i++) {
1410         const char *variant = (const char*)(variants.elementAt(i));
1411         const char *replacement = data->variantMap().get(variant);
1412         if (replacement == nullptr) {
1413             // Found no replacement data for this variant.
1414             continue;
1415         }
1416         U_ASSERT((uprv_strlen(replacement) >= 5  &&
1417                   uprv_strlen(replacement) <= 8) ||
1418                  (uprv_strlen(replacement) == 4 &&
1419                   replacement[0] >= '0' &&
1420                   replacement[0] <= '9'));
1421         if (!same(variant, replacement)) {
1422             variants.setElementAt((void*)replacement, i);
1423             // Special hack to handle hepburn-heploc => alalc97
1424             if (uprv_strcmp(variant, "heploc") == 0) {
1425                 for (int32_t j = 0; j < variants.size(); j++) {
1426                      if (uprv_strcmp((const char*)(variants.elementAt(j)),
1427                                      "hepburn") == 0) {
1428                          variants.removeElementAt(j);
1429                      }
1430                 }
1431             }
1432             return true;
1433         }
1434     }
1435     return false;
1436 }
1437 
1438 CharString&
outputToString(CharString & out,UErrorCode status)1439 AliasReplacer::outputToString(
1440     CharString& out, UErrorCode status)
1441 {
1442     out.append(language, status);
1443     if (notEmpty(script)) {
1444         out.append(SEP_CHAR, status)
1445             .append(script, status);
1446     }
1447     if (notEmpty(region)) {
1448         out.append(SEP_CHAR, status)
1449             .append(region, status);
1450     }
1451     if (variants.size() > 0) {
1452         if (!notEmpty(script) && !notEmpty(region)) {
1453           out.append(SEP_CHAR, status);
1454         }
1455         variants.sort([](UElement e1, UElement e2) -> int8_t {
1456             // uprv_strcmp return int and in some platform, such as arm64-v8a,
1457             // it may return positive values > 127 which cause the casted value
1458             // of int8_t negative.
1459             int res = uprv_strcmp(
1460                 (const char*)e1.pointer, (const char*)e2.pointer);
1461             return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
1462         }, status);
1463         int32_t variantsStart = out.length();
1464         for (int32_t i = 0; i < variants.size(); i++) {
1465              out.append(SEP_CHAR, status)
1466                  .append((const char*)(variants.elementAt(i)),
1467                          status);
1468         }
1469         T_CString_toUpperCase(out.data() + variantsStart);
1470     }
1471     if (notEmpty(extensions)) {
1472         CharString tmp("und_", status);
1473         tmp.append(extensions, status);
1474         Locale tmpLocale(tmp.data());
1475         // only support x extension inside CLDR for now.
1476         U_ASSERT(extensions[0] == 'x');
1477         out.append(tmpLocale.getName() + 1, status);
1478     }
1479     return out;
1480 }
1481 
1482 bool
replace(const Locale & locale,CharString & out,UErrorCode & status)1483 AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status)
1484 {
1485     data = AliasData::singleton(status);
1486     if (U_FAILURE(status)) {
1487         return false;
1488     }
1489     U_ASSERT(data != nullptr);
1490     out.clear();
1491     language = locale.getLanguage();
1492     if (!notEmpty(language)) {
1493         language = nullptr;
1494     }
1495     script = locale.getScript();
1496     if (!notEmpty(script)) {
1497         script = nullptr;
1498     }
1499     region = locale.getCountry();
1500     if (!notEmpty(region)) {
1501         region = nullptr;
1502     }
1503     const char* variantsStr = locale.getVariant();
1504     const char* extensionsStr = locale_getKeywordsStart(locale.getName());
1505     CharString variantsBuff(variantsStr, -1, status);
1506     if (!variantsBuff.isEmpty()) {
1507         if (U_FAILURE(status)) { return false; }
1508         char* start = variantsBuff.data();
1509         T_CString_toLowerCase(start);
1510         char* end;
1511         while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
1512                U_SUCCESS(status)) {
1513             *end = NULL_CHAR;  // null terminate inside variantsBuff
1514             variants.addElement(start, status);
1515             start = end + 1;
1516         }
1517         variants.addElement(start, status);
1518     }
1519     if (U_FAILURE(status)) { return false; }
1520 
1521     // Sort the variants
1522     variants.sort([](UElement e1, UElement e2) -> int8_t {
1523         // uprv_strcmp return int and in some platform, such as arm64-v8a,
1524         // it may return positive values > 127 which cause the casted value
1525         // of int8_t negative.
1526         int res = uprv_strcmp(
1527             (const char*)e1.pointer, (const char*)e2.pointer);
1528         return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
1529     }, status);
1530 
1531     // A changed count to assert when loop too many times.
1532     int changed = 0;
1533     // A UVector to to hold CharString allocated by the replace* method
1534     // and freed when out of scope from his function.
1535     UVector stringsToBeFreed([](void *obj){ delete ((CharString*) obj); },
1536                              nullptr, 10, status);
1537     while (U_SUCCESS(status)) {
1538         // Something wrong with the data cause looping here more than 10 times
1539         // already.
1540         U_ASSERT(changed < 5);
1541         // From observation of key in data/misc/metadata.txt
1542         // we know currently we only need to search in the following combination
1543         // of fields for type in languageAlias:
1544         // * lang_region_variant
1545         // * lang_region
1546         // * lang_variant
1547         // * lang
1548         // * und_variant
1549         // This assumption is ensured by the U_ASSERT in readLanguageAlias
1550         //
1551         //                      lang  REGION variant
1552         if (    replaceLanguage(true, true,  true,  stringsToBeFreed, status) ||
1553                 replaceLanguage(true, true,  false, stringsToBeFreed, status) ||
1554                 replaceLanguage(true, false, true,  stringsToBeFreed, status) ||
1555                 replaceLanguage(true, false, false, stringsToBeFreed, status) ||
1556                 replaceLanguage(false,false, true,  stringsToBeFreed, status) ||
1557                 replaceTerritory(stringsToBeFreed, status) ||
1558                 replaceScript(status) ||
1559                 replaceVariant(status)) {
1560             // Some values in data is changed, try to match from the beginning
1561             // again.
1562             changed++;
1563             continue;
1564         }
1565         // Nothing changed. Break out.
1566         break;
1567     }  // while(1)
1568 
1569     if (U_FAILURE(status)) { return false; }
1570     // Nothing changed and we know the order of the vaiants are not change
1571     // because we have no variant or only one.
1572     if (changed == 0 && variants.size() <= 1) {
1573         return false;
1574     }
1575     outputToString(out, status);
1576     if (extensionsStr != nullptr) {
1577         out.append(extensionsStr, status);
1578     }
1579     if (U_FAILURE(status)) {
1580         return false;
1581     }
1582     // If the tag is not changed, return.
1583     if (uprv_strcmp(out.data(), locale.getName()) == 0) {
1584         U_ASSERT(changed == 0);
1585         U_ASSERT(variants.size() > 1);
1586         out.clear();
1587         return false;
1588     }
1589     return true;
1590 }
1591 
1592 // Return true if the locale is changed during canonicalization.
1593 // The replaced value then will be put into out.
1594 bool
canonicalizeLocale(const Locale & locale,CharString & out,UErrorCode & status)1595 canonicalizeLocale(const Locale& locale, CharString& out, UErrorCode& status)
1596 {
1597     AliasReplacer replacer(status);
1598     return replacer.replace(locale, out, status);
1599 }
1600 
1601 // Function to optimize for known cases without so we can skip the loading
1602 // of resources in the startup time until we really need it.
1603 bool
isKnownCanonicalizedLocale(const char * locale,UErrorCode & status)1604 isKnownCanonicalizedLocale(const char* locale, UErrorCode& status)
1605 {
1606     if (    uprv_strcmp(locale, "c") == 0 ||
1607             uprv_strcmp(locale, "en") == 0 ||
1608             uprv_strcmp(locale, "en_US") == 0) {
1609         return true;
1610     }
1611 
1612     // common well-known Canonicalized.
1613     umtx_initOnce(gKnownCanonicalizedInitOnce,
1614                   &loadKnownCanonicalized, status);
1615     if (U_FAILURE(status)) {
1616         return false;
1617     }
1618     U_ASSERT(gKnownCanonicalized != nullptr);
1619     return uhash_geti(gKnownCanonicalized, locale) != 0;
1620 }
1621 
1622 }  // namespace
1623 
1624 // Function for testing.
1625 U_CAPI const char* const*
ulocimp_getKnownCanonicalizedLocaleForTest(int32_t * length)1626 ulocimp_getKnownCanonicalizedLocaleForTest(int32_t* length)
1627 {
1628     *length = UPRV_LENGTHOF(KNOWN_CANONICALIZED);
1629     return KNOWN_CANONICALIZED;
1630 }
1631 
1632 // Function for testing.
1633 U_CAPI bool
ulocimp_isCanonicalizedLocaleForTest(const char * localeName)1634 ulocimp_isCanonicalizedLocaleForTest(const char* localeName)
1635 {
1636     Locale l(localeName);
1637     UErrorCode status = U_ZERO_ERROR;
1638     CharString temp;
1639     return !canonicalizeLocale(l, temp, status) && U_SUCCESS(status);
1640 }
1641 
1642 /*This function initializes a Locale from a C locale ID*/
init(const char * localeID,UBool canonicalize)1643 Locale& Locale::init(const char* localeID, UBool canonicalize)
1644 {
1645     fIsBogus = FALSE;
1646     /* Free our current storage */
1647     if (baseName != fullName) {
1648         uprv_free(baseName);
1649     }
1650     baseName = NULL;
1651     if(fullName != fullNameBuffer) {
1652         uprv_free(fullName);
1653         fullName = fullNameBuffer;
1654     }
1655 
1656     // not a loop:
1657     // just an easy way to have a common error-exit
1658     // without goto and without another function
1659     do {
1660         char *separator;
1661         char *field[5] = {0};
1662         int32_t fieldLen[5] = {0};
1663         int32_t fieldIdx;
1664         int32_t variantField;
1665         int32_t length;
1666         UErrorCode err;
1667 
1668         if(localeID == NULL) {
1669             // not an error, just set the default locale
1670             return *this = getDefault();
1671         }
1672 
1673         /* preset all fields to empty */
1674         language[0] = script[0] = country[0] = 0;
1675 
1676         // "canonicalize" the locale ID to ICU/Java format
1677         err = U_ZERO_ERROR;
1678         length = canonicalize ?
1679             uloc_canonicalize(localeID, fullName, sizeof(fullNameBuffer), &err) :
1680             uloc_getName(localeID, fullName, sizeof(fullNameBuffer), &err);
1681 
1682         if(err == U_BUFFER_OVERFLOW_ERROR || length >= (int32_t)sizeof(fullNameBuffer)) {
1683             /*Go to heap for the fullName if necessary*/
1684             fullName = (char *)uprv_malloc(sizeof(char)*(length + 1));
1685             if(fullName == 0) {
1686                 fullName = fullNameBuffer;
1687                 break; // error: out of memory
1688             }
1689             err = U_ZERO_ERROR;
1690             length = canonicalize ?
1691                 uloc_canonicalize(localeID, fullName, length+1, &err) :
1692                 uloc_getName(localeID, fullName, length+1, &err);
1693         }
1694         if(U_FAILURE(err) || err == U_STRING_NOT_TERMINATED_WARNING) {
1695             /* should never occur */
1696             break;
1697         }
1698 
1699         variantBegin = length;
1700 
1701         /* after uloc_getName/canonicalize() we know that only '_' are separators */
1702         /* But _ could also appeared in timezone such as "en@timezone=America/Los_Angeles" */
1703         separator = field[0] = fullName;
1704         fieldIdx = 1;
1705         char* at = uprv_strchr(fullName, '@');
1706         while ((separator = uprv_strchr(field[fieldIdx-1], SEP_CHAR)) != 0 &&
1707                fieldIdx < UPRV_LENGTHOF(field)-1 &&
1708                (at == nullptr || separator < at)) {
1709             field[fieldIdx] = separator + 1;
1710             fieldLen[fieldIdx-1] = (int32_t)(separator - field[fieldIdx-1]);
1711             fieldIdx++;
1712         }
1713         // variant may contain @foo or .foo POSIX cruft; remove it
1714         separator = uprv_strchr(field[fieldIdx-1], '@');
1715         char* sep2 = uprv_strchr(field[fieldIdx-1], '.');
1716         if (separator!=NULL || sep2!=NULL) {
1717             if (separator==NULL || (sep2!=NULL && separator > sep2)) {
1718                 separator = sep2;
1719             }
1720             fieldLen[fieldIdx-1] = (int32_t)(separator - field[fieldIdx-1]);
1721         } else {
1722             fieldLen[fieldIdx-1] = length - (int32_t)(field[fieldIdx-1] - fullName);
1723         }
1724 
1725         if (fieldLen[0] >= (int32_t)(sizeof(language)))
1726         {
1727             break; // error: the language field is too long
1728         }
1729 
1730         variantField = 1; /* Usually the 2nd one, except when a script or country is also used. */
1731         if (fieldLen[0] > 0) {
1732             /* We have a language */
1733             uprv_memcpy(language, fullName, fieldLen[0]);
1734             language[fieldLen[0]] = 0;
1735         }
1736         if (fieldLen[1] == 4 && uprv_isASCIILetter(field[1][0]) &&
1737                 uprv_isASCIILetter(field[1][1]) && uprv_isASCIILetter(field[1][2]) &&
1738                 uprv_isASCIILetter(field[1][3])) {
1739             /* We have at least a script */
1740             uprv_memcpy(script, field[1], fieldLen[1]);
1741             script[fieldLen[1]] = 0;
1742             variantField++;
1743         }
1744 
1745         if (fieldLen[variantField] == 2 || fieldLen[variantField] == 3) {
1746             /* We have a country */
1747             uprv_memcpy(country, field[variantField], fieldLen[variantField]);
1748             country[fieldLen[variantField]] = 0;
1749             variantField++;
1750         } else if (fieldLen[variantField] == 0) {
1751             variantField++; /* script or country empty but variant in next field (i.e. en__POSIX) */
1752         }
1753 
1754         if (fieldLen[variantField] > 0) {
1755             /* We have a variant */
1756             variantBegin = (int32_t)(field[variantField] - fullName);
1757         }
1758 
1759         err = U_ZERO_ERROR;
1760         initBaseName(err);
1761         if (U_FAILURE(err)) {
1762             break;
1763         }
1764 
1765         if (canonicalize) {
1766             if (!isKnownCanonicalizedLocale(fullName, err)) {
1767                 CharString replaced;
1768                 // Not sure it is already canonicalized
1769                 if (canonicalizeLocale(*this, replaced, err)) {
1770                     U_ASSERT(U_SUCCESS(err));
1771                     // If need replacement, call init again.
1772                     init(replaced.data(), false);
1773                 }
1774                 if (U_FAILURE(err)) {
1775                     break;
1776                 }
1777             }
1778         }   // if (canonicalize) {
1779 
1780         // successful end of init()
1781         return *this;
1782     } while(0); /*loop doesn't iterate*/
1783 
1784     // when an error occurs, then set this object to "bogus" (there is no UErrorCode here)
1785     setToBogus();
1786 
1787     return *this;
1788 }
1789 
1790 /*
1791  * Set up the base name.
1792  * If there are no key words, it's exactly the full name.
1793  * If key words exist, it's the full name truncated at the '@' character.
1794  * Need to set up both at init() and after setting a keyword.
1795  */
1796 void
initBaseName(UErrorCode & status)1797 Locale::initBaseName(UErrorCode &status) {
1798     if (U_FAILURE(status)) {
1799         return;
1800     }
1801     U_ASSERT(baseName==NULL || baseName==fullName);
1802     const char *atPtr = uprv_strchr(fullName, '@');
1803     const char *eqPtr = uprv_strchr(fullName, '=');
1804     if (atPtr && eqPtr && atPtr < eqPtr) {
1805         // Key words exist.
1806         int32_t baseNameLength = (int32_t)(atPtr - fullName);
1807         baseName = (char *)uprv_malloc(baseNameLength + 1);
1808         if (baseName == NULL) {
1809             status = U_MEMORY_ALLOCATION_ERROR;
1810             return;
1811         }
1812         uprv_strncpy(baseName, fullName, baseNameLength);
1813         baseName[baseNameLength] = 0;
1814 
1815         // The original computation of variantBegin leaves it equal to the length
1816         // of fullName if there is no variant.  It should instead be
1817         // the length of the baseName.
1818         if (variantBegin > baseNameLength) {
1819             variantBegin = baseNameLength;
1820         }
1821     } else {
1822         baseName = fullName;
1823     }
1824 }
1825 
1826 
1827 int32_t
hashCode() const1828 Locale::hashCode() const
1829 {
1830     return ustr_hashCharsN(fullName, static_cast<int32_t>(uprv_strlen(fullName)));
1831 }
1832 
1833 void
setToBogus()1834 Locale::setToBogus() {
1835     /* Free our current storage */
1836     if(baseName != fullName) {
1837         uprv_free(baseName);
1838     }
1839     baseName = NULL;
1840     if(fullName != fullNameBuffer) {
1841         uprv_free(fullName);
1842         fullName = fullNameBuffer;
1843     }
1844     *fullNameBuffer = 0;
1845     *language = 0;
1846     *script = 0;
1847     *country = 0;
1848     fIsBogus = TRUE;
1849     variantBegin = 0;
1850 }
1851 
1852 const Locale& U_EXPORT2
getDefault()1853 Locale::getDefault()
1854 {
1855     {
1856         Mutex lock(&gDefaultLocaleMutex);
1857         if (gDefaultLocale != NULL) {
1858             return *gDefaultLocale;
1859         }
1860     }
1861     UErrorCode status = U_ZERO_ERROR;
1862     return *locale_set_default_internal(NULL, status);
1863 }
1864 
1865 
1866 
1867 void U_EXPORT2
setDefault(const Locale & newLocale,UErrorCode & status)1868 Locale::setDefault( const   Locale&     newLocale,
1869                             UErrorCode&  status)
1870 {
1871     if (U_FAILURE(status)) {
1872         return;
1873     }
1874 
1875     /* Set the default from the full name string of the supplied locale.
1876      * This is a convenient way to access the default locale caching mechanisms.
1877      */
1878     const char *localeID = newLocale.getName();
1879     locale_set_default_internal(localeID, status);
1880 }
1881 
1882 void
addLikelySubtags(UErrorCode & status)1883 Locale::addLikelySubtags(UErrorCode& status) {
1884     if (U_FAILURE(status)) {
1885         return;
1886     }
1887 
1888     CharString maximizedLocaleID;
1889     {
1890         CharStringByteSink sink(&maximizedLocaleID);
1891         ulocimp_addLikelySubtags(fullName, sink, &status);
1892     }
1893 
1894     if (U_FAILURE(status)) {
1895         return;
1896     }
1897 
1898     init(maximizedLocaleID.data(), /*canonicalize=*/FALSE);
1899     if (isBogus()) {
1900         status = U_ILLEGAL_ARGUMENT_ERROR;
1901     }
1902 }
1903 
1904 void
minimizeSubtags(UErrorCode & status)1905 Locale::minimizeSubtags(UErrorCode& status) {
1906     if (U_FAILURE(status)) {
1907         return;
1908     }
1909 
1910     CharString minimizedLocaleID;
1911     {
1912         CharStringByteSink sink(&minimizedLocaleID);
1913         ulocimp_minimizeSubtags(fullName, sink, &status);
1914     }
1915 
1916     if (U_FAILURE(status)) {
1917         return;
1918     }
1919 
1920     init(minimizedLocaleID.data(), /*canonicalize=*/FALSE);
1921     if (isBogus()) {
1922         status = U_ILLEGAL_ARGUMENT_ERROR;
1923     }
1924 }
1925 
1926 void
canonicalize(UErrorCode & status)1927 Locale::canonicalize(UErrorCode& status) {
1928     if (U_FAILURE(status)) {
1929         return;
1930     }
1931     if (isBogus()) {
1932         status = U_ILLEGAL_ARGUMENT_ERROR;
1933         return;
1934     }
1935     CharString uncanonicalized(fullName, status);
1936     if (U_FAILURE(status)) {
1937         return;
1938     }
1939     init(uncanonicalized.data(), /*canonicalize=*/TRUE);
1940     if (isBogus()) {
1941         status = U_ILLEGAL_ARGUMENT_ERROR;
1942     }
1943 }
1944 
1945 Locale U_EXPORT2
forLanguageTag(StringPiece tag,UErrorCode & status)1946 Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
1947 {
1948     Locale result(Locale::eBOGUS);
1949 
1950     if (U_FAILURE(status)) {
1951         return result;
1952     }
1953 
1954     // If a BCP 47 language tag is passed as the language parameter to the
1955     // normal Locale constructor, it will actually fall back to invoking
1956     // uloc_forLanguageTag() to parse it if it somehow is able to detect that
1957     // the string actually is BCP 47. This works well for things like strings
1958     // using BCP 47 extensions, but it does not at all work for things like
1959     // legacy language tags (marked as “Type: grandfathered” in BCP 47,
1960     // e.g., "en-GB-oed") which are possible to also
1961     // interpret as ICU locale IDs and because of that won't trigger the BCP 47
1962     // parsing. Therefore the code here explicitly calls uloc_forLanguageTag()
1963     // and then Locale::init(), instead of just calling the normal constructor.
1964 
1965     CharString localeID;
1966     int32_t parsedLength;
1967     {
1968         CharStringByteSink sink(&localeID);
1969         ulocimp_forLanguageTag(
1970                 tag.data(),
1971                 tag.length(),
1972                 sink,
1973                 &parsedLength,
1974                 &status);
1975     }
1976 
1977     if (U_FAILURE(status)) {
1978         return result;
1979     }
1980 
1981     if (parsedLength != tag.size()) {
1982         status = U_ILLEGAL_ARGUMENT_ERROR;
1983         return result;
1984     }
1985 
1986     result.init(localeID.data(), /*canonicalize=*/FALSE);
1987     if (result.isBogus()) {
1988         status = U_ILLEGAL_ARGUMENT_ERROR;
1989     }
1990     return result;
1991 }
1992 
1993 void
toLanguageTag(ByteSink & sink,UErrorCode & status) const1994 Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const
1995 {
1996     if (U_FAILURE(status)) {
1997         return;
1998     }
1999 
2000     if (fIsBogus) {
2001         status = U_ILLEGAL_ARGUMENT_ERROR;
2002         return;
2003     }
2004 
2005     ulocimp_toLanguageTag(fullName, sink, /*strict=*/FALSE, &status);
2006 }
2007 
2008 Locale U_EXPORT2
createFromName(const char * name)2009 Locale::createFromName (const char *name)
2010 {
2011     if (name) {
2012         Locale l("");
2013         l.init(name, FALSE);
2014         return l;
2015     }
2016     else {
2017         return getDefault();
2018     }
2019 }
2020 
2021 Locale U_EXPORT2
createCanonical(const char * name)2022 Locale::createCanonical(const char* name) {
2023     Locale loc("");
2024     loc.init(name, TRUE);
2025     return loc;
2026 }
2027 
2028 const char *
getISO3Language() const2029 Locale::getISO3Language() const
2030 {
2031     return uloc_getISO3Language(fullName);
2032 }
2033 
2034 
2035 const char *
getISO3Country() const2036 Locale::getISO3Country() const
2037 {
2038     return uloc_getISO3Country(fullName);
2039 }
2040 
2041 /**
2042  * Return the LCID value as specified in the "LocaleID" resource for this
2043  * locale.  The LocaleID must be expressed as a hexadecimal number, from
2044  * one to four digits.  If the LocaleID resource is not present, or is
2045  * in an incorrect format, 0 is returned.  The LocaleID is for use in
2046  * Windows (it is an LCID), but is available on all platforms.
2047  */
2048 uint32_t
getLCID() const2049 Locale::getLCID() const
2050 {
2051     return uloc_getLCID(fullName);
2052 }
2053 
getISOCountries()2054 const char* const* U_EXPORT2 Locale::getISOCountries()
2055 {
2056     return uloc_getISOCountries();
2057 }
2058 
getISOLanguages()2059 const char* const* U_EXPORT2 Locale::getISOLanguages()
2060 {
2061     return uloc_getISOLanguages();
2062 }
2063 
2064 // Set the locale's data based on a posix id.
setFromPOSIXID(const char * posixID)2065 void Locale::setFromPOSIXID(const char *posixID)
2066 {
2067     init(posixID, TRUE);
2068 }
2069 
2070 const Locale & U_EXPORT2
getRoot(void)2071 Locale::getRoot(void)
2072 {
2073     return getLocale(eROOT);
2074 }
2075 
2076 const Locale & U_EXPORT2
getEnglish(void)2077 Locale::getEnglish(void)
2078 {
2079     return getLocale(eENGLISH);
2080 }
2081 
2082 const Locale & U_EXPORT2
getFrench(void)2083 Locale::getFrench(void)
2084 {
2085     return getLocale(eFRENCH);
2086 }
2087 
2088 const Locale & U_EXPORT2
getGerman(void)2089 Locale::getGerman(void)
2090 {
2091     return getLocale(eGERMAN);
2092 }
2093 
2094 const Locale & U_EXPORT2
getItalian(void)2095 Locale::getItalian(void)
2096 {
2097     return getLocale(eITALIAN);
2098 }
2099 
2100 const Locale & U_EXPORT2
getJapanese(void)2101 Locale::getJapanese(void)
2102 {
2103     return getLocale(eJAPANESE);
2104 }
2105 
2106 const Locale & U_EXPORT2
getKorean(void)2107 Locale::getKorean(void)
2108 {
2109     return getLocale(eKOREAN);
2110 }
2111 
2112 const Locale & U_EXPORT2
getChinese(void)2113 Locale::getChinese(void)
2114 {
2115     return getLocale(eCHINESE);
2116 }
2117 
2118 const Locale & U_EXPORT2
getSimplifiedChinese(void)2119 Locale::getSimplifiedChinese(void)
2120 {
2121     return getLocale(eCHINA);
2122 }
2123 
2124 const Locale & U_EXPORT2
getTraditionalChinese(void)2125 Locale::getTraditionalChinese(void)
2126 {
2127     return getLocale(eTAIWAN);
2128 }
2129 
2130 
2131 const Locale & U_EXPORT2
getFrance(void)2132 Locale::getFrance(void)
2133 {
2134     return getLocale(eFRANCE);
2135 }
2136 
2137 const Locale & U_EXPORT2
getGermany(void)2138 Locale::getGermany(void)
2139 {
2140     return getLocale(eGERMANY);
2141 }
2142 
2143 const Locale & U_EXPORT2
getItaly(void)2144 Locale::getItaly(void)
2145 {
2146     return getLocale(eITALY);
2147 }
2148 
2149 const Locale & U_EXPORT2
getJapan(void)2150 Locale::getJapan(void)
2151 {
2152     return getLocale(eJAPAN);
2153 }
2154 
2155 const Locale & U_EXPORT2
getKorea(void)2156 Locale::getKorea(void)
2157 {
2158     return getLocale(eKOREA);
2159 }
2160 
2161 const Locale & U_EXPORT2
getChina(void)2162 Locale::getChina(void)
2163 {
2164     return getLocale(eCHINA);
2165 }
2166 
2167 const Locale & U_EXPORT2
getPRC(void)2168 Locale::getPRC(void)
2169 {
2170     return getLocale(eCHINA);
2171 }
2172 
2173 const Locale & U_EXPORT2
getTaiwan(void)2174 Locale::getTaiwan(void)
2175 {
2176     return getLocale(eTAIWAN);
2177 }
2178 
2179 const Locale & U_EXPORT2
getUK(void)2180 Locale::getUK(void)
2181 {
2182     return getLocale(eUK);
2183 }
2184 
2185 const Locale & U_EXPORT2
getUS(void)2186 Locale::getUS(void)
2187 {
2188     return getLocale(eUS);
2189 }
2190 
2191 const Locale & U_EXPORT2
getCanada(void)2192 Locale::getCanada(void)
2193 {
2194     return getLocale(eCANADA);
2195 }
2196 
2197 const Locale & U_EXPORT2
getCanadaFrench(void)2198 Locale::getCanadaFrench(void)
2199 {
2200     return getLocale(eCANADA_FRENCH);
2201 }
2202 
2203 const Locale &
getLocale(int locid)2204 Locale::getLocale(int locid)
2205 {
2206     Locale *localeCache = getLocaleCache();
2207     U_ASSERT((locid < eMAX_LOCALES)&&(locid>=0));
2208     if (localeCache == NULL) {
2209         // Failure allocating the locale cache.
2210         //   The best we can do is return a NULL reference.
2211         locid = 0;
2212     }
2213     return localeCache[locid]; /*operating on NULL*/
2214 }
2215 
2216 /*
2217 This function is defined this way in order to get around static
2218 initialization and static destruction.
2219  */
2220 Locale *
getLocaleCache(void)2221 Locale::getLocaleCache(void)
2222 {
2223     UErrorCode status = U_ZERO_ERROR;
2224     umtx_initOnce(gLocaleCacheInitOnce, locale_init, status);
2225     return gLocaleCache;
2226 }
2227 
2228 class KeywordEnumeration : public StringEnumeration {
2229 private:
2230     char *keywords;
2231     char *current;
2232     int32_t length;
2233     UnicodeString currUSKey;
2234     static const char fgClassID;/* Warning this is used beyond the typical RTTI usage. */
2235 
2236 public:
getStaticClassID(void)2237     static UClassID U_EXPORT2 getStaticClassID(void) { return (UClassID)&fgClassID; }
getDynamicClassID(void) const2238     virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }
2239 public:
KeywordEnumeration(const char * keys,int32_t keywordLen,int32_t currentIndex,UErrorCode & status)2240     KeywordEnumeration(const char *keys, int32_t keywordLen, int32_t currentIndex, UErrorCode &status)
2241         : keywords((char *)&fgClassID), current((char *)&fgClassID), length(0) {
2242         if(U_SUCCESS(status) && keywordLen != 0) {
2243             if(keys == NULL || keywordLen < 0) {
2244                 status = U_ILLEGAL_ARGUMENT_ERROR;
2245             } else {
2246                 keywords = (char *)uprv_malloc(keywordLen+1);
2247                 if (keywords == NULL) {
2248                     status = U_MEMORY_ALLOCATION_ERROR;
2249                 }
2250                 else {
2251                     uprv_memcpy(keywords, keys, keywordLen);
2252                     keywords[keywordLen] = 0;
2253                     current = keywords + currentIndex;
2254                     length = keywordLen;
2255                 }
2256             }
2257         }
2258     }
2259 
2260     virtual ~KeywordEnumeration();
2261 
clone() const2262     virtual StringEnumeration * clone() const
2263     {
2264         UErrorCode status = U_ZERO_ERROR;
2265         return new KeywordEnumeration(keywords, length, (int32_t)(current - keywords), status);
2266     }
2267 
count(UErrorCode &) const2268     virtual int32_t count(UErrorCode &/*status*/) const {
2269         char *kw = keywords;
2270         int32_t result = 0;
2271         while(*kw) {
2272             result++;
2273             kw += uprv_strlen(kw)+1;
2274         }
2275         return result;
2276     }
2277 
next(int32_t * resultLength,UErrorCode & status)2278     virtual const char* next(int32_t* resultLength, UErrorCode& status) {
2279         const char* result;
2280         int32_t len;
2281         if(U_SUCCESS(status) && *current != 0) {
2282             result = current;
2283             len = (int32_t)uprv_strlen(current);
2284             current += len+1;
2285             if(resultLength != NULL) {
2286                 *resultLength = len;
2287             }
2288         } else {
2289             if(resultLength != NULL) {
2290                 *resultLength = 0;
2291             }
2292             result = NULL;
2293         }
2294         return result;
2295     }
2296 
snext(UErrorCode & status)2297     virtual const UnicodeString* snext(UErrorCode& status) {
2298         int32_t resultLength = 0;
2299         const char *s = next(&resultLength, status);
2300         return setChars(s, resultLength, status);
2301     }
2302 
reset(UErrorCode &)2303     virtual void reset(UErrorCode& /*status*/) {
2304         current = keywords;
2305     }
2306 };
2307 
2308 const char KeywordEnumeration::fgClassID = '\0';
2309 
~KeywordEnumeration()2310 KeywordEnumeration::~KeywordEnumeration() {
2311     uprv_free(keywords);
2312 }
2313 
2314 // A wrapper around KeywordEnumeration that calls uloc_toUnicodeLocaleKey() in
2315 // the next() method for each keyword before returning it.
2316 class UnicodeKeywordEnumeration : public KeywordEnumeration {
2317 public:
2318     using KeywordEnumeration::KeywordEnumeration;
2319     virtual ~UnicodeKeywordEnumeration();
2320 
next(int32_t * resultLength,UErrorCode & status)2321     virtual const char* next(int32_t* resultLength, UErrorCode& status) {
2322         const char* legacy_key = KeywordEnumeration::next(nullptr, status);
2323         if (U_SUCCESS(status) && legacy_key != nullptr) {
2324             const char* key = uloc_toUnicodeLocaleKey(legacy_key);
2325             if (key == nullptr) {
2326                 status = U_ILLEGAL_ARGUMENT_ERROR;
2327             } else {
2328                 if (resultLength != nullptr) {
2329                     *resultLength = static_cast<int32_t>(uprv_strlen(key));
2330                 }
2331                 return key;
2332             }
2333         }
2334         if (resultLength != nullptr) *resultLength = 0;
2335         return nullptr;
2336     }
2337 };
2338 
2339 // Out-of-line virtual destructor to serve as the "key function".
2340 UnicodeKeywordEnumeration::~UnicodeKeywordEnumeration() = default;
2341 
2342 StringEnumeration *
createKeywords(UErrorCode & status) const2343 Locale::createKeywords(UErrorCode &status) const
2344 {
2345     StringEnumeration *result = NULL;
2346 
2347     if (U_FAILURE(status)) {
2348         return result;
2349     }
2350 
2351     const char* variantStart = uprv_strchr(fullName, '@');
2352     const char* assignment = uprv_strchr(fullName, '=');
2353     if(variantStart) {
2354         if(assignment > variantStart) {
2355             CharString keywords;
2356             CharStringByteSink sink(&keywords);
2357             ulocimp_getKeywords(variantStart+1, '@', sink, FALSE, &status);
2358             if (U_SUCCESS(status) && !keywords.isEmpty()) {
2359                 result = new KeywordEnumeration(keywords.data(), keywords.length(), 0, status);
2360                 if (!result) {
2361                     status = U_MEMORY_ALLOCATION_ERROR;
2362                 }
2363             }
2364         } else {
2365             status = U_INVALID_FORMAT_ERROR;
2366         }
2367     }
2368     return result;
2369 }
2370 
2371 StringEnumeration *
createUnicodeKeywords(UErrorCode & status) const2372 Locale::createUnicodeKeywords(UErrorCode &status) const
2373 {
2374     StringEnumeration *result = NULL;
2375 
2376     if (U_FAILURE(status)) {
2377         return result;
2378     }
2379 
2380     const char* variantStart = uprv_strchr(fullName, '@');
2381     const char* assignment = uprv_strchr(fullName, '=');
2382     if(variantStart) {
2383         if(assignment > variantStart) {
2384             CharString keywords;
2385             CharStringByteSink sink(&keywords);
2386             ulocimp_getKeywords(variantStart+1, '@', sink, FALSE, &status);
2387             if (U_SUCCESS(status) && !keywords.isEmpty()) {
2388                 result = new UnicodeKeywordEnumeration(keywords.data(), keywords.length(), 0, status);
2389                 if (!result) {
2390                     status = U_MEMORY_ALLOCATION_ERROR;
2391                 }
2392             }
2393         } else {
2394             status = U_INVALID_FORMAT_ERROR;
2395         }
2396     }
2397     return result;
2398 }
2399 
2400 int32_t
getKeywordValue(const char * keywordName,char * buffer,int32_t bufLen,UErrorCode & status) const2401 Locale::getKeywordValue(const char* keywordName, char *buffer, int32_t bufLen, UErrorCode &status) const
2402 {
2403     return uloc_getKeywordValue(fullName, keywordName, buffer, bufLen, &status);
2404 }
2405 
2406 void
getKeywordValue(StringPiece keywordName,ByteSink & sink,UErrorCode & status) const2407 Locale::getKeywordValue(StringPiece keywordName, ByteSink& sink, UErrorCode& status) const {
2408     if (U_FAILURE(status)) {
2409         return;
2410     }
2411 
2412     if (fIsBogus) {
2413         status = U_ILLEGAL_ARGUMENT_ERROR;
2414         return;
2415     }
2416 
2417     // TODO: Remove the need for a const char* to a NUL terminated buffer.
2418     const CharString keywordName_nul(keywordName, status);
2419     if (U_FAILURE(status)) {
2420         return;
2421     }
2422 
2423     ulocimp_getKeywordValue(fullName, keywordName_nul.data(), sink, &status);
2424 }
2425 
2426 void
getUnicodeKeywordValue(StringPiece keywordName,ByteSink & sink,UErrorCode & status) const2427 Locale::getUnicodeKeywordValue(StringPiece keywordName,
2428                                ByteSink& sink,
2429                                UErrorCode& status) const {
2430     // TODO: Remove the need for a const char* to a NUL terminated buffer.
2431     const CharString keywordName_nul(keywordName, status);
2432     if (U_FAILURE(status)) {
2433         return;
2434     }
2435 
2436     const char* legacy_key = uloc_toLegacyKey(keywordName_nul.data());
2437 
2438     if (legacy_key == nullptr) {
2439         status = U_ILLEGAL_ARGUMENT_ERROR;
2440         return;
2441     }
2442 
2443     CharString legacy_value;
2444     {
2445         CharStringByteSink sink(&legacy_value);
2446         getKeywordValue(legacy_key, sink, status);
2447     }
2448 
2449     if (U_FAILURE(status)) {
2450         return;
2451     }
2452 
2453     const char* unicode_value = uloc_toUnicodeLocaleType(
2454             keywordName_nul.data(), legacy_value.data());
2455 
2456     if (unicode_value == nullptr) {
2457         status = U_ILLEGAL_ARGUMENT_ERROR;
2458         return;
2459     }
2460 
2461     sink.Append(unicode_value, static_cast<int32_t>(uprv_strlen(unicode_value)));
2462 }
2463 
2464 void
setKeywordValue(const char * keywordName,const char * keywordValue,UErrorCode & status)2465 Locale::setKeywordValue(const char* keywordName, const char* keywordValue, UErrorCode &status)
2466 {
2467     if (U_FAILURE(status)) {
2468         return;
2469     }
2470     if (status == U_STRING_NOT_TERMINATED_WARNING) {
2471         status = U_ZERO_ERROR;
2472     }
2473     int32_t bufferLength = uprv_max((int32_t)(uprv_strlen(fullName) + 1), ULOC_FULLNAME_CAPACITY);
2474     int32_t newLength = uloc_setKeywordValue(keywordName, keywordValue, fullName,
2475                                              bufferLength, &status) + 1;
2476     U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
2477     /* Handle the case the current buffer is not enough to hold the new id */
2478     if (status == U_BUFFER_OVERFLOW_ERROR) {
2479         U_ASSERT(newLength > bufferLength);
2480         char* newFullName = (char *)uprv_malloc(newLength);
2481         if (newFullName == nullptr) {
2482             status = U_MEMORY_ALLOCATION_ERROR;
2483             return;
2484         }
2485         uprv_strcpy(newFullName, fullName);
2486         if (fullName != fullNameBuffer) {
2487             // if full Name is already on the heap, need to free it.
2488             uprv_free(fullName);
2489         }
2490         fullName = newFullName;
2491         status = U_ZERO_ERROR;
2492         uloc_setKeywordValue(keywordName, keywordValue, fullName, newLength, &status);
2493         U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
2494     } else {
2495         U_ASSERT(newLength <= bufferLength);
2496     }
2497     if (U_SUCCESS(status) && baseName == fullName) {
2498         // May have added the first keyword, meaning that the fullName is no longer also the baseName.
2499         initBaseName(status);
2500     }
2501 }
2502 
2503 void
setKeywordValue(StringPiece keywordName,StringPiece keywordValue,UErrorCode & status)2504 Locale::setKeywordValue(StringPiece keywordName,
2505                         StringPiece keywordValue,
2506                         UErrorCode& status) {
2507     // TODO: Remove the need for a const char* to a NUL terminated buffer.
2508     const CharString keywordName_nul(keywordName, status);
2509     const CharString keywordValue_nul(keywordValue, status);
2510     setKeywordValue(keywordName_nul.data(), keywordValue_nul.data(), status);
2511 }
2512 
2513 void
setUnicodeKeywordValue(StringPiece keywordName,StringPiece keywordValue,UErrorCode & status)2514 Locale::setUnicodeKeywordValue(StringPiece keywordName,
2515                                StringPiece keywordValue,
2516                                UErrorCode& status) {
2517     // TODO: Remove the need for a const char* to a NUL terminated buffer.
2518     const CharString keywordName_nul(keywordName, status);
2519     const CharString keywordValue_nul(keywordValue, status);
2520 
2521     if (U_FAILURE(status)) {
2522         return;
2523     }
2524 
2525     const char* legacy_key = uloc_toLegacyKey(keywordName_nul.data());
2526 
2527     if (legacy_key == nullptr) {
2528         status = U_ILLEGAL_ARGUMENT_ERROR;
2529         return;
2530     }
2531 
2532     const char* legacy_value = nullptr;
2533 
2534     if (!keywordValue_nul.isEmpty()) {
2535         legacy_value =
2536             uloc_toLegacyType(keywordName_nul.data(), keywordValue_nul.data());
2537 
2538         if (legacy_value == nullptr) {
2539             status = U_ILLEGAL_ARGUMENT_ERROR;
2540             return;
2541         }
2542     }
2543 
2544     setKeywordValue(legacy_key, legacy_value, status);
2545 }
2546 
2547 const char *
getBaseName() const2548 Locale::getBaseName() const {
2549     return baseName;
2550 }
2551 
2552 Locale::Iterator::~Iterator() = default;
2553 
2554 //eof
2555 U_NAMESPACE_END
2556