1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include "unicode/bytestream.h"
34 #include "unicode/errorcode.h"
35 #include "unicode/stringpiece.h"
36 #include "unicode/utypes.h"
37 #include "unicode/ustring.h"
38 #include "unicode/uloc.h"
39 
40 #include "bytesinkutil.h"
41 #include "putilimp.h"
42 #include "ustr_imp.h"
43 #include "ulocimp.h"
44 #include "umutex.h"
45 #include "cstring.h"
46 #include "cmemory.h"
47 #include "locmap.h"
48 #include "uarrsort.h"
49 #include "uenumimp.h"
50 #include "uassert.h"
51 #include "charstr.h"
52 
53 U_NAMESPACE_USE
54 
55 /* ### Declarations **************************************************/
56 
57 /* Locale stuff from locid.cpp */
58 U_CFUNC void locale_set_default(const char *id);
59 U_CFUNC const char *locale_get_default(void);
60 
61 /* ### Data tables **************************************************/
62 
63 /**
64  * Table of language codes, both 2- and 3-letter, with preference
65  * given to 2-letter codes where possible.  Includes 3-letter codes
66  * that lack a 2-letter equivalent.
67  *
68  * This list must be in sorted order.  This list is returned directly
69  * to the user by some API.
70  *
71  * This list must be kept in sync with LANGUAGES_3, with corresponding
72  * entries matched.
73  *
74  * This table should be terminated with a NULL entry, followed by a
75  * second list, and another NULL entry.  The first list is visible to
76  * user code when this array is returned by API.  The second list
77  * contains codes we support, but do not expose through user API.
78  *
79  * Notes
80  *
81  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82  * include the revisions up to 2001/7/27 *CWB*
83  *
 * The 3 character codes are the terminology codes like RFC 3066.  This
 * is compatible with prior ICU codes.
 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
 * table, now at the end of the table, because their 3 character codes
 * are duplicates.  This avoids bad searches going from 3 to 2 character
 * codes.
91  *
92  * The range qaa-qtz is reserved for local use
93  */
94 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
95 /* ISO639 table version is 20150505 */
96 /* Subsequent hand addition of selected languages */
static const char * const LANGUAGES[] = {
    /* ---- first list: sorted, exposed through user API ---- */
    /* a */
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    /* b */
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    /* c */
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    /* d */
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    /* e */
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    /* f */
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    /* g */
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    /* h */
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    /* i */
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    /* j */
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    /* k */
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    /* l */
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    /* m */
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni", "mo",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    /* n */
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    /* o */
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    /* p */
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    /* q */
    "qu",  "quc", "qug",
    /* r */
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    /* s */
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    /* t */
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    /* u */
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    /* v */
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    /* w */
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    /* x */
    "xal", "xh",  "xmf", "xog",
    /* y */
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    /* z */
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
    NULL,   /* end of first list */

    /* ---- second list: obsolete language codes ---- */
    "in",  "iw",  "ji",  "jw",  "sh",
    NULL    /* end of second list */
};
187 
/* Deprecated language codes; entry i corresponds to REPLACEMENT_LANGUAGES[i]. */
static const char* const DEPRECATED_LANGUAGES[]={
    "in",
    "iw",
    "ji",
    "jw",
    NULL,
    NULL
};
/* Current codes for the deprecated ones; entry i replaces DEPRECATED_LANGUAGES[i]. */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id",   /* in */
    "he",   /* iw */
    "yi",   /* ji */
    "jv",   /* jw */
    NULL,
    NULL
};
194 
195 /**
196  * Table of 3-letter language codes.
197  *
198  * This is a lookup table used to convert 3-letter language codes to
199  * their 2-letter equivalent, where possible.  It must be kept in sync
200  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
201  * same language as LANGUAGES_3[i].  The commented-out lines are
202  * copied from LANGUAGES to make eyeballing this baby easier.
203  *
204  * Where a 3-letter language code has no 2-letter equivalent, the
205  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
206  *
207  * This table should be terminated with a NULL entry, followed by a
208  * second list, and another NULL entry.  The two lists correspond to
209  * the two lists in LANGUAGES.
210  */
211 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
212 /* ISO639 table version is 20150505 */
213 /* Subsequent hand addition of selected languages */
static const char * const LANGUAGES_3[] = {
    /* ---- first list: entry i is the 3-letter code for LANGUAGES[i] ---- */
    /* a */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    /* b */
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    /* c */
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    /* d */
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    /* e */
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    /* f */
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    /* g */
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    /* h */
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    /* i */
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    /* j */
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    /* k */
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    /* l */
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    /* m */
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "mol",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    /* n */
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    /* o */
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    /* p */
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    /* q */
    "que", "quc", "qug",
    /* r */
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    /* s */
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    /* t */
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    /* u */
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    /* v */
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    /* w */
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    /* x */
    "xal", "xho", "xmf", "xog",
    /* y */
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    /* z */
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
    NULL,   /* end of first list */

    /* ---- second list: 3-letter codes for the obsolete 2-letter codes ---- */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
    NULL    /* end of second list */
};
305 
306 /**
307  * Table of 2-letter country codes.
308  *
309  * This list must be in sorted order.  This list is returned directly
310  * to the user by some API.
311  *
312  * This list must be kept in sync with COUNTRIES_3, with corresponding
313  * entries matched.
314  *
315  * This table should be terminated with a NULL entry, followed by a
316  * second list, and another NULL entry.  The first list is visible to
317  * user code when this array is returned by API.  The second list
318  * contains codes we support, but do not expose through user API.
319  *
320  * Notes:
321  *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html.  New
 * codes were added while keeping the old ones for compatibility.
 * Updated to include the 1999/12/03 revisions. *CWB*
326  *
327  * RO(ROM) is now RO(ROU) according to
328  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329  */
static const char * const COUNTRIES[] = {
    /* ---- first list: sorted, exposed through user API ---- */
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
    NULL,   /* end of first list */

    /* ---- second list: obsolete country codes ---- */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",
    NULL    /* end of second list */
};
365 
/* Deprecated country codes; entry i corresponds to REPLACEMENT_COUNTRIES[i]. */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD",
    "DY", "FX", "HV", "NH",
    "RH", "SU", "TP", "UK",
    "VD", "YD", "YU", "ZR",
    NULL, NULL
};
/* Current codes for the deprecated ones; entry i replaces DEPRECATED_COUNTRIES[i]. */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", */
    "CW", "MM", "RS", "DE",
/*  "DY", "FX", "HV", "NH", */
    "BJ", "FR", "BF", "VU",
/*  "RH", "SU", "TP", "UK", */
    "ZW", "RU", "TL", "GB",
/*  "VD", "YD", "YU", "ZR"  */
    "VN", "YE", "RS", "CD",
    NULL, NULL
};
373 
374 /**
375  * Table of 3-letter country codes.
376  *
377  * This is a lookup table used to convert 3-letter country codes to
378  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
379  * For all valid i, COUNTRIES[i] must refer to the same country as
380  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
381  * to make eyeballing this baby easier.
382  *
383  * This table should be terminated with a NULL entry, followed by a
384  * second list, and another NULL entry.  The two lists correspond to
385  * the two lists in COUNTRIES.
386  */
static const char * const COUNTRIES_3[] = {
    /* ---- first list: entry i is the 3-letter code for COUNTRIES[i] ---- */
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    NULL,   /* end of first list */

    /* ---- second list: 3-letter codes for the obsolete 2-letter codes ---- */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    NULL    /* end of second list */
};
453 
/* One canonicalization rule: a locale ID as it may be supplied and the ID it maps to. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;

/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each row is { input ID, canonical ID }.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    /* Inputs written with a double underscore (empty country field). */
    { "art__LOJBAN", "jbo" }, /* registered name */
    { "hy__AREVELA", "hy"  }, /* Registered IANA variant */
    { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
    { "zh__GUOYU",   "zh"  }, /* registered name */
    { "zh__HAKKA",   "hak" }, /* registered name */
    { "zh__XIANG",   "hsn" }, /* registered name */

    /* subtags with 3 chars won't be treated as variants. */
    { "zh_GAN",      "gan" }, /* registered name */
    { "zh_MIN_NAN",  "nan" }, /* registered name */
    { "zh_WUU",      "wuu" }, /* registered name */
    { "zh_YUE",      "yue" }, /* registered name */
};
476 
477 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id looks like a BCP47 tag carrying an extension: the id
 * is non-NULL, contains no '@', and its shortest subtag (as measured by
 * getShortestSubtagLength) is exactly one character long.
 *
 * The macro operates on its own argument; it does not depend on a variable
 * named localeID being in scope at the call site.  The argument is evaluated
 * more than once, so it must not have side effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to a Unicode locale id by calling uloc_forLanguageTag
 * with the caller's buffer.
 *
 * - On success, finalID is set to buffer.
 * - If the conversion returns <= 0, leaves *err failing, or reports
 *   U_STRING_NOT_TERMINATED_WARNING, finalID is set to the original id (the
 *   id itself is left untouched).  In the not-terminated case *err is promoted
 *   to U_BUFFER_OVERFLOW_ERROR; any other code already in *err is left as is.
 *
 * finalID must be an assignable lvalue.  id, buffer and err are evaluated more
 * than once, so they must not have side effects.  Every parameter is
 * parenthesized so that expression arguments expand correctly.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || \
            U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
        (finalID) = (id); \
        if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { \
            *(err) = U_BUFFER_OVERFLOW_ERROR; \
        } \
    } else { \
        (finalID) = (buffer); \
    } \
} UPRV_BLOCK_MACRO_END
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are runs of characters delimited by '_' or '-'.  Only non-empty
 * subtags that are followed by a separator are measured; the final subtag
 * (the one that runs to the end of the string) is never taken into account.
 * When no subtag is followed by a separator, the full length of localeID is
 * returned (0 for the empty string).
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t shortest = -1;  /* -1 until a separator-terminated subtag has been seen */
    int32_t runLength = 0;  /* characters in the subtag currently being scanned */
    int32_t total = 0;      /* characters consumed so far */
    const char *p;

    for (p = localeID; *p != 0; ++p, ++total) {
        if (*p == '_' || *p == '-') {
            /* A separator closes the current subtag; empty runs are ignored. */
            if (runLength != 0 && (shortest < 0 || runLength < shortest)) {
                shortest = runLength;
            }
            runLength = 0;
        } else {
            ++runLength;
        }
    }

    return (shortest < 0) ? total : shortest;
}
515 
516 /* ### Keywords **************************************************/
/*
 * Character classification helpers for keyword parsing.  Each macro evaluates
 * its argument more than once, so pass only side-effect-free expressions.
 */

/* Non-zero for the decimal digits '0' through '9'. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')

/* Non-zero when c satisfies uprv_isASCIILetter or is a decimal digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))

/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) \
    ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes of a keyword-name buffer, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the keyword list built by ulocimp_getKeywords. */
#define ULOC_MAX_NO_KEYWORDS 25
524 
525 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)526 locale_getKeywordsStart(const char *localeID) {
527     const char *result = NULL;
528     if((result = uprv_strchr(localeID, '@')) != NULL) {
529         return result;
530     }
531 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
532     else {
533         /* We do this because the @ sign is variant, and the @ sign used on one
534         EBCDIC machine won't be compiled the same way on other EBCDIC based
535         machines. */
536         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
537         const uint8_t *charToFind = ebcdicSigns;
538         while(*charToFind) {
539             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
540                 return result;
541             }
542             charToFind++;
543         }
544     }
545 #endif
546     return NULL;
547 }
548 
549 /**
550  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
551  * @param keywordName incoming name to be canonicalized
552  * @param status return status (keyword too long)
553  * @return length of the keyword name
554  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)555 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
556 {
557   int32_t keywordNameLen = 0;
558 
559   for (; *keywordName != 0; keywordName++) {
560     if (!UPRV_ISALPHANUM(*keywordName)) {
561       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
562       return 0;
563     }
564     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
565       buf[keywordNameLen++] = uprv_tolower(*keywordName);
566     } else {
567       /* keyword name too long for internal buffer */
568       *status = U_INTERNAL_PROGRAM_ERROR;
569       return 0;
570     }
571   }
572   if (keywordNameLen == 0) {
573     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
574     return 0;
575   }
576   buf[keywordNameLen] = 0; /* terminate */
577 
578   return keywordNameLen;
579 }
580 
581 typedef struct {
582     char keyword[ULOC_KEYWORD_BUFFER_LEN];
583     int32_t keywordLen;
584     const char *valueStart;
585     int32_t valueLen;
586 } KeywordStruct;
587 
588 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)589 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
590     const char* leftString = ((const KeywordStruct *)left)->keyword;
591     const char* rightString = ((const KeywordStruct *)right)->keyword;
592     return uprv_strcmp(leftString, rightString);
593 }
594 
595 U_CFUNC void
ulocimp_getKeywords(const char * localeID,char prev,ByteSink & sink,UBool valuesToo,UErrorCode * status)596 ulocimp_getKeywords(const char *localeID,
597                     char prev,
598                     ByteSink& sink,
599                     UBool valuesToo,
600                     UErrorCode *status)
601 {
602     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
603 
604     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
605     int32_t numKeywords = 0;
606     const char* pos = localeID;
607     const char* equalSign = NULL;
608     const char* semicolon = NULL;
609     int32_t i = 0, j, n;
610 
611     if(prev == '@') { /* start of keyword definition */
612         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
613         do {
614             UBool duplicate = FALSE;
615             /* skip leading spaces */
616             while(*pos == ' ') {
617                 pos++;
618             }
619             if (!*pos) { /* handle trailing "; " */
620                 break;
621             }
622             if(numKeywords == maxKeywords) {
623                 *status = U_INTERNAL_PROGRAM_ERROR;
624                 return;
625             }
626             equalSign = uprv_strchr(pos, '=');
627             semicolon = uprv_strchr(pos, ';');
628             /* lack of '=' [foo@currency] is illegal */
629             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
630             if(!equalSign || (semicolon && semicolon<equalSign)) {
631                 *status = U_INVALID_FORMAT_ERROR;
632                 return;
633             }
634             /* need to normalize both keyword and keyword name */
635             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
636                 /* keyword name too long for internal buffer */
637                 *status = U_INTERNAL_PROGRAM_ERROR;
638                 return;
639             }
640             for(i = 0, n = 0; i < equalSign - pos; ++i) {
641                 if (pos[i] != ' ') {
642                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
643                 }
644             }
645 
646             /* zero-length keyword is an error. */
647             if (n == 0) {
648                 *status = U_INVALID_FORMAT_ERROR;
649                 return;
650             }
651 
652             keywordList[numKeywords].keyword[n] = 0;
653             keywordList[numKeywords].keywordLen = n;
654             /* now grab the value part. First we skip the '=' */
655             equalSign++;
656             /* then we leading spaces */
657             while(*equalSign == ' ') {
658                 equalSign++;
659             }
660 
661             /* Premature end or zero-length value */
662             if (!*equalSign || equalSign == semicolon) {
663                 *status = U_INVALID_FORMAT_ERROR;
664                 return;
665             }
666 
667             keywordList[numKeywords].valueStart = equalSign;
668 
669             pos = semicolon;
670             i = 0;
671             if(pos) {
672                 while(*(pos - i - 1) == ' ') {
673                     i++;
674                 }
675                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
676                 pos++;
677             } else {
678                 i = (int32_t)uprv_strlen(equalSign);
679                 while(i && equalSign[i-1] == ' ') {
680                     i--;
681                 }
682                 keywordList[numKeywords].valueLen = i;
683             }
684             /* If this is a duplicate keyword, then ignore it */
685             for (j=0; j<numKeywords; ++j) {
686                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
687                     duplicate = TRUE;
688                     break;
689                 }
690             }
691             if (!duplicate) {
692                 ++numKeywords;
693             }
694         } while(pos);
695 
696         /* now we have a list of keywords */
697         /* we need to sort it */
698         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
699 
700         /* Now construct the keyword part */
701         for(i = 0; i < numKeywords; i++) {
702             sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
703             if(valuesToo) {
704                 sink.Append("=", 1);
705                 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
706                 if(i < numKeywords - 1) {
707                     sink.Append(";", 1);
708                 }
709             } else {
710                 sink.Append("\0", 1);
711             }
712         }
713     }
714 }
715 
716 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)717 uloc_getKeywordValue(const char* localeID,
718                      const char* keywordName,
719                      char* buffer, int32_t bufferCapacity,
720                      UErrorCode* status)
721 {
722     if (U_FAILURE(*status)) {
723         return 0;
724     }
725 
726     CheckedArrayByteSink sink(buffer, bufferCapacity);
727     ulocimp_getKeywordValue(localeID, keywordName, sink, status);
728 
729     int32_t reslen = sink.NumberOfBytesAppended();
730 
731     if (U_FAILURE(*status)) {
732         return reslen;
733     }
734 
735     if (sink.Overflowed()) {
736         *status = U_BUFFER_OVERFLOW_ERROR;
737     } else {
738         u_terminateChars(buffer, bufferCapacity, reslen, status);
739     }
740 
741     return reslen;
742 }
743 
744 U_CAPI void U_EXPORT2
ulocimp_getKeywordValue(const char * localeID,const char * keywordName,icu::ByteSink & sink,UErrorCode * status)745 ulocimp_getKeywordValue(const char* localeID,
746                         const char* keywordName,
747                         icu::ByteSink& sink,
748                         UErrorCode* status)
749 {
750     const char* startSearchHere = NULL;
751     const char* nextSeparator = NULL;
752     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
753     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
754 
755     if(status && U_SUCCESS(*status) && localeID) {
756       char tempBuffer[ULOC_FULLNAME_CAPACITY];
757       const char* tmpLocaleID;
758 
759       if (keywordName == NULL || keywordName[0] == 0) {
760         *status = U_ILLEGAL_ARGUMENT_ERROR;
761         return;
762       }
763 
764       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
765       if(U_FAILURE(*status)) {
766         return;
767       }
768 
769       if (_hasBCP47Extension(localeID)) {
770           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
771       } else {
772           tmpLocaleID=localeID;
773       }
774 
775       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
776       if(startSearchHere == NULL) {
777           /* no keywords, return at once */
778           return;
779       }
780 
781       /* find the first keyword */
782       while(startSearchHere) {
783           const char* keyValueTail;
784           int32_t keyValueLen;
785 
786           startSearchHere++; /* skip @ or ; */
787           nextSeparator = uprv_strchr(startSearchHere, '=');
788           if(!nextSeparator) {
789               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
790               return;
791           }
792           /* strip leading & trailing spaces (TC decided to tolerate these) */
793           while(*startSearchHere == ' ') {
794               startSearchHere++;
795           }
796           keyValueTail = nextSeparator;
797           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
798               keyValueTail--;
799           }
800           /* now keyValueTail points to first char after the keyName */
801           /* copy & normalize keyName from locale */
802           if (startSearchHere == keyValueTail) {
803               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
804               return;
805           }
806           keyValueLen = 0;
807           while (startSearchHere < keyValueTail) {
808             if (!UPRV_ISALPHANUM(*startSearchHere)) {
809               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
810               return;
811             }
812             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
813               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
814             } else {
815               /* keyword name too long for internal buffer */
816               *status = U_INTERNAL_PROGRAM_ERROR;
817               return;
818             }
819           }
820           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
821 
822           startSearchHere = uprv_strchr(nextSeparator, ';');
823 
824           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
825                /* current entry matches the keyword. */
826              nextSeparator++; /* skip '=' */
827               /* First strip leading & trailing spaces (TC decided to tolerate these) */
828               while(*nextSeparator == ' ') {
829                 nextSeparator++;
830               }
831               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
832               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
833                 keyValueTail--;
834               }
835               /* Now copy the value, but check well-formedness */
836               if (nextSeparator == keyValueTail) {
837                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
838                 return;
839               }
840               while (nextSeparator < keyValueTail) {
841                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
842                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
843                   return;
844                 }
845                 /* Should we lowercase value to return here? Tests expect as-is. */
846                 sink.Append(nextSeparator++, 1);
847               }
848               return;
849           }
850       }
851     }
852 }
853 
/**
 * Adds, replaces or removes one keyword of the locale ID stored in 'buffer',
 * modifying the buffer in place.
 *
 * The keyword section is rebuilt into a temporary CharString: existing entries
 * are re-emitted with lower-cased names and with surrounding spaces trimmed,
 * and the new entry is placed in front of the first existing entry whose name
 * compares greater (uprv_strcmp), or at the end. The rebuilt section is copied
 * back only if it fits together with its terminating NUL.
 *
 * @param keywordName    keyword to set; NULL or empty is
 *                       U_ILLEGAL_ARGUMENT_ERROR
 * @param keywordValue   new value; NULL or empty requests removal of the
 *                       keyword. Only alphanumerics and characters accepted by
 *                       UPRV_OK_VALUE_PUNCTUATION are allowed.
 * @param buffer         NUL-terminated locale ID, updated in place
 * @param bufferCapacity capacity of 'buffer'; must be greater than 1 and not
 *                       smaller than the current string length
 * @param status         in/out error code. An incoming
 *                       U_STRING_NOT_TERMINATED_WARNING is reset to
 *                       U_ZERO_ERROR; the function asserts it never leaves that
 *                       warning set.
 * @return -1 if 'status' was already a failure on entry; 0 for invalid
 *         arguments or a malformed keyword section in the locale; the
 *         original length if nothing was changed; the required length together
 *         with U_BUFFER_OVERFLOW_ERROR if the result would not fit; otherwise
 *         the new length of the locale ID.
 */
U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char* keywordName,
                     const char* keywordValue,
                     char* buffer, int32_t bufferCapacity,
                     UErrorCode* status)
{
    /* TODO: sorting. removal. */
    int32_t keywordNameLen;
    int32_t keywordValueLen;
    int32_t bufLen;
    int32_t needLen = 0;
    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
    char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
    int32_t rc;
    char* nextSeparator = NULL;
    char* nextEqualsign = NULL;
    char* startSearchHere = NULL;
    char* keywordStart = NULL;
    CharString updatedKeysAndValues; /* rebuilt "@k=v;k=v" section */
    UBool handledInputKeyAndValue = FALSE; /* TRUE once the input key was replaced, removed or inserted */
    char keyValuePrefix = '@'; /* '@' before the first emitted pair, ';' afterwards */

    if(U_FAILURE(*status)) {
        return -1;
    }
    if (*status == U_STRING_NOT_TERMINATED_WARNING) {
        *status = U_ZERO_ERROR;
    }
    if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    bufLen = (int32_t)uprv_strlen(buffer);
    if(bufferCapacity<bufLen) {
        /* The capacity is less than the length?! Is this NULL terminated? */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* validate the new value and copy it; a length of 0 afterwards means "remove" */
    keywordValueLen = 0;
    if(keywordValue) {
        while (*keywordValue != 0) {
            if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
                *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
                return 0;
            }
            if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
                /* Should we force lowercase in value to set? */
                keywordValueBuffer[keywordValueLen++] = *keywordValue++;
            } else {
                /* keywordValue too long for internal buffer */
                *status = U_INTERNAL_PROGRAM_ERROR;
                return 0;
            }
        }
    }
    keywordValueBuffer[keywordValueLen] = 0; /* terminate */

    startSearchHere = (char*)locale_getKeywordsStart(buffer);
    /* shortcut: the locale has no keywords at all, or only a trailing '@' */
    if(startSearchHere == NULL || (startSearchHere[1]==0)) {
        if(keywordValueLen == 0) { /* no keywords = nothing to remove */
            U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
            return bufLen;
        }

        /* existing text + '@' + name + '=' + value */
        needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
        if(startSearchHere) { /* had a single @ */
            needLen--; /* already had the @ */
            /* startSearchHere points at the @ */
        } else {
            startSearchHere=buffer+bufLen;
        }
        /* ">=" so that there is also room for the terminating NUL */
        if(needLen >= bufferCapacity) {
            *status = U_BUFFER_OVERFLOW_ERROR;
            return needLen; /* no change */
        }
        *startSearchHere++ = '@';
        uprv_strcpy(startSearchHere, keywordNameBuffer);
        startSearchHere += keywordNameLen;
        *startSearchHere++ = '=';
        uprv_strcpy(startSearchHere, keywordValueBuffer);
        U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
        return needLen;
    } /* end shortcut - no @ */

    /* startSearchHere stays on the '@'; keywordStart walks the entries */
    keywordStart = startSearchHere;
    /* search for keyword */
    while(keywordStart) {
        const char* keyValueTail;
        int32_t keyValueLen;

        keywordStart++; /* skip @ or ; */
        nextEqualsign = uprv_strchr(keywordStart, '=');
        if (!nextEqualsign) {
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
            return 0;
        }
        /* strip leading & trailing spaces (TC decided to tolerate these) */
        while(*keywordStart == ' ') {
            keywordStart++;
        }
        keyValueTail = nextEqualsign;
        while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
            keyValueTail--;
        }
        /* now keyValueTail points to first char after the keyName */
        /* copy & normalize keyName from locale */
        if (keywordStart == keyValueTail) {
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
            return 0;
        }
        keyValueLen = 0;
        while (keywordStart < keyValueTail) {
            if (!UPRV_ISALPHANUM(*keywordStart)) {
                *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
                return 0;
            }
            if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
                localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
            } else {
                /* keyword name too long for internal buffer */
                *status = U_INTERNAL_PROGRAM_ERROR;
                return 0;
            }
        }
        localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */

        /* NULL when this is the last entry */
        nextSeparator = uprv_strchr(nextEqualsign, ';');

        /* start processing the value part */
        nextEqualsign++; /* skip '=' */
        /* First strip leading & trailing spaces (TC decided to tolerate these) */
        while(*nextEqualsign == ' ') {
            nextEqualsign++;
        }
        keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
        while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
            keyValueTail--;
        }
        /* from here on, [nextEqualsign, keyValueTail) is the trimmed existing value */
        if (nextEqualsign == keyValueTail) {
            *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
            return 0;
        }

        rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
        if(rc == 0) {
            /* Current entry matches the input keyword. Update the entry */
            if(keywordValueLen > 0) { /* updating a value */
                updatedKeysAndValues.append(keyValuePrefix, *status);
                keyValuePrefix = ';'; /* for any subsequent key-value pair */
                updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
                updatedKeysAndValues.append('=', *status);
                updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
            } /* else removing this entry, don't emit anything */
            handledInputKeyAndValue = TRUE;
        } else {
            /* input keyword sorts earlier than current entry, add before current entry */
            if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
                /* insert new entry at this location */
                updatedKeysAndValues.append(keyValuePrefix, *status);
                keyValuePrefix = ';'; /* for any subsequent key-value pair */
                updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
                updatedKeysAndValues.append('=', *status);
                updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
                handledInputKeyAndValue = TRUE;
            }
            /* copy the current entry */
            updatedKeysAndValues.append(keyValuePrefix, *status);
            keyValuePrefix = ';'; /* for any subsequent key-value pair */
            updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
            updatedKeysAndValues.append('=', *status);
            updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
        }
        if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
            /* append new entry at the end, it sorts later than existing entries */
            updatedKeysAndValues.append(keyValuePrefix, *status);
            /* skip keyValuePrefix update, no subsequent key-value pair */
            updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
            updatedKeysAndValues.append('=', *status);
            updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
            handledInputKeyAndValue = TRUE;
        }
        keywordStart = nextSeparator;
    } /* end loop searching */

    /* Any error from updatedKeysAndValues.append above would be internal and not due to
     * problems with the passed-in locale. So if we did encounter problems with the
     * passed-in locale above, those errors took precedence and overrode any error
     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
     * are errors here they are from updatedKeysAndValues.append; they do cause an
     * error return but the passed-in locale is unmodified and the original bufLen is
     * returned.
     */
    if (!handledInputKeyAndValue || U_FAILURE(*status)) {
        /* if input key/value specified removal of a keyword not present in locale, or
         * there was an error in CharString.append, leave original locale alone. */
        U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
        return bufLen;
    }

    // needLen = length of the part before '@'
    needLen = (int32_t)(startSearchHere - buffer);
    // Check whether the rebuilt keyword section fits at startSearchHere; if not, return
    // U_BUFFER_OVERFLOW_ERROR without copying updatedKeysAndValues into the buffer.
    // We do this because this API function does not behave like most others:
    // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
    // When the contents fit but the terminating NUL does not, we must leave
    // the buffer contents unchanged and return with a buffer overflow error.
    int32_t appendLength = updatedKeysAndValues.length();
    if (appendLength >= bufferCapacity - needLen) {
        *status = U_BUFFER_OVERFLOW_ERROR;
        return needLen + appendLength;
    }
    needLen += updatedKeysAndValues.extract(
                         startSearchHere, bufferCapacity - needLen, *status);
    U_ASSERT(*status != U_STRING_NOT_TERMINATED_WARNING);
    return needLen;
}
1077 
1078 /* ### ID parsing implementation **************************************************/
1079 
/* Character classification helpers for locale ID parsing.
 *
 * Every macro parameter is parenthesized so that an expression argument
 * (for example a conditional expression) is compared as a whole instead of
 * being re-associated by operator precedence. The argument is still
 * evaluated more than once, so do not pass expressions with side effects.
 */

/* TRUE if 'a' is a letter that can start a special prefix: x, X, i or I */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1090 
1091 /**
1092  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1093  * a NULL entry, followed by more entries, and a second NULL entry.
1094  *
1095  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1096  * COUNTRIES_3.
1097  */
_findIndex(const char * const * list,const char * key)1098 static int16_t _findIndex(const char* const* list, const char* key)
1099 {
1100     const char* const* anchor = list;
1101     int32_t pass = 0;
1102 
1103     /* Make two passes through two NULL-terminated arrays at 'list' */
1104     while (pass++ < 2) {
1105         while (*list) {
1106             if (uprv_strcmp(key, *list) == 0) {
1107                 return (int16_t)(list - anchor);
1108             }
1109             list++;
1110         }
1111         ++list;     /* skip final NULL *CWB*/
1112     }
1113     return -1;
1114 }
1115 
1116 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1117 uloc_getCurrentCountryID(const char* oldID){
1118     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1119     if (offset >= 0) {
1120         return REPLACEMENT_COUNTRIES[offset];
1121     }
1122     return oldID;
1123 }
1124 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1125 uloc_getCurrentLanguageID(const char* oldID){
1126     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1127     if (offset >= 0) {
1128         return REPLACEMENT_LANGUAGES[offset];
1129     }
1130     return oldID;
1131 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript() and
 * ulocimp_getCountry() each report through *pEnd where they stopped parsing,
 * so that the functions for the later locale ID pieces do not have to
 * duplicate the code that handles the earlier pieces.
 * (_getVariant() has no pEnd parameter; it is handed its start position.)
 *
 * TODO try to use this in Locale
 */
1140 CharString U_EXPORT2
ulocimp_getLanguage(const char * localeID,const char ** pEnd,UErrorCode & status)1141 ulocimp_getLanguage(const char *localeID,
1142                     const char **pEnd,
1143                     UErrorCode &status) {
1144     CharString result;
1145 
1146     if (uprv_stricmp(localeID, "root") == 0) {
1147         localeID += 4;
1148     } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1149                (localeID[3] == '\0' ||
1150                 localeID[3] == '-' ||
1151                 localeID[3] == '_' ||
1152                 localeID[3] == '@')) {
1153         localeID += 3;
1154     }
1155 
1156     /* if it starts with i- or x- then copy that prefix */
1157     if(_isIDPrefix(localeID)) {
1158         result.append((char)uprv_tolower(*localeID), status);
1159         result.append('-', status);
1160         localeID+=2;
1161     }
1162 
1163     /* copy the language as far as possible and count its length */
1164     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1165         result.append((char)uprv_tolower(*localeID), status);
1166         localeID++;
1167     }
1168 
1169     if(result.length()==3) {
1170         /* convert 3 character code to 2 character code if possible *CWB*/
1171         int32_t offset = _findIndex(LANGUAGES_3, result.data());
1172         if(offset>=0) {
1173             result.clear();
1174             result.append(LANGUAGES[offset], status);
1175         }
1176     }
1177 
1178     if(pEnd!=NULL) {
1179         *pEnd=localeID;
1180     }
1181 
1182     return result;
1183 }
1184 
1185 CharString U_EXPORT2
ulocimp_getScript(const char * localeID,const char ** pEnd,UErrorCode & status)1186 ulocimp_getScript(const char *localeID,
1187                   const char **pEnd,
1188                   UErrorCode &status) {
1189     CharString result;
1190     int32_t idLen = 0;
1191 
1192     if (pEnd != NULL) {
1193         *pEnd = localeID;
1194     }
1195 
1196     /* copy the second item as far as possible and count its length */
1197     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1198             && uprv_isASCIILetter(localeID[idLen])) {
1199         idLen++;
1200     }
1201 
1202     /* If it's exactly 4 characters long, then it's a script and not a country. */
1203     if (idLen == 4) {
1204         int32_t i;
1205         if (pEnd != NULL) {
1206             *pEnd = localeID+idLen;
1207         }
1208         if (idLen >= 1) {
1209             result.append((char)uprv_toupper(*(localeID++)), status);
1210         }
1211         for (i = 1; i < idLen; i++) {
1212             result.append((char)uprv_tolower(*(localeID++)), status);
1213         }
1214     }
1215 
1216     return result;
1217 }
1218 
1219 CharString U_EXPORT2
ulocimp_getCountry(const char * localeID,const char ** pEnd,UErrorCode & status)1220 ulocimp_getCountry(const char *localeID,
1221                    const char **pEnd,
1222                    UErrorCode &status) {
1223     CharString result;
1224     int32_t idLen=0;
1225 
1226     /* copy the country as far as possible and count its length */
1227     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1228         result.append((char)uprv_toupper(localeID[idLen]), status);
1229         idLen++;
1230     }
1231 
1232     /* the country should be either length 2 or 3 */
1233     if (idLen == 2 || idLen == 3) {
1234         /* convert 3 character code to 2 character code if possible *CWB*/
1235         if(idLen==3) {
1236             int32_t offset = _findIndex(COUNTRIES_3, result.data());
1237             if(offset>=0) {
1238                 result.clear();
1239                 result.append(COUNTRIES[offset], status);
1240             }
1241         }
1242         localeID+=idLen;
1243     } else {
1244         result.clear();
1245     }
1246 
1247     if(pEnd!=NULL) {
1248         *pEnd=localeID;
1249     }
1250 
1251     return result;
1252 }
1253 
1254 /**
1255  * @param needSeparator if true, then add leading '_' if any variants
1256  * are added to 'variant'
1257  */
1258 static void
_getVariant(const char * localeID,char prev,ByteSink & sink,UBool needSeparator)1259 _getVariant(const char *localeID,
1260             char prev,
1261             ByteSink& sink,
1262             UBool needSeparator) {
1263     UBool hasVariant = FALSE;
1264 
1265     /* get one or more variant tags and separate them with '_' */
1266     if(_isIDSeparator(prev)) {
1267         /* get a variant string after a '-' or '_' */
1268         while(!_isTerminator(*localeID)) {
1269             if (needSeparator) {
1270                 sink.Append("_", 1);
1271                 needSeparator = FALSE;
1272             }
1273             char c = (char)uprv_toupper(*localeID);
1274             if (c == '-') c = '_';
1275             sink.Append(&c, 1);
1276             hasVariant = TRUE;
1277             localeID++;
1278         }
1279     }
1280 
1281     /* if there is no variant tag after a '-' or '_' then look for '@' */
1282     if(!hasVariant) {
1283         if(prev=='@') {
1284             /* keep localeID */
1285         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1286             ++localeID; /* point after the '@' */
1287         } else {
1288             return;
1289         }
1290         while(!_isTerminator(*localeID)) {
1291             if (needSeparator) {
1292                 sink.Append("_", 1);
1293                 needSeparator = FALSE;
1294             }
1295             char c = (char)uprv_toupper(*localeID);
1296             if (c == '-' || c == ',') c = '_';
1297             sink.Append(&c, 1);
1298             localeID++;
1299         }
1300     }
1301 }
1302 
1303 /* Keyword enumeration */
1304 
/* Per-enumeration state stored in UEnumeration::context by uloc_openKeywordList(). */
typedef struct UKeywordsContext {
    /* Heap copy of the keyword list: NUL-terminated keyword names stored back
     * to back, ended by an empty string. Freed when the enumeration is closed. */
    char* keywords;
    /* Iteration cursor inside 'keywords': the next name to hand out. */
    char* current;
} UKeywordsContext;
1309 
1310 U_CDECL_BEGIN
1311 
1312 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1313 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1314     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1315     uprv_free(enumerator->context);
1316     uprv_free(enumerator);
1317 }
1318 
1319 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1320 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1321     char *kw = ((UKeywordsContext *)en->context)->keywords;
1322     int32_t result = 0;
1323     while(*kw) {
1324         result++;
1325         kw += uprv_strlen(kw)+1;
1326     }
1327     return result;
1328 }
1329 
1330 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1331 uloc_kw_nextKeyword(UEnumeration* en,
1332                     int32_t* resultLength,
1333                     UErrorCode* /*status*/) {
1334     const char* result = ((UKeywordsContext *)en->context)->current;
1335     int32_t len = 0;
1336     if(*result) {
1337         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1338         ((UKeywordsContext *)en->context)->current += len+1;
1339     } else {
1340         result = NULL;
1341     }
1342     if (resultLength) {
1343         *resultLength = len;
1344     }
1345     return result;
1346 }
1347 
1348 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1349 uloc_kw_resetKeywords(UEnumeration* en,
1350                       UErrorCode* /*status*/) {
1351     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1352 }
1353 
1354 U_CDECL_END
1355 
1356 
/* Template for keyword enumerations. uloc_openKeywordList() copies it into a
 * freshly allocated UEnumeration and then fills in that copy's 'context'.
 * The initializer is positional, so the entries must stay in the member order
 * of UEnumeration (declared outside this file section; verify there before
 * reordering). */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,
    uloc_kw_closeKeywords,  /* frees keyword buffer, context and enumeration */
    uloc_kw_countKeywords,  /* counts the names in the keyword list */
    uenum_unextDefault,
    uloc_kw_nextKeyword,    /* returns the next name, advancing the cursor */
    uloc_kw_resetKeywords   /* rewinds the cursor to the first name */
};
1366 
1367 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1368 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1369 {
1370     LocalMemory<UKeywordsContext> myContext;
1371     LocalMemory<UEnumeration> result;
1372 
1373     if (U_FAILURE(*status)) {
1374         return nullptr;
1375     }
1376     myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1377     result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1378     if (myContext.isNull() || result.isNull()) {
1379         *status = U_MEMORY_ALLOCATION_ERROR;
1380         return nullptr;
1381     }
1382     uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1383     myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1384     if (myContext->keywords == nullptr) {
1385         *status = U_MEMORY_ALLOCATION_ERROR;
1386         return nullptr;
1387     }
1388     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1389     myContext->keywords[keywordListSize] = 0;
1390     myContext->current = myContext->keywords;
1391     result->context = myContext.orphan();
1392     return result.orphan();
1393 }
1394 
1395 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1396 uloc_openKeywords(const char* localeID,
1397                         UErrorCode* status)
1398 {
1399     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1400     const char* tmpLocaleID;
1401 
1402     if(status==NULL || U_FAILURE(*status)) {
1403         return 0;
1404     }
1405 
1406     if (_hasBCP47Extension(localeID)) {
1407         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1408     } else {
1409         if (localeID==NULL) {
1410            localeID=uloc_getDefault();
1411         }
1412         tmpLocaleID=localeID;
1413     }
1414 
1415     /* Skip the language */
1416     ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *status);
1417     if (U_FAILURE(*status)) {
1418         return 0;
1419     }
1420 
1421     if(_isIDSeparator(*tmpLocaleID)) {
1422         const char *scriptID;
1423         /* Skip the script if available */
1424         ulocimp_getScript(tmpLocaleID+1, &scriptID, *status);
1425         if (U_FAILURE(*status)) {
1426             return 0;
1427         }
1428         if(scriptID != tmpLocaleID+1) {
1429             /* Found optional script */
1430             tmpLocaleID = scriptID;
1431         }
1432         /* Skip the Country */
1433         if (_isIDSeparator(*tmpLocaleID)) {
1434             ulocimp_getCountry(tmpLocaleID+1, &tmpLocaleID, *status);
1435             if (U_FAILURE(*status)) {
1436                 return 0;
1437             }
1438         }
1439     }
1440 
1441     /* keywords are located after '@' */
1442     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1443         CharString keywords;
1444         CharStringByteSink sink(&keywords);
1445         ulocimp_getKeywords(tmpLocaleID+1, '@', sink, FALSE, status);
1446         if (U_FAILURE(*status)) {
1447             return NULL;
1448         }
1449         return uloc_openKeywordList(keywords.data(), keywords.length(), status);
1450     }
1451     return NULL;
1452 }
1453 
1454 
/* Bit flags for the 'options' parameter of _canonicalize. */
#define _ULOC_STRIP_KEYWORDS 0x2  /* do not emit the "@key=value" keyword section */
#define _ULOC_CANONICALIZE   0x1  /* request level 2 canonicalization */

/*
 * True when at least one bit of 'mask' is set in 'options'.
 * Both arguments are parenthesized so that expression arguments (for example
 * a mask written as A | B) are evaluated as a unit instead of being split by
 * operator precedence.
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/*
 * The characters of "i-default", deliberately without a terminating NUL:
 * the array is only ever used together with I_DEFAULT_LENGTH.
 */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1463 
/**
 * Canonicalize the given localeID, to level 1 or to level 2,
 * depending on the options.  To specify level 1, pass in options=0.
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
 *
 * This is the code underlying uloc_getName, uloc_getBaseName (which adds
 * _ULOC_STRIP_KEYWORDS) and uloc_canonicalize.
 *
 * The language/script/country/variant part is assembled in a local
 * CharString and written to the sink in one piece; the keyword section,
 * when kept, is streamed to the sink afterwards.
 *
 * @param localeID the ID to process; NULL selects the default locale.
 * @param sink     receives the resulting ID.
 * @param options  bitwise OR of _ULOC_CANONICALIZE and _ULOC_STRIP_KEYWORDS.
 * @param err      in/out error code; must not be NULL (it is dereferenced
 *                 without a check). Nothing is written when it already
 *                 indicates a failure.
 */
static void
_canonicalize(const char* localeID,
              ByteSink& sink,
              uint32_t options,
              UErrorCode* err) {
    /* fieldCount counts the subtag separators/fields emitted so far; it is used
       below to decide how many '_' a POSIX @VARIANT still needs in front of it. */
    int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
    char tempBuffer[ULOC_FULLNAME_CAPACITY];
    const char* origLocaleID;
    const char* tmpLocaleID;
    const char* keywordAssign = NULL;
    const char* separatorIndicator = NULL;

    if (U_FAILURE(*err)) {
        return;
    }

    /* _ConvertBCP47 is a macro defined elsewhere in this file; it is given
       tempBuffer as scratch space and assigns tmpLocaleID. */
    if (_hasBCP47Extension(localeID)) {
        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
    } else {
        if (localeID==NULL) {
           localeID=uloc_getDefault();
        }
        tmpLocaleID=localeID;
    }

    /* Remember the start of the ID: tmpLocaleID is advanced while parsing. */
    origLocaleID=tmpLocaleID;

    /* get all pieces, one after another, and separate with '_' */
    CharString tag = ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);

    /* An ID starting with "i-default" is replaced by the default locale ID. */
    if (tag.length() == I_DEFAULT_LENGTH &&
            uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
        tag.clear();
        tag.append(uloc_getDefault(), *err);
    } else if(_isIDSeparator(*tmpLocaleID)) {
        const char *scriptID;

        ++fieldCount;
        tag.append('_', *err);

        CharString script = ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
        tag.append(script, *err);
        scriptSize = script.length();
        if(scriptSize > 0) {
            /* Found optional script */
            tmpLocaleID = scriptID;
            ++fieldCount;
            if (_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _ */
                tag.append('_', *err);
            }
        }

        if (_isIDSeparator(*tmpLocaleID)) {
            const char *cntryID;

            CharString country = ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
            tag.append(country, *err);
            if (!country.isEmpty()) {
                /* Found optional country */
                tmpLocaleID = cntryID;
            }
            if(_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _  if we found country before. */
                if (!_isIDSeparator(*(tmpLocaleID+1))) {
                    ++fieldCount;
                    tag.append('_', *err);
                }

                /* Measure how many bytes _getVariant appends to tag:
                   start from minus the current length, add the new length afterwards. */
                variantSize = -tag.length();
                {
                    CharStringByteSink s(&tag);
                    _getVariant(tmpLocaleID+1, *tmpLocaleID, s, FALSE);
                }
                variantSize += tag.length();
                if (variantSize > 0) {
                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
                }
            }
        }
    }

    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    /* (level 1 only: with _ULOC_CANONICALIZE the ".charset" part is not copied) */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
        UBool done = FALSE;
        do {
            char c = *tmpLocaleID;
            switch (c) {
            case 0:
            case '@':
                /* stop at the end of the ID or at the start of the '@' section */
                done = TRUE;
                break;
            default:
                tag.append(c, *err);
                ++tmpLocaleID;
                break;
            }
        } while (!done);
    }

    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
       After this, tmpLocaleID either points to '@' or is NULL */
    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
        keywordAssign = uprv_strchr(tmpLocaleID, '=');
        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
    }

    /* Copy POSIX-style variant, if any [mr@FOO] */
    /* (level 1 only: an '@' section without any '=' is copied verbatim, '@' included) */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
        tmpLocaleID != NULL && keywordAssign == NULL) {
        for (;;) {
            char c = *tmpLocaleID;
            if (c == 0) {
                break;
            }
            tag.append(c, *err);
            ++tmpLocaleID;
        }
    }

    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
        /* Handle @FOO variant if @ is present and not followed by = */
        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
            /* Add missing '_' if needed */
            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
                do {
                    tag.append('_', *err);
                    ++fieldCount;
                } while(fieldCount<2);
            }

            /* Same measuring trick as for the regular variant above. */
            int32_t posixVariantSize = -tag.length();
            {
                CharStringByteSink s(&tag);
                /* The last argument is TRUE when a regular variant was already appended;
                   presumably it asks _getVariant to emit a separator first - confirm
                   against _getVariant. */
                _getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
            }
            posixVariantSize += tag.length();
            if (posixVariantSize > 0) {
                variantSize += posixVariantSize;
            }
        }

        /* Look up the ID in the canonicalization map */
        /* The first entry whose id equals the assembled tag ends the search. */
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
            StringPiece id(CANONICALIZE_MAP[j].id);
            if (tag == id) {
                if (id.empty() && tmpLocaleID != NULL) {
                    break; /* Don't remap "" if keywords present */
                }
                tag.clear();
                tag.append(CANONICALIZE_MAP[j].canonicalID, *err);
                break;
            }
        }
    }

    /* Emit the assembled language/script/country/variant part. */
    sink.Append(tag.data(), tag.length());

    /* Keywords are emitted only when an '@' section with a '=' exists and
       no ';' occurs before the first '='. */
    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
            (!separatorIndicator || separatorIndicator > keywordAssign)) {
            sink.Append("@", 1);
            ++fieldCount;
            ulocimp_getKeywords(tmpLocaleID+1, '@', sink, TRUE, err);
        }
    }
}
1638 
1639 /* ### ID parsing API **************************************************/
1640 
1641 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1642 uloc_getParent(const char*    localeID,
1643                char* parent,
1644                int32_t parentCapacity,
1645                UErrorCode* err)
1646 {
1647     const char *lastUnderscore;
1648     int32_t i;
1649 
1650     if (U_FAILURE(*err))
1651         return 0;
1652 
1653     if (localeID == NULL)
1654         localeID = uloc_getDefault();
1655 
1656     lastUnderscore=uprv_strrchr(localeID, '_');
1657     if(lastUnderscore!=NULL) {
1658         i=(int32_t)(lastUnderscore-localeID);
1659     } else {
1660         i=0;
1661     }
1662 
1663     if (i > 0) {
1664         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1665             localeID += 3;
1666             i -= 3;
1667             uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1668         } else if (parent != localeID) {
1669             uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1670         }
1671     }
1672 
1673     return u_terminateChars(parent, parentCapacity, i, err);
1674 }
1675 
1676 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1677 uloc_getLanguage(const char*    localeID,
1678          char* language,
1679          int32_t languageCapacity,
1680          UErrorCode* err)
1681 {
1682     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1683 
1684     if (err==NULL || U_FAILURE(*err)) {
1685         return 0;
1686     }
1687 
1688     if(localeID==NULL) {
1689         localeID=uloc_getDefault();
1690     }
1691 
1692     return ulocimp_getLanguage(localeID, NULL, *err).extract(language, languageCapacity, *err);
1693 }
1694 
1695 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1696 uloc_getScript(const char*    localeID,
1697          char* script,
1698          int32_t scriptCapacity,
1699          UErrorCode* err)
1700 {
1701     if(err==NULL || U_FAILURE(*err)) {
1702         return 0;
1703     }
1704 
1705     if(localeID==NULL) {
1706         localeID=uloc_getDefault();
1707     }
1708 
1709     /* skip the language */
1710     ulocimp_getLanguage(localeID, &localeID, *err);
1711     if (U_FAILURE(*err)) {
1712         return 0;
1713     }
1714 
1715     if(_isIDSeparator(*localeID)) {
1716         return ulocimp_getScript(localeID+1, NULL, *err).extract(script, scriptCapacity, *err);
1717     }
1718     return u_terminateChars(script, scriptCapacity, 0, err);
1719 }
1720 
1721 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1722 uloc_getCountry(const char* localeID,
1723             char* country,
1724             int32_t countryCapacity,
1725             UErrorCode* err)
1726 {
1727     if(err==NULL || U_FAILURE(*err)) {
1728         return 0;
1729     }
1730 
1731     if(localeID==NULL) {
1732         localeID=uloc_getDefault();
1733     }
1734 
1735     /* Skip the language */
1736     ulocimp_getLanguage(localeID, &localeID, *err);
1737     if (U_FAILURE(*err)) {
1738         return 0;
1739     }
1740 
1741     if(_isIDSeparator(*localeID)) {
1742         const char *scriptID;
1743         /* Skip the script if available */
1744         ulocimp_getScript(localeID+1, &scriptID, *err);
1745         if (U_FAILURE(*err)) {
1746             return 0;
1747         }
1748         if(scriptID != localeID+1) {
1749             /* Found optional script */
1750             localeID = scriptID;
1751         }
1752         if(_isIDSeparator(*localeID)) {
1753             return ulocimp_getCountry(localeID+1, NULL, *err).extract(country, countryCapacity, *err);
1754         }
1755     }
1756     return u_terminateChars(country, countryCapacity, 0, err);
1757 }
1758 
1759 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1760 uloc_getVariant(const char* localeID,
1761                 char* variant,
1762                 int32_t variantCapacity,
1763                 UErrorCode* err)
1764 {
1765     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1766     const char* tmpLocaleID;
1767     int32_t i=0;
1768 
1769     if(err==NULL || U_FAILURE(*err)) {
1770         return 0;
1771     }
1772 
1773     if (_hasBCP47Extension(localeID)) {
1774         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1775     } else {
1776         if (localeID==NULL) {
1777            localeID=uloc_getDefault();
1778         }
1779         tmpLocaleID=localeID;
1780     }
1781 
1782     /* Skip the language */
1783     ulocimp_getLanguage(tmpLocaleID, &tmpLocaleID, *err);
1784     if (U_FAILURE(*err)) {
1785         return 0;
1786     }
1787 
1788     if(_isIDSeparator(*tmpLocaleID)) {
1789         const char *scriptID;
1790         /* Skip the script if available */
1791         ulocimp_getScript(tmpLocaleID+1, &scriptID, *err);
1792         if (U_FAILURE(*err)) {
1793             return 0;
1794         }
1795         if(scriptID != tmpLocaleID+1) {
1796             /* Found optional script */
1797             tmpLocaleID = scriptID;
1798         }
1799         /* Skip the Country */
1800         if (_isIDSeparator(*tmpLocaleID)) {
1801             const char *cntryID;
1802             ulocimp_getCountry(tmpLocaleID+1, &cntryID, *err);
1803             if (U_FAILURE(*err)) {
1804                 return 0;
1805             }
1806             if (cntryID != tmpLocaleID+1) {
1807                 /* Found optional country */
1808                 tmpLocaleID = cntryID;
1809             }
1810             if(_isIDSeparator(*tmpLocaleID)) {
1811                 /* If there was no country ID, skip a possible extra IDSeparator */
1812                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1813                     tmpLocaleID++;
1814                 }
1815 
1816                 CheckedArrayByteSink sink(variant, variantCapacity);
1817                 _getVariant(tmpLocaleID+1, *tmpLocaleID, sink, FALSE);
1818 
1819                 i = sink.NumberOfBytesAppended();
1820 
1821                 if (U_FAILURE(*err)) {
1822                     return i;
1823                 }
1824 
1825                 if (sink.Overflowed()) {
1826                     *err = U_BUFFER_OVERFLOW_ERROR;
1827                     return i;
1828                 }
1829             }
1830         }
1831     }
1832 
1833     return u_terminateChars(variant, variantCapacity, i, err);
1834 }
1835 
1836 U_CAPI int32_t  U_EXPORT2
uloc_getName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1837 uloc_getName(const char* localeID,
1838              char* name,
1839              int32_t nameCapacity,
1840              UErrorCode* err)
1841 {
1842     if (U_FAILURE(*err)) {
1843         return 0;
1844     }
1845 
1846     CheckedArrayByteSink sink(name, nameCapacity);
1847     ulocimp_getName(localeID, sink, err);
1848 
1849     int32_t reslen = sink.NumberOfBytesAppended();
1850 
1851     if (U_FAILURE(*err)) {
1852         return reslen;
1853     }
1854 
1855     if (sink.Overflowed()) {
1856         *err = U_BUFFER_OVERFLOW_ERROR;
1857     } else {
1858         u_terminateChars(name, nameCapacity, reslen, err);
1859     }
1860 
1861     return reslen;
1862 }
1863 
/**
 * ByteSink flavor of uloc_getName(): runs _canonicalize() with no option
 * bits set (options=0) and lets it write the result to the sink.
 *
 * @param localeID the ID to process.
 * @param sink     receives the resulting name.
 * @param err      in/out error code, passed through to _canonicalize().
 */
U_CAPI void U_EXPORT2
ulocimp_getName(const char* localeID,
                ByteSink& sink,
                UErrorCode* err)
{
    _canonicalize(localeID, sink, 0, err);
}
1871 
1872 U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1873 uloc_getBaseName(const char* localeID,
1874                  char* name,
1875                  int32_t nameCapacity,
1876                  UErrorCode* err)
1877 {
1878     if (U_FAILURE(*err)) {
1879         return 0;
1880     }
1881 
1882     CheckedArrayByteSink sink(name, nameCapacity);
1883     ulocimp_getBaseName(localeID, sink, err);
1884 
1885     int32_t reslen = sink.NumberOfBytesAppended();
1886 
1887     if (U_FAILURE(*err)) {
1888         return reslen;
1889     }
1890 
1891     if (sink.Overflowed()) {
1892         *err = U_BUFFER_OVERFLOW_ERROR;
1893     } else {
1894         u_terminateChars(name, nameCapacity, reslen, err);
1895     }
1896 
1897     return reslen;
1898 }
1899 
/**
 * ByteSink flavor of uloc_getBaseName(): runs _canonicalize() with
 * _ULOC_STRIP_KEYWORDS so that the keyword section is not emitted.
 *
 * @param localeID the ID to process.
 * @param sink     receives the resulting base name.
 * @param err      in/out error code, passed through to _canonicalize().
 */
U_CAPI void U_EXPORT2
ulocimp_getBaseName(const char* localeID,
                    ByteSink& sink,
                    UErrorCode* err)
{
    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
}
1907 
1908 U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char * localeID,char * name,int32_t nameCapacity,UErrorCode * err)1909 uloc_canonicalize(const char* localeID,
1910                   char* name,
1911                   int32_t nameCapacity,
1912                   UErrorCode* err)
1913 {
1914     if (U_FAILURE(*err)) {
1915         return 0;
1916     }
1917 
1918     CheckedArrayByteSink sink(name, nameCapacity);
1919     ulocimp_canonicalize(localeID, sink, err);
1920 
1921     int32_t reslen = sink.NumberOfBytesAppended();
1922 
1923     if (U_FAILURE(*err)) {
1924         return reslen;
1925     }
1926 
1927     if (sink.Overflowed()) {
1928         *err = U_BUFFER_OVERFLOW_ERROR;
1929     } else {
1930         u_terminateChars(name, nameCapacity, reslen, err);
1931     }
1932 
1933     return reslen;
1934 }
1935 
/**
 * ByteSink flavor of uloc_canonicalize(): runs _canonicalize() with
 * _ULOC_CANONICALIZE (level 2 canonicalization).
 *
 * @param localeID the ID to canonicalize.
 * @param sink     receives the canonical ID.
 * @param err      in/out error code, passed through to _canonicalize().
 */
U_CAPI void U_EXPORT2
ulocimp_canonicalize(const char* localeID,
                     ByteSink& sink,
                     UErrorCode* err)
{
    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
}
1943 
1944 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)1945 uloc_getISO3Language(const char* localeID)
1946 {
1947     int16_t offset;
1948     char lang[ULOC_LANG_CAPACITY];
1949     UErrorCode err = U_ZERO_ERROR;
1950 
1951     if (localeID == NULL)
1952     {
1953         localeID = uloc_getDefault();
1954     }
1955     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1956     if (U_FAILURE(err))
1957         return "";
1958     offset = _findIndex(LANGUAGES, lang);
1959     if (offset < 0)
1960         return "";
1961     return LANGUAGES_3[offset];
1962 }
1963 
1964 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)1965 uloc_getISO3Country(const char* localeID)
1966 {
1967     int16_t offset;
1968     char cntry[ULOC_LANG_CAPACITY];
1969     UErrorCode err = U_ZERO_ERROR;
1970 
1971     if (localeID == NULL)
1972     {
1973         localeID = uloc_getDefault();
1974     }
1975     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
1976     if (U_FAILURE(err))
1977         return "";
1978     offset = _findIndex(COUNTRIES, cntry);
1979     if (offset < 0)
1980         return "";
1981 
1982     return COUNTRIES_3[offset];
1983 }
1984 
1985 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)1986 uloc_getLCID(const char* localeID)
1987 {
1988     UErrorCode status = U_ZERO_ERROR;
1989     char       langID[ULOC_FULLNAME_CAPACITY];
1990     uint32_t   lcid = 0;
1991 
1992     /* Check for incomplete id. */
1993     if (!localeID || uprv_strlen(localeID) < 2) {
1994         return 0;
1995     }
1996 
1997     // First, attempt Windows platform lookup if available, but fall
1998     // through to catch any special cases (ICU vs Windows name differences).
1999     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2000     if (U_FAILURE(status)) {
2001         return 0;
2002     }
2003     if (lcid > 0) {
2004         // Windows found an LCID, return that
2005         return lcid;
2006     }
2007 
2008     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2009     if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2010         return 0;
2011     }
2012 
2013     if (uprv_strchr(localeID, '@')) {
2014         // uprv_convertToLCID does not support keywords other than collation.
2015         // Remove all keywords except collation.
2016         int32_t len;
2017         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2018 
2019         CharString collVal;
2020         {
2021             CharStringByteSink sink(&collVal);
2022             ulocimp_getKeywordValue(localeID, "collation", sink, &status);
2023         }
2024 
2025         if (U_SUCCESS(status) && !collVal.isEmpty()) {
2026             len = uloc_getBaseName(localeID, tmpLocaleID,
2027                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2028 
2029             if (U_SUCCESS(status) && len > 0) {
2030                 tmpLocaleID[len] = 0;
2031 
2032                 len = uloc_setKeywordValue("collation", collVal.data(), tmpLocaleID,
2033                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2034 
2035                 if (U_SUCCESS(status) && len > 0) {
2036                     tmpLocaleID[len] = 0;
2037                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2038                 }
2039             }
2040         }
2041 
2042         // fall through - all keywords are simply ignored
2043         status = U_ZERO_ERROR;
2044     }
2045 
2046     return uprv_convertToLCID(langID, localeID, &status);
2047 }
2048 
/**
 * Writes the locale ID for an LCID into a caller-supplied buffer.
 * All of the work is delegated to uprv_convertToPosix().
 *
 * @param hostid         the LCID to convert.
 * @param locale         receives the locale ID.
 * @param localeCapacity capacity of locale in bytes.
 * @param status         in/out error code, passed through unchanged.
 * @return the value returned by uprv_convertToPosix().
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2055 
2056 /* ### Default locale **************************************************/
2057 
/**
 * Returns the default locale ID, exactly as reported by locale_get_default().
 */
U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2063 
2064 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2065 uloc_setDefault(const char*   newDefaultLocale,
2066              UErrorCode* err)
2067 {
2068     if (U_FAILURE(*err))
2069         return;
2070     /* the error code isn't currently used for anything by this function*/
2071 
2072     /* propagate change to C++ */
2073     locale_set_default(newDefaultLocale);
2074 }
2075 
/**
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
 * to an array of pointers to arrays of char.  All of these pointers are owned
 * by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 *
 * The returned pointer is this file's LANGUAGES table itself, not a copy.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{
    return LANGUAGES;
}
2087 
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
 * pointer to an array of pointers to arrays of char.  All of these pointers are
 * owned by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 *
 * The returned pointer is this file's COUNTRIES table itself, not a copy.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{
    return COUNTRIES;
}
2099 
2100 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2101 uloc_toUnicodeLocaleKey(const char* keyword)
2102 {
2103     const char* bcpKey = ulocimp_toBcpKey(keyword);
2104     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2105         // unknown keyword, but syntax is fine..
2106         return keyword;
2107     }
2108     return bcpKey;
2109 }
2110 
2111 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2112 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2113 {
2114     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2115     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2116         // unknown keyword, but syntax is fine..
2117         return value;
2118     }
2119     return bcpType;
2120 }
2121 
2122 static UBool
isWellFormedLegacyKey(const char * legacyKey)2123 isWellFormedLegacyKey(const char* legacyKey)
2124 {
2125     const char* p = legacyKey;
2126     while (*p) {
2127         if (!UPRV_ISALPHANUM(*p)) {
2128             return FALSE;
2129         }
2130         p++;
2131     }
2132     return TRUE;
2133 }
2134 
2135 static UBool
isWellFormedLegacyType(const char * legacyType)2136 isWellFormedLegacyType(const char* legacyType)
2137 {
2138     const char* p = legacyType;
2139     int32_t alphaNumLen = 0;
2140     while (*p) {
2141         if (*p == '_' || *p == '/' || *p == '-') {
2142             if (alphaNumLen == 0) {
2143                 return FALSE;
2144             }
2145             alphaNumLen = 0;
2146         } else if (UPRV_ISALPHANUM(*p)) {
2147             alphaNumLen++;
2148         } else {
2149             return FALSE;
2150         }
2151         p++;
2152     }
2153     return (alphaNumLen != 0);
2154 }
2155 
2156 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2157 uloc_toLegacyKey(const char* keyword)
2158 {
2159     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2160     if (legacyKey == NULL) {
2161         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2162         //
2163         // Note:
2164         //  LDML/CLDR provides some definition of keyword syntax in
2165         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2166         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2167         //  Keys can only consist of [0-9a-zA-Z].
2168         if (isWellFormedLegacyKey(keyword)) {
2169             return keyword;
2170         }
2171     }
2172     return legacyKey;
2173 }
2174 
2175 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2176 uloc_toLegacyType(const char* keyword, const char* value)
2177 {
2178     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2179     if (legacyType == NULL) {
2180         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2181         //
2182         // Note:
2183         //  LDML/CLDR provides some definition of keyword syntax in
2184         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2185         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2186         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2187         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2188         if (isWellFormedLegacyType(value)) {
2189             return value;
2190         }
2191     }
2192     return legacyType;
2193 }
2194 
2195 /*eof*/
2196