1 /*
2 **********************************************************************
3 *   Copyright (C) 1997-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *
7 * File ULOC.CPP
8 *
9 * Modification History:
10 *
11 *   Date        Name        Description
12 *   04/01/97    aliu        Creation.
13 *   08/21/98    stephen     JDK 1.2 sync
14 *   12/08/98    rtg         New Locale implementation and C API
15 *   03/15/99    damiba      overhaul.
16 *   04/06/99    stephen     changed setDefault() to realloc and copy
17 *   06/14/99    stephen     Changed calls to ures_open for new params
18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
20 *                           brought canonicalization code into line with spec
21 *****************************************************************************/
22 
23 /*
24    POSIX's locale format, from putil.c: [no spaces]
25 
26      ll [ _CC ] [ . MM ] [ @ VV]
27 
28      l = lang, C = ctry, M = charmap, V = variant
29 */
30 
31 #include "unicode/utypes.h"
32 #include "unicode/ustring.h"
33 #include "unicode/uloc.h"
34 
35 #include "putilimp.h"
36 #include "ustr_imp.h"
37 #include "ulocimp.h"
38 #include "umutex.h"
39 #include "cstring.h"
40 #include "cmemory.h"
41 #include "locmap.h"
42 #include "uarrsort.h"
43 #include "uenumimp.h"
44 #include "uassert.h"
45 
46 #include <stdio.h> /* for sprintf */
47 
48 /* ### Declarations **************************************************/
49 
/* Locale stuff from locid.cpp */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/*
 * Declaration only.  The parameter list is the same as that of the static
 * _getKeywords() helper in this file, minus its addKeyword/addValue pair.
 */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
            char prev,
            char *keywords, int32_t keywordCapacity,
            char *values, int32_t valuesCapacity, int32_t *valLen,
            UBool valuesToo,
            UErrorCode *status);
60 
61 /* ### Data tables **************************************************/
62 
63 /**
64  * Table of language codes, both 2- and 3-letter, with preference
65  * given to 2-letter codes where possible.  Includes 3-letter codes
66  * that lack a 2-letter equivalent.
67  *
68  * This list must be in sorted order.  This list is returned directly
69  * to the user by some API.
70  *
71  * This list must be kept in sync with LANGUAGES_3, with corresponding
72  * entries matched.
73  *
74  * This table should be terminated with a NULL entry, followed by a
75  * second list, and another NULL entry.  The first list is visible to
76  * user code when this array is returned by API.  The second list
77  * contains codes we support, but do not expose through user API.
78  *
79  * Notes
80  *
81  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
82  * include the revisions up to 2001/7/27 *CWB*
83  *
84  * The 3 character codes are the terminology codes like RFC 3066.  This
85  * is compatible with prior ICU codes
86  *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They are still in
 * the table, but have been moved to the end of the table because their
 * 3 character codes are duplicates.  This avoids bad searches going
 * from 3 to 2 character codes.
91  *
92  * The range qaa-qtz is reserved for local use
93  */
94 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
95 /* ISO639 table version is 20150505 */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
NULL,   /* terminates the first (sorted) list */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL    /* terminates the second list */
};
185 
/*
 * Deprecated language codes and, at the same index, the code listed as the
 * replacement for each ("in" -> "id", "iw" -> "he", "ji" -> "yi",
 * "jw" -> "jv").  Both arrays end with two NULL entries.
 * NOTE(review): the double NULL presumably mirrors the two-list layout of
 * LANGUAGES with an empty second list -- confirm against the lookup code.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
192 
193 /**
194  * Table of 3-letter language codes.
195  *
196  * This is a lookup table used to convert 3-letter language codes to
197  * their 2-letter equivalent, where possible.  It must be kept in sync
198  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
199  * same language as LANGUAGES_3[i].  The commented-out lines are
200  * copied from LANGUAGES to make eyeballing this baby easier.
201  *
202  * Where a 3-letter language code has no 2-letter equivalent, the
203  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
204  *
205  * This table should be terminated with a NULL entry, followed by a
206  * second list, and another NULL entry.  The two lists correspond to
207  * the two lists in LANGUAGES.
208  */
209 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
210 /* ISO639 table version is 20150505 */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL,   /* terminates the first list */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL    /* terminates the second list */
};
301 
302 /**
303  * Table of 2-letter country codes.
304  *
305  * This list must be in sorted order.  This list is returned directly
306  * to the user by some API.
307  *
308  * This list must be kept in sync with COUNTRIES_3, with corresponding
309  * entries matched.
310  *
311  * This table should be terminated with a NULL entry, followed by a
312  * second list, and another NULL entry.  The first list is visible to
313  * user code when this array is returned by API.  The second list
314  * contains codes we support, but do not expose through user API.
315  *
316  * Notes:
317  *
318  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
319  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
320  * new codes keeping the old ones for compatibility updated to include
321  * 1999/12/03 revisions *CWB*
322  *
323  * RO(ROM) is now RO(ROU) according to
324  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
325  */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL,   /* terminates the first (sorted) list */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
NULL    /* terminates the second list */
};
361 
/*
 * Deprecated country codes and, at the same index, the code listed as the
 * replacement for each.  The commented-out row inside REPLACEMENT_COUNTRIES
 * repeats DEPRECATED_COUNTRIES so the pairing can be checked by eye.
 * Both arrays end with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
369 
370 /**
371  * Table of 3-letter country codes.
372  *
373  * This is a lookup table used to convert 3-letter country codes to
374  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
375  * For all valid i, COUNTRIES[i] must refer to the same country as
376  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
377  * to make eyeballing this baby easier.
378  *
379  * This table should be terminated with a NULL entry, followed by a
380  * second list, and another NULL entry.  The two lists correspond to
381  * the two lists in COUNTRIES.
382  */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,   /* terminates the first list */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL    /* terminates the second list */
};
449 
/**
 * One row of the CANONICALIZE_MAP table: an input locale ID, the ID it is
 * canonicalized to, and an optional keyword/value pair associated with the
 * mapping.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if keyword==NULL */
} CanonicalizationMap;
456 
457 /**
458  * A map to canonicalize locale IDs.  This handles a variety of
459  * different semantic kinds of transformations.
460  */
/*
 * Each row is { id, canonicalID, keyword, value }.  Rows whose keyword is
 * non-NULL (for example the *_PREEURO IDs) carry a keyword/value pair in
 * addition to the canonical ID; all other rows have NULL for both.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
    { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
    { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
};
509 
/**
 * One row of the VARIANT_MAP table: a variant name and the keyword/value
 * pair associated with it.
 */
typedef struct VariantMap {
    const char *variant;     /* input variant name */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if keyword==NULL */
} VariantMap;
515 
/**
 * Variant names that have a keyword/value equivalent.  Each row is
 * { variant, keyword, value }.
 */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
521 
522 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 * True when id is non-NULL, contains no '@', and getShortestSubtagLength()
 * reports a one-character subtag.
 *
 * The expansion refers only to its own parameter (it used to reach for a
 * caller-scope variable named localeID), so it is correct for any argument
 * name.  The argument is evaluated more than once: pass a side-effect-free
 * expression.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails:
 * finalID is set to buffer when uloc_forLanguageTag() succeeds with a
 * positive length, and to the unmodified id otherwise.
 *
 * This expands to a bare if/else statement (it is deliberately not wrapped
 * in do { } while (0), so existing call sites keep compiling); use it only
 * as a complete statement inside its own braces.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) \
        if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || U_FAILURE(*(err))) { \
            (finalID)=(id); \
        } else { \
            (finalID)=(buffer); \
        }
532 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)533 static int32_t getShortestSubtagLength(const char *localeID) {
534     int32_t localeIDLength = uprv_strlen(localeID);
535     int32_t length = localeIDLength;
536     int32_t tmpLength = 0;
537     int32_t i;
538     UBool reset = TRUE;
539 
540     for (i = 0; i < localeIDLength; i++) {
541         if (localeID[i] != '_' && localeID[i] != '-') {
542             if (reset) {
543                 tmpLength = 0;
544                 reset = FALSE;
545             }
546             tmpLength++;
547         } else {
548             if (tmpLength != 0 && tmpLength < length) {
549                 length = tmpLength;
550             }
551             reset = TRUE;
552         }
553     }
554 
555     return length;
556 }
557 
558 /* ### Keywords **************************************************/
559 
/*
 * Size of the keyword-name buffers (KeywordStruct.keyword and the buf
 * argument of locale_canonKeywordName).  A keyword name must be shorter
 * than this so that its terminating NUL also fits.
 */
#define ULOC_KEYWORD_BUFFER_LEN 25
/*
 * Capacity of the keywordList array in _getKeywords(); a locale ID with
 * more keywords than this is rejected with U_INTERNAL_PROGRAM_ERROR.
 */
#define ULOC_MAX_NO_KEYWORDS 25
562 
563 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)564 locale_getKeywordsStart(const char *localeID) {
565     const char *result = NULL;
566     if((result = uprv_strchr(localeID, '@')) != NULL) {
567         return result;
568     }
569 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
570     else {
571         /* We do this because the @ sign is variant, and the @ sign used on one
572         EBCDIC machine won't be compiled the same way on other EBCDIC based
573         machines. */
574         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
575         const uint8_t *charToFind = ebcdicSigns;
576         while(*charToFind) {
577             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
578                 return result;
579             }
580             charToFind++;
581         }
582     }
583 #endif
584     return NULL;
585 }
586 
587 /**
588  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
589  * @param keywordName incoming name to be canonicalized
590  * @param status return status (keyword too long)
591  * @return length of the keyword name
592  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)593 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
594 {
595   int32_t i;
596   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
597 
598   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
599     /* keyword name too long for internal buffer */
600     *status = U_INTERNAL_PROGRAM_ERROR;
601           return 0;
602   }
603 
604   /* normalize the keyword name */
605   for(i = 0; i < keywordNameLen; i++) {
606     buf[i] = uprv_tolower(keywordName[i]);
607   }
608   buf[i] = 0;
609 
610   return keywordNameLen;
611 }
612 
/* One keyword entry, as collected in the keywordList array of _getKeywords(). */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased keyword name; the key compared by compareKeywordStructs() */
    int32_t keywordLen;                    /* presumably the length of keyword -- confirm in _getKeywords() */
    const char *valueStart;                /* presumably the start of the value text -- confirm in _getKeywords() */
    int32_t valueLen;                      /* presumably the length of the text at valueStart -- confirm */
} KeywordStruct;
619 
620 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)621 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
622     const char* leftString = ((const KeywordStruct *)left)->keyword;
623     const char* rightString = ((const KeywordStruct *)right)->keyword;
624     return uprv_strcmp(leftString, rightString);
625 }
626 
627 /**
628  * Both addKeyword and addValue must already be in canonical form.
629  * Either both addKeyword and addValue are NULL, or neither is NULL.
630  * If they are not NULL they must be zero terminated.
 * If addKeyword is not NULL it must have length small enough to fit in KeywordStruct.keyword.
632  */
633 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,const char * addKeyword,const char * addValue,UErrorCode * status)634 _getKeywords(const char *localeID,
635              char prev,
636              char *keywords, int32_t keywordCapacity,
637              char *values, int32_t valuesCapacity, int32_t *valLen,
638              UBool valuesToo,
639              const char* addKeyword,
640              const char* addValue,
641              UErrorCode *status)
642 {
643     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
644 
645     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
646     int32_t numKeywords = 0;
647     const char* pos = localeID;
648     const char* equalSign = NULL;
649     const char* semicolon = NULL;
650     int32_t i = 0, j, n;
651     int32_t keywordsLen = 0;
652     int32_t valuesLen = 0;
653 
654     if(prev == '@') { /* start of keyword definition */
655         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
656         do {
657             UBool duplicate = FALSE;
658             /* skip leading spaces */
659             while(*pos == ' ') {
660                 pos++;
661             }
662             if (!*pos) { /* handle trailing "; " */
663                 break;
664             }
665             if(numKeywords == maxKeywords) {
666                 *status = U_INTERNAL_PROGRAM_ERROR;
667                 return 0;
668             }
669             equalSign = uprv_strchr(pos, '=');
670             semicolon = uprv_strchr(pos, ';');
671             /* lack of '=' [foo@currency] is illegal */
672             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
673             if(!equalSign || (semicolon && semicolon<equalSign)) {
674                 *status = U_INVALID_FORMAT_ERROR;
675                 return 0;
676             }
677             /* need to normalize both keyword and keyword name */
678             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
679                 /* keyword name too long for internal buffer */
680                 *status = U_INTERNAL_PROGRAM_ERROR;
681                 return 0;
682             }
683             for(i = 0, n = 0; i < equalSign - pos; ++i) {
684                 if (pos[i] != ' ') {
685                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
686                 }
687             }
688 
689             /* zero-length keyword is an error. */
690             if (n == 0) {
691                 *status = U_INVALID_FORMAT_ERROR;
692                 return 0;
693             }
694 
695             keywordList[numKeywords].keyword[n] = 0;
696             keywordList[numKeywords].keywordLen = n;
697             /* now grab the value part. First we skip the '=' */
698             equalSign++;
699             /* then we leading spaces */
700             while(*equalSign == ' ') {
701                 equalSign++;
702             }
703 
704             /* Premature end or zero-length value */
705             if (!*equalSign || equalSign == semicolon) {
706                 *status = U_INVALID_FORMAT_ERROR;
707                 return 0;
708             }
709 
710             keywordList[numKeywords].valueStart = equalSign;
711 
712             pos = semicolon;
713             i = 0;
714             if(pos) {
715                 while(*(pos - i - 1) == ' ') {
716                     i++;
717                 }
718                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
719                 pos++;
720             } else {
721                 i = (int32_t)uprv_strlen(equalSign);
722                 while(i && equalSign[i-1] == ' ') {
723                     i--;
724                 }
725                 keywordList[numKeywords].valueLen = i;
726             }
727             /* If this is a duplicate keyword, then ignore it */
728             for (j=0; j<numKeywords; ++j) {
729                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
730                     duplicate = TRUE;
731                     break;
732                 }
733             }
734             if (!duplicate) {
735                 ++numKeywords;
736             }
737         } while(pos);
738 
739         /* Handle addKeyword/addValue. */
740         if (addKeyword != NULL) {
741             UBool duplicate = FALSE;
742             U_ASSERT(addValue != NULL);
743             /* Search for duplicate; if found, do nothing. Explicit keyword
744                overrides addKeyword. */
745             for (j=0; j<numKeywords; ++j) {
746                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
747                     duplicate = TRUE;
748                     break;
749                 }
750             }
751             if (!duplicate) {
752                 if (numKeywords == maxKeywords) {
753                     *status = U_INTERNAL_PROGRAM_ERROR;
754                     return 0;
755                 }
756                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
757                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
758                 keywordList[numKeywords].valueStart = addValue;
759                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
760                 ++numKeywords;
761             }
762         } else {
763             U_ASSERT(addValue == NULL);
764         }
765 
766         /* now we have a list of keywords */
767         /* we need to sort it */
768         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
769 
770         /* Now construct the keyword part */
771         for(i = 0; i < numKeywords; i++) {
772             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
773                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
774                 if(valuesToo) {
775                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
776                 } else {
777                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
778                 }
779             }
780             keywordsLen += keywordList[i].keywordLen + 1;
781             if(valuesToo) {
782                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
783                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
784                 }
785                 keywordsLen += keywordList[i].valueLen;
786 
787                 if(i < numKeywords - 1) {
788                     if(keywordsLen < keywordCapacity) {
789                         keywords[keywordsLen] = ';';
790                     }
791                     keywordsLen++;
792                 }
793             }
794             if(values) {
795                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
796                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
797                     values[valuesLen + keywordList[i].valueLen] = 0;
798                 }
799                 valuesLen += keywordList[i].valueLen + 1;
800             }
801         }
802         if(values) {
803             values[valuesLen] = 0;
804             if(valLen) {
805                 *valLen = valuesLen;
806             }
807         }
808         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
809     } else {
810         return 0;
811     }
812 }
813 
814 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)815 locale_getKeywords(const char *localeID,
816                    char prev,
817                    char *keywords, int32_t keywordCapacity,
818                    char *values, int32_t valuesCapacity, int32_t *valLen,
819                    UBool valuesToo,
820                    UErrorCode *status) {
821     return _getKeywords(localeID, prev, keywords, keywordCapacity,
822                         values, valuesCapacity, valLen, valuesToo,
823                         NULL, NULL, status);
824 }
825 
826 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)827 uloc_getKeywordValue(const char* localeID,
828                      const char* keywordName,
829                      char* buffer, int32_t bufferCapacity,
830                      UErrorCode* status)
831 {
832     const char* startSearchHere = NULL;
833     const char* nextSeparator = NULL;
834     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
835     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
836     int32_t i = 0;
837     int32_t result = 0;
838 
839     if(status && U_SUCCESS(*status) && localeID) {
840       char tempBuffer[ULOC_FULLNAME_CAPACITY];
841       const char* tmpLocaleID;
842 
843       if (_hasBCP47Extension(localeID)) {
844           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
845       } else {
846           tmpLocaleID=localeID;
847       }
848 
849       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
850       if(startSearchHere == NULL) {
851           /* no keywords, return at once */
852           return 0;
853       }
854 
855       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
856       if(U_FAILURE(*status)) {
857         return 0;
858       }
859 
860       /* find the first keyword */
861       while(startSearchHere) {
862           startSearchHere++;
863           /* skip leading spaces (allowed?) */
864           while(*startSearchHere == ' ') {
865               startSearchHere++;
866           }
867           nextSeparator = uprv_strchr(startSearchHere, '=');
868           /* need to normalize both keyword and keyword name */
869           if(!nextSeparator) {
870               break;
871           }
872           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
873               /* keyword name too long for internal buffer */
874               *status = U_INTERNAL_PROGRAM_ERROR;
875               return 0;
876           }
877           for(i = 0; i < nextSeparator - startSearchHere; i++) {
878               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
879           }
880           /* trim trailing spaces */
881           while(startSearchHere[i-1] == ' ') {
882               i--;
883               U_ASSERT(i>=0);
884           }
885           localeKeywordNameBuffer[i] = 0;
886 
887           startSearchHere = uprv_strchr(nextSeparator, ';');
888 
889           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
890               nextSeparator++;
891               while(*nextSeparator == ' ') {
892                   nextSeparator++;
893               }
894               /* we actually found the keyword. Copy the value */
895               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
896                   while(*(startSearchHere-1) == ' ') {
897                       startSearchHere--;
898                   }
899                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
900                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
901               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
902                   i = (int32_t)uprv_strlen(nextSeparator);
903                   while(nextSeparator[i - 1] == ' ') {
904                       i--;
905                   }
906                   uprv_strncpy(buffer, nextSeparator, i);
907                   result = u_terminateChars(buffer, bufferCapacity, i, status);
908               } else {
909                   /* give a bigger buffer, please */
910                   *status = U_BUFFER_OVERFLOW_ERROR;
911                   if(startSearchHere) {
912                       result = (int32_t)(startSearchHere - nextSeparator);
913                   } else {
914                       result = (int32_t)uprv_strlen(nextSeparator);
915                   }
916               }
917               return result;
918           }
919       }
920     }
921     return 0;
922 }
923 
924 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)925 uloc_setKeywordValue(const char* keywordName,
926                      const char* keywordValue,
927                      char* buffer, int32_t bufferCapacity,
928                      UErrorCode* status)
929 {
930     /* TODO: sorting. removal. */
931     int32_t keywordNameLen;
932     int32_t keywordValueLen;
933     int32_t bufLen;
934     int32_t needLen = 0;
935     int32_t foundValueLen;
936     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
937     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
938     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
939     int32_t i = 0;
940     int32_t rc;
941     char* nextSeparator = NULL;
942     char* nextEqualsign = NULL;
943     char* startSearchHere = NULL;
944     char* keywordStart = NULL;
945     char *insertHere = NULL;
946     if(U_FAILURE(*status)) {
947         return -1;
948     }
949     if(bufferCapacity>1) {
950         bufLen = (int32_t)uprv_strlen(buffer);
951     } else {
952         *status = U_ILLEGAL_ARGUMENT_ERROR;
953         return 0;
954     }
955     if(bufferCapacity<bufLen) {
956         /* The capacity is less than the length?! Is this NULL terminated? */
957         *status = U_ILLEGAL_ARGUMENT_ERROR;
958         return 0;
959     }
960     if(keywordValue && !*keywordValue) {
961         keywordValue = NULL;
962     }
963     if(keywordValue) {
964         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
965     } else {
966         keywordValueLen = 0;
967     }
968     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
969     if(U_FAILURE(*status)) {
970         return 0;
971     }
972     startSearchHere = (char*)locale_getKeywordsStart(buffer);
973     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
974         if(!keywordValue) { /* no keywords = nothing to remove */
975             return bufLen;
976         }
977 
978         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
979         if(startSearchHere) { /* had a single @ */
980             needLen--; /* already had the @ */
981             /* startSearchHere points at the @ */
982         } else {
983             startSearchHere=buffer+bufLen;
984         }
985         if(needLen >= bufferCapacity) {
986             *status = U_BUFFER_OVERFLOW_ERROR;
987             return needLen; /* no change */
988         }
989         *startSearchHere = '@';
990         startSearchHere++;
991         uprv_strcpy(startSearchHere, keywordNameBuffer);
992         startSearchHere += keywordNameLen;
993         *startSearchHere = '=';
994         startSearchHere++;
995         uprv_strcpy(startSearchHere, keywordValue);
996         startSearchHere+=keywordValueLen;
997         return needLen;
998     } /* end shortcut - no @ */
999 
1000     keywordStart = startSearchHere;
1001     /* search for keyword */
1002     while(keywordStart) {
1003         keywordStart++;
1004         /* skip leading spaces (allowed?) */
1005         while(*keywordStart == ' ') {
1006             keywordStart++;
1007         }
1008         nextEqualsign = uprv_strchr(keywordStart, '=');
1009         /* need to normalize both keyword and keyword name */
1010         if(!nextEqualsign) {
1011             break;
1012         }
1013         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1014             /* keyword name too long for internal buffer */
1015             *status = U_INTERNAL_PROGRAM_ERROR;
1016             return 0;
1017         }
1018         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1019             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1020         }
1021         /* trim trailing spaces */
1022         while(keywordStart[i-1] == ' ') {
1023             i--;
1024         }
1025         U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
1026         localeKeywordNameBuffer[i] = 0;
1027 
1028         nextSeparator = uprv_strchr(nextEqualsign, ';');
1029         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1030         if(rc == 0) {
1031             nextEqualsign++;
1032             while(*nextEqualsign == ' ') {
1033                 nextEqualsign++;
1034             }
1035             /* we actually found the keyword. Change the value */
1036             if (nextSeparator) {
1037                 keywordAtEnd = 0;
1038                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1039             } else {
1040                 keywordAtEnd = 1;
1041                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1042             }
1043             if(keywordValue) { /* adding a value - not removing */
1044               if(foundValueLen == keywordValueLen) {
1045                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1046                 return bufLen; /* no change in size */
1047               } else if(foundValueLen > keywordValueLen) {
1048                 int32_t delta = foundValueLen - keywordValueLen;
1049                 if(nextSeparator) { /* RH side */
1050                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1051                 }
1052                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1053                 bufLen -= delta;
1054                 buffer[bufLen]=0;
1055                 return bufLen;
1056               } else { /* FVL < KVL */
1057                 int32_t delta = keywordValueLen - foundValueLen;
1058                 if((bufLen+delta) >= bufferCapacity) {
1059                   *status = U_BUFFER_OVERFLOW_ERROR;
1060                   return bufLen+delta;
1061                 }
1062                 if(nextSeparator) { /* RH side */
1063                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1064                 }
1065                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1066                 bufLen += delta;
1067                 buffer[bufLen]=0;
1068                 return bufLen;
1069               }
1070             } else { /* removing a keyword */
1071               if(keywordAtEnd) {
1072                 /* zero out the ';' or '@' just before startSearchhere */
1073                 keywordStart[-1] = 0;
1074                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1075               } else {
1076                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1077                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1078                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1079               }
1080             }
1081         } else if(rc<0){ /* end match keyword */
1082           /* could insert at this location. */
1083           insertHere = keywordStart;
1084         }
1085         keywordStart = nextSeparator;
1086     } /* end loop searching */
1087 
1088     if(!keywordValue) {
1089       return bufLen; /* removal of non-extant keyword - no change */
1090     }
1091 
1092     /* we know there is at least one keyword. */
1093     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1094     if(needLen >= bufferCapacity) {
1095         *status = U_BUFFER_OVERFLOW_ERROR;
1096         return needLen; /* no change */
1097     }
1098 
1099     if(insertHere) {
1100       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1101       keywordStart = insertHere;
1102     } else {
1103       keywordStart = buffer+bufLen;
1104       *keywordStart = ';';
1105       keywordStart++;
1106     }
1107     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1108     keywordStart += keywordNameLen;
1109     *keywordStart = '=';
1110     keywordStart++;
1111     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1112     keywordStart+=keywordValueLen;
1113     if(insertHere) {
1114       *keywordStart = ';';
1115       keywordStart++;
1116     }
1117     buffer[needLen]=0;
1118     return needLen;
1119 }
1120 
1121 /* ### ID parsing implementation **************************************************/
1122 
/* Note: these macros evaluate their argument more than once; do not pass
   expressions with side effects. Arguments are parenthesized so that
   expressions containing lower-precedence operators expand correctly. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1133 
_strnchr(const char * str,int32_t len,char c)1134 static char* _strnchr(const char* str, int32_t len, char c) {
1135     U_ASSERT(str != 0 && len >= 0);
1136     while (len-- != 0) {
1137         char d = *str;
1138         if (d == c) {
1139             return (char*) str;
1140         } else if (d == 0) {
1141             break;
1142         }
1143         ++str;
1144     }
1145     return NULL;
1146 }
1147 
1148 /**
1149  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1150  * a NULL entry, followed by more entries, and a second NULL entry.
1151  *
1152  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1153  * COUNTRIES_3.
1154  */
_findIndex(const char * const * list,const char * key)1155 static int16_t _findIndex(const char* const* list, const char* key)
1156 {
1157     const char* const* anchor = list;
1158     int32_t pass = 0;
1159 
1160     /* Make two passes through two NULL-terminated arrays at 'list' */
1161     while (pass++ < 2) {
1162         while (*list) {
1163             if (uprv_strcmp(key, *list) == 0) {
1164                 return (int16_t)(list - anchor);
1165             }
1166             list++;
1167         }
1168         ++list;     /* skip final NULL *CWB*/
1169     }
1170     return -1;
1171 }
1172 
1173 /* count the length of src while copying it to dest; return strlen(src) */
1174 static inline int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1175 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1176     const char *anchor;
1177     char c;
1178 
1179     anchor=src;
1180     for(;;) {
1181         if((c=*src)==0) {
1182             return (int32_t)(src-anchor);
1183         }
1184         if(destCapacity<=0) {
1185             return (int32_t)((src-anchor)+uprv_strlen(src));
1186         }
1187         ++src;
1188         *dest++=c;
1189         --destCapacity;
1190     }
1191 }
1192 
1193 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1194 uloc_getCurrentCountryID(const char* oldID){
1195     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1196     if (offset >= 0) {
1197         return REPLACEMENT_COUNTRIES[offset];
1198     }
1199     return oldID;
1200 }
1201 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1202 uloc_getCurrentLanguageID(const char* oldID){
1203     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1204     if (offset >= 0) {
1205         return REPLACEMENT_LANGUAGES[offset];
1206     }
1207     return oldID;
1208 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript(),
 * ulocimp_getCountry() and _getVariant() avoid duplicating the code that
 * handles the earlier locale ID pieces in the functions for the later ones:
 * the ulocimp_ functions set the *pEnd pointer to where they stopped parsing.
 *
 * TODO try to use this in Locale
 */
1217 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1218 ulocimp_getLanguage(const char *localeID,
1219                     char *language, int32_t languageCapacity,
1220                     const char **pEnd) {
1221     int32_t i=0;
1222     int32_t offset;
1223     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1224 
1225     /* if it starts with i- or x- then copy that prefix */
1226     if(_isIDPrefix(localeID)) {
1227         if(i<languageCapacity) {
1228             language[i]=(char)uprv_tolower(*localeID);
1229         }
1230         if(i<languageCapacity) {
1231             language[i+1]='-';
1232         }
1233         i+=2;
1234         localeID+=2;
1235     }
1236 
1237     /* copy the language as far as possible and count its length */
1238     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1239         if(i<languageCapacity) {
1240             language[i]=(char)uprv_tolower(*localeID);
1241         }
1242         if(i<3) {
1243             U_ASSERT(i>=0);
1244             lang[i]=(char)uprv_tolower(*localeID);
1245         }
1246         i++;
1247         localeID++;
1248     }
1249 
1250     if(i==3) {
1251         /* convert 3 character code to 2 character code if possible *CWB*/
1252         offset=_findIndex(LANGUAGES_3, lang);
1253         if(offset>=0) {
1254             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1255         }
1256     }
1257 
1258     if(pEnd!=NULL) {
1259         *pEnd=localeID;
1260     }
1261     return i;
1262 }
1263 
1264 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1265 ulocimp_getScript(const char *localeID,
1266                   char *script, int32_t scriptCapacity,
1267                   const char **pEnd)
1268 {
1269     int32_t idLen = 0;
1270 
1271     if (pEnd != NULL) {
1272         *pEnd = localeID;
1273     }
1274 
1275     /* copy the second item as far as possible and count its length */
1276     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1277             && uprv_isASCIILetter(localeID[idLen])) {
1278         idLen++;
1279     }
1280 
1281     /* If it's exactly 4 characters long, then it's a script and not a country. */
1282     if (idLen == 4) {
1283         int32_t i;
1284         if (pEnd != NULL) {
1285             *pEnd = localeID+idLen;
1286         }
1287         if(idLen > scriptCapacity) {
1288             idLen = scriptCapacity;
1289         }
1290         if (idLen >= 1) {
1291             script[0]=(char)uprv_toupper(*(localeID++));
1292         }
1293         for (i = 1; i < idLen; i++) {
1294             script[i]=(char)uprv_tolower(*(localeID++));
1295         }
1296     }
1297     else {
1298         idLen = 0;
1299     }
1300     return idLen;
1301 }
1302 
1303 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1304 ulocimp_getCountry(const char *localeID,
1305                    char *country, int32_t countryCapacity,
1306                    const char **pEnd)
1307 {
1308     int32_t idLen=0;
1309     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1310     int32_t offset;
1311 
1312     /* copy the country as far as possible and count its length */
1313     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1314         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1315             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1316         }
1317         idLen++;
1318     }
1319 
1320     /* the country should be either length 2 or 3 */
1321     if (idLen == 2 || idLen == 3) {
1322         UBool gotCountry = FALSE;
1323         /* convert 3 character code to 2 character code if possible *CWB*/
1324         if(idLen==3) {
1325             offset=_findIndex(COUNTRIES_3, cnty);
1326             if(offset>=0) {
1327                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1328                 gotCountry = TRUE;
1329             }
1330         }
1331         if (!gotCountry) {
1332             int32_t i = 0;
1333             for (i = 0; i < idLen; i++) {
1334                 if (i < countryCapacity) {
1335                     country[i]=(char)uprv_toupper(localeID[i]);
1336                 }
1337             }
1338         }
1339         localeID+=idLen;
1340     } else {
1341         idLen = 0;
1342     }
1343 
1344     if(pEnd!=NULL) {
1345         *pEnd=localeID;
1346     }
1347 
1348     return idLen;
1349 }
1350 
1351 /**
1352  * @param needSeparator if true, then add leading '_' if any variants
1353  * are added to 'variant'
1354  */
1355 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1356 _getVariantEx(const char *localeID,
1357               char prev,
1358               char *variant, int32_t variantCapacity,
1359               UBool needSeparator) {
1360     int32_t i=0;
1361 
1362     /* get one or more variant tags and separate them with '_' */
1363     if(_isIDSeparator(prev)) {
1364         /* get a variant string after a '-' or '_' */
1365         while(!_isTerminator(*localeID)) {
1366             if (needSeparator) {
1367                 if (i<variantCapacity) {
1368                     variant[i] = '_';
1369                 }
1370                 ++i;
1371                 needSeparator = FALSE;
1372             }
1373             if(i<variantCapacity) {
1374                 variant[i]=(char)uprv_toupper(*localeID);
1375                 if(variant[i]=='-') {
1376                     variant[i]='_';
1377                 }
1378             }
1379             i++;
1380             localeID++;
1381         }
1382     }
1383 
1384     /* if there is no variant tag after a '-' or '_' then look for '@' */
1385     if(i==0) {
1386         if(prev=='@') {
1387             /* keep localeID */
1388         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1389             ++localeID; /* point after the '@' */
1390         } else {
1391             return 0;
1392         }
1393         while(!_isTerminator(*localeID)) {
1394             if (needSeparator) {
1395                 if (i<variantCapacity) {
1396                     variant[i] = '_';
1397                 }
1398                 ++i;
1399                 needSeparator = FALSE;
1400             }
1401             if(i<variantCapacity) {
1402                 variant[i]=(char)uprv_toupper(*localeID);
1403                 if(variant[i]=='-' || variant[i]==',') {
1404                     variant[i]='_';
1405                 }
1406             }
1407             i++;
1408             localeID++;
1409         }
1410     }
1411 
1412     return i;
1413 }
1414 
1415 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1416 _getVariant(const char *localeID,
1417             char prev,
1418             char *variant, int32_t variantCapacity) {
1419     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1420 }
1421 
1422 /**
1423  * Delete ALL instances of a variant from the given list of one or
1424  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1425  * @param variants the source string of one or more variants,
1426  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1427  * terminated; if it is, trailing zero will NOT be maintained.
1428  * @param variantsLen length of variants
1429  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1430  * or "PREEURO"; not zero terminated
1431  * @param toDeleteLen length of toDelete
1432  * @return number of characters deleted from variants
1433  */
1434 static int32_t
_deleteVariant(char * variants,int32_t variantsLen,const char * toDelete,int32_t toDeleteLen)1435 _deleteVariant(char* variants, int32_t variantsLen,
1436                const char* toDelete, int32_t toDeleteLen)
1437 {
1438     int32_t delta = 0; /* number of chars deleted */
1439     for (;;) {
1440         UBool flag = FALSE;
1441         if (variantsLen < toDeleteLen) {
1442             return delta;
1443         }
1444         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1445             (variantsLen == toDeleteLen ||
1446              (flag=(variants[toDeleteLen] == '_'))))
1447         {
1448             int32_t d = toDeleteLen + (flag?1:0);
1449             variantsLen -= d;
1450             delta += d;
1451             if (variantsLen > 0) {
1452                 uprv_memmove(variants, variants+d, variantsLen);
1453             }
1454         } else {
1455             char* p = _strnchr(variants, variantsLen, '_');
1456             if (p == NULL) {
1457                 return delta;
1458             }
1459             ++p;
1460             variantsLen -= (int32_t)(p - variants);
1461             variants = p;
1462         }
1463     }
1464 }
1465 
1466 /* Keyword enumeration */
1467 
/* State shared by the uloc_kw_* enumeration callbacks. */
typedef struct UKeywordsContext {
    char* keywords; /* heap buffer of consecutive NUL-terminated keywords; an empty string ends the list */
    char* current;  /* read cursor into keywords: the entry uloc_kw_nextKeyword returns next */
} UKeywordsContext;
1472 
1473 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1474 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1475     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1476     uprv_free(enumerator->context);
1477     uprv_free(enumerator);
1478 }
1479 
1480 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1481 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1482     char *kw = ((UKeywordsContext *)en->context)->keywords;
1483     int32_t result = 0;
1484     while(*kw) {
1485         result++;
1486         kw += uprv_strlen(kw)+1;
1487     }
1488     return result;
1489 }
1490 
1491 static const char* U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1492 uloc_kw_nextKeyword(UEnumeration* en,
1493                     int32_t* resultLength,
1494                     UErrorCode* /*status*/) {
1495     const char* result = ((UKeywordsContext *)en->context)->current;
1496     int32_t len = 0;
1497     if(*result) {
1498         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1499         ((UKeywordsContext *)en->context)->current += len+1;
1500     } else {
1501         result = NULL;
1502     }
1503     if (resultLength) {
1504         *resultLength = len;
1505     }
1506     return result;
1507 }
1508 
1509 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1510 uloc_kw_resetKeywords(UEnumeration* en,
1511                       UErrorCode* /*status*/) {
1512     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1513 }
1514 
/* Template UEnumeration for keyword lists.  uloc_openKeywordList copies it
   into a freshly allocated enumeration and then fills in 'context'. */
static const UEnumeration gKeywordsEnum = {
    NULL,                   /* presumably baseContext (unused here) -- confirm against uenumimp.h */
    NULL,                   /* presumably context; assigned per instance in uloc_openKeywordList */
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1524 
1525 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1526 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1527 {
1528     UKeywordsContext *myContext = NULL;
1529     UEnumeration *result = NULL;
1530 
1531     if(U_FAILURE(*status)) {
1532         return NULL;
1533     }
1534     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1535     /* Null pointer test */
1536     if (result == NULL) {
1537         *status = U_MEMORY_ALLOCATION_ERROR;
1538         return NULL;
1539     }
1540     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1541     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1542     if (myContext == NULL) {
1543         *status = U_MEMORY_ALLOCATION_ERROR;
1544         uprv_free(result);
1545         return NULL;
1546     }
1547     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1548     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1549     myContext->keywords[keywordListSize] = 0;
1550     myContext->current = myContext->keywords;
1551     result->context = myContext;
1552     return result;
1553 }
1554 
1555 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1556 uloc_openKeywords(const char* localeID,
1557                         UErrorCode* status)
1558 {
1559     int32_t i=0;
1560     char keywords[256];
1561     int32_t keywordsCapacity = 256;
1562     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1563     const char* tmpLocaleID;
1564 
1565     if(status==NULL || U_FAILURE(*status)) {
1566         return 0;
1567     }
1568 
1569     if (_hasBCP47Extension(localeID)) {
1570         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1571     } else {
1572         if (localeID==NULL) {
1573            localeID=uloc_getDefault();
1574         }
1575         tmpLocaleID=localeID;
1576     }
1577 
1578     /* Skip the language */
1579     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1580     if(_isIDSeparator(*tmpLocaleID)) {
1581         const char *scriptID;
1582         /* Skip the script if available */
1583         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1584         if(scriptID != tmpLocaleID+1) {
1585             /* Found optional script */
1586             tmpLocaleID = scriptID;
1587         }
1588         /* Skip the Country */
1589         if (_isIDSeparator(*tmpLocaleID)) {
1590             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1591             if(_isIDSeparator(*tmpLocaleID)) {
1592                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1593             }
1594         }
1595     }
1596 
1597     /* keywords are located after '@' */
1598     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1599         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1600     }
1601 
1602     if(i) {
1603         return uloc_openKeywordList(keywords, i, status);
1604     } else {
1605         return NULL;
1606     }
1607 }
1608 
1609 
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True when at least one bit of 'mask' is set in 'options'.  Both arguments
   are parenthesized so that compound expressions such as (A | B) are
   evaluated as a whole before the '&'. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The tag "i-default", deliberately without a terminating NUL;
   always used together with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH (sizeof i_default / sizeof i_default[0])
1618 
/**
 * Canonicalize the given localeID, to level 1 or to level 2,
 * depending on the options.  To specify level 1, pass in options=0.
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
 *
 * This is the code underlying uloc_getName, uloc_getBaseName and
 * uloc_canonicalize.
 *
 * @param localeID the locale ID to process; NULL selects uloc_getDefault()
 * @param result destination buffer; may be NULL, in which case the text is
 * built in a local buffer and only the length is reported
 * @param resultCapacity capacity of result
 * @param options bit set of _ULOC_CANONICALIZE and/or _ULOC_STRIP_KEYWORDS
 * @param err in/out error code; the function returns 0 at once if it already
 * indicates a failure
 * @return the value returned by u_terminateChars for the computed length
 */
static int32_t
_canonicalize(const char* localeID,
              char* result,
              int32_t resultCapacity,
              uint32_t options,
              UErrorCode* err) {
    int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
    char localeBuffer[ULOC_FULLNAME_CAPACITY];
    char tempBuffer[ULOC_FULLNAME_CAPACITY];
    const char* origLocaleID;
    const char* tmpLocaleID;
    const char* keywordAssign = NULL;
    const char* separatorIndicator = NULL;
    const char* addKeyword = NULL;
    const char* addValue = NULL;
    char* name;
    char* variant = NULL; /* pointer into name, or NULL */

    if (U_FAILURE(*err)) {
        return 0;
    }

    /* Choose the text to parse: _ConvertBCP47 assigns tmpLocaleID (tempBuffer
       is handed to it as scratch space); otherwise the ID is parsed as given. */
    if (_hasBCP47Extension(localeID)) {
        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
    } else {
        if (localeID==NULL) {
           localeID=uloc_getDefault();
        }
        tmpLocaleID=localeID;
    }

    origLocaleID=tmpLocaleID;

    /* if we are doing a full canonicalization, then put results in
       localeBuffer, if necessary; otherwise send them to result. */
    /* Note: with the option test commented out below, localeBuffer is used
       whenever result is NULL or smaller than localeBuffer, regardless of
       options; the text is copied back to result at the end. */
    if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
        (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
        name = localeBuffer;
        nameCapacity = (int32_t)sizeof(localeBuffer);
    } else {
        name = result;
        nameCapacity = resultCapacity;
    }

    /* get all pieces, one after another, and separate with '_' */
    /* Throughout, len counts the characters the full output needs; writes
       into name are skipped once len reaches nameCapacity. */
    len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);

    if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
        /* "i-default" is replaced by the default locale ID */
        const char *d = uloc_getDefault();

        len = (int32_t)uprv_strlen(d);

        if (name != NULL) {
            /* NOTE(review): this copy is not bounded by nameCapacity; it
               relies on the default locale ID fitting into name -- confirm. */
            uprv_strncpy(name, d, len);
        }
    } else if(_isIDSeparator(*tmpLocaleID)) {
        const char *scriptID;

        ++fieldCount;
        if(len<nameCapacity) {
            name[len]='_';
        }
        ++len;

        scriptSize=ulocimp_getScript(tmpLocaleID+1,
            (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
        if(scriptSize > 0) {
            /* Found optional script */
            tmpLocaleID = scriptID;
            ++fieldCount;
            len+=scriptSize;
            if (_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _ */
                if(len<nameCapacity) {
                    name[len]='_';
                }
                ++len;
            }
        }

        if (_isIDSeparator(*tmpLocaleID)) {
            const char *cntryID;
            int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
                (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
            if (cntrySize > 0) {
                /* Found optional country */
                tmpLocaleID = cntryID;
                len+=cntrySize;
            }
            if(_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _  if we found country before. */
                if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
                    ++fieldCount;
                    if(len<nameCapacity) {
                        name[len]='_';
                    }
                    ++len;
                }

                variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
                    (len<nameCapacity ? name+len : NULL), nameCapacity-len);
                if (variantSize > 0) {
                    /* remember where the variant starts so that it can be post-processed below */
                    variant = len<nameCapacity ? name+len : NULL;
                    len += variantSize;
                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
                }
            }
        }
    }

    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    /* Level 1 only: everything from '.' up to (not including) '@' or the end is copied verbatim. */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
        UBool done = FALSE;
        do {
            char c = *tmpLocaleID;
            switch (c) {
            case 0:
            case '@':
                done = TRUE;
                break;
            default:
                if (len<nameCapacity) {
                    name[len] = c;
                }
                ++len;
                ++tmpLocaleID;
                break;
            }
        } while (!done);
    }

    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
       After this, tmpLocaleID either points to '@' or is NULL */
    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
        keywordAssign = uprv_strchr(tmpLocaleID, '=');
        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
    }

    /* Copy POSIX-style variant, if any [mr@FOO] */
    /* Level 1 only: an '@' section without any '=' is copied verbatim, '@' included. */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
        tmpLocaleID != NULL && keywordAssign == NULL) {
        for (;;) {
            char c = *tmpLocaleID;
            if (c == 0) {
                break;
            }
            if (len<nameCapacity) {
                name[len] = c;
            }
            ++len;
            ++tmpLocaleID;
        }
    }

    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
        /* Handle @FOO variant if @ is present and not followed by = */
        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
            int32_t posixVariantSize;
            /* Add missing '_' if needed */
            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
                do {
                    if(len<nameCapacity) {
                        name[len]='_';
                    }
                    ++len;
                    ++fieldCount;
                } while(fieldCount<2);
            }
            /* The last argument asks for a separator when a regular variant was already copied. */
            posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
                                             (UBool)(variantSize > 0));
            if (posixVariantSize > 0) {
                if (variant == NULL) {
                    variant = name+len;
                }
                len += posixVariantSize;
                variantSize += posixVariantSize;
            }
        }

        /* Handle generic variants first */
        /* The first VARIANT_MAP entry found in the variant text is deleted
           from it and replaced by that entry's keyword/value pair. */
        if (variant) {
            for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
                const char* variantToCompare = VARIANT_MAP[j].variant;
                int32_t n = (int32_t)uprv_strlen(variantToCompare);
                int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
                len -= variantLen;
                if (variantLen > 0) {
                    if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
                        --len;
                    }
                    addKeyword = VARIANT_MAP[j].keyword;
                    addValue = VARIANT_MAP[j].value;
                    break;
                }
            }
            if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
                --len;
            }
        }

        /* Look up the ID in the canonicalization map */
        /* Only an exact match of the whole name built so far is remapped. */
        for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
            const char* id = CANONICALIZE_MAP[j].id;
            int32_t n = (int32_t)uprv_strlen(id);
            if (len == n && uprv_strncmp(name, id, n) == 0) {
                if (n == 0 && tmpLocaleID != NULL) {
                    break; /* Don't remap "" if keywords present */
                }
                len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
                if (CANONICALIZE_MAP[j].keyword) {
                    addKeyword = CANONICALIZE_MAP[j].keyword;
                    addValue = CANONICALIZE_MAP[j].value;
                }
                break;
            }
        }
    }

    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
        /* Real keywords are present when '=' occurs and no ';' precedes the first '='. */
        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
            (!separatorIndicator || separatorIndicator > keywordAssign)) {
            if(len<nameCapacity) {
                name[len]='@';
            }
            ++len;
            ++fieldCount;
            len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
                                NULL, 0, NULL, TRUE, addKeyword, addValue, err);
        } else if (addKeyword != NULL) {
            /* No keywords in the input, but a mapping above produced one: append "@keyword=value". */
            U_ASSERT(addValue != NULL && len < nameCapacity);
            /* inelegant but works -- later make _getKeywords do this? */
            len += _copyCount(name+len, nameCapacity-len, "@");
            len += _copyCount(name+len, nameCapacity-len, addKeyword);
            len += _copyCount(name+len, nameCapacity-len, "=");
            len += _copyCount(name+len, nameCapacity-len, addValue);
        }
    }

    /* If the text was built in localeBuffer, hand back as much of it as fits into result. */
    if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
        uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
    }

    return u_terminateChars(result, resultCapacity, len, err);
}
1870 
1871 /* ### ID parsing API **************************************************/
1872 
1873 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1874 uloc_getParent(const char*    localeID,
1875                char* parent,
1876                int32_t parentCapacity,
1877                UErrorCode* err)
1878 {
1879     const char *lastUnderscore;
1880     int32_t i;
1881 
1882     if (U_FAILURE(*err))
1883         return 0;
1884 
1885     if (localeID == NULL)
1886         localeID = uloc_getDefault();
1887 
1888     lastUnderscore=uprv_strrchr(localeID, '_');
1889     if(lastUnderscore!=NULL) {
1890         i=(int32_t)(lastUnderscore-localeID);
1891     } else {
1892         i=0;
1893     }
1894 
1895     if(i>0 && parent != localeID) {
1896         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1897     }
1898     return u_terminateChars(parent, parentCapacity, i, err);
1899 }
1900 
1901 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1902 uloc_getLanguage(const char*    localeID,
1903          char* language,
1904          int32_t languageCapacity,
1905          UErrorCode* err)
1906 {
1907     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1908     int32_t i=0;
1909 
1910     if (err==NULL || U_FAILURE(*err)) {
1911         return 0;
1912     }
1913 
1914     if(localeID==NULL) {
1915         localeID=uloc_getDefault();
1916     }
1917 
1918     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1919     return u_terminateChars(language, languageCapacity, i, err);
1920 }
1921 
1922 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1923 uloc_getScript(const char*    localeID,
1924          char* script,
1925          int32_t scriptCapacity,
1926          UErrorCode* err)
1927 {
1928     int32_t i=0;
1929 
1930     if(err==NULL || U_FAILURE(*err)) {
1931         return 0;
1932     }
1933 
1934     if(localeID==NULL) {
1935         localeID=uloc_getDefault();
1936     }
1937 
1938     /* skip the language */
1939     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1940     if(_isIDSeparator(*localeID)) {
1941         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1942     }
1943     return u_terminateChars(script, scriptCapacity, i, err);
1944 }
1945 
1946 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1947 uloc_getCountry(const char* localeID,
1948             char* country,
1949             int32_t countryCapacity,
1950             UErrorCode* err)
1951 {
1952     int32_t i=0;
1953 
1954     if(err==NULL || U_FAILURE(*err)) {
1955         return 0;
1956     }
1957 
1958     if(localeID==NULL) {
1959         localeID=uloc_getDefault();
1960     }
1961 
1962     /* Skip the language */
1963     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1964     if(_isIDSeparator(*localeID)) {
1965         const char *scriptID;
1966         /* Skip the script if available */
1967         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1968         if(scriptID != localeID+1) {
1969             /* Found optional script */
1970             localeID = scriptID;
1971         }
1972         if(_isIDSeparator(*localeID)) {
1973             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1974         }
1975     }
1976     return u_terminateChars(country, countryCapacity, i, err);
1977 }
1978 
1979 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1980 uloc_getVariant(const char* localeID,
1981                 char* variant,
1982                 int32_t variantCapacity,
1983                 UErrorCode* err)
1984 {
1985     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1986     const char* tmpLocaleID;
1987     int32_t i=0;
1988 
1989     if(err==NULL || U_FAILURE(*err)) {
1990         return 0;
1991     }
1992 
1993     if (_hasBCP47Extension(localeID)) {
1994         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1995     } else {
1996         if (localeID==NULL) {
1997            localeID=uloc_getDefault();
1998         }
1999         tmpLocaleID=localeID;
2000     }
2001 
2002     /* Skip the language */
2003     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2004     if(_isIDSeparator(*tmpLocaleID)) {
2005         const char *scriptID;
2006         /* Skip the script if available */
2007         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2008         if(scriptID != tmpLocaleID+1) {
2009             /* Found optional script */
2010             tmpLocaleID = scriptID;
2011         }
2012         /* Skip the Country */
2013         if (_isIDSeparator(*tmpLocaleID)) {
2014             const char *cntryID;
2015             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2016             if (cntryID != tmpLocaleID+1) {
2017                 /* Found optional country */
2018                 tmpLocaleID = cntryID;
2019             }
2020             if(_isIDSeparator(*tmpLocaleID)) {
2021                 /* If there was no country ID, skip a possible extra IDSeparator */
2022                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2023                     tmpLocaleID++;
2024                 }
2025                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2026             }
2027         }
2028     }
2029 
2030     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2031     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2032 /*
2033     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2034         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2035     }
2036 */
2037     return u_terminateChars(variant, variantCapacity, i, err);
2038 }
2039 
/**
 * Public entry point for level 1 canonicalization: calls _canonicalize with
 * options 0, so keywords are kept and the _ULOC_CANONICALIZE steps are skipped.
 * All arguments are forwarded unchanged.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, 0, err);
}
2048 
/**
 * Like uloc_getName, but calls _canonicalize with _ULOC_STRIP_KEYWORDS so
 * that no keywords are appended to the result.
 * All arguments are forwarded unchanged.
 */
U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
}
2057 
/**
 * Public entry point for level 2 canonicalization: calls _canonicalize with
 * _ULOC_CANONICALIZE.  All arguments are forwarded unchanged.
 */
U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
}
2066 
2067 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)2068 uloc_getISO3Language(const char* localeID)
2069 {
2070     int16_t offset;
2071     char lang[ULOC_LANG_CAPACITY];
2072     UErrorCode err = U_ZERO_ERROR;
2073 
2074     if (localeID == NULL)
2075     {
2076         localeID = uloc_getDefault();
2077     }
2078     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2079     if (U_FAILURE(err))
2080         return "";
2081     offset = _findIndex(LANGUAGES, lang);
2082     if (offset < 0)
2083         return "";
2084     return LANGUAGES_3[offset];
2085 }
2086 
2087 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)2088 uloc_getISO3Country(const char* localeID)
2089 {
2090     int16_t offset;
2091     char cntry[ULOC_LANG_CAPACITY];
2092     UErrorCode err = U_ZERO_ERROR;
2093 
2094     if (localeID == NULL)
2095     {
2096         localeID = uloc_getDefault();
2097     }
2098     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2099     if (U_FAILURE(err))
2100         return "";
2101     offset = _findIndex(COUNTRIES, cntry);
2102     if (offset < 0)
2103         return "";
2104 
2105     return COUNTRIES_3[offset];
2106 }
2107 
2108 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2109 uloc_getLCID(const char* localeID)
2110 {
2111     UErrorCode status = U_ZERO_ERROR;
2112     char       langID[ULOC_FULLNAME_CAPACITY];
2113 
2114     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2115     if (U_FAILURE(status)) {
2116         return 0;
2117     }
2118 
2119     if (uprv_strchr(localeID, '@')) {
2120         // uprv_convertToLCID does not support keywords other than collation.
2121         // Remove all keywords except collation.
2122         int32_t len;
2123         char collVal[ULOC_KEYWORDS_CAPACITY];
2124         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2125 
2126         len = uloc_getKeywordValue(localeID, "collation", collVal,
2127             sizeof(collVal)/sizeof(collVal[0]) - 1, &status);
2128 
2129         if (U_SUCCESS(status) && len > 0) {
2130             collVal[len] = 0;
2131 
2132             len = uloc_getBaseName(localeID, tmpLocaleID,
2133                 sizeof(tmpLocaleID)/sizeof(tmpLocaleID[0]) - 1, &status);
2134 
2135             if (U_SUCCESS(status)) {
2136                 tmpLocaleID[len] = 0;
2137 
2138                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2139                     sizeof(tmpLocaleID)/sizeof(tmpLocaleID[0]) - len - 1, &status);
2140 
2141                 if (U_SUCCESS(status)) {
2142                     tmpLocaleID[len] = 0;
2143                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2144                 }
2145             }
2146         }
2147 
2148         // fall through - all keywords are simply ignored
2149         status = U_ZERO_ERROR;
2150     }
2151 
2152     return uprv_convertToLCID(langID, localeID, &status);
2153 }
2154 
/**
 * Inverse direction of uloc_getLCID: forwards all arguments unchanged to
 * uprv_convertToPosix and returns its result.
 */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2161 
2162 /* ### Default locale **************************************************/
2163 
/* C API accessor for the default locale ID; simply returns what
   locale_get_default() (declared above as coming from locid.cpp) returns. */
U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2169 
2170 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2171 uloc_setDefault(const char*   newDefaultLocale,
2172              UErrorCode* err)
2173 {
2174     if (U_FAILURE(*err))
2175         return;
2176     /* the error code isn't currently used for anything by this function*/
2177 
2178     /* propagate change to C++ */
2179     locale_set_default(newDefaultLocale);
2180 }
2181 
/**
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
 * to an array of pointers to arrays of char.  All of these pointers are owned
 * by ICU -- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 * The returned pointer is the file's LANGUAGES table itself, not a copy.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{
    return LANGUAGES;
}
2193 
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
 * pointer to an array of pointers to arrays of char.  All of these pointers are
 * owned by ICU -- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 * The returned pointer is the file's COUNTRIES table itself, not a copy.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{
    return COUNTRIES;
}
2205 
2206 
2207 /* this function to be moved into cstring.c later */
2208 static char gDecimal = 0;
2209 
2210 static /* U_CAPI */
2211 double
2212 /* U_EXPORT2 */
_uloc_strtod(const char * start,char ** end)2213 _uloc_strtod(const char *start, char **end) {
2214     char *decimal;
2215     char *myEnd;
2216     char buf[30];
2217     double rv;
2218     if (!gDecimal) {
2219         char rep[5];
2220         /* For machines that decide to change the decimal on you,
2221         and try to be too smart with localization.
2222         This normally should be just a '.'. */
2223         sprintf(rep, "%+1.1f", 1.0);
2224         gDecimal = rep[2];
2225     }
2226 
2227     if(gDecimal == '.') {
2228         return uprv_strtod(start, end); /* fall through to OS */
2229     } else {
2230         uprv_strncpy(buf, start, 29);
2231         buf[29]=0;
2232         decimal = uprv_strchr(buf, '.');
2233         if(decimal) {
2234             *decimal = gDecimal;
2235         } else {
2236             return uprv_strtod(start, end); /* no decimal point */
2237         }
2238         rv = uprv_strtod(buf, &myEnd);
2239         if(end) {
2240             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2241         }
2242         return rv;
2243     }
2244 }
2245 
/* One entry of a parsed Accept-Language list, ordered by uloc_acceptLanguageCompare. */
typedef struct {
    float q;        /* quality value of the entry; higher values sort first */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* locale name; tie-breaker (case-insensitive) for equal q */
} _acceptLangItem;
2251 
2252 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void *,const void * a,const void * b)2253 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2254 {
2255     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2256     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2257 
2258     int32_t rc = 0;
2259     if(bb->q < aa->q) {
2260         rc = -1;  /* A > B */
2261     } else if(bb->q > aa->q) {
2262         rc = 1;   /* A < B */
2263     } else {
2264         rc = 0;   /* A = B */
2265     }
2266 
2267     if(rc==0) {
2268         rc = uprv_stricmp(aa->locale, bb->locale);
2269     }
2270 
2271 #if defined(ULOC_DEBUG)
2272     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2273     aa->locale, aa->q,
2274     bb->locale, bb->q,
2275     rc);*/
2276 #endif
2277 
2278     return rc;
2279 }
2280 
2281 /*
2282 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2283 */
2284 
2285 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2286 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2287                             const char *httpAcceptLanguage,
2288                             UEnumeration* availableLocales,
2289                             UErrorCode *status)
2290 {
2291     _acceptLangItem *j;
2292     _acceptLangItem smallBuffer[30];
2293     char **strs;
2294     char tmp[ULOC_FULLNAME_CAPACITY +1];
2295     int32_t n = 0;
2296     const char *itemEnd;
2297     const char *paramEnd;
2298     const char *s;
2299     const char *t;
2300     int32_t res;
2301     int32_t i;
2302     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2303     int32_t jSize;
2304     char *tempstr; /* Use for null pointer check */
2305 
2306     j = smallBuffer;
2307     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2308     if(U_FAILURE(*status)) {
2309         return -1;
2310     }
2311 
2312     for(s=httpAcceptLanguage;s&&*s;) {
2313         while(isspace(*s)) /* eat space at the beginning */
2314             s++;
2315         itemEnd=uprv_strchr(s,',');
2316         paramEnd=uprv_strchr(s,';');
2317         if(!itemEnd) {
2318             itemEnd = httpAcceptLanguage+l; /* end of string */
2319         }
2320         if(paramEnd && paramEnd<itemEnd) {
2321             /* semicolon (;) is closer than end (,) */
2322             t = paramEnd+1;
2323             if(*t=='q') {
2324                 t++;
2325             }
2326             while(isspace(*t)) {
2327                 t++;
2328             }
2329             if(*t=='=') {
2330                 t++;
2331             }
2332             while(isspace(*t)) {
2333                 t++;
2334             }
2335             j[n].q = (float)_uloc_strtod(t,NULL);
2336         } else {
2337             /* no semicolon - it's 1.0 */
2338             j[n].q = 1.0f;
2339             paramEnd = itemEnd;
2340         }
2341         j[n].dummy=0;
2342         /* eat spaces prior to semi */
2343         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2344             ;
2345         /* Check for null pointer from uprv_strndup */
2346         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2347         if (tempstr == NULL) {
2348             *status = U_MEMORY_ALLOCATION_ERROR;
2349             return -1;
2350         }
2351         j[n].locale = tempstr;
2352         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2353         if(strcmp(j[n].locale,tmp)) {
2354             uprv_free(j[n].locale);
2355             j[n].locale=uprv_strdup(tmp);
2356         }
2357 #if defined(ULOC_DEBUG)
2358         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2359 #endif
2360         n++;
2361         s = itemEnd;
2362         while(*s==',') { /* eat duplicate commas */
2363             s++;
2364         }
2365         if(n>=jSize) {
2366             if(j==smallBuffer) {  /* overflowed the small buffer. */
2367                 j = static_cast<_acceptLangItem *>(uprv_malloc(sizeof(j[0])*(jSize*2)));
2368                 if(j!=NULL) {
2369                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2370                 }
2371 #if defined(ULOC_DEBUG)
2372                 fprintf(stderr,"malloced at size %d\n", jSize);
2373 #endif
2374             } else {
2375                 j = static_cast<_acceptLangItem *>(uprv_realloc(j, sizeof(j[0])*jSize*2));
2376 #if defined(ULOC_DEBUG)
2377                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2378 #endif
2379             }
2380             jSize *= 2;
2381             if(j==NULL) {
2382                 *status = U_MEMORY_ALLOCATION_ERROR;
2383                 return -1;
2384             }
2385         }
2386     }
2387     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2388     if(U_FAILURE(*status)) {
2389         if(j != smallBuffer) {
2390 #if defined(ULOC_DEBUG)
2391             fprintf(stderr,"freeing j %p\n", j);
2392 #endif
2393             uprv_free(j);
2394         }
2395         return -1;
2396     }
2397     strs = static_cast<char **>(uprv_malloc((size_t)(sizeof(strs[0])*n)));
2398     /* Check for null pointer */
2399     if (strs == NULL) {
2400         uprv_free(j); /* Free to avoid memory leak */
2401         *status = U_MEMORY_ALLOCATION_ERROR;
2402         return -1;
2403     }
2404     for(i=0;i<n;i++) {
2405 #if defined(ULOC_DEBUG)
2406         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2407 #endif
2408         strs[i]=j[i].locale;
2409     }
2410     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2411         (const char**)strs, n, availableLocales, status);
2412     for(i=0;i<n;i++) {
2413         uprv_free(strs[i]);
2414     }
2415     uprv_free(strs);
2416     if(j != smallBuffer) {
2417 #if defined(ULOC_DEBUG)
2418         fprintf(stderr,"freeing j %p\n", j);
2419 #endif
2420         uprv_free(j);
2421     }
2422     return res;
2423 }
2424 
2425 
2426 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2427 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2428                     UAcceptResult *outResult, const char **acceptList,
2429                     int32_t acceptListCount,
2430                     UEnumeration* availableLocales,
2431                     UErrorCode *status)
2432 {
2433     int32_t i,j;
2434     int32_t len;
2435     int32_t maxLen=0;
2436     char tmp[ULOC_FULLNAME_CAPACITY+1];
2437     const char *l;
2438     char **fallbackList;
2439     if(U_FAILURE(*status)) {
2440         return -1;
2441     }
2442     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2443     if(fallbackList==NULL) {
2444         *status = U_MEMORY_ALLOCATION_ERROR;
2445         return -1;
2446     }
2447     for(i=0;i<acceptListCount;i++) {
2448 #if defined(ULOC_DEBUG)
2449         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2450 #endif
2451         while((l=uenum_next(availableLocales, NULL, status))) {
2452 #if defined(ULOC_DEBUG)
2453             fprintf(stderr,"  %s\n", l);
2454 #endif
2455             len = (int32_t)uprv_strlen(l);
2456             if(!uprv_strcmp(acceptList[i], l)) {
2457                 if(outResult) {
2458                     *outResult = ULOC_ACCEPT_VALID;
2459                 }
2460 #if defined(ULOC_DEBUG)
2461                 fprintf(stderr, "MATCH! %s\n", l);
2462 #endif
2463                 if(len>0) {
2464                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2465                 }
2466                 for(j=0;j<i;j++) {
2467                     uprv_free(fallbackList[j]);
2468                 }
2469                 uprv_free(fallbackList);
2470                 return u_terminateChars(result, resultAvailable, len, status);
2471             }
2472             if(len>maxLen) {
2473                 maxLen = len;
2474             }
2475         }
2476         uenum_reset(availableLocales, status);
2477         /* save off parent info */
2478         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2479             fallbackList[i] = uprv_strdup(tmp);
2480         } else {
2481             fallbackList[i]=0;
2482         }
2483     }
2484 
2485     for(maxLen--;maxLen>0;maxLen--) {
2486         for(i=0;i<acceptListCount;i++) {
2487             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2488 #if defined(ULOC_DEBUG)
2489                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2490 #endif
2491                 while((l=uenum_next(availableLocales, NULL, status))) {
2492 #if defined(ULOC_DEBUG)
2493                     fprintf(stderr,"  %s\n", l);
2494 #endif
2495                     len = (int32_t)uprv_strlen(l);
2496                     if(!uprv_strcmp(fallbackList[i], l)) {
2497                         if(outResult) {
2498                             *outResult = ULOC_ACCEPT_FALLBACK;
2499                         }
2500 #if defined(ULOC_DEBUG)
2501                         fprintf(stderr, "fallback MATCH! %s\n", l);
2502 #endif
2503                         if(len>0) {
2504                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2505                         }
2506                         for(j=0;j<acceptListCount;j++) {
2507                             uprv_free(fallbackList[j]);
2508                         }
2509                         uprv_free(fallbackList);
2510                         return u_terminateChars(result, resultAvailable, len, status);
2511                     }
2512                 }
2513                 uenum_reset(availableLocales, status);
2514 
2515                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2516                     uprv_free(fallbackList[i]);
2517                     fallbackList[i] = uprv_strdup(tmp);
2518                 } else {
2519                     uprv_free(fallbackList[i]);
2520                     fallbackList[i]=0;
2521                 }
2522             }
2523         }
2524         if(outResult) {
2525             *outResult = ULOC_ACCEPT_FAILED;
2526         }
2527     }
2528     for(i=0;i<acceptListCount;i++) {
2529         uprv_free(fallbackList[i]);
2530     }
2531     uprv_free(fallbackList);
2532     return -1;
2533 }
2534 
2535 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2536 uloc_toUnicodeLocaleKey(const char* keyword)
2537 {
2538     const char* bcpKey = ulocimp_toBcpKey(keyword);
2539     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2540         // unknown keyword, but syntax is fine..
2541         return keyword;
2542     }
2543     return bcpKey;
2544 }
2545 
2546 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2547 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2548 {
2549     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2550     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2551         // unknown keyword, but syntax is fine..
2552         return value;
2553     }
2554     return bcpType;
2555 }
2556 
/* ASCII character classification. Note: both macros evaluate their argument more than once. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')
#define UPRV_ISALPHANUM(c) (UPRV_ISDIGIT(c) || uprv_isASCIILetter(c))
2559 
2560 static UBool
isWellFormedLegacyKey(const char * legacyKey)2561 isWellFormedLegacyKey(const char* legacyKey)
2562 {
2563     const char* p = legacyKey;
2564     while (*p) {
2565         if (!UPRV_ISALPHANUM(*p)) {
2566             return FALSE;
2567         }
2568         p++;
2569     }
2570     return TRUE;
2571 }
2572 
2573 static UBool
isWellFormedLegacyType(const char * legacyType)2574 isWellFormedLegacyType(const char* legacyType)
2575 {
2576     const char* p = legacyType;
2577     int32_t alphaNumLen = 0;
2578     while (*p) {
2579         if (*p == '_' || *p == '/' || *p == '-') {
2580             if (alphaNumLen == 0) {
2581                 return FALSE;
2582             }
2583             alphaNumLen = 0;
2584         } else if (UPRV_ISALPHANUM(*p)) {
2585             alphaNumLen++;
2586         } else {
2587             return FALSE;
2588         }
2589         p++;
2590     }
2591     return (alphaNumLen != 0);
2592 }
2593 
2594 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2595 uloc_toLegacyKey(const char* keyword)
2596 {
2597     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2598     if (legacyKey == NULL) {
2599         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2600         //
2601         // Note:
2602         //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2603         //  However, a key should not contain '=' obviously. For now, all existing
2604         //  keys are using ASCII alphabetic letters only. We won't add any new key
2605         //  that is not compatible with the BCP 47 syntax. Therefore, we assume
2606         //  a valid key consist from [0-9a-zA-Z], no symbols.
2607         if (isWellFormedLegacyKey(keyword)) {
2608             return keyword;
2609         }
2610     }
2611     return legacyKey;
2612 }
2613 
2614 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2615 uloc_toLegacyType(const char* keyword, const char* value)
2616 {
2617     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2618     if (legacyType == NULL) {
2619         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2620         //
2621         // Note:
2622         //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2623         //  However, a type should not contain '=' obviously. For now, all existing
2624         //  types are using ASCII alphabetic letters with a few symbol letters. We won't
2625         //  add any new type that is not compatible with the BCP 47 syntax except timezone
2626         //  IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain
2627         //  '-' '_' '/' in the middle.
2628         if (isWellFormedLegacyType(value)) {
2629             return value;
2630         }
2631     }
2632     return legacyType;
2633 }
2634 
2635 /*eof*/
2636