1 /*
2  *******************************************************************************
3  * Copyright (C) 2010-2013, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.impl.locale;
8 
9 import java.util.ArrayList;
10 import java.util.Collections;
11 import java.util.HashMap;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Set;
15 
16 public class LanguageTag {
17     private static final boolean JDKIMPL = false;
18 
19     //
20     // static fields
21     //
22     public static final String SEP = "-";
23     public static final String PRIVATEUSE = "x";
24     public static String UNDETERMINED = "und";
25     public static final String PRIVUSE_VARIANT_PREFIX = "lvariant";
26 
27     //
28     // Language subtag fields
29     //
30     private String _language = "";      // language subtag
31     private String _script = "";        // script subtag
32     private String _region = "";        // region subtag
33     private String _privateuse = "";    // privateuse
34 
35     private List<String> _extlangs = Collections.emptyList();   // extlang subtags
36     private List<String> _variants = Collections.emptyList();   // variant subtags
37     private List<String> _extensions = Collections.emptyList(); // extensions
38 
39     // Map contains grandfathered tags and its preferred mappings from
40     // http://www.ietf.org/rfc/rfc5646.txt
41     private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
42         new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();
43 
44     static {
45         // grandfathered = irregular           ; non-redundant tags registered
46         //               / regular             ; during the RFC 3066 era
47         //
48         // irregular     = "en-GB-oed"         ; irregular tags do not match
49         //               / "i-ami"             ; the 'langtag' production and
50         //               / "i-bnn"             ; would not otherwise be
51         //               / "i-default"         ; considered 'well-formed'
52         //               / "i-enochian"        ; These tags are all valid,
53         //               / "i-hak"             ; but most are deprecated
54         //               / "i-klingon"         ; in favor of more modern
55         //               / "i-lux"             ; subtags or subtag
56         //               / "i-mingo"           ; combination
57         //               / "i-navajo"
58         //               / "i-pwn"
59         //               / "i-tao"
60         //               / "i-tay"
61         //               / "i-tsu"
62         //               / "sgn-BE-FR"
63         //               / "sgn-BE-NL"
64         //               / "sgn-CH-DE"
65         //
66         // regular       = "art-lojban"        ; these tags match the 'langtag'
67         //               / "cel-gaulish"       ; production, but their subtags
68         //               / "no-bok"            ; are not extended language
69         //               / "no-nyn"            ; or variant subtags: their meaning
70         //               / "zh-guoyu"          ; is defined by their registration
71         //               / "zh-hakka"          ; and all of these are deprecated
72         //               / "zh-min"            ; in favor of a more modern
73         //               / "zh-min-nan"        ; subtag or sequence of subtags
74         //               / "zh-xiang"
75 
76         final String[][] entries = {
77           //{"tag",         "preferred"},
78             {"art-lojban",  "jbo"},
79             {"cel-gaulish", "xtg-x-cel-gaulish"},   // fallback
80             {"en-GB-oed",   "en-GB-x-oed"},         // fallback
81             {"i-ami",       "ami"},
82             {"i-bnn",       "bnn"},
83             {"i-default",   "en-x-i-default"},      // fallback
84             {"i-enochian",  "und-x-i-enochian"},    // fallback
85             {"i-hak",       "hak"},
86             {"i-klingon",   "tlh"},
87             {"i-lux",       "lb"},
88             {"i-mingo",     "see-x-i-mingo"},       // fallback
89             {"i-navajo",    "nv"},
90             {"i-pwn",       "pwn"},
91             {"i-tao",       "tao"},
92             {"i-tay",       "tay"},
93             {"i-tsu",       "tsu"},
94             {"no-bok",      "nb"},
95             {"no-nyn",      "nn"},
96             {"sgn-BE-FR",   "sfb"},
97             {"sgn-BE-NL",   "vgt"},
98             {"sgn-CH-DE",   "sgg"},
99             {"zh-guoyu",    "cmn"},
100             {"zh-hakka",    "hak"},
101             {"zh-min",      "nan-x-zh-min"},        // fallback
102             {"zh-min-nan",  "nan"},
103             {"zh-xiang",    "hsn"},
104         };
105         for (String[] e : entries) {
GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e)106             GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
107         }
108     }
109 
LanguageTag()110     private LanguageTag() {
111     }
112 
113     /*
     * BNF in RFC 5646
115      *
116      * Language-Tag  = langtag             ; normal language tags
117      *               / privateuse          ; private use tag
118      *               / grandfathered       ; grandfathered tags
119      *
120      *
121      * langtag       = language
122      *                 ["-" script]
123      *                 ["-" region]
124      *                 *("-" variant)
125      *                 *("-" extension)
126      *                 ["-" privateuse]
127      *
128      * language      = 2*3ALPHA            ; shortest ISO 639 code
129      *                 ["-" extlang]       ; sometimes followed by
130      *                                     ; extended language subtags
131      *               / 4ALPHA              ; or reserved for future use
132      *               / 5*8ALPHA            ; or registered language subtag
133      *
134      * extlang       = 3ALPHA              ; selected ISO 639 codes
135      *                 *2("-" 3ALPHA)      ; permanently reserved
136      *
137      * script        = 4ALPHA              ; ISO 15924 code
138      *
139      * region        = 2ALPHA              ; ISO 3166-1 code
140      *               / 3DIGIT              ; UN M.49 code
141      *
142      * variant       = 5*8alphanum         ; registered variants
143      *               / (DIGIT 3alphanum)
144      *
145      * extension     = singleton 1*("-" (2*8alphanum))
146      *
147      *                                     ; Single alphanumerics
148      *                                     ; "x" reserved for private use
149      * singleton     = DIGIT               ; 0 - 9
150      *               / %x41-57             ; A - W
151      *               / %x59-5A             ; Y - Z
152      *               / %x61-77             ; a - w
153      *               / %x79-7A             ; y - z
154      *
155      * privateuse    = "x" 1*("-" (1*8alphanum))
156      *
157      */
parse(String languageTag, ParseStatus sts)158     public static LanguageTag parse(String languageTag, ParseStatus sts) {
159         if (sts == null) {
160             sts = new ParseStatus();
161         } else {
162             sts.reset();
163         }
164 
165         StringTokenIterator itr;
166         boolean isGrandfathered = false;
167 
168         // Check if the tag is grandfathered
169         String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
170         if (gfmap != null) {
171             // use preferred mapping
172             itr = new StringTokenIterator(gfmap[1], SEP);
173             isGrandfathered = true;
174         } else {
175             itr = new StringTokenIterator(languageTag, SEP);
176         }
177 
178         LanguageTag tag = new LanguageTag();
179 
180         // langtag must start with either language or privateuse
181         if (tag.parseLanguage(itr, sts)) {
182             tag.parseExtlangs(itr, sts);
183             tag.parseScript(itr, sts);
184             tag.parseRegion(itr, sts);
185             tag.parseVariants(itr, sts);
186             tag.parseExtensions(itr, sts);
187         }
188         tag.parsePrivateuse(itr, sts);
189 
190         if (isGrandfathered) {
191             // Grandfathered tag is replaced with a well-formed tag above.
192             // However, the parsed length must be the original tag length.
193             assert (itr.isDone());
194             assert (!sts.isError());
195             sts._parseLength = languageTag.length();
196         } else if (!itr.isDone() && !sts.isError()) {
197             String s = itr.current();
198             sts._errorIndex = itr.currentStart();
199             if (s.length() == 0) {
200                 sts._errorMsg = "Empty subtag";
201             } else {
202                 sts._errorMsg = "Invalid subtag: " + s;
203             }
204         }
205 
206         return tag;
207     }
208 
209     //
210     // Language subtag parsers
211     //
212 
parseLanguage(StringTokenIterator itr, ParseStatus sts)213     private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) {
214         if (itr.isDone() || sts.isError()) {
215             return false;
216         }
217 
218         boolean found = false;
219 
220         String s = itr.current();
221         if (isLanguage(s)) {
222             found = true;
223             _language = s;
224             sts._parseLength = itr.currentEnd();
225             itr.next();
226         }
227 
228         return found;
229     }
230 
parseExtlangs(StringTokenIterator itr, ParseStatus sts)231     private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) {
232         if (itr.isDone() || sts.isError()) {
233             return false;
234         }
235 
236         boolean found = false;
237 
238         while (!itr.isDone()) {
239             String s = itr.current();
240             if (!isExtlang(s)) {
241                 break;
242             }
243             found = true;
244             if (_extlangs.isEmpty()) {
245                 _extlangs = new ArrayList<String>(3);
246             }
247             _extlangs.add(s);
248             sts._parseLength = itr.currentEnd();
249             itr.next();
250 
251             if (_extlangs.size() == 3) {
252                 // Maximum 3 extlangs
253                 break;
254             }
255         }
256 
257         return found;
258     }
259 
parseScript(StringTokenIterator itr, ParseStatus sts)260     private boolean parseScript(StringTokenIterator itr, ParseStatus sts) {
261         if (itr.isDone() || sts.isError()) {
262             return false;
263         }
264 
265         boolean found = false;
266 
267         String s = itr.current();
268         if (isScript(s)) {
269             found = true;
270             _script = s;
271             sts._parseLength = itr.currentEnd();
272             itr.next();
273         }
274 
275         return found;
276     }
277 
parseRegion(StringTokenIterator itr, ParseStatus sts)278     private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) {
279         if (itr.isDone() || sts.isError()) {
280             return false;
281         }
282 
283         boolean found = false;
284 
285         String s = itr.current();
286         if (isRegion(s)) {
287             found = true;
288             _region = s;
289             sts._parseLength = itr.currentEnd();
290             itr.next();
291         }
292 
293         return found;
294     }
295 
parseVariants(StringTokenIterator itr, ParseStatus sts)296     private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) {
297         if (itr.isDone() || sts.isError()) {
298             return false;
299         }
300 
301         boolean found = false;
302 
303         while (!itr.isDone()) {
304             String s = itr.current();
305             if (!isVariant(s)) {
306                 break;
307             }
308             found = true;
309             if (_variants.isEmpty()) {
310                 _variants = new ArrayList<String>(3);
311             }
312             _variants.add(s);
313             sts._parseLength = itr.currentEnd();
314             itr.next();
315         }
316 
317         return found;
318     }
319 
parseExtensions(StringTokenIterator itr, ParseStatus sts)320     private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) {
321         if (itr.isDone() || sts.isError()) {
322             return false;
323         }
324 
325         boolean found = false;
326 
327         while (!itr.isDone()) {
328             String s = itr.current();
329             if (isExtensionSingleton(s)) {
330                 int start = itr.currentStart();
331                 String singleton = s;
332                 StringBuilder sb = new StringBuilder(singleton);
333 
334                 itr.next();
335                 while (!itr.isDone()) {
336                     s = itr.current();
337                     if (isExtensionSubtag(s)) {
338                         sb.append(SEP).append(s);
339                         sts._parseLength = itr.currentEnd();
340                     } else {
341                         break;
342                     }
343                     itr.next();
344                 }
345 
346                 if (sts._parseLength <= start) {
347                     sts._errorIndex = start;
348                     sts._errorMsg = "Incomplete extension '" + singleton + "'";
349                     break;
350                 }
351 
352                 if (_extensions.size() == 0) {
353                     _extensions = new ArrayList<String>(4);
354                 }
355                 _extensions.add(sb.toString());
356                 found = true;
357             } else {
358                 break;
359             }
360         }
361         return found;
362     }
363 
parsePrivateuse(StringTokenIterator itr, ParseStatus sts)364     private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) {
365         if (itr.isDone() || sts.isError()) {
366             return false;
367         }
368 
369         boolean found = false;
370 
371         String s = itr.current();
372         if (isPrivateusePrefix(s)) {
373             int start = itr.currentStart();
374             StringBuilder sb = new StringBuilder(s);
375 
376             itr.next();
377             while (!itr.isDone()) {
378                 s = itr.current();
379                 if (!isPrivateuseSubtag(s)) {
380                     break;
381                 }
382                 sb.append(SEP).append(s);
383                 sts._parseLength = itr.currentEnd();
384 
385                 itr.next();
386             }
387 
388             if (sts._parseLength <= start) {
389                 // need at least 1 private subtag
390                 sts._errorIndex = start;
391                 sts._errorMsg = "Incomplete privateuse";
392             } else {
393                 _privateuse = sb.toString();
394                 found = true;
395             }
396         }
397 
398         return found;
399     }
400 
parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions)401     public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) {
402         LanguageTag tag = new LanguageTag();
403 
404         String language = baseLocale.getLanguage();
405         String script = baseLocale.getScript();
406         String region = baseLocale.getRegion();
407         String variant = baseLocale.getVariant();
408 
409         boolean hasSubtag = false;
410 
411         String privuseVar = null;   // store ill-formed variant subtags
412 
413         if (language.length() > 0 && isLanguage(language)) {
414             // Convert a deprecated language code used by Java to
415             // a new code
416             if (language.equals("iw")) {
417                 language = "he";
418             } else if (language.equals("ji")) {
419                 language = "yi";
420             } else if (language.equals("in")) {
421                 language = "id";
422             }
423             tag._language = language;
424         }
425 
426         if (script.length() > 0 && isScript(script)) {
427             tag._script = canonicalizeScript(script);
428             hasSubtag = true;
429         }
430 
431         if (region.length() > 0 && isRegion(region)) {
432             tag._region = canonicalizeRegion(region);
433             hasSubtag = true;
434         }
435 
436         if (JDKIMPL) {
437             // Special handling for no_NO_NY - use nn_NO for language tag
438             if (tag._language.equals("no") && tag._region.equals("NO") && variant.equals("NY")) {
439                 tag._language = "nn";
440                 variant = "";
441             }
442         }
443 
444         if (variant.length() > 0) {
445             List<String> variants = null;
446             StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP);
447             while (!varitr.isDone()) {
448                 String var = varitr.current();
449                 if (!isVariant(var)) {
450                     break;
451                 }
452                 if (variants == null) {
453                     variants = new ArrayList<String>();
454                 }
455                 if (JDKIMPL) {
456                     variants.add(var);  // Do not canonicalize!
457                 } else {
458                     variants.add(canonicalizeVariant(var));
459                 }
460                 varitr.next();
461             }
462             if (variants != null) {
463                 tag._variants = variants;
464                 hasSubtag = true;
465             }
466             if (!varitr.isDone()) {
467                 // ill-formed variant subtags
468                 StringBuilder buf = new StringBuilder();
469                 while (!varitr.isDone()) {
470                     String prvv = varitr.current();
471                     if (!isPrivateuseSubtag(prvv)) {
472                         // cannot use private use subtag - truncated
473                         break;
474                     }
475                     if (buf.length() > 0) {
476                         buf.append(SEP);
477                     }
478                     if (!JDKIMPL) {
479                         prvv = AsciiUtil.toLowerString(prvv);
480                     }
481                     buf.append(prvv);
482                     varitr.next();
483                 }
484                 if (buf.length() > 0) {
485                     privuseVar = buf.toString();
486                 }
487             }
488         }
489 
490         List<String> extensions = null;
491         String privateuse = null;
492 
493         Set<Character> locextKeys = localeExtensions.getKeys();
494         for (Character locextKey : locextKeys) {
495             Extension ext = localeExtensions.getExtension(locextKey);
496             if (isPrivateusePrefixChar(locextKey.charValue())) {
497                 privateuse = ext.getValue();
498             } else {
499                 if (extensions == null) {
500                     extensions = new ArrayList<String>();
501                 }
502                 extensions.add(locextKey.toString() + SEP + ext.getValue());
503             }
504         }
505 
506         if (extensions != null) {
507             tag._extensions = extensions;
508             hasSubtag = true;
509         }
510 
511         // append ill-formed variant subtags to private use
512         if (privuseVar != null) {
513             if (privateuse == null) {
514                 privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar;
515             } else {
516                 privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX + SEP + privuseVar.replace(BaseLocale.SEP, SEP);
517             }
518         }
519 
520         if (privateuse != null) {
521             tag._privateuse = privateuse;
522         }
523 
524         if (tag._language.length() == 0 && (hasSubtag || privateuse == null)) {
525             // use lang "und" when 1) no language is available AND
526             // 2) any of other subtags other than private use are available or
527             // no private use tag is available
528             tag._language = UNDETERMINED;
529         }
530 
531         return tag;
532     }
533 
534     //
535     // Getter methods for language subtag fields
536     //
537 
getLanguage()538     public String getLanguage() {
539         return _language;
540     }
541 
getExtlangs()542     public List<String> getExtlangs() {
543         return Collections.unmodifiableList(_extlangs);
544     }
545 
getScript()546     public String getScript() {
547         return _script;
548     }
549 
getRegion()550     public String getRegion() {
551         return _region;
552     }
553 
getVariants()554     public List<String> getVariants() {
555         return Collections.unmodifiableList(_variants);
556     }
557 
getExtensions()558     public List<String> getExtensions() {
559         return Collections.unmodifiableList(_extensions);
560     }
561 
getPrivateuse()562     public String getPrivateuse() {
563         return _privateuse;
564     }
565 
566     //
567     // Language subtag syntax checking methods
568     //
569 
isLanguage(String s)570     public static boolean isLanguage(String s) {
571         // language      = 2*3ALPHA            ; shortest ISO 639 code
572         //                 ["-" extlang]       ; sometimes followed by
573         //                                     ;   extended language subtags
574         //               / 4ALPHA              ; or reserved for future use
575         //               / 5*8ALPHA            ; or registered language subtag
576         return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaString(s);
577     }
578 
isExtlang(String s)579     public static boolean isExtlang(String s) {
580         // extlang       = 3ALPHA              ; selected ISO 639 codes
581         //                 *2("-" 3ALPHA)      ; permanently reserved
582         return (s.length() == 3) && AsciiUtil.isAlphaString(s);
583     }
584 
isScript(String s)585     public static boolean isScript(String s) {
586         // script        = 4ALPHA              ; ISO 15924 code
587         return (s.length() == 4) && AsciiUtil.isAlphaString(s);
588     }
589 
isRegion(String s)590     public static boolean isRegion(String s) {
591         // region        = 2ALPHA              ; ISO 3166-1 code
592         //               / 3DIGIT              ; UN M.49 code
593         return ((s.length() == 2) && AsciiUtil.isAlphaString(s))
594                 || ((s.length() == 3) && AsciiUtil.isNumericString(s));
595     }
596 
isVariant(String s)597     public static boolean isVariant(String s) {
598         // variant       = 5*8alphanum         ; registered variants
599         //               / (DIGIT 3alphanum)
600         int len = s.length();
601         if (len >= 5 && len <= 8) {
602             return AsciiUtil.isAlphaNumericString(s);
603         }
604         if (len == 4) {
605             return AsciiUtil.isNumeric(s.charAt(0))
606                     && AsciiUtil.isAlphaNumeric(s.charAt(1))
607                     && AsciiUtil.isAlphaNumeric(s.charAt(2))
608                     && AsciiUtil.isAlphaNumeric(s.charAt(3));
609         }
610         return false;
611     }
612 
isExtensionSingleton(String s)613     public static boolean isExtensionSingleton(String s) {
614         // singleton     = DIGIT               ; 0 - 9
615         //               / %x41-57             ; A - W
616         //               / %x59-5A             ; Y - Z
617         //               / %x61-77             ; a - w
618         //               / %x79-7A             ; y - z
619 
620         return (s.length() == 1)
621                 && AsciiUtil.isAlphaString(s)
622                 && !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
623     }
624 
isExtensionSingletonChar(char c)625     public static boolean isExtensionSingletonChar(char c) {
626         return isExtensionSingleton(String.valueOf(c));
627     }
628 
isExtensionSubtag(String s)629     public static boolean isExtensionSubtag(String s) {
630         // extension     = singleton 1*("-" (2*8alphanum))
631         return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
632     }
633 
isPrivateusePrefix(String s)634     public static boolean isPrivateusePrefix(String s) {
635         // privateuse    = "x" 1*("-" (1*8alphanum))
636         return (s.length() == 1)
637                 && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
638     }
639 
isPrivateusePrefixChar(char c)640     public static boolean isPrivateusePrefixChar(char c) {
641         return (AsciiUtil.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c)));
642     }
643 
isPrivateuseSubtag(String s)644     public static boolean isPrivateuseSubtag(String s) {
645         // privateuse    = "x" 1*("-" (1*8alphanum))
646         return (s.length() >= 1) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
647     }
648 
649     //
650     // Language subtag canonicalization methods
651     //
652 
canonicalizeLanguage(String s)653     public static String canonicalizeLanguage(String s) {
654         return AsciiUtil.toLowerString(s);
655     }
656 
canonicalizeExtlang(String s)657     public static String canonicalizeExtlang(String s) {
658         return AsciiUtil.toLowerString(s);
659     }
660 
canonicalizeScript(String s)661     public static String canonicalizeScript(String s) {
662         return AsciiUtil.toTitleString(s);
663     }
664 
canonicalizeRegion(String s)665     public static String canonicalizeRegion(String s) {
666         return AsciiUtil.toUpperString(s);
667     }
668 
canonicalizeVariant(String s)669     public static String canonicalizeVariant(String s) {
670         return AsciiUtil.toLowerString(s);
671     }
672 
canonicalizeExtension(String s)673     public static String canonicalizeExtension(String s) {
674         return AsciiUtil.toLowerString(s);
675     }
676 
canonicalizeExtensionSingleton(String s)677     public static String canonicalizeExtensionSingleton(String s) {
678         return AsciiUtil.toLowerString(s);
679     }
680 
canonicalizeExtensionSubtag(String s)681     public static String canonicalizeExtensionSubtag(String s) {
682         return AsciiUtil.toLowerString(s);
683     }
684 
canonicalizePrivateuse(String s)685     public static String canonicalizePrivateuse(String s) {
686         return AsciiUtil.toLowerString(s);
687     }
688 
canonicalizePrivateuseSubtag(String s)689     public static String canonicalizePrivateuseSubtag(String s) {
690         return AsciiUtil.toLowerString(s);
691     }
692 
toString()693     public String toString() {
694         StringBuilder sb = new StringBuilder();
695 
696         if (_language.length() > 0) {
697             sb.append(_language);
698 
699             for (String extlang : _extlangs) {
700                 sb.append(SEP).append(extlang);
701             }
702 
703             if (_script.length() > 0) {
704                 sb.append(SEP).append(_script);
705             }
706 
707             if (_region.length() > 0) {
708                 sb.append(SEP).append(_region);
709             }
710 
711             for (String variant : _variants) {
712                 sb.append(SEP).append(variant);
713             }
714 
715             for (String extension : _extensions) {
716                 sb.append(SEP).append(extension);
717             }
718         }
719         if (_privateuse.length() > 0) {
720             if (sb.length() > 0) {
721                 sb.append(SEP);
722             }
723             sb.append(_privateuse);
724         }
725 
726         return sb.toString();
727     }
728 }
729