1 /*
2  **********************************************************************
3  * Copyright (c) 2002-2011, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 package org.unicode.cldr.util;
10 
11 import java.util.Collection;
12 import java.util.Collections;
13 import java.util.EnumSet;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Locale;
17 import java.util.Map;
18 import java.util.Map.Entry;
19 import java.util.NoSuchElementException;
20 import java.util.Set;
21 import java.util.StringTokenizer;
22 import java.util.TreeMap;
23 import java.util.TreeSet;
24 import java.util.regex.Pattern;
25 
26 import org.unicode.cldr.tool.LikelySubtags;
27 
28 import com.google.common.base.CharMatcher;
29 import com.google.common.base.Joiner;
30 import com.google.common.base.Splitter;
31 import com.google.common.collect.ImmutableList;
32 import com.google.common.collect.ImmutableMap;
33 import com.ibm.icu.impl.Relation;
34 import com.ibm.icu.impl.Row.R2;
35 import com.ibm.icu.text.UnicodeSet;
36 
37 public class LanguageTagParser {
38     /**
39      * @return Returns the language, or "" if none.
40      */
getLanguage()41     public String getLanguage() {
42         return language;
43     }
44 
45     /**
46      * @return Returns the script, or "" if none.
47      */
getScript()48     public String getScript() {
49         return script;
50     }
51 
52     /**
53      * @return Returns the region, or "" if none.
54      */
getRegion()55     public String getRegion() {
56         return region;
57     }
58 
59     /**
60      * @return Returns the variants.
61      */
getVariants()62     public List<String> getVariants() {
63         return ImmutableList.copyOf(variants);
64     }
65 
66     /**
67      * @return Returns the grandfathered flag
68      */
isGrandfathered()69     public boolean isGrandfathered() {
70         return grandfathered;
71     }
72 
73     /**
74      * @return Returns the extensions.
75      */
76     @Deprecated
getExtensions()77     public Map<String, String> getExtensions() {
78         return OutputOption.ICU.convert(extensions);
79     }
80 
81     /**
82      * @return Returns the localeExtensions.
83      */
84     @Deprecated
getLocaleExtensions()85     public Map<String, String> getLocaleExtensions() {
86         return OutputOption.ICU.convert(localeExtensions);
87     }
88 
89     /**
90      * @return Returns the extensions.
91      */
getExtensionsDetailed()92     public Map<String, List<String>> getExtensionsDetailed() {
93         return ImmutableMap.copyOf(extensions);
94     }
95 
96     /**
97      * @return Returns the localeExtensions.
98      */
getLocaleExtensionsDetailed()99     public Map<String, List<String>> getLocaleExtensionsDetailed() {
100         return ImmutableMap.copyOf(localeExtensions);
101     }
102 
103     /**
104      * @return Returns the original, preparsed language tag
105      */
getOriginal()106     public String getOriginal() {
107         return original;
108     }
109 
110     /**
111      * @return Returns the language-script (or language) part of a tag.
112      */
getLanguageScript()113     public String getLanguageScript() {
114         if (script.length() != 0) return language + "_" + script;
115         return language;
116     }
117 
118     /**
119      * @param in
120      *            Collection of language tag strings
121      * @return Returns each of the language-script tags in the collection.
122      */
getLanguageScript(Collection<String> in)123     public static Set<String> getLanguageScript(Collection<String> in) {
124         return getLanguageAndScript(in, null);
125     }
126 
127     /**
128      * @param in
129      *            Collection of language tag strings
130      * @return Returns each of the language-script tags in the collection.
131      */
getLanguageAndScript(Collection<String> in, Set<String> output)132     public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
133         if (output == null) output = new TreeSet<String>();
134         LanguageTagParser lparser = new LanguageTagParser();
135         for (Iterator<String> it = in.iterator(); it.hasNext();) {
136             output.add(lparser.set(it.next()).getLanguageScript());
137         }
138         return output;
139     }
140 
141     // private fields
142 
143     private String original;
144     private boolean grandfathered = false;
145     private String language;
146     private String script;
147     private String region;
148     private Set<String> variants = new TreeSet<String>();
149     private Map<String, List<String>> extensions = new TreeMap<String, List<String>>(); // use tree map
150     private Map<String, List<String>> localeExtensions = new TreeMap<String, List<String>>();
151 
152     private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
153     private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
154     private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
155     private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
156     private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
157     private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
158     private static StandardCodes standardCodes = StandardCodes.make();
159     private static final Set<String> grandfatheredCodes = standardCodes.getAvailableCodes("grandfathered");
160     private static final String separator = "-_"; // '-' alone for 3066bis language tags
161     private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
162     private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
163     private static final Splitter SPLIT_COLON = Splitter.on(';');
164     private static final Splitter SPLIT_EQUAL = Splitter.on('=');
165     private static final SupplementalDataInfo SDI = SupplementalDataInfo.getInstance();
166     private static final Relation<R2<String, String>, String> BCP47_ALIASES = SDI.getBcp47Aliases();
167 
168     /**
169      * Parses out a language tag, setting a number of fields that can subsequently be retrieved.
170      * If a private-use field is found, it is returned as the last extension.<br>
171      * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
172      * isValid.
173      *
174      * @param languageTag
175      * @return
176      */
set(String languageTag)177     public LanguageTagParser set(String languageTag) {
178         if (languageTag.length() == 0) {
179             throw new IllegalArgumentException("Language tag cannot be empty");
180         }
181         languageTag = languageTag.toLowerCase(Locale.ROOT);
182 
183         // clear everything out
184         language = region = script = "";
185         grandfathered = false;
186         variants.clear();
187         extensions.clear();
188         localeExtensions.clear();
189         original = languageTag;
190         int localeExtensionsPosition = languageTag.indexOf('@');
191         if (localeExtensionsPosition >= 0) {
192             final String localeExtensionsString = languageTag.substring(localeExtensionsPosition + 1);
193             for (String keyValue : SPLIT_COLON.split(localeExtensionsString)) {
194                 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
195                 final String key = keyValuePair.next();
196                 final String value = keyValuePair.next();
197                 if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
198                     throwError(keyValue, "Invalid key/value pair");
199                 }
200                 localeExtensions.put(key, SPLIT_BAR.splitToList(value));
201             }
202             languageTag = languageTag.substring(0, localeExtensionsPosition);
203         }
204 
205         // first test for grandfathered
206         if (grandfatheredCodes.contains(languageTag)) {
207             language = languageTag;
208             grandfathered = true;
209             return this;
210         }
211 
212         // each time we fetch a token, we check for length from 1..8, and all alphanum
213         StringTokenizer st = new StringTokenizer(languageTag, separator);
214         String subtag;
215         try {
216             subtag = getSubtag(st);
217         } catch (Exception e1) {
218             throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
219         }
220 
221         // check for private use (x-...) and return if so
222         if (subtag.equalsIgnoreCase("x")) {
223             getExtension(subtag, st, 1);
224             return this;
225         }
226 
227         // check that language subtag is valid
228         if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
229             throwError(subtag, "Invalid language subtag");
230         }
231         try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
232             language = subtag;
233             subtag = getSubtag(st); // prepare for next
234 
235             // check for script, 4 letters
236             if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
237                 script = subtag;
238                 script = script.substring(0, 1).toUpperCase(Locale.ROOT)
239                     + script.substring(1);
240                 subtag = getSubtag(st); // prepare for next
241             }
242 
243             // check for region, 2 letters or 3 digits
244             if (subtag.length() == 2 && ALPHA.containsAll(subtag)
245                 || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
246                 region = subtag.toUpperCase(Locale.ENGLISH);
247                 subtag = getSubtag(st); // prepare for next
248             }
249 
250             // get variants: length > 4 or len=4 & starts with digit
251             while (isValidVariant(subtag)) {
252                 variants.add(subtag);
253                 subtag = getSubtag(st); // prepare for next
254             }
255 
256             // get extensions: singleton '-' subtag (2-8 long)
257             while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
258                 subtag = getExtension(subtag, st, 2);
259                 if (subtag == null) return this; // done
260             }
261 
262             if (subtag.equalsIgnoreCase("x")) {
263                 getExtension(subtag, st, 1);
264                 return this;
265             }
266 
267             // if we make it to this point, then we have an error
268             throwError(subtag, "Illegal subtag");
269 
270         } catch (NoSuchElementException e) {
271             // this exception just means we ran out of tokens. That's ok, so we just return.
272         }
273         return this;
274     }
275 
isValidVariant(String subtag)276     private boolean isValidVariant(String subtag) {
277         return subtag != null && ALPHANUM.containsAll(subtag)
278             && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
279     }
280 
281     /**
282      *
283      * @return true iff the language tag validates
284      */
isValid()285     public boolean isValid() {
286         if (grandfathered) return true; // don't need further checking, since we already did so when parsing
287         if (!validates(language, "language")) return false;
288         if (!validates(script, "script")) return false;
289         if (!validates(region, "territory")) return false;
290         for (Iterator<String> it = variants.iterator(); it.hasNext();) {
291             if (!validates(it.next(), "variant")) return false;
292         }
293         return true; // passed the gauntlet
294     }
295 
296     public enum Status {
297         WELL_FORMED, VALID, CANONICAL, MINIMAL
298     }
299 
getStatus(Set<String> errors)300     public Status getStatus(Set<String> errors) {
301         errors.clear();
302         if (!isValid()) {
303             return Status.WELL_FORMED;
304             // TODO, check the bcp47 extension codes also
305         }
306         Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
307         Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");
308 
309         if (aliasInfo.get("language").containsKey(language)) {
310             errors.add("Non-canonical language: " + language);
311         }
312         Map<String, String> lstrInfo = languageInfo.get(language);
313         if (lstrInfo != null) {
314             String scope = lstrInfo.get("Scope");
315             if ("collection".equals(scope)) {
316                 errors.add("Collection language: " + language);
317             }
318         }
319         if (aliasInfo.get("script").containsKey(script)) {
320             errors.add("Non-canonical script: " + script);
321         }
322         if (aliasInfo.get("territory").containsKey(region)) {
323             errors.add("Non-canonical region: " + region);
324         }
325         if (!errors.isEmpty()) {
326             return Status.VALID;
327         }
328         String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
329         String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
330         if (minimized == null) {
331             errors.add("No minimal data for:" + tag);
332             if (script.isEmpty() && region.isEmpty()) {
333                 return Status.MINIMAL;
334             } else {
335                 return Status.CANONICAL;
336             }
337         }
338         if (!tag.equals(minimized)) {
339             errors.add("Not minimal:" + tag + "-->" + minimized);
340             return Status.CANONICAL;
341         }
342         return Status.MINIMAL;
343     }
344 
345     /**
346      * @param subtag
347      * @param type
348      * @return true if the subtag is empty, or if it is in the registry
349      */
validates(String subtag, String type)350     private boolean validates(String subtag, String type) {
351         return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
352     }
353 
354     /**
355      * Internal method
356      *
357      * @param minLength
358      *            TODO
359      */
getExtension(String subtag, StringTokenizer st, int minLength)360     private String getExtension(String subtag, StringTokenizer st, int minLength) {
361         final String key = subtag;
362         if (extensions.containsKey(key)) {
363             throwError(subtag, "Can't have two extensions with the same key");
364         }
365         if (!st.hasMoreElements()) {
366             throwError(subtag, "Private Use / Extension requires subsequent subtag");
367         }
368         ImmutableList.Builder<String> result = ImmutableList.builder();
369         try {
370             while (st.hasMoreElements()) {
371                 subtag = getSubtag(st);
372                 if (subtag.length() < minLength) {
373                     return subtag;
374                 }
375                 result.add(subtag);
376             }
377             return null;
378         } finally {
379             extensions.put(key, result.build());
380         }
381     }
382 
383     /**
384      * Internal method
385      */
getSubtag(StringTokenizer st)386     private String getSubtag(StringTokenizer st) {
387         String result = st.nextToken();
388         if (result.length() < 1 || result.length() > 8) {
389             throwError(result, "Illegal length (must be 1..8)");
390         }
391         if (!ALPHANUM.containsAll(result)) {
392             throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
393         }
394         return result;
395     }
396 
397     /**
398      * Internal method
399      */
throwError(String subtag, String errorText)400     private void throwError(String subtag, String errorText) {
401         throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
402     }
403 
setRegion(String region)404     public LanguageTagParser setRegion(String region) {
405         this.region = region;
406         return this;
407     }
408 
setScript(String script)409     public LanguageTagParser setScript(String script) {
410         this.script = script;
411         return this;
412     }
413 
414     public enum OutputOption {
415         ICU('_'), BCP47('-');
416         final char separator;
417         final Joiner joiner;
418 
OutputOption(char separator)419         private OutputOption(char separator) {
420             this.separator = separator;
421             joiner = Joiner.on(separator);
422         }
423 
convert(Map<String, List<String>> mapToList)424         public Map<String, String> convert(Map<String, List<String>> mapToList) {
425             if (mapToList.isEmpty()) {
426                 return Collections.emptyMap();
427             }
428             ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
429             for (Entry<String, List<String>> entry : mapToList.entrySet()) {
430                 builder.put(entry.getKey(), joiner.join(entry.getValue()));
431             }
432             return builder.build();
433         }
434     }
435 
toString()436     public String toString() {
437         return toString(OutputOption.ICU);
438     }
439 
toString(OutputOption oo)440     public String toString(OutputOption oo) {
441         StringBuilder result = new StringBuilder(language); // optimize for the simple cases
442         if (this.script.length() != 0) result.append(oo.separator).append(script);
443         if (this.region.length() != 0) result.append(oo.separator).append(region);
444         if (this.variants.size() != 0) {
445             for (String variant : variants) {
446                 result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT));
447             }
448         }
449         if (this.extensions.size() != 0) {
450             for (Entry<String, List<String>> extension : extensions.entrySet()) {
451                 String key = extension.getKey();
452                 String value = oo.joiner.join(extension.getValue());
453                 result.append(oo.separator).append(key)
454                     .append(oo.separator).append(value);
455             }
456         }
457         if (this.localeExtensions.size() != 0) {
458             if (oo == OutputOption.BCP47) {
459                 throw new IllegalArgumentException("Cannot represent as BCP47 without canonicalizing first");
460             }
461             result.append('@');
462             for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
463                 String key = extension.getKey();
464                 String value = oo.joiner.join(extension.getValue());
465                 result.append(oo != OutputOption.ICU ? key : key.toUpperCase(Locale.ROOT))
466                     .append('=').append(oo != OutputOption.ICU ? value : value.toUpperCase(Locale.ROOT));
467             }
468         }
469         return result.toString();
470     }
471 
472     /**
473      * Return just the language, script, and region (no variants or extensions)
474      * @return
475      */
toLSR()476     public String toLSR() {
477         String result = language; // optimize for the simple cases
478         if (this.script.length() != 0) result += "_" + script;
479         if (this.region.length() != 0) result += "_" + region;
480         return result;
481     }
482 
483     public enum Fields {
484         LANGUAGE, SCRIPT, REGION, VARIANTS
485     };
486 
487     public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
488     public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
489     public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
490         Fields.SCRIPT, Fields.REGION));
491 
toString(Set<Fields> selection)492     public String toString(Set<Fields> selection) {
493         String result = language;
494         if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script;
495         if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
496         if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
497             for (String variant : (Collection<String>) variants) {
498                 result += "_" + variant;
499             }
500         }
501         return result;
502     }
503 
setLanguage(String language)504     public LanguageTagParser setLanguage(String language) {
505         if (SEPARATORS.containsSome(language)) {
506             String oldScript = script;
507             String oldRegion = region;
508             Set<String> oldVariants = variants;
509             set(language);
510             if (script.length() == 0) {
511                 script = oldScript;
512             }
513             if (region.length() == 0) {
514                 region = oldRegion;
515             }
516             if (oldVariants.size() != 0) {
517                 variants = oldVariants;
518             }
519         } else {
520             this.language = language;
521         }
522         return this;
523     }
524 
setLocaleExtensions(Map<String, String> localeExtensions)525     public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
526         this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
527         return this;
528     }
529 
setVariants(Collection<String> newVariants)530     public LanguageTagParser setVariants(Collection<String> newVariants) {
531         for (String variant : newVariants) {
532             if (!isValidVariant(variant)) {
533                 throw new IllegalArgumentException("Illegal variant: " + variant);
534             }
535         }
536         variants.clear();
537         variants.addAll(newVariants);
538         return this;
539     }
540 
541     static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");
542 
setExtensions(Map<String, String> newExtensions)543     public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
544         this.extensions = expandMap(newExtensions, 2, 8);
545         return this;
546     }
547 
getSimpleParent(String s)548     public static String getSimpleParent(String s) {
549         int lastBar = s.lastIndexOf('_');
550         return lastBar >= 0 ? s.substring(0, lastBar) : "";
551     }
552 
expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength)553     private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
554         if (newLocaleExtensions.isEmpty()) {
555             return Collections.emptyMap();
556         }
557         ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
558         for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
559             result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
560         }
561         return result.build();
562     }
563 
split(String value, int minLength, int maxLength)564     private List<String> split(String value, int minLength, int maxLength) {
565         List<String> values = SPLIT_BAR.splitToList(value);
566         for (String s : values) {
567             if (s.length() < minLength || s.length() > maxLength) {
568                 throw new IllegalArgumentException("Illegal subtag length for: " + s);
569             }
570             if (!ALPHANUM.contains(s)) {
571                 throw new IllegalArgumentException("Illegal locale character in: " + s);
572             }
573         }
574         return values;
575     }
576 }