1 /*
2  **********************************************************************
3  * Copyright (c) 2002-2011, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 package org.unicode.cldr.util;
10 
11 import java.util.ArrayList;
12 import java.util.Collection;
13 import java.util.EnumSet;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Set;
17 import java.util.TreeSet;
18 
19 import com.ibm.icu.impl.Utility;
20 import com.ibm.icu.text.UnicodeSet;
21 
22 public class LocaleIDParser {
23     /**
24      * @return Returns the language.
25      */
getLanguage()26     public String getLanguage() {
27         return language;
28     }
29 
30     /**
31      * @return Returns the language.
32      */
getLanguageScript()33     public String getLanguageScript() {
34         if (script.length() != 0) return language + "_" + script;
35         return language;
36     }
37 
getLanguageScript(Collection<String> in)38     public static Set<String> getLanguageScript(Collection<String> in) {
39         return getLanguageScript(in, null);
40     }
41 
getLanguageScript(Collection<String> in, Set<String> output)42     public static Set<String> getLanguageScript(Collection<String> in, Set<String> output) {
43         if (output == null) output = new TreeSet<>();
44         LocaleIDParser lparser = new LocaleIDParser();
45         for (Iterator<String> it = in.iterator(); it.hasNext();) {
46             output.add(lparser.set(it.next()).getLanguageScript());
47         }
48         return output;
49     }
50 
51     /**
52      * @return Returns the region.
53      */
getRegion()54     public String getRegion() {
55         return region;
56     }
57 
58     /**
59      * @return Returns the script.
60      */
getScript()61     public String getScript() {
62         return script;
63     }
64 
65     /**
66      * @return Returns the variants.
67      */
getVariants()68     public String[] getVariants() {
69         return variants.clone();
70     }
71 
72     // TODO, update to RFC3066
73     // http://www.inter-locale.com/ID/draft-phillips-langtags-08.html
74     private String language;
75     private String script;
76     private String region;
77     private String[] variants;
78 
79     static final UnicodeSet letters = new UnicodeSet("[a-zA-Z]");
80     static final UnicodeSet digits = new UnicodeSet("[0-9]");
81 
set(String localeID)82     public LocaleIDParser set(String localeID) {
83         region = script = "";
84         variants = new String[0];
85 
86         String[] pieces = new String[100]; // fix limitation later
87         Utility.split(localeID, '_', pieces);
88         int i = 0;
89         language = pieces[i++];
90         if (i >= pieces.length) return this;
91         if (pieces[i].length() == 4) {
92             script = pieces[i++];
93             if (i >= pieces.length) return this;
94         }
95         if (pieces[i].length() == 2 && letters.containsAll(pieces[i])
96             || pieces[i].length() == 3 && digits.containsAll(pieces[i])) {
97             region = pieces[i++];
98             if (i >= pieces.length) return this;
99         }
100         List<String> al = new ArrayList<>();
101         while (i < pieces.length && pieces[i].length() > 0) {
102             al.add(pieces[i++]);
103         }
104         variants = new String[al.size()];
105         al.toArray(variants);
106         return this;
107     }
108 
109     /**
110      * Get the parent of a locale. If the input is "root", then return null.
111      * For example, if localeName is "fr_CA", return "fr".
112      *
113      * Only works on canonical locale names (right casing, etc.)!
114      *
115      * Formerly this function returned an empty string when localeName was "_VETTING".
116      * Now it returns "root" where it would have returned an empty string.
117      * TODO: explain "__VETTING", somehow related to SUMMARY_LOCALE. Note that
118      * CLDRLocale.process() changes "__" to "_" before this function is called.
119      * Reference: https://unicode-org.atlassian.net/browse/CLDR-13133
120      */
getParent(String localeName)121     public static String getParent(String localeName) {
122         int pos = localeName.lastIndexOf('_');
123         if (pos >= 0) {
124             SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
125             String explicitParent = sdi.getExplicitParentLocale(localeName);
126             if (explicitParent != null) {
127                 return explicitParent;
128             }
129             String truncated = localeName.substring(0, pos);
130             // if the final item is a script, and it is not the default content, then go directly to root
131             int pos2 = getScriptPosition(localeName);
132             if (pos2 > 0) {
133                 String script = localeName.substring(pos + 1);
134                 String defaultScript = sdi.getDefaultScript(truncated);
135                 if (!script.equals(defaultScript)) {
136                     return "root";
137                 }
138             }
139             if (truncated.length() == 0) {
140                 return "root";
141             }
142             return truncated;
143         }
144         if (localeName.equals("root")) {
145             return null;
146         }
147         return "root";
148     }
149 
150     /**
151      * If the locale consists of baseLanguage+script, return the position of the separator, otherwise -1.
152      * @param s
153      */
getScriptPosition(String locale)154     public static int getScriptPosition(String locale) {
155         int pos = locale.indexOf('_');
156         if (pos >= 0 && pos + 5 == locale.length()) {
157             int pos2 = locale.indexOf('_', pos + 1);
158             if (pos2 < 0) {
159                 return pos;
160             }
161         }
162         return -1;
163     }
164 
165     /**
166      * Utility to get the simple parent of a locale. If the input is "root", then the output is null.
167      * This method is similar to the getParent() method above, except that it does NOT pay any attention
168      * to the explicit parent locales information. Thus, getParent("zh_Hant") will return "root",
169      * but getSimpleParent("zh_Hant") would return "zh".
170      */
getSimpleParent(String localeName)171     public static String getSimpleParent(String localeName) {
172         int pos = localeName.lastIndexOf('_');
173         if (pos >= 0) {
174             return localeName.substring(0, pos);
175         }
176         if (localeName.equals("root") || localeName.equals(CLDRFile.SUPPLEMENTAL_NAME)) return null;
177         return "root";
178     }
179 
setLanguage(String language)180     public LocaleIDParser setLanguage(String language) {
181         this.language = language;
182         return this;
183     }
184 
setRegion(String region)185     public LocaleIDParser setRegion(String region) {
186         this.region = region;
187         return this;
188     }
189 
setScript(String script)190     public LocaleIDParser setScript(String script) {
191         this.script = script;
192         return this;
193     }
194 
setVariants(String[] variants)195     public LocaleIDParser setVariants(String[] variants) {
196         this.variants = variants.clone();
197         return this;
198     }
199 
200     public enum Level {
201         Language, Script, Region, Variants, Other
202     }
203 
204     /**
205      * Returns an int mask indicating the level
206      *
207      * @return (2 if script is present) + (4 if region is present) + (8 if region is present)
208      */
getLevels()209     public Set<Level> getLevels() {
210         EnumSet<Level> result = EnumSet.of(Level.Language);
211         if (getScript().length() != 0) result.add(Level.Script);
212         if (getRegion().length() != 0) result.add(Level.Region);
213         if (getVariants().length != 0) result.add(Level.Variants);
214         return result;
215     }
216 
getSiblings(Set<String> set)217     public Set<String> getSiblings(Set<String> set) {
218         Set<Level> myLevel = getLevels();
219         String localeID = toString();
220         String parentID = getParent(localeID);
221 
222         String prefix = (parentID == null || "root".equals(parentID)) ? "" : parentID + "_";
223         Set<String> siblings = new TreeSet<>();
224         for (String id : set) {
225             if (id.startsWith(prefix) && set(id).getLevels().equals(myLevel)) {
226                 siblings.add(id);
227             }
228         }
229         set(localeID); // leave in known state
230         return siblings;
231     }
232 
233     @Override
toString()234     public String toString() {
235         StringBuffer result = new StringBuffer(language);
236         if (script.length() != 0) result.append('_').append(script);
237         if (region.length() != 0) result.append('_').append(region);
238         if (variants != null) {
239             for (int i = 0; i < variants.length; ++i) {
240                 result.append('_').append(variants[i]);
241             }
242         }
243         return result.toString();
244     }
245 }