1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2015-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl.locale;
10 
11 import java.util.Arrays;
12 import java.util.EnumSet;
13 import java.util.HashSet;
14 import java.util.Set;
15 import java.util.regex.Pattern;
16 
17 import com.ibm.icu.impl.ValidIdentifiers;
18 import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
19 import com.ibm.icu.impl.ValidIdentifiers.Datatype;
20 import com.ibm.icu.impl.locale.KeyTypeData.ValueType;
21 import com.ibm.icu.util.IllformedLocaleException;
22 import com.ibm.icu.util.Output;
23 import com.ibm.icu.util.ULocale;
24 
25 /**
26  * @author markdavis
27  *
28  */
29 public class LocaleValidityChecker {
    /** Data subtypes an identifier may belong to; handed to {@code ValidIdentifiers.isValid} on every lookup. */
    private final Set<Datasubtype> datasubtypes;
    /** True when {@code Datasubtype.deprecated} is one of {@link #datasubtypes}; when false, deprecated extension keys and types are rejected. */
    private final boolean allowsDeprecated;
32     public static class Where {
33         public Datatype fieldFailure;
34         public String codeFailure;
35 
set(Datatype datatype, String code)36         public boolean set(Datatype datatype, String code) {
37             fieldFailure = datatype;
38             codeFailure = code;
39             return false;
40         }
41         @Override
toString()42         public String toString() {
43             return fieldFailure == null ? "OK" : "{" + fieldFailure + ", " + codeFailure + "}";
44         }
45     }
46 
LocaleValidityChecker(Set<Datasubtype> datasubtypes)47     public LocaleValidityChecker(Set<Datasubtype> datasubtypes) {
48         this.datasubtypes = EnumSet.copyOf(datasubtypes);
49         allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated);
50     }
51 
LocaleValidityChecker(Datasubtype... datasubtypes)52     public LocaleValidityChecker(Datasubtype... datasubtypes) {
53         this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes));
54         allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated);
55     }
56 
57     /**
58      * @return the datasubtypes
59      */
getDatasubtypes()60     public Set<Datasubtype> getDatasubtypes() {
61         return EnumSet.copyOf(datasubtypes);
62     }
63 
64     static Pattern SEPARATOR = Pattern.compile("[-_]");
65 
66     @SuppressWarnings("unused")
67     private static final Pattern VALID_X = Pattern.compile("[a-zA-Z0-9]{2,8}(-[a-zA-Z0-9]{2,8})*");
68 
isValid(ULocale locale, Where where)69     public boolean isValid(ULocale locale, Where where) {
70         where.set(null, null);
71         final String language = locale.getLanguage();
72         final String script = locale.getScript();
73         final String region = locale.getCountry();
74         final String variantString = locale.getVariant();
75         final Set<Character> extensionKeys = locale.getExtensionKeys();
76         //        if (language.isEmpty()) {
77         //            // the only case where this is valid is if there is only an 'x' extension string
78         //            if (!script.isEmpty() || !region.isEmpty() || variantString.isEmpty()
79         //                    || extensionKeys.size() != 1 || !extensionKeys.contains('x')) {
80         //                return where.set(Datatype.x, "Null language only with x-...");
81         //            }
82         //            return true; // for x string, wellformedness = valid
83         //        }
84         if (!isValid(Datatype.language, language, where)) {
85             // special case x
86             if (language.equals("x")) {
87                 where.set(null, null); // for x, well-formed == valid
88                 return true;
89             }
90             return false;
91         }
92         if (!isValid(Datatype.script, script, where)) return false;
93         if (!isValid(Datatype.region, region, where)) return false;
94         if (!variantString.isEmpty()) {
95             for (String variant : SEPARATOR.split(variantString)) {
96                 if (!isValid(Datatype.variant, variant, where)) return false;
97             }
98         }
99         for (Character c : extensionKeys) {
100             try {
101                 Datatype datatype = Datatype.valueOf(c+"");
102                 switch (datatype) {
103                 case x:
104                     return true; // if it is syntactic (checked by ULocale) it is valid
105                 case t:
106                 case u:
107                     if (!isValidU(locale, datatype, locale.getExtension(c), where)) return false;
108                     break;
109                 default:
110                     break;
111                 }
112             } catch (Exception e) {
113                 return where.set(Datatype.illegal, c+"");
114             }
115         }
116         return true;
117     }
118 
119     // TODO combine this with the KeyTypeData.SpecialType, and get it from the type, not the key
120     enum SpecialCase {
121         normal, anything, reorder, codepoints, subdivision, rgKey;
get(String key)122         static SpecialCase get(String key) {
123             if (key.equals("kr")) {
124                 return reorder;
125             } else if (key.equals("vt")) {
126                 return codepoints;
127             } else if (key.equals("sd")) {
128                 return subdivision;
129             } else if (key.equals("rg")) {
130                 return rgKey;
131             } else if (key.equals("x0")) {
132                 return anything;
133             } else {
134                 return normal;
135             }
136         }
137     }
138 
139     /**
140      * @param locale
141      * @param datatype
142      * @param extension
143      * @param where
144      * @return
145      */
isValidU(ULocale locale, Datatype datatype, String extensionString, Where where)146     private boolean isValidU(ULocale locale, Datatype datatype, String extensionString, Where where) {
147         String key = "";
148         int typeCount = 0;
149         ValueType valueType = null;
150         SpecialCase specialCase = null;
151         StringBuilder prefix = new StringBuilder();
152         Set<String> seen = new HashSet<String>();
153 
154         StringBuilder tBuffer = datatype == Datatype.t ? new StringBuilder() : null;
155 
156         // TODO: is empty -u- valid?
157 
158         for (String subtag : SEPARATOR.split(extensionString)) {
159             if (subtag.length() == 2
160                     && (tBuffer == null || subtag.charAt(1) <= '9')) {
161                 // if we have accumulated a t buffer, check that first
162                 if (tBuffer != null) {
163                     // Check t buffer. Empty after 't' is ok.
164                     if (tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(),where)) {
165                         return false;
166                     }
167                     tBuffer = null;
168                 }
169                 key = KeyTypeData.toBcpKey(subtag);
170                 if (key == null) {
171                     return where.set(datatype, subtag);
172                 }
173                 if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
174                     return where.set(datatype, key);
175                 }
176                 valueType = KeyTypeData.getValueType(key);
177                 specialCase = SpecialCase.get(key);
178                 typeCount = 0;
179             } else if (tBuffer != null) {
180                 if (tBuffer.length() != 0) {
181                     tBuffer.append('-');
182                 }
183                 tBuffer.append(subtag);
184             } else {
185                 ++typeCount;
186                 switch (valueType) {
187                 case single:
188                     if (typeCount > 1) {
189                         return where.set(datatype, key+"-"+subtag);
190                     }
191                     break;
192                 case incremental:
193                     if (typeCount == 1) {
194                         prefix.setLength(0);
195                         prefix.append(subtag);
196                     } else {
197                         prefix.append('-').append(subtag);
198                         subtag = prefix.toString();
199                     }
200                     break;
201                 case multiple:
202                     if (typeCount == 1) {
203                         seen.clear();
204                     }
205                     break;
206                 default:
207                     break;
208                 }
209                 switch (specialCase) {
210                 case anything:
211                     continue;
212                 case codepoints:
213                     try {
214                         if (Integer.parseInt(subtag,16) > 0x10FFFF) {
215                             return where.set(datatype, key+"-"+subtag);
216                         }
217                     } catch (NumberFormatException e) {
218                         return where.set(datatype, key+"-"+subtag);
219                     }
220                     continue;
221                 case reorder:
222                     boolean newlyAdded = seen.add(subtag.equals("zzzz") ? "others" : subtag);
223                     if (!newlyAdded || !isScriptReorder(subtag)) {
224                         return where.set(datatype, key+"-"+subtag);
225                     }
226                     continue;
227                 case subdivision:
228                     if (!isSubdivision(locale, subtag)) {
229                         return where.set(datatype, key+"-"+subtag);
230                     }
231                     continue;
232                 case rgKey:
233                     if (subtag.length() < 6 || !subtag.endsWith("zzzz")) {
234                         return where.set(datatype, subtag);
235                     }
236                     if (!isValid(Datatype.region, subtag.substring(0,subtag.length()-4), where)) {
237                         return false;
238                     }
239                     continue;
240                 default:
241                     break;
242                 }
243 
244                 // en-u-sd-usca
245                 // en-US-u-sd-usca
246                 Output<Boolean> isKnownKey = new Output<Boolean>();
247                 Output<Boolean> isSpecialType = new Output<Boolean>();
248                 String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType);
249                 if (type == null) {
250                     return where.set(datatype, key+"-"+subtag);
251                 }
252                 if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) {
253                     return where.set(datatype, key+"-"+subtag);
254                 }
255             }
256         }
257         // Check t buffer. Empty after 't' is ok.
258         if (tBuffer != null && tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(),where)) {
259             return false;
260         }
261         return true;
262     }
263 
264     /**
265      * @param locale
266      * @param subtag
267      * @return
268      */
isSubdivision(ULocale locale, String subtag)269     private boolean isSubdivision(ULocale locale, String subtag) {
270         // First check if the subtag is valid
271         if (subtag.length() < 3) {
272             return false;
273         }
274         String region = subtag.substring(0, subtag.charAt(0) <= '9' ? 3 : 2);
275         String subdivision = subtag.substring(region.length());
276         if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) {
277             return false;
278         }
279         // Then check for consistency with the locale's region
280         String localeRegion = locale.getCountry();
281         if (localeRegion.isEmpty()) {
282             ULocale max = ULocale.addLikelySubtags(locale);
283             localeRegion = max.getCountry();
284         }
285         if (!region.equalsIgnoreCase(localeRegion)) {
286             return false;
287         }
288         return true;
289     }
290 
291     static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others", "zzzz"));
292     static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy"));
293     static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular);
294     /**
295      * @param subtag
296      * @return
297      */
isScriptReorder(String subtag)298     private boolean isScriptReorder(String subtag) {
299         subtag = AsciiUtil.toLowerString(subtag);
300         if (REORDERING_INCLUDE.contains(subtag)) {
301             return true;
302         } else if (REORDERING_EXCLUDE.contains(subtag)) {
303             return false;
304         }
305         return ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, subtag) != null;
306         //        space, punct, symbol, currency, digit - core groups of characters below 'a'
307         //        any script code except Common and Inherited.
308         //      sc ; Zinh                             ; Inherited                        ; Qaai
309         //      sc ; Zyyy                             ; Common
310         //        Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are are always reordered with Hiragana.
311         //        others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others.        return false;
312     }
313 
314     /**
315      * @param extensionString
316      * @param where
317      * @return
318      */
isValidLocale(String extensionString, Where where)319     private boolean isValidLocale(String extensionString, Where where) {
320         try {
321             ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
322             return isValid(locale, where);
323         } catch (IllformedLocaleException e) {
324             int startIndex = e.getErrorIndex();
325             String[] list = SEPARATOR.split(extensionString.substring(startIndex));
326             return where.set(Datatype.t, list[0]);
327         } catch (Exception e) {
328             return where.set(Datatype.t, e.getMessage());
329         }
330     }
331 
332     /**
333      * @param datatype
334      * @param code
335      * @param where
336      * @return
337      */
isValid(Datatype datatype, String code, Where where)338     private boolean isValid(Datatype datatype, String code, Where where) {
339         if (code.isEmpty()) {
340             return true;
341         }
342 
343         // Note:
344         // BCP 47 -u- locale extension '-u-va-posix' is mapped to variant 'posix' automatically.
345         // For example, ULocale.forLanguageTag("en-u-va-posix").getVariant() returns "posix".
346         // This is only the exceptional case when -u- locale extension is mapped to a subtag type
347         // other than keyword.
348         //
349         // The locale validity data is based on IANA language subtag registry data and "posix"
350         // is not a valid variant. So we need to handle this specific case here. There are no
351         // othe exceptions.
352         if (datatype == Datatype.variant && "posix".equalsIgnoreCase(code)) {
353             return true;
354         }
355 
356         return ValidIdentifiers.isValid(datatype, datasubtypes, code) != null ?
357                 true : (where == null ? false : where.set(datatype, code));
358     }
359 }
360