// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2015-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.impl.locale;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import com.ibm.icu.impl.ValidIdentifiers;
import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
import com.ibm.icu.impl.ValidIdentifiers.Datatype;
import com.ibm.icu.impl.locale.KeyTypeData.ValueType;
import com.ibm.icu.util.IllformedLocaleException;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;

/**
 * Checks whether the fields of a {@link ULocale} (language, script, region, variants and the
 * -t-/-u-/-x- extensions) are valid with respect to a chosen set of {@link Datasubtype}s.
 *
 * <p>On failure, the offending field and code are recorded in a caller-supplied {@link Where}.
 *
 * @author markdavis
 */
public class LocaleValidityChecker {
    /** Private copy of the data subtypes that this checker accepts. */
    private final Set<Datasubtype> datasubtypes;

    /** True when {@link Datasubtype#deprecated} is among the accepted subtypes. */
    private final boolean allowsDeprecated;

    /**
     * Receives the reason for a validation failure: which kind of field failed and the code
     * that was rejected. Both fields are null when no failure has been recorded.
     */
    public static class Where {
        public Datatype fieldFailure;
        public String codeFailure;

        /**
         * Records a failure location.
         *
         * @param datatype the kind of field that failed (null to clear)
         * @param code the rejected code (null to clear)
         * @return always false, so that callers can write {@code return where.set(...)}
         */
        public boolean set(Datatype datatype, String code) {
            fieldFailure = datatype;
            codeFailure = code;
            return false;
        }

        /** Returns "OK" when no failure is recorded, otherwise "{field, code}". */
        @Override
        public String toString() {
            return fieldFailure == null ? "OK" : "{" + fieldFailure + ", " + codeFailure + "}";
        }
    }

    /**
     * Creates a checker that accepts the given data subtypes.
     *
     * @param datasubtypes the accepted subtypes; copied, and may be empty
     */
    public LocaleValidityChecker(Set<Datasubtype> datasubtypes) {
        // EnumSet.copyOf(Collection) throws IllegalArgumentException for an empty collection
        // that is not itself an EnumSet, so start from an empty EnumSet and add to it.
        EnumSet<Datasubtype> copy = EnumSet.noneOf(Datasubtype.class);
        copy.addAll(datasubtypes);
        this.datasubtypes = copy;
        allowsDeprecated = copy.contains(Datasubtype.deprecated);
    }

    /**
     * Creates a checker that accepts the given data subtypes.
     *
     * @param datasubtypes the accepted subtypes; may be empty
     */
    public LocaleValidityChecker(Datasubtype... datasubtypes) {
        // See the Set-based constructor: EnumSet.copyOf would reject an empty argument list.
        EnumSet<Datasubtype> copy = EnumSet.noneOf(Datasubtype.class);
        copy.addAll(Arrays.asList(datasubtypes));
        this.datasubtypes = copy;
        allowsDeprecated = copy.contains(Datasubtype.deprecated);
    }

    /**
     * @return a fresh copy of the accepted datasubtypes
     */
    public Set<Datasubtype> getDatasubtypes() {
        return EnumSet.copyOf(datasubtypes);
    }

    /** Matches a single subtag separator, either '-' or '_'. */
    static final Pattern SEPARATOR = Pattern.compile("[-_]");

    @SuppressWarnings("unused")
    private static final Pattern VALID_X = Pattern.compile("[a-zA-Z0-9]{2,8}(-[a-zA-Z0-9]{2,8})*");

    /**
     * Validates every field of the locale.
     *
     * @param locale the locale to check
     * @param where receives the failing field and code; it is cleared first. May be null when
     *     the caller only needs the boolean result.
     * @return true if all fields are valid, false otherwise
     */
    public boolean isValid(ULocale locale, Where where) {
        // The per-field check tolerates a null Where; do the same here by recording into a
        // throw-away instance instead of failing with a NullPointerException.
        final Where report = where != null ? where : new Where();
        report.set(null, null);

        final String language = locale.getLanguage();
        final String script = locale.getScript();
        final String region = locale.getCountry();
        final String variantString = locale.getVariant();
        final Set<Character> extensionKeys = locale.getExtensionKeys();

        // An empty language passes here because the per-field check accepts empty codes.
        if (!isValid(Datatype.language, language, report)) {
            // special case x
            if (language.equals("x")) {
                report.set(null, null); // for x, well-formed == valid
                return true;
            }
            return false;
        }
        if (!isValid(Datatype.script, script, report)) {
            return false;
        }
        if (!isValid(Datatype.region, region, report)) {
            return false;
        }
        if (!variantString.isEmpty()) {
            for (String variant : SEPARATOR.split(variantString)) {
                if (!isValid(Datatype.variant, variant, report)) {
                    return false;
                }
            }
        }
        for (Character c : extensionKeys) {
            try {
                Datatype datatype = Datatype.valueOf(String.valueOf(c));
                switch (datatype) {
                case x:
                    return true; // if it is syntactic (checked by ULocale) it is valid
                case t:
                case u:
                    if (!isValidU(locale, datatype, locale.getExtension(c), report)) {
                        return false;
                    }
                    break;
                default:
                    break;
                }
            } catch (Exception e) {
                // Enum.valueOf throws IllegalArgumentException for a letter with no Datatype
                // constant. Any other failure while checking the extension is deliberately
                // reported the same way rather than propagated.
                return report.set(Datatype.illegal, String.valueOf(c));
            }
        }
        return true;
    }

    // TODO combine this with the KeyTypeData.SpecialType, and get it from the type, not the key
    /** Keys whose values are checked by dedicated logic instead of the key/type tables. */
    enum SpecialCase {
        normal, anything, reorder, codepoints, subdivision, rgKey;

        /**
         * @param key a BCP 47 extension key
         * @return the special handling for that key, or {@link #normal} if there is none
         */
        static SpecialCase get(String key) {
            if (key.equals("kr")) {
                return reorder;
            } else if (key.equals("vt")) {
                return codepoints;
            } else if (key.equals("sd")) {
                return subdivision;
            } else if (key.equals("rg")) {
                return rgKey;
            } else if (key.equals("x0")) {
                return anything;
            } else {
                return normal;
            }
        }
    }

    /**
     * Validates the contents of a -t- or -u- extension.
     *
     * @param locale the locale that carries the extension (used for the subdivision check)
     * @param datatype {@link Datatype#t} or {@link Datatype#u}
     * @param extensionString the extension contents, without the leading singleton
     * @param where receives the failing field and code
     * @return true if every key and value in the extension is valid
     */
    private boolean isValidU(ULocale locale, Datatype datatype, String extensionString, Where where) {
        String key = "";
        int typeCount = 0;
        ValueType valueType = null;
        SpecialCase specialCase = null;
        StringBuilder prefix = new StringBuilder();
        Set<String> seen = new HashSet<String>();

        // For -t-, subtags before the first key are collected and validated as a locale.
        StringBuilder tBuffer = datatype == Datatype.t ? new StringBuilder() : null;

        // TODO: is empty -u- valid?

        for (String subtag : SEPARATOR.split(extensionString)) {
            // A two-character subtag starts a new key. While the -t- locale is still being
            // collected, only one whose second character is <= '9' counts as a key.
            if (subtag.length() == 2
                    && (tBuffer == null || subtag.charAt(1) <= '9')) {
                // if we have accumulated a t buffer, check that first
                if (tBuffer != null) {
                    // Check t buffer. Empty after 't' is ok.
                    if (tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(), where)) {
                        return false;
                    }
                    tBuffer = null;
                }
                key = KeyTypeData.toBcpKey(subtag);
                if (key == null) {
                    return where.set(datatype, subtag);
                }
                if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
                    return where.set(datatype, key);
                }
                valueType = KeyTypeData.getValueType(key);
                specialCase = SpecialCase.get(key);
                typeCount = 0;
            } else if (tBuffer != null) {
                if (tBuffer.length() != 0) {
                    tBuffer.append('-');
                }
                tBuffer.append(subtag);
            } else {
                if (valueType == null) {
                    // A value appeared before any key (for example a -u- attribute). This
                    // used to dereference the null value type and relied on the caller's
                    // generic catch; report the offending subtag explicitly instead.
                    return where.set(datatype, subtag);
                }
                ++typeCount;
                // The value to validate; for incremental keys this is the accumulated prefix.
                String value = subtag;
                switch (valueType) {
                case single:
                    if (typeCount > 1) {
                        return where.set(datatype, key + "-" + value);
                    }
                    break;
                case incremental:
                    if (typeCount == 1) {
                        prefix.setLength(0);
                        prefix.append(value);
                    } else {
                        prefix.append('-').append(value);
                        value = prefix.toString();
                    }
                    break;
                case multiple:
                    if (typeCount == 1) {
                        seen.clear();
                    }
                    break;
                default:
                    break;
                }
                switch (specialCase) {
                case anything:
                    continue;
                case codepoints:
                    try {
                        if (Integer.parseInt(value, 16) > 0x10FFFF) {
                            return where.set(datatype, key + "-" + value);
                        }
                    } catch (NumberFormatException e) {
                        return where.set(datatype, key + "-" + value);
                    }
                    continue;
                case reorder:
                    // "zzzz" is recorded as "others" so that the two count as duplicates.
                    boolean newlyAdded = seen.add(value.equals("zzzz") ? "others" : value);
                    if (!newlyAdded || !isScriptReorder(value)) {
                        return where.set(datatype, key + "-" + value);
                    }
                    continue;
                case subdivision:
                    if (!isSubdivision(locale, value)) {
                        return where.set(datatype, key + "-" + value);
                    }
                    continue;
                case rgKey:
                    // Must be a region code followed by the literal suffix "zzzz".
                    if (value.length() < 6 || !value.endsWith("zzzz")) {
                        return where.set(datatype, value);
                    }
                    if (!isValid(Datatype.region, value.substring(0, value.length() - 4), where)) {
                        return false;
                    }
                    continue;
                default:
                    break;
                }

                // en-u-sd-usca
                // en-US-u-sd-usca
                Output<Boolean> isKnownKey = new Output<Boolean>();
                Output<Boolean> isSpecialType = new Output<Boolean>();
                String type = KeyTypeData.toBcpType(key, value, isKnownKey, isSpecialType);
                if (type == null) {
                    return where.set(datatype, key + "-" + value);
                }
                if (!allowsDeprecated && KeyTypeData.isDeprecated(key, value)) {
                    return where.set(datatype, key + "-" + value);
                }
            }
        }
        // Check t buffer. Empty after 't' is ok.
        if (tBuffer != null && tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(), where)) {
            return false;
        }
        return true;
    }

    /**
     * Checks a subdivision value (region code immediately followed by a subdivision code).
     *
     * @param locale the locale whose region the subdivision must agree with
     * @param subtag the subdivision value, for example "usca"
     * @return true if the subdivision is valid and its region matches the locale's region
     */
    private boolean isSubdivision(ULocale locale, String subtag) {
        // First check if the subtag is valid
        if (subtag.length() < 3) {
            return false;
        }
        // A leading character <= '9' means a 3-character region prefix, otherwise 2.
        String region = subtag.substring(0, subtag.charAt(0) <= '9' ? 3 : 2);
        String subdivision = subtag.substring(region.length());
        if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) {
            return false;
        }
        // Then check for consistency with the locale's region
        String localeRegion = locale.getCountry();
        if (localeRegion.isEmpty()) {
            ULocale max = ULocale.addLikelySubtags(locale);
            localeRegion = max.getCountry();
        }
        return region.equalsIgnoreCase(localeRegion);
    }

    static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others", "zzzz"));
    static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy"));
    static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular);

    /**
     * Checks a single reorder code.
     *
     * @param subtag the reorder code; compared case-insensitively for ASCII letters
     * @return true for the codes in {@link #REORDERING_INCLUDE} and for any regular script code
     *     other than those in {@link #REORDERING_EXCLUDE}
     */
    private boolean isScriptReorder(String subtag) {
        // space, punct, symbol, currency, digit - core groups of characters below 'a'
        // any script code except Common and Inherited.
        // sc ; Zinh ; Inherited ; Qaai
        // sc ; Zyyy ; Common
        // Some pairs of scripts sort primary-equal and always reorder together. For example,
        // Katakana characters are always reordered with Hiragana.
        // others - where all codes not explicitly mentioned should be ordered. The script code
        // Zzzz (Unknown Script) is a synonym for others.
        subtag = AsciiUtil.toLowerString(subtag);
        if (REORDERING_INCLUDE.contains(subtag)) {
            return true;
        } else if (REORDERING_EXCLUDE.contains(subtag)) {
            return false;
        }
        return ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, subtag) != null;
    }

    /**
     * Validates the locale embedded at the start of a -t- extension.
     *
     * @param extensionString the embedded language tag
     * @param where receives the failing field and code
     * @return true if the tag is well-formed and valid
     */
    private boolean isValidLocale(String extensionString, Where where) {
        try {
            ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
            return isValid(locale, where);
        } catch (IllformedLocaleException e) {
            // The error index may be negative when it is unknown; only use it when it is a
            // usable offset, otherwise report from the start of the tag.
            int startIndex = e.getErrorIndex();
            String remainder = (startIndex >= 0 && startIndex <= extensionString.length())
                    ? extensionString.substring(startIndex)
                    : extensionString;
            // split() drops trailing empty strings, so separator-only input yields no elements.
            String[] list = SEPARATOR.split(remainder);
            return where.set(Datatype.t, list.length > 0 ? list[0] : remainder);
        } catch (Exception e) {
            return where.set(Datatype.t, e.getMessage());
        }
    }

    /**
     * Validates a single code of the given type against the accepted datasubtypes.
     *
     * @param datatype the kind of code
     * @param code the code to check; an empty code is always valid
     * @param where receives the failing field and code; may be null
     * @return true if the code is valid
     */
    private boolean isValid(Datatype datatype, String code, Where where) {
        if (code.isEmpty()) {
            return true;
        }

        // Note:
        // BCP 47 -u- locale extension '-u-va-posix' is mapped to variant 'posix' automatically.
        // For example, ULocale.forLanguageTag("en-u-va-posix").getVariant() returns "posix".
        // This is the only exceptional case where a -u- locale extension is mapped to a subtag
        // type other than keyword.
        //
        // The locale validity data is based on IANA language subtag registry data and "posix"
        // is not a valid variant. So we need to handle this specific case here. There are no
        // other exceptions.
        if (datatype == Datatype.variant && "posix".equalsIgnoreCase(code)) {
            return true;
        }

        if (ValidIdentifiers.isValid(datatype, datasubtypes, code) != null) {
            return true;
        }
        // Where.set always returns false; a null Where just skips recording the failure.
        return where != null && where.set(datatype, code);
    }
}