1 package org.unicode.cldr.test; 2 3 import java.util.BitSet; 4 import java.util.List; 5 6 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; 7 import org.unicode.cldr.util.CLDRConfig; 8 import org.unicode.cldr.util.CLDRFile; 9 import org.unicode.cldr.util.Factory; 10 import org.unicode.cldr.util.SupplementalDataInfo; 11 import org.unicode.cldr.util.UnicodeSetPrettyPrinter; 12 import org.unicode.cldr.util.XPathParts; 13 14 import com.ibm.icu.lang.UCharacter; 15 import com.ibm.icu.lang.UCharacterDirection; 16 import com.ibm.icu.lang.UProperty; 17 import com.ibm.icu.lang.UScript; 18 import com.ibm.icu.text.Collator; 19 import com.ibm.icu.text.UnicodeSet; 20 import com.ibm.icu.text.UnicodeSetIterator; 21 import com.ibm.icu.util.ULocale; 22 23 public class CheckExemplars extends FactoryCheckCLDR { 24 25 public static final boolean USE_PUNCTUATION = false; 26 private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true; 27 private static final String[] QUOTE_ELEMENTS = { 28 "quotationStart", "quotationEnd", 29 "alternateQuotationStart", "alternateQuotationEnd" }; 30 static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo(); 31 32 Collator col; 33 Collator spaceCol; 34 boolean isRoot; 35 UnicodeSetPrettyPrinter prettyPrinter; 36 37 static final UnicodeSet HangulSyllables = new UnicodeSet( 38 "[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze(); 39 40 public static final UnicodeSet AlwaysOK; 41 static { 42 if (USE_PUNCTUATION) { 43 AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]"); 44 } else { 45 AlwaysOK = new UnicodeSet( 46 "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" + 47 "[[؉][་ །༌][ཱ]{য়}য়]" + // TODO Fix this Hack 48 "]"); // [\\u200c-\\u200f] [:script=common:][:script=inherited:] 49 } AlwaysOK.freeze()50 AlwaysOK.freeze(); 51 } 52 // TODO Fix some of these characters 53 private static final UnicodeSet SPECIAL_ALLOW = new UnicodeSet( 54 "[\u061C\\u200E\\u200F\\u200c\\u200d" 55 + 56 "[\u064B\u064E-\u0651\u0670][:Nd:][\u0951\u0952][\u064B-\u0652\u0654-\u0657\u0670][\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652][\\u02BB\\u02BC][\u0CE6-\u0CEF][\u0966-\u096F]" 57 + 58 "[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore 59 // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] 60 ).freeze(); // add RLM, LRM [\u200C\u200D] 61 62 public static final UnicodeSet UAllowedInExemplars = new UnicodeSet("[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] 63 .removeAll(AlwaysOK) // this will remove some 64 // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them 65 // in SPECIAL_ALLOW 66 .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D] 67 .freeze(); 68 69 public static final UnicodeSet UAllowedInNumbers = new UnicodeSet("[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] 70 .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D] 71 .freeze(); 72 73 public static final UnicodeSet AllowedInExemplars = new UnicodeSet(UAllowedInExemplars) 74 .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]")) 75 .freeze(); 76 77 public static final UnicodeSet ALLOWED_IN_PUNCTUATION = new UnicodeSet("[[:P:][:S:]-[:Sc:]]") 78 .freeze(); 79 80 public static final UnicodeSet ALLOWED_IN_AUX = new UnicodeSet(AllowedInExemplars) 81 .addAll(ALLOWED_IN_PUNCTUATION) 82 .removeAll(AlwaysOK) // this will remove some 83 // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them 84 // in SPECIAL_ALLOW 85 .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D] 86 .freeze(); 87 88 public enum ExemplarType { 89 main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true), punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", 90 false), auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", 91 true), index(UAllowedInExemplars, "(specific-script - invisibles)", false), numbers(UAllowedInNumbers, "(specific-script - invisibles)", false), 92 // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", false) 93 ; 94 95 public final UnicodeSet allowed; 96 public final UnicodeSet toRemove; 97 public final String message; 98 public final boolean convertUppercase; 99 ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase)100 ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) { 101 if (!allowed.isFrozen()) { 102 throw new IllegalArgumentException("Internal Error"); 103 } 104 this.allowed = allowed; 105 this.message = message; 106 this.toRemove = new UnicodeSet(allowed).complement().freeze(); 107 this.convertUppercase = convertUppercase; 108 } 109 } 110 CheckExemplars(Factory factory)111 public CheckExemplars(Factory factory) { 112 super(factory); 113 } 114 115 // Allowed[:script=common:][:script=inherited:][:alphabetic=false:] 116 117 @Override setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)118 public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, 119 List<CheckStatus> possibleErrors) { 120 if (cldrFileToCheck == null) return this; 121 super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors); 122 String locale = cldrFileToCheck.getLocaleID(); 123 col = Collator.getInstance(new ULocale(locale)); 124 spaceCol = Collator.getInstance(new ULocale(locale)); 125 spaceCol.setStrength(Collator.PRIMARY); 126 isRoot = cldrFileToCheck.getLocaleID().equals("root"); 127 prettyPrinter = new UnicodeSetPrettyPrinter() 128 .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT)) 129 .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT) 130 .setStrength2(Collator.PRIMARY)) 131 .setCompressRanges(true); 132 133 // check for auxiliary anyway 134 if (!SUPPRESS_AUX_EMPTY_CHECK) { 135 UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary", 136 CLDRFile.WinningChoice.WINNING); 137 138 if (auxiliarySet == null) { 139 possibleErrors.add( 140 new CheckStatus().setCause(this) 141 .setMainType(CheckStatus.warningType) 142 .setSubtype(Subtype.missingAuxiliaryExemplars) 143 .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this.")); 144 } 145 } 146 return this; 147 } 148 149 @Override handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)150 public CheckCLDR handleCheck(String path, String fullPath, String value, Options options, 151 List<CheckStatus> result) { 152 if (fullPath == null) return this; // skip paths that we don't have 153 if (path.indexOf("/exemplarCharacters") < 0) { 154 if (path.contains("parseLenient")) { 155 checkParse(path, fullPath, value, options, result); 156 } 157 return this; 158 } 159 XPathParts oparts = XPathParts.getFrozenInstance(path); 160 final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type"); 161 ExemplarType type = exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString); 162 checkExemplar(value, result, type); 163 164 // check relation to auxiliary set 165 try { 166 UnicodeSet mainSet = getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING); 167 if (type == ExemplarType.auxiliary) { 168 UnicodeSet auxiliarySet = new UnicodeSet(value); 169 170 UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet); 171 checkMixedScripts("main+auxiliary", combined, result); 172 173 if (auxiliarySet.containsSome(mainSet)) { 174 UnicodeSet overlap = new UnicodeSet(mainSet).retainAll(auxiliarySet).removeAll(HangulSyllables); 175 if (overlap.size() != 0) { 176 String fixedExemplar1 = new UnicodeSetPrettyPrinter() 177 .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT)) 178 .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT) 179 .setStrength2(Collator.PRIMARY)) 180 .setCompressRanges(true) 181 .format(overlap); 182 result 183 .add(new CheckStatus() 184 .setCause(this) 185 .setMainType(CheckStatus.errorType) 186 .setSubtype(Subtype.auxiliaryExemplarsOverlap) 187 .setMessage("Auxiliary characters also exist in main: \u200E{0}\u200E", 188 new Object[] { fixedExemplar1 })); 189 } 190 } 191 } else if (type == ExemplarType.punctuation) { 192 // Check that the punctuation exemplar characters include quotation marks. 193 UnicodeSet punctuationSet = new UnicodeSet(value); 194 UnicodeSet quoteSet = new UnicodeSet(); 195 for (String element : QUOTE_ELEMENTS) { 196 quoteSet.add(getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element)); 197 } 198 if (!punctuationSet.containsAll(quoteSet)) { 199 quoteSet.removeAll(punctuationSet); 200 // go ahead and list the characters separately, with space between, for clarity. 201 StringBuilder characters = new StringBuilder(); 202 for (String item : quoteSet) { 203 if (characters.length() != 0) { 204 characters.append(" "); 205 } 206 characters.append(item); 207 } 208 // String characters = quoteSet.toPattern(false); 209 CheckStatus message = new CheckStatus().setCause(this) 210 .setMainType(CheckStatus.warningType) 211 .setSubtype(Subtype.missingPunctuationCharacters) 212 .setMessage("Punctuation exemplar characters are missing quotation marks for this locale: {0}", 213 characters); 214 result.add(message); 215 } 216 } else if (type == ExemplarType.index) { 217 // Check that the index exemplar characters are in case-completed union of main and auxiliary exemplars 218 UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING); 219 if (auxiliarySet == null) { 220 auxiliarySet = new UnicodeSet(); 221 } 222 UnicodeSet mainAndAuxAllCase = new UnicodeSet(mainSet).addAll(auxiliarySet).closeOver(UnicodeSet.ADD_CASE_MAPPINGS); 223 UnicodeSet indexBadChars = new UnicodeSet(value).removeAll(mainAndAuxAllCase); 224 225 if (!indexBadChars.isEmpty()) { 226 CheckStatus message = new CheckStatus().setCause(this) 227 .setMainType(CheckStatus.warningType) 228 .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars) 229 .setMessage("Index exemplars include characters not in main or auxiliary exemplars: {0}", 230 indexBadChars.toPattern(false)); 231 result.add(message); 232 } 233 } 234 235 // check for consistency with RTL 236 237 Boolean localeIsRTL = false; 238 String charOrientation = getResolvedCldrFileToCheck().getStringValue( 239 "//ldml/layout/orientation/characterOrder"); 240 if (charOrientation.equals("right-to-left")) { 241 localeIsRTL = true; 242 } 243 244 UnicodeSetIterator mi = new UnicodeSetIterator(mainSet); 245 while (mi.next()) { 246 if (mi.codepoint != UnicodeSetIterator.IS_STRING && 247 (UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT || 248 UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) 249 && 250 !localeIsRTL) { 251 result.add(new CheckStatus() 252 .setCause(this) 253 .setMainType(CheckStatus.errorType) 254 .setSubtype(Subtype.orientationDisagreesWithExemplars) 255 .setMessage( 256 "Main exemplar set contains RTL characters, but orientation of this locale is not RTL.")); 257 break; 258 } 259 } 260 261 } catch (Exception e) { 262 } // if these didn't parse, checkExemplar will be called anyway at some point 263 return this; 264 } 265 checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result)266 private void checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result) { 267 try { 268 XPathParts oparts = XPathParts.getFrozenInstance(path); 269 // only thing we do is make sure that the sample is in the value 270 UnicodeSet us = new UnicodeSet(value); 271 String sampleValue = oparts.getAttributeValue(-1, "sample"); 272 if (!us.contains(sampleValue)) { 273 CheckStatus message = new CheckStatus().setCause(this) 274 .setMainType(CheckStatus.errorType) 275 .setSubtype(Subtype.badParseLenient) 276 .setMessage("ParseLenient sample not in value: {0} ∌ {1}", us, sampleValue); 277 result.add(message); 278 } 279 } catch (IllegalArgumentException e) { 280 /* 281 * new UnicodeSet(value) throws IllegalArgumentException if, for example, value is null or value = "?". 282 * This can happen during cldr-unittest TestAll. 283 * path = //ldml/characters/parseLenients[@scope="general"][@level="lenient"]/parseLenient[@sample="’"] 284 * or 285 * path = //ldml/characters/parseLenients[@scope="date"][@level="lenient"]/parseLenient[@sample="-"] 286 */ 287 CheckStatus message = new CheckStatus().setCause(this) 288 .setMainType(CheckStatus.errorType) 289 .setSubtype(Subtype.badParseLenient) 290 .setMessage(e.toString() + (e.getMessage() == null ? "" : ": " + e.getMessage())); 291 result.add(message); 292 } 293 } 294 295 static final BitSet Japn = new BitSet(); 296 static final BitSet Kore = new BitSet(); 297 static { 298 Japn.set(UScript.HAN); 299 Japn.set(UScript.HIRAGANA); 300 Japn.set(UScript.KATAKANA); 301 Kore.set(UScript.HAN); 302 Kore.set(UScript.HANGUL); 303 } 304 checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result)305 private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) { 306 BitSet s = new BitSet(); 307 for (String item : set) { 308 int script = UScript.getScript(item.codePointAt(0)); 309 if (script != UScript.COMMON && script != UScript.INHERITED) { 310 s.set(script); 311 } 312 } 313 final int cardinality = s.cardinality(); 314 if (cardinality < 2) { 315 return; 316 } 317 if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) { 318 return; // allow 2 scripts in exemplars for currencies. 319 } 320 // allowable combinations 321 if (s.equals(Japn) || s.equals(Kore)) { 322 return; 323 } 324 StringBuilder scripts = new StringBuilder(); 325 for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) { 326 if (scripts.length() != 0) { 327 scripts.append(", "); 328 } 329 scripts.append(UScript.getName(i)); 330 UnicodeSet inSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set); 331 int count = 0; 332 scripts.append(" ("); 333 for (String cp : inSet) { 334 if (count != 0) { 335 scripts.append(","); 336 } 337 scripts.append(cp); 338 count++; 339 if (count > 3) { 340 scripts.append('\u2026'); 341 break; 342 } 343 } 344 scripts.append(")"); 345 } 346 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType) 347 .setSubtype(Subtype.illegalExemplarSet) 348 .setMessage("{0} exemplars contain multiple scripts: {1}", new Object[] { title, scripts })); 349 return; 350 } 351 checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType)352 private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) { 353 if (v == null) return; 354 final UnicodeSet exemplar1; 355 try { 356 exemplar1 = new UnicodeSet(v).freeze(); 357 } catch (Exception e) { 358 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType) 359 .setSubtype(Subtype.illegalExemplarSet) 360 .setMessage("This field must be a set of the form [a b c-d ...]: {0}", new Object[] { e.getMessage() })); 361 return; 362 } 363 364 // check for mixed scripts 365 366 checkMixedScripts(exemplarType.toString(), exemplar1, result); 367 368 // check that the formatting is correct 369 370 String fixedExemplar1 = prettyPrinter.format(exemplar1); 371 UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1); 372 if (!doubleCheck.equals(exemplar1)) { 373 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType) 374 .setSubtype(Subtype.internalUnicodeSetFormattingError) 375 .setMessage("Internal Error: formatting not working for {0}", new Object[] { exemplar1 })); 376 } 377 // else if (!v.equals(fixedExemplar1)) { 378 // result.add(new CheckStatus().setCause(this).setType(CheckStatus.warningType) 379 // .setMessage("Better formatting would be \u200E{0}\u200E", new Object[]{fixedExemplar1})); 380 // } 381 382 // now check that only allowed characters are in the set 383 384 if (!exemplarType.allowed.containsAll(exemplar1)) { 385 UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed); 386 387 // we do allow for punctuation & combining marks in strings 388 UnicodeSet remainder = new UnicodeSet(); 389 for (String s : remainder0) { 390 if (Character.codePointCount(s, 0, s.length()) == 1) { 391 remainder.add(s); 392 } else { 393 // just check normalization 394 } 395 } 396 397 // after a first check, we check again in case we flattened 398 399 if (remainder.size() != 0) { 400 fixedExemplar1 = prettyPrinter.format(exemplar1); 401 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType) 402 .setSubtype(Subtype.illegalCharactersInExemplars) 403 .setMessage("Should be limited to " + exemplarType.message + "; thus not contain: \u200E{0}\u200E", 404 new Object[] { remainder })); 405 } 406 } 407 408 // now check for empty 409 410 if (!isRoot && exemplar1.size() == 0) { 411 switch (exemplarType) { 412 // case currencySymbol: // ok if empty 413 // break; 414 case auxiliary: 415 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.warningType) 416 .setSubtype(Subtype.missingAuxiliaryExemplars) 417 .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this.")); 418 break; 419 case index: 420 case punctuation: 421 case main: 422 result.add(new CheckStatus() 423 .setCause(this) 424 .setMainType(CheckStatus.errorType) 425 .setSubtype(Subtype.missingMainExemplars) 426 .setMessage( 427 "Exemplar set (" + exemplarType 428 + ") must not be empty -- that would imply that this language uses no " + 429 (exemplarType == ExemplarType.punctuation ? "punctuation" : "letters") + "!")); 430 break; 431 } 432 } 433 } 434 } 435