package org.unicode.cldr.test;

import java.util.BitSet;
import java.util.List;

import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
import org.unicode.cldr.util.XPathParts;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;

/**
 * Check for paths containing {@code /exemplarCharacters} and for {@code parseLenient} paths.
 *
 * <p>For exemplar paths the value is parsed as a {@link UnicodeSet} and checked for: mixed scripts,
 * characters outside the set allowed for the exemplar type, emptiness, overlap between auxiliary and
 * main, missing quotation marks in punctuation, index characters not covered by main/auxiliary, and
 * right-to-left characters in a locale whose character order is not right-to-left.
 *
 * <p>For {@code parseLenient} paths the only check is that the {@code sample} attribute is contained
 * in the value.
 */
public class CheckExemplars extends FactoryCheckCLDR {

    public static final boolean USE_PUNCTUATION = false;
    /** When true, {@link #setCldrFileToCheck} does not warn about a missing auxiliary set. */
    private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true;
    /** Child elements of {@code //ldml/delimiters/} whose values must appear in the punctuation exemplars. */
    private static final String[] QUOTE_ELEMENTS = {
        "quotationStart", "quotationEnd",
        "alternateQuotationStart", "alternateQuotationEnd" };
    /** Warning text shared by both places that report a missing/empty auxiliary set. */
    private static final String AUXILIARY_REVIEW_MESSAGE =
        "Most languages allow <i>some</i> auxiliary characters, so review this.";
    static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo();

    Collator col;
    // NOTE(review): spaceCol is assigned in setCldrFileToCheck but never read in this class; the
    // pretty printer's space comparator is built from col instead. Confirm whether that is intended.
    Collator spaceCol;
    boolean isRoot;
    UnicodeSetPrettyPrinter prettyPrinter;

    /** Hangul syllables (types LV and LVT); these are exempt from the auxiliary/main overlap error. */
    static final UnicodeSet HangulSyllables = new UnicodeSet(
        "[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze();

    /** Characters removed from the "allowed" sets below before SPECIAL_ALLOW is added back. */
    public static final UnicodeSet AlwaysOK;
    static {
        if (USE_PUNCTUATION) {
            AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]");
        } else {
            AlwaysOK = new UnicodeSet(
                "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" +
                    "[[؉][་ །༌][ཱ]{য়}য়]" + // TODO Fix this Hack
                    "]"); // [\\u200c-\\u200f] [:script=common:][:script=inherited:]
        }
        AlwaysOK.freeze();
    }

    // TODO Fix some of these characters
    /**
     * Characters added back after {@link #AlwaysOK} has been removed, including the word_break
     * Katakana/ALetter/MidLetter classes, ALM/LRM/RLM and ZWNJ/ZWJ.
     */
    private static final UnicodeSet SPECIAL_ALLOW = new UnicodeSet(
        "[\u061C\\u200E\\u200F\\u200c\\u200d"
            +
            "[\u064B\u064E-\u0651\u0670][:Nd:][\u0951\u0952][\u064B-\u0652\u0654-\u0657\u0670][\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652][\\u02BB\\u02BC][\u0CE6-\u0CEF][\u0966-\u096F]"
            +
            "[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore the word_break classes
    ).freeze();

    /** All assigned, non-separator characters, minus {@link #AlwaysOK}, plus SPECIAL_ALLOW. */
    public static final UnicodeSet UAllowedInExemplars = new UnicodeSet("[[:assigned:]-[:Z:]]")
        // removing AlwaysOK also removes some word_break Katakana/ALetter/MidLetter characters,
        // so they are restored through SPECIAL_ALLOW
        .removeAll(AlwaysOK)
        .addAll(SPECIAL_ALLOW)
        .freeze();

    /** Characters allowed in the {@code numbers} exemplar set. */
    public static final UnicodeSet UAllowedInNumbers = new UnicodeSet("[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]")
        .addAll(SPECIAL_ALLOW)
        .freeze();

    /** {@link #UAllowedInExemplars} without uppercase characters, except that U+0130 is kept. */
    public static final UnicodeSet AllowedInExemplars = new UnicodeSet(UAllowedInExemplars)
        .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]"))
        .freeze();

    /** Punctuation and symbols, excluding currency symbols. */
    public static final UnicodeSet ALLOWED_IN_PUNCTUATION = new UnicodeSet("[[:P:][:S:]-[:Sc:]]")
        .freeze();

    /** {@link #AllowedInExemplars} plus punctuation, minus {@link #AlwaysOK}, plus SPECIAL_ALLOW. */
    public static final UnicodeSet ALLOWED_IN_AUX = new UnicodeSet(AllowedInExemplars)
        .addAll(ALLOWED_IN_PUNCTUATION)
        // as above: removing AlwaysOK drops some word_break characters, restored via SPECIAL_ALLOW
        .removeAll(AlwaysOK)
        .addAll(SPECIAL_ALLOW)
        .freeze();

    /**
     * The kinds of exemplar set, named after the values of the {@code type} attribute on
     * {@code exemplarCharacters} ({@code main} is used when the attribute is absent).
     */
    public enum ExemplarType {
        main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true),
        punctuation(ALLOWED_IN_PUNCTUATION, "punctuation", false),
        auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)", true),
        index(UAllowedInExemplars, "(specific-script - invisibles)", false),
        numbers(UAllowedInNumbers, "(specific-script - invisibles)", false),
        // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + U+0130)", false)
        ;

        /** Frozen set of characters permitted for this type. */
        public final UnicodeSet allowed;
        /** Frozen complement of {@link #allowed}. */
        public final UnicodeSet toRemove;
        /** Human-readable description of {@link #allowed}, used in error messages. */
        public final String message;
        public final boolean convertUppercase;

        /**
         * @param allowed frozen set of permitted characters
         * @param message description of the permitted characters for error messages
         * @param convertUppercase stored as-is in {@link #convertUppercase}
         * @throws IllegalArgumentException if {@code allowed} is not frozen
         */
        ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) {
            if (!allowed.isFrozen()) {
                throw new IllegalArgumentException("Internal Error");
            }
            this.allowed = allowed;
            this.message = message;
            this.toRemove = new UnicodeSet(allowed).complement().freeze();
            this.convertUppercase = convertUppercase;
        }
    }

    public CheckExemplars(Factory factory) {
        super(factory);
    }

    // Allowed[:script=common:][:script=inherited:][:alphabetic=false:]

    /**
     * Prepares the locale-dependent state (collators, root flag, pretty printer) for the file to check.
     *
     * @param cldrFileToCheck file to check; when null nothing is initialized and {@code this} is returned
     * @param options passed through to the superclass
     * @param possibleErrors receives a warning when the auxiliary-empty check is enabled and the
     *        resolved file has no auxiliary exemplar set
     * @return this
     */
    @Override
    public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
        List<CheckStatus> possibleErrors) {
        if (cldrFileToCheck == null) {
            return this;
        }
        super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
        String locale = cldrFileToCheck.getLocaleID();
        col = Collator.getInstance(new ULocale(locale));
        spaceCol = Collator.getInstance(new ULocale(locale));
        spaceCol.setStrength(Collator.PRIMARY);
        isRoot = cldrFileToCheck.getLocaleID().equals("root");
        prettyPrinter = createExemplarPrettyPrinter();

        // check for auxiliary anyway
        if (!SUPPRESS_AUX_EMPTY_CHECK) {
            UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary",
                CLDRFile.WinningChoice.WINNING);

            if (auxiliarySet == null) {
                possibleErrors.add(
                    new CheckStatus().setCause(this)
                        .setMainType(CheckStatus.warningType)
                        .setSubtype(Subtype.missingAuxiliaryExemplars)
                        .setMessage(AUXILIARY_REVIEW_MESSAGE));
            }
        }
        return this;
    }

    /**
     * Builds a range-compressing pretty printer ordered by {@link #col}, falling back to the root
     * collator when {@code col} has not been set yet.
     */
    private UnicodeSetPrettyPrinter createExemplarPrettyPrinter() {
        // Note the precedence: setStrength2(PRIMARY) applies only to the root-collator fallback.
        return new UnicodeSetPrettyPrinter()
            .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
            .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
                .setStrength2(Collator.PRIMARY))
            .setCompressRanges(true);
    }

    /**
     * Checks one path/value pair. Only {@code parseLenient} paths and paths containing
     * {@code /exemplarCharacters} are examined; everything else is ignored.
     *
     * @param path distinguishing path
     * @param fullPath full path; when null the item is skipped
     * @param value the value at {@code path}
     * @param options forwarded to the parseLenient check
     * @param result receives any errors or warnings found
     * @return this
     */
    public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
        List<CheckStatus> result) {
        if (fullPath == null) {
            return this; // skip paths that we don't have
        }
        if (path.indexOf("/exemplarCharacters") < 0) {
            if (path.contains("parseLenient")) {
                checkParse(path, fullPath, value, options, result);
            }
            return this;
        }
        XPathParts oparts = XPathParts.getFrozenInstance(path);
        final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type");
        ExemplarType type = exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString);
        checkExemplar(value, result, type);

        // Cross-set checks. These re-parse the value; a value that does not parse has already been
        // reported by checkExemplar above, so failures here are deliberately not reported again.
        try {
            UnicodeSet mainSet = getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
            if (type == ExemplarType.auxiliary) {
                checkAuxiliaryAgainstMain(value, mainSet, result);
            } else if (type == ExemplarType.punctuation) {
                checkPunctuationHasQuotes(value, result);
            } else if (type == ExemplarType.index) {
                checkIndexAgainstMainAndAuxiliary(value, mainSet, result);
            }
            checkExemplarOrientation(mainSet, result);
        } catch (Exception ignored) {
            // if these didn't parse, checkExemplar will be called anyway at some point
        }
        return this;
    }

    /**
     * Checks the auxiliary set together with main for mixed scripts, and reports an error when
     * auxiliary characters (other than Hangul syllables) also occur in main.
     */
    private void checkAuxiliaryAgainstMain(String value, UnicodeSet mainSet, List<CheckStatus> result) {
        UnicodeSet auxiliarySet = new UnicodeSet(value);

        UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet);
        checkMixedScripts("main+auxiliary", combined, result);

        if (!auxiliarySet.containsSome(mainSet)) {
            return;
        }
        UnicodeSet overlap = new UnicodeSet(mainSet).retainAll(auxiliarySet).removeAll(HangulSyllables);
        if (overlap.size() == 0) {
            return;
        }
        String formattedOverlap = createExemplarPrettyPrinter().format(overlap);
        result.add(new CheckStatus()
            .setCause(this)
            .setMainType(CheckStatus.errorType)
            .setSubtype(Subtype.auxiliaryExemplarsOverlap)
            .setMessage("Auxiliary characters also exist in main: \u200E{0}\u200E",
                new Object[] { formattedOverlap }));
    }

    /** Warns when the punctuation exemplars lack any of the locale's winning quotation marks. */
    private void checkPunctuationHasQuotes(String value, List<CheckStatus> result) {
        UnicodeSet punctuationSet = new UnicodeSet(value);
        UnicodeSet quoteSet = new UnicodeSet();
        for (String element : QUOTE_ELEMENTS) {
            String quote = getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element);
            // A missing delimiter value must not abort the remaining checks.
            if (quote != null) {
                quoteSet.add(quote);
            }
        }
        if (punctuationSet.containsAll(quoteSet)) {
            return;
        }
        quoteSet.removeAll(punctuationSet);
        // go ahead and list the characters separately, with space between, for clarity.
        StringBuilder characters = new StringBuilder();
        for (String item : quoteSet) {
            if (characters.length() != 0) {
                characters.append(" ");
            }
            characters.append(item);
        }
        CheckStatus message = new CheckStatus().setCause(this)
            .setMainType(CheckStatus.warningType)
            .setSubtype(Subtype.missingPunctuationCharacters)
            .setMessage("Punctuation exemplar characters are missing quotation marks for this locale: {0}",
                characters);
        result.add(message);
    }

    /**
     * Warns when the index exemplars contain characters outside the case-closed union of the main
     * and auxiliary exemplars.
     */
    private void checkIndexAgainstMainAndAuxiliary(String value, UnicodeSet mainSet, List<CheckStatus> result) {
        UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
        if (auxiliarySet == null) {
            auxiliarySet = new UnicodeSet();
        }
        UnicodeSet mainAndAuxAllCase = new UnicodeSet(mainSet).addAll(auxiliarySet).closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
        UnicodeSet indexBadChars = new UnicodeSet(value).removeAll(mainAndAuxAllCase);

        if (!indexBadChars.isEmpty()) {
            CheckStatus message = new CheckStatus().setCause(this)
                .setMainType(CheckStatus.warningType)
                .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars)
                .setMessage("Index exemplars include characters not in main or auxiliary exemplars: {0}",
                    indexBadChars.toPattern(false));
            result.add(message);
        }
    }

    /**
     * Reports an error (once) when the main exemplar set contains a right-to-left code point but the
     * locale's character order is not "right-to-left".
     */
    private void checkExemplarOrientation(UnicodeSet mainSet, List<CheckStatus> result) {
        String charOrientation = getResolvedCldrFileToCheck().getStringValue(
            "//ldml/layout/orientation/characterOrder");
        // Constant-first comparison: a missing orientation value counts as "not RTL" instead of throwing.
        boolean localeIsRTL = "right-to-left".equals(charOrientation);
        if (localeIsRTL) {
            return;
        }

        UnicodeSetIterator mi = new UnicodeSetIterator(mainSet);
        while (mi.next()) {
            if (mi.codepoint == UnicodeSetIterator.IS_STRING) {
                continue; // multi-character strings are not examined
            }
            int direction = UCharacter.getDirection(mi.codepoint);
            if (direction == UCharacterDirection.RIGHT_TO_LEFT
                || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
                result.add(new CheckStatus()
                    .setCause(this)
                    .setMainType(CheckStatus.errorType)
                    .setSubtype(Subtype.orientationDisagreesWithExemplars)
                    .setMessage(
                        "Main exemplar set contains RTL characters, but orientation of this locale is not RTL."));
                break;
            }
        }
    }

    /**
     * Checks a {@code parseLenient} value: it must parse as a {@link UnicodeSet} that contains the
     * path's {@code sample} attribute. Any exception is reported as a badParseLenient error.
     */
    private void checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result) {
        try {
            XPathParts oparts = XPathParts.getFrozenInstance(path);
            // only thing we do is make sure that the sample is in the value
            UnicodeSet us = new UnicodeSet(value);
            String sampleValue = oparts.getAttributeValue(-1, "sample");
            if (!us.contains(sampleValue)) {
                CheckStatus message = new CheckStatus().setCause(this)
                    .setMainType(CheckStatus.errorType)
                    .setSubtype(Subtype.badParseLenient)
                    .setMessage("ParseLenient sample not in value: {0} ∌ {1}", us, sampleValue);
                result.add(message);
            }
        } catch (Exception e) {
            CheckStatus message = new CheckStatus().setCause(this)
                .setMainType(CheckStatus.errorType)
                .setSubtype(Subtype.badParseLenient)
                .setMessage(e.getMessage());
            result.add(message);
        }
    }

    /** Script combinations that are accepted as-is by {@link #checkMixedScripts}. */
    static final BitSet Japn = new BitSet();
    static final BitSet Kore = new BitSet();
    static {
        Japn.set(UScript.HAN);
        Japn.set(UScript.HIRAGANA);
        Japn.set(UScript.KATAKANA);
        Kore.set(UScript.HAN);
        Kore.set(UScript.HANGUL);
    }

    /**
     * Reports an error when {@code set} contains characters from more than one script, ignoring
     * Common and Inherited. Exactly Han+Hiragana+Katakana and exactly Han+Hangul are accepted, as
     * are two scripts including Latin when {@code title} is "currencySymbol".
     *
     * @param title label for the set, used in the message
     * @param set the characters to examine; for strings only the first code point is considered
     * @param result receives the error, which lists each script with up to four sample characters
     */
    private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) {
        BitSet s = new BitSet();
        for (String item : set) {
            if (item.isEmpty()) {
                continue; // nothing to classify; avoids codePointAt(0) failing on an empty string
            }
            int script = UScript.getScript(item.codePointAt(0));
            if (script != UScript.COMMON && script != UScript.INHERITED) {
                s.set(script);
            }
        }
        final int cardinality = s.cardinality();
        if (cardinality < 2) {
            return;
        }
        if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) {
            return; // allow 2 scripts in exemplars for currencies.
        }
        // allowable combinations
        if (s.equals(Japn) || s.equals(Kore)) {
            return;
        }
        StringBuilder scripts = new StringBuilder();
        for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) {
            if (scripts.length() != 0) {
                scripts.append(", ");
            }
            scripts.append(UScript.getName(i));
            UnicodeSet inSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set);
            int count = 0;
            scripts.append(" (");
            for (String cp : inSet) {
                if (count != 0) {
                    scripts.append(",");
                }
                scripts.append(cp);
                count++;
                if (count > 3) {
                    scripts.append('\u2026');
                    break;
                }
            }
            scripts.append(")");
        }
        result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
            .setSubtype(Subtype.illegalExemplarSet)
            .setMessage("{0} exemplars contain multiple scripts: {1}", new Object[] { title, scripts }));
    }

    /**
     * Checks a single exemplar value: parseability, mixed scripts, round-tripping through the pretty
     * printer, disallowed single characters, and (outside root) emptiness.
     *
     * @param v the exemplar value; null is ignored
     * @param result receives any errors or warnings
     * @param exemplarType which exemplar set {@code v} belongs to
     */
    private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) {
        if (v == null) {
            return;
        }
        final UnicodeSet exemplar1;
        try {
            exemplar1 = new UnicodeSet(v).freeze();
        } catch (Exception e) {
            // {0} carries the parser's explanation to the user.
            result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
                .setSubtype(Subtype.illegalExemplarSet)
                .setMessage("This field must be a set of the form [a b c-d ...]: {0}", new Object[] { e.getMessage() }));
            return;
        }

        // check for mixed scripts
        checkMixedScripts(exemplarType.toString(), exemplar1, result);

        // check that the formatting is correct: the pretty-printed form must parse back to the same set
        String fixedExemplar1 = prettyPrinter.format(exemplar1);
        UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1);
        if (!doubleCheck.equals(exemplar1)) {
            result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
                .setSubtype(Subtype.internalUnicodeSetFormattingError)
                .setMessage("Internal Error: formatting not working for {0}", new Object[] { exemplar1 }));
        }
        // NOTE: a "better formatting would be ..." warning for v != fixedExemplar1 is intentionally not issued.

        // now check that only allowed characters are in the set
        if (!exemplarType.allowed.containsAll(exemplar1)) {
            UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed);

            // we do allow for punctuation & combining marks in strings,
            // so only single-code-point items are reported
            UnicodeSet remainder = new UnicodeSet();
            for (String s : remainder0) {
                if (Character.codePointCount(s, 0, s.length()) == 1) {
                    remainder.add(s);
                }
            }

            if (remainder.size() != 0) {
                result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
                    .setSubtype(Subtype.illegalCharactersInExemplars)
                    .setMessage("Should be limited to " + exemplarType.message + "; thus not contain: \u200E{0}\u200E",
                        new Object[] { remainder }));
            }
        }

        // now check for empty
        if (!isRoot && exemplar1.size() == 0) {
            switch (exemplarType) {
            // case currencySymbol: // ok if empty
            // break;
            case auxiliary:
                result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.warningType)
                    .setSubtype(Subtype.missingAuxiliaryExemplars)
                    .setMessage(AUXILIARY_REVIEW_MESSAGE));
                break;
            case index:
            case punctuation:
            case main:
                result.add(new CheckStatus()
                    .setCause(this)
                    .setMainType(CheckStatus.errorType)
                    .setSubtype(Subtype.missingMainExemplars)
                    .setMessage(
                        "Exemplar set (" + exemplarType
                            + ") must not be empty -- that would imply that this language uses no " +
                            (exemplarType == ExemplarType.punctuation ? "punctuation" : "letters") + "!"));
                break;
            default:
                // other types (numbers) may be empty
                break;
            }
        }
    }
}