1 package org.unicode.cldr.test;
2 
3 import java.util.BitSet;
4 import java.util.List;
5 
6 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
7 import org.unicode.cldr.util.CLDRConfig;
8 import org.unicode.cldr.util.CLDRFile;
9 import org.unicode.cldr.util.Factory;
10 import org.unicode.cldr.util.SupplementalDataInfo;
11 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
12 import org.unicode.cldr.util.XPathParts;
13 
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.lang.UCharacterDirection;
16 import com.ibm.icu.lang.UProperty;
17 import com.ibm.icu.lang.UScript;
18 import com.ibm.icu.text.Collator;
19 import com.ibm.icu.text.UnicodeSet;
20 import com.ibm.icu.text.UnicodeSetIterator;
21 import com.ibm.icu.util.ULocale;
22 
23 public class CheckExemplars extends FactoryCheckCLDR {
24 
    /** Selects which pattern the static initializer uses to build {@link #AlwaysOK}. */
    public static final boolean USE_PUNCTUATION = false;
    /** When true, setCldrFileToCheck does not warn about a missing auxiliary exemplar set. */
    private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true;
    /** Element names appended to "//ldml/delimiters/" when collecting the locale's quotation marks. */
    private static final String[] QUOTE_ELEMENTS = {
        "quotationStart", "quotationEnd",
        "alternateQuotationStart", "alternateQuotationEnd" };
    /** Supplemental data obtained once from the shared CLDRConfig instance. */
    static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo();

    // The following four fields are (re)assigned by setCldrFileToCheck for the file being checked.
    /** Collator for the locale of the file being checked. */
    Collator col;
    /** Primary-strength collator for the same locale. NOTE(review): assigned but not read in this class. */
    Collator spaceCol;
    /** True when the locale id of the file being checked is "root". */
    boolean isRoot;
    /** Formatter used by checkExemplar to render exemplar sets. */
    UnicodeSetPrettyPrinter prettyPrinter;

    /** Hangul syllables of type LV or LVT; removed from the main/auxiliary overlap before it is reported. */
    static final UnicodeSet HangulSyllables = new UnicodeSet(
        "[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze();
39 
    /**
     * Frozen set that is subtracted from the "allowed in exemplars" sets declared below.
     * Its contents depend on {@link #USE_PUNCTUATION}: when that flag is true the set is only
     * U+0020 and U+00A0; otherwise it is built from the longer pattern in the else branch.
     */
    public static final UnicodeSet AlwaysOK;
    static {
        if (USE_PUNCTUATION) {
            AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]");
        } else {
            // Decimal digits plus Common/Inherited-script characters (minus default-ignorables,
            // category C and the underscore), followed by a hand-maintained list of extra characters.
            AlwaysOK = new UnicodeSet(
                "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" +
                    "[[؉][་ །༌][ཱ]‎‎{য়}য়]" + // TODO Fix this Hack
                    "]"); // [\\u200c-\\u200f] [:script=common:][:script=inherited:]
        }
        // Freeze so the constant can be shared safely and used to build the other sets.
        AlwaysOK.freeze();
    }
52     // TODO Fix some of these characters
    /**
     * Frozen set of characters that are added back into the allowed sets after {@link #AlwaysOK}
     * has been removed from them. The pattern lists directional and joiner controls, several
     * explicit ranges, decimal digits, and the Katakana / ALetter / MidLetter word-break classes.
     */
    private static final UnicodeSet SPECIAL_ALLOW = new UnicodeSet(
        "[\u061C\\u200E\\u200F\\u200c\\u200d"
            +
            "‎‎‎[\u064B\u064E-\u0651\u0670]‎[:Nd:]‎[\u0951\u0952]‎[\u064B-\u0652\u0654-\u0657\u0670]‎[\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652]‎[\\u02BB\\u02BC][\u0CE6-\u0CEF]‎‎[\u0966-\u096F]"
            +
            "‎‎‎[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore the word-break classes
    // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
    ).freeze(); // includes RLM, LRM, ZWNJ and ZWJ
61 
    /**
     * Characters permitted in exemplar sets when uppercase is allowed: every assigned character
     * except separators (category Z), minus {@link #AlwaysOK}, plus SPECIAL_ALLOW.
     */
    public static final UnicodeSet UAllowedInExemplars = new UnicodeSet("[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
        .removeAll(AlwaysOK) // this will remove some
        // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them
        // in SPECIAL_ALLOW
        .addAll(SPECIAL_ALLOW) // add RLM, LRM, ZWNJ, ZWJ and the other restored characters
        .freeze();

    /**
     * Characters permitted in the numbers exemplar set: U+00A0, U+202F, numbers, punctuation,
     * math symbols and numeric characters, plus SPECIAL_ALLOW.
     */
    public static final UnicodeSet UAllowedInNumbers = new UnicodeSet("[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
        .addAll(SPECIAL_ALLOW) // add RLM, LRM, ZWNJ, ZWJ and the other restored characters
        .freeze();

    /** {@link #UAllowedInExemplars} with uppercase characters removed, except U+0130 which is kept. */
    public static final UnicodeSet AllowedInExemplars = new UnicodeSet(UAllowedInExemplars)
        .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]"))
        .freeze();

    /** Punctuation and symbols, excluding currency symbols (category Sc). */
    public static final UnicodeSet ALLOWED_IN_PUNCTUATION = new UnicodeSet("[[:P:][:S:]-[:Sc:]]")
        .freeze();

    /**
     * Characters permitted in the auxiliary exemplar set: {@link #AllowedInExemplars} plus
     * {@link #ALLOWED_IN_PUNCTUATION}, minus {@link #AlwaysOK}, plus SPECIAL_ALLOW.
     */
    public static final UnicodeSet ALLOWED_IN_AUX = new UnicodeSet(AllowedInExemplars)
        .addAll(ALLOWED_IN_PUNCTUATION)
        .removeAll(AlwaysOK) // this will remove some
        // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them
        // in SPECIAL_ALLOW
        .addAll(SPECIAL_ALLOW) // add RLM, LRM, ZWNJ, ZWJ and the other restored characters
        .freeze();
87 
88     public enum ExemplarType {
89         main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true), punctuation(ALLOWED_IN_PUNCTUATION, "punctuation",
90             false), auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)",
91                 true), index(UAllowedInExemplars, "(specific-script - invisibles)", false), numbers(UAllowedInNumbers, "(specific-script - invisibles)", false),
92         // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", false)
93         ;
94 
95         public final UnicodeSet allowed;
96         public final UnicodeSet toRemove;
97         public final String message;
98         public final boolean convertUppercase;
99 
ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase)100         ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) {
101             if (!allowed.isFrozen()) {
102                 throw new IllegalArgumentException("Internal Error");
103             }
104             this.allowed = allowed;
105             this.message = message;
106             this.toRemove = new UnicodeSet(allowed).complement().freeze();
107             this.convertUppercase = convertUppercase;
108         }
109     }
110 
    /**
     * Creates the check, passing {@code factory} straight through to the superclass constructor.
     *
     * @param factory the factory handed to {@link FactoryCheckCLDR}
     */
    public CheckExemplars(Factory factory) {
        super(factory);
    }
114 
115     // Allowed[:script=common:][:script=inherited:][:alphabetic=false:]
116 
117     @Override
setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)118     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
119         List<CheckStatus> possibleErrors) {
120         if (cldrFileToCheck == null) return this;
121         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
122         String locale = cldrFileToCheck.getLocaleID();
123         col = Collator.getInstance(new ULocale(locale));
124         spaceCol = Collator.getInstance(new ULocale(locale));
125         spaceCol.setStrength(Collator.PRIMARY);
126         isRoot = cldrFileToCheck.getLocaleID().equals("root");
127         prettyPrinter = new UnicodeSetPrettyPrinter()
128             .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
129             .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
130                 .setStrength2(Collator.PRIMARY))
131             .setCompressRanges(true);
132 
133         // check for auxiliary anyway
134         if (!SUPPRESS_AUX_EMPTY_CHECK) {
135             UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary",
136                 CLDRFile.WinningChoice.WINNING);
137 
138             if (auxiliarySet == null) {
139                 possibleErrors.add(
140                     new CheckStatus().setCause(this)
141                         .setMainType(CheckStatus.warningType)
142                         .setSubtype(Subtype.missingAuxiliaryExemplars)
143                         .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this."));
144             }
145         }
146         return this;
147     }
148 
149     @Override
handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)150     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
151         List<CheckStatus> result) {
152         if (fullPath == null) return this; // skip paths that we don't have
153         if (path.indexOf("/exemplarCharacters") < 0) {
154             if (path.contains("parseLenient")) {
155                 checkParse(path, fullPath, value, options, result);
156             }
157             return this;
158         }
159         XPathParts oparts = XPathParts.getFrozenInstance(path);
160         final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type");
161         ExemplarType type = exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString);
162         checkExemplar(value, result, type);
163 
164         // check relation to auxiliary set
165         try {
166             UnicodeSet mainSet = getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
167             if (type == ExemplarType.auxiliary) {
168                 UnicodeSet auxiliarySet = new UnicodeSet(value);
169 
170                 UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet);
171                 checkMixedScripts("main+auxiliary", combined, result);
172 
173                 if (auxiliarySet.containsSome(mainSet)) {
174                     UnicodeSet overlap = new UnicodeSet(mainSet).retainAll(auxiliarySet).removeAll(HangulSyllables);
175                     if (overlap.size() != 0) {
176                         String fixedExemplar1 = new UnicodeSetPrettyPrinter()
177                             .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
178                             .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
179                                 .setStrength2(Collator.PRIMARY))
180                             .setCompressRanges(true)
181                             .format(overlap);
182                         result
183                             .add(new CheckStatus()
184                                 .setCause(this)
185                                 .setMainType(CheckStatus.errorType)
186                                 .setSubtype(Subtype.auxiliaryExemplarsOverlap)
187                                 .setMessage("Auxiliary characters also exist in main: \u200E{0}\u200E",
188                                     new Object[] { fixedExemplar1 }));
189                     }
190                 }
191             } else if (type == ExemplarType.punctuation) {
192                 // Check that the punctuation exemplar characters include quotation marks.
193                 UnicodeSet punctuationSet = new UnicodeSet(value);
194                 UnicodeSet quoteSet = new UnicodeSet();
195                 for (String element : QUOTE_ELEMENTS) {
196                     quoteSet.add(getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element));
197                 }
198                 if (!punctuationSet.containsAll(quoteSet)) {
199                     quoteSet.removeAll(punctuationSet);
200                     // go ahead and list the characters separately, with space between, for clarity.
201                     StringBuilder characters = new StringBuilder();
202                     for (String item : quoteSet) {
203                         if (characters.length() != 0) {
204                             characters.append(" ");
205                         }
206                         characters.append(item);
207                     }
208                     // String characters = quoteSet.toPattern(false);
209                     CheckStatus message = new CheckStatus().setCause(this)
210                         .setMainType(CheckStatus.warningType)
211                         .setSubtype(Subtype.missingPunctuationCharacters)
212                         .setMessage("Punctuation exemplar characters are missing quotation marks for this locale: {0}",
213                             characters);
214                     result.add(message);
215                 }
216             } else if (type == ExemplarType.index) {
217                 // Check that the index exemplar characters are in case-completed union of main and auxiliary exemplars
218                 UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
219                 if (auxiliarySet == null) {
220                     auxiliarySet = new UnicodeSet();
221                 }
222                 UnicodeSet mainAndAuxAllCase = new UnicodeSet(mainSet).addAll(auxiliarySet).closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
223                 UnicodeSet indexBadChars = new UnicodeSet(value).removeAll(mainAndAuxAllCase);
224 
225                 if (!indexBadChars.isEmpty()) {
226                     CheckStatus message = new CheckStatus().setCause(this)
227                         .setMainType(CheckStatus.warningType)
228                         .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars)
229                         .setMessage("Index exemplars include characters not in main or auxiliary exemplars: {0}",
230                             indexBadChars.toPattern(false));
231                     result.add(message);
232                 }
233             }
234 
235             // check for consistency with RTL
236 
237             Boolean localeIsRTL = false;
238             String charOrientation = getResolvedCldrFileToCheck().getStringValue(
239                 "//ldml/layout/orientation/characterOrder");
240             if (charOrientation.equals("right-to-left")) {
241                 localeIsRTL = true;
242             }
243 
244             UnicodeSetIterator mi = new UnicodeSetIterator(mainSet);
245             while (mi.next()) {
246                 if (mi.codepoint != UnicodeSetIterator.IS_STRING &&
247                     (UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT ||
248                         UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
249                     &&
250                     !localeIsRTL) {
251                     result.add(new CheckStatus()
252                         .setCause(this)
253                         .setMainType(CheckStatus.errorType)
254                         .setSubtype(Subtype.orientationDisagreesWithExemplars)
255                         .setMessage(
256                             "Main exemplar set contains RTL characters, but orientation of this locale is not RTL."));
257                     break;
258                 }
259             }
260 
261         } catch (Exception e) {
262         } // if these didn't parse, checkExemplar will be called anyway at some point
263         return this;
264     }
265 
checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result)266     private void checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result) {
267         try {
268             XPathParts oparts = XPathParts.getFrozenInstance(path);
269             // only thing we do is make sure that the sample is in the value
270             UnicodeSet us = new UnicodeSet(value);
271             String sampleValue = oparts.getAttributeValue(-1, "sample");
272             if (!us.contains(sampleValue)) {
273                 CheckStatus message = new CheckStatus().setCause(this)
274                     .setMainType(CheckStatus.errorType)
275                     .setSubtype(Subtype.badParseLenient)
276                     .setMessage("ParseLenient sample not in value: {0} ∌ {1}", us, sampleValue);
277                 result.add(message);
278             }
279         } catch (IllegalArgumentException e) {
280             /*
281              * new UnicodeSet(value) throws IllegalArgumentException if, for example, value is null or value = "?".
282              * This can happen during cldr-unittest TestAll.
283              * path = //ldml/characters/parseLenients[@scope="general"][@level="lenient"]/parseLenient[@sample="’"]
284              * or
285              * path = //ldml/characters/parseLenients[@scope="date"][@level="lenient"]/parseLenient[@sample="-"]
286              */
287             CheckStatus message = new CheckStatus().setCause(this)
288                 .setMainType(CheckStatus.errorType)
289                 .setSubtype(Subtype.badParseLenient)
290                 .setMessage(e.toString() + (e.getMessage() == null ? "" : ": " + e.getMessage()));
291             result.add(message);
292         }
293     }
294 
295     static final BitSet Japn = new BitSet();
296     static final BitSet Kore = new BitSet();
297     static {
298         Japn.set(UScript.HAN);
299         Japn.set(UScript.HIRAGANA);
300         Japn.set(UScript.KATAKANA);
301         Kore.set(UScript.HAN);
302         Kore.set(UScript.HANGUL);
303     }
304 
checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result)305     private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) {
306         BitSet s = new BitSet();
307         for (String item : set) {
308             int script = UScript.getScript(item.codePointAt(0));
309             if (script != UScript.COMMON && script != UScript.INHERITED) {
310                 s.set(script);
311             }
312         }
313         final int cardinality = s.cardinality();
314         if (cardinality < 2) {
315             return;
316         }
317         if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) {
318             return; // allow 2 scripts in exemplars for currencies.
319         }
320         // allowable combinations
321         if (s.equals(Japn) || s.equals(Kore)) {
322             return;
323         }
324         StringBuilder scripts = new StringBuilder();
325         for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) {
326             if (scripts.length() != 0) {
327                 scripts.append(", ");
328             }
329             scripts.append(UScript.getName(i));
330             UnicodeSet inSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set);
331             int count = 0;
332             scripts.append(" (");
333             for (String cp : inSet) {
334                 if (count != 0) {
335                     scripts.append(",");
336                 }
337                 scripts.append(cp);
338                 count++;
339                 if (count > 3) {
340                     scripts.append('\u2026');
341                     break;
342                 }
343             }
344             scripts.append(")");
345         }
346         result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
347             .setSubtype(Subtype.illegalExemplarSet)
348             .setMessage("{0} exemplars contain multiple scripts: {1}", new Object[] { title, scripts }));
349         return;
350     }
351 
checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType)352     private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) {
353         if (v == null) return;
354         final UnicodeSet exemplar1;
355         try {
356             exemplar1 = new UnicodeSet(v).freeze();
357         } catch (Exception e) {
358             result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
359                 .setSubtype(Subtype.illegalExemplarSet)
360                 .setMessage("This field must be a set of the form [a b c-d ...]: {0}", new Object[] { e.getMessage() }));
361             return;
362         }
363 
364         // check for mixed scripts
365 
366         checkMixedScripts(exemplarType.toString(), exemplar1, result);
367 
368         // check that the formatting is correct
369 
370         String fixedExemplar1 = prettyPrinter.format(exemplar1);
371         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1);
372         if (!doubleCheck.equals(exemplar1)) {
373             result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
374                 .setSubtype(Subtype.internalUnicodeSetFormattingError)
375                 .setMessage("Internal Error: formatting not working for {0}", new Object[] { exemplar1 }));
376         }
377         // else if (!v.equals(fixedExemplar1)) {
378         // result.add(new CheckStatus().setCause(this).setType(CheckStatus.warningType)
379         // .setMessage("Better formatting would be \u200E{0}\u200E", new Object[]{fixedExemplar1}));
380         // }
381 
382         // now check that only allowed characters are in the set
383 
384         if (!exemplarType.allowed.containsAll(exemplar1)) {
385             UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed);
386 
387             // we do allow for punctuation & combining marks in strings
388             UnicodeSet remainder = new UnicodeSet();
389             for (String s : remainder0) {
390                 if (Character.codePointCount(s, 0, s.length()) == 1) {
391                     remainder.add(s);
392                 } else {
393                     // just check normalization
394                 }
395             }
396 
397             // after a first check, we check again in case we flattened
398 
399             if (remainder.size() != 0) {
400                 fixedExemplar1 = prettyPrinter.format(exemplar1);
401                 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
402                     .setSubtype(Subtype.illegalCharactersInExemplars)
403                     .setMessage("Should be limited to " + exemplarType.message + "; thus not contain: \u200E{0}\u200E",
404                         new Object[] { remainder }));
405             }
406         }
407 
408         // now check for empty
409 
410         if (!isRoot && exemplar1.size() == 0) {
411             switch (exemplarType) {
412 //            case currencySymbol: // ok if empty
413 //                break;
414             case auxiliary:
415                 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.warningType)
416                     .setSubtype(Subtype.missingAuxiliaryExemplars)
417                     .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this."));
418                 break;
419             case index:
420             case punctuation:
421             case main:
422                 result.add(new CheckStatus()
423                     .setCause(this)
424                     .setMainType(CheckStatus.errorType)
425                     .setSubtype(Subtype.missingMainExemplars)
426                     .setMessage(
427                         "Exemplar set (" + exemplarType
428                             + ") must not be empty -- that would imply that this language uses no " +
429                             (exemplarType == ExemplarType.punctuation ? "punctuation" : "letters") + "!"));
430                 break;
431             }
432         }
433     }
434 }
435