1 package org.unicode.cldr.test;
2 
3 import java.util.BitSet;
4 import java.util.List;
5 
6 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
7 import org.unicode.cldr.util.CLDRConfig;
8 import org.unicode.cldr.util.CLDRFile;
9 import org.unicode.cldr.util.Factory;
10 import org.unicode.cldr.util.SupplementalDataInfo;
11 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
12 import org.unicode.cldr.util.XPathParts;
13 
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.lang.UCharacterDirection;
16 import com.ibm.icu.lang.UProperty;
17 import com.ibm.icu.lang.UScript;
18 import com.ibm.icu.text.Collator;
19 import com.ibm.icu.text.UnicodeSet;
20 import com.ibm.icu.text.UnicodeSetIterator;
21 import com.ibm.icu.util.ULocale;
22 
23 public class CheckExemplars extends FactoryCheckCLDR {
24 
25     public static final boolean USE_PUNCTUATION = false;
26     private static final boolean SUPPRESS_AUX_EMPTY_CHECK = true;
27     private static final String[] QUOTE_ELEMENTS = {
28         "quotationStart", "quotationEnd",
29         "alternateQuotationStart", "alternateQuotationEnd" };
30     static final SupplementalDataInfo SUP = CLDRConfig.getInstance().getSupplementalDataInfo();
31 
32     Collator col;
33     Collator spaceCol;
34     boolean isRoot;
35     UnicodeSetPrettyPrinter prettyPrinter;
36 
37     static final UnicodeSet HangulSyllables = new UnicodeSet(
38         "[[:Hangul_Syllable_Type=LVT:][:Hangul_Syllable_Type=LV:]]").freeze();
39 
40     public static final UnicodeSet AlwaysOK;
41     static {
42         if (USE_PUNCTUATION) {
43             AlwaysOK = new UnicodeSet("[\\u0020\\u00A0]");
44         } else {
45             AlwaysOK = new UnicodeSet(
46                 "[[[:Nd:][:script=common:][:script=inherited:]-[:Default_Ignorable_Code_Point:]-[:C:] - [_]] [\u05BE \u05F3 \u066A-\u066C]" +
47                     "[[؉][་ །༌][ཱ]‎‎{য়}য়]" + // TODO Fix this Hack
48                     "]"); // [\\u200c-\\u200f] [:script=common:][:script=inherited:]
49         }
AlwaysOK.freeze()50         AlwaysOK.freeze();
51     }
52     // TODO Fix some of these characters
53     private static final UnicodeSet SPECIAL_ALLOW = new UnicodeSet(
54         "[\u061C\\u200E\\u200F\\u200c\\u200d"
55             +
56             "‎‎‎[\u064B\u064E-\u0651\u0670]‎[:Nd:]‎[\u0951\u0952]‎[\u064B-\u0652\u0654-\u0657\u0670]‎[\u0A66-\u0A6F][\u0ED0-\u0ED9][\u064B-\u0652]‎[\\u02BB\\u02BC][\u0CE6-\u0CEF]‎‎[\u0966-\u096F]"
57             +
58             "‎‎‎[:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] ]" // restore
59     // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
60     ).freeze(); // add RLM, LRM [\u200C\u200D]‎
61 
62     public static final UnicodeSet UAllowedInExemplars = new UnicodeSet("[[:assigned:]-[:Z:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
63         .removeAll(AlwaysOK) // this will remove some
64         // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them
65         // in SPECIAL_ALLOW
66         .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎
67         .freeze();
68 
69     public static final UnicodeSet UAllowedInNumbers = new UnicodeSet("[\u00A0\u202F[:N:][:P:][:Sm:][:Letter_Number:][:Numeric_Type=Numeric:]]") // [:alphabetic:][:Mn:][:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:]
70         .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎
71         .freeze();
72 
73     public static final UnicodeSet AllowedInExemplars = new UnicodeSet(UAllowedInExemplars)
74         .removeAll(new UnicodeSet("[[:Uppercase:]-[\u0130]]"))
75         .freeze();
76 
77     public static final UnicodeSet ALLOWED_IN_PUNCTUATION = new UnicodeSet("[[:P:][:S:]-[:Sc:]]")
78         .freeze();
79 
80     public static final UnicodeSet ALLOWED_IN_AUX = new UnicodeSet(AllowedInExemplars)
81         .addAll(ALLOWED_IN_PUNCTUATION)
82         .removeAll(AlwaysOK) // this will remove some
83         // [:word_break=Katakana:][:word_break=ALetter:][:word_break=MidLetter:] so we restore them
84         // in SPECIAL_ALLOW
85         .addAll(SPECIAL_ALLOW) // add RLM, LRM [\u200C\u200D]‎
86         .freeze();
87 
88     public enum ExemplarType {
89         main(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", true), punctuation(ALLOWED_IN_PUNCTUATION, "punctuation",
90             false), auxiliary(ALLOWED_IN_AUX, "(specific-script - uppercase - invisibles + \u0130)",
91                 true), index(UAllowedInExemplars, "(specific-script - invisibles)", false), numbers(UAllowedInNumbers, "(specific-script - invisibles)", false),
92         // currencySymbol(AllowedInExemplars, "(specific-script - uppercase - invisibles + \u0130)", false)
93         ;
94 
95         public final UnicodeSet allowed;
96         public final UnicodeSet toRemove;
97         public final String message;
98         public final boolean convertUppercase;
99 
ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase)100         ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) {
101             if (!allowed.isFrozen()) {
102                 throw new IllegalArgumentException("Internal Error");
103             }
104             this.allowed = allowed;
105             this.message = message;
106             this.toRemove = new UnicodeSet(allowed).complement().freeze();
107             this.convertUppercase = convertUppercase;
108         }
109     }
110 
CheckExemplars(Factory factory)111     public CheckExemplars(Factory factory) {
112         super(factory);
113     }
114 
115     // Allowed[:script=common:][:script=inherited:][:alphabetic=false:]
116 
117     @Override
setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)118     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
119         List<CheckStatus> possibleErrors) {
120         if (cldrFileToCheck == null) return this;
121         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
122         String locale = cldrFileToCheck.getLocaleID();
123         col = Collator.getInstance(new ULocale(locale));
124         spaceCol = Collator.getInstance(new ULocale(locale));
125         spaceCol.setStrength(Collator.PRIMARY);
126         isRoot = cldrFileToCheck.getLocaleID().equals("root");
127         prettyPrinter = new UnicodeSetPrettyPrinter()
128             .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
129             .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
130                 .setStrength2(Collator.PRIMARY))
131             .setCompressRanges(true);
132 
133         // check for auxiliary anyway
134         if (!SUPPRESS_AUX_EMPTY_CHECK) {
135             UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary",
136                 CLDRFile.WinningChoice.WINNING);
137 
138             if (auxiliarySet == null) {
139                 possibleErrors.add(
140                     new CheckStatus().setCause(this)
141                         .setMainType(CheckStatus.warningType)
142                         .setSubtype(Subtype.missingAuxiliaryExemplars)
143                         .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this."));
144             }
145         }
146         return this;
147     }
148 
handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)149     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
150         List<CheckStatus> result) {
151         if (fullPath == null) return this; // skip paths that we don't have
152         if (path.indexOf("/exemplarCharacters") < 0) {
153             if (path.contains("parseLenient")) {
154                 checkParse(path, fullPath, value, options, result);
155             }
156             return this;
157         }
158         XPathParts oparts = XPathParts.getFrozenInstance(path);
159         final String exemplarString = oparts.findAttributeValue("exemplarCharacters", "type");
160         ExemplarType type = exemplarString == null ? ExemplarType.main : ExemplarType.valueOf(exemplarString);
161         checkExemplar(value, result, type);
162 
163         // check relation to auxiliary set
164         try {
165             UnicodeSet mainSet = getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
166             if (type == ExemplarType.auxiliary) {
167                 UnicodeSet auxiliarySet = new UnicodeSet(value);
168 
169                 UnicodeSet combined = new UnicodeSet(mainSet).addAll(auxiliarySet);
170                 checkMixedScripts("main+auxiliary", combined, result);
171 
172                 if (auxiliarySet.containsSome(mainSet)) {
173                     UnicodeSet overlap = new UnicodeSet(mainSet).retainAll(auxiliarySet).removeAll(HangulSyllables);
174                     if (overlap.size() != 0) {
175                         String fixedExemplar1 = new UnicodeSetPrettyPrinter()
176                             .setOrdering(col != null ? col : Collator.getInstance(ULocale.ROOT))
177                             .setSpaceComparator(col != null ? col : Collator.getInstance(ULocale.ROOT)
178                                 .setStrength2(Collator.PRIMARY))
179                             .setCompressRanges(true)
180                             .format(overlap);
181                         result
182                             .add(new CheckStatus()
183                                 .setCause(this)
184                                 .setMainType(CheckStatus.errorType)
185                                 .setSubtype(Subtype.auxiliaryExemplarsOverlap)
186                                 .setMessage("Auxiliary characters also exist in main: \u200E{0}\u200E",
187                                     new Object[] { fixedExemplar1 }));
188                     }
189                 }
190             } else if (type == ExemplarType.punctuation) {
191                 // Check that the punctuation exemplar characters include quotation marks.
192                 UnicodeSet punctuationSet = new UnicodeSet(value);
193                 UnicodeSet quoteSet = new UnicodeSet();
194                 for (String element : QUOTE_ELEMENTS) {
195                     quoteSet.add(getResolvedCldrFileToCheck().getWinningValue("//ldml/delimiters/" + element));
196                 }
197                 if (!punctuationSet.containsAll(quoteSet)) {
198                     quoteSet.removeAll(punctuationSet);
199                     // go ahead and list the characters separately, with space between, for clarity.
200                     StringBuilder characters = new StringBuilder();
201                     for (String item : quoteSet) {
202                         if (characters.length() != 0) {
203                             characters.append(" ");
204                         }
205                         characters.append(item);
206                     }
207                     // String characters = quoteSet.toPattern(false);
208                     CheckStatus message = new CheckStatus().setCause(this)
209                         .setMainType(CheckStatus.warningType)
210                         .setSubtype(Subtype.missingPunctuationCharacters)
211                         .setMessage("Punctuation exemplar characters are missing quotation marks for this locale: {0}",
212                             characters);
213                     result.add(message);
214                 }
215             } else if (type == ExemplarType.index) {
216                 // Check that the index exemplar characters are in case-completed union of main and auxiliary exemplars
217                 UnicodeSet auxiliarySet = getResolvedCldrFileToCheck().getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
218                 if (auxiliarySet == null) {
219                     auxiliarySet = new UnicodeSet();
220                 }
221                 UnicodeSet mainAndAuxAllCase = new UnicodeSet(mainSet).addAll(auxiliarySet).closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
222                 UnicodeSet indexBadChars = new UnicodeSet(value).removeAll(mainAndAuxAllCase);
223 
224                 if (!indexBadChars.isEmpty()) {
225                     CheckStatus message = new CheckStatus().setCause(this)
226                         .setMainType(CheckStatus.warningType)
227                         .setSubtype(Subtype.charactersNotInMainOrAuxiliaryExemplars)
228                         .setMessage("Index exemplars include characters not in main or auxiliary exemplars: {0}",
229                             indexBadChars.toPattern(false));
230                     result.add(message);
231                 }
232             }
233 
234             // check for consistency with RTL
235 
236             Boolean localeIsRTL = false;
237             String charOrientation = getResolvedCldrFileToCheck().getStringValue(
238                 "//ldml/layout/orientation/characterOrder");
239             if (charOrientation.equals("right-to-left")) {
240                 localeIsRTL = true;
241             }
242 
243             UnicodeSetIterator mi = new UnicodeSetIterator(mainSet);
244             while (mi.next()) {
245                 if (mi.codepoint != UnicodeSetIterator.IS_STRING &&
246                     (UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT ||
247                         UCharacter.getDirection(mi.codepoint) == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
248                     &&
249                     !localeIsRTL) {
250                     result.add(new CheckStatus()
251                         .setCause(this)
252                         .setMainType(CheckStatus.errorType)
253                         .setSubtype(Subtype.orientationDisagreesWithExemplars)
254                         .setMessage(
255                             "Main exemplar set contains RTL characters, but orientation of this locale is not RTL."));
256                     break;
257                 }
258             }
259 
260         } catch (Exception e) {
261         } // if these didn't parse, checkExemplar will be called anyway at some point
262         return this;
263     }
264 
checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result)265     private void checkParse(String path, String fullPath, String value, Options options, List<CheckStatus> result) {
266         try {
267             XPathParts oparts = XPathParts.getFrozenInstance(path);
268             // only thing we do is make sure that the sample is in the value
269             UnicodeSet us = new UnicodeSet(value);
270             String sampleValue = oparts.getAttributeValue(-1, "sample");
271             if (!us.contains(sampleValue)) {
272                 CheckStatus message = new CheckStatus().setCause(this)
273                     .setMainType(CheckStatus.errorType)
274                     .setSubtype(Subtype.badParseLenient)
275                     .setMessage("ParseLenient sample not in value: {0} ∌ {1}", us, sampleValue);
276                 result.add(message);
277             }
278         } catch (Exception e) {
279             CheckStatus message = new CheckStatus().setCause(this)
280                 .setMainType(CheckStatus.errorType)
281                 .setSubtype(Subtype.badParseLenient)
282                 .setMessage(e.getMessage());
283             result.add(message);
284         }
285     }
286 
287     static final BitSet Japn = new BitSet();
288     static final BitSet Kore = new BitSet();
289     static {
290         Japn.set(UScript.HAN);
291         Japn.set(UScript.HIRAGANA);
292         Japn.set(UScript.KATAKANA);
293         Kore.set(UScript.HAN);
294         Kore.set(UScript.HANGUL);
295     }
296 
checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result)297     private void checkMixedScripts(String title, UnicodeSet set, List<CheckStatus> result) {
298         BitSet s = new BitSet();
299         for (String item : set) {
300             int script = UScript.getScript(item.codePointAt(0));
301             if (script != UScript.COMMON && script != UScript.INHERITED) {
302                 s.set(script);
303             }
304         }
305         final int cardinality = s.cardinality();
306         if (cardinality < 2) {
307             return;
308         }
309         if (cardinality == 2 && title.equals("currencySymbol") && s.get(UScript.LATIN)) {
310             return; // allow 2 scripts in exemplars for currencies.
311         }
312         // allowable combinations
313         if (s.equals(Japn) || s.equals(Kore)) {
314             return;
315         }
316         StringBuilder scripts = new StringBuilder();
317         for (int i = s.nextSetBit(0); i >= 0; i = s.nextSetBit(i + 1)) {
318             if (scripts.length() != 0) {
319                 scripts.append(", ");
320             }
321             scripts.append(UScript.getName(i));
322             UnicodeSet inSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, i).retainAll(set);
323             int count = 0;
324             scripts.append(" (");
325             for (String cp : inSet) {
326                 if (count != 0) {
327                     scripts.append(",");
328                 }
329                 scripts.append(cp);
330                 count++;
331                 if (count > 3) {
332                     scripts.append('\u2026');
333                     break;
334                 }
335             }
336             scripts.append(")");
337         }
338         result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
339             .setSubtype(Subtype.illegalExemplarSet)
340             .setMessage("{0} exemplars contain multiple scripts: {1}", new Object[] { title, scripts }));
341         return;
342     }
343 
checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType)344     private void checkExemplar(String v, List<CheckStatus> result, ExemplarType exemplarType) {
345         if (v == null) return;
346         final UnicodeSet exemplar1;
347         try {
348             exemplar1 = new UnicodeSet(v).freeze();
349         } catch (Exception e) {
350             result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
351                 .setSubtype(Subtype.illegalExemplarSet)
352                 .setMessage("This field must be a set of the form [a b c-d ...]: ", new Object[] { e.getMessage() }));
353             return;
354         }
355 
356         // check for mixed scripts
357 
358         checkMixedScripts(exemplarType.toString(), exemplar1, result);
359 
360         // check that the formatting is correct
361 
362         String fixedExemplar1 = prettyPrinter.format(exemplar1);
363         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar1);
364         if (!doubleCheck.equals(exemplar1)) {
365             result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
366                 .setSubtype(Subtype.internalUnicodeSetFormattingError)
367                 .setMessage("Internal Error: formatting not working for {0}", new Object[] { exemplar1 }));
368         }
369         // else if (!v.equals(fixedExemplar1)) {
370         // result.add(new CheckStatus().setCause(this).setType(CheckStatus.warningType)
371         // .setMessage("Better formatting would be \u200E{0}\u200E", new Object[]{fixedExemplar1}));
372         // }
373 
374         // now check that only allowed characters are in the set
375 
376         if (!exemplarType.allowed.containsAll(exemplar1)) {
377             UnicodeSet remainder0 = new UnicodeSet(exemplar1).removeAll(exemplarType.allowed);
378 
379             // we do allow for punctuation & combining marks in strings
380             UnicodeSet remainder = new UnicodeSet();
381             for (String s : remainder0) {
382                 if (Character.codePointCount(s, 0, s.length()) == 1) {
383                     remainder.add(s);
384                 } else {
385                     // just check normalization
386                 }
387             }
388 
389             // after a first check, we check again in case we flattened
390 
391             if (remainder.size() != 0) {
392                 fixedExemplar1 = prettyPrinter.format(exemplar1);
393                 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.errorType)
394                     .setSubtype(Subtype.illegalCharactersInExemplars)
395                     .setMessage("Should be limited to " + exemplarType.message + "; thus not contain: \u200E{0}\u200E",
396                         new Object[] { remainder }));
397             }
398         }
399 
400         // now check for empty
401 
402         if (!isRoot && exemplar1.size() == 0) {
403             switch (exemplarType) {
404 //            case currencySymbol: // ok if empty
405 //                break;
406             case auxiliary:
407                 result.add(new CheckStatus().setCause(this).setMainType(CheckStatus.warningType)
408                     .setSubtype(Subtype.missingAuxiliaryExemplars)
409                     .setMessage("Most languages allow <i>some<i> auxiliary characters, so review this."));
410                 break;
411             case index:
412             case punctuation:
413             case main:
414                 result.add(new CheckStatus()
415                     .setCause(this)
416                     .setMainType(CheckStatus.errorType)
417                     .setSubtype(Subtype.missingMainExemplars)
418                     .setMessage(
419                         "Exemplar set (" + exemplarType
420                             + ") must not be empty -- that would imply that this language uses no " +
421                             (exemplarType == ExemplarType.punctuation ? "punctuation" : "letters") + "!"));
422                 break;
423             }
424         }
425     }
426 }
427