1 package org.unicode.cldr.test;
2 
3 import java.util.Collections;
4 import java.util.EnumMap;
5 import java.util.List;
6 import java.util.Map;
7 import java.util.Set;
8 import java.util.TreeSet;
9 import java.util.regex.Matcher;
10 
11 import org.unicode.cldr.draft.ScriptMetadata;
12 import org.unicode.cldr.draft.ScriptMetadata.Info;
13 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
14 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
15 import org.unicode.cldr.tool.LikelySubtags;
16 import org.unicode.cldr.util.CLDRFile;
17 import org.unicode.cldr.util.CldrUtility;
18 import org.unicode.cldr.util.Counter;
19 import org.unicode.cldr.util.Factory;
20 import org.unicode.cldr.util.PathStarrer;
21 import org.unicode.cldr.util.PatternCache;
22 import org.unicode.cldr.util.RegexLookup;
23 
24 import com.google.common.base.Joiner;
25 import com.ibm.icu.lang.UCharacter;
26 import com.ibm.icu.text.BreakIterator;
27 import com.ibm.icu.util.ULocale;
28 
29 public class CheckConsistentCasing extends FactoryCheckCLDR {
30 
31     private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
32 
33     private static final double MIN_FACTOR = 2.5;
34     // remember to add this class to the list in CheckCLDR.getCheckAll
35     // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.*
36 
37     ULocale uLocale = null;
38     BreakIterator breaker = null;
39     private String locale;
40     CasingInfo casingInfo;
41     private boolean hasCasingInfo;
42 
CheckConsistentCasing(Factory factory)43     public CheckConsistentCasing(Factory factory) {
44         super(factory);
45         casingInfo = new CasingInfo(factory);
46     }
47 
48     @Override
setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)49     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
50         List<CheckStatus> possibleErrors) {
51         if (cldrFileToCheck == null) return this;
52         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
53         locale = cldrFileToCheck.getLocaleID();
54         // get info about casing; note that this is done in two steps since
55         // ScriptMetadata.getInfo() returns null, in some instances.
56         // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
57         String script = new LikelySubtags().getLikelyScript(locale);
58         Info localeInfo = ScriptMetadata.getInfo(script);
59 
60         if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
61             // this script has casing info, so we can request it here
62             try {
63                 types = casingInfo.getLocaleCasing(locale);
64             } catch (Exception e) {
65                 types = Collections.emptyMap();
66             }
67         } else {
68             // no casing info - since the types Map is global, and null checks aren't done,
69             // we are better off  with an empty map here
70             types = Collections.emptyMap();
71         }
72         if (types == null || types.isEmpty()) {
73             possibleErrors.add(new CheckStatus().setCause(this)
74                 .setMainType(CheckStatus.warningType)
75                 .setSubtype(Subtype.incorrectCasing)
76                 .setMessage("Could not load casing info for {0}", locale));
77         }
78         // types may be null, avoid NPE
79         hasCasingInfo = (types == null) ? false : types.size() > 0;
80         return this;
81     }
82 
83     // If you don't need any file initialization or postprocessing, you only need this one routine
84     @Override
handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)85     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
86         List<CheckStatus> result) {
87         // it helps performance to have a quick reject of most paths
88         if (fullPath == null) return this; // skip paths that we don't have
89         if (!hasCasingInfo) return this;
90 
91         String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
92         if (locale2.equals(locale) && value != null && value.length() > 0) {
93             Category category = getCategory(path);
94             if (category != null) {
95                 checkConsistentCasing(category, path, fullPath, value, options, result);
96             }
97         }
98         return this;
99     }
100 
    /**
     * Matcher for numbered placeholders of the form <code>{digits}</code>, e.g. <code>{0}</code>.
     * Not used by any live code in this class.
     * NOTE(review): this is a single shared Matcher instance; confirm no other class uses it concurrently.
     */
    static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
102 
103     /**
104      * The casing type of a given string.
105      */
106     public enum CasingType {
107         titlecase, lowercase, other;
from(String s)108         public static CasingType from(String s) {
109             if (s == null || s.length() == 0) {
110                 return other;
111             }
112             int cp;
113             // Look for the first meaningful character in the string to determine case.
114             for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
115                 cp = s.codePointAt(i);
116                 // used to skip the placeholders, but works better to have them be 'other'
117                 // if (cp == '{') {
118                 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
119                 // i = placeholder.end() - 1; // skip
120                 // continue;
121                 // }
122                 // }
123                 int type = UCharacter.getType(cp);
124                 switch (type) {
125 
126                 case UCharacter.LOWERCASE_LETTER:
127                     return lowercase;
128 
129                 case UCharacter.UPPERCASE_LETTER:
130                 case UCharacter.TITLECASE_LETTER:
131                     return titlecase;
132 
133                 // for other letters / numbers / symbols, return other
134                 case UCharacter.OTHER_LETTER:
135                 case UCharacter.DECIMAL_DIGIT_NUMBER:
136                 case UCharacter.LETTER_NUMBER:
137                 case UCharacter.OTHER_NUMBER:
138                 case UCharacter.MATH_SYMBOL:
139                 case UCharacter.CURRENCY_SYMBOL:
140                 case UCharacter.MODIFIER_SYMBOL:
141                 case UCharacter.OTHER_SYMBOL:
142                     return other;
143                 // ignore everything else (whitespace, punctuation, etc) and keep going
144                 }
145             }
146             return other;
147         }
148 
149         /**
150          * Return true if either is other, or they are identical.
151          */
worksWith(CasingType otherType)152         public boolean worksWith(CasingType otherType) {
153             return otherType == null || this == otherType || this == CasingType.other || otherType == CasingType.other;
154         }
155     }
156 
157     public enum CasingTypeAndErrFlag {
158         titlecase_mismatchWarn(CasingType.titlecase, false), titlecase_mismatchErr(CasingType.titlecase, true), lowercase_mismatchWarn(CasingType.lowercase,
159             false), lowercase_mismatchErr(CasingType.lowercase, true), other_mismatchWarn(CasingType.other, false), other_mismatchErr(CasingType.other, true);
160 
161         private final CasingType type;
162         private final boolean flag; // force error instead of warning for mismatch
163 
CasingTypeAndErrFlag(CasingType type, boolean flag)164         private CasingTypeAndErrFlag(CasingType type, boolean flag) {
165             this.type = type;
166             this.flag = flag;
167         }
168 
type()169         public CasingType type() {
170             return type;
171         }
172 
flag()173         public boolean flag() {
174             return flag;
175         }
176     }
177 
178     static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>()
179         .add("//ldml/localeDisplayNames/languages/language", Category.language)
180         .add("//ldml/localeDisplayNames/scripts/script", Category.script)
181         .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
182         .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
183         .add("//ldml/localeDisplayNames/keys/key", Category.key)
184         .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
185         .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
186         .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow)
187         .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow)
188         .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
189         .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow)
190         .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow)
191         .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
192         .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
193         .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
194         .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow)
195         .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated)
196         .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide)
197         .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide)
198         .add("//ldml/.*/relative", Category.relative)
199         .add("//ldml/dates/fields", Category.calendar_field)
200         .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity)
201         .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
202         .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
203         .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases
204         .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
205         .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
206         .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
207         .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count)
208         .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
209         .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
210         .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
211     // ldml/localeDisplayNames/keys/key[@type=".*"]
212     // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
213     // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
214     ;
215 
216     Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);
217 
218     public enum Category {
219         language, script, territory, variant, keyValue, month_narrow, month_format_except_narrow, month_standalone_except_narrow, day_narrow, day_format_except_narrow, day_standalone_except_narrow, era_narrow, era_abbr, era_name, quarter_narrow, quarter_abbreviated, quarter_format_wide, quarter_standalone_wide, calendar_field, zone_exemplarCity, zone_short, zone_long, NOT_USED, metazone_short, metazone_long, symbol, currencyName_count, currencyName, relative, unit_pattern, key;
220     }
221 
222     // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
223     // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
224     // //ldml/numbers/currencies/currency[@type="BYB"]/symbol
225 
getCategory(String path)226     static Category getCategory(String path) {
227         return pathToBucket.get(path);
228     }
229 
230     /**
231      * Calculates casing information using data from the specified CLDRFile.
232      *
233      * @param resolved
234      *            the resolved CLDRFile to calculate casing information from
235      * @return
236      */
getSamples(CLDRFile resolved)237     public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
238         // Use EnumMap instead of an array for type safety.
239         Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class);
240 
241         for (Category category : Category.values()) {
242             counters.put(category, new Counter<CasingType>());
243         }
244         PathStarrer starrer = new PathStarrer();
245         boolean isRoot = "root".equals(resolved.getLocaleID());
246         Set<String> missing = !DEBUG ? null : new TreeSet<>();
247 
248         for (String path : resolved) {
249             if (!isRoot) {
250                 String locale2 = resolved.getSourceLocaleID(path, null);
251                 if (locale2.equals("root") || locale2.equals("code-fallback")) {
252                     continue;
253                 }
254             }
255             String winningPath = resolved.getWinningPath(path);
256             if (!winningPath.equals(path)) {
257                 continue;
258             }
259             Category category = getCategory(path);
260             if (category != null) {
261                 String value = resolved.getStringValue(path);
262                 if (value == null || value.length() == 0) continue;
263                 CasingType ft = CasingType.from(value);
264                 counters.get(category).add(ft, 1);
265             } else if (DEBUG) {
266                 String starred = starrer.set(path);
267                 missing.add(starred);
268             }
269         }
270 
271         Map<Category, CasingType> info = new EnumMap<>(Category.class);
272         for (Category category : Category.values()) {
273             if (category == Category.NOT_USED) continue;
274             Counter<CasingType> counter = counters.get(category);
275             long countLower = counter.getCount(CasingType.lowercase);
276             long countUpper = counter.getCount(CasingType.titlecase);
277             long countOther = counter.getCount(CasingType.other);
278             CasingType type;
279             if (countLower + countUpper == 0) {
280                 type = CasingType.other;
281             } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
282                 type = CasingType.lowercase;
283             } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
284                 type = CasingType.titlecase;
285             } else {
286                 type = CasingType.other;
287             }
288             info.put(category, type);
289         }
290         if (DEBUG && missing.size() != 0) {
291             System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing));
292         }
293         return info;
294     }
295 
296     private static final String CASE_WARNING = "The first letter of 〈{0}〉 is {1}, which differs from what is expected " +
297         "for the {2} category: that almost all values be {3}.\n\n" +
298         "For guidance, see ​http://cldr.org/translation/capitalization. " +
299         "If this warning is wrong, please file a ticket at http://unicode.org/cldr/trac/.";
300 
checkConsistentCasing(Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)301     private void checkConsistentCasing(Category category, String path, String fullPath, String value,
302         Options options, List<CheckStatus> result) {
303         // Avoid NPE
304         if (types != null) {
305             CasingType ft = CasingType.from(value);
306             CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
307             if (typeAndFlagFromCat == null) {
308                 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
309             }
310             if (!ft.worksWith(typeAndFlagFromCat.type())) {
311                 result.add(new CheckStatus().setCause(this)
312                     .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType)
313                     .setSubtype(Subtype.incorrectCasing) // typically warningType or errorType
314                     .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type())); // the message; can be MessageFormat with arguments
315             }
316         }
317     }
318 }