1 package org.unicode.cldr.test;
2 
3 import java.util.Collections;
4 import java.util.EnumMap;
5 import java.util.List;
6 import java.util.Map;
7 import java.util.Set;
8 import java.util.TreeSet;
9 import java.util.regex.Matcher;
10 
11 import org.unicode.cldr.draft.ScriptMetadata;
12 import org.unicode.cldr.draft.ScriptMetadata.Info;
13 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
14 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
15 import org.unicode.cldr.tool.LikelySubtags;
16 import org.unicode.cldr.util.CLDRFile;
17 import org.unicode.cldr.util.CldrUtility;
18 import org.unicode.cldr.util.Counter;
19 import org.unicode.cldr.util.Factory;
20 import org.unicode.cldr.util.PathStarrer;
21 import org.unicode.cldr.util.PatternCache;
22 import org.unicode.cldr.util.RegexLookup;
23 import org.unicode.cldr.util.XPathParts;
24 
25 import com.ibm.icu.dev.util.CollectionUtilities;
26 import com.ibm.icu.lang.UCharacter;
27 import com.ibm.icu.text.BreakIterator;
28 import com.ibm.icu.util.ULocale;
29 
30 public class CheckConsistentCasing extends FactoryCheckCLDR {
31 
32     private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
33 
34     private static final double MIN_FACTOR = 2.5;
35     // remember to add this class to the list in CheckCLDR.getCheckAll
36     // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* -t.*Currencies.*
37 
38     XPathParts parts = new XPathParts(); // used to parse out a path
39     ULocale uLocale = null;
40     BreakIterator breaker = null;
41     private String locale;
42     CasingInfo casingInfo;
43     private boolean hasCasingInfo;
44 
/**
 * Creates the check and a {@link CasingInfo} built from the same factory.
 *
 * @param factory
 *            factory handed to the superclass and to the casing-data loader
 */
public CheckConsistentCasing(Factory factory) {
    super(factory);
    this.casingInfo = new CasingInfo(factory);
}
49 
50     @Override
setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)51     public CheckCLDR setCldrFileToCheck(CLDRFile cldrFileToCheck, Options options,
52         List<CheckStatus> possibleErrors) {
53         if (cldrFileToCheck == null) return this;
54         super.setCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
55         locale = cldrFileToCheck.getLocaleID();
56         // get info about casing; note that this is done in two steps since
57         // ScriptMetadata.getInfo() returns null, in some instances.
58         // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
59         String script = new LikelySubtags().getLikelyScript(locale);
60         Info localeInfo = ScriptMetadata.getInfo(script);
61 
62         if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
63             // this script has casing info, so we can request it here
64             try {
65                 types = casingInfo.getLocaleCasing(locale);
66             } catch (Exception e) {
67                 types = Collections.emptyMap();
68             }
69         } else {
70             // no casing info - since the types Map is global, and null checks aren't done,
71             // we are better off  with an empty map here
72             types = Collections.emptyMap();
73         }
74         if (types == null || types.isEmpty()) {
75             possibleErrors.add(new CheckStatus().setCause(this)
76                 .setMainType(CheckStatus.warningType)
77                 .setSubtype(Subtype.incorrectCasing)
78                 .setMessage("Could not load casing info for {0}", locale));
79         }
80         // types may be null, avoid NPE
81         hasCasingInfo = (types == null) ? false : types.size() > 0;
82         return this;
83     }
84 
85     // If you don't need any file initialization or postprocessing, you only need this one routine
handleCheck(String path, String fullPath, String value, Options options, List<CheckStatus> result)86     public CheckCLDR handleCheck(String path, String fullPath, String value, Options options,
87         List<CheckStatus> result) {
88         // it helps performance to have a quick reject of most paths
89         if (fullPath == null) return this; // skip paths that we don't have
90         if (!hasCasingInfo) return this;
91 
92         String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
93         if (locale2.equals(locale) && value != null && value.length() > 0) {
94             Category category = getCategory(path);
95             if (category != null) {
96                 checkConsistentCasing(category, path, fullPath, value, options, result);
97             }
98         }
99         return this;
100     }
101 
102     static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
103 
104     /**
105      * The casing type of a given string.
106      */
107     public enum CasingType {
108         titlecase, lowercase, other;
from(String s)109         public static CasingType from(String s) {
110             if (s == null || s.length() == 0) {
111                 return other;
112             }
113             int cp;
114             // Look for the first meaningful character in the string to determine case.
115             for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
116                 cp = s.codePointAt(i);
117                 // used to skip the placeholders, but works better to have them be 'other'
118                 // if (cp == '{') {
119                 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
120                 // i = placeholder.end() - 1; // skip
121                 // continue;
122                 // }
123                 // }
124                 int type = UCharacter.getType(cp);
125                 switch (type) {
126 
127                 case UCharacter.LOWERCASE_LETTER:
128                     return lowercase;
129 
130                 case UCharacter.UPPERCASE_LETTER:
131                 case UCharacter.TITLECASE_LETTER:
132                     return titlecase;
133 
134                 // for other letters / numbers / symbols, return other
135                 case UCharacter.OTHER_LETTER:
136                 case UCharacter.DECIMAL_DIGIT_NUMBER:
137                 case UCharacter.LETTER_NUMBER:
138                 case UCharacter.OTHER_NUMBER:
139                 case UCharacter.MATH_SYMBOL:
140                 case UCharacter.CURRENCY_SYMBOL:
141                 case UCharacter.MODIFIER_SYMBOL:
142                 case UCharacter.OTHER_SYMBOL:
143                     return other;
144                 // ignore everything else (whitespace, punctuation, etc) and keep going
145                 }
146             }
147             return other;
148         }
149 
150         /**
151          * Return true if either is other, or they are identical.
152          */
worksWith(CasingType otherType)153         public boolean worksWith(CasingType otherType) {
154             return otherType == null || this == otherType || this == CasingType.other || otherType == CasingType.other;
155         }
156     }
157 
158     public enum CasingTypeAndErrFlag {
159         titlecase_mismatchWarn(CasingType.titlecase, false), titlecase_mismatchErr(CasingType.titlecase, true), lowercase_mismatchWarn(CasingType.lowercase,
160             false), lowercase_mismatchErr(CasingType.lowercase, true), other_mismatchWarn(CasingType.other, false), other_mismatchErr(CasingType.other, true);
161 
162         private final CasingType type;
163         private final boolean flag; // force error instead of warning for mismatch
164 
CasingTypeAndErrFlag(CasingType type, boolean flag)165         private CasingTypeAndErrFlag(CasingType type, boolean flag) {
166             this.type = type;
167             this.flag = flag;
168         }
169 
type()170         public CasingType type() {
171             return type;
172         }
173 
flag()174         public boolean flag() {
175             return flag;
176         }
177     }
178 
179     static final RegexLookup<Category> pathToBucket = new RegexLookup<Category>()
180         .add("//ldml/localeDisplayNames/languages/language", Category.language)
181         .add("//ldml/localeDisplayNames/scripts/script", Category.script)
182         .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
183         .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
184         .add("//ldml/localeDisplayNames/keys/key", Category.key)
185         .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
186         .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
187         .add("//ldml/dates/calendars/calendar.*/months.*format", Category.month_format_except_narrow)
188         .add("//ldml/dates/calendars/calendar.*/months", Category.month_standalone_except_narrow)
189         .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
190         .add("//ldml/dates/calendars/calendar.*/days.*format", Category.day_format_except_narrow)
191         .add("//ldml/dates/calendars/calendar.*/days", Category.day_standalone_except_narrow)
192         .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
193         .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
194         .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
195         .add("//ldml/dates/calendars/calendar.*/quarters.*narrow", Category.quarter_narrow)
196         .add("//ldml/dates/calendars/calendar.*/quarters.*abbreviated", Category.quarter_abbreviated)
197         .add("//ldml/dates/calendars/calendar.*/quarters.*format", Category.quarter_format_wide)
198         .add("//ldml/dates/calendars/calendar.*/quarters", Category.quarter_standalone_wide)
199         .add("//ldml/.*/relative", Category.relative)
200         .add("//ldml/dates/fields", Category.calendar_field)
201         .add("//ldml/dates/timeZoneNames/zone.*/exemplarCity", Category.zone_exemplarCity)
202         .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
203         .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
204         .add("//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", Category.NOT_USED) // just to remove them from the other cases
205         .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
206         .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
207         .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
208         .add("//ldml/numbers/currencies/currency.*/displayName.*@count", Category.currencyName_count)
209         .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
210         .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
211         .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
212     // ldml/localeDisplayNames/keys/key[@type=".*"]
213     // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
214     // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
215     ;
216 
217     Map<Category, CasingTypeAndErrFlag> types = new EnumMap<Category, CasingTypeAndErrFlag>(Category.class);
218 
/**
 * Buckets of paths whose values are expected to share one casing.
 * The declaration order is kept as-is because ordinals and values() expose it.
 */
public enum Category {
    // locale display names
    language,
    script,
    territory,
    variant,
    keyValue,
    // months and days
    month_narrow,
    month_format_except_narrow,
    month_standalone_except_narrow,
    day_narrow,
    day_format_except_narrow,
    day_standalone_except_narrow,
    // eras and quarters
    era_narrow,
    era_abbr,
    era_name,
    quarter_narrow,
    quarter_abbreviated,
    quarter_format_wide,
    quarter_standalone_wide,
    calendar_field,
    // time zones
    zone_exemplarCity,
    zone_short,
    zone_long,
    NOT_USED,
    metazone_short,
    metazone_long,
    // currencies
    symbol,
    currencyName_count,
    currencyName,
    // remaining buckets
    relative,
    unit_pattern,
    key;
}
222 
223     // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
224     // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
225     // //ldml/numbers/currencies/currency[@type="BYB"]/symbol
226 
getCategory(String path)227     static Category getCategory(String path) {
228         return pathToBucket.get(path);
229     }
230 
231     /**
232      * Calculates casing information using data from the specified CLDRFile.
233      *
234      * @param resolved
235      *            the resolved CLDRFile to calculate casing information from
236      * @return
237      */
getSamples(CLDRFile resolved)238     public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
239         // Use EnumMap instead of an array for type safety.
240         Map<Category, Counter<CasingType>> counters = new EnumMap<Category, Counter<CasingType>>(Category.class);
241 
242         for (Category category : Category.values()) {
243             counters.put(category, new Counter<CasingType>());
244         }
245         PathStarrer starrer = new PathStarrer();
246         boolean isRoot = "root".equals(resolved.getLocaleID());
247         Set<String> missing = !DEBUG ? null : new TreeSet<String>();
248 
249         for (String path : resolved) {
250             if (!isRoot) {
251                 String locale2 = resolved.getSourceLocaleID(path, null);
252                 if (locale2.equals("root") || locale2.equals("code-fallback")) {
253                     continue;
254                 }
255             }
256             String winningPath = resolved.getWinningPath(path);
257             if (!winningPath.equals(path)) {
258                 continue;
259             }
260             Category category = getCategory(path);
261             if (category != null) {
262                 String value = resolved.getStringValue(path);
263                 if (value == null || value.length() == 0) continue;
264                 CasingType ft = CasingType.from(value);
265                 counters.get(category).add(ft, 1);
266             } else if (DEBUG) {
267                 String starred = starrer.set(path);
268                 missing.add(starred);
269             }
270         }
271 
272         Map<Category, CasingType> info = new EnumMap<Category, CasingType>(Category.class);
273         for (Category category : Category.values()) {
274             if (category == Category.NOT_USED) continue;
275             Counter<CasingType> counter = counters.get(category);
276             long countLower = counter.getCount(CasingType.lowercase);
277             long countUpper = counter.getCount(CasingType.titlecase);
278             long countOther = counter.getCount(CasingType.other);
279             CasingType type;
280             if (countLower + countUpper == 0) {
281                 type = CasingType.other;
282             } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
283                 type = CasingType.lowercase;
284             } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
285                 type = CasingType.titlecase;
286             } else {
287                 type = CasingType.other;
288             }
289             info.put(category, type);
290         }
291         if (DEBUG && missing.size() != 0) {
292             System.out.println("Paths skipped:\n" + CollectionUtilities.join(missing, "\n"));
293         }
294         return info;
295     }
296 
297     private static final String CASE_WARNING = "The first letter of 〈{0}〉 is {1}, which differs from what is expected " +
298         "for the {2} category: that almost all values be {3}.\n\n" +
299         "For guidance, see ​http://cldr.org/translation/capitalization. " +
300         "If this warning is wrong, please file a ticket at http://unicode.org/cldr/trac/.";
301 
checkConsistentCasing(Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)302     private void checkConsistentCasing(Category category, String path, String fullPath, String value,
303         Options options, List<CheckStatus> result) {
304         // Avoid NPE
305         if (types != null) {
306             CasingType ft = CasingType.from(value);
307             CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
308             if (typeAndFlagFromCat == null) {
309                 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
310             }
311             if (!ft.worksWith(typeAndFlagFromCat.type())) {
312                 result.add(new CheckStatus().setCause(this)
313                     .setMainType(typeAndFlagFromCat.flag() ? CheckStatus.errorType : CheckStatus.warningType)
314                     .setSubtype(Subtype.incorrectCasing) // typically warningType or errorType
315                     .setMessage(CASE_WARNING, value, ft, category, typeAndFlagFromCat.type())); // the message; can be MessageFormat with arguments
316             }
317         }
318     }
319 }