1 /* Copyright (C) 2007-2013 Google and others.  All Rights Reserved. */
2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */
3 
4 package org.unicode.cldr.test;
5 
6 import java.util.Arrays;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 
16 import org.unicode.cldr.test.CheckExemplars.ExemplarType;
17 import org.unicode.cldr.util.Builder;
18 import org.unicode.cldr.util.CLDRFile;
19 import org.unicode.cldr.util.CLDRLocale;
20 import org.unicode.cldr.util.CldrUtility;
21 import org.unicode.cldr.util.DateTimeCanonicalizer;
22 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
23 import org.unicode.cldr.util.Emoji;
24 import org.unicode.cldr.util.ICUServiceBuilder;
25 import org.unicode.cldr.util.PatternCache;
26 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
27 import org.unicode.cldr.util.With;
28 import org.unicode.cldr.util.XPathParts;
29 
30 import com.google.common.base.Joiner;
31 import com.google.common.base.Splitter;
32 import com.google.myanmartools.ZawgyiDetector;
33 import com.ibm.icu.lang.UCharacter;
34 import com.ibm.icu.text.Collator;
35 import com.ibm.icu.text.DateIntervalInfo;
36 import com.ibm.icu.text.DateTimePatternGenerator;
37 import com.ibm.icu.text.DecimalFormat;
38 import com.ibm.icu.text.Normalizer;
39 import com.ibm.icu.text.RuleBasedCollator;
40 import com.ibm.icu.text.Transform;
41 import com.ibm.icu.text.Transliterator;
42 import com.ibm.icu.text.UnicodeSet;
43 import com.ibm.icu.text.UnicodeSetIterator;
44 import com.ibm.icu.util.ULocale;
45 
46 /**
47  * Class for processing the input and output of CLDR data for use in the
48  * Survey Tool and other tools.
49  */
50 public class DisplayAndInputProcessor {
51 
    /** Passed to the constructor of the shared {@code dtc} date/time canonicalizer. */
    private static final boolean FIX_YEARS = true;

    /** Debug switch, read from the "DEBUG_DAIP" property (default false); guards diagnostic printing. */
    public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false);

    /** Frozen set of characters whose bidi class is Arabic_Letter or Right_To_Left. */
    public static final UnicodeSet RTL = new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]")
        .freeze();

    /** Frozen set of characters that init() hands to the pretty printer via setToQuote. */
    public static final UnicodeSet TO_QUOTE = new UnicodeSet(
        "[[:Cn:]" +
            "[:Default_Ignorable_Code_Point:]" +
            "[:patternwhitespace:]" +
            "[:Me:][:Mn:]]" // add non-spacing marks
    ).freeze();

    /** Matches paths of "standard"-type number format patterns under //ldml/numbers. */
    public static final Pattern NUMBER_FORMAT_XPATH = Pattern
        .compile("//ldml/numbers/.*Format\\[@type=\"standard\"]/pattern.*");

    /** Matches the decimal and group separator paths under //ldml/numbers/symbols. */
    public static final Pattern NUMBER_SEPARATOR_PATTERN = Pattern
        .compile("//ldml/numbers/symbols.*/(decimal|group)");

    /** Paths whose values are exempt from apostrophe normalization (patterns, symbols, delimiters, ...). */
    private static final Pattern APOSTROPHE_SKIP_PATHS = PatternCache.get("//ldml/("
        + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|"
        + "characters/.*|"
        + "delimiters/.*|"
        + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|"
        + "units/.+/unitPattern.*|"
        + "units/.+/durationUnitPattern.*|"
        + "numbers/symbols.*|"
        + "numbers/miscPatterns.*|"
        + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)");
    /** Paths of date interval formats; their hyphens are normalized by normalizeIntervalHyphens. */
    private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*");
    /** A period that is neither preceded nor followed by one of 0, # or an apostrophe. */
    private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
    /** A run of whitespace not including NBSP, i.e. space, tab, newline, carriage return and the like. */
    private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+");
    /** A run of whitespace including NBSP (U+00A0). */
    private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = PatternCache.get("[\\s\\u00A0]+");
    /** Frozen set of characters with the Unicode whitespace property. */
    private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
93 
    // Locales that processInput() tests with childOf() to select locale-specific normalization.
    private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
    private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro");
    private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca");
    private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo");
    private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg");
    private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he");
    private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my");
    private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky");
    private static final CLDRLocale URDU = CLDRLocale.getInstance("ur");
    private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps");
    private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa");
    private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH");
    private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw");
    private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm");
    /**
     * Codes for which normalizeApostrophes() turns an ASCII apostrophe into U+02BC.
     * The lookup there uses locale.getLanguage().
     */
    public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = new HashSet<>(
        Arrays.asList("br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn"));

    // Each conversion row lists the source character first, followed by its replacement character(s).

    // Ş ş Ţ ţ  =>  Ș ș Ț ț
    private static final char[][] ROMANIAN_CONVERSIONS = {
        { '\u015E', '\u0218' }, { '\u015F', '\u0219' }, { '\u0162', '\u021A' },
        { '\u0163', '\u021B' } };

    // Catalan rows have two replacement characters: the base letter plus a middle dot.
    private static final char[][] CATALAN_CONVERSIONS = {
        { '\u013F', '\u004C', '\u00B7' }, // Ŀ -> L·
        { '\u0140', '\u006C', '\u00B7' } }; // ŀ -> l·

    private static final char[][] NGOMBA_CONVERSIONS = {
        { '\u0251', '\u0061' }, { '\u0261', '\u0067' }, //  ɑ -> a , ɡ -> g , See ticket #5691
        { '\u2019', '\uA78C' }, { '\u02BC', '\uA78C' } }; //  Saltillo, see ticket #6805
123 
124     private static final char[][] KWASIO_CONVERSIONS = {
125         { '\u0306', '\u030C' }, // See ticket #6571, use caron instead of breve
126         { '\u0103', '\u01CE' }, { '\u0102', '\u01CD' }, // a-breve -> a-caron
127         { '\u0115', '\u011B' }, { '\u011A', '\u01CD' }, // e-breve -> e-caron
128         { '\u012D', '\u01D0' }, { '\u012C', '\u01CF' }, // i-breve -> i-caron
129         { '\u014F', '\u01D2' }, { '\u014E', '\u01D1' }, // o-breve -> o-caron
130         { '\u016D', '\u01D4' }, { '\u016C', '\u01D3' } // u-breve -> u-caron
131     };
132 
    // Applied by processInput() to Hebrew values on paths not matched by APOSTROPHE_SKIP_PATHS.
    private static final char[][] HEBREW_CONVERSIONS = {
        { '\'', '\u05F3' }, { '"', '\u05F4' } }; //  ' -> geresh  " -> gershayim

    private static final char[][] KYRGYZ_CONVERSIONS = {
        { 'ӊ', 'ң' }, { 'Ӊ', 'Ң' } }; //  right modifier

    // Shared by Urdu, Pashto and Farsi in processInput().
    private static final char[][] URDU_PLUS_CONVERSIONS = {
        { '\u0643', '\u06A9' }}; //  wrong char

    // Zawgyi detection and the "Zawgyi-my" transliterator, shared by all instances.
    private static final ZawgyiDetector detector = new ZawgyiDetector();
    private static final Transliterator zawgyiUnicodeTransliterator =
        Transliterator.getInstance("Zawgyi-my");

    // Ordering collator for the pretty printer; assigned in init() only when needsCollator is true.
    private Collator col;

    // Space comparator for the pretty printer; assigned in init() only when needsCollator is true.
    private Collator spaceCol;

    // Stays null unless init() runs with needsCollator == true.
    private UnicodeSetPrettyPrinter pp = null;

    // Locale this processor works for; set once by each constructor.
    final private CLDRLocale locale;
    // True when the locale's string form contains "POSIX" (computed in init()).
    private boolean isPosix;
154 
155     /**
156      * Constructor, taking cldrFile.
157      *
158      * @param cldrFileToCheck
159      */
DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)160     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) {
161         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator);
162     }
163 
DisplayAndInputProcessor(CLDRFile cldrFileToCheck)164     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) {
165         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true);
166     }
167 
init(CLDRLocale locale, boolean needsCollator)168     void init(CLDRLocale locale, boolean needsCollator) {
169         isPosix = locale.toString().indexOf("POSIX") >= 0;
170         if (needsCollator) {
171             ICUServiceBuilder isb = null;
172             try {
173                 isb = ICUServiceBuilder.forLocale(locale);
174             } catch (Exception e) {
175             }
176 
177             if (isb != null) {
178                 try {
179                     col = isb.getRuleBasedCollator();
180                 } catch (Exception e) {
181                     col = Collator.getInstance(ULocale.ROOT);
182                 }
183             } else {
184                 col = Collator.getInstance(ULocale.ROOT);
185             }
186 
187             spaceCol = Collator.getInstance(locale.toULocale());
188             if (spaceCol instanceof RuleBasedCollator) {
189                 ((RuleBasedCollator) spaceCol).setAlternateHandlingShifted(false);
190             }
191             pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT))
192                 .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY))
193                 .setCompressRanges(true)
194                 .setToQuote(new UnicodeSet(TO_QUOTE))
195                 .setOrdering(col)
196                 .setSpaceComparator(spaceCol);
197         }
198     }
199 
    /**
     * Returns the UnicodeSet pretty printer of this processor.
     *
     * @return the pretty printer; {@code null} when init was run with
     *         {@code needsCollator == false}, because the field is only assigned otherwise
     */
    public UnicodeSetPrettyPrinter getPrettyPrinter() {
        return pp;
    }
203 
204     /**
205      * Constructor, taking ULocale and boolean.
206      *
207      * @param locale the ULocale
208      * @param needsCollator true or false
209      *
210      * Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE
211      */
DisplayAndInputProcessor(ULocale locale, boolean needsCollator)212     public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) {
213         init(this.locale = CLDRLocale.getInstance(locale), needsCollator);
214     }
215 
216     /**
217      * Constructor, taking ULocale.
218      *
219      * @param locale the ULocale
220      */
DisplayAndInputProcessor(ULocale locale)221     public DisplayAndInputProcessor(ULocale locale) {
222         init(this.locale = CLDRLocale.getInstance(locale), true /* needsCollator */);
223     }
224 
225     /**
226      * Constructor, taking CLDRLocale and boolean.
227      *
228      * @param locale the CLDRLocale
229      * @param needsCollator true or false
230      */
DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)231     public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) {
232         init(this.locale = locale, needsCollator);
233     }
234 
235     /**
236      * Constructor, taking locale.
237      *
238      * @param locale
239      */
DisplayAndInputProcessor(CLDRLocale locale)240     public DisplayAndInputProcessor(CLDRLocale locale) {
241         init(this.locale = locale, true);
242     }
243 
244     /**
245      * Process the value for display. The result is a string for display in the
246      * Survey tool or similar program.
247      *
248      * @param path
249      * @param value
250      * @param fullPath
251      * @return
252      */
processForDisplay(String path, String value)253     public synchronized String processForDisplay(String path, String value) {
254         value = Normalizer.compose(value, false); // Always normalize all text to NFC.
255         if (hasUnicodeSetValue(path)) {
256             value = displayUnicodeSet(value);
257         } else if (path.contains("stopword")) {
258             return value.trim().isEmpty() ? "NONE" : value;
259         } else {
260             NumericType numericType = NumericType.getNumericType(path);
261             if (numericType != NumericType.NOT_NUMERIC) {
262                 // Canonicalize existing values that aren't canonicalized yet.
263                 // New values will be canonicalized on input using processInput().
264                 try {
265                     value = getCanonicalPattern(value, numericType, isPosix);
266                 } catch (IllegalArgumentException e) {
267                     if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value);
268                 }
269                 if (numericType != NumericType.CURRENCY && numericType != NumericType.CURRENCY_ABBREVIATED) {
270                     value = value.replace("'", "");
271                 }
272             }
273         }
274         // Fix up any apostrophes in number symbols
275         if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
276             value = value.replace('\'', '\u2019');
277         }
278         // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
279         if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
280             value = normalizeApostrophes(value);
281         }
282         // Fix up hyphens, replacing with N-dash as appropriate
283         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
284             value = normalizeIntervalHyphens(value);
285         } else {
286             value = normalizeHyphens(value);
287         }
288         return value;
289     }
290 
hasUnicodeSetValue(String path)291     private boolean hasUnicodeSetValue(String path) {
292         return path.startsWith("//ldml/characters/exemplarCharacters") || path.startsWith("//ldml/characters/parseLenients");
293     }
294 
    // Frozen set built from the same property expression as UNICODE_WHITESPACE.
    static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
    // Shared date/time pattern canonicalizer, constructed with FIX_YEARS.
    static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS);

    // Splits on "|" or on a lone lowercase "l" surrounded by whitespace (presumably a
    // mistyped bar; TODO confirm), trimming the pieces and dropping empty ones.
    public static final Splitter SPLIT_BAR = Splitter.on(Pattern.compile("(\\||\\s+l\\s+)")).trimResults().omitEmptyStrings();
    // Splits on single spaces, trimming the pieces and dropping empty ones.
    static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings();
    // Joins annotation keywords back together with " | ".
    static final Joiner JOIN_BAR = Joiner.on(" | ");
301 
302     /**
303      * Process the value for input. The result is a cleaned-up value. For example,
304      * an exemplar set is modified to be in the normal format, and any missing [ ]
305      * are added (a common omission on entry). If there are any failures then the
306      * original value is returned, so that the proper error message can be given.
307      *
308      * @param path
309      * @param value
310      * @param internalException
311      *            TODO
312      * @param fullPath
313      * @return
314      */
processInput(String path, String value, Exception[] internalException)315     public synchronized String processInput(String path, String value, Exception[] internalException) {
316         String original = value;
317         value = stripProblematicControlCharacters(value);
318         value = Normalizer.compose(value, false); // Always normalize all input to NFC.
319         if (internalException != null) {
320             internalException[0] = null;
321         }
322         // skip processing for inheritance marker
323         if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
324             return value; // Reference: https://unicode.org/cldr/trac/ticket/11261
325         }
326         // for root annotations
327         if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) {
328             return value; // Reference: https://unicode.org/cldr/trac/ticket/11261
329         }
330 
331         try {
332             // Normalise Malayalam characters.
333             boolean isUnicodeSet = hasUnicodeSetValue(path);
334             if (locale.childOf(MALAYALAM)) {
335                 String newvalue = normalizeMalayalam(value);
336                 if (DEBUG_DAIP) System.out.println("DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'");
337                 value = newvalue;
338             } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) {
339                 value = standardizeRomanian(value);
340             } else if (locale.childOf(CATALAN) && !isUnicodeSet) {
341                 value = standardizeCatalan(value);
342             } else if (locale.childOf(NGOMBA) && !isUnicodeSet) {
343                 value = standardizeNgomba(value);
344             } else if (locale.childOf(KWASIO) && !isUnicodeSet) {
345                 value = standardizeKwasio(value);
346             } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
347                 value = replaceChars(path, value, HEBREW_CONVERSIONS, false);
348             } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) && !isUnicodeSet) {
349                 value = standardizeSwissGerman(value);
350             } else if (locale.childOf(MYANMAR) && !isUnicodeSet) {
351                 value = standardizeMyanmar(value);
352             } else if (locale.childOf(KYRGYZ)) {
353                 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false);
354             } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) {
355                 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true);
356             } else if (locale.childOf(FF_ADLAM) && !isUnicodeSet) {
357                 value = fixAdlamNasalization(value);
358             }
359 
360             if (UNICODE_WHITESPACE.containsSome(value)) {
361                 value = normalizeWhitespace(path, value);
362             }
363 
364             // all of our values should not have leading or trailing spaces, except insertBetween
365             if (!path.contains("/insertBetween") && !isUnicodeSet) {
366                 value = value.trim();
367             }
368 
369             // fix grouping separator if space
370             if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) {
371                 if (value.isEmpty()) {
372                     value = "\u00A0";
373                 }
374                 value = value.replace(' ', '\u00A0');
375             }
376 
377             // fix date patterns
378             DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path);
379             if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) {
380                 try {
381                     value = dtc.getCanonicalDatePattern(path, value, datetimePatternType);
382                 } catch (IllegalArgumentException ex) {
383                     return value;
384                 }
385             }
386 
387             if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) {
388                 value = normalizeCurrencyDisplayName(value);
389             }
390             NumericType numericType = NumericType.getNumericType(path);
391             if (numericType != NumericType.NOT_NUMERIC) {
392                 if (numericType == NumericType.CURRENCY) {
393                     value = value.replaceAll(" ", "\u00A0");
394                     if (numericType == NumericType.CURRENCY_ABBREVIATED) {
395                         value = value.replaceAll("0\\.0+", "0");
396                     }
397                 } else {
398                     value = value.replaceAll("([%\u00A4]) ", "$1\u00A0")
399                         .replaceAll(" ([%\u00A4])", "\u00A0$1");
400                     value = replace(NON_DECIMAL_PERIOD, value, "'.'");
401                     if (numericType == NumericType.DECIMAL_ABBREVIATED) {
402                         value = value.replaceAll("0\\.0+", "0");
403                     }
404                 }
405                 value = getCanonicalPattern(value, numericType, isPosix);
406             }
407 
408             // fix [,]
409             if (path.startsWith("//ldml/localeDisplayNames/languages/language")
410                 || path.startsWith("//ldml/localeDisplayNames/scripts/script")
411                 || path.startsWith("//ldml/localeDisplayNames/territories/territory")
412                 || path.startsWith("//ldml/localeDisplayNames/variants/variant")
413                 || path.startsWith("//ldml/localeDisplayNames/keys/key")
414                 || path.startsWith("//ldml/localeDisplayNames/types/type")) {
415                 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')');
416             }
417 
418             // Normalize two single quotes for the inches symbol.
419             if (path.contains("/units")) {
420                 value = value.replace("''", "″");
421             }
422 
423             // check specific cases
424             if (isUnicodeSet) {
425                 value = inputUnicodeSet(path, value);
426             } else if (path.contains("stopword")) {
427                 if (value.equals("NONE")) {
428                     value = "";
429                 }
430             }
431 
432             // Normalize ellipsis data.
433             if (path.startsWith("//ldml/characters/ellipsis")) {
434                 value = value.replace("...", "…");
435             }
436 
437             // Replace Arabic presentation forms with their nominal counterparts
438             value = replaceArabicPresentationForms(value);
439 
440             // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
441             if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
442                 value = normalizeApostrophes(value);
443             }
444             // Fix up any apostrophes in number symbols
445             if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
446                 value = value.replace('\'', '\u2019');
447             }
448             // Fix up hyphens, replacing with N-dash as appropriate
449             if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
450                 value = normalizeIntervalHyphens(value);
451             } else if (!isUnicodeSet) {
452                 value = normalizeHyphens(value);
453             }
454 
455             if (path.startsWith("//ldml/annotations/annotation")) {
456                 if (path.contains(Emoji.TYPE_TTS)) {
457                     // The row has something like "�� -name" in the first column. Cf. namePath, getNamePaths.
458                     // Normally the value is like "zebra" or "unicorn face", without "|".
459                     // If the user enters a value with "|",  discard anything after "|"; e.g., change "a | b | c" to "a".
460                     value = SPLIT_BAR.split(value).iterator().next();
461                 } else {
462                     // The row has something like "�� –keywords" in the first column. Cf. keywordPath, getKeywordPaths.
463                     // Normally the value is like "stripe | zebra", with "|".
464                     value = annotationsForDisplay(value);
465                 }
466             }
467 
468             return value;
469         } catch (RuntimeException e) {
470             if (internalException != null) {
471                 internalException[0] = e;
472             }
473             return original;
474         }
475     }
476 
477     /**
478      * Strip out all code points less than U+0020 except for U+0009 tab,
479      * U+000A line feed, and U+000D carriage return.
480      *
481      * @param s the string
482      * @return the resulting string
483      */
stripProblematicControlCharacters(String s)484     private String stripProblematicControlCharacters(String s) {
485         if (s == null || s.isEmpty()) {
486             return s;
487         }
488         return s.codePoints()
489             .filter(c -> (c >= 0x20 || c == 9 || c == 0xA || c == 0xD))
490             .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
491             .toString();
492     }
493 
    // When true, annotationsForDisplay() drops multi-word keywords whose words are all keywords themselves.
    private static final boolean REMOVE_COVERED_KEYWORDS = true;
495 
496     /**
497      * Produce a modification of the given annotation by sorting its components and filtering covered keywords.
498      *
499      * Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | panda".
500      *
501      * @param value the string
502      * @return the possibly modified string
503      */
annotationsForDisplay(String value)504     private static String annotationsForDisplay(String value) {
505         TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
506         sorted.addAll(SPLIT_BAR.splitToList(value));
507         if (REMOVE_COVERED_KEYWORDS) {
508             filterCoveredKeywords(sorted);
509         }
510         value = JOIN_BAR.join(sorted);
511         return value;
512     }
513 
514     /**
515      * Filter from the given set some keywords that include spaces, if they duplicate,
516      * or are "covered by", other keywords in the set.
517      *
518      * For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | panda bear"),
519      * then remove "panda bear", treating it as "covered" since the set already includes "panda" and "bear".
520      *
521      * @param sorted the set from which items may be removed
522      */
filterCoveredKeywords(TreeSet<String> sorted)523     public static void filterCoveredKeywords(TreeSet<String> sorted) {
524         // for now, just do single items
525         HashSet<String> toRemove = new HashSet<>();
526 
527         for (String item : sorted) {
528             List<String> list = SPLIT_SPACE.splitToList(item);
529             if (list.size() < 2) {
530                 continue;
531             }
532             if (sorted.containsAll(list)) {
533                 toRemove.add(item);
534             }
535         }
536         sorted.removeAll(toRemove);
537     }
538 
displayUnicodeSet(String value)539     private String displayUnicodeSet(String value) {
540         if (value.startsWith("[") && value.endsWith("]")) {
541             value = value.substring(1, value.length() - 1);
542         }
543 
544         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
545         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
546 
547         // if (RTL.containsSome(value) && value.startsWith("[") && value.endsWith("]")) {
548         // return "\u200E[\u200E" + value.substring(1,value.length()-2) + "\u200E]\u200E";
549         // }
550         return value;
551     }
552 
inputUnicodeSet(String path, String value)553     private String inputUnicodeSet(String path, String value) {
554         // clean up the user's input.
555         // first, fix up the '['
556         value = value.trim();
557 
558         // remove brackets and trim again before regex
559         if (value.startsWith("[")) {
560             value = value.substring(1);
561         }
562         if (value.endsWith("]") && (!value.endsWith("\\]") || value.endsWith("\\\\]"))) {
563             value = value.substring(0, value.length() - 1);
564         }
565         value = value.trim();
566 
567         value = replace(NEEDS_QUOTE1, value, "$1\\\\$2$3");
568         value = replace(NEEDS_QUOTE2, value, "$1\\\\$2$3");
569 
570         // re-add brackets.
571         value = "[" + value + "]";
572 
573         UnicodeSet exemplar = new UnicodeSet(value);
574         XPathParts parts = XPathParts.getFrozenInstance(path);
575         if (parts.getElement(2).equals("parseLenients")) {
576             return exemplar.toPattern(false);
577         }
578         final String type = parts.getAttributeValue(-1, "type");
579         ExemplarType exemplarType = type == null ? ExemplarType.main : ExemplarType.valueOf(type);
580         value = getCleanedUnicodeSet(exemplar, pp, exemplarType);
581         return value;
582     }
583 
normalizeWhitespace(String path, String value)584     private String normalizeWhitespace(String path, String value) {
585         // turn all whitespace sequences (including tab and newline, and NBSP for certain paths)
586         // into a single space or a single NBSP depending on path.
587         if ((path.contains("/dateFormatLength") && path.contains("/pattern")) ||
588             path.contains("/availableFormats/dateFormatItem") ||
589             (path.startsWith("//ldml/dates/timeZoneNames/metazone") && path.contains("/long")) ||
590             path.startsWith("//ldml/dates/timeZoneNames/regionFormat") ||
591             path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern") ||
592             path.startsWith("//ldml/localeDisplayNames/languages/language") ||
593             path.startsWith("//ldml/localeDisplayNames/territories/territory") ||
594             path.startsWith("//ldml/localeDisplayNames/types/type") ||
595             (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("/displayName")) ||
596             (path.contains("/decimalFormatLength[@type=\"long\"]") && path.contains("/pattern")) ||
597             path.startsWith("//ldml/posix/messages") ||
598             (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) {
599             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
600         } else if ((path.contains("/currencies/currency") && (path.contains("/group") || path.contains("/pattern")))
601             ||
602             (path.contains("/currencyFormatLength") && path.contains("/pattern")) ||
603             (path.contains("/currencySpacing") && path.contains("/insertBetween")) ||
604             (path.contains("/decimalFormatLength") && path.contains("/pattern")) || // i.e. the non-long ones
605             (path.contains("/percentFormatLength") && path.contains("/pattern")) ||
606             (path.startsWith("//ldml/numbers/symbols") && (path.contains("/group") || path.contains("/nan")))) {
607             value = WHITESPACE_AND_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0"); // replace with NBSP
608         } else {
609             // in this case don't normalize away NBSP
610             value = WHITESPACE_NO_NBSP_TO_NORMALIZE.matcher(value).replaceAll(" "); // replace with regular space
611         }
612         return value;
613     }
614 
normalizeCurrencyDisplayName(String value)615     private String normalizeCurrencyDisplayName(String value) {
616         StringBuilder result = new StringBuilder();
617         boolean inParentheses = false;
618         for (int i = 0; i < value.length(); i++) {
619             char c = value.charAt(i);
620             if (c == '(') {
621                 inParentheses = true;
622             } else if (c == ')') {
623                 inParentheses = false;
624             }
625             if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) {
626                 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */
627             }
628             result.append(c);
629         }
630         return result.toString();
631     }
632 
normalizeApostrophes(String value)633     private String normalizeApostrophes(String value) {
634         // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set in it to see.
635         // But since we don't, we just maintain the list internally and use it.
636         if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) {
637             return value.replace('\'', '\u02bc');
638         } else {
639             char prev = 0;
640             StringBuilder builder = new StringBuilder();
641             for (char c : value.toCharArray()) {
642                 if (c == '\'') {
643                     if (Character.isLetter(prev)) {
644                         builder.append('\u2019');
645                     } else {
646                         builder.append('\u2018');
647                     }
648                 } else {
649                     builder.append(c);
650                 }
651                 prev = c;
652             }
653             return builder.toString();
654         }
655     }
656 
normalizeIntervalHyphens(String value)657     private String normalizeIntervalHyphens(String value) {
658         DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
659         fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
660         List<Object> items = fp.getItems();
661         Object last = items.get(items.size() - 1);
662         if (last instanceof String) {
663             String separator = last.toString();
664             if (separator.contains("-")) {
665                 StringBuilder sb = new StringBuilder();
666                 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
667                 if (sb.lastIndexOf(separator) >= 0) {
668                     sb.delete(sb.lastIndexOf(separator), sb.length());
669                     sb.append(separator.replace("-", "\u2013"));
670                     sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart());
671                     return sb.toString();
672                 }
673             }
674         }
675         return value;
676     }
677 
normalizeHyphens(String value)678     private String normalizeHyphens(String value) {
679         int hyphenLocation = value.indexOf("-");
680         if (hyphenLocation > 0 &&
681             Character.isDigit(value.charAt(hyphenLocation - 1)) &&
682             hyphenLocation < value.length() - 1 &&
683             Character.isDigit(value.charAt(hyphenLocation + 1))) {
684             StringBuilder sb = new StringBuilder();
685             sb.append(value.substring(0, hyphenLocation));
686             sb.append("\u2013");
687             sb.append(value.substring(hyphenLocation + 1));
688             return sb.toString();
689         }
690         return value;
691     }
692 
standardizeRomanian(String value)693     private String standardizeRomanian(String value) {
694         StringBuilder builder = new StringBuilder();
695         for (char c : value.toCharArray()) {
696             for (char[] pair : ROMANIAN_CONVERSIONS) {
697                 if (c == pair[0]) {
698                     c = pair[1];
699                     break;
700                 }
701             }
702             builder.append(c);
703         }
704         return builder.toString();
705     }
706 
standardizeKwasio(String value)707     private String standardizeKwasio(String value) {
708         StringBuilder builder = new StringBuilder();
709         for (char c : value.toCharArray()) {
710             for (char[] pair : KWASIO_CONVERSIONS) {
711                 if (c == pair[0]) {
712                     c = pair[1];
713                     break;
714                 }
715             }
716             builder.append(c);
717         }
718         return builder.toString();
719     }
720 
721     // Use the myanmar-tools detector.
standardizeMyanmar(String value)722     private String standardizeMyanmar(String value) {
723         if (detector.getZawgyiProbability(value) > 0.90) {
724             return zawgyiUnicodeTransliterator.transform(value);
725         }
726         return value;
727     }
728 
standardizeNgomba(String value)729     private String standardizeNgomba(String value) {
730         StringBuilder builder = new StringBuilder();
731         char[] charArray = value.toCharArray();
732         for (int i = 0; i < charArray.length; i++) {
733             char c = charArray[i];
734             boolean convertedSaltillo = false;
735             for (char[] pair : NGOMBA_CONVERSIONS) {
736                 if (c == pair[0]) {
737                     c = pair[1];
738                     if (c == '\uA78C') {
739                         convertedSaltillo = true;
740                     }
741                     break;
742                 }
743             }
744             if (convertedSaltillo &&
745                 ((i > 0 && i < charArray.length - 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i + 1])) ||
746                     (i > 1 && Character.isUpperCase(charArray[i - 1]) && Character.isUpperCase(charArray[i - 2])))) {
747                 c = '\uA78B'; // UPPER CASE SALTILLO
748             }
749             builder.append(c);
750         }
751         return builder.toString();
752     }
753 
replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)754     private String replaceChars(String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) {
755         if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) {
756             return value;
757         }
758         StringBuilder builder = new StringBuilder();
759         for (char c : value.toCharArray()) {
760             for (char[] pair : charsToReplace) {
761                 if (c == pair[0]) {
762                     c = pair[1];
763                     break;
764                 }
765             }
766             builder.append(c);
767         }
768         return builder.toString();
769     }
770 
standardizeSwissGerman(String value)771     private String standardizeSwissGerman(String value) {
772         return value.replaceAll("\u00DF", "ss");
773     }
774 
standardizeCatalan(String value)775     private String standardizeCatalan(String value) {
776         StringBuilder builder = new StringBuilder();
777         for (char c : value.toCharArray()) {
778             boolean didSubstitute = false;
779             for (char[] triple : CATALAN_CONVERSIONS) {
780                 if (c == triple[0]) {
781                     builder.append(triple[1]);
782                     builder.append(triple[2]);
783                     didSubstitute = true;
784                     break;
785                 }
786             }
787             if (!didSubstitute) {
788                 builder.append(c);
789             }
790         }
791         return builder.toString();
792     }
793 
replace(Pattern pattern, String value, String replacement)794     private String replace(Pattern pattern, String value, String replacement) {
795         String value2 = pattern.matcher(value).replaceAll(replacement);
796         if (DEBUG_DAIP && !value.equals(value2)) {
797             System.out.println("\n" + value + " => " + value2);
798         }
799         return value2;
800     }
801 
802     private static Pattern UNNORMALIZED_MALAYALAM = PatternCache.get(
803         "(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D");
804 
805     private static Map<Character, Character> NORMALIZING_MAP = Builder.with(new HashMap<Character, Character>())
806         .put('\u0D23', '\u0D7A').put('\u0D28', '\u0D7B')
807         .put('\u0D30', '\u0D7C').put('\u0D32', '\u0D7D')
808         .put('\u0D33', '\u0D7E').put('\u0D15', '\u0D7F').get();
809 
810     /**
811      * Normalizes the Malayalam characters in the specified input.
812      *
813      * @param value
814      *            the input to be normalized
815      * @return
816      */
normalizeMalayalam(String value)817     private String normalizeMalayalam(String value) {
818         // Normalize Malayalam characters.
819         Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value);
820         if (matcher.find()) {
821             StringBuffer buffer = new StringBuffer();
822             int start = 0;
823             do {
824                 buffer.append(value.substring(start, matcher.start(0)));
825                 char codePoint = matcher.group(1).charAt(0);
826                 buffer.append(NORMALIZING_MAP.get(codePoint));
827                 start = matcher.end(0);
828             } while (matcher.find());
829             buffer.append(value.substring(start));
830             value = buffer.toString();
831         }
832         return value;
833     }
834 
835     static final Transform<String, String> fixArabicPresentation = Transliterator.getInstance(
836         "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc");
837 
838     /**
839      * Normalizes the Arabic presentation forms characters in the specified input.
840      *
841      * @param value
842      *            the input to be normalized
843      * @return
844      */
replaceArabicPresentationForms(String value)845     private String replaceArabicPresentationForms(String value) {
846         value = fixArabicPresentation.transform(value);
847         return value;
848     }
849 
850     static Pattern ADLAM_MISNASALIZED = PatternCache.get("([����])['’‘]([����������������])");
851     public static String ADLAM_NASALIZATION = "��"; // U+1E94B (Unicode 12.0)
852 
fixAdlamNasalization(String fromString)853     public static String fixAdlamNasalization(String fromString) {
854         return ADLAM_MISNASALIZED.matcher(fromString)
855         .replaceAll("$1"+ADLAM_NASALIZATION+"$2");  // replace quote with ��
856     }
857 
858     static Pattern REMOVE_QUOTE1 = PatternCache.get("(\\s)(\\\\[-\\}\\]\\&])()");
859     static Pattern REMOVE_QUOTE2 = PatternCache.get("(\\\\[\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
860 
861     static Pattern NEEDS_QUOTE1 = PatternCache.get("(\\s|$)([-\\}\\]\\&])()");
862     static Pattern NEEDS_QUOTE2 = PatternCache.get("([^\\\\])([\\-\\{\\[\\&])(\\s)"); // ([^\\])([\\-\\{\\[])(\\s)
863 
getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter, ExemplarType exemplarType)864     public static String getCleanedUnicodeSet(UnicodeSet exemplar, UnicodeSetPrettyPrinter prettyPrinter,
865         ExemplarType exemplarType) {
866         if (prettyPrinter == null) {
867             return exemplar.toPattern(false);
868         }
869         String value;
870         prettyPrinter.setCompressRanges(exemplar.size() > 300);
871         value = exemplar.toPattern(false);
872         UnicodeSet toAdd = new UnicodeSet();
873 
874         for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next();) {
875             String string = usi.getString();
876             if (string.equals("ß") || string.equals("İ")) {
877                 toAdd.add(string);
878                 continue;
879             }
880             switch (string) {
881             case "\u2011": toAdd.add("-"); break; // nobreak hyphen
882             case "-": toAdd.add("\u2011"); break; // nobreak hyphen
883 
884             case " ": toAdd.add("\u00a0"); break; // nobreak space
885             case "\u00a0": toAdd.add(" "); break; // nobreak space
886 
887             case "\u202F": toAdd.add("\u2009"); break; // nobreak narrow space
888             case "\u2009": toAdd.add("\u202F"); break; // nobreak narrow space
889             }
890             if (exemplarType.convertUppercase) {
891                 string = UCharacter.toLowerCase(ULocale.ENGLISH, string);
892             }
893             toAdd.add(string);
894             String composed = Normalizer.compose(string, false);
895             if (!string.equals(composed)) {
896                 toAdd.add(composed);
897             }
898         }
899 
900         toAdd.removeAll(exemplarType.toRemove);
901 
902         if (DEBUG_DAIP && !toAdd.equals(exemplar)) {
903             UnicodeSet oldOnly = new UnicodeSet(exemplar).removeAll(toAdd);
904             UnicodeSet newOnly = new UnicodeSet(toAdd).removeAll(exemplar);
905             System.out.println("Exemplar:\t" + exemplarType + ",\tremoved\t" + oldOnly + ",\tadded\t" + newOnly);
906         }
907 
908         String fixedExemplar = prettyPrinter.format(toAdd);
909         UnicodeSet doubleCheck = new UnicodeSet(fixedExemplar);
910         if (!toAdd.equals(doubleCheck)) {
911             // something went wrong, leave as is
912         } else if (!value.equals(fixedExemplar)) { // put in this condition just for debugging
913             if (DEBUG_DAIP) {
914                 System.out.println(TestMetadata.showDifference(
915                     With.codePoints(value),
916                     With.codePoints(fixedExemplar),
917                     "\n"));
918             }
919             value = fixedExemplar;
920         }
921         return value;
922     }
923 
924     /**
925      * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is set for en_US_POSIX.
926      */
927     static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();
928 
getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)929     public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) {
930         // TODO fix later to properly handle quoted ;
931 
932         DecimalFormat df = new DecimalFormat(inpattern);
933         if (type == NumericType.DECIMAL_ABBREVIATED || type == NumericType.CURRENCY_ABBREVIATED
934             || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) {
935             return inpattern; // TODO fix when ICU bug is fixed
936             // df.setMaximumFractionDigits(df.getMinimumFractionDigits());
937             // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits()));
938         } else {
939             // int decimals = type == CURRENCY_TYPE ? 2 : 1;
940             int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount;
941             df.setMinimumIntegerDigits(digits[0]);
942             df.setMinimumFractionDigits(digits[1]);
943             df.setMaximumFractionDigits(digits[2]);
944         }
945         String pattern = df.toPattern();
946         List<String> parts = SEMI_SPLITTER.splitToList(pattern);
947         String pattern2 = parts.get(0);
948         if (parts.size() > 1) {
949             pattern2 += ";" + parts.get(1);
950         }
951         if (!pattern2.equals(pattern)) {
952             pattern = pattern2;
953         }
954         // int pos = pattern.indexOf(';');
955         // if (pos < 0) return pattern + ";-" + pattern;
956         return pattern;
957     }
958 
959     /*
960      * This tests what type a numeric pattern is.
961      */
962     public enum NumericType {
963         CURRENCY(new int[] { 1, 2, 2 }, new int[] { 1, 2, 2 }), CURRENCY_ABBREVIATED(), DECIMAL(new int[] { 1, 0, 3 },
964             new int[] { 1, 0, 6 }), DECIMAL_ABBREVIATED(), PERCENT(new int[] { 1, 0, 0 },
965                 new int[] { 1, 0, 0 }), SCIENTIFIC(new int[] { 0, 0, 0 }, new int[] { 1, 6, 6 }), NOT_NUMERIC;
966 
967         private static final Pattern NUMBER_PATH = Pattern
968             .compile("//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");
969         private int[] digitCount;
970         private int[] posixDigitCount;
971 
NumericType()972         private NumericType() {
973         }
974 
NumericType(int[] digitCount, int[] posixDigitCount)975         private NumericType(int[] digitCount, int[] posixDigitCount) {
976             this.digitCount = digitCount;
977             this.posixDigitCount = posixDigitCount;
978         }
979 
980         /**
981          * @return the numeric type of the xpath
982          */
getNumericType(String xpath)983         public static NumericType getNumericType(String xpath) {
984             Matcher matcher = NUMBER_PATH.matcher(xpath);
985             if (xpath.indexOf("/pattern") < 0) {
986                 return NOT_NUMERIC;
987             } else if (matcher.matches()) {
988                 if (matcher.group(1).equals("currencies/currency")) {
989                     return CURRENCY;
990                 } else {
991                     NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase());
992                     if (xpath.contains("=\"1000")) {
993                         if (type == DECIMAL) {
994                             type = DECIMAL_ABBREVIATED;
995                         } else if (type == CURRENCY) {
996                             type = CURRENCY_ABBREVIATED;
997                         } else {
998                             throw new IllegalArgumentException("Internal Error");
999                         }
1000                     }
1001                     return type;
1002                 }
1003             } else {
1004                 return NOT_NUMERIC;
1005             }
1006         }
1007 
getDigitCount()1008         public int[] getDigitCount() {
1009             return digitCount;
1010         }
1011 
getPosixDigitCount()1012         public int[] getPosixDigitCount() {
1013             return posixDigitCount;
1014         }
1015     }
1016 }
1017