1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.util.HashSet;
7 import java.util.LinkedHashMap;
8 import java.util.List;
9 import java.util.Map;
10 import java.util.Set;
11 import java.util.TreeMap;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 
16 import org.unicode.cldr.draft.FileUtilities;
17 import org.unicode.cldr.test.DisplayAndInputProcessor;
18 import org.unicode.cldr.tool.Option.Options;
19 import org.unicode.cldr.tool.Option.Params;
20 import org.unicode.cldr.util.Annotations;
21 import org.unicode.cldr.util.Annotations.AnnotationSet;
22 import org.unicode.cldr.util.CLDRConfig;
23 import org.unicode.cldr.util.CLDRFile;
24 import org.unicode.cldr.util.CLDRPaths;
25 import org.unicode.cldr.util.CldrUtility;
26 import org.unicode.cldr.util.Emoji;
27 import org.unicode.cldr.util.Factory;
28 import org.unicode.cldr.util.Level;
29 import org.unicode.cldr.util.Organization;
30 import org.unicode.cldr.util.SimpleXMLSource;
31 import org.unicode.cldr.util.XPathParts.Comments.CommentType;
32 
33 import com.google.common.base.Joiner;
34 import com.google.common.base.Splitter;
35 import com.google.common.collect.ImmutableSortedSet;
36 import com.ibm.icu.impl.Utility;
37 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
38 import com.ibm.icu.text.UnicodeSet;
39 
40 public class GenerateDerivedAnnotations {
41     // Use EmojiData.getDerivableNames() to update this for each version of Unicode.
42 
43     private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
44 
45     static final UnicodeSet SKIP = new UnicodeSet()
46         .add(Annotations.ENGLISH_MARKER)
47         .add(Annotations.BAD_MARKER)
48         .add(Annotations.MISSING_MARKER)
49         .freeze();
50 
51     static Map<String,String> codepointToIsoCurrencyCode;
52     static {
53         final Splitter tabSplitter = Splitter.on('\t').trimResults();
54         Map<String,String> _codepointToIsoCurrencyCode = new TreeMap<>();
55         for (String line : FileUtilities.in(CldrUtility.class, "data/codepointToIsoCurrencyCode.tsv")) {
56             if (line.startsWith("#")) {
57                 continue;
58             }
59             List<String> parts = tabSplitter.splitToList(line);
60             _codepointToIsoCurrencyCode.put(parts.get(0), parts.get(1));
61         }
62         codepointToIsoCurrencyCode = ImmutableMap.copyOf(_codepointToIsoCurrencyCode);
63     }
64 
65     private enum MyOptions {
66         fileFilter(new Params().setHelp("filter files by dir/locale, eg: ^main/en$ or .*/en").setMatch(".*").setDefault(".*")),
67         missing(new Params().setHelp("only missing").setMatch("")),
68         ;
69 
70         // BOILERPLATE TO COPY
71         final Option option;
72 
MyOptions(Params params)73         private MyOptions(Params params) {
74             option = new Option(this, params);
75         }
76 
77         private static Options myOptions = new Options();
78         static {
79             for (MyOptions option : MyOptions.values()) {
myOptions.add(option, option.option)80                 myOptions.add(option, option.option);
81             }
82         }
83 
parse(String[] args)84         private static Set<String> parse(String[] args) {
85             return myOptions.parse(MyOptions.values()[0], args, true);
86         }
87     }
88 
main(String[] args)89     public static void main(String[] args) throws IOException {
90         MyOptions.parse(args);
91 
92         boolean missingOnly = MyOptions.missing.option.doesOccur();
93         if (missingOnly) {
94             System.out.println("With the 'missing' argument files will not be written, only the missing items will be written to the console");
95         }
96 
97         Matcher localeMatcher = Pattern.compile(MyOptions.fileFilter.option.getValue()).matcher("");
98         Joiner BAR = Joiner.on(" | ");
99         AnnotationSet enAnnotations = Annotations.getDataSet("en");
100         CLDRFile english = CLDR_CONFIG.getEnglish();
101 
102         UnicodeSet derivables = new UnicodeSet(Emoji.getAllRgiNoES())
103             .addAll(codepointToIsoCurrencyCode.keySet())
104             .removeAll(enAnnotations.keySet())
105             .freeze();
106 
107         for (String d : derivables) {
108             if (d.contains("����")) {
109                 System.out.println(d + "\t" + Utility.hex(d));
110             }
111         }
112 
113         Map<String, UnicodeSet> localeToFailures = new LinkedHashMap<>();
114         Set<String> locales = ImmutableSortedSet.copyOf(Annotations.getAvailable());
115         final Factory cldrFactory = CLDRConfig.getInstance().getCldrFactory();
116 
117         for (String locale : locales) {
118             if ("root".equals(locale)) {
119                 continue;
120             }
121             if (!localeMatcher.reset(locale).matches()) {
122                 continue;
123             }
124             UnicodeSet failures = new UnicodeSet(Emoji.getAllRgiNoES());
125             localeToFailures.put(locale, failures);
126 
127             AnnotationSet annotations;
128             try {
129                 annotations = Annotations.getDataSet(locale);
130                 failures.removeAll(annotations.getExplicitValues());
131             } catch (Exception e) {
132                 System.out.println("Can't create annotations for: " + locale + "\n\t" + e.getMessage());
133                 annotations = Annotations.getDataSet(locale);
134                 continue;
135             }
136             CLDRFile target = new CLDRFile(new SimpleXMLSource(locale));
137             CLDRFile main = null;
138             DisplayAndInputProcessor DAIP = new DisplayAndInputProcessor(target);
139             Exception[] internalException = new Exception[1];
140 
141             target.addComment("//ldml", "Derived short names and annotations, using GenerateDerivedAnnotations.java. See warnings in /annotations/ file.",
142                 CommentType.PREBLOCK);
143             for (String derivable : derivables) {
144                 String shortName = null;
145                 try {
146                     shortName = annotations.getShortName(derivable);
147                 } catch (Exception e) {
148                 }
149 
150                 if (shortName == null) {
151                     String currencyCode = codepointToIsoCurrencyCode.get(derivable);
152                     if (currencyCode != null) {
153                         if (main == null) {
154                             main = cldrFactory.make(locale, true);
155                         }
156                         shortName = main.getName(CLDRFile.CURRENCY_NAME, currencyCode);
157                         if (shortName.contentEquals(currencyCode)) {
158                             shortName = null; // don't want fallback raw code
159                         }
160                     }
161                 }
162 
163                 if (shortName == null || SKIP.containsSome(shortName)) {
164                     continue; // missing
165                 }
166                 Set<String> keywords = annotations.getKeywordsMinus(derivable);
167                 String path = "//ldml/annotations/annotation[@cp=\"" + derivable + "\"]";
168                 if (!keywords.isEmpty()) {
169                     Set<String> keywordsFixed = new HashSet<>();
170                     for (String keyword : keywords) {
171                         if (!SKIP.containsSome(keyword)) {
172                             keywordsFixed.add(keyword);
173                         }
174                     }
175                     if (!keywordsFixed.isEmpty()) {
176                         String value = BAR.join(keywordsFixed);
177                         String newValue = DAIP.processInput(path, value, internalException);
178                         target.add(path, newValue);
179                     }
180                 }
181                 failures.remove(derivable);
182                 String ttsPath = path + "[@type=\"tts\"]";
183                 String shortName2 = DAIP.processInput(path, shortName, internalException);
184                 target.add(ttsPath, shortName2);
185             }
186             failures.freeze();
187             if (!failures.isEmpty()) {
188                 Level level = CLDR_CONFIG.getStandardCodes().getLocaleCoverageLevel(Organization.cldr, locale);
189                 System.out.println("Failures\t" + locale
190                     + "\t" + level
191                     + "\t" + english.getName(locale)
192                     + "\t" + failures.size()
193                     + "\t" + failures.toPattern(false));
194             }
195             if (missingOnly) {
196                 continue;
197             }
198             try (PrintWriter pw = FileUtilities.openUTF8Writer(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml")) {
199                 target.write(pw);
200             }
201         }
202         Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", ".*");
203         for (String locale : locales) {
204             if ("root".equals(locale)) {
205                 continue;
206             }
207             if (!localeMatcher.reset(locale).matches()) {
208                 continue;
209             }
210             CLDRFile cldrFileUnresolved = factory.make(locale, false);
211             CLDRFile cldrFileResolved = factory.make(locale, true);
212             Set<String> toRemove = new TreeSet<>(); // TreeSet just makes debugging easier
213             boolean gotOne = false;
214             for (String xpath : cldrFileUnresolved) {
215                 if (xpath.startsWith("//ldml/identity")) {
216                     continue;
217                 }
218 
219                 String value = cldrFileUnresolved.getStringValue(xpath);
220 
221                 // remove items that are the same as their bailey values. This also catches Inheritance Marker
222 
223                 String bailey = cldrFileResolved.getConstructedBaileyValue(xpath, null, null);
224                 if (value.equals(bailey)) {
225                     toRemove.add(xpath);
226                     continue;
227                 }
228                 gotOne = true;
229             }
230             if (!gotOne) {
231                 if (locale.equals("sr_Cyrl")) {
232                     System.err.println("TODO: keep from deleting files with non-empty children");
233                 } else {
234                     System.out.println("Removing empty " + locale);
235                     new File(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml").deleteOnExit();
236                 }
237             } else if (!toRemove.isEmpty()) {
238                 System.out.println("Removing " + toRemove.size() + " items from " + locale);
239                 CLDRFile fileToWrite = cldrFileUnresolved.cloneAsThawed();
240                 fileToWrite.removeAll(toRemove, false);
241                 File file = new File(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml");
242                 try (PrintWriter pw = new PrintWriter(file)) {
243                     fileToWrite.write(pw);
244                 }
245             }
246         }
247         System.out.println("Be sure to run CLDRModify passes afterwards, and generate transformed locales (like de-CH).");
248     }
249 
250 }
251