1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.HashSet;
8 import java.util.LinkedHashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeMap;
14 import java.util.TreeSet;
15 import java.util.regex.Matcher;
16 
17 import org.unicode.cldr.tool.FormattedFileWriter.Anchors;
18 import org.unicode.cldr.util.CLDRConfig;
19 import org.unicode.cldr.util.CLDRFile;
20 import org.unicode.cldr.util.CLDRFile.DraftStatus;
21 import org.unicode.cldr.util.CLDRFile.NumberingSystem;
22 import org.unicode.cldr.util.CLDRFile.WinningChoice;
23 import org.unicode.cldr.util.CLDRPaths;
24 import org.unicode.cldr.util.Factory;
25 import org.unicode.cldr.util.FileCopier;
26 import org.unicode.cldr.util.Pair;
27 import org.unicode.cldr.util.PatternCache;
28 import org.unicode.cldr.util.XMLFileReader;
29 import org.unicode.cldr.util.XPathParts;
30 
31 import com.google.common.base.Splitter;
32 import com.ibm.icu.dev.util.CollectionUtilities;
33 import com.ibm.icu.text.Collator;
34 import com.ibm.icu.text.RuleBasedCollator;
35 import com.ibm.icu.text.Transliterator;
36 import com.ibm.icu.text.UnicodeSet;
37 
38 public class ChartCollation extends Chart {
39 
    /** CSS class put on characters that are listed in a chart but are not in the collator's tailored set. */
    static final String NOT_TAILORED = "notTailored";
    /** CSS class put on tailored characters that are not fully contained in the locale's combined exemplar sets. */
    static final String NOT_EXEMPLARS = "notExemplars";
42 
43     private static final String KNOWN_PROBLEMS = "<p>Known issues:</p>"
44         + "<ul>" + LS
45         + "<li>The ordering is illustrated with a basic list:"
46         + "<ol>" + LS
47         + "<li>it doesn't show the strength differences</li>" + LS
48         + "<li>it does not yet take the settings or imports into account, so those are listed separately</li>" + LS
49         + "<li>consult the XML file for the exact details</li>" + LS
50         + "</ol>" + LS
51         + "<li>The characters used in the illustration are:" + LS
52         + "<ol>" + LS
53         + "<li>those <span class='" + NOT_TAILORED + "'>not tailored</span> (added from standard exemplars for context)</li>" + LS
54         + "<li>those <span class='" + NOT_EXEMPLARS + "'>tailored</span>, but not in any exemplars (standard, aux, punctuation)</li>" + LS
55         + "<li>those both tailored and in exemplars</li>" + LS
56         + "</ol>" + LS
57         + "<li>The tailored characters may include:" + LS
58         + "<ol>" + LS
59         + "<li>some longer strings (contractions) from the rules</li>" + LS
60         + "<li>generated Unicode characters (for <i>canonical closure</i>)</li>" + LS
61         + "</ol>" + LS
62         + "</li>" + LS
63         + "</ul>" + LS;
64 
    /** Factory taken from {@code CLDRConfig.getInstance()}; used to list available locales and to load locale files. */
    private static final Factory CLDR_FACTORY = CLDRConfig.getInstance().getCldrFactory();
    /** Debug switch; not referenced anywhere in the visible code. */
    private static final boolean DEBUG = false;
    /** Output directory for the collation charts: the chart directory followed by "collation/". */
    private static final String DIR = CLDRPaths.CHART_DIRECTORY + "collation/";
68 
69     //static Factory cldrFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "collation/", ".*");
70 
main(String[] args)71     public static void main(String[] args) {
72         new ChartCollation().writeChart(null);
73     }
74 
    /**
     * Returns the directory the index chart is written to.
     *
     * @return {@link #DIR}
     */
    @Override
    public String getDirectory() {
        return DIR;
    }
79 
    /**
     * Returns the title of the index chart.
     *
     * @return the fixed title "Collation Charts"
     */
    @Override
    public String getTitle() {
        return "Collation Charts";
    }
84 
    /**
     * Returns the file name of the index chart.
     *
     * @return the fixed name "index" (no extension is added here)
     */
    @Override
    public String getFileName() {
        return "index";
    }
89 
90     @Override
getExplanation()91     public String getExplanation() {
92         return "<p>This is a <i>preliminary</i> set of charts for CLDR collation tailorings. "
93             + "Collation tailorings provide language or locale-specific modifications of the standard Unicode CLDR collation order, "
94             + "which is based on <a target='_blank' href='http://unicode.org/charts/collation/'>Unicode default collation charts</a>. "
95             + "Locales that just use the standard CLDR order are not listed. "
96             + "For more information, see the "
97             + "<a target='_blank' href='http://unicode.org/reports/tr35/tr35-collation.html'>LDML Collation spec</a>. "
98             + "The complete data for these charts is in "
99             + "<a target='_blank' href='" + ToolConstants.CHART_SOURCE + "common/collation/'>collation/</a>.</p>" + LS;
100     }
101 
writeContents(FormattedFileWriter pw)102     public void writeContents(FormattedFileWriter pw) throws IOException {
103         FileCopier.ensureDirectoryExists(DIR);
104         FileCopier.copy(Chart.class, "index.css", DIR);
105 
106         FormattedFileWriter.Anchors anchors = new FormattedFileWriter.Anchors();
107         writeSubcharts(anchors);
108         pw.setIndex("Main Chart Index", "../index.html");
109         pw.write(anchors.toString());
110     }
111 
    /**
     * Information collected for one collation type of one locale: the collator built from its
     * rules (if any) and the textual settings recorded for it.
     */
    static class Data {
        // Collator for this type; stays null unless one of the addCollator overloads assigns it.
        RuleBasedCollator collator;
        // Recorded entries of the form "leaf:value;value…", kept in insertion order without duplicates.
        Set<String> settings = new LinkedHashSet<>();
    }
116 
writeSubcharts(Anchors anchors)117     public void writeSubcharts(Anchors anchors) throws IOException {
118         Matcher settingsMatcher = PatternCache.get(
119             "//ldml/collations/collation"
120                 + "\\[@type=\"([^\"]+)\"]"
121                 + "(.*)?"
122                 + "/(settings|import|cr)"
123                 + "(.*)")
124             .matcher("");
125         Splitter settingSplitter = Splitter.onPattern("[\\[\\]@]").omitEmptyStrings().trimResults();
126         File baseDir = new File(CLDRPaths.COMMON_DIRECTORY + "collation/");
127         Transliterator fromUnicode = Transliterator.getInstance("Hex-Any");
128         List<Pair<String, String>> pathValueList = new ArrayList<>();
129         HashSet<String> mainAvailable = new HashSet<>(CLDR_FACTORY.getAvailable());
130 //        for (String xmlName : baseDir.list()) {
131 //            if (!xmlName.endsWith(".xml")) {
132 //                continue;
133 //            }
134 //            String locale = xmlName.substring(0,xmlName.length()-4);
135 //        }
136         for (String xmlName : baseDir.list()) {
137             if (!xmlName.endsWith(".xml")) {
138                 continue;
139             }
140             String locale = xmlName.substring(0, xmlName.length() - 4);
141             if (!mainAvailable.contains(locale)) {
142                 System.out.println("Skipping locale not in main: " + locale);
143                 continue;
144             }
145 
146             pathValueList.clear();
147             XMLFileReader.loadPathValues(CLDRPaths.COMMON_DIRECTORY + "collation/" + xmlName, pathValueList, true);
148             Map<String, Data> data = new TreeMap<>();
149             XPathParts xpp = new XPathParts();
150 
151             for (Pair<String, String> entry : pathValueList) {
152                 String path = entry.getFirst();
153                 String value = entry.getSecond();
154                 if (path.startsWith("//ldml/identity/")) {
155                     continue;
156                 }
157 
158                 if (path.equals("//ldml/collations/defaultCollation")) {
159                     addCollator(data, value, "defaultCollation", Arrays.asList("true"));
160                     continue;
161                 }
162 
163                 // Root collator being empty isn't really a failure - just skip it.
164                 if (xmlName.equals("root.xml") && path.equals("//ldml/collations/collation[@type=\"standard\"]")) {
165                     continue;
166                 }
167 
168                 xpp.set(path);
169                 DraftStatus status = DraftStatus.forString(xpp.findFirstAttributeValue("draft"));
170                 if (status == DraftStatus.unconfirmed) {
171                     System.out.println("Skipping " + path + " in: " + xmlName + " due to draft status = " + status.toString());
172                     continue;
173                 }
174 
175                 if (!settingsMatcher.reset(path).matches()) {
176                     System.out.println("Failure in " + xmlName + " with: " + path);
177                     continue;
178                 }
179                 String type = settingsMatcher.group(1);
180 //                if (type.startsWith("private-")) {
181 //                    System.out.println("Skipping private-");
182 //                    continue;
183 //                }
184 
185                 String otherAttributes = settingsMatcher.group(2);
186                 String leaf = settingsMatcher.group(3);
187                 String values = settingsMatcher.group(4);
188 
189                 if (leaf.equals("settings") || leaf.equals("import")) {
190                     //ldml/collations/collation[@type="compat"][@visibility="external"]/settings[@reorder="Arab"]
191                     List<String> settings = settingSplitter.splitToList(values);
192                     addCollator(data, type, leaf, settings);
193                     continue;
194                 }
195                 String rules = value;
196                 if (!rules.contains("'#⃣'")) {
197                     rules = rules.replace("#⃣", "'#⃣'").replace("*⃣", "'*⃣'"); //hack for 8288
198                 }
199                 rules = fromUnicode.transform(rules);
200 
201                 try {
202                     RuleBasedCollator col = new RuleBasedCollator(rules);
203                     col.setStrength(Collator.IDENTICAL);
204                     col.freeze();
205                     addCollator(data, type, col);
206                 } catch (Exception e) {
207                     System.out.println("*** Skipping " + locale + ":" + type + ", " + e);
208                 }
209             }
210             if (data.isEmpty()) { // remove completely empty
211                 continue;
212             }
213             if (!data.containsKey("standard")) {
214                 addCollator(data, "standard", (RuleBasedCollator) null);
215             }
216             new Subchart(ENGLISH.getName(locale, true, CLDRFile.SHORT_ALTS), locale, data).writeChart(anchors);
217         }
218     }
219 
addCollator(Map<String, Data> data, String type, String leaf, List<String> settings)220     private void addCollator(Map<String, Data> data, String type, String leaf, List<String> settings) {
221         if (type.startsWith("private-")) {
222             type = "\uFFFF" + type;
223         }
224         Data dataItem = data.get(type);
225         if (dataItem == null) {
226             data.put(type, dataItem = new Data());
227         }
228         dataItem.settings.add(leaf + ":" + CollectionUtilities.join(settings, ";"));
229     }
230 
addCollator(Map<String, Data> data, String type, RuleBasedCollator col)231     private void addCollator(Map<String, Data> data, String type, RuleBasedCollator col) {
232         if (type.startsWith("private-")) {
233             type = "\uFFFF" + type;
234         }
235         Data dataItem = data.get(type);
236         if (dataItem == null) {
237             data.put(type, dataItem = new Data());
238         }
239         dataItem.collator = col;
240     }
241 
242     //RuleBasedCollator ROOT = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
243 
244     private class Subchart extends Chart {
245         private static final String HIGH_COLLATION_PRIMARY = "\uFFFF";
246         String title;
247         String file;
248         private Map<String, Data> data;
249 
250         @Override
getShowDate()251         public boolean getShowDate() {
252             return false;
253         }
254 
Subchart(String title, String file, Map<String, Data> data2)255         public Subchart(String title, String file, Map<String, Data> data2) {
256             this.title = title;
257             this.file = file;
258             this.data = data2;
259         }
260 
261         @Override
getDirectory()262         public String getDirectory() {
263             return DIR;
264         }
265 
266         @Override
getTitle()267         public String getTitle() {
268             return title;
269         }
270 
271         @Override
getFileName()272         public String getFileName() {
273             return file;
274         }
275 
276         @Override
getExplanation()277         public String getExplanation() {
278             return "<p>This is a <i>preliminary</i> chart for the " + title
279                 + " collation tailorings. "
280                 + "The complete data for this chart is found on "
281                 + "<a target='_blank' href='" + ToolConstants.CHART_SOURCE + "common/collation/" + file + ".xml'>" + file + ".xml</a>.</p>"
282                 + KNOWN_PROBLEMS;
283         }
284 
285         @Override
writeContents(FormattedFileWriter pw)286         public void writeContents(FormattedFileWriter pw) throws IOException {
287 
288             CLDRFile cldrFile = CLDR_FACTORY.make(file, true);
289             UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING).freeze();
290 
291             UnicodeSet exemplars_all = new UnicodeSet(exemplars);
292             UnicodeSet exemplars_auxiliary = cldrFile.getExemplarSet("auxiliary", WinningChoice.WINNING);
293             UnicodeSet exemplars_punctuation = cldrFile.getExemplarSet("punctuation", WinningChoice.WINNING);
294             exemplars_all.addAll(exemplars_auxiliary)
295                 .addAll(exemplars_punctuation);
296 
297             for (NumberingSystem system : NumberingSystem.values()) {
298                 UnicodeSet exemplars_numeric = cldrFile.getExemplarsNumeric(system);
299                 if (exemplars_numeric != null) {
300                     exemplars_all.addAll(exemplars_numeric);
301                     //System.out.println(file + "\t" + system + "\t" + exemplars_numeric.toPattern(false));
302                 }
303             }
304             exemplars_all.freeze();
305 
306             TablePrinter tablePrinter = new TablePrinter()
307                 .addColumn("Type", "class='source'", null, "class='source'", true)
308                 .addColumn("Ordering", "class='target'", null, "class='target_nofont'", true);
309 
310             for (Entry<String, Data> entry : data.entrySet()) {
311                 // sort the characters
312                 String type = entry.getKey();
313                 if (type.startsWith(HIGH_COLLATION_PRIMARY)) {
314                     type = type.substring(1);
315                 }
316                 RuleBasedCollator col = entry.getValue().collator;
317                 Set<String> settings = entry.getValue().settings;
318                 StringBuilder list = new StringBuilder();
319                 if (!settings.isEmpty()) {
320                     list.append(CollectionUtilities.join(settings, "<br>"));
321                     list.append("<br><b><i>plus</i></b><br>");
322                 }
323                 if (col == null) {
324                     list.append("<i>CLDR default character order</i>");
325                 } else {
326                     UnicodeSet tailored = new UnicodeSet(col.getTailoredSet());
327                     Set<String> sorted = new TreeSet<>(col);
328                     exemplars.addAllTo(sorted);
329                     tailored.addAllTo(sorted);
330                     boolean first = true;
331                     for (String s : sorted) {
332 //                        if (--maxCount < 0) {
333 //                            list.append(" …");
334 //                            break;
335 //                        }
336                         if (first) {
337                             first = false;
338                         } else {
339                             list.append(' ');
340                         }
341                         if (s.startsWith("\uFDD0")) { // special CJK markers
342                             int len = list.length();
343                             if (len > 4 && list.substring(len - 4, len).equals("<br>")) {
344                                 list.append("<br>");
345                             }
346                             continue;
347                         }
348                         if (!tailored.contains(s)) {
349                             list.append("<span class='" + NOT_TAILORED + "'>").append(s).append("</span>");
350                         } else if (!exemplars_all.containsAll(s) && !file.equals("root")) {
351                             list.append("<span class='" + NOT_EXEMPLARS + "'>").append(s).append("</span>");
352                         } else {
353                             list.append(s);
354                         }
355                     }
356                 }
357                 tablePrinter
358                     .addRow()
359                     .addCell(type)
360                     .addCell(list.toString());
361                 tablePrinter.finishRow();
362             }
363             pw.write(tablePrinter.toTable());
364         }
365     }
366 }
367