package org.unicode.cldr.tool;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;

import org.unicode.cldr.tool.FormattedFileWriter.Anchors;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.DraftStatus;
import org.unicode.cldr.util.CLDRFile.NumberingSystem;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.FileCopier;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.XMLFileReader;
import org.unicode.cldr.util.XPathParts;

import com.google.common.base.Splitter;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;

/**
 * Generates the collation charts: an index page plus one sub-chart per
 * collation XML file found under {@code CLDRPaths.COMMON_DIRECTORY + "collation/"}.
 *
 * <p>For each file, the collation paths are read, a {@link RuleBasedCollator} is
 * built per collation type from its rules, and settings/imports are collected as
 * plain text. Each sub-chart then lists the main exemplar characters together
 * with the tailored characters, sorted by the tailored collator.
 */
public class ChartCollation extends Chart {

    /** CSS class for characters shown for context only (not tailored). */
    static final String NOT_TAILORED = "notTailored";
    /** CSS class for tailored characters that are in none of the exemplar sets. */
    static final String NOT_EXEMPLARS = "notExemplars";

    private static final String KNOWN_PROBLEMS = "<p>Known issues:</p>"
        + "<ul>" + LS
        + "<li>The ordering is illustrated with a basic list:"
        + "<ol>" + LS
        + "<li>it doesn't show the strength differences</li>" + LS
        + "<li>it does not yet take the settings or imports into account, so those are listed separately</li>" + LS
        + "<li>consult the XML file for the exact details</li>" + LS
        + "</ol>" + LS
        + "<li>The characters used in the illustration are:" + LS
        + "<ol>" + LS
        + "<li>those <span class='" + NOT_TAILORED + "'>not tailored</span> (added from standard exemplars for context)</li>" + LS
        + "<li>those <span class='" + NOT_EXEMPLARS + "'>tailored</span>, but not in any exemplars (standard, aux, punctuation)</li>" + LS
        + "<li>those both tailored and in exemplars</li>" + LS
        + "</ol>" + LS
        + "<li>The tailored characters may include:" + LS
        + "<ol>" + LS
        + "<li>some longer strings (contractions) from the rules</li>" + LS
        + "<li>generated Unicode characters (for <i>canonical closure</i>)</li>" + LS
        + "</ol>" + LS
        + "</li>" + LS
        + "</ul>" + LS;

    private static final Factory CLDR_FACTORY = CLDRConfig.getInstance().getCldrFactory();
    private static final boolean DEBUG = false;
    private static final String DIR = CLDRPaths.CHART_DIRECTORY + "collation/";

    /** Prefix of every collation type that is treated as private. */
    private static final String PRIVATE_TYPE_PREFIX = "private-";

    /**
     * Prepended to the map key of private collation types so that they sort
     * after all other types in the (sorted) per-locale data map; stripped again
     * before the type is displayed.
     */
    private static final String HIGH_COLLATION_PRIMARY = "\uFFFF";

    /**
     * Command-line entry point: writes the index chart, which in turn writes
     * all sub-charts.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new ChartCollation().writeChart(null);
    }

    @Override
    public String getDirectory() {
        return DIR;
    }

    @Override
    public String getTitle() {
        return "Collation Charts";
    }

    @Override
    public String getFileName() {
        return "index";
    }

    @Override
    public String getExplanation() {
        return "<p>This is a <i>preliminary</i> set of charts for CLDR collation tailorings. "
            + "Collation tailorings provide language or locale-specific modifications of the standard Unicode CLDR collation order, "
            + "which is based on <a target='_blank' href='http://unicode.org/charts/collation/'>Unicode default collation charts</a>. "
            + "Locales that just use the standard CLDR order are not listed. "
            + "For more information, see the "
            + "<a target='_blank' href='http://unicode.org/reports/tr35/tr35-collation.html'>LDML Collation spec</a>. "
            + "The complete data for these charts is in "
            + "<a target='_blank' href='" + ToolConstants.CHART_SOURCE + "common/collation/'>collation/</a>.</p>" + LS;
    }

    /**
     * Writes the index page: ensures the output directory exists, copies the
     * stylesheet, generates every sub-chart and then emits the collected anchors.
     *
     * @param pw writer for the index page
     * @throws IOException if a sub-chart or the index cannot be written
     */
    @Override
    public void writeContents(FormattedFileWriter pw) throws IOException {
        FileCopier.ensureDirectoryExists(DIR);
        FileCopier.copy(Chart.class, "index.css", DIR);

        FormattedFileWriter.Anchors anchors = new FormattedFileWriter.Anchors();
        writeSubcharts(anchors);
        pw.setIndex("Main Chart Index", "../index.html");
        pw.write(anchors.toString());
    }

    /** Per collation type: the tailored collator (may be null) and its settings/imports. */
    static class Data {
        /** Collator built from the type's rules; null when no rules were read. */
        RuleBasedCollator collator;
        /** Settings and imports, rendered as {@code leaf:attr;attr...}, in encounter order. */
        final Set<String> settings = new LinkedHashSet<>();
    }

    /**
     * Reads every {@code *.xml} file in the common collation directory and writes
     * one {@link Subchart} per locale that is also available in the main factory
     * and has any collation data.
     *
     * @param anchors passed through to each sub-chart's {@code writeChart}
     * @throws IOException if the collation directory cannot be listed, or a
     *     sub-chart cannot be written
     */
    public void writeSubcharts(Anchors anchors) throws IOException {
        Matcher settingsMatcher = PatternCache.get(
            "//ldml/collations/collation"
                + "\\[@type=\"([^\"]+)\"]"
                + "(.*)?"
                + "/(settings|import|cr)"
                + "(.*)")
            .matcher("");
        Splitter settingSplitter = Splitter.onPattern("[\\[\\]@]").omitEmptyStrings().trimResults();
        String collationDir = CLDRPaths.COMMON_DIRECTORY + "collation/";
        Transliterator fromUnicode = Transliterator.getInstance("Hex-Any");
        List<Pair<String, String>> pathValueList = new ArrayList<>();
        Set<String> mainAvailable = new HashSet<>(CLDR_FACTORY.getAvailable());

        // File.list() returns null (not an empty array) when the path is not a
        // readable directory; fail with a clear message instead of an NPE.
        String[] xmlNames = new File(collationDir).list();
        if (xmlNames == null) {
            throw new IOException("Cannot list collation directory: " + collationDir);
        }
        // File.list() makes no ordering guarantee; sort for reproducible output.
        Arrays.sort(xmlNames);

        for (String xmlName : xmlNames) {
            if (!xmlName.endsWith(".xml")) {
                continue;
            }
            String locale = xmlName.substring(0, xmlName.length() - 4);
            if (!mainAvailable.contains(locale)) {
                System.out.println("Skipping locale not in main: " + locale);
                continue;
            }

            pathValueList.clear();
            XMLFileReader.loadPathValues(collationDir + xmlName, pathValueList, true);
            Map<String, Data> data = new TreeMap<>();
            XPathParts xpp = new XPathParts();

            for (Pair<String, String> entry : pathValueList) {
                String path = entry.getFirst();
                String value = entry.getSecond();
                if (path.startsWith("//ldml/identity/")) {
                    continue;
                }

                if (path.equals("//ldml/collations/defaultCollation")) {
                    // Recorded as a setting on the type that is named as the default.
                    addCollator(data, value, "defaultCollation", Arrays.asList("true"));
                    continue;
                }

                // Root collator being empty isn't really a failure - just skip it.
                if (xmlName.equals("root.xml") && path.equals("//ldml/collations/collation[@type=\"standard\"]")) {
                    continue;
                }

                xpp.set(path);
                DraftStatus status = DraftStatus.forString(xpp.findFirstAttributeValue("draft"));
                if (status == DraftStatus.unconfirmed) {
                    System.out.println("Skipping " + path + " in: " + xmlName + " due to draft status = " + status);
                    continue;
                }

                if (!settingsMatcher.reset(path).matches()) {
                    System.out.println("Failure in " + xmlName + " with: " + path);
                    continue;
                }
                String type = settingsMatcher.group(1);
                String leaf = settingsMatcher.group(3);
                String values = settingsMatcher.group(4);

                if (leaf.equals("settings") || leaf.equals("import")) {
                    // e.g. //ldml/collations/collation[@type="compat"][@visibility="external"]/settings[@reorder="Arab"]
                    List<String> settings = settingSplitter.splitToList(values);
                    addCollator(data, type, leaf, settings);
                    continue;
                }

                // Remaining leaf is "cr": the value holds the collation rules.
                String rules = value;
                if (!rules.contains("'#⃣'")) {
                    rules = rules.replace("#⃣", "'#⃣'").replace("*⃣", "'*⃣'"); //hack for 8288
                }
                rules = fromUnicode.transform(rules);

                try {
                    RuleBasedCollator col = new RuleBasedCollator(rules);
                    col.setStrength(Collator.IDENTICAL);
                    col.freeze();
                    addCollator(data, type, col);
                } catch (Exception e) {
                    // Best effort: a type whose rules cannot be built is reported and left out.
                    System.out.println("*** Skipping " + locale + ":" + type + ", " + e);
                }
            }
            if (data.isEmpty()) { // remove completely empty
                continue;
            }
            if (!data.containsKey("standard")) {
                // Always show a "standard" row; a null collator renders as the default order.
                addCollator(data, "standard", (RuleBasedCollator) null);
            }
            new Subchart(ENGLISH.getName(locale, true, CLDRFile.SHORT_ALTS), locale, data).writeChart(anchors);
        }
    }

    /**
     * Returns the entry for {@code type}, creating it if absent. Private types
     * are stored under a key prefixed with {@link #HIGH_COLLATION_PRIMARY}.
     */
    private static Data getOrCreateData(Map<String, Data> data, String type) {
        String key = type.startsWith(PRIVATE_TYPE_PREFIX) ? HIGH_COLLATION_PRIMARY + type : type;
        Data dataItem = data.get(key);
        if (dataItem == null) {
            dataItem = new Data();
            data.put(key, dataItem);
        }
        return dataItem;
    }

    /**
     * Records a setting or import for the given collation type as
     * {@code leaf + ":" + settings joined by ";"}.
     */
    private void addCollator(Map<String, Data> data, String type, String leaf, List<String> settings) {
        getOrCreateData(data, type).settings.add(leaf + ":" + CollectionUtilities.join(settings, ";"));
    }

    /**
     * Sets (or replaces) the collator for the given collation type.
     *
     * @param col the collator, or null to register the type without rules
     */
    private void addCollator(Map<String, Data> data, String type, RuleBasedCollator col) {
        getOrCreateData(data, type).collator = col;
    }

    /** Chart for a single locale: one table row per collation type. */
    private class Subchart extends Chart {
        String title;
        String file;
        private Map<String, Data> data;

        @Override
        public boolean getShowDate() {
            return false;
        }

        /**
         * @param title display name of the locale, used in the page title and explanation
         * @param file locale ID; used as the output file name and to load the locale's CLDR file
         * @param data2 collation data keyed by (possibly prefixed) collation type
         */
        public Subchart(String title, String file, Map<String, Data> data2) {
            this.title = title;
            this.file = file;
            this.data = data2;
        }

        @Override
        public String getDirectory() {
            return DIR;
        }

        @Override
        public String getTitle() {
            return title;
        }

        @Override
        public String getFileName() {
            return file;
        }

        @Override
        public String getExplanation() {
            return "<p>This is a <i>preliminary</i> chart for the " + title
                + " collation tailorings. "
                + "The complete data for this chart is found on "
                + "<a target='_blank' href='" + ToolConstants.CHART_SOURCE + "common/collation/" + file + ".xml'>" + file + ".xml</a>.</p>"
                + KNOWN_PROBLEMS;
        }

        /**
         * Writes a two-column table (type, ordering) with one row per collation type.
         *
         * @param pw writer for this sub-chart
         * @throws IOException if writing fails
         */
        @Override
        public void writeContents(FormattedFileWriter pw) throws IOException {
            CLDRFile cldrFile = CLDR_FACTORY.make(file, true);
            UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING).freeze();

            // Union of main, auxiliary, punctuation and numeric exemplars; used only to
            // decide whether a tailored string is flagged as "not in exemplars".
            UnicodeSet allExemplars = new UnicodeSet(exemplars);
            UnicodeSet auxiliaryExemplars = cldrFile.getExemplarSet("auxiliary", WinningChoice.WINNING);
            UnicodeSet punctuationExemplars = cldrFile.getExemplarSet("punctuation", WinningChoice.WINNING);
            allExemplars.addAll(auxiliaryExemplars)
                .addAll(punctuationExemplars);

            for (NumberingSystem system : NumberingSystem.values()) {
                UnicodeSet numericExemplars = cldrFile.getExemplarsNumeric(system);
                if (numericExemplars != null) {
                    allExemplars.addAll(numericExemplars);
                }
            }
            allExemplars.freeze();

            TablePrinter tablePrinter = new TablePrinter()
                .addColumn("Type", "class='source'", null, "class='source'", true)
                .addColumn("Ordering", "class='target'", null, "class='target_nofont'", true);

            for (Entry<String, Data> entry : data.entrySet()) {
                String type = entry.getKey();
                if (type.startsWith(HIGH_COLLATION_PRIMARY)) {
                    // Drop the sort-last marker added for private types.
                    type = type.substring(1);
                }
                tablePrinter
                    .addRow()
                    .addCell(type)
                    .addCell(renderOrdering(entry.getValue(), exemplars, allExemplars));
                tablePrinter.finishRow();
            }
            pw.write(tablePrinter.toTable());
        }

        /**
         * Builds the HTML for the "Ordering" cell of one collation type: the
         * settings (if any), followed by the sorted character list or a
         * placeholder when the type has no collator.
         */
        private String renderOrdering(Data item, UnicodeSet exemplars, UnicodeSet allExemplars) {
            RuleBasedCollator col = item.collator;
            Set<String> settings = item.settings;
            StringBuilder list = new StringBuilder();
            if (!settings.isEmpty()) {
                list.append(CollectionUtilities.join(settings, "<br>"));
                list.append("<br><b><i>plus</i></b><br>");
            }
            if (col == null) {
                list.append("<i>CLDR default character order</i>");
                return list.toString();
            }

            // Sort the main exemplars plus everything the rules tailor, using the tailoring itself.
            UnicodeSet tailored = new UnicodeSet(col.getTailoredSet());
            Set<String> sorted = new TreeSet<>(col);
            exemplars.addAllTo(sorted);
            tailored.addAllTo(sorted);

            boolean first = true;
            for (String s : sorted) {
                if (first) {
                    first = false;
                } else {
                    list.append(' ');
                }
                if (s.startsWith("\uFDD0")) { // special CJK markers
                    // NOTE(review): because a space is appended above for every entry but
                    // the first, this condition can only hold for the first entry (right
                    // after the settings block). Possibly the test was meant to be negated
                    // so that markers start a new line - confirm before changing.
                    int len = list.length();
                    if (len > 4 && list.substring(len - 4, len).equals("<br>")) {
                        list.append("<br>");
                    }
                    continue;
                }
                // NOTE(review): s is written into the HTML cell unescaped; strings such as
                // "<" or "&" would be emitted raw - confirm whether escaping is wanted.
                if (!tailored.contains(s)) {
                    list.append("<span class='" + NOT_TAILORED + "'>").append(s).append("</span>");
                } else if (!allExemplars.containsAll(s) && !file.equals("root")) {
                    list.append("<span class='" + NOT_EXEMPLARS + "'>").append(s).append("</span>");
                } else {
                    list.append(s);
                }
            }
            return list.toString();
        }
    }
}