package org.unicode.cldr.tool;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.tool.FormattedFileWriter.Anchors;
import org.unicode.cldr.util.Annotations;
import org.unicode.cldr.util.Annotations.AnnotationSet;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.FileCopier;
import org.unicode.cldr.util.LanguageGroup;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.LocaleIDParser;

import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

/**
 * Generates the annotation charts: an index page (this class) plus one {@link Subchart}
 * per language group, each comparing the annotations of the locales in that group
 * against English.
 */
public class ChartAnnotations extends Chart {

    private static final String LDML_ANNOTATIONS = "<a href='http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-general.html#Annotations'>LDML Annotations</a>";

    // NOTE(review): the "latest-release" and "beta" links below point at the same URL — confirm
    // whether the beta link should target a different location.
    private static final String MAIN_HEADER = "<p>Annotations provide names and keywords for Unicode characters, currently focusing on emoji. "
        + "If you see any problems, please <a target='_blank' href='http://unicode.org/cldr/trac/newticket'>file a ticket</a> with the corrected values for the locale. "
        + "For the XML data used for these charts, see "
        + "<a href='http://unicode.org/repos/cldr/tags/latest/common/annotations/'>latest-release annotations </a> "
        + "or <a href='http://unicode.org/repos/cldr/tags/latest/common/annotations/'>beta annotations</a>. "
        + "For more information, see " + LDML_ANNOTATIONS + ".</p>";
    private static final boolean DEBUG = false;
    private static final String DIR = CLDRPaths.CHART_DIRECTORY + "annotations/";

    /**
     * Command-line entry point: writes the index chart (and, through it, every sub-chart).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new ChartAnnotations().writeChart(null);
    }

    @Override
    public String getDirectory() {
        return DIR;
    }

    @Override
    public String getTitle() {
        return "Annotation Charts";
    }

    @Override
    public String getFileName() {
        return "index";
    }

    @Override
    public String getExplanation() {
        // The paragraph is now closed with </p>; it previously ended with a second <p>.
        return MAIN_HEADER + "<p>The charts are presented in groups of related languages, for easier comparison.</p>";
    }

    /**
     * Writes the index page: copies the stylesheet into the chart directory, generates all
     * sub-charts, and then emits the anchors collected while doing so.
     *
     * @param pw writer for the index page
     * @throws IOException if writing a chart fails
     */
    @Override
    public void writeContents(FormattedFileWriter pw) throws IOException {
        FileCopier.ensureDirectoryExists(DIR);
        FileCopier.copy(Chart.class, "index.css", DIR);

        FormattedFileWriter.Anchors anchors = new FormattedFileWriter.Anchors();
        writeSubcharts(anchors);
        pw.setIndex("Main Chart Index", "../index.html");
        pw.write(anchors.toString());
    }

    /**
     * Extra strings that get a chart row in addition to the keys of the English annotation set.
     */
    // NOTE(review): several entries below are empty strings in this copy of the file. They look
    // like emoji whose characters were lost somewhere upstream — TODO restore them from the
    // canonical source before relying on this set.
    static final UnicodeSet EXTRAS = new UnicodeSet()
        .addAll(Arrays.asList(
            "", "", "#️⃣", "", "❤️", "❤️", "", "⚕️", "♂️", "♀️", "❤️", "♀️",
            "", "❤️", "", "❤️", "", "",
            "", "", "⚖", "⚖", "⚖", "⚖", "", "♂️", "♂️", "♀️", "♀️",
            "", "", "♂️", "♂️", "♀️", "♀️",
            "",
            "#️⃣",
            "",
            "⛹️♀️",
            "⚕️",
            "️","☠️",
            "",
            "",
            "",""
            ))
        .freeze();

    /**
     * Writes one sub-chart per {@link LanguageGroup}.
     *
     * <p>Each chart has one row per character (the English annotation keys plus {@link #EXTRAS},
     * ordered by {@link #RBC}) and one column per locale in the group, with English always
     * first. Locales that carry a region subtag do not get their own column; their values are
     * appended to the cell of their language-script locale when they differ from it.
     *
     * @param anchors collector passed to each sub-chart's {@code writeChart}
     * @throws IOException if writing a sub-chart fails
     */
    public void writeSubcharts(Anchors anchors) throws IOException {
        Set<String> locales = Annotations.getAvailableLocales();

        AnnotationSet english = Annotations.getDataSet("en");
        UnicodeSet s = new UnicodeSet(english.keySet()).addAll(EXTRAS).freeze();

        // set up right order for columns

        Map<String, String> nameToCode = new LinkedHashMap<String, String>();
        Relation<LanguageGroup, R3<Integer, String, String>> groupToNameAndCodeSorted = Relation.of(
            new EnumMap<LanguageGroup, Set<R3<Integer, String, String>>>(LanguageGroup.class),
            TreeSet.class);

        Multimap<String, String> localeToSub = TreeMultimap.create();
        LanguageTagParser ltp = new LanguageTagParser();

        for (String locale : locales) {
            ltp.set(locale);
            if (locale.equals("root")) {
                continue;
            }
            if (locale.equals("en")) { // make first
                continue;
            }
            String region = ltp.getRegion();
            if (!region.isEmpty()) {
                // regional variants are folded into the column of their language-script locale
                localeToSub.put(ltp.getLanguageScript(), locale);
                continue;
            }

            String name = ENGLISH.getName(locale, true);
            int baseEnd = locale.indexOf('_');
            ULocale loc = new ULocale(baseEnd < 0 ? locale : locale.substring(0, baseEnd));
            LanguageGroup group = LanguageGroup.get(loc);
            int rank = LanguageGroup.rankInGroup(loc);
            groupToNameAndCodeSorted.put(group, Row.of(rank, name, locale));
        }

        for (Entry<LanguageGroup, Set<R3<Integer, String, String>>> groupPairs : groupToNameAndCodeSorted.keyValuesSet()) {
            LanguageGroup group = groupPairs.getKey();
            String ename = ENGLISH.getName("en", true);
            nameToCode.clear();
            nameToCode.put(ename, "en"); // always have english first

            // add English variants if they exist
            // (inserted first so that they keep their position right after English)

            for (R3<Integer, String, String> pair : groupPairs.getValue()) {
                String name = pair.get1();
                String locale = pair.get2();
                if (locale.startsWith("en_")) {
                    nameToCode.put(name, locale);
                }
            }

            for (R3<Integer, String, String> pair : groupPairs.getValue()) {
                String name = pair.get1();
                String locale = pair.get2();

                nameToCode.put(name, locale);
                if (DEBUG) {
                    System.out.println(pair);
                }
            }
            // now build table with right order for columns
            // NOTE(review): the width is derived from the total number of available locales, not
            // from the number of columns in this group's table — confirm that is intended.
            double width = ((int) ((99.0 / (locales.size() + 1)) * 1000)) / 1000.0;
            //String widthString = "class='source' width='"+ width + "%'";
            String widthStringTarget = "class='target' width='" + width + "%'";

            TablePrinter tablePrinter = new TablePrinter()
                .addColumn("Char", "class='source' width='1%'", CldrUtility.getDoubleLinkMsg(), "class='source-image'", true)
                .addColumn("Hex", "class='source' width='1%'", null, "class='source'", true)
            //.addColumn("Formal Name", "class='source' width='" + width + "%'", null, "class='source'", true)
            ;

            for (String columnName : nameToCode.keySet()) {
                tablePrinter.addColumn(columnName, widthStringTarget, null, "class='target'", true);
            }
            // sort the characters
            Set<String> sorted = new TreeSet<>(RBC);
            Multimap<String, String> valueToSub = TreeMultimap.create();

            for (String cp : s.addAllTo(sorted)) {
                tablePrinter
                    .addRow()
                    .addCell(cp)
                    .addCell(Utility.hex(cp, 4, " "))
                //.addCell(getName(cp))
                ;
                for (Entry<String, String> nameAndLocale : nameToCode.entrySet()) {
                    String name = nameAndLocale.getKey();
                    String locale = nameAndLocale.getValue();

                    AnnotationSet annotations = Annotations.getDataSet(locale);
                    AnnotationSet parentAnnotations = Annotations.getDataSet(LocaleIDParser.getParent(locale));
                    String baseAnnotation = annotations.toString(cp, true, parentAnnotations);
                    String baseAnnotationOriginal = baseAnnotation;

                    if (DEBUG) {
                        System.out.println(name + ":" + annotations.toString(cp, false, null));
                    }
                    Collection<String> subs = localeToSub.get(locale);
                    if (!subs.isEmpty()) {
                        valueToSub.clear();
                        for (String sub : subs) {
                            AnnotationSet subAnnotations = Annotations.getDataSet(sub);
                            // NOTE(review): this uses the parent of `locale`, not of `sub` —
                            // confirm that is the intended comparison base for sub-locales.
                            AnnotationSet subParentAnnotations = Annotations.getDataSet(LocaleIDParser.getParent(locale));
                            String baseAnnotation2 = subAnnotations.toString(cp, true, subParentAnnotations);
                            if (!baseAnnotation2.equals(baseAnnotationOriginal)) {
                                valueToSub.put(baseAnnotation2, sub);
                            }
                        }
                        // one extra section per distinct differing value, labelled with the
                        // sub-locales that share it
                        for (Entry<String, Collection<String>> entry : valueToSub.asMap().entrySet()) {
                            baseAnnotation += "<hr><i>" + CollectionUtilities.join(entry.getValue(), ", ") + "</i>: " + entry.getKey();
                        }
                    }
                    tablePrinter.addCell(baseAnnotation);
                }
                tablePrinter.finishRow();
            }
            final String name = group.toString();
            new Subchart(name + " Annotations", FileUtilities.anchorize(name), tablePrinter).writeChart(anchors);
        }
    }

    static final int FIRST_REGIONAL = 0x1F1E6;
    static final int LAST_REGIONAL = 0x1F1FF;

    /**
     * Maps a code point in the range {@link #FIRST_REGIONAL}..{@link #LAST_REGIONAL} to the
     * letter at the same offset from {@code 'A'}.
     *
     * @param firstCodepoint the code point to test
     * @return the letter ({@code 'A'}..{@code 'Z'}) as an int, or -1 if the code point is
     *         outside the range
     */
    public static int getRegionalIndicator(int firstCodepoint) {
        return FIRST_REGIONAL <= firstCodepoint && firstCodepoint <= LAST_REGIONAL ? firstCodepoint - FIRST_REGIONAL + 'A' : -1;
    }

    //    private String getName(String cp) {
    //        int ri1 = getRegionalIndicator(cp.codePointAt(0));
    //        if (ri1 >= 0) {
    //            int ri2 = getRegionalIndicator(cp.codePointAt(2));
    //            return ENGLISH.getName(CLDRFile.TERRITORY_NAME, String.valueOf((char) ri1) + String.valueOf((char) ri2));
    //        }
    //        String result = NAMES80.get(cp);
    //        return result != null ? result : UCharacter.getName(cp, ", ");
    //    }
    //
    //    private static UnicodeMap<String> NAMES80 = new UnicodeMap<>();
    //    static {
    //        String[][] data = {
    //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-1-2" },
    //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-3" },
    //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-4" },
    //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-5" },
    //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-6" },
    //            { "", "ZIPPER-MOUTH FACE" },
    //            { "", "MONEY-MOUTH FACE" },
    //            { "", "FACE WITH THERMOMETER" },
    //            { "", "NERD FACE" },
    //            { "", "THINKING FACE" },
    //            { "", "FACE WITH ROLLING EYES" },
    //            { "", "UPSIDE-DOWN FACE" },
    //            { "", "FACE WITH HEAD-BANDAGE" },
    //            { "", "ROBOT FACE" },
    //            { "", "HUGGING FACE" },
    //            { "", "SIGN OF THE HORNS" },
    //            { "", "CRAB (also Cancer)" },
    //            { "", "SCORPION (also Scorpio)" },
    //            { "", "LION FACE (also Leo)" },
    //            { "", "BOW AND ARROW (also Sagittarius)" },
    //            { "", "AMPHORA (also Aquarius)" },
    //            { "", "PLACE OF WORSHIP" },
    //            { "", "KAABA" },
    //            { "", "MOSQUE" },
    //            { "", "SYNAGOGUE" },
    //            { "", "MENORAH WITH NINE BRANCHES" },
    //            { "", "PRAYER BEADS" },
    //            { "", "HOT DOG" },
    //            { "", "TACO" },
    //            { "", "BURRITO" },
    //            { "", "CHEESE WEDGE" },
    //            { "", "POPCORN" },
    //            { "", "BOTTLE WITH POPPING CORK" },
    //            { "", "TURKEY" },
    //            { "", "UNICORN FACE" },
    //            { "", "CRICKET BAT AND BALL" },
    //            { "", "VOLLEYBALL" },
    //            { "", "FIELD HOCKEY STICK AND BALL" },
    //            { "", "ICE HOCKEY STICK AND PUCK" },
    //            { "", "TABLE TENNIS PADDLE AND BALL" },
    //            { "", "BADMINTON RACQUET AND SHUTTLECOCK" } };
    //        for (String[] pair : data) {
    //            NAMES80.put(pair[0], pair[1]);
    //        }
    //        NAMES80.freeze();
    //    }

    /**
     * Chart for a single language group. It only renders a table that has already been built.
     * The class is a static nested class because it uses no instance state of the enclosing
     * chart.
     */
    private static class Subchart extends Chart {
        private final String title;
        private final String file;
        private final TablePrinter tablePrinter;

        @Override
        public boolean getShowDate() {
            return false;
        }

        /**
         * @param title page title of the sub-chart
         * @param file file name of the sub-chart
         * @param tablePrinter fully populated table to render as the page contents
         */
        public Subchart(String title, String file, TablePrinter tablePrinter) {
            super();
            this.title = title;
            this.file = file;
            this.tablePrinter = tablePrinter;
        }

        @Override
        public String getDirectory() {
            return DIR;
        }

        @Override
        public String getTitle() {
            return title;
        }

        @Override
        public String getFileName() {
            return file;
        }

        @Override
        public String getExplanation() {
            // The first paragraph is now closed with </p>; it previously ended with a second <p>.
            return MAIN_HEADER
                + "<p>This table shows the annotations for a group of related languages (plus English) for easier comparison. "
                + "The first item is the <b>short name</b> (also the text-to-speech phrase). "
                + "It is bolded for clarity, and marked with a * for searching on this page. "
                + "The remaining phrases are <b>keywords</b> (labels), separated by “|”. "
                + "The keywords plus the words in the short name are typically used for search and predictive typing.</p>\n"
                + "<p>Most short names and keywords that can be constructed with the mechanism in " + LDML_ANNOTATIONS + " are omitted. "
                + "However, a few are included for comparison: "
                + CollectionUtilities.join(EXTRAS.addAllTo(new TreeSet<>()), ", ") + ". "
                + "In this chart, missing items are marked with “" + Annotations.MISSING_MARKER + "”, "
                + "‘fallback’ constructed items with “" + Annotations.BAD_MARKER + "”, "
                + "substituted English values with “" + Annotations.ENGLISH_MARKER + "”, and "
                + "values equal to their parent locale’s values are replaced with " + Annotations.EQUIVALENT + ".</p>\n";
        }

        @Override
        public void writeContents(FormattedFileWriter pw) throws IOException {
            pw.write(tablePrinter.toTable());
        }
    }

    /**
     * Collator built from the root locale's "emoji" collation rules; used to order chart rows.
     * Assigned once by the static initializer below.
     */
    public static RuleBasedCollator RBC;
    static {
        Factory cldrFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "collation/", ".*");
        CLDRFile root = cldrFactory.make("root", false);
        String rules = root.getStringValue("//ldml/collations/collation[@type=\"emoji\"][@visibility=\"external\"]/cr");

        //        if (!rules.contains("'#⃣'")) {
        //            rules = rules.replace("#⃣", "'#⃣'").replace("*⃣", "'*⃣'"); //hack for 8288
        //        }

        try {
            RBC = new RuleBasedCollator(rules);
        } catch (Exception e) {
            // fail class initialization with the original cause attached
            throw new IllegalArgumentException(e);
        }
    }

    //    static final Set<String> ENGLISH_LABELS = new LinkedHashSet<>(Arrays.asList(
    //        "flag", "nature", "objects", "people", "places", "symbols", "travel", "animal",
    //        "office", "sign", "word", "time", "food", "person", "weather", "activity",
    //        "vehicle", "restaurant", "communication", "emotion", "geometric", "mark",
    //        "education", "gesture", "japanese", "symbol", "congratulation", "body", "clothing"));

    //    static class Annotations {
    //
    //        final UnicodeRelation<String> values = new UnicodeRelation<>();
    //
    //        static Factory cldrFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "annotations/", ".*");
    //
    //        static Set<String> getAvailableLocales() {
    //            return cldrFactory.getAvailable();
    //        }
    //
    //        static Map<String, Annotations> cache = new ConcurrentHashMap<>();
    //
    //        static synchronized Annotations make(String locale) {
    //            Annotations result = cache.get(locale);
    //            if (result == null) {
    //                CLDRFile file = cldrFactory.make(locale, false); // for now, don't resolve
    //                result = new Annotations();
    //                LinkedHashSet<String> values = new LinkedHashSet<>();
    //                XPathParts parts = new XPathParts();
    //                Splitter sp = Splitter.on(';').omitEmptyStrings().trimResults();
    //                for (String path : file) {
    //                    if (path.startsWith("//ldml/identity")) {
    //                        continue;
    //                    }
    //                    String value = file.getStringValue(path);
    //                    String fullPath = file.getFullXPath(path);
    //                    String cpString = parts.set(fullPath).getAttributeValue(-1, "cp");
    //                    UnicodeSet cps = new UnicodeSet(cpString);
    //                    String tts = parts.set(fullPath).getAttributeValue(-1, "tts");
    //                    values.clear();
    //                    if (tts != null) {
    //                        values.add(tts.trim()); // always first value
    //                    }
    //                    values.addAll(sp.splitToList(value));
    //                    result.values.addAll(cps, values);
    //                }
    //
    //                // remove labels
    //
    //                if (locale.equals("en")) {
    //                    for (Entry<String, Set<String>> item : result.values.keyValues()) {
    //                        String key = item.getKey();
    //                        Set<String> valueSet = new LinkedHashSet<>(item.getValue());
    //                        for (String skip : ENGLISH_LABELS) {
    //                            if (valueSet.contains(skip)) {
    //                                result.values.remove(key, skip);
    //                                if (result.values.get(key) == null) {
    //                                    result.values.add(key, skip); // restore
    //                                    break;
    //                                }
    //                            }
    //                        }
    //                        Set<String> newSet = result.values.get(key);
    //                        if (!valueSet.equals(newSet)) {
    //                            if (DEBUG) System.out.println("dropping labels from " + item.getKey() + ", old: " + valueSet + ", new: " + newSet);
    //                        }
    //                    }
    //                }
    //                result.values.freeze();
    //                cache.put(locale, result);
    //            }
    //            return result;
    //        }
    //    }
}