1 package org.unicode.cldr.tool; 2 3 import java.io.PrintWriter; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Comparator; 7 import java.util.HashSet; 8 import java.util.Iterator; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Random; 12 import java.util.Set; 13 import java.util.TreeMap; 14 import java.util.TreeSet; 15 16 import org.unicode.cldr.draft.FileUtilities; 17 import org.unicode.cldr.util.ArrayComparator; 18 import org.unicode.cldr.util.CLDRFile; 19 import org.unicode.cldr.util.CLDRPaths; 20 import org.unicode.cldr.util.Factory; 21 import org.unicode.cldr.util.Level; 22 import org.unicode.cldr.util.Organization; 23 import org.unicode.cldr.util.StandardCodes; 24 import org.unicode.cldr.util.SupplementalDataInfo; 25 import org.unicode.cldr.util.XPathParts; 26 27 import com.ibm.icu.text.BreakIterator; 28 import com.ibm.icu.text.Collator; 29 import com.ibm.icu.text.NumberFormat; 30 import com.ibm.icu.text.RuleBasedCollator; 31 import com.ibm.icu.text.UTF16; 32 import com.ibm.icu.text.UnicodeSet; 33 import com.ibm.icu.util.ULocale; 34 35 public class GenerateG2xG2 { 36 static CLDRFile english; 37 static CLDRFile root; 38 main(String[] args)39 public static void main(String[] args) throws Exception { 40 if (showLocales(-1)) return; 41 // showCollator(); 42 43 String sourceLanguage = "G5"; 44 String targetLanguage = "G5"; 45 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 46 english = cldrFactory.make("en", true); 47 root = cldrFactory.make("root", true); 48 StandardCodes sc = StandardCodes.make(); 49 Map<Organization, Map<String, Level>> type_code_value = sc.getLocaleTypes(); 50 Set<String> sourceSet = new TreeSet<>(); 51 Set<String> targetLanguageSet = new TreeSet<>(); 52 targetLanguageSet.add("no"); 53 addPriority("G2", "nn"); 54 addPriority("G2", "no"); 55 targetLanguageSet.add("nn"); 56 Set<String> targetScriptSet = new TreeSet<>(); 57 Set<String> targetRegionSet = new TreeSet<>(); 58 Set<String> targetTZSet = new TreeSet<>(); 59 Set<String> targetCurrencySet = new TreeSet<>(); 60 for (Organization type : type_code_value.keySet()) { 61 Map<String, Level> code_value = type_code_value.get(type); 62 if (!type.equals(Organization.ibm)) continue; 63 for (String locale : code_value.keySet()) { 64 if (locale.equals("no")) continue; 65 String priority = code_value.get(locale).toString(); 66 ULocale ulocale = new ULocale(locale); 67 String language = ulocale.getLanguage(); 68 String script = ulocale.getScript(); 69 String territory = ulocale.getCountry(); 70 if (sourceLanguage.compareTo(priority) >= 0) { 71 if (language.equals("no")) language = "nn"; 72 locale = new ULocale(language, script).toString(); 73 sourceSet.add(locale); 74 addPriority(priority, locale); 75 } 76 if (targetLanguage.compareTo(priority) >= 0) { 77 targetLanguageSet.add(language); 78 targetScriptSet.add(script); 79 targetRegionSet.add(territory); 80 addPriority(priority, language); 81 addPriority(priority, script); 82 addPriority("G4", territory); // will normally be overridden 83 } 84 } 85 } 86 // set the priorities for territories 87 Map<String, List<String>> worldBankInfo = sc.getWorldBankInfo(); 88 Set<String> euCodes = new HashSet<>(Arrays.asList(new String[] { "AT", "BE", "CY", "CZ", "DK", "EE", 89 "FI", "FR", "DE", "GR", "HU", "IT", "LV", "LT", "LU", "MT", "NL", "PL", "PT", "SI", "ES", "SE", "GB" })); 90 for (String countryCode : worldBankInfo.keySet()) { 91 if (priorityMap.get(countryCode) == null) continue; // only use ones we already have: defaults G4 92 List<String> values = worldBankInfo.get(countryCode); 93 double gdp = Double.parseDouble(values.get(1)); 94 if (gdp >= 1E+13) 95 addPriority("G0", countryCode); 96 else if (gdp >= 1E+12) 97 addPriority("G1", countryCode); 98 else if (gdp >= 1E+11) 99 addPriority("G2", countryCode); 100 else if (euCodes.contains(countryCode)) addPriority("G3", countryCode); 101 // else if (gdp >= 1E+10) addPriority("G4", countryCode); 102 } 103 // fill in the currencies, and TZs for the countries that have multiple zones 104 Map<String, Set<String>> c2z = sc.getCountryToZoneSet(); 105 SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance(); 106 Set<String> mainTimeZones = supplementalDataInfo.getCanonicalTimeZones(); 107 for (Iterator<String> it = targetRegionSet.iterator(); it.hasNext();) { 108 String country = it.next(); 109 String priority = priorityMap.get(country); 110 for (Iterator<String> it2 = getCurrency(country).iterator(); it2.hasNext();) { 111 String currency = it2.next(); 112 targetCurrencySet.add(currency); 113 addPriority(priority, currency); 114 } 115 Set<String> s = c2z.get(country); 116 if (s.size() == 1) continue; 117 for (Iterator<String> it2 = s.iterator(); it2.hasNext();) { 118 String tzid = it2.next(); 119 if (!mainTimeZones.contains(tzid)) continue; 120 targetTZSet.add(tzid); 121 addPriority(priority, tzid); 122 } 123 } 124 // print out missing translations. 125 PrintWriter pw = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "G2xG2.txt"); 126 // show priorities 127 Comparator<String> comp = new UTF16.StringComparator(); 128 @SuppressWarnings("unchecked") 129 Set<String[]> priority_set = new TreeSet<String[]>(new ArrayComparator(new Comparator[] { comp, comp, comp })); 130 for (Iterator<String> it = priorityMap.keySet().iterator(); it.hasNext();) { 131 String code = it.next(); 132 String priority = priorityMap.get(code); 133 if (priority == null) continue; 134 int type = getType(code); 135 // if (type != CLDRFile.TERRITORY_NAME) continue; 136 priority_set.add(new String[] { priority, type + "", code }); 137 } 138 String lastPriority = ""; 139 //String lastType = ""; 140 for (Iterator<String[]> it = priority_set.iterator(); it.hasNext();) { 141 String[] items = it.next(); 142 if (!lastPriority.equals(items[0])) { 143 lastPriority = items[0]; 144 pw.println(); 145 // pw.println(lastPriority); 146 } 147 String typeName = getTypeName(items[2]); 148 pw.println(lastPriority + "\t" + typeName + "\t" + items[2] + "\t(" + getItemName(english, items[2]) + ")"); 149 } 150 pw.flush(); 151 // print out missing translations. 152 for (Iterator<String> it = sourceSet.iterator(); it.hasNext();) { 153 String sourceLocale = it.next(); 154 System.out.print(sourceLocale + ", "); 155 CLDRFile sourceData = cldrFactory.make(sourceLocale, true); 156 pw.println(); 157 String title = sourceLocale; 158 checkItems(pw, title, sourceData, CLDRFile.LANGUAGE_NAME, targetLanguageSet); 159 checkItems(pw, title, sourceData, CLDRFile.SCRIPT_NAME, targetScriptSet); 160 checkItems(pw, title, sourceData, CLDRFile.TERRITORY_NAME, targetRegionSet); 161 checkItems(pw, title, sourceData, CLDRFile.CURRENCY_NAME, targetCurrencySet); 162 // only check timezones if exemplar characters don't include a-z 163 String v = sourceData.getStringValue("//ldml/characters/exemplarCharacters"); 164 UnicodeSet exemplars = new UnicodeSet(v); 165 if (exemplars.contains('a', 'z')) continue; 166 checkItems(pw, title, sourceData, CLDRFile.TZ_EXEMPLAR, targetTZSet); 167 } 168 pw.println(); 169 pw.println("Sizes - incremental"); 170 pw.println(); 171 int runningTotalCount = 0; 172 int runningMissingCount = 0; 173 NumberFormat percent = NumberFormat.getPercentInstance(); 174 percent.setMinimumFractionDigits(1); 175 NumberFormat nf = NumberFormat.getInstance(); 176 nf.setGroupingUsed(true); 177 nf.setMinimumFractionDigits(0); 178 for (Iterator<String> it = totalMap.keySet().iterator(); it.hasNext();) { 179 String key = it.next(); 180 Totals t = totalMap.get(key); 181 runningTotalCount = t.totalCount; 182 runningMissingCount = t.missingCount; 183 pw.println(key.substring(0, 2) + "\t" + key.substring(2) + "\t" + runningMissingCount 184 + "\t" + runningTotalCount 185 + "\t" + percent.format(runningMissingCount / (0.0 + runningTotalCount))); 186 } 187 pw.close(); 188 System.out.println(); 189 System.out.println("Done"); 190 } 191 showLocales(int choice)192 private static boolean showLocales(int choice) throws Exception { 193 ULocale desiredDisplayLocale = ULocale.ENGLISH; 194 Set<String> testSet = new TreeSet<>(); 195 StandardCodes sc = StandardCodes.make(); 196 { 197 Set<String> countries = sc.getGoodAvailableCodes("territory"); 198 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 199 english = cldrFactory.make("en", true); 200 for (Iterator<String> it = countries.iterator(); it.hasNext();) { 201 String territory = it.next(); 202 if (territory.charAt(0) < 'A') continue; 203 String locale = "haw-" + territory; 204 System.out.print(locale + ": " + english.getName(locale) + ", "); 205 } 206 if (true) return true; 207 } 208 209 if (choice == -1) { 210 211 testSet.addAll(sc.getGoodAvailableCodes("currency")); 212 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 213 english = cldrFactory.make("en", false); 214 for (Iterator it = testSet.iterator(); it.hasNext();) { 215 String country = (String) it.next(); 216 System.out.println(country + "\t" + english.getName(CLDRFile.CURRENCY_NAME, country)); 217 } 218 return true; 219 } else if (choice == 0) { // get available 220 ULocale[] list = BreakIterator.getAvailableULocales(); 221 for (int i = 0; i < list.length; ++i) { 222 testSet.add(list[i].toString()); 223 } 224 } else { 225 boolean USE_3066bis = choice == 2; 226 // produce random list of RFC3066 language tags 227 Set<String> legacy = sc.getAvailableCodes("legacy"); 228 List<String> language_subtags = new ArrayList<>(sc.getGoodAvailableCodes("language")); 229 List<String> script_subtags = new ArrayList<>(sc.getGoodAvailableCodes("script")); 230 List<String> region_subtags = new ArrayList<>(sc.getGoodAvailableCodes("territory")); 231 for (String possibility : legacy) { 232 System.out.println(possibility); 233 if (new ULocale(possibility).getScript().length() != 0) { 234 System.out.println("\tAdding"); 235 testSet.add(possibility); 236 } 237 } 238 if (!USE_3066bis) for (Iterator it = region_subtags.iterator(); it.hasNext();) { 239 String possibility = (String) it.next(); 240 if (possibility.compareTo("A") < 0) it.remove(); 241 } 242 Random rand = new Random(); 243 for (int i = 0; i < 200; ++i) { 244 int r = rand.nextInt(language_subtags.size()); 245 String result = language_subtags.get(rand.nextInt(language_subtags.size())); 246 if (USE_3066bis && rand.nextDouble() > 0.5) { 247 result += "-" + script_subtags.get(rand.nextInt(script_subtags.size())); 248 } 249 if (rand.nextDouble() > 0.1) { 250 result += "-" + region_subtags.get(rand.nextInt(region_subtags.size())); 251 } 252 testSet.add(result); 253 } 254 } 255 for (Iterator<String> it = testSet.iterator(); it.hasNext();) { 256 ULocale language = new ULocale(it.next()); 257 System.out.println(language + " \t" + language.getDisplayName(desiredDisplayLocale)); 258 } 259 return true; 260 } 261 showCollator()262 private static void showCollator() throws Exception { 263 RuleBasedCollator col = (RuleBasedCollator) Collator.getInstance(new ULocale("zh")); 264 showExample(col); 265 String rules = col.getRules(false); 266 // System.out.println(com.ibm.icu.impl.Utility.escape(rules)); 267 rules += "& \u93CA < A <<< a & \u7C3F < B <<< b"; 268 RuleBasedCollator col2 = new RuleBasedCollator(rules); 269 showExample(col2); 270 } 271 showExample(RuleBasedCollator col)272 private static void showExample(RuleBasedCollator col) { 273 String samples = "a A b B \u5416 \u93CA \u516b \u7C3F"; 274 Set<String> s = new TreeSet<>(col); 275 s.addAll(Arrays.asList(samples.split(" "))); 276 System.out.println(com.ibm.icu.impl.Utility.escape(s.toString())); 277 } 278 279 static Map<String, String> priorityMap = new TreeMap<>(); 280 addPriority(String priority, String code)281 static void addPriority(String priority, String code) { 282 if (code.length() == 0) return; 283 String oldPriority = priorityMap.get(code); 284 if (oldPriority == null || priority.compareTo(oldPriority) < 0) priorityMap.put(code, priority); 285 System.out.println(code + ": " + priority); 286 } 287 288 static class Totals { 289 int totalCount; 290 int missingCount; 291 } 292 293 static Map<String, Totals> totalMap = new TreeMap<>(); 294 checkItems(PrintWriter pw, String sourceLocale, CLDRFile sourceData, int type, Set<String> targetItemSet)295 static void checkItems(PrintWriter pw, String sourceLocale, CLDRFile sourceData, int type, Set<String> targetItemSet) { 296 for (Iterator<String> it2 = targetItemSet.iterator(); it2.hasNext();) { 297 String item = it2.next(); 298 if (item.length() == 0) continue; 299 String key = priorityMap.get(sourceLocale) + "" + priorityMap.get(item); 300 Totals t = totalMap.get(key); 301 if (t == null) totalMap.put(key, t = new Totals()); 302 t.totalCount++; 303 String translation = getItemName(sourceData, type, item); 304 String rootName = getItemName(root, type, item); 305 if (rootName.equals(translation)) { 306 t.missingCount++; 307 pw.println(priorityMap.get(sourceLocale) 308 + "\t" + sourceLocale + 309 "\t(" + english.getName(sourceLocale) + ": " 310 + sourceData.getName(sourceLocale) + ")" 311 + "\t" + priorityMap.get(item) 312 + "\t" + item 313 + "\t(" + getItemName(english, type, item) + ")"); 314 } 315 } 316 } 317 getItemName(CLDRFile data, String item)318 private static String getItemName(CLDRFile data, String item) { 319 return getItemName(data, getType(item), item); 320 } 321 getType(String item)322 private static int getType(String item) { 323 int type = CLDRFile.LANGUAGE_NAME; 324 if (item.indexOf('/') >= 0) 325 type = CLDRFile.TZ_EXEMPLAR; // America/Los_Angeles 326 else if (item.length() == 4) 327 type = CLDRFile.SCRIPT_NAME; // Hant 328 else if (item.charAt(0) <= '9') 329 type = CLDRFile.TERRITORY_NAME; // 001 330 else if (item.charAt(0) < 'a') { 331 if (item.length() == 3) 332 type = CLDRFile.CURRENCY_NAME; 333 else 334 type = CLDRFile.TERRITORY_NAME; // US or USD 335 } 336 return type; 337 } 338 getTypeName(String item)339 private static String getTypeName(String item) { 340 switch (getType(item)) { 341 case CLDRFile.LANGUAGE_NAME: 342 return "Lang"; 343 case CLDRFile.TZ_EXEMPLAR: 344 return "Zone"; 345 case CLDRFile.SCRIPT_NAME: 346 return "Script"; 347 case CLDRFile.TERRITORY_NAME: 348 return "Region"; 349 case CLDRFile.CURRENCY_NAME: 350 return "Curr."; 351 } 352 return "?"; 353 } 354 getItemName(CLDRFile data, int type, String item)355 private static String getItemName(CLDRFile data, int type, String item) { 356 String result; 357 if (type == CLDRFile.LANGUAGE_NAME) { 358 result = data.getName(item); 359 } else if (type != CLDRFile.TZ_EXEMPLAR) { 360 result = data.getName(type, item); 361 } else { 362 String prefix = "//ldml/dates/timeZoneNames/zone[@type=\"" + item + "\"]/exemplarCity"; 363 result = data.getStringValue(prefix); 364 } 365 return result == null ? item : result; 366 } 367 368 static Map<String, List<String>> territory_currency = null; 369 getCurrency(String territory)370 private static List<String> getCurrency(String territory) { 371 if (territory_currency == null) { 372 territory_currency = new TreeMap<>(); 373 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 374 CLDRFile supp = cldrFactory.make(CLDRFile.SUPPLEMENTAL_NAME, false); 375 for (String path : supp) { 376 if (path.indexOf("/currencyData") >= 0) { 377 // <region iso3166="AR"> 378 // <currency iso4217="ARS" from="1992-01-01"/> 379 if (path.indexOf("/region") >= 0) { 380 XPathParts parts = XPathParts.getFrozenInstance(supp.getFullXPath(path)); 381 Map<String, String> attributes = parts.getAttributes(parts.size() - 2); 382 String iso3166 = attributes.get("iso3166"); 383 attributes = parts.getAttributes(parts.size() - 1); 384 String iso4217 = attributes.get("iso4217"); 385 String to = attributes.get("to"); 386 if (to != null) { 387 continue; 388 } 389 List<String> info = territory_currency.get(iso3166); 390 if (info == null) { 391 territory_currency.put(iso3166, info = new ArrayList<>()); 392 } 393 info.add(iso4217); 394 } 395 } 396 } 397 } 398 return territory_currency.get(territory); 399 } 400 }