1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Collection; 10 import java.util.Collections; 11 import java.util.HashSet; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Map; 15 import java.util.Map.Entry; 16 import java.util.Set; 17 import java.util.TreeMap; 18 import java.util.TreeSet; 19 import java.util.regex.Matcher; 20 import java.util.regex.Pattern; 21 22 import org.unicode.cldr.draft.FileUtilities; 23 import org.unicode.cldr.tool.Option.Options; 24 import org.unicode.cldr.util.Builder; 25 import org.unicode.cldr.util.CLDRConfig; 26 import org.unicode.cldr.util.CLDRFile; 27 import org.unicode.cldr.util.CLDRPaths; 28 import org.unicode.cldr.util.CldrUtility; 29 import org.unicode.cldr.util.Counter; 30 import org.unicode.cldr.util.DtdData; 31 import org.unicode.cldr.util.DtdData.Attribute; 32 import org.unicode.cldr.util.DtdData.Element; 33 import org.unicode.cldr.util.DtdType; 34 import org.unicode.cldr.util.PathStarrer; 35 import org.unicode.cldr.util.PathUtilities; 36 import org.unicode.cldr.util.PatternCache; 37 import org.unicode.cldr.util.RegexUtilities; 38 import org.unicode.cldr.util.SupplementalDataInfo; 39 import org.unicode.cldr.util.XMLFileReader; 40 import org.unicode.cldr.util.XMLFileReader.SimpleHandler; 41 import org.unicode.cldr.util.XPathParts; 42 import org.xml.sax.ErrorHandler; 43 import org.xml.sax.SAXException; 44 import org.xml.sax.SAXParseException; 45 46 import com.google.common.base.Joiner; 47 import com.google.common.base.Splitter; 48 import com.ibm.icu.impl.Relation; 49 import com.ibm.icu.impl.Row; 50 import com.ibm.icu.impl.Row.R2; 51 import com.ibm.icu.impl.Row.R4; 52 import com.ibm.icu.util.VersionInfo; 53 54 public class GenerateItemCounts { 55 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig.getInstance().getSupplementalDataInfo(); 56 private static final boolean SKIP_ORDERING = true; 57 private static final String OUT_DIRECTORY = CLDRPaths.GEN_DIRECTORY + "/itemcount/"; // CldrUtility.MAIN_DIRECTORY; 58 private Map<String, List<StackTraceElement>> cantRead = new TreeMap<>(); 59 static { 60 System.err.println("Probably obsolete tool"); 61 } 62 private static String[] DIRECTORIES = { 63 // MUST be oldest first! 64 // "cldr-archive/cldr-21.0", 65 // "cldr-24.0", 66 "cldr-27.0", 67 "trunk" 68 }; 69 70 private static String TRUNK_VERSION = "26.0"; 71 72 static boolean doChanges = true; 73 static Relation<String, String> path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 74 static final AttributeTypes ATTRIBUTE_TYPES = new AttributeTypes(); 75 76 final static Options myOptions = new Options(); 77 78 enum MyOptions { 79 summary(null, null, "if present, summarizes data already collected. Run once with, once without."), directory(".*", ".*", 80 "if summary, creates filtered version (eg -d main): does a find in the name, which is of the form dir/file"), verbose(null, null, 81 "verbose debugging messages"), rawfilter(".*", ".*", "filter the raw files (non-summary, mostly for debugging)"),; 82 // boilerplate 83 final Option option; 84 MyOptions(String argumentPattern, String defaultArgument, String helpText)85 MyOptions(String argumentPattern, String defaultArgument, String helpText) { 86 option = myOptions.add(this, argumentPattern, defaultArgument, helpText); 87 } 88 } 89 90 static Matcher DIR_FILE_MATCHER; 91 static Matcher RAW_FILE_MATCHER; 92 static boolean VERBOSE; 93 main(String[] args)94 public static void main(String[] args) throws IOException { 95 myOptions.parse(MyOptions.directory, args, true); 96 97 DIR_FILE_MATCHER = PatternCache.get(MyOptions.directory.option.getValue()).matcher(""); 98 RAW_FILE_MATCHER = PatternCache.get(MyOptions.rawfilter.option.getValue()).matcher(""); 99 VERBOSE = MyOptions.verbose.option.doesOccur(); 100 101 if (MyOptions.summary.option.doesOccur()) { 102 doSummary(); 103 System.out.println("DONE"); 104 return; 105 // } else if (arg.equals("changes")) { 106 // doChanges = true; 107 } else { 108 } 109 // Pattern dirPattern = dirPattern = PatternCache.get(arg); 110 GenerateItemCounts main = new GenerateItemCounts(); 111 try { 112 Relation<String, String> oldPath2value = null; 113 for (String dir : DIRECTORIES) { 114 // if (dirPattern != null && !dirPattern.matcher(dir).find()) continue; 115 final String pathname = dir.equals("trunk") ? CLDRPaths.BASE_DIRECTORY 116 : CLDRPaths.ARCHIVE_DIRECTORY + "/" + dir; 117 boolean isFinal = dir == DIRECTORIES[DIRECTORIES.length - 1]; 118 119 String fulldir = PathUtilities.getNormalizedPathString(pathname); 120 String prefix = (MyOptions.rawfilter.option.doesOccur() ? "filtered_" : ""); 121 String fileKey = dir.replace("/", "_"); 122 try ( 123 PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_count.txt"); 124 PrintWriter changes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes.txt"); 125 PrintWriter changesNew = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_news.txt"); 126 PrintWriter changesDeletes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_deletes.txt"); 127 PrintWriter changesSummary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes_summary.txt");) { 128 main.summarizeCoverage(summary, fulldir, isFinal); 129 if (doChanges) { 130 if (oldPath2value != null) { 131 compare(summary, changes, changesNew, changesDeletes, changesSummary, oldPath2value, path2value); 132 checkBadAttributes(path2value, prefix + fileKey + "_dtd_check.txt"); 133 } 134 oldPath2value = path2value; 135 path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 136 } 137 } 138 } 139 ATTRIBUTE_TYPES.showStarred(); 140 } finally { 141 if (main.cantRead.size() != 0) { 142 System.out.println("Couldn't read:\t"); 143 for (String file : main.cantRead.keySet()) { 144 System.out.println(file + "\t" + main.cantRead.get(file)); 145 } 146 } 147 System.out.println("DONE"); 148 } 149 } 150 151 static final Set<String> SKIP_ATTRIBUTES = new HashSet<>(Arrays.asList("draft", "references", "validSubLocales")); 152 153 static final Relation<String, DtdType> ELEMENTS_OCCURRING = Relation.of(new TreeMap(), TreeSet.class); 154 static final Relation<String, DtdType> ELEMENTS_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class); 155 static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_OCCURRING = Relation.of(new TreeMap(), TreeSet.class); 156 static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class); 157 checkBadAttributes(Relation<String, String> path2value2, String outputFile)158 private static void checkBadAttributes(Relation<String, String> path2value2, String outputFile) 159 throws IOException { 160 // an attribute is misplaced if it is not distinguishing, but is on a non-final node. 161 162 Set<String> errors = new LinkedHashSet<>(); 163 164 SupplementalDataInfo supp = SUPPLEMENTAL_DATA_INFO; 165 for (DtdType dtdType : DtdType.values()) { 166 if (dtdType == DtdType.ldmlICU) { 167 continue; 168 } 169 DtdData data = DtdData.getInstance(dtdType); 170 for (Element element : data.getElements()) { 171 String elementName = element.name; 172 ELEMENTS_POSSIBLE.put(elementName, dtdType); 173 final Set<Element> children = element.getChildren().keySet(); 174 175 boolean skipFinal = children.isEmpty() 176 || children.size() == 1 177 && children.iterator().next().name.equals("special"); 178 179 for (Entry<Attribute, Integer> attributeInt : element.getAttributes().entrySet()) { 180 Attribute attribute = attributeInt.getKey(); 181 String attributeName = attribute.name; 182 if (attribute.defaultValue != null) { 183 errors.add("Warning, default value «" + attribute.defaultValue 184 + "» for: " + dtdType + "\t" + elementName + "\t" + attributeName); 185 } 186 final R2<DtdType, String> attributeRow = Row.of(dtdType, elementName); 187 ATTRIBUTES_POSSIBLE.put(attributeName, attributeRow); 188 if (skipFinal || SKIP_ATTRIBUTES.contains(attributeName)) { // don't worry about non-final, references, draft, standard 189 continue; 190 } 191 if (supp.isDeprecated(dtdType, elementName, attributeName, null)) { 192 continue; 193 } 194 if (!CLDRFile.isDistinguishing(dtdType, elementName, attributeName)) { 195 String doesOccur = ""; 196 final Set<R2<DtdType, String>> attributeRows = ATTRIBUTES_OCCURRING.get(attributeName); 197 if (attributeRows == null || !attributeRows.contains(attributeRow)) { 198 doesOccur = "\tNEVER"; 199 } 200 errors.add("Warning, !disting, !leaf: " + dtdType + "\t" + elementName + "\t" + attributeName + "\t" + children + doesOccur); 201 } 202 } 203 } 204 } 205 try ( 206 PrintWriter out = FileUtilities.openUTF8Writer(OUT_DIRECTORY, outputFile)) { 207 out.println("\nElements\tDeprecated\tOccurring\tPossible in DTD, but never occurs"); 208 209 for (Entry<String, Set<DtdType>> x : ELEMENTS_POSSIBLE.keyValuesSet()) { 210 final String element = x.getKey(); 211 if (element.equals("#PCDATA") || element.equals("ANY") || element.equals("generation")) { 212 continue; 213 } 214 final Set<DtdType> possible = x.getValue(); 215 Set<DtdType> deprecated = new TreeSet(); 216 for (DtdType dtdType : possible) { 217 if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, "*", "*")) { 218 deprecated.add(dtdType); 219 } 220 } 221 Set<DtdType> notDeprecated = new TreeSet(possible); 222 notDeprecated.removeAll(deprecated); 223 224 Set<DtdType> occurs = CldrUtility.ifNull(ELEMENTS_OCCURRING.get(element), Collections.EMPTY_SET); 225 Set<DtdType> noOccur = new TreeSet(possible); 226 noOccur.removeAll(occurs); 227 228 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur 229 final Set<DtdType> intersection = CldrUtility.intersect(deprecated, occurs); 230 errors.add("Error: element «" + element 231 + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) + 232 " but occurs in live data: " + intersection); 233 } 234 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning 235 errors.add("Warning: element «" + element 236 + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur)); 237 } 238 239 out.println(element 240 + "\t" + deprecated 241 + "\t" + occurs 242 + "\t" + noOccur); 243 } 244 245 out.println("\nAttributes\tDeprecated\tOccurring\tPossible in DTD, but never occurs"); 246 247 for (Entry<String, Set<R2<DtdType, String>>> x : ATTRIBUTES_POSSIBLE.keyValuesSet()) { 248 final String attribute = x.getKey(); 249 if (attribute.equals("alt") || attribute.equals("draft") || attribute.equals("references")) { 250 continue; 251 } 252 final Set<R2<DtdType, String>> possible = x.getValue(); 253 Set<R2<DtdType, String>> deprecated = new TreeSet(); 254 for (R2<DtdType, String> s : possible) { 255 final DtdType dtdType = s.get0(); 256 final String element = s.get1(); 257 if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, attribute, "*")) { 258 deprecated.add(s); 259 } 260 } 261 Set<R2<DtdType, String>> notDeprecated = new TreeSet(possible); 262 notDeprecated.removeAll(deprecated); 263 264 Set<R2<DtdType, String>> occurs = CldrUtility.ifNull(ATTRIBUTES_OCCURRING.get(attribute), Collections.EMPTY_SET); 265 Set<R2<DtdType, String>> noOccur = new TreeSet(possible); 266 noOccur.removeAll(occurs); 267 268 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur 269 final Set<R2<DtdType, String>> intersection = CldrUtility.intersect(deprecated, occurs); 270 errors.add("Error: attribute «" + attribute 271 + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) + 272 " but occurs in live data: " + intersection); 273 } 274 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning 275 errors.add("Warning: attribute «" + attribute 276 + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur)); 277 } 278 out.println(attribute 279 + "\t" + deprecated 280 + "\t" + occurs 281 + "\t" + noOccur); 282 } 283 out.println("\nERRORS/WARNINGS"); 284 out.println(Joiner.on("\n").join(errors)); 285 } 286 } 287 288 static class AttributeTypes { 289 Relation<String, String> elementPathToAttributes = Relation.of(new TreeMap<String, Set<String>>(), 290 TreeSet.class); 291 final PathStarrer PATH_STARRER = new PathStarrer().setSubstitutionPattern("*"); 292 final Set<String> STARRED_PATHS = new TreeSet<>(); 293 StringBuilder elementPath = new StringBuilder(); 294 add(String path)295 public void add(String path) { 296 XPathParts parts = XPathParts.getFrozenInstance(path); 297 elementPath.setLength(0); 298 for (int i = 0; i < parts.size(); ++i) { 299 String element = parts.getElement(i); 300 elementPath.append('/').append(element); 301 elementPathToAttributes.putAll(elementPath.toString().intern(), parts.getAttributeKeys(i)); 302 } 303 } 304 showStarred()305 public void showStarred() throws IOException { 306 PrintWriter starred = FileUtilities.openUTF8Writer(OUT_DIRECTORY, "starred" + ".txt"); 307 308 for (Entry<String, Set<String>> entry : elementPathToAttributes.keyValuesSet()) { 309 Set<String> attributes = entry.getValue(); 310 if (attributes.size() == 0) { 311 continue; 312 } 313 String path = entry.getKey(); 314 String[] elements = path.split("/"); 315 DtdType type = DtdType.valueOf(elements[1]); 316 String finalElement = elements[elements.length - 1]; 317 starred.print(path); 318 for (String attribute : attributes) { 319 if (CLDRFile.isDistinguishing(type, finalElement, attribute)) { 320 starred.print("[@" + attribute + "='disting.']"); 321 } else { 322 starred.print("[@" + attribute + "='DATA']"); 323 } 324 } 325 starred.println(); 326 } 327 starred.close(); 328 } 329 } 330 331 static Pattern prefix = PatternCache.get("([^/]+/[^/]+)(.*)"); 332 333 static class Delta { 334 Counter<String> newCount = new Counter<>(); 335 Counter<String> deletedCount = new Counter<>(); 336 Counter<String> changedCount = new Counter<>(); 337 Counter<String> unchangedCount = new Counter<>(); 338 print(PrintWriter changesSummary, Set<String> prefixes)339 void print(PrintWriter changesSummary, Set<String> prefixes) { 340 changesSummary.println("Total" 341 + "\t" + unchangedCount.getTotal() 342 + "\t" + deletedCount.getTotal() 343 + "\t" + changedCount.getTotal() 344 + "\t" + newCount.getTotal()); 345 changesSummary.println("Directory\tSame\tRemoved\tChanged\tAdded"); 346 for (String prefix : prefixes) { 347 changesSummary.println(prefix 348 + "\t" + unchangedCount.get(prefix) 349 + "\t" + deletedCount.get(prefix) 350 + "\t" + changedCount.get(prefix) 351 + "\t" + newCount.get(prefix)); 352 } 353 } 354 } 355 compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew, PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, Relation<String, String> path2value2)356 private static void compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew, 357 PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, 358 Relation<String, String> path2value2) { 359 Set<String> union = Builder.with(new TreeSet<String>()).addAll(oldPath2value.keySet()) 360 .addAll(path2value2.keySet()).get(); 361 long total = 0; 362 Matcher prefixMatcher = prefix.matcher(""); 363 Delta charCount = new Delta(); 364 Delta itemCount = new Delta(); 365 Set<String> prefixes = new TreeSet(); 366 for (String path : union) { 367 if (!prefixMatcher.reset(path).find()) { 368 throw new IllegalArgumentException(); 369 } 370 String prefix = prefixMatcher.group(1); 371 prefixes.add(prefix); 372 String localPath = prefixMatcher.group(2); 373 Set<String> set1 = oldPath2value.getAll(path); 374 Set<String> set2 = path2value2.getAll(path); 375 if (set2 != null) { 376 total += set2.size(); 377 } 378 if (set1 == null) { 379 changesNew.println(prefix + "\t" + "\t" + set2 + "\t" + localPath); 380 itemCount.newCount.add(prefix, set2.size()); 381 charCount.newCount.add(prefix, totalLength(set2)); 382 } else if (set2 == null) { 383 changesDeletes.println(prefix + "\t" + set1 + "\t\t" + localPath); 384 itemCount.deletedCount.add(prefix, -set1.size()); 385 charCount.deletedCount.add(prefix, -totalLength(set1)); 386 } else if (!set1.equals(set2)) { 387 TreeSet<String> set1minus2 = Builder.with(new TreeSet<String>()).addAll(set1).removeAll(set2).get(); 388 TreeSet<String> set2minus1 = Builder.with(new TreeSet<String>()).addAll(set2).removeAll(set1).get(); 389 TreeSet<String> set2and1 = Builder.with(new TreeSet<String>()).addAll(set2).retainAll(set1).get(); 390 itemCount.changedCount.add(prefix, (set2minus1.size() + set1minus2.size() + 1) / 2); 391 itemCount.unchangedCount.add(prefix, set2and1.size()); 392 charCount.changedCount.add(prefix, (totalLength(set2minus1) + totalLength(set1minus2) + 1) / 2); 393 charCount.unchangedCount.add(prefix, totalLength(set2and1)); 394 changes.println(prefix + "\t" + set1minus2 395 + "\t" 396 + set2minus1 397 + "\t" + localPath); 398 } else { 399 itemCount.unchangedCount.add(prefix, set2.size()); 400 charCount.unchangedCount.add(prefix, totalLength(set2)); 401 } 402 } 403 itemCount.print(changesSummary, prefixes); 404 changesSummary.println(); 405 charCount.print(changesSummary, prefixes); 406 // union = Builder.with(new TreeSet<String>()) 407 // .addAll(newCount.keySet()) 408 // .addAll(deletedCount.keySet()) 409 // .addAll(changedCount.keySet()) 410 // .addAll(unchangedCount.keySet()) 411 // .get(); 412 summary.println("#Total:\t" + total); 413 } 414 totalLength(Set<String> set2)415 private static long totalLength(Set<String> set2) { 416 int result = 0; 417 for (String s : set2) { 418 result += s.length(); 419 } 420 return result; 421 } 422 423 final static Pattern LOCALE_PATTERN = PatternCache.get( 424 "([a-z]{2,3})(?:[_-]([A-Z][a-z]{3}))?(?:[_-]([a-zA-Z0-9]{2,3}))?([_-][a-zA-Z0-9]{1,8})*"); 425 doSummary()426 public static void doSummary() throws IOException { 427 Map<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>> key_release_count = new TreeMap<>(); 428 Matcher countryLocale = LOCALE_PATTERN.matcher(""); 429 List<String> releases = new ArrayList<>(); 430 Pattern releaseNumber = PatternCache.get("count_(?:.*-(\\d+(\\.\\d+)*)|trunk)\\.txt"); 431 // int releaseCount = 1; 432 Relation<String, String> release_keys = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 433 Relation<String, String> localesToPaths = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 434 Set<String> writtenLanguages = new TreeSet<>(); 435 Set<String> countries = new TreeSet<>(); 436 437 File[] listFiles = new File(OUT_DIRECTORY).listFiles(); 438 // find the most recent version 439 VersionInfo mostRecentVersion = VersionInfo.getInstance(0); 440 for (File subdir : listFiles) { 441 final String name = subdir.getName(); 442 final Matcher releaseMatcher = releaseNumber.matcher(name); 443 if (!releaseMatcher.matches()) { 444 if (name.startsWith("count_")) { 445 throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name)); 446 } 447 continue; 448 } 449 String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++; 450 if (releaseNum == null) { 451 releaseNum = TRUNK_VERSION; 452 } 453 VersionInfo vi = VersionInfo.getInstance(releaseNum); 454 if (vi.compareTo(mostRecentVersion) > 0) { 455 mostRecentVersion = vi; 456 } 457 } 458 459 for (File subdir : listFiles) { 460 final String name = subdir.getName(); 461 final Matcher releaseMatcher = releaseNumber.matcher(name); 462 if (!releaseMatcher.matches()) { 463 if (name.startsWith("count_")) { 464 throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name)); 465 } 466 continue; 467 } 468 String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++; 469 if (releaseNum == null) { 470 releaseNum = TRUNK_VERSION; 471 } 472 VersionInfo vi = VersionInfo.getInstance(releaseNum); 473 boolean captureData = vi.equals(mostRecentVersion); 474 releases.add(releaseNum); 475 BufferedReader in = FileUtilities.openUTF8Reader("", PathUtilities.getNormalizedPathString(subdir)); 476 while (true) { 477 String line = in.readLine(); 478 if (line == null) break; 479 line = line.trim(); 480 if (line.startsWith("#")) { 481 continue; 482 } 483 // common/main New: [Yellowknife] /gl//ldml/dates/timeZoneNames/zone[@type="America/Yellowknife"]/exemplarCity 484 485 String[] parts = line.split("\t"); 486 try { 487 String file = parts[0]; 488 if (file.startsWith("seed/") || !DIR_FILE_MATCHER.reset(file).find()) { 489 if (VERBOSE) { 490 System.out.println("Skipping: " + RegexUtilities.showMismatch(DIR_FILE_MATCHER, file)); 491 } 492 continue; 493 } else if (VERBOSE) { 494 System.out.println("Including: " + file); 495 } 496 497 long valueCount = Long.parseLong(parts[1]); 498 long valueLen = Long.parseLong(parts[2]); 499 long attrCount = Long.parseLong(parts[3]); 500 long attrLen = Long.parseLong(parts[4]); 501 int lastSlash = file.lastIndexOf("/"); 502 String key2 = file; 503 String path = file.substring(0, lastSlash); 504 String key = file.substring(lastSlash + 1); 505 if (countryLocale.reset(key).matches()) { 506 String lang = countryLocale.group(1); 507 String script = countryLocale.group(2); 508 String country = countryLocale.group(3); 509 String writtenLang = lang + (script == null ? "" : "_" + script); 510 String locale = writtenLang + (country == null ? "" : "_" + country); 511 if (captureData) { 512 localesToPaths.put(locale, path); 513 writtenLanguages.add(writtenLang); 514 if (country != null) { 515 countries.add(country); 516 } 517 } 518 // System.out.println(key + " => " + newKey); 519 //key = writtenLang + "—" + ULocale.getDisplayName(writtenLang, "en"); 520 } 521 if (valueCount + attrCount == 0) continue; 522 release_keys.put(releaseNum, key2); 523 R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count 524 .get(key2); 525 if (release_count == null) { 526 release_count = Row.of(new Counter<String>(), new Counter<String>(), new Counter<String>(), 527 new Counter<String>()); 528 key_release_count.put(key2, release_count); 529 } 530 release_count.get0().add(releaseNum, valueCount); 531 release_count.get1().add(releaseNum, valueLen); 532 release_count.get2().add(releaseNum, attrCount); 533 release_count.get3().add(releaseNum, attrLen); 534 } catch (Exception e) { 535 throw new IllegalArgumentException(line, e); 536 } 537 } 538 in.close(); 539 } 540 PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "summary" + 541 ".txt"); 542 for (String file : releases) { 543 summary.print("\t" + file + "\tlen"); 544 } 545 summary.println(); 546 for (String key : key_release_count.keySet()) { 547 summary.print(key); 548 R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count 549 .get(key); 550 for (String release2 : releases) { 551 long count = release_count.get0().get(release2) + release_count.get2().get(release2); 552 long len = release_count.get1().get(release2) + release_count.get3().get(release2); 553 summary.print("\t" + count + "\t" + len); 554 } 555 summary.println(); 556 } 557 for (String release : release_keys.keySet()) { 558 System.out.println("Release:\t" + release + "\t" + release_keys.getAll(release).size()); 559 } 560 summary.close(); 561 PrintWriter summary2 = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "locales" + 562 ".txt"); 563 summary2.println("#Languages (inc. script):\t" + writtenLanguages.size()); 564 summary2.println("#Countries:\t" + countries.size()); 565 summary2.println("#Locales:\t" + localesToPaths.size()); 566 for (Entry<String, Set<String>> entry : localesToPaths.keyValuesSet()) { 567 summary2.println(entry.getKey() + "\t" + Joiner.on("\t").join(entry.getValue())); 568 } 569 summary2.close(); 570 } 571 572 static final Set<String> ATTRIBUTES_TO_SKIP = Builder.with(new HashSet<String>()) 573 .addAll("version", "references", "standard", "draft").freeze(); 574 static final Pattern skipPath = PatternCache.get("" + 575 "\\[\\@alt=\"[^\"]*proposed" + 576 "|^//" + 577 "(ldml(\\[[^/]*)?/identity" + 578 "|(ldmlBCP47|supplementalData|keyboard)(\\[[^/]*)?/(generation|version)" + 579 ")"); 580 capture(DtdType type2, XPathParts parts)581 static void capture(DtdType type2, XPathParts parts) { 582 for (int i = 0; i < parts.size(); ++i) { 583 String element = parts.getElement(i); 584 ELEMENTS_OCCURRING.put(element, type2); 585 for (String attribute : parts.getAttributes(i).keySet()) { 586 ATTRIBUTES_OCCURRING.put(attribute, Row.of(type2, element)); 587 } 588 } 589 } 590 591 static class MyHandler extends SimpleHandler { 592 long valueCount; 593 long valueLen; 594 long attributeCount; 595 long attributeLen; 596 Matcher skipPathMatcher = skipPath.matcher(""); 597 Splitter lines = Splitter.onPattern("\n+").omitEmptyStrings().trimResults(); 598 String prefix; 599 int orderedCount; 600 DtdType type; 601 private final boolean isFinal; 602 MyHandler(String prefix, boolean isFinal)603 MyHandler(String prefix, boolean isFinal) { 604 this.prefix = prefix; 605 this.isFinal = isFinal; 606 } 607 608 @Override handlePathValue(String path, String value)609 public void handlePathValue(String path, String value) { 610 if (type == null) { 611 XPathParts parts = XPathParts.getFrozenInstance(path); 612 type = DtdType.valueOf(parts.getElement(0)); 613 } 614 615 ATTRIBUTE_TYPES.add(path); 616 617 if (skipPathMatcher.reset(path).find()) { 618 return; 619 } 620 String pathKey = null; 621 if (doChanges) { 622 // if (path.contains("/collations")) { 623 // System.out.println("whoops"); 624 // } 625 pathKey = fixKeyPath(path); 626 } 627 int len = value.length(); 628 value = value.trim(); 629 if (value.isEmpty() && len > 0) { 630 value = " "; 631 } 632 if (value.length() != 0) { 633 List<String> valueLines = lines.splitToList(value); 634 if (valueLines.size() == 1) { 635 valueCount++; 636 valueLen += value.length(); 637 if (doChanges) { 638 path2value.put(pathKey, value); 639 } 640 } else { 641 int count = 0; 642 for (String v : valueLines) { 643 valueCount++; 644 valueLen += v.length(); 645 if (doChanges) { 646 path2value.put(pathKey + "/_q" + count++, v); 647 } 648 } 649 } 650 } 651 XPathParts parts = XPathParts.getFrozenInstance(path); 652 if (isFinal) { 653 capture(type, parts); 654 } 655 if (path.contains("[@")) { 656 int i = parts.size() - 1; // only look at last item 657 Collection<String> attributes = parts.getAttributeKeys(i); 658 if (attributes.size() != 0) { 659 String element = parts.getElement(i); 660 for (String attribute : attributes) { 661 if (ATTRIBUTES_TO_SKIP.contains(attribute) 662 || CLDRFile.isDistinguishing(type, element, attribute)) { 663 continue; 664 } 665 String valuePart = parts.getAttributeValue(i, attribute); 666 // String[] valueParts = attrValue.split("\\s"); 667 // for (String valuePart : valueParts) { 668 attributeCount++; 669 attributeLen += valuePart.length(); 670 if (doChanges) { 671 path2value.put(pathKey + "/_" + attribute, valuePart); 672 // } 673 } 674 } 675 } 676 } 677 } 678 fixKeyPath(String path)679 private String fixKeyPath(String path) { 680 XPathParts parts = XPathParts.getFrozenInstance(path); 681 if (!SKIP_ORDERING) { 682 parts = parts.cloneAsThawed(); 683 } 684 for (int i = 0; i < parts.size(); ++i) { 685 String element = parts.getElement(i); 686 if (!SKIP_ORDERING) { 687 if (CLDRFile.isOrdered(element, type)) { 688 parts.addAttribute("_q", String.valueOf(orderedCount++)); 689 } 690 } 691 } 692 return prefix + CLDRFile.getDistinguishingXPath(parts.toString(), null); 693 } 694 } 695 check(String systemID, String name, boolean isFinal)696 private MyHandler check(String systemID, String name, boolean isFinal) { 697 MyHandler myHandler = new MyHandler(name, isFinal); 698 try { 699 XMLFileReader reader = new XMLFileReader().setHandler(myHandler); 700 reader.read(systemID, XMLFileReader.CONTENT_HANDLER, true); 701 } catch (Exception e) { 702 cantRead.put(name, Arrays.asList(e.getStackTrace())); 703 } 704 return myHandler; 705 706 // try { 707 // FileInputStream fis = new FileInputStream(systemID); 708 // XMLFileReader xmlReader = XMLFileReader.createXMLReader(true); 709 // xmlReader.setErrorHandler(new MyErrorHandler()); 710 // MyHandler myHandler = new MyHandler(); 711 // smlReader 712 // xmlReader.setHandler(myHandler); 713 // InputSource is = new InputSource(fis); 714 // is.setSystemId(systemID.toString()); 715 // xmlReader.parse(is); 716 // fis.close(); 717 // return myHandler; 718 // } catch (SAXParseException e) { 719 // System.out.println("\t" + "Can't read " + systemID); 720 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 721 // } catch (SAXException e) { 722 // System.out.println("\t" + "Can't read " + systemID); 723 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 724 // } catch (IOException e) { 725 // System.out.println("\t" + "Can't read " + systemID); 726 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 727 // } 728 } 729 730 static class MyErrorHandler implements ErrorHandler { 731 @Override error(SAXParseException exception)732 public void error(SAXParseException exception) throws SAXException { 733 System.out.println("\nerror: " + XMLFileReader.showSAX(exception)); 734 throw exception; 735 } 736 737 @Override fatalError(SAXParseException exception)738 public void fatalError(SAXParseException exception) throws SAXException { 739 System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception)); 740 throw exception; 741 } 742 743 @Override warning(SAXParseException exception)744 public void warning(SAXParseException exception) throws SAXException { 745 System.out.println("\nwarning: " + XMLFileReader.showSAX(exception)); 746 throw exception; 747 } 748 } 749 summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal)750 private void summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal) { 751 System.out.println(commonDir); 752 summary.println("#name" + "\t" + "value-count" + "\t" + "value-len" + "\t" + "attr-count" + "\t" + "attr-len"); 753 File commonDirectory = new File(commonDir); 754 if (!commonDirectory.exists()) { 755 System.out.println("Doesn't exist:\t" + commonDirectory); 756 } 757 summarizeFiles(summary, commonDirectory, isFinal, 1); 758 } 759 760 static final Set<String> SKIP_DIRS = new HashSet<>(Arrays.asList("specs", "tools", "seed", "exemplars")); 761 summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level)762 public void summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level) { 763 System.out.println("\t\t\t\t\t\t\t".substring(0, level) + directory); 764 int count = 0; 765 for (File file : directory.listFiles()) { 766 String filename = file.getName(); 767 if (filename.startsWith(".")) { 768 // do nothing 769 } else if (file.isDirectory()) { 770 if (!SKIP_DIRS.contains(filename)) { 771 summarizeFiles(summary, file, isFinal, level + 1); 772 } 773 } else if (!filename.startsWith("#") && filename.endsWith(".xml")) { 774 String name = new File(directory.getParent()).getName() + "/" + directory.getName() + "/" 775 + file.getName(); 776 name = name.substring(0, name.length() - 4); // strip .xml 777 if (!RAW_FILE_MATCHER.reset(name).find()) { 778 continue; 779 } 780 if (VERBOSE) { 781 System.out.println(name); 782 } else { 783 System.out.print("."); 784 if (++count > 100) { 785 count = 0; 786 System.out.println(); 787 } 788 System.out.flush(); 789 } 790 MyHandler handler = check(file.toString(), name, isFinal); 791 summary.println(name + "\t" + handler.valueCount + "\t" + handler.valueLen + "\t" 792 + handler.attributeCount + "\t" + handler.attributeLen); 793 } 794 } 795 System.out.println(); 796 } 797 } 798