• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.PrintWriter;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.Collection;
10 import java.util.Collections;
11 import java.util.HashSet;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Map.Entry;
16 import java.util.Set;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19 import java.util.regex.Matcher;
20 import java.util.regex.Pattern;
21 
22 import org.unicode.cldr.draft.FileUtilities;
23 import org.unicode.cldr.tool.Option.Options;
24 import org.unicode.cldr.util.Builder;
25 import org.unicode.cldr.util.CLDRConfig;
26 import org.unicode.cldr.util.CLDRFile;
27 import org.unicode.cldr.util.CLDRPaths;
28 import org.unicode.cldr.util.CldrUtility;
29 import org.unicode.cldr.util.Counter;
30 import org.unicode.cldr.util.DtdData;
31 import org.unicode.cldr.util.DtdData.Attribute;
32 import org.unicode.cldr.util.DtdData.Element;
33 import org.unicode.cldr.util.DtdType;
34 import org.unicode.cldr.util.PathStarrer;
35 import org.unicode.cldr.util.PathUtilities;
36 import org.unicode.cldr.util.PatternCache;
37 import org.unicode.cldr.util.RegexUtilities;
38 import org.unicode.cldr.util.SupplementalDataInfo;
39 import org.unicode.cldr.util.XMLFileReader;
40 import org.unicode.cldr.util.XMLFileReader.SimpleHandler;
41 import org.unicode.cldr.util.XPathParts;
42 import org.xml.sax.ErrorHandler;
43 import org.xml.sax.SAXException;
44 import org.xml.sax.SAXParseException;
45 
46 import com.google.common.base.Joiner;
47 import com.google.common.base.Splitter;
48 import com.ibm.icu.impl.Relation;
49 import com.ibm.icu.impl.Row;
50 import com.ibm.icu.impl.Row.R2;
51 import com.ibm.icu.impl.Row.R4;
52 import com.ibm.icu.util.VersionInfo;
53 
54 public class GenerateItemCounts {
55     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig.getInstance().getSupplementalDataInfo();
56     private static final boolean SKIP_ORDERING = true;
57     private static final String OUT_DIRECTORY = CLDRPaths.GEN_DIRECTORY + "/itemcount/"; // CldrUtility.MAIN_DIRECTORY;
58     private Map<String, List<StackTraceElement>> cantRead = new TreeMap<>();
59     static {
60         System.err.println("Probably obsolete tool");
61     }
62     private static String[] DIRECTORIES = {
63         // MUST be oldest first!
64         // "cldr-archive/cldr-21.0",
65         // "cldr-24.0",
66         "cldr-27.0",
67         "trunk"
68     };
69 
70     private static String TRUNK_VERSION = "26.0";
71 
72     static boolean doChanges = true;
73     static Relation<String, String> path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
74     static final AttributeTypes ATTRIBUTE_TYPES = new AttributeTypes();
75 
76     final static Options myOptions = new Options();
77 
78     enum MyOptions {
79         summary(null, null, "if present, summarizes data already collected. Run once with, once without."), directory(".*", ".*",
80             "if summary, creates filtered version (eg -d main): does a find in the name, which is of the form dir/file"), verbose(null, null,
81                 "verbose debugging messages"), rawfilter(".*", ".*", "filter the raw files (non-summary, mostly for debugging)"),;
82         // boilerplate
83         final Option option;
84 
MyOptions(String argumentPattern, String defaultArgument, String helpText)85         MyOptions(String argumentPattern, String defaultArgument, String helpText) {
86             option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
87         }
88     }
89 
90     static Matcher DIR_FILE_MATCHER;
91     static Matcher RAW_FILE_MATCHER;
92     static boolean VERBOSE;
93 
main(String[] args)94     public static void main(String[] args) throws IOException {
95         myOptions.parse(MyOptions.directory, args, true);
96 
97         DIR_FILE_MATCHER = PatternCache.get(MyOptions.directory.option.getValue()).matcher("");
98         RAW_FILE_MATCHER = PatternCache.get(MyOptions.rawfilter.option.getValue()).matcher("");
99         VERBOSE = MyOptions.verbose.option.doesOccur();
100 
101         if (MyOptions.summary.option.doesOccur()) {
102             doSummary();
103             System.out.println("DONE");
104             return;
105             // } else if (arg.equals("changes")) {
106             // doChanges = true;
107         } else {
108         }
109         // Pattern dirPattern = dirPattern = PatternCache.get(arg);
110         GenerateItemCounts main = new GenerateItemCounts();
111         try {
112             Relation<String, String> oldPath2value = null;
113             for (String dir : DIRECTORIES) {
114                 // if (dirPattern != null && !dirPattern.matcher(dir).find()) continue;
115                 final String pathname = dir.equals("trunk") ? CLDRPaths.BASE_DIRECTORY
116                     : CLDRPaths.ARCHIVE_DIRECTORY + "/" + dir;
117                 boolean isFinal = dir == DIRECTORIES[DIRECTORIES.length - 1];
118 
119                 String fulldir = PathUtilities.getNormalizedPathString(pathname);
120                 String prefix = (MyOptions.rawfilter.option.doesOccur() ? "filtered_" : "");
121                 String fileKey = dir.replace("/", "_");
122                 try (
123                     PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_count.txt");
124                     PrintWriter changes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes.txt");
125                     PrintWriter changesNew = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_news.txt");
126                     PrintWriter changesDeletes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_deletes.txt");
127                     PrintWriter changesSummary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes_summary.txt");) {
128                     main.summarizeCoverage(summary, fulldir, isFinal);
129                     if (doChanges) {
130                         if (oldPath2value != null) {
131                             compare(summary, changes, changesNew, changesDeletes, changesSummary, oldPath2value, path2value);
132                             checkBadAttributes(path2value, prefix + fileKey + "_dtd_check.txt");
133                         }
134                         oldPath2value = path2value;
135                         path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
136                     }
137                 }
138             }
139             ATTRIBUTE_TYPES.showStarred();
140         } finally {
141             if (main.cantRead.size() != 0) {
142                 System.out.println("Couldn't read:\t");
143                 for (String file : main.cantRead.keySet()) {
144                     System.out.println(file + "\t" + main.cantRead.get(file));
145                 }
146             }
147             System.out.println("DONE");
148         }
149     }
150 
151     static final Set<String> SKIP_ATTRIBUTES = new HashSet<>(Arrays.asList("draft", "references", "validSubLocales"));
152 
153     static final Relation<String, DtdType> ELEMENTS_OCCURRING = Relation.of(new TreeMap(), TreeSet.class);
154     static final Relation<String, DtdType> ELEMENTS_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class);
155     static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_OCCURRING = Relation.of(new TreeMap(), TreeSet.class);
156     static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class);
157 
checkBadAttributes(Relation<String, String> path2value2, String outputFile)158     private static void checkBadAttributes(Relation<String, String> path2value2, String outputFile)
159         throws IOException {
160         // an attribute is misplaced if it is not distinguishing, but is on a non-final node.
161 
162         Set<String> errors = new LinkedHashSet<>();
163 
164         SupplementalDataInfo supp = SUPPLEMENTAL_DATA_INFO;
165         for (DtdType dtdType : DtdType.values()) {
166             if (dtdType == DtdType.ldmlICU) {
167                 continue;
168             }
169             DtdData data = DtdData.getInstance(dtdType);
170             for (Element element : data.getElements()) {
171                 String elementName = element.name;
172                 ELEMENTS_POSSIBLE.put(elementName, dtdType);
173                 final Set<Element> children = element.getChildren().keySet();
174 
175                 boolean skipFinal = children.isEmpty()
176                     || children.size() == 1
177                         && children.iterator().next().name.equals("special");
178 
179                 for (Entry<Attribute, Integer> attributeInt : element.getAttributes().entrySet()) {
180                     Attribute attribute = attributeInt.getKey();
181                     String attributeName = attribute.name;
182                     if (attribute.defaultValue != null) {
183                         errors.add("Warning, default value «" + attribute.defaultValue
184                             + "» for: " + dtdType + "\t" + elementName + "\t" + attributeName);
185                     }
186                     final R2<DtdType, String> attributeRow = Row.of(dtdType, elementName);
187                     ATTRIBUTES_POSSIBLE.put(attributeName, attributeRow);
188                     if (skipFinal || SKIP_ATTRIBUTES.contains(attributeName)) { // don't worry about non-final, references, draft, standard
189                         continue;
190                     }
191                     if (supp.isDeprecated(dtdType, elementName, attributeName, null)) {
192                         continue;
193                     }
194                     if (!CLDRFile.isDistinguishing(dtdType, elementName, attributeName)) {
195                         String doesOccur = "";
196                         final Set<R2<DtdType, String>> attributeRows = ATTRIBUTES_OCCURRING.get(attributeName);
197                         if (attributeRows == null || !attributeRows.contains(attributeRow)) {
198                             doesOccur = "\tNEVER";
199                         }
200                         errors.add("Warning, !disting, !leaf: " + dtdType + "\t" + elementName + "\t" + attributeName + "\t" + children + doesOccur);
201                     }
202                 }
203             }
204         }
205         try (
206             PrintWriter out = FileUtilities.openUTF8Writer(OUT_DIRECTORY, outputFile)) {
207             out.println("\nElements\tDeprecated\tOccurring\tPossible in DTD, but never occurs");
208 
209             for (Entry<String, Set<DtdType>> x : ELEMENTS_POSSIBLE.keyValuesSet()) {
210                 final String element = x.getKey();
211                 if (element.equals("#PCDATA") || element.equals("ANY") || element.equals("generation")) {
212                     continue;
213                 }
214                 final Set<DtdType> possible = x.getValue();
215                 Set<DtdType> deprecated = new TreeSet();
216                 for (DtdType dtdType : possible) {
217                     if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, "*", "*")) {
218                         deprecated.add(dtdType);
219                     }
220                 }
221                 Set<DtdType> notDeprecated = new TreeSet(possible);
222                 notDeprecated.removeAll(deprecated);
223 
224                 Set<DtdType> occurs = CldrUtility.ifNull(ELEMENTS_OCCURRING.get(element), Collections.EMPTY_SET);
225                 Set<DtdType> noOccur = new TreeSet(possible);
226                 noOccur.removeAll(occurs);
227 
228                 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur
229                     final Set<DtdType> intersection = CldrUtility.intersect(deprecated, occurs);
230                     errors.add("Error: element «" + element
231                         + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) +
232                         " but occurs in live data: " + intersection);
233                 }
234                 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning
235                     errors.add("Warning: element «" + element
236                         + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur));
237                 }
238 
239                 out.println(element
240                     + "\t" + deprecated
241                     + "\t" + occurs
242                     + "\t" + noOccur);
243             }
244 
245             out.println("\nAttributes\tDeprecated\tOccurring\tPossible in DTD, but never occurs");
246 
247             for (Entry<String, Set<R2<DtdType, String>>> x : ATTRIBUTES_POSSIBLE.keyValuesSet()) {
248                 final String attribute = x.getKey();
249                 if (attribute.equals("alt") || attribute.equals("draft") || attribute.equals("references")) {
250                     continue;
251                 }
252                 final Set<R2<DtdType, String>> possible = x.getValue();
253                 Set<R2<DtdType, String>> deprecated = new TreeSet();
254                 for (R2<DtdType, String> s : possible) {
255                     final DtdType dtdType = s.get0();
256                     final String element = s.get1();
257                     if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, attribute, "*")) {
258                         deprecated.add(s);
259                     }
260                 }
261                 Set<R2<DtdType, String>> notDeprecated = new TreeSet(possible);
262                 notDeprecated.removeAll(deprecated);
263 
264                 Set<R2<DtdType, String>> occurs = CldrUtility.ifNull(ATTRIBUTES_OCCURRING.get(attribute), Collections.EMPTY_SET);
265                 Set<R2<DtdType, String>> noOccur = new TreeSet(possible);
266                 noOccur.removeAll(occurs);
267 
268                 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur
269                     final Set<R2<DtdType, String>> intersection = CldrUtility.intersect(deprecated, occurs);
270                     errors.add("Error: attribute «" + attribute
271                         + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) +
272                         " but occurs in live data: " + intersection);
273                 }
274                 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning
275                     errors.add("Warning: attribute «" + attribute
276                         + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur));
277                 }
278                 out.println(attribute
279                     + "\t" + deprecated
280                     + "\t" + occurs
281                     + "\t" + noOccur);
282             }
283             out.println("\nERRORS/WARNINGS");
284             out.println(Joiner.on("\n").join(errors));
285         }
286     }
287 
288     static class AttributeTypes {
289         Relation<String, String> elementPathToAttributes = Relation.of(new TreeMap<String, Set<String>>(),
290             TreeSet.class);
291         final PathStarrer PATH_STARRER = new PathStarrer().setSubstitutionPattern("*");
292         final Set<String> STARRED_PATHS = new TreeSet<>();
293         StringBuilder elementPath = new StringBuilder();
294 
add(String path)295         public void add(String path) {
296             XPathParts parts = XPathParts.getFrozenInstance(path);
297             elementPath.setLength(0);
298             for (int i = 0; i < parts.size(); ++i) {
299                 String element = parts.getElement(i);
300                 elementPath.append('/').append(element);
301                 elementPathToAttributes.putAll(elementPath.toString().intern(), parts.getAttributeKeys(i));
302             }
303         }
304 
showStarred()305         public void showStarred() throws IOException {
306             PrintWriter starred = FileUtilities.openUTF8Writer(OUT_DIRECTORY, "starred" + ".txt");
307 
308             for (Entry<String, Set<String>> entry : elementPathToAttributes.keyValuesSet()) {
309                 Set<String> attributes = entry.getValue();
310                 if (attributes.size() == 0) {
311                     continue;
312                 }
313                 String path = entry.getKey();
314                 String[] elements = path.split("/");
315                 DtdType type = DtdType.valueOf(elements[1]);
316                 String finalElement = elements[elements.length - 1];
317                 starred.print(path);
318                 for (String attribute : attributes) {
319                     if (CLDRFile.isDistinguishing(type, finalElement, attribute)) {
320                         starred.print("[@" + attribute + "='disting.']");
321                     } else {
322                         starred.print("[@" + attribute + "='DATA']");
323                     }
324                 }
325                 starred.println();
326             }
327             starred.close();
328         }
329     }
330 
331     static Pattern prefix = PatternCache.get("([^/]+/[^/]+)(.*)");
332 
333     static class Delta {
334         Counter<String> newCount = new Counter<>();
335         Counter<String> deletedCount = new Counter<>();
336         Counter<String> changedCount = new Counter<>();
337         Counter<String> unchangedCount = new Counter<>();
338 
print(PrintWriter changesSummary, Set<String> prefixes)339         void print(PrintWriter changesSummary, Set<String> prefixes) {
340             changesSummary.println("Total"
341                 + "\t" + unchangedCount.getTotal()
342                 + "\t" + deletedCount.getTotal()
343                 + "\t" + changedCount.getTotal()
344                 + "\t" + newCount.getTotal());
345             changesSummary.println("Directory\tSame\tRemoved\tChanged\tAdded");
346             for (String prefix : prefixes) {
347                 changesSummary.println(prefix
348                     + "\t" + unchangedCount.get(prefix)
349                     + "\t" + deletedCount.get(prefix)
350                     + "\t" + changedCount.get(prefix)
351                     + "\t" + newCount.get(prefix));
352             }
353         }
354     }
355 
compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew, PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, Relation<String, String> path2value2)356     private static void compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew,
357         PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value,
358         Relation<String, String> path2value2) {
359         Set<String> union = Builder.with(new TreeSet<String>()).addAll(oldPath2value.keySet())
360             .addAll(path2value2.keySet()).get();
361         long total = 0;
362         Matcher prefixMatcher = prefix.matcher("");
363         Delta charCount = new Delta();
364         Delta itemCount = new Delta();
365         Set<String> prefixes = new TreeSet();
366         for (String path : union) {
367             if (!prefixMatcher.reset(path).find()) {
368                 throw new IllegalArgumentException();
369             }
370             String prefix = prefixMatcher.group(1);
371             prefixes.add(prefix);
372             String localPath = prefixMatcher.group(2);
373             Set<String> set1 = oldPath2value.getAll(path);
374             Set<String> set2 = path2value2.getAll(path);
375             if (set2 != null) {
376                 total += set2.size();
377             }
378             if (set1 == null) {
379                 changesNew.println(prefix + "\t" + "\t" + set2 + "\t" + localPath);
380                 itemCount.newCount.add(prefix, set2.size());
381                 charCount.newCount.add(prefix, totalLength(set2));
382             } else if (set2 == null) {
383                 changesDeletes.println(prefix + "\t" + set1 + "\t\t" + localPath);
384                 itemCount.deletedCount.add(prefix, -set1.size());
385                 charCount.deletedCount.add(prefix, -totalLength(set1));
386             } else if (!set1.equals(set2)) {
387                 TreeSet<String> set1minus2 = Builder.with(new TreeSet<String>()).addAll(set1).removeAll(set2).get();
388                 TreeSet<String> set2minus1 = Builder.with(new TreeSet<String>()).addAll(set2).removeAll(set1).get();
389                 TreeSet<String> set2and1 = Builder.with(new TreeSet<String>()).addAll(set2).retainAll(set1).get();
390                 itemCount.changedCount.add(prefix, (set2minus1.size() + set1minus2.size() + 1) / 2);
391                 itemCount.unchangedCount.add(prefix, set2and1.size());
392                 charCount.changedCount.add(prefix, (totalLength(set2minus1) + totalLength(set1minus2) + 1) / 2);
393                 charCount.unchangedCount.add(prefix, totalLength(set2and1));
394                 changes.println(prefix + "\t" + set1minus2
395                     + "\t"
396                     + set2minus1
397                     + "\t" + localPath);
398             } else {
399                 itemCount.unchangedCount.add(prefix, set2.size());
400                 charCount.unchangedCount.add(prefix, totalLength(set2));
401             }
402         }
403         itemCount.print(changesSummary, prefixes);
404         changesSummary.println();
405         charCount.print(changesSummary, prefixes);
406 //        union = Builder.with(new TreeSet<String>())
407 //            .addAll(newCount.keySet())
408 //            .addAll(deletedCount.keySet())
409 //            .addAll(changedCount.keySet())
410 //            .addAll(unchangedCount.keySet())
411 //            .get();
412         summary.println("#Total:\t" + total);
413     }
414 
totalLength(Set<String> set2)415     private static long totalLength(Set<String> set2) {
416         int result = 0;
417         for (String s : set2) {
418             result += s.length();
419         }
420         return result;
421     }
422 
423     final static Pattern LOCALE_PATTERN = PatternCache.get(
424         "([a-z]{2,3})(?:[_-]([A-Z][a-z]{3}))?(?:[_-]([a-zA-Z0-9]{2,3}))?([_-][a-zA-Z0-9]{1,8})*");
425 
doSummary()426     public static void doSummary() throws IOException {
427         Map<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>> key_release_count = new TreeMap<>();
428         Matcher countryLocale = LOCALE_PATTERN.matcher("");
429         List<String> releases = new ArrayList<>();
430         Pattern releaseNumber = PatternCache.get("count_(?:.*-(\\d+(\\.\\d+)*)|trunk)\\.txt");
431         // int releaseCount = 1;
432         Relation<String, String> release_keys = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
433         Relation<String, String> localesToPaths = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
434         Set<String> writtenLanguages = new TreeSet<>();
435         Set<String> countries = new TreeSet<>();
436 
437         File[] listFiles = new File(OUT_DIRECTORY).listFiles();
438         // find the most recent version
439         VersionInfo mostRecentVersion = VersionInfo.getInstance(0);
440         for (File subdir : listFiles) {
441             final String name = subdir.getName();
442             final Matcher releaseMatcher = releaseNumber.matcher(name);
443             if (!releaseMatcher.matches()) {
444                 if (name.startsWith("count_")) {
445                     throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name));
446                 }
447                 continue;
448             }
449             String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++;
450             if (releaseNum == null) {
451                 releaseNum = TRUNK_VERSION;
452             }
453             VersionInfo vi = VersionInfo.getInstance(releaseNum);
454             if (vi.compareTo(mostRecentVersion) > 0) {
455                 mostRecentVersion = vi;
456             }
457         }
458 
459         for (File subdir : listFiles) {
460             final String name = subdir.getName();
461             final Matcher releaseMatcher = releaseNumber.matcher(name);
462             if (!releaseMatcher.matches()) {
463                 if (name.startsWith("count_")) {
464                     throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name));
465                 }
466                 continue;
467             }
468             String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++;
469             if (releaseNum == null) {
470                 releaseNum = TRUNK_VERSION;
471             }
472             VersionInfo vi = VersionInfo.getInstance(releaseNum);
473             boolean captureData = vi.equals(mostRecentVersion);
474             releases.add(releaseNum);
475             BufferedReader in = FileUtilities.openUTF8Reader("", PathUtilities.getNormalizedPathString(subdir));
476             while (true) {
477                 String line = in.readLine();
478                 if (line == null) break;
479                 line = line.trim();
480                 if (line.startsWith("#")) {
481                     continue;
482                 }
483                 // common/main  New:        [Yellowknife]   /gl//ldml/dates/timeZoneNames/zone[@type="America/Yellowknife"]/exemplarCity
484 
485                 String[] parts = line.split("\t");
486                 try {
487                     String file = parts[0];
488                     if (file.startsWith("seed/") || !DIR_FILE_MATCHER.reset(file).find()) {
489                         if (VERBOSE) {
490                             System.out.println("Skipping: " + RegexUtilities.showMismatch(DIR_FILE_MATCHER, file));
491                         }
492                         continue;
493                     } else if (VERBOSE) {
494                         System.out.println("Including: " + file);
495                     }
496 
497                     long valueCount = Long.parseLong(parts[1]);
498                     long valueLen = Long.parseLong(parts[2]);
499                     long attrCount = Long.parseLong(parts[3]);
500                     long attrLen = Long.parseLong(parts[4]);
501                     int lastSlash = file.lastIndexOf("/");
502                     String key2 = file;
503                     String path = file.substring(0, lastSlash);
504                     String key = file.substring(lastSlash + 1);
505                     if (countryLocale.reset(key).matches()) {
506                         String lang = countryLocale.group(1);
507                         String script = countryLocale.group(2);
508                         String country = countryLocale.group(3);
509                         String writtenLang = lang + (script == null ? "" : "_" + script);
510                         String locale = writtenLang + (country == null ? "" : "_" + country);
511                         if (captureData) {
512                             localesToPaths.put(locale, path);
513                             writtenLanguages.add(writtenLang);
514                             if (country != null) {
515                                 countries.add(country);
516                             }
517                         }
518                         // System.out.println(key + " => " + newKey);
519                         //key = writtenLang + "—" + ULocale.getDisplayName(writtenLang, "en");
520                     }
521                     if (valueCount + attrCount == 0) continue;
522                     release_keys.put(releaseNum, key2);
523                     R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count
524                         .get(key2);
525                     if (release_count == null) {
526                         release_count = Row.of(new Counter<String>(), new Counter<String>(), new Counter<String>(),
527                             new Counter<String>());
528                         key_release_count.put(key2, release_count);
529                     }
530                     release_count.get0().add(releaseNum, valueCount);
531                     release_count.get1().add(releaseNum, valueLen);
532                     release_count.get2().add(releaseNum, attrCount);
533                     release_count.get3().add(releaseNum, attrLen);
534                 } catch (Exception e) {
535                     throw new IllegalArgumentException(line, e);
536                 }
537             }
538             in.close();
539         }
540         PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "summary" +
541             ".txt");
542         for (String file : releases) {
543             summary.print("\t" + file + "\tlen");
544         }
545         summary.println();
546         for (String key : key_release_count.keySet()) {
547             summary.print(key);
548             R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count
549                 .get(key);
550             for (String release2 : releases) {
551                 long count = release_count.get0().get(release2) + release_count.get2().get(release2);
552                 long len = release_count.get1().get(release2) + release_count.get3().get(release2);
553                 summary.print("\t" + count + "\t" + len);
554             }
555             summary.println();
556         }
557         for (String release : release_keys.keySet()) {
558             System.out.println("Release:\t" + release + "\t" + release_keys.getAll(release).size());
559         }
560         summary.close();
561         PrintWriter summary2 = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "locales" +
562             ".txt");
563         summary2.println("#Languages (inc. script):\t" + writtenLanguages.size());
564         summary2.println("#Countries:\t" + countries.size());
565         summary2.println("#Locales:\t" + localesToPaths.size());
566         for (Entry<String, Set<String>> entry : localesToPaths.keyValuesSet()) {
567             summary2.println(entry.getKey() + "\t" + Joiner.on("\t").join(entry.getValue()));
568         }
569         summary2.close();
570     }
571 
572     static final Set<String> ATTRIBUTES_TO_SKIP = Builder.with(new HashSet<String>())
573         .addAll("version", "references", "standard", "draft").freeze();
574     static final Pattern skipPath = PatternCache.get("" +
575         "\\[\\@alt=\"[^\"]*proposed" +
576         "|^//" +
577         "(ldml(\\[[^/]*)?/identity" +
578         "|(ldmlBCP47|supplementalData|keyboard)(\\[[^/]*)?/(generation|version)" +
579         ")");
580 
capture(DtdType type2, XPathParts parts)581     static void capture(DtdType type2, XPathParts parts) {
582         for (int i = 0; i < parts.size(); ++i) {
583             String element = parts.getElement(i);
584             ELEMENTS_OCCURRING.put(element, type2);
585             for (String attribute : parts.getAttributes(i).keySet()) {
586                 ATTRIBUTES_OCCURRING.put(attribute, Row.of(type2, element));
587             }
588         }
589     }
590 
591     static class MyHandler extends SimpleHandler {
592         long valueCount;
593         long valueLen;
594         long attributeCount;
595         long attributeLen;
596         Matcher skipPathMatcher = skipPath.matcher("");
597         Splitter lines = Splitter.onPattern("\n+").omitEmptyStrings().trimResults();
598         String prefix;
599         int orderedCount;
600         DtdType type;
601         private final boolean isFinal;
602 
MyHandler(String prefix, boolean isFinal)603         MyHandler(String prefix, boolean isFinal) {
604             this.prefix = prefix;
605             this.isFinal = isFinal;
606         }
607 
608         @Override
handlePathValue(String path, String value)609         public void handlePathValue(String path, String value) {
610             if (type == null) {
611                 XPathParts parts = XPathParts.getFrozenInstance(path);
612                 type = DtdType.valueOf(parts.getElement(0));
613             }
614 
615             ATTRIBUTE_TYPES.add(path);
616 
617             if (skipPathMatcher.reset(path).find()) {
618                 return;
619             }
620             String pathKey = null;
621             if (doChanges) {
622                 // if (path.contains("/collations")) {
623                 // System.out.println("whoops");
624                 // }
625                 pathKey = fixKeyPath(path);
626             }
627             int len = value.length();
628             value = value.trim();
629             if (value.isEmpty() && len > 0) {
630                 value = " ";
631             }
632             if (value.length() != 0) {
633                 List<String> valueLines = lines.splitToList(value);
634                 if (valueLines.size() == 1) {
635                     valueCount++;
636                     valueLen += value.length();
637                     if (doChanges) {
638                         path2value.put(pathKey, value);
639                     }
640                 } else {
641                     int count = 0;
642                     for (String v : valueLines) {
643                         valueCount++;
644                         valueLen += v.length();
645                         if (doChanges) {
646                             path2value.put(pathKey + "/_q" + count++, v);
647                         }
648                     }
649                 }
650             }
651             XPathParts parts = XPathParts.getFrozenInstance(path);
652             if (isFinal) {
653                 capture(type, parts);
654             }
655             if (path.contains("[@")) {
656                 int i = parts.size() - 1; // only look at last item
657                 Collection<String> attributes = parts.getAttributeKeys(i);
658                 if (attributes.size() != 0) {
659                     String element = parts.getElement(i);
660                     for (String attribute : attributes) {
661                         if (ATTRIBUTES_TO_SKIP.contains(attribute)
662                             || CLDRFile.isDistinguishing(type, element, attribute)) {
663                             continue;
664                         }
665                         String valuePart = parts.getAttributeValue(i, attribute);
666                         // String[] valueParts = attrValue.split("\\s");
667                         // for (String valuePart : valueParts) {
668                         attributeCount++;
669                         attributeLen += valuePart.length();
670                         if (doChanges) {
671                             path2value.put(pathKey + "/_" + attribute, valuePart);
672                             // }
673                         }
674                     }
675                 }
676             }
677         }
678 
fixKeyPath(String path)679         private String fixKeyPath(String path) {
680             XPathParts parts = XPathParts.getFrozenInstance(path);
681             if (!SKIP_ORDERING) {
682                 parts = parts.cloneAsThawed();
683             }
684             for (int i = 0; i < parts.size(); ++i) {
685                 String element = parts.getElement(i);
686                 if (!SKIP_ORDERING) {
687                     if (CLDRFile.isOrdered(element, type)) {
688                         parts.addAttribute("_q", String.valueOf(orderedCount++));
689                     }
690                 }
691             }
692             return prefix + CLDRFile.getDistinguishingXPath(parts.toString(), null);
693         }
694     }
695 
check(String systemID, String name, boolean isFinal)696     private MyHandler check(String systemID, String name, boolean isFinal) {
697         MyHandler myHandler = new MyHandler(name, isFinal);
698         try {
699             XMLFileReader reader = new XMLFileReader().setHandler(myHandler);
700             reader.read(systemID, XMLFileReader.CONTENT_HANDLER, true);
701         } catch (Exception e) {
702             cantRead.put(name, Arrays.asList(e.getStackTrace()));
703         }
704         return myHandler;
705 
706         // try {
707         // FileInputStream fis = new FileInputStream(systemID);
708         // XMLFileReader xmlReader = XMLFileReader.createXMLReader(true);
709         // xmlReader.setErrorHandler(new MyErrorHandler());
710         // MyHandler myHandler = new MyHandler();
711         // smlReader
712         // xmlReader.setHandler(myHandler);
713         // InputSource is = new InputSource(fis);
714         // is.setSystemId(systemID.toString());
715         // xmlReader.parse(is);
716         // fis.close();
717         // return myHandler;
718         // } catch (SAXParseException e) {
719         // System.out.println("\t" + "Can't read " + systemID);
720         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
721         // } catch (SAXException e) {
722         // System.out.println("\t" + "Can't read " + systemID);
723         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
724         // } catch (IOException e) {
725         // System.out.println("\t" + "Can't read " + systemID);
726         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
727         // }
728     }
729 
730     static class MyErrorHandler implements ErrorHandler {
731         @Override
error(SAXParseException exception)732         public void error(SAXParseException exception) throws SAXException {
733             System.out.println("\nerror: " + XMLFileReader.showSAX(exception));
734             throw exception;
735         }
736 
737         @Override
fatalError(SAXParseException exception)738         public void fatalError(SAXParseException exception) throws SAXException {
739             System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception));
740             throw exception;
741         }
742 
743         @Override
warning(SAXParseException exception)744         public void warning(SAXParseException exception) throws SAXException {
745             System.out.println("\nwarning: " + XMLFileReader.showSAX(exception));
746             throw exception;
747         }
748     }
749 
summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal)750     private void summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal) {
751         System.out.println(commonDir);
752         summary.println("#name" + "\t" + "value-count" + "\t" + "value-len" + "\t" + "attr-count" + "\t" + "attr-len");
753         File commonDirectory = new File(commonDir);
754         if (!commonDirectory.exists()) {
755             System.out.println("Doesn't exist:\t" + commonDirectory);
756         }
757         summarizeFiles(summary, commonDirectory, isFinal, 1);
758     }
759 
760     static final Set<String> SKIP_DIRS = new HashSet<>(Arrays.asList("specs", "tools", "seed", "exemplars"));
761 
summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level)762     public void summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level) {
763         System.out.println("\t\t\t\t\t\t\t".substring(0, level) + directory);
764         int count = 0;
765         for (File file : directory.listFiles()) {
766             String filename = file.getName();
767             if (filename.startsWith(".")) {
768                 // do nothing
769             } else if (file.isDirectory()) {
770                 if (!SKIP_DIRS.contains(filename)) {
771                     summarizeFiles(summary, file, isFinal, level + 1);
772                 }
773             } else if (!filename.startsWith("#") && filename.endsWith(".xml")) {
774                 String name = new File(directory.getParent()).getName() + "/" + directory.getName() + "/"
775                     + file.getName();
776                 name = name.substring(0, name.length() - 4); // strip .xml
777                 if (!RAW_FILE_MATCHER.reset(name).find()) {
778                     continue;
779                 }
780                 if (VERBOSE) {
781                     System.out.println(name);
782                 } else {
783                     System.out.print(".");
784                     if (++count > 100) {
785                         count = 0;
786                         System.out.println();
787                     }
788                     System.out.flush();
789                 }
790                 MyHandler handler = check(file.toString(), name, isFinal);
791                 summary.println(name + "\t" + handler.valueCount + "\t" + handler.valueLen + "\t"
792                     + handler.attributeCount + "\t" + handler.attributeLen);
793             }
794         }
795         System.out.println();
796     }
797 }
798