1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.PrintWriter;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.Collection;
10 import java.util.Collections;
11 import java.util.HashSet;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Map.Entry;
16 import java.util.Set;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19 import java.util.regex.Matcher;
20 import java.util.regex.Pattern;
21 
22 import org.unicode.cldr.draft.FileUtilities;
23 import org.unicode.cldr.tool.Option.Options;
24 import org.unicode.cldr.util.Builder;
25 import org.unicode.cldr.util.CLDRConfig;
26 import org.unicode.cldr.util.CLDRFile;
27 import org.unicode.cldr.util.CLDRPaths;
28 import org.unicode.cldr.util.CldrUtility;
29 import org.unicode.cldr.util.Counter;
30 import org.unicode.cldr.util.DtdData;
31 import org.unicode.cldr.util.DtdData.Attribute;
32 import org.unicode.cldr.util.DtdData.Element;
33 import org.unicode.cldr.util.DtdType;
34 import org.unicode.cldr.util.PathStarrer;
35 import org.unicode.cldr.util.PatternCache;
36 import org.unicode.cldr.util.RegexUtilities;
37 import org.unicode.cldr.util.SupplementalDataInfo;
38 import org.unicode.cldr.util.XMLFileReader;
39 import org.unicode.cldr.util.XMLFileReader.SimpleHandler;
40 import org.unicode.cldr.util.XPathParts;
41 import org.xml.sax.ErrorHandler;
42 import org.xml.sax.SAXException;
43 import org.xml.sax.SAXParseException;
44 
45 import com.google.common.base.Splitter;
46 import com.ibm.icu.dev.util.CollectionUtilities;
47 import com.ibm.icu.impl.Relation;
48 import com.ibm.icu.impl.Row;
49 import com.ibm.icu.impl.Row.R2;
50 import com.ibm.icu.impl.Row.R4;
51 import com.ibm.icu.util.VersionInfo;
52 
53 public class GenerateItemCounts {
54     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig.getInstance().getSupplementalDataInfo();
55     private static final boolean SKIP_ORDERING = true;
56     private static final String OUT_DIRECTORY = CLDRPaths.GEN_DIRECTORY + "/itemcount/"; // CldrUtility.MAIN_DIRECTORY;
57     private Map<String, List<StackTraceElement>> cantRead = new TreeMap<String, List<StackTraceElement>>();
58     static {
59         System.err.println("Probably obsolete tool");
60     }
61     private static String[] DIRECTORIES = {
62         // MUST be oldest first!
63         // "cldr-archive/cldr-21.0",
64         // "cldr-24.0",
65         "cldr-27.0",
66         "trunk"
67     };
68 
69     private static String TRUNK_VERSION = "26.0";
70 
71     static boolean doChanges = true;
72     static Relation<String, String> path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
73     static final AttributeTypes ATTRIBUTE_TYPES = new AttributeTypes();
74 
75     final static Options myOptions = new Options();
76 
77     enum MyOptions {
78         summary(null, null, "if present, summarizes data already collected. Run once with, once without."), directory(".*", ".*",
79             "if summary, creates filtered version (eg -d main): does a find in the name, which is of the form dir/file"), verbose(null, null,
80                 "verbose debugging messages"), rawfilter(".*", ".*", "filter the raw files (non-summary, mostly for debugging)"),;
81         // boilerplate
82         final Option option;
83 
MyOptions(String argumentPattern, String defaultArgument, String helpText)84         MyOptions(String argumentPattern, String defaultArgument, String helpText) {
85             option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
86         }
87     }
88 
89     static Matcher DIR_FILE_MATCHER;
90     static Matcher RAW_FILE_MATCHER;
91     static boolean VERBOSE;
92 
main(String[] args)93     public static void main(String[] args) throws IOException {
94         myOptions.parse(MyOptions.directory, args, true);
95 
96         DIR_FILE_MATCHER = PatternCache.get(MyOptions.directory.option.getValue()).matcher("");
97         RAW_FILE_MATCHER = PatternCache.get(MyOptions.rawfilter.option.getValue()).matcher("");
98         VERBOSE = MyOptions.verbose.option.doesOccur();
99 
100         if (MyOptions.summary.option.doesOccur()) {
101             doSummary();
102             System.out.println("DONE");
103             return;
104             // } else if (arg.equals("changes")) {
105             // doChanges = true;
106         } else {
107         }
108         // Pattern dirPattern = dirPattern = PatternCache.get(arg);
109         GenerateItemCounts main = new GenerateItemCounts();
110         try {
111             Relation<String, String> oldPath2value = null;
112             for (String dir : DIRECTORIES) {
113                 // if (dirPattern != null && !dirPattern.matcher(dir).find()) continue;
114                 final String pathname = dir.equals("trunk") ? CLDRPaths.BASE_DIRECTORY
115                     : CLDRPaths.ARCHIVE_DIRECTORY + "/" + dir;
116                 boolean isFinal = dir == DIRECTORIES[DIRECTORIES.length - 1];
117 
118                 String fulldir = new File(pathname).getCanonicalPath();
119                 String prefix = (MyOptions.rawfilter.option.doesOccur() ? "filtered_" : "");
120                 String fileKey = dir.replace("/", "_");
121                 try (
122                     PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_count.txt");
123                     PrintWriter changes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes.txt");
124                     PrintWriter changesNew = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_news.txt");
125                     PrintWriter changesDeletes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_deletes.txt");
126                     PrintWriter changesSummary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes_summary.txt");) {
127                     main.summarizeCoverage(summary, fulldir, isFinal);
128                     if (doChanges) {
129                         if (oldPath2value != null) {
130                             compare(summary, changes, changesNew, changesDeletes, changesSummary, oldPath2value, path2value);
131                             checkBadAttributes(path2value, prefix + fileKey + "_dtd_check.txt");
132                         }
133                         oldPath2value = path2value;
134                         path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
135                     }
136                 }
137             }
138             ATTRIBUTE_TYPES.showStarred();
139         } finally {
140             if (main.cantRead.size() != 0) {
141                 System.out.println("Couldn't read:\t");
142                 for (String file : main.cantRead.keySet()) {
143                     System.out.println(file + "\t" + main.cantRead.get(file));
144                 }
145             }
146             System.out.println("DONE");
147         }
148     }
149 
150     static final Set<String> SKIP_ATTRIBUTES = new HashSet<>(Arrays.asList("draft", "references", "validSubLocales"));
151 
152     static final Relation<String, DtdType> ELEMENTS_OCCURRING = Relation.of(new TreeMap(), TreeSet.class);
153     static final Relation<String, DtdType> ELEMENTS_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class);
154     static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_OCCURRING = Relation.of(new TreeMap(), TreeSet.class);
155     static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class);
156 
checkBadAttributes(Relation<String, String> path2value2, String outputFile)157     private static void checkBadAttributes(Relation<String, String> path2value2, String outputFile)
158         throws IOException {
159         // an attribute is misplaced if it is not distinguishing, but is on a non-final node.
160 
161         Set<String> errors = new LinkedHashSet<>();
162 
163         SupplementalDataInfo supp = SUPPLEMENTAL_DATA_INFO;
164         for (DtdType dtdType : DtdType.values()) {
165             if (dtdType == DtdType.ldmlICU) {
166                 continue;
167             }
168             DtdData data = DtdData.getInstance(dtdType);
169             for (Element element : data.getElements()) {
170                 String elementName = element.name;
171                 ELEMENTS_POSSIBLE.put(elementName, dtdType);
172                 final Set<Element> children = element.getChildren().keySet();
173 
174                 boolean skipFinal = children.isEmpty()
175                     || children.size() == 1
176                         && children.iterator().next().name.equals("special");
177 
178                 for (Entry<Attribute, Integer> attributeInt : element.getAttributes().entrySet()) {
179                     Attribute attribute = attributeInt.getKey();
180                     String attributeName = attribute.name;
181                     if (attribute.defaultValue != null) {
182                         errors.add("Warning, default value «" + attribute.defaultValue
183                             + "» for: " + dtdType + "\t" + elementName + "\t" + attributeName);
184                     }
185                     final R2<DtdType, String> attributeRow = Row.of(dtdType, elementName);
186                     ATTRIBUTES_POSSIBLE.put(attributeName, attributeRow);
187                     if (skipFinal || SKIP_ATTRIBUTES.contains(attributeName)) { // don't worry about non-final, references, draft, standard
188                         continue;
189                     }
190                     if (supp.isDeprecated(dtdType, elementName, attributeName, null)) {
191                         continue;
192                     }
193                     if (!CLDRFile.isDistinguishing(dtdType, elementName, attributeName)) {
194                         String doesOccur = "";
195                         final Set<R2<DtdType, String>> attributeRows = ATTRIBUTES_OCCURRING.get(attributeName);
196                         if (attributeRows == null || !attributeRows.contains(attributeRow)) {
197                             doesOccur = "\tNEVER";
198                         }
199                         errors.add("Warning, !disting, !leaf: " + dtdType + "\t" + elementName + "\t" + attributeName + "\t" + children + doesOccur);
200                     }
201                 }
202             }
203         }
204         try (
205             PrintWriter out = FileUtilities.openUTF8Writer(OUT_DIRECTORY, outputFile)) {
206             out.println("\nElements\tDeprecated\tOccurring\tPossible in DTD, but never occurs");
207 
208             for (Entry<String, Set<DtdType>> x : ELEMENTS_POSSIBLE.keyValuesSet()) {
209                 final String element = x.getKey();
210                 if (element.equals("#PCDATA") || element.equals("ANY") || element.equals("generation")) {
211                     continue;
212                 }
213                 final Set<DtdType> possible = x.getValue();
214                 Set<DtdType> deprecated = new TreeSet();
215                 for (DtdType dtdType : possible) {
216                     if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, "*", "*")) {
217                         deprecated.add(dtdType);
218                     }
219                 }
220                 Set<DtdType> notDeprecated = new TreeSet(possible);
221                 notDeprecated.removeAll(deprecated);
222 
223                 Set<DtdType> occurs = CldrUtility.ifNull(ELEMENTS_OCCURRING.get(element), Collections.EMPTY_SET);
224                 Set<DtdType> noOccur = new TreeSet(possible);
225                 noOccur.removeAll(occurs);
226 
227                 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur
228                     final Set<DtdType> intersection = CldrUtility.intersect(deprecated, occurs);
229                     errors.add("Error: element «" + element
230                         + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) +
231                         " but occurs in live data: " + intersection);
232                 }
233                 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning
234                     errors.add("Warning: element «" + element
235                         + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur));
236                 }
237 
238                 out.println(element
239                     + "\t" + deprecated
240                     + "\t" + occurs
241                     + "\t" + noOccur);
242             }
243 
244             out.println("\nAttributes\tDeprecated\tOccurring\tPossible in DTD, but never occurs");
245 
246             for (Entry<String, Set<R2<DtdType, String>>> x : ATTRIBUTES_POSSIBLE.keyValuesSet()) {
247                 final String attribute = x.getKey();
248                 if (attribute.equals("alt") || attribute.equals("draft") || attribute.equals("references")) {
249                     continue;
250                 }
251                 final Set<R2<DtdType, String>> possible = x.getValue();
252                 Set<R2<DtdType, String>> deprecated = new TreeSet();
253                 for (R2<DtdType, String> s : possible) {
254                     final DtdType dtdType = s.get0();
255                     final String element = s.get1();
256                     if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, attribute, "*")) {
257                         deprecated.add(s);
258                     }
259                 }
260                 Set<R2<DtdType, String>> notDeprecated = new TreeSet(possible);
261                 notDeprecated.removeAll(deprecated);
262 
263                 Set<R2<DtdType, String>> occurs = CldrUtility.ifNull(ATTRIBUTES_OCCURRING.get(attribute), Collections.EMPTY_SET);
264                 Set<R2<DtdType, String>> noOccur = new TreeSet(possible);
265                 noOccur.removeAll(occurs);
266 
267                 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur
268                     final Set<R2<DtdType, String>> intersection = CldrUtility.intersect(deprecated, occurs);
269                     errors.add("Error: attribute «" + attribute
270                         + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) +
271                         " but occurs in live data: " + intersection);
272                 }
273                 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning
274                     errors.add("Warning: attribute «" + attribute
275                         + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur));
276                 }
277                 out.println(attribute
278                     + "\t" + deprecated
279                     + "\t" + occurs
280                     + "\t" + noOccur);
281             }
282             out.println("\nERRORS/WARNINGS");
283             out.println(CollectionUtilities.join(errors, "\n"));
284         }
285     }
286 
287     static class AttributeTypes {
288         Relation<String, String> elementPathToAttributes = Relation.of(new TreeMap<String, Set<String>>(),
289             TreeSet.class);
290         final PathStarrer PATH_STARRER = new PathStarrer().setSubstitutionPattern("*");
291         final Set<String> STARRED_PATHS = new TreeSet<String>();
292         XPathParts parts = new XPathParts();
293         StringBuilder elementPath = new StringBuilder();
294 
add(String path)295         public void add(String path) {
296             parts.set(path);
297             elementPath.setLength(0);
298             //DtdType type = CLDRFile.DtdType.valueOf(parts.getElement(0));
299             for (int i = 0; i < parts.size(); ++i) {
300                 String element = parts.getElement(i);
301                 elementPath.append('/').append(element);
302                 elementPathToAttributes.putAll(elementPath.toString().intern(), parts.getAttributeKeys(i));
303             }
304         }
305 
showStarred()306         public void showStarred() throws IOException {
307             PrintWriter starred = FileUtilities.openUTF8Writer(OUT_DIRECTORY, "starred" + ".txt");
308 
309             for (Entry<String, Set<String>> entry : elementPathToAttributes.keyValuesSet()) {
310                 Set<String> attributes = entry.getValue();
311                 if (attributes.size() == 0) {
312                     continue;
313                 }
314                 String path = entry.getKey();
315                 String[] elements = path.split("/");
316                 DtdType type = DtdType.valueOf(elements[1]);
317                 String finalElement = elements[elements.length - 1];
318                 starred.print(path);
319                 for (String attribute : attributes) {
320                     if (CLDRFile.isDistinguishing(type, finalElement, attribute)) {
321                         starred.print("[@" + attribute + "='disting.']");
322                     } else {
323                         starred.print("[@" + attribute + "='DATA']");
324                     }
325                 }
326                 starred.println();
327             }
328             starred.close();
329         }
330     }
331 
332     static Pattern prefix = PatternCache.get("([^/]+/[^/]+)(.*)");
333 
334     static class Delta {
335         Counter<String> newCount = new Counter<String>();
336         Counter<String> deletedCount = new Counter<String>();
337         Counter<String> changedCount = new Counter<String>();
338         Counter<String> unchangedCount = new Counter<String>();
339 
print(PrintWriter changesSummary, Set<String> prefixes)340         void print(PrintWriter changesSummary, Set<String> prefixes) {
341             changesSummary.println("Total"
342                 + "\t" + unchangedCount.getTotal()
343                 + "\t" + deletedCount.getTotal()
344                 + "\t" + changedCount.getTotal()
345                 + "\t" + newCount.getTotal());
346             changesSummary.println("Directory\tSame\tRemoved\tChanged\tAdded");
347             for (String prefix : prefixes) {
348                 changesSummary.println(prefix
349                     + "\t" + unchangedCount.get(prefix)
350                     + "\t" + deletedCount.get(prefix)
351                     + "\t" + changedCount.get(prefix)
352                     + "\t" + newCount.get(prefix));
353             }
354         }
355     }
356 
compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew, PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, Relation<String, String> path2value2)357     private static void compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew,
358         PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value,
359         Relation<String, String> path2value2) {
360         Set<String> union = Builder.with(new TreeSet<String>()).addAll(oldPath2value.keySet())
361             .addAll(path2value2.keySet()).get();
362         long total = 0;
363         Matcher prefixMatcher = prefix.matcher("");
364         Delta charCount = new Delta();
365         Delta itemCount = new Delta();
366         Counter<String> newLength = new Counter<String>();
367         Counter<String> deletedLength = new Counter<String>();
368         Counter<String> changedLength = new Counter<String>();
369         Counter<String> unchangedLength = new Counter<String>();
370         Set<String> prefixes = new TreeSet();
371         for (String path : union) {
372             if (!prefixMatcher.reset(path).find()) {
373                 throw new IllegalArgumentException();
374             }
375             String prefix = prefixMatcher.group(1);
376             prefixes.add(prefix);
377             String localPath = prefixMatcher.group(2);
378             Set<String> set1 = oldPath2value.getAll(path);
379             Set<String> set2 = path2value2.getAll(path);
380             if (set2 != null) {
381                 total += set2.size();
382             }
383             if (set1 == null) {
384                 changesNew.println(prefix + "\t" + "\t" + set2 + "\t" + localPath);
385                 itemCount.newCount.add(prefix, set2.size());
386                 charCount.newCount.add(prefix, totalLength(set2));
387             } else if (set2 == null) {
388                 changesDeletes.println(prefix + "\t" + set1 + "\t\t" + localPath);
389                 itemCount.deletedCount.add(prefix, -set1.size());
390                 charCount.deletedCount.add(prefix, -totalLength(set1));
391             } else if (!set1.equals(set2)) {
392                 TreeSet<String> set1minus2 = Builder.with(new TreeSet<String>()).addAll(set1).removeAll(set2).get();
393                 TreeSet<String> set2minus1 = Builder.with(new TreeSet<String>()).addAll(set2).removeAll(set1).get();
394                 TreeSet<String> set2and1 = Builder.with(new TreeSet<String>()).addAll(set2).retainAll(set1).get();
395                 itemCount.changedCount.add(prefix, (set2minus1.size() + set1minus2.size() + 1) / 2);
396                 itemCount.unchangedCount.add(prefix, set2and1.size());
397                 charCount.changedCount.add(prefix, (totalLength(set2minus1) + totalLength(set1minus2) + 1) / 2);
398                 charCount.unchangedCount.add(prefix, totalLength(set2and1));
399                 changes.println(prefix + "\t" + set1minus2
400                     + "\t"
401                     + set2minus1
402                     + "\t" + localPath);
403             } else {
404                 itemCount.unchangedCount.add(prefix, set2.size());
405                 charCount.unchangedCount.add(prefix, totalLength(set2));
406             }
407         }
408         itemCount.print(changesSummary, prefixes);
409         changesSummary.println();
410         charCount.print(changesSummary, prefixes);
411 //        union = Builder.with(new TreeSet<String>())
412 //            .addAll(newCount.keySet())
413 //            .addAll(deletedCount.keySet())
414 //            .addAll(changedCount.keySet())
415 //            .addAll(unchangedCount.keySet())
416 //            .get();
417         summary.println("#Total:\t" + total);
418     }
419 
totalLength(Set<String> set2)420     private static long totalLength(Set<String> set2) {
421         int result = 0;
422         for (String s : set2) {
423             result += s.length();
424         }
425         return result;
426     }
427 
428     final static Pattern LOCALE_PATTERN = PatternCache.get(
429         "([a-z]{2,3})(?:[_-]([A-Z][a-z]{3}))?(?:[_-]([a-zA-Z0-9]{2,3}))?([_-][a-zA-Z0-9]{1,8})*");
430 
doSummary()431     public static void doSummary() throws IOException {
432         Map<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>> key_release_count = new TreeMap<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>>();
433         Matcher countryLocale = LOCALE_PATTERN.matcher("");
434         List<String> releases = new ArrayList<String>();
435         Pattern releaseNumber = PatternCache.get("count_(?:.*-(\\d+(\\.\\d+)*)|trunk)\\.txt");
436         // int releaseCount = 1;
437         Relation<String, String> release_keys = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
438         Relation<String, String> localesToPaths = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
439         Set<String> writtenLanguages = new TreeSet<String>();
440         Set<String> countries = new TreeSet<String>();
441 
442         File[] listFiles = new File(OUT_DIRECTORY).listFiles();
443         // find the most recent version
444         VersionInfo mostRecentVersion = VersionInfo.getInstance(0);
445         for (File subdir : listFiles) {
446             final String name = subdir.getName();
447             final Matcher releaseMatcher = releaseNumber.matcher(name);
448             if (!releaseMatcher.matches()) {
449                 if (name.startsWith("count_")) {
450                     throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name));
451                 }
452                 continue;
453             }
454             String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++;
455             if (releaseNum == null) {
456                 releaseNum = TRUNK_VERSION;
457             }
458             VersionInfo vi = VersionInfo.getInstance(releaseNum);
459             if (vi.compareTo(mostRecentVersion) > 0) {
460                 mostRecentVersion = vi;
461             }
462         }
463 
464         for (File subdir : listFiles) {
465             final String name = subdir.getName();
466             final Matcher releaseMatcher = releaseNumber.matcher(name);
467             if (!releaseMatcher.matches()) {
468                 if (name.startsWith("count_")) {
469                     throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name));
470                 }
471                 continue;
472             }
473             String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++;
474             if (releaseNum == null) {
475                 releaseNum = TRUNK_VERSION;
476             }
477             VersionInfo vi = VersionInfo.getInstance(releaseNum);
478             boolean captureData = vi.equals(mostRecentVersion);
479             releases.add(releaseNum);
480             BufferedReader in = FileUtilities.openUTF8Reader("", subdir.getCanonicalPath());
481             while (true) {
482                 String line = in.readLine();
483                 if (line == null) break;
484                 line = line.trim();
485                 if (line.startsWith("#")) {
486                     continue;
487                 }
488                 // common/main  New:        [Yellowknife]   /gl//ldml/dates/timeZoneNames/zone[@type="America/Yellowknife"]/exemplarCity
489 
490                 String[] parts = line.split("\t");
491                 try {
492                     String file = parts[0];
493                     if (file.startsWith("seed/") || !DIR_FILE_MATCHER.reset(file).find()) {
494                         if (VERBOSE) {
495                             System.out.println("Skipping: " + RegexUtilities.showMismatch(DIR_FILE_MATCHER, file));
496                         }
497                         continue;
498                     } else if (VERBOSE) {
499                         System.out.println("Including: " + file);
500                     }
501 
502                     long valueCount = Long.parseLong(parts[1]);
503                     long valueLen = Long.parseLong(parts[2]);
504                     long attrCount = Long.parseLong(parts[3]);
505                     long attrLen = Long.parseLong(parts[4]);
506                     int lastSlash = file.lastIndexOf("/");
507                     String key2 = file;
508                     String path = file.substring(0, lastSlash);
509                     String key = file.substring(lastSlash + 1);
510                     if (countryLocale.reset(key).matches()) {
511                         String lang = countryLocale.group(1);
512                         String script = countryLocale.group(2);
513                         String country = countryLocale.group(3);
514                         String writtenLang = lang + (script == null ? "" : "_" + script);
515                         String locale = writtenLang + (country == null ? "" : "_" + country);
516                         if (captureData) {
517                             localesToPaths.put(locale, path);
518                             writtenLanguages.add(writtenLang);
519                             if (country != null) {
520                                 countries.add(country);
521                             }
522                         }
523                         // System.out.println(key + " => " + newKey);
524                         //key = writtenLang + "—" + ULocale.getDisplayName(writtenLang, "en");
525                     }
526                     if (valueCount + attrCount == 0) continue;
527                     release_keys.put(releaseNum, key2);
528                     R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count
529                         .get(key2);
530                     if (release_count == null) {
531                         release_count = Row.of(new Counter<String>(), new Counter<String>(), new Counter<String>(),
532                             new Counter<String>());
533                         key_release_count.put(key2, release_count);
534                     }
535                     release_count.get0().add(releaseNum, valueCount);
536                     release_count.get1().add(releaseNum, valueLen);
537                     release_count.get2().add(releaseNum, attrCount);
538                     release_count.get3().add(releaseNum, attrLen);
539                 } catch (Exception e) {
540                     throw new IllegalArgumentException(line, e);
541                 }
542             }
543             in.close();
544         }
545         PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "summary" +
546             ".txt");
547         for (String file : releases) {
548             summary.print("\t" + file + "\tlen");
549         }
550         summary.println();
551         for (String key : key_release_count.keySet()) {
552             summary.print(key);
553             R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count
554                 .get(key);
555             for (String release2 : releases) {
556                 long count = release_count.get0().get(release2) + release_count.get2().get(release2);
557                 long len = release_count.get1().get(release2) + release_count.get3().get(release2);
558                 summary.print("\t" + count + "\t" + len);
559             }
560             summary.println();
561         }
562         for (String release : release_keys.keySet()) {
563             System.out.println("Release:\t" + release + "\t" + release_keys.getAll(release).size());
564         }
565         summary.close();
566         PrintWriter summary2 = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "locales" +
567             ".txt");
568         summary2.println("#Languages (inc. script):\t" + writtenLanguages.size());
569         summary2.println("#Countries:\t" + countries.size());
570         summary2.println("#Locales:\t" + localesToPaths.size());
571         for (Entry<String, Set<String>> entry : localesToPaths.keyValuesSet()) {
572             summary2.println(entry.getKey() + "\t" + CollectionUtilities.join(entry.getValue(), "\t"));
573         }
574         summary2.close();
575     }
576 
577     static final Set<String> ATTRIBUTES_TO_SKIP = Builder.with(new HashSet<String>())
578         .addAll("version", "references", "standard", "draft").freeze();
579     static final Pattern skipPath = PatternCache.get("" +
580         "\\[\\@alt=\"[^\"]*proposed" +
581         "|^//" +
582         "(ldml(\\[[^/]*)?/identity" +
583         "|(ldmlBCP47|supplementalData|keyboard)(\\[[^/]*)?/(generation|version)" +
584         ")");
585 
capture(DtdType type2, XPathParts parts)586     static void capture(DtdType type2, XPathParts parts) {
587         for (int i = 0; i < parts.size(); ++i) {
588             String element = parts.getElement(i);
589             ELEMENTS_OCCURRING.put(element, type2);
590             for (String attribute : parts.getAttributes(i).keySet()) {
591                 ATTRIBUTES_OCCURRING.put(attribute, Row.of(type2, element));
592             }
593         }
594     }
595 
596     static class MyHandler extends SimpleHandler {
597         XPathParts parts = new XPathParts();
598         long valueCount;
599         long valueLen;
600         long attributeCount;
601         long attributeLen;
602         Matcher skipPathMatcher = skipPath.matcher("");
603         Splitter lines = Splitter.onPattern("\n+").omitEmptyStrings().trimResults();
604         String prefix;
605         int orderedCount;
606         DtdType type;
607         private final boolean isFinal;
608 
MyHandler(String prefix, boolean isFinal)609         MyHandler(String prefix, boolean isFinal) {
610             this.prefix = prefix;
611             this.isFinal = isFinal;
612         }
613 
614         @Override
handlePathValue(String path, String value)615         public void handlePathValue(String path, String value) {
616             if (type == null) {
617                 parts.set(path);
618                 type = DtdType.valueOf(parts.getElement(0));
619             }
620 
621             ATTRIBUTE_TYPES.add(path);
622 
623             if (skipPathMatcher.reset(path).find()) {
624                 return;
625             }
626             String pathKey = null;
627             if (doChanges) {
628                 // if (path.contains("/collations")) {
629                 // System.out.println("whoops");
630                 // }
631                 pathKey = fixKeyPath(path);
632             }
633             int len = value.length();
634             value = value.trim();
635             if (value.isEmpty() && len > 0) {
636                 value = " ";
637             }
638             if (value.length() != 0) {
639                 List<String> valueLines = lines.splitToList(value);
640                 if (valueLines.size() == 1) {
641                     valueCount++;
642                     valueLen += value.length();
643                     if (doChanges) {
644                         path2value.put(pathKey, value);
645                     }
646                 } else {
647                     int count = 0;
648                     for (String v : valueLines) {
649                         valueCount++;
650                         valueLen += v.length();
651                         if (doChanges) {
652                             path2value.put(pathKey + "/_q" + count++, v);
653                         }
654                     }
655                 }
656             }
657             parts.set(path);
658             if (isFinal) {
659                 capture(type, parts);
660             }
661             if (path.contains("[@")) {
662                 int i = parts.size() - 1; // only look at last item
663                 Collection<String> attributes = parts.getAttributeKeys(i);
664                 if (attributes.size() != 0) {
665                     String element = parts.getElement(i);
666                     for (String attribute : attributes) {
667                         if (ATTRIBUTES_TO_SKIP.contains(attribute)
668                             || CLDRFile.isDistinguishing(type, element, attribute)) {
669                             continue;
670                         }
671                         String valuePart = parts.getAttributeValue(i, attribute);
672                         // String[] valueParts = attrValue.split("\\s");
673                         // for (String valuePart : valueParts) {
674                         attributeCount++;
675                         attributeLen += valuePart.length();
676                         if (doChanges) {
677                             path2value.put(pathKey + "/_" + attribute, valuePart);
678                             // }
679                         }
680                     }
681                 }
682             }
683         }
684 
fixKeyPath(String path)685         private String fixKeyPath(String path) {
686             parts.set(path);
687             for (int i = 0; i < parts.size(); ++i) {
688                 String element = parts.getElement(i);
689                 if (!SKIP_ORDERING) {
690                     if (CLDRFile.isOrdered(element, type)) {
691                         parts.addAttribute("_q", String.valueOf(orderedCount++));
692                     }
693                 }
694             }
695             return prefix + CLDRFile.getDistinguishingXPath(parts.toString(), null, false);
696         }
697     }
698 
check(String systemID, String name, boolean isFinal)699     private MyHandler check(String systemID, String name, boolean isFinal) {
700         MyHandler myHandler = new MyHandler(name, isFinal);
701         try {
702             XMLFileReader reader = new XMLFileReader().setHandler(myHandler);
703             reader.read(systemID, XMLFileReader.CONTENT_HANDLER, true);
704         } catch (Exception e) {
705             cantRead.put(name, Arrays.asList(e.getStackTrace()));
706         }
707         return myHandler;
708 
709         // try {
710         // FileInputStream fis = new FileInputStream(systemID);
711         // XMLFileReader xmlReader = XMLFileReader.createXMLReader(true);
712         // xmlReader.setErrorHandler(new MyErrorHandler());
713         // MyHandler myHandler = new MyHandler();
714         // smlReader
715         // xmlReader.setHandler(myHandler);
716         // InputSource is = new InputSource(fis);
717         // is.setSystemId(systemID.toString());
718         // xmlReader.parse(is);
719         // fis.close();
720         // return myHandler;
721         // } catch (SAXParseException e) {
722         // System.out.println("\t" + "Can't read " + systemID);
723         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
724         // } catch (SAXException e) {
725         // System.out.println("\t" + "Can't read " + systemID);
726         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
727         // } catch (IOException e) {
728         // System.out.println("\t" + "Can't read " + systemID);
729         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
730         // }
731     }
732 
733     static class MyErrorHandler implements ErrorHandler {
error(SAXParseException exception)734         public void error(SAXParseException exception) throws SAXException {
735             System.out.println("\nerror: " + XMLFileReader.showSAX(exception));
736             throw exception;
737         }
738 
fatalError(SAXParseException exception)739         public void fatalError(SAXParseException exception) throws SAXException {
740             System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception));
741             throw exception;
742         }
743 
warning(SAXParseException exception)744         public void warning(SAXParseException exception) throws SAXException {
745             System.out.println("\nwarning: " + XMLFileReader.showSAX(exception));
746             throw exception;
747         }
748     }
749 
summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal)750     private void summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal) {
751         System.out.println(commonDir);
752         summary.println("#name" + "\t" + "value-count" + "\t" + "value-len" + "\t" + "attr-count" + "\t" + "attr-len");
753         File commonDirectory = new File(commonDir);
754         if (!commonDirectory.exists()) {
755             System.out.println("Doesn't exist:\t" + commonDirectory);
756         }
757         summarizeFiles(summary, commonDirectory, isFinal, 1);
758     }
759 
    /** Names of subdirectories that {@link #summarizeFiles} does not descend into. */
    static final Set<String> SKIP_DIRS = new HashSet<>(Arrays.asList("specs", "tools", "seed", "exemplars"));
761 
summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level)762     public void summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level) {
763         System.out.println("\t\t\t\t\t\t\t".substring(0, level) + directory);
764         int count = 0;
765         for (File file : directory.listFiles()) {
766             String filename = file.getName();
767             if (filename.startsWith(".")) {
768                 // do nothing
769             } else if (file.isDirectory()) {
770                 if (!SKIP_DIRS.contains(filename)) {
771                     summarizeFiles(summary, file, isFinal, level + 1);
772                 }
773             } else if (!filename.startsWith("#") && filename.endsWith(".xml")) {
774                 String name = new File(directory.getParent()).getName() + "/" + directory.getName() + "/"
775                     + file.getName();
776                 name = name.substring(0, name.length() - 4); // strip .xml
777                 if (!RAW_FILE_MATCHER.reset(name).find()) {
778                     continue;
779                 }
780                 if (VERBOSE) {
781                     System.out.println(name);
782                 } else {
783                     System.out.print(".");
784                     if (++count > 100) {
785                         count = 0;
786                         System.out.println();
787                     }
788                     System.out.flush();
789                 }
790                 MyHandler handler = check(file.toString(), name, isFinal);
791                 summary.println(name + "\t" + handler.valueCount + "\t" + handler.valueLen + "\t"
792                     + handler.attributeCount + "\t" + handler.attributeLen);
793             }
794         }
795         System.out.println();
796     }
797 }
798