1 package org.unicode.cldr.util;
2 
3 import java.util.ArrayList;
4 import java.util.Arrays;
5 import java.util.Collection;
6 import java.util.Collections;
7 import java.util.HashSet;
8 import java.util.List;
9 import java.util.NavigableSet;
10 import java.util.Objects;
11 import java.util.Set;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15 
16 import com.google.common.base.Joiner;
17 import com.google.common.collect.Iterables;
18 import com.google.common.collect.Multimap;
19 import com.google.common.collect.Multiset;
20 import com.google.common.collect.Sets;
21 import com.google.common.collect.TreeMultimap;
22 import com.google.common.collect.TreeMultiset;
23 
24 /**
 * Helper class that allows logging the use of regular expressions. A class that will summarize them will get a
 * NavigableSet of PatternCountInterface instances.
27  *
28  * @author ribnitz
29  *
30  */
31 public class RegexLogger {
32     /**
33      * Should debugging be done? - if not, a null implementation will be used
34      */
35     private static final boolean DEBUG = false;
36     /**
37      * Instance
38      */
39     private static RegexLoggerInterface instance = null;
40 
getInstance()41     public static RegexLoggerInterface getInstance() {
42         if (instance == null) {
43             if (DEBUG) {
44                 instance = new RegexLoggerImpl();
45             } else {
46                 instance = new NullRegexLogger();
47             }
48         }
49         return instance;
50     }
51 
52     private static class PatternStringWithBoolean implements Comparable<PatternStringWithBoolean> {
53         private final String pattern;
54         private final boolean calledFromRegexFinder;
55         private final int hashCode;
56 
PatternStringWithBoolean(String patternStr, boolean calledFromRegexFinder)57         public PatternStringWithBoolean(String patternStr, boolean calledFromRegexFinder) {
58             this.pattern = patternStr.trim();
59             this.calledFromRegexFinder = calledFromRegexFinder;
60             hashCode = Objects.hash(this.pattern, this.calledFromRegexFinder);
61         }
62 
63         @Override
hashCode()64         public int hashCode() {
65             return hashCode;
66         }
67 
getPattern()68         public String getPattern() {
69             return pattern;
70         }
71 
isCalledFromRegexFinder()72         public boolean isCalledFromRegexFinder() {
73             return calledFromRegexFinder;
74         }
75 
76         @Override
equals(Object obj)77         public boolean equals(Object obj) {
78             if (this == obj) {
79                 return true;
80             }
81             if (obj == null) {
82                 return false;
83             }
84             if (getClass() != obj.getClass()) {
85                 return false;
86             }
87             PatternStringWithBoolean other = (PatternStringWithBoolean) obj;
88             if (calledFromRegexFinder != other.calledFromRegexFinder) {
89                 return false;
90             }
91             if (hashCode != other.hashCode) {
92                 return false;
93             }
94             if (other.pattern != null) {
95                 return false;
96             }
97             if (!pattern.equals(other.pattern)) {
98                 return false;
99             }
100             return true;
101         }
102 
103         @Override
compareTo(PatternStringWithBoolean o)104         public int compareTo(PatternStringWithBoolean o) {
105             if (o == null) {
106                 return 1;
107             }
108             if (this == o) {
109                 return 0;
110             }
111             return pattern.compareTo(o.pattern);
112         }
113     }
114 
    /**
     * Interface used for logging the use of regular expressions and for retrieving
     * the collected per-pattern statistics.
     * @author ribnitz
     *
     */
    public static interface RegexLoggerInterface {
        /**
         * Log that the given pattern was applied on the given matchStr, whether it matched, and
         * what the type of the log was. Cls contains the calling class.
         * @param pattern the regular expression, as a string
         * @param matchStr the string the pattern was applied to
         * @param matched whether the pattern matched
         * @param type the kind of operation being logged
         * @param cls the calling class
         */
        void log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls);

        /**
         * Same as {@link #log(String, String, boolean, LogType, Class)}, with the pattern given
         * through a Matcher.
         */
        void log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls);

        /**
         * Same as {@link #log(String, String, boolean, LogType, Class)}, with the pattern given
         * as a compiled Pattern.
         */
        void log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls);

        /**
         * Same as {@link #log(String, String, boolean, LogType, Class)}, with an additional time value.
         * NOTE(review): the unit and meaning of time are not visible here; presumably the
         * duration of the regex operation - confirm against callers.
         */
        void log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls);

        /**
         * Get all the recorded entries
         * @return the set of entries
         */
        NavigableSet<PatternCountInterface> getEntries();

        /**
         * Get the entries that occurred at least minCount times. If there are no matches, an empty set is returned
         * @param minCount the minimum number of occurrences
         * @return the set of entries, possibly empty
         */
        NavigableSet<PatternCountInterface> getEntries(final int minCount);

        /**
         * @return whether this logger is enabled
         */
        boolean isEnabled();
    }
153 
154     /**
155      * Three of the methods can be delegations, which reduces the actual implementation to two methods
156      * @author ribnitz
157      *
158      */
159     private static abstract class AbstractRegexLogger implements RegexLoggerInterface {
160 
161         @Override
log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls)162         public void log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls) {
163             log(matcher.pattern(), matchStr, matched, type, cls);
164 
165         }
166 
log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls)167         public void log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls) {
168             log(pattern.pattern(), matchStr, matched, type, cls);
169         }
170 
log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls)171         public void log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls) {
172             log(pattern, matchStr, matched, 0, type, cls);
173         }
174 
175         /**
176          * Get all entries
177          */
getEntries()178         public NavigableSet<PatternCountInterface> getEntries() {
179             return getEntries(1);
180         }
181 
182         @Override
isEnabled()183         public boolean isEnabled() {
184             return DEBUG;
185         }
186 
187     }
188 
189     /**
190      * Null implementation
191      * @author ribnitz
192      *
193      */
194     private static class NullRegexLogger extends AbstractRegexLogger {
195 
196         @Override
log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)197         public void log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls) {
198             // do nothing
199         }
200 
201         @Override
getEntries(int minCount)202         public NavigableSet<PatternCountInterface> getEntries(int minCount) {
203             NavigableSet<PatternCountInterface> returned = (NavigableSet<PatternCountInterface>) Sets.newTreeSet(Collections.EMPTY_SET);
204             return returned;
205         }
206     }
207 
    /**
     * Interface used for the entries returned by the RegexLogger
     * @author ribnitz
     *
     */
    public static interface PatternCountInterface {
        /**
         * Get the pattern used
         * @return the pattern string
         */
        String getPattern();

        /**
         * Get the number of successful matches obtained through FIND
         * @return the count
         */
        int getNumberOfFindMatches();

        /**
         * Get the number of unsuccessful matches obtained through FIND
         * @return the count
         */
        int getNumberOfFindFailures();

        /**
         * Get the number of successful matches obtained through MATCH
         * @return the count
         */
        int getNumberOfMatchMatches();

        /**
         * Get the number of unsuccessful matches obtained through MATCH
         * @return the count
         */
        int getNumberOfMatchFailures();

        /**
         * Return true if this call was made from RegexFinder
         * @return the flag
         */
        boolean isCalledFromRegexFinder();

        /**
         * Get a set of all call locations
         * @return the call locations
         */
        Set<String> getCallLocations();

    }
257 
258     /**
259      * GetAll uses this class to add all the entries of a multiSet to the result set, constructing
260      * the object to return for each pattern. Objects will only be added once.
261      *
262      * This is the implementatioon that adds all items.
263      * @author ribnitz
264      *
265      */
266     private static class AddAllEntryProcessor {
267         protected final int minCount;
268         protected final CountSets c;
269         protected final Set<PatternStringWithBoolean> seen = new HashSet<>();
270         protected final NavigableSet<PatternCountInterface> result = new TreeSet<>();
271 
AddAllEntryProcessor(int minCount, CountSets c)272         public AddAllEntryProcessor(int minCount, CountSets c) {
273             this.minCount = minCount;
274             this.c = c;
275         }
276 
getResult()277         public NavigableSet<PatternCountInterface> getResult() {
278             return result;
279         }
280 
process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet)281         public void process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet) {
282             if (!seen.contains(item)) {
283                 result.add(new RegexKeyWithCount(item, c));
284                 seen.add(item);
285             }
286         }
287     }
288 
289     /**
290      * Sometimes getEntries is called with a minCount; this Class filters and only adds the
291      * items that occur at least minCount times.
292      * @author ribnitz
293      *
294      */
295     private static class EntryProcessor extends AddAllEntryProcessor {
EntryProcessor(int minCount, CountSets c)296         public EntryProcessor(int minCount, CountSets c) {
297             super(minCount, c);
298         }
299 
process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet)300         public void process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet) {
301             if (countSet.count(item) >= minCount) {
302                 super.process(item, countSet);
303             }
304         }
305     }
306 
307     /**
308      * Since all the inner classes are static, this object is used to pass around the refernces to the
309      * different sets/the state
310      *
311      * @author ribnitz
312      *
313      */
314     private static class CountSets {
315         final Multiset<PatternStringWithBoolean> matchedFindSet;
316         final Multiset<PatternStringWithBoolean> failedFindSet;
317         final Multiset<PatternStringWithBoolean> matchedMatchSet;
318         final Multiset<PatternStringWithBoolean> failedMatchSet;
319         final Multimap<PatternStringWithBoolean, String> stacktraces;
320 
CountSets(Multiset<PatternStringWithBoolean> matchedFindSet, Multiset<PatternStringWithBoolean> failedFindSet, Multiset<PatternStringWithBoolean> matchedMatchSet, Multiset<PatternStringWithBoolean> failedMatchSet, Multimap<PatternStringWithBoolean, String> occurrences)321         public CountSets(Multiset<PatternStringWithBoolean> matchedFindSet, Multiset<PatternStringWithBoolean> failedFindSet,
322             Multiset<PatternStringWithBoolean> matchedMatchSet, Multiset<PatternStringWithBoolean> failedMatchSet,
323             Multimap<PatternStringWithBoolean, String> occurrences) {
324             this.failedFindSet = failedFindSet;
325             this.failedMatchSet = failedMatchSet;
326             this.matchedMatchSet = matchedMatchSet;
327             this.stacktraces = occurrences;
328             this.matchedFindSet = matchedFindSet;
329         }
330     }
331 
332     private static class RegexKeyWithCount implements PatternCountInterface, Comparable<PatternCountInterface> {
333         private final String pattern;
334         private final int findMatchCount;
335         private final int findFailCount;
336         private final int matchMatchCount;
337         private final int matchFailCount;
338         private final boolean calledFromRegexFinder;
339         private final Set<String> callLocations = new HashSet<>();
340         private final int hashCode;
341 
RegexKeyWithCount(PatternStringWithBoolean key, CountSets bean)342         public RegexKeyWithCount(PatternStringWithBoolean key, CountSets bean) {
343             this.pattern = key.getPattern();
344             this.calledFromRegexFinder = key.isCalledFromRegexFinder();
345             this.findMatchCount = bean.matchedFindSet.count(key);
346             this.findFailCount = bean.failedFindSet.count(key);
347             this.matchMatchCount = bean.matchedMatchSet.count(key);
348             this.matchFailCount = bean.failedMatchSet.count(key);
349             Collection<String> tmp = bean.stacktraces.get(key);
350             for (String cur : tmp) {
351                 if (!callLocations.contains(cur)) {
352                     callLocations.add(cur);
353                 }
354             }
355             this.hashCode = Objects.hash(this.pattern,
356                 this.findMatchCount,
357                 this.findFailCount,
358                 this.matchFailCount,
359                 this.matchMatchCount,
360                 this.calledFromRegexFinder,
361                 this.callLocations);
362         }
363 
getPattern()364         public String getPattern() {
365             return pattern;
366         }
367 
368         @Override
hashCode()369         public int hashCode() {
370             return hashCode;
371         }
372 
373         @Override
getNumberOfFindMatches()374         public int getNumberOfFindMatches() {
375             return findMatchCount;
376         }
377 
378         @Override
getNumberOfFindFailures()379         public int getNumberOfFindFailures() {
380             return findFailCount;
381         }
382 
383         @Override
getNumberOfMatchMatches()384         public int getNumberOfMatchMatches() {
385             return matchMatchCount;
386         }
387 
388         @Override
getNumberOfMatchFailures()389         public int getNumberOfMatchFailures() {
390             return matchFailCount;
391         }
392 
393         @Override
equals(Object obj)394         public boolean equals(Object obj) {
395             if (this == obj) {
396                 return true;
397             }
398             if (obj == null) {
399                 return false;
400             }
401             if (hashCode != obj.hashCode()) {
402                 return false;
403             }
404             if (getClass() != obj.getClass()) {
405                 return false;
406             }
407             RegexKeyWithCount other = (RegexKeyWithCount) obj;
408             if (matchFailCount != other.matchFailCount) {
409                 return false;
410             }
411             if (matchMatchCount != other.matchMatchCount) {
412                 return false;
413             }
414             if (findFailCount != other.findFailCount) {
415                 return false;
416             }
417             if (findMatchCount != other.findMatchCount) {
418                 return false;
419             }
420             if (!pattern.equals(other.pattern)) {
421                 return false;
422             }
423             if (calledFromRegexFinder != other.calledFromRegexFinder) {
424                 return false;
425             }
426             if (callLocations != other.callLocations) {
427                 return false;
428             }
429             return true;
430         }
431 
432         @Override
compareTo(PatternCountInterface o)433         public int compareTo(PatternCountInterface o) {
434             if (o == null) {
435                 return 1;
436             }
437             return new Integer(matchFailCount + matchMatchCount + findFailCount + findMatchCount).compareTo(
438                 o.getNumberOfFindFailures() + o.getNumberOfFindMatches() + o.getNumberOfMatchFailures() + o.getNumberOfMatchMatches());
439         }
440 
441         @Override
isCalledFromRegexFinder()442         public boolean isCalledFromRegexFinder() {
443             return calledFromRegexFinder;
444         }
445 
446         @Override
getCallLocations()447         public Set<String> getCallLocations() {
448             return callLocations;
449         }
450 
451     }
452 
    /**
     * The kind of regex operation a log call refers to; the logger keeps separate
     * success and failure counts for each kind.
     */
    public enum LogType {
        FIND, MATCH
    }
456 
    /**
     * Converts a sequence of elements of type E into a sequence of elements of type F.
     *
     * @param <E> element type of the input
     * @param <F> element type of the output
     */
    private static interface IterableTransformer<E, F> {
        /**
         * @param input the elements to transform
         * @return the transformed elements
         */
        Iterable<F> transform(Iterable<E> input);
    }
460 
461     private static class StringIterableTransformer implements IterableTransformer<String, String> {
462 
463         @Override
transform(Iterable<String> input)464         public Iterable<String> transform(Iterable<String> input) {
465             List<String> returned = new ArrayList<>(Iterables.size(input));
466             String lastClass = null;
467             for (String current : input) {
468                 String transformed = current;
469                 if (lastClass != null) {
470                     if (lastClass.startsWith("RegexLookup") && !current.startsWith("org.unicode.cldr.util.RegexLookup")) {
471                         returned.add(lastClass);
472                     }
473                     break;
474                 }
475                 if (current.startsWith("org.unicode.cldr.test.CheckCLDR") &&
476                     !lastClass.startsWith("org.unicode.cldr.test.CheckCLDR")) {
477                     lastClass = current;
478                     // leave out
479                     continue;
480                 }
481                 // remove org.unicode.cldr
482                 if (current.startsWith("org.unicode.cldr.util.")) {
483                     transformed = current.substring("org.unicode.cldr.util.".length());
484                 }
485                 // only the last RegexLookup will be added
486                 if (!transformed.startsWith("RegexLookup")) {
487                     returned.add(transformed);
488                 }
489                 lastClass = transformed;
490             }
491             return returned;
492         }
493     }
494 
495     private static class ClassnameOnlyStringTransformer implements IterableTransformer<String, String> {
496 
497         @Override
transform(Iterable<String> input)498         public Iterable<String> transform(Iterable<String> input) {
499             List<String> returned = new ArrayList<>(Iterables.size(input));
500             String lastClass = null;
501             for (String current : input) {
502                 if (current.lastIndexOf(".") > 0) {
503                     current = current.substring(current.lastIndexOf("."));
504                 }
505                 if (lastClass != null) {
506                     if (lastClass.startsWith("RegexLookup") && !current.startsWith("RegexLookup")) {
507                         returned.add(lastClass);
508                     }
509                     if (lastClass.startsWith("VettingViewer")) {
510                         break;
511                     }
512                     if (current.startsWith("CheckCLDR") && !lastClass.startsWith("CheckCLDR")) {
513                         lastClass = current;
514                         // leave out
515                         continue;
516                     }
517                 }
518                 // only the last RegexLookup will be added
519                 if (!current.startsWith("RegexLookup")) {
520                     returned.add(current);
521                 }
522                 lastClass = current;
523             }
524             return returned;
525         }
526     }
527 
528     /**
529      * This is the class doing the bulk of the work.
530      * @author ribnitz
531      */
532     private static class RegexLoggerImpl extends AbstractRegexLogger {
533 
534         /*
535          * Each has more than 1m hits, together they account for about 14m (of the 26m total)
536          */
537         private static final Set<String> exactMatchSet = new HashSet<>(Arrays.asList(new String[] {
538             "^//ldml.*",
539             "^//ldml/dates.*",
540             "^//ldml/units.*",
541             "^//ldml/characters/ellipsis[@type=\"(final|initial|medial)\"]",
542             "^//ldml/characters.*",
543             "^//ldml/listPatterns/listPattern.*",
544             "^//ldml/units/unitLength[@type=\"(long|short|narrow)\"].*",
545         }));
546         private static final Set<String> patternSet = new HashSet<>(Arrays.asList(new String[] {
547             "^//ldml/dates/fields",
548             "^//ldml/dates/calendars/calendar",
549             "/(availableFormats",
550         }));
551         private final Multiset<PatternStringWithBoolean> matchedFindSet = TreeMultiset.create();
552         private final Multiset<PatternStringWithBoolean> failedFindSet = TreeMultiset.create();
553         private final Multiset<PatternStringWithBoolean> matchedMatchSet = TreeMultiset.create();
554         private final Multiset<PatternStringWithBoolean> failedMatchSet = TreeMultiset.create();
555 
556         private final Multimap<PatternStringWithBoolean, String> occurrences = TreeMultimap.create();
557         private final IterableTransformer<String, String> transformer = new StringIterableTransformer();
558 
log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)559         public void log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls) {
560             boolean isRegexFinder = findClassName("org.unicode.cldr.util.RegexLookup", 10);
561             PatternStringWithBoolean key = new PatternStringWithBoolean(pattern, isRegexFinder);
562             Collection<PatternStringWithBoolean> collectionToAdd = determineCollectionToUse(matched, type);
563             if (collectionToAdd != null) {
564                 collectionToAdd.add(key);
565             }
566             if (shouldLogPattern(pattern, isRegexFinder)) {
567                 addElementToList(key);
568             }
569         }
570 
determineCollectionToUse(boolean matched, LogType type)571         private Collection<PatternStringWithBoolean> determineCollectionToUse(boolean matched, LogType type) {
572             Collection<PatternStringWithBoolean> collectionToAdd = null;
573             switch (type) {
574             case FIND:
575                 if (matched) {
576                     collectionToAdd = matchedFindSet;
577                 } else {
578                     collectionToAdd = failedFindSet;
579                 }
580                 break;
581             case MATCH:
582                 if (matched) {
583                     collectionToAdd = matchedMatchSet;
584                 } else {
585                     collectionToAdd = failedMatchSet;
586                 }
587                 break;
588             }
589             return collectionToAdd;
590         }
591 
shouldLogPattern(String pattern, boolean isRegexFinder)592         private boolean shouldLogPattern(String pattern, boolean isRegexFinder) {
593             if (!isRegexFinder) {
594                 return true;
595             } else {
596                 if (exactMatchSet.contains(pattern)) {
597                     return true;
598                 } else {
599                     for (String cur : patternSet) {
600                         if (pattern.startsWith(cur)) {
601                             return true;
602                         }
603                     }
604                 }
605             }
606             return false;
607         }
608 
findClassName(String className, int depth)609         private boolean findClassName(String className, int depth) {
610             StackTraceElement[] st = Thread.currentThread().getStackTrace();
611             int startPos = (st.length > 2) ? 2 : 0;
612             int endPos = (startPos + depth > st.length) ? st.length : startPos + depth;
613             for (int i = startPos; i < endPos; i++) {
614                 StackTraceElement cur = st[i];
615                 String curClass = cur.getClassName();
616                 if (curClass.startsWith(className)) {
617                     return true;
618                 }
619             }
620             return false;
621         }
622 
623         private final static Joiner JOINER = Joiner.on(";");
624 
addElementToList(PatternStringWithBoolean key)625         private void addElementToList(PatternStringWithBoolean key) {
626             List<String> stList = processStackTrace("org.unicode.cldr.util.RegexLookup", 0);
627 
628             if (!stList.isEmpty()) {
629                 occurrences.put(key, JOINER.join(transformer.transform(stList)));
630             }
631         }
632 
processStackTrace(String classNameToStartAt, int depth)633         private List<String> processStackTrace(String classNameToStartAt, int depth) {
634             StackTraceElement[] st = Thread.currentThread().getStackTrace();
635             if (depth == 0) {
636                 depth = st.length;
637             }
638             int startPos;
639             if (depth < 0) {
640                 startPos = depth + st.length;
641                 depth = Math.abs(depth);
642             } else {
643                 startPos = (st.length > 2) ? 2 : 0;
644             }
645             int pos;
646             boolean found = false;
647             for (pos = startPos; pos < st.length; pos++) {
648                 if (st[pos].getClassName().startsWith(classNameToStartAt)) {
649                     found = true;
650                     break;
651                 }
652             }
653             if (!found) {
654                 return Collections.emptyList();
655             }
656             int endPos = (pos + depth > st.length) ? st.length : startPos + depth;
657             List<String> ret = new ArrayList<>(depth + 2);
658             for (int i = pos; i < endPos; i++) {
659                 StackTraceElement cur = st[i];
660                 String curClass = cur.getClassName();
661                 ret.add(curClass + ":" + cur.getLineNumber());
662             }
663             return ret;
664         }
665 
getEntries(final int minCount)666         public NavigableSet<PatternCountInterface> getEntries(final int minCount) {
667             CountSets c = new CountSets(matchedFindSet, failedFindSet, matchedMatchSet, failedMatchSet, occurrences);
668             final AddAllEntryProcessor processor = (minCount == 1) ? new AddAllEntryProcessor(minCount, c) : new EntryProcessor(minCount, c);
669             for (PatternStringWithBoolean item : matchedFindSet) {
670                 processor.process(item, matchedFindSet);
671             }
672             for (PatternStringWithBoolean item : failedFindSet) {
673                 processor.process(item, failedFindSet);
674             }
675             for (PatternStringWithBoolean item : matchedMatchSet) {
676                 processor.process(item, matchedMatchSet);
677             }
678             for (PatternStringWithBoolean item : failedMatchSet) {
679                 processor.process(item, failedMatchSet);
680             }
681             return Sets.unmodifiableNavigableSet(processor.getResult());
682         }
683     }
684 }
685