1 package org.unicode.cldr.tool;
2 
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.DraftStatus;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CoverageInfo;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.Organization;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexFileParser;
import org.unicode.cldr.util.RegexFileParser.RegexLineParser;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.XMLSource;
import org.unicode.cldr.util.XPathParts;

import com.ibm.icu.util.Output;
34 
35 /**
36  * Factory for filtering CLDRFiles by organization and replacing certain values.
37  * Organization coverage data is in org/unicode/cldr/util/data/Locales.txt.
38  *
39  * @author jchye
40  */
41 public class FilterFactory extends Factory {
42     /**
43      * Types of data modification supported.
44      */
45     private enum ModificationType {
46         xpath, value;
47     }
48 
49     private Factory rawFactory;
50     private String organization;
51     private SupplementalDataInfo supplementalData;
52     private boolean modifyValues;
53 
54     private List<Modifier> modifiers = new ArrayList<Modifier>();
55 
56     /**
57      * Creates a new Factory for filtering CLDRFiles.
58      *
59      * @param rawFactory
60      *            the factory to be filtered
61      * @param organization
62      *            the organization that the filtering is catered towards
63      * @param modifyValues
64      *            true if certain values in the data should be modified or replaced
65      */
FilterFactory(Factory rawFactory, String organization, boolean modifyValues)66     private FilterFactory(Factory rawFactory, String organization, boolean modifyValues) {
67         this.rawFactory = rawFactory;
68         this.organization = organization;
69         supplementalData = SupplementalDataInfo.getInstance();
70         setSupplementalDirectory(rawFactory.getSupplementalDirectory());
71         this.modifyValues = modifyValues;
72     }
73 
load(Factory rawFactory, String organization, boolean usesAltValue)74     public static FilterFactory load(Factory rawFactory, String organization, boolean usesAltValue) {
75         FilterFactory filterFactory = new FilterFactory(rawFactory, organization, usesAltValue);
76         filterFactory.loadModifiers("dataModifiers.txt");
77         return filterFactory;
78     }
79 
80     @Override
getSourceDirectories()81     public File[] getSourceDirectories() {
82         return rawFactory.getSourceDirectories();
83     }
84 
85     @Override
getSourceDirectoriesForLocale(String localeID)86     public List<File> getSourceDirectoriesForLocale(String localeID) {
87         return rawFactory.getSourceDirectoriesForLocale(localeID);
88     }
89 
90     @Override
handleMake(String localeID, boolean resolved, DraftStatus minimalDraftStatus)91     protected CLDRFile handleMake(String localeID, boolean resolved, DraftStatus minimalDraftStatus) {
92         if (resolved) {
93             return new CLDRFile(makeResolvingSource(localeID, minimalDraftStatus));
94         } else {
95             return filterCldrFile(localeID, minimalDraftStatus);
96         }
97     }
98 
99     /**
100      * @return a filtered CLDRFile.
101      */
filterCldrFile(String localeID, DraftStatus minimalDraftStatus)102     private CLDRFile filterCldrFile(String localeID, DraftStatus minimalDraftStatus) {
103         CLDRFile rawFile = rawFactory.make(localeID, false, minimalDraftStatus).cloneAsThawed();
104 
105         filterAltValues(rawFile);
106         filterCoverage(rawFile);
107         removeRedundantPaths(rawFile);
108         return rawFile;
109     }
110 
111     /**
112      * Replaces the value for certain XPaths with their alternate value.
113      *
114      * @param rawFile
115      */
filterAltValues(CLDRFile rawFile)116     private void filterAltValues(CLDRFile rawFile) {
117         if (!modifyValues) return;
118 
119         for (Modifier modifier : modifiers) {
120             modifier = modifier.filterLocale(rawFile.getLocaleID());
121             if (!modifier.isEmpty()) {
122                 modifier.modifyFile(rawFile);
123             }
124         }
125     }
126 
127     /**
128      * Filters a CLDRFile according to the specified organization's coverage level.
129      *
130      * @param rawFile
131      */
filterCoverage(CLDRFile rawFile)132     private void filterCoverage(CLDRFile rawFile) {
133         if (organization == null) return;
134 
135         int minLevel = StandardCodes.make()
136             .getLocaleCoverageLevel(organization, rawFile.getLocaleID())
137             .getLevel();
138         CoverageInfo covInfo = CLDRConfig.getInstance().getCoverageInfo();
139         for (String xpath : rawFile) {
140             // Locale metadata shouldn't be stripped.
141             int level = covInfo.getCoverageValue(xpath, rawFile.getLocaleID());
142             if (level > minLevel) {
143                 rawFile.remove(xpath);
144             }
145         }
146     }
147 
148     /**
149      * Removes paths with duplicate values that can be found elsewhere in the file.
150      * @param rawFile
151      */
removeRedundantPaths(CLDRFile rawFile)152     private void removeRedundantPaths(CLDRFile rawFile) {
153         if (organization == null || rawFile.getLocaleID().equals("root")) return;
154 
155         String parent = LocaleIDParser.getParent(rawFile.getLocaleID());
156         CLDRFile resolvedParent = rawFactory.make(parent, true);
157         List<String> duplicatePaths = new ArrayList<String>();
158         XPathParts parts = new XPathParts();
159         for (String xpath : rawFile) {
160             if (xpath.startsWith("//ldml/identity")) continue;
161             String value = rawFile.getStringValue(xpath);
162             // Remove count="x" if the value is equivalent to count="other".
163             if (xpath.contains("[@count=")) {
164                 parts.set(xpath);
165                 String count = parts.getAttributeValue(-1, "count");
166                 if (!count.equals("other")) {
167                     parts.setAttribute(-1, "count", "other");
168                     String otherPath = parts.toString();
169                     if (value.equals(rawFile.getStringValue(otherPath))) {
170                         duplicatePaths.add(xpath);
171                         continue;
172                     }
173                 }
174             }
175             // Remove xpaths with values also found in the parent.
176             String sourceLocale = resolvedParent.getSourceLocaleID(xpath, null);
177             if (!sourceLocale.equals(XMLSource.CODE_FALLBACK_ID)) {
178                 String parentValue = resolvedParent.getStringValue(xpath);
179                 if (value.equals(parentValue)) {
180                     duplicatePaths.add(xpath);
181                 }
182             }
183         }
184         for (String xpath : duplicatePaths) {
185             rawFile.remove(xpath);
186         }
187     }
188 
189     @Override
getMinimalDraftStatus()190     public DraftStatus getMinimalDraftStatus() {
191         return rawFactory.getMinimalDraftStatus();
192     }
193 
194     @Override
handleGetAvailable()195     protected Set<String> handleGetAvailable() {
196         return rawFactory.getAvailable();
197     }
198 
199     /**
200      * Wrapper class for holding information about a value modification entry.
201      */
202     private class ModifierEntry {
203         String oldValue;
204         String newValue;
205         Map<String, String> options;
206 
ModifierEntry(String oldValue, String newValue, Map<String, String> options)207         public ModifierEntry(String oldValue, String newValue, Map<String, String> options) {
208             this.oldValue = oldValue;
209             this.newValue = newValue;
210             this.options = options;
211         }
212 
213         /**
214          * @param locale
215          *            the locale to be matched
216          * @return true if the locale matches the locale filter in this entry.
217          */
localeMatches(String locale)218         public boolean localeMatches(String locale) {
219             String pattern = options.get("locale");
220             return pattern == null ? true : locale.matches(pattern);
221         }
222     }
223 
224     /**
225      * Class for performing a specific type of data modification on a CLDRFile.
226      */
227     private abstract class Modifier {
228         protected List<ModifierEntry> entries = new ArrayList<ModifierEntry>();
229 
modifyFile(CLDRFile file)230         public abstract void modifyFile(CLDRFile file);
231 
filterLocale(String locale)232         public abstract Modifier filterLocale(String locale);
233 
234         /**
235          * @return the list of modifiers meant for the specified locale.
236          */
getModifiersForLocale(String locale)237         protected List<ModifierEntry> getModifiersForLocale(String locale) {
238             List<ModifierEntry> newFilters = new ArrayList<ModifierEntry>();
239             for (ModifierEntry filter : entries) {
240                 if (filter.localeMatches(locale)) {
241                     newFilters.add(filter);
242                 }
243             }
244             return newFilters;
245         }
246 
247         /**
248          *
249          * @param filter
250          */
addModifierEntry(ModifierEntry entry)251         public void addModifierEntry(ModifierEntry entry) {
252             entries.add(entry);
253         }
254 
isEmpty()255         public boolean isEmpty() {
256             return entries.size() == 0;
257         }
258     }
259 
260     /**
261      * Maps the value of an XPath onto another XPath.
262      */
263     private class PathModifier extends Modifier {
264         @Override
modifyFile(CLDRFile file)265         public void modifyFile(CLDRFile file) {
266             // For certain alternate values, use them as the main values.
267             for (ModifierEntry entry : entries) {
268                 String oldPath = entry.oldValue;
269                 String value = file.getStringValue(oldPath);
270                 if (value != null) {
271                     String newPath = entry.newValue;
272                     file.add(newPath, value);
273                     file.remove(oldPath);
274                 }
275             }
276         }
277 
278         @Override
filterLocale(String locale)279         public Modifier filterLocale(String locale) {
280             PathModifier newModifier = new PathModifier();
281             newModifier.entries = getModifiersForLocale(locale);
282             return newModifier;
283         }
284     }
285 
286     /**
287      * Replaces certain values with other values.
288      */
289     private class ValueModifier extends Modifier {
290         @Override
modifyFile(CLDRFile file)291         public void modifyFile(CLDRFile file) {
292             // Replace values.
293             for (ModifierEntry entry : entries) {
294                 String filteringPath = entry.options.get("xpath");
295                 if (filteringPath != null && isValidXPath(filteringPath)) {
296                     // For non-regex XPaths, look them up directly.
297                     String value = file.getStringValue(filteringPath);
298                     if (value != null) {
299                         value = value.replaceAll(entry.oldValue, entry.newValue);
300                         file.add(filteringPath, value);
301                     }
302                 } else {
303                     Iterator<String> iterator = file.iterator();
304                     if (filteringPath != null) {
305                         Matcher matcher = PatternCache.get(filteringPath).matcher("");
306                         iterator = file.iterator(matcher);
307                     }
308                     while (iterator.hasNext()) {
309                         String xpath = iterator.next();
310                         String originalValue = file.getStringValue(xpath);
311                         String value = originalValue.replaceAll(entry.oldValue, entry.newValue);
312                         if (!value.equals(originalValue)) {
313                             file.add(xpath, value);
314                         }
315                     }
316                 }
317             }
318         }
319 
320         @Override
filterLocale(String locale)321         public Modifier filterLocale(String locale) {
322             ValueModifier newModifier = new ValueModifier();
323             newModifier.entries = getModifiersForLocale(locale);
324             return newModifier;
325         }
326     }
327 
328     /**
329      * Maps the value of XPaths onto other XPaths using regexes.
330      */
331     private class PathRegexModifier extends Modifier {
332         private RegexLookup<String> xpathLookup = new RegexLookup<String>();
333 
334         @Override
addModifierEntry(ModifierEntry entry)335         public void addModifierEntry(ModifierEntry entry) {
336             super.addModifierEntry(entry);
337             xpathLookup.add(entry.oldValue, entry.newValue);
338         }
339 
340         @Override
modifyFile(CLDRFile file)341         public void modifyFile(CLDRFile file) {
342             if (xpathLookup.size() > 0) {
343                 Output<String[]> arguments = new Output<String[]>();
344                 for (String xpath : file) {
345                     String newValue = xpathLookup.get(xpath, null, arguments, null, null);
346                     if (newValue != null) {
347                         String newPath = RegexLookup.replace(newValue, arguments.value);
348                         String value = file.getStringValue(xpath);
349                         file.add(newPath, value);
350                         file.remove(xpath);
351                     }
352                 }
353             }
354         }
355 
356         @Override
filterLocale(String locale)357         public Modifier filterLocale(String locale) {
358             PathRegexModifier newModifier = new PathRegexModifier();
359             newModifier.entries = getModifiersForLocale(locale);
360             for (ModifierEntry entry : newModifier.entries) {
361                 newModifier.xpathLookup.add(entry.oldValue, entry.newValue);
362             }
363             return newModifier;
364         }
365     }
366 
367     /**
368      * Loads modifiers from a specified file.
369      */
loadModifiers(String filename)370     private void loadModifiers(String filename) {
371         if (!modifyValues) return;
372         final Modifier pathModifier = new PathModifier();
373         final Modifier pathRegexModifier = new PathRegexModifier();
374         final Modifier valueModifier = new ValueModifier();
375         RegexFileParser fileParser = new RegexFileParser();
376         fileParser.setLineParser(new RegexLineParser() {
377             @Override
378             public void parse(String line) {
379                 String[] contents = line.split("\\s*+;\\s*+");
380                 ModificationType filterType = ModificationType.valueOf(contents[0]);
381                 String oldValue = contents[1];
382                 String newValue = contents[2];
383                 // Process remaining options.
384                 Map<String, String> options = new HashMap<String, String>();
385                 for (int i = 3; i < contents.length; i++) {
386                     String rawLine = contents[i];
387                     int pos = rawLine.indexOf('=');
388                     if (pos < 0) {
389                         throw new IllegalArgumentException("Invalid option: " + rawLine);
390                     }
391                     String optionType = rawLine.substring(0, pos).trim();
392                     options.put(optionType, rawLine.substring(pos + 1).trim());
393                 }
394 
395                 switch (filterType) {
396                 case xpath:
397                     if (isValidXPath(oldValue)) {
398                         pathModifier.addModifierEntry(new ModifierEntry(oldValue, newValue, options));
399                     } else {
400                         pathRegexModifier.addModifierEntry(new ModifierEntry(fixXPathRegex(oldValue),
401                             newValue, options));
402                     }
403                     break;
404                 case value:
405                     String xpath = options.get("xpath");
406                     if (xpath != null && !isValidXPath(xpath)) {
407                         options.put("xpath", fixXPathRegex(xpath));
408                     }
409                     valueModifier.addModifierEntry(new ModifierEntry(oldValue, newValue, options));
410                     break;
411                 }
412             }
413         });
414         fileParser.parse(FilterFactory.class, filename);
415         modifiers.add(pathModifier);
416         modifiers.add(pathRegexModifier);
417         modifiers.add(valueModifier);
418     }
419 
420     private Pattern XPATH_PATTERN = PatternCache.get("/(/\\w++(\\[@\\w++=\"[^\"()%\\\\]+\"])*)++");
421 
422     /**
423      * @param path
424      * @return true if path is a valid XPath and not a regex.
425      */
isValidXPath(String path)426     private boolean isValidXPath(String path) {
427         return XPATH_PATTERN.matcher(path).matches();
428     }
429 
430     /**
431      * Converts an xpath into a proper regex pattern.
432      *
433      * @param path
434      * @return
435      */
fixXPathRegex(String path)436     private String fixXPathRegex(String path) {
437         return '^' + path.replace("[@", "\\[@");
438     }
439 
440     private static final Options options = new Options(
441         "Filters CLDR XML files according to orgnizational coverage levels and an " +
442             "input file of replacement values/xpaths.")
443                 //        .add("org", 'o', ".*", "google", "The organization that the filtering is for. If set, also removes duplicate paths.")
444                 .add("org", 'o', ".*", Organization.cldr.name(), "The organization that the filtering is for. If set, also removes duplicate paths.")
445                 .add("locales", 'l', ".*", ".*", "A regular expression indicating the locales to be filtered");
446 
447     /**
448      * Run FilterFactory for a specific organization.
449      *
450      * @param args
451      * @throws Exception
452      */
main(String[] args)453     public static void main(String[] args) throws Exception {
454         options.parse(args, true);
455         Factory rawFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, options.get("locales").getValue());
456         String org = options.get("org").getValue();
457         FilterFactory filterFactory = FilterFactory.load(rawFactory, org, true);
458         String outputDir = CLDRPaths.GEN_DIRECTORY + "/filter";
459         for (String locale : rawFactory.getAvailable()) {
460             try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, locale + ".xml");) {
461                 filterFactory.make(locale, false).write(out);
462             }
463 //            out.close();
464         }
465     }
466 }
467