1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.PrintWriter;
5 import java.util.HashMap;
6 import java.util.Map;
7 
8 import org.unicode.cldr.draft.FileUtilities;
9 import org.unicode.cldr.util.CLDRFile;
10 import org.unicode.cldr.util.CLDRPaths;
11 import org.unicode.cldr.util.CLDRTransforms;
12 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
13 import org.unicode.cldr.util.DtdType;
14 import org.unicode.cldr.util.Factory;
15 import org.unicode.cldr.util.LocaleIDParser;
16 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
17 import org.unicode.cldr.util.SimpleXMLSource;
18 import org.unicode.cldr.util.XMLSource;
19 
20 import com.ibm.icu.text.Normalizer;
21 import com.ibm.icu.text.Transliterator;
22 import com.ibm.icu.text.UnicodeSet;
23 import com.ibm.icu.util.ICUUncheckedIOException;
24 
25 /**
26  * Transforms the contents of a CLDRFile.
27  *
28  * @author jchye
29  */
30 public class CLDRFileTransformer {
31     /**
32      * Contains all supported locale-to-locale conversions along with information
33      * needed to convert each locale. Each enum value is named after the locale that results
34      * from the conversion.
35      */
36     enum PolicyIfExisting {
37         RETAIN, DISCARD, MINIMIZE
38     }
39 
40     public enum LocaleTransform {
41         sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), sr_Latn_BA("sr_Cyrl_BA",
42             "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml",
43                 Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD,
44                     "[:script=Cyrl:]", PolicyIfExisting.DISCARD), ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]",
45                         PolicyIfExisting.DISCARD), yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), de_CH("de",
46                             "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), yue_Hans("yue", "Simplified-Traditional.xml",
47                                 Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN),
48         // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
49         // Needs work to fix currency symbols, handle Maori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
50         ;
51 
52         private final String inputLocale;
53         private final String transformFilename;
54         private final int direction;
55         private final UnicodeSet inputChars;
56         private final PolicyIfExisting policy;
57 
58         /**
59          * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
60          */
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)61         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
62             this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
63         }
64 
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)65         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) {
66             this.inputLocale = inputLocale;
67             this.transformFilename = transformFilename;
68             this.direction = direction;
69             this.inputChars = new UnicodeSet(inputCharPattern);
70             this.policy = policy;
71         }
72 
73         /**
74          * @return the locale that used for conversion
75          */
getInputLocale()76         public String getInputLocale() {
77             return inputLocale;
78         }
79 
80         /**
81          * @return the locale that used for conversion
82          */
getOutputLocale()83         public String getOutputLocale() {
84             return this.toString();
85         }
86 
87         /**
88          * @return the filename of the transform used to make the conversion
89          */
getTransformFilename()90         public String getTransformFilename() {
91             return transformFilename;
92         }
93 
94         /**
95          * @return the direction of the transformation
96          */
getDirection()97         public int getDirection() {
98             return direction;
99         }
100 
101         /**
102          * @return the set of characters in the input locale that should have been removed after
103          *         transformation, used for internal debugging
104          */
getInputChars()105         private UnicodeSet getInputChars() {
106             return inputChars;
107         }
108     }
109 
110     private UnicodeSet unconverted = new UnicodeSet();
111     private Factory factory;
112     private Map<LocaleTransform, Transliterator> transliterators = new HashMap<LocaleTransform, Transliterator>();
113     private String transformDir;
114 
115     /**
116      * @param factory
117      *            the factory to get locale data from
118      * @param transformDir
119      *            the directory containing the transform files
120      */
CLDRFileTransformer(Factory factory, String transformDir)121     public CLDRFileTransformer(Factory factory, String transformDir) {
122         this.factory = factory;
123         this.transformDir = transformDir;
124     }
125 
loadTransliterator(LocaleTransform localeTransform)126     public Transliterator loadTransliterator(LocaleTransform localeTransform) {
127         if (transliterators.containsKey(localeTransform)) {
128             return transliterators.get(localeTransform);
129         }
130         Transliterator transliterator;
131         if (localeTransform.getTransformFilename().contains(".xml")) {
132             ParsedTransformID directionInfo = new ParsedTransformID();
133             String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo);
134             transliterator = Transliterator.createFromRules(directionInfo.getId(),
135                 ruleString, localeTransform.getDirection());
136             transliterators.put(localeTransform, transliterator);
137         } else {
138             transliterator = Transliterator.getInstance(localeTransform.getTransformFilename());
139         }
140         return transliterator;
141     }
142 
143     /**
144      * NOTE: This method does not currently handle nested transliterators.
145      *
146      * @param input
147      * @return null if the input file was missing, or if there is no new output file.
148      */
transform(LocaleTransform localeTransform)149     public CLDRFile transform(LocaleTransform localeTransform) {
150         Transliterator transliterator = loadTransliterator(localeTransform);
151         CLDRFile input;
152         try {
153             input = factory.make(localeTransform.getInputLocale(), false);
154         } catch (ICUUncheckedIOException e1) {
155             return null; // input file is missing (or otherwise unavailable)
156         }
157         boolean hadOutput = true;
158         CLDRFile output;
159         try {
160             output = factory.make(localeTransform.getOutputLocale(), false);
161         } catch (NoSourceDirectoryException e) {
162             // if we can't open the file, then just make a new one.
163             XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
164             output = new CLDRFile(dataSource);
165             hadOutput = false;
166         }
167         String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
168         CLDRFile outputParent = factory.make(outputParentString, true);
169 
170         outputParent = factory.make(localeTransform.getInputLocale(), false);
171         XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
172         for (String xpath : input) {
173             String fullPath = input.getFullXPath(xpath);
174             String value = input.getStringValue(xpath);
175             String oldValue = output.getStringValue(xpath);
176             String parentValue = outputParent.getStringValue(xpath);
177             value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
178             if (value != null) {
179                 outputSource.putValueAtPath(fullPath, value);
180             }
181         }
182         if (!outputSource.iterator().hasNext()) { // empty new output
183             if (!hadOutput) {
184                 return null; // don't add file if nothing to add
185             }
186         }
187         return new CLDRFile(outputSource);
188     }
189 
190     /**
191      * Transforms a CLDRFile value into another form.
192      * @param parentValue
193      */
transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)194     private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
195         String oldValue, String parentValue) {
196 
197         // allows us to change only new values
198         switch (localeTransform.policy) {
199         case RETAIN:
200         case MINIMIZE:
201             if (oldValue != null) {
202                 return oldValue;
203             }
204             break;
205         default:
206         }
207 
208         UnicodeSet chars = localeTransform.getInputChars();
209         String transliterated;
210 
211         // TODO: Don't transform dates/patterns.
212         // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
213         // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
214         if (path.contains("exemplarCharacters")) {
215             if (oldValue != null) {
216                 transliterated = oldValue;
217             } else {
218                 transliterated = value;
219             }
220         } else {
221             transliterated = transliterator.transliterate(value);
222             transliterated = Normalizer.compose(transliterated, false);
223         }
224         if (localeTransform.policy == PolicyIfExisting.MINIMIZE) {
225             if (transliterated.equals(value)) {
226                 return null;
227             }
228         }
229 
230         if (chars.containsSome(transliterated)) {
231             unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
232         }
233         return transliterated;
234     }
235 
main(String[] args)236     public static void main(String[] args) throws Exception {
237         for (String dir : DtdType.ldml.directories) {
238             if (dir.equals("casing") // skip, field contents are keywords, not localizable content
239                 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
240                 || dir.equals("annotationsDerived") // skip, derived later
241             ) {
242                 continue;
243             }
244             System.out.println("\nDirectory: " + dir);
245             Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + dir + "/", ".*");
246             CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
247             for (LocaleTransform localeTransform : LocaleTransform.values()) {
248                 CLDRFile output = transformer.transform(localeTransform);
249                 if (output == null) {
250                     System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml");
251                     continue;
252                 }
253                 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
254                 String outputFile = output.getLocaleID() + ".xml";
255                 PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile);
256                 System.out.println("Generating locale file: " + outputDir + outputFile);
257                 if (!transformer.unconverted.isEmpty()) {
258                     System.out.println("Untransformed characters: " + transformer.unconverted);
259                     transformer.unconverted.clear();
260                 }
261                 output.write(out);
262                 out.close();
263             }
264         }
265     }
266 }
267