1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.PrintWriter;
5 import java.util.Map;
6 import java.util.concurrent.ConcurrentHashMap;
7 
8 import org.unicode.cldr.draft.FileUtilities;
9 import org.unicode.cldr.util.CLDRFile;
10 import org.unicode.cldr.util.CLDRPaths;
11 import org.unicode.cldr.util.CLDRTransforms;
12 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
13 import org.unicode.cldr.util.CldrUtility;
14 import org.unicode.cldr.util.DtdType;
15 import org.unicode.cldr.util.Factory;
16 import org.unicode.cldr.util.LocaleIDParser;
17 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
18 import org.unicode.cldr.util.SimpleXMLSource;
19 import org.unicode.cldr.util.XMLSource;
20 
21 import com.ibm.icu.text.Normalizer;
22 import com.ibm.icu.text.Transliterator;
23 import com.ibm.icu.text.UnicodeSet;
24 import com.ibm.icu.util.ICUUncheckedIOException;
25 
26 /**
27  * Transforms the contents of a CLDRFile.
28  *
29  * @author jchye
30  */
31 public class CLDRFileTransformer {
32     public enum PolicyIfExisting {
33         RETAIN,  // Do not transliterate if existing output has locale content
34         DISCARD, // Replace existing output locale content
35         MINIMIZE // RETAIN, plus drop values if translit is a no-op.
36     }
37 
38     /**
39      * Contains all supported locale-to-locale conversions along with information
40      * needed to convert each locale. Each enum value is named after the locale that results
41      * from the conversion.
42      */
43     public enum LocaleTransform {
44         sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
45         sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
46         sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
47         sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
48         ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), //
49         yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), //
50         de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
51         yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), //
52         // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
53         // Needs work to fix currency symbols, handle Maori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
54         ;
55 
56         private final String inputLocale;
57         private final String transformFilename;
58         private final int direction;
59         private final UnicodeSet inputChars;
60         private final PolicyIfExisting policy;
61 
62         /**
63          * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
64          */
65         @Deprecated
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)66         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
67             this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
68         }
69 
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)70         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) {
71             this.inputLocale = inputLocale;
72             this.transformFilename = transformFilename;
73             this.direction = direction;
74             this.inputChars = new UnicodeSet(inputCharPattern);
75             this.policy = policy;
76         }
77 
78         /**
79          * @return the policy for existing content
80          */
getPolicyIfExisting()81         public PolicyIfExisting getPolicyIfExisting() {
82             return policy;
83         }
84 
85         /**
86          * @return the locale that used for conversion
87          */
getInputLocale()88         public String getInputLocale() {
89             return inputLocale;
90         }
91 
92         /**
93          * @return the locale that used for conversion
94          */
getOutputLocale()95         public String getOutputLocale() {
96             return this.toString();
97         }
98 
99         /**
100          * @return the filename of the transform used to make the conversion
101          */
getTransformFilename()102         public String getTransformFilename() {
103             return transformFilename;
104         }
105 
106         /**
107          * @return the direction of the transformation
108          */
getDirection()109         public int getDirection() {
110             return direction;
111         }
112 
113         /**
114          * @return the set of characters in the input locale that should have been removed after
115          *         transformation, used for internal debugging
116          */
getInputChars()117         private UnicodeSet getInputChars() {
118             return inputChars;
119         }
120     }
121 
122     private UnicodeSet unconverted = new UnicodeSet();
123     private Factory factory;
124     /*
125      * The transliterators map exists, and is static, to avoid wasting a lot of time creating
126      * a new Transliterator more often than necessary. (An alternative to "static" here might be to
127      * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
128      * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
129      * Reference: https://unicode.org/cldr/trac/ticket/11657
130      */
131     private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>();
132     private String transformDir;
133 
134     /**
135      * @param factory
136      *            the factory to get locale data from
137      * @param transformDir
138      *            the directory containing the transform files
139      */
CLDRFileTransformer(Factory factory, String transformDir)140     public CLDRFileTransformer(Factory factory, String transformDir) {
141         this.factory = factory;
142         this.transformDir = transformDir;
143     }
144 
loadTransliterator(LocaleTransform localeTransform)145     public Transliterator loadTransliterator(LocaleTransform localeTransform) {
146         if (transliterators.containsKey(localeTransform)) {
147             return transliterators.get(localeTransform);
148         }
149         Transliterator transliterator;
150         if (localeTransform.getTransformFilename().contains(".xml")) {
151             ParsedTransformID directionInfo = new ParsedTransformID();
152             String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo);
153             transliterator = Transliterator.createFromRules(directionInfo.getId(),
154                 ruleString, localeTransform.getDirection());
155         } else {
156             transliterator = Transliterator.getInstance(localeTransform.getTransformFilename());
157         }
158         transliterators.put(localeTransform, transliterator);
159         return transliterator;
160     }
161 
162     /**
163      * NOTE: This method does not currently handle nested transliterators.
164      *
165      * @param input
166      * @return null if the input file was missing, or if there is no new output file.
167      */
transform(LocaleTransform localeTransform)168     public CLDRFile transform(LocaleTransform localeTransform) {
169         Transliterator transliterator = loadTransliterator(localeTransform);
170         CLDRFile input;
171         try {
172             input = factory.make(localeTransform.getInputLocale(), false);
173         } catch (ICUUncheckedIOException e1) {
174             return null; // input file is missing (or otherwise unavailable)
175         }
176         boolean hadOutput = true;
177         CLDRFile output;
178         try {
179             output = factory.make(localeTransform.getOutputLocale(), false);
180         } catch (NoSourceDirectoryException e) {
181             // if we can't open the file, then just make a new one.
182             XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
183             output = new CLDRFile(dataSource);
184             hadOutput = false;
185         }
186         String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
187         CLDRFile outputParent = factory.make(outputParentString, true);
188 
189         outputParent = factory.make(localeTransform.getInputLocale(), false);
190         XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
191         for (String xpath : input) {
192             String value = input.getStringValue(xpath);
193             if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
194                 value = null;
195             }
196             if (value == null) {
197                 continue;
198             }
199             String fullPath = input.getFullXPath(xpath);
200             String oldValue = output.getStringValue(xpath);
201             String parentValue = outputParent.getStringValue(xpath);
202             value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
203             if (value != null && !CldrUtility.INHERITANCE_MARKER.equals(value)) {
204                 outputSource.putValueAtPath(fullPath, value);
205             }
206         }
207         if (!outputSource.iterator().hasNext()) { // empty new output
208             if (!hadOutput) {
209                 return null; // don't add file if nothing to add
210             }
211         }
212         return new CLDRFile(outputSource);
213     }
214 
215     /**
216      * Transforms a CLDRFile value into another form.
217      * @param parentValue
218      */
transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)219     private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
220         String oldValue, String parentValue) {
221 
222         // allows us to change only new values
223         switch (localeTransform.policy) {
224         case RETAIN:
225         case MINIMIZE:
226             if (oldValue != null) {
227                 return oldValue;
228             }
229             break;
230         default:
231         }
232 
233         UnicodeSet chars = localeTransform.getInputChars();
234         String transliterated;
235 
236         // TODO: Don't transform dates/patterns.
237         // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
238         // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
239         if (path.contains("exemplarCharacters")) {
240             if (oldValue != null) {
241                 transliterated = oldValue;
242             } else {
243                 transliterated = value;
244             }
245         } else {
246             transliterated = transliterator.transliterate(value);
247             transliterated = Normalizer.compose(transliterated, false);
248         }
249         if (localeTransform.policy == PolicyIfExisting.MINIMIZE) {
250             if (transliterated.equals(value)) {
251                 return null;
252             }
253         }
254 
255         if (chars.containsSome(transliterated)) {
256             unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
257         }
258         return transliterated;
259     }
260 
main(String[] args)261     public static void main(String[] args) throws Exception {
262         for (String dir : DtdType.ldml.directories) {
263             if (dir.equals("casing") // skip, field contents are keywords, not localizable content
264                 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
265                 || dir.equals("annotationsDerived") // skip, derived later
266                 ) {
267                 continue;
268             }
269             System.out.println("\nDirectory: " + dir);
270             final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
271             Factory factory = Factory.make(sourceDirectory, ".*");
272 
273             CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
274             for (LocaleTransform localeTransform : LocaleTransform.values()) {
275                 CLDRFile output = transformer.transform(localeTransform);
276                 if (output == null) {
277                     System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml");
278                     continue;
279                 }
280                 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
281                 String outputFile = output.getLocaleID() + ".xml";
282 
283                 try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
284                     System.out.println("Generating locale file: " + outputDir + outputFile);
285                     if (!transformer.unconverted.isEmpty()) {
286                         System.out.println("Untransformed characters: " + transformer.unconverted);
287                         transformer.unconverted.clear();
288                     }
289                     output.write(out);
290                 }
291             }
292         }
293     }
294 }
295