package org.unicode.cldr.tool;

import java.io.File;
import java.io.PrintWriter;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTransforms;
import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.XMLSource;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Transforms the contents of a CLDRFile.
 *
 * <p>Each supported conversion is described by a {@link LocaleTransform}; {@link #transform}
 * reads the input locale from the factory, transliterates its values and returns a new
 * CLDRFile for the output locale.
 *
 * @author jchye
 */
public class CLDRFileTransformer {
    /** What to do with a path when the existing output locale file already has a value for it. */
    public enum PolicyIfExisting {
        RETAIN, // Do not transliterate if existing output has locale content
        DISCARD, // Replace existing output locale content
        MINIMIZE // RETAIN, plus drop values if translit is a no-op.
    }

    /**
     * Contains all supported locale-to-locale conversions along with information
     * needed to convert each locale. Each enum value is named after the locale that results
     * from the conversion.
     */
    public enum LocaleTransform {
        sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), //
        yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), //
        de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
        yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), //
        // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
        // Needs work to fix currency symbols, handle Maori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
        ;

        private final String inputLocale;
        private final String transformFilename;
        private final int direction;
        private final UnicodeSet inputChars;
        private final PolicyIfExisting policy;

        /**
         * Creates a transform that uses {@link PolicyIfExisting#DISCARD}.
         *
         * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
         */
        @Deprecated
        private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
            this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
        }

        /**
         * @param inputLocale
         *            the locale whose data is read and converted
         * @param transformFilename
         *            either the name of a transform XML file, or (when it does not contain ".xml")
         *            an ID passed to {@link Transliterator#getInstance(String)}
         * @param direction
         *            {@link Transliterator#FORWARD} or {@link Transliterator#REVERSE}
         * @param inputCharPattern
         *            a UnicodeSet pattern for the input characters that should no longer be present
         *            after transformation
         * @param policy
         *            how to treat values already present in the output locale
         */
        private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern,
            PolicyIfExisting policy) {
            this.inputLocale = inputLocale;
            this.transformFilename = transformFilename;
            this.direction = direction;
            this.inputChars = new UnicodeSet(inputCharPattern);
            this.policy = policy;
        }

        /**
         * @return the policy for existing content
         */
        public PolicyIfExisting getPolicyIfExisting() {
            return policy;
        }

        /**
         * @return the locale that is used as the source of the conversion
         */
        public String getInputLocale() {
            return inputLocale;
        }

        /**
         * @return the locale that results from the conversion (the name of this enum value)
         */
        public String getOutputLocale() {
            return this.toString();
        }

        /**
         * @return the filename of the transform used to make the conversion
         */
        public String getTransformFilename() {
            return transformFilename;
        }

        /**
         * @return the direction of the transformation
         */
        public int getDirection() {
            return direction;
        }

        /**
         * @return the set of characters in the input locale that should have been removed after
         *         transformation, used for internal debugging
         */
        private UnicodeSet getInputChars() {
            return inputChars;
        }
    }

    /** Input characters that were still present in transformed values; reported and cleared by main(). */
    private final UnicodeSet unconverted = new UnicodeSet();
    private final Factory factory;
    /*
     * The transliterators map exists, and is static, to avoid wasting a lot of time creating
     * a new Transliterator more often than necessary. (An alternative to "static" here might be to
     * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
     * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
     * Reference: https://unicode.org/cldr/trac/ticket/11657
     */
    private static final Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>();
    private final String transformDir;

    /**
     * @param factory
     *            the factory to get locale data from
     * @param transformDir
     *            the directory containing the transform files
     */
    public CLDRFileTransformer(Factory factory, String transformDir) {
        this.factory = factory;
        this.transformDir = transformDir;
    }

    /**
     * Returns the transliterator for the given conversion, creating and caching it on first use.
     *
     * <p>The cache is static and keyed only by the LocaleTransform, so an entry created by one
     * CLDRFileTransformer is reused by every other instance regardless of its transform directory.
     *
     * @param localeTransform
     *            the conversion to load a transliterator for
     * @return the cached or newly created transliterator
     */
    public Transliterator loadTransliterator(LocaleTransform localeTransform) {
        // Single lookup instead of containsKey() followed by get().
        Transliterator cached = transliterators.get(localeTransform);
        if (cached != null) {
            return cached;
        }
        final String transformName = localeTransform.getTransformFilename();
        Transliterator transliterator;
        if (transformName.contains(".xml")) {
            // Rules come from a transform XML file in transformDir.
            ParsedTransformID directionInfo = new ParsedTransformID();
            String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, transformName, directionInfo);
            transliterator = Transliterator.createFromRules(directionInfo.getId(),
                ruleString, localeTransform.getDirection());
        } else {
            // Not a file name: treat the string as a transliterator ID.
            transliterator = Transliterator.getInstance(transformName);
        }
        // If another thread stored an instance in the meantime, hand out that one so that all
        // callers share the same transliterator.
        Transliterator raced = transliterators.putIfAbsent(localeTransform, transliterator);
        return raced != null ? raced : transliterator;
    }

    /**
     * Builds the output locale file for a conversion by transforming every value of the
     * (unresolved) input locale file.
     *
     * NOTE: This method does not currently handle nested transliterators.
     *
     * @param localeTransform
     *            the conversion to apply
     * @return null if the input file was missing, or if there is no new output file.
     */
    public CLDRFile transform(LocaleTransform localeTransform) {
        Transliterator transliterator = loadTransliterator(localeTransform);
        CLDRFile input;
        try {
            input = factory.make(localeTransform.getInputLocale(), false);
        } catch (ICUUncheckedIOException e1) {
            return null; // input file is missing (or otherwise unavailable)
        }
        boolean hadOutput = true;
        CLDRFile output;
        try {
            output = factory.make(localeTransform.getOutputLocale(), false);
        } catch (NoSourceDirectoryException e) {
            // if we can't open the file, then just make a new one.
            XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
            output = new CLDRFile(dataSource);
            hadOutput = false;
        }
        // NOTE(review): the resolved parent of the output locale is loaded and then immediately
        // replaced by the unresolved input file, so the first result is never read. Both calls
        // are kept unchanged because factory.make() may throw or have caching side effects;
        // confirm whether the first load can simply be removed.
        String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
        CLDRFile outputParent = factory.make(outputParentString, true);

        outputParent = factory.make(localeTransform.getInputLocale(), false);
        XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
        for (String xpath : input) {
            String value = input.getStringValue(xpath);
            // Paths without a concrete value (missing or explicitly inherited) are not transformed.
            if (value == null || CldrUtility.INHERITANCE_MARKER.equals(value)) {
                continue;
            }
            String fullPath = input.getFullXPath(xpath);
            String oldValue = output.getStringValue(xpath);
            String parentValue = outputParent.getStringValue(xpath);
            value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
            if (value != null && !CldrUtility.INHERITANCE_MARKER.equals(value)) {
                outputSource.putValueAtPath(fullPath, value);
            }
        }
        // Empty new output and no pre-existing output file: don't add a file if nothing to add.
        if (!hadOutput && !outputSource.iterator().hasNext()) {
            return null;
        }
        return new CLDRFile(outputSource);
    }

    /**
     * Transforms a CLDRFile value into another form.
     *
     * @param transliterator
     *            the transliterator to apply
     * @param localeTransform
     *            the conversion being performed; supplies the policy and the input character set
     * @param path
     *            the xpath of the value
     * @param value
     *            the value from the input locale
     * @param oldValue
     *            the value currently in the output locale, or null if there is none
     * @param parentValue
     *            currently unused
     * @return the value to store in the output, or null if no value should be stored
     */
    private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
        String oldValue, String parentValue) {

        final PolicyIfExisting policy = localeTransform.getPolicyIfExisting();

        // allows us to change only new values
        if (oldValue != null && (policy == PolicyIfExisting.RETAIN || policy == PolicyIfExisting.MINIMIZE)) {
            return oldValue;
        }

        String transliterated;

        // TODO: Don't transform dates/patterns.
        // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
        // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
        if (path.contains("exemplarCharacters")) {
            transliterated = oldValue != null ? oldValue : value;
        } else {
            // Transliterate, then compose the result (non-compatibility mode).
            transliterated = Normalizer.compose(transliterator.transliterate(value), false);
        }

        // MINIMIZE: a value the transform leaves unchanged is dropped.
        if (policy == PolicyIfExisting.MINIMIZE && transliterated.equals(value)) {
            return null;
        }

        // Remember any input characters that survived the transform, for the debugging report.
        UnicodeSet chars = localeTransform.getInputChars();
        if (chars.containsSome(transliterated)) {
            unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
        }
        return transliterated;
    }

    /**
     * Runs every LocaleTransform over each ldml directory (except the skipped ones below) and
     * writes the generated locale files under CLDRPaths.GEN_DIRECTORY + "common/".
     */
    public static void main(String[] args) throws Exception {
        for (String dir : DtdType.ldml.directories) {
            if (dir.equals("casing") // skip, field contents are keywords, not localizable content
                || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
                || dir.equals("annotationsDerived") // skip, derived later
            ) {
                continue;
            }
            System.out.println("\nDirectory: " + dir);
            final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
            Factory factory = Factory.make(sourceDirectory, ".*");

            CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
            for (LocaleTransform localeTransform : LocaleTransform.values()) {
                CLDRFile output = transformer.transform(localeTransform);
                if (output == null) {
                    // transform() also returns null when there was nothing new to write.
                    System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.getInputLocale() + ".xml");
                    continue;
                }
                String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
                String outputFile = output.getLocaleID() + ".xml";

                try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
                    System.out.println("Generating locale file: " + outputDir + outputFile);
                    if (!transformer.unconverted.isEmpty()) {
                        System.out.println("Untransformed characters: " + transformer.unconverted);
                        transformer.unconverted.clear();
                    }
                    output.write(out);
                }
            }
        }
    }
}