// Copyright 2011-2017 Google Inc. All Rights Reserved.
package org.unicode.cldr.tool;

import java.io.File;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.XMLSource;

/**
 * Generates pseudolocalized contents of a CLDRFile.
 *
 * <p>Two pseudolocales are produced from the {@code en} locale data: {@code en_XA}
 * (character substitution, brackets and text expansion) and {@code ar_XB} (fake Bidi).
 *
 * @author viarheichyk@google.com (Igor Viarheichyk)
 */
public class CLDRFilePseudolocalizer {
    /** Matches numeric placeholders such as <code>{0}</code>; these are never altered. */
    private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}");
    /** Matches single-quoted text; in pattern values this is the only localizable part. */
    private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'");
    // Android patch (b/37077221) begin.
    private static final String PSEUDOLOCALES_DIRECTORY = ".";
    // Android patch (b/37077221) end.
    private static final String ORIGINAL_LOCALE = "en";
    // Android patch (b/37512961) begin.
    private static final String NUMBERS_PATH = "//ldml/numbers/defaultNumberingSystem";
    // Android patch (b/37512961) end.
    private static final String EXEMPLAR_PATH = "//ldml/characters/exemplarCharacters";
    private static final String EXEMPLAR_AUX_PATH = "//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]";
    private static final String TERRITORY_PATTERN = "//ldml/localeDisplayNames/territories/territory[@type=\"%s\"]";
    /** Values whose path contains any of these substrings are left untouched. */
    private static final String[] EXCLUDE_LIST = { "/exemplarCharacters", "/delimiters",
        "/contextTransforms", "/numbers",
        "/units", // [ and ] are not allowed in units
        "narrow", "localeDisplayPattern", "timeZoneNames/fallbackFormat", // Expansion limits
    };
    /** Values whose path contains any of these substrings are treated as patterns. */
    private static final String[] PATTERN_LIST = { "/pattern", "FormatItem", "hourFormat" };

    /**
     * Base pseudolocalizer. It is a pass-through: {@link #start()} and {@link #end()}
     * contribute nothing and {@link #fragment(String)} returns its input unchanged.
     * Subclasses override these hooks to decorate a message.
     */
    private static class Pseudolocalizer {
        /** True while the message being built is a pattern (see {@code createMessage}). */
        private boolean pattern;

        public Pseudolocalizer() {
            pattern = false;
        }

        /** @return whether the message currently being built is a pattern. */
        public boolean getPattern() {
            return pattern;
        }

        /** @return text to emit before the first fragment of a message. */
        public String start() {
            return "";
        }

        /** @return text to emit after the last fragment of a message. */
        public String end() {
            return "";
        }

        /**
         * Transforms one localizable piece of a message.
         *
         * @param text localizable text
         * @return the transformed text; this base implementation returns {@code text} as is
         */
        public String fragment(String text) {
            return text;
        }

        /** Records whether the message being built is a pattern. */
        protected void setPattern(boolean pattern) {
            this.pattern = pattern;
        }
    }

    /**
     * Pseudolocalizer for {@code en_XA}: wraps each message in square brackets, replaces
     * characters through a fixed substitution table and appends filler words so that the
     * result is longer than the source text.
     */
    private static class PseudolocalizerXA extends Pseudolocalizer {
        /** Filler words cycled through when expanding a message. */
        private static final String[] NUMBERS = {
            "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
            "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
            "seventeen", "eighteen", "nineteen", "twenty", "twentyone", "twentytwo",
            "twentythree", "twentyfour", "twentyfive", "twentysix", "twentyseven",
            "twentyeight", "twentynine", "thirty", "thirtyone", "thirtytwo",
            "thirtythree", "thirtyfour", "thirtyfive", "thirtysix", "thirtyseven",
            "thirtyeight", "thirtynine", "forty"
        };
        /** Code point to replacement string; code points without an entry are kept. */
        private static final Map<Integer, String> REPLACEMENTS = buildReplacementsTable();
        /** Number of code points passed to {@link #fragment(String)} since {@link #start()}. */
        private int charCount = 0;

        private static Map<Integer, String> buildReplacementsTable() {
            Map<Integer, String> table = new HashMap<>();
            table.put((int) ' ', "\u2003");
            table.put((int) '!', "\u00a1");
            table.put((int) '"', "\u2033");
            table.put((int) '#', "\u266f");
            table.put((int) '$', "\u20ac");
            table.put((int) '%', "\u2030");
            table.put((int) '&', "\u214b");
            table.put((int) '*', "\u204e");
            table.put((int) '+', "\u207a");
            table.put((int) ',', "\u060c");
            table.put((int) '-', "\u2010");
            table.put((int) '.', "\u00b7");
            table.put((int) '/', "\u2044");
            table.put((int) '0', "\u24ea");
            table.put((int) '1', "\u2460");
            table.put((int) '2', "\u2461");
            table.put((int) '3', "\u2462");
            table.put((int) '4', "\u2463");
            table.put((int) '5', "\u2464");
            table.put((int) '6', "\u2465");
            table.put((int) '7', "\u2466");
            table.put((int) '8', "\u2467");
            table.put((int) '9', "\u2468");
            table.put((int) ':', "\u2236");
            table.put((int) ';', "\u204f");
            table.put((int) '<', "\u2264");
            table.put((int) '=', "\u2242");
            table.put((int) '>', "\u2265");
            table.put((int) '?', "\u00bf");
            table.put((int) '@', "\u055e");
            table.put((int) 'A', "\u00c5");
            table.put((int) 'B', "\u0181");
            table.put((int) 'C', "\u00c7");
            table.put((int) 'D', "\u00d0");
            table.put((int) 'E', "\u00c9");
            table.put((int) 'F', "\u0191");
            table.put((int) 'G', "\u011c");
            table.put((int) 'H', "\u0124");
            table.put((int) 'I', "\u00ce");
            table.put((int) 'J', "\u0134");
            table.put((int) 'K', "\u0136");
            table.put((int) 'L', "\u013b");
            table.put((int) 'M', "\u1e40");
            table.put((int) 'N', "\u00d1");
            table.put((int) 'O', "\u00d6");
            table.put((int) 'P', "\u00de");
            table.put((int) 'Q', "\u01ea");
            table.put((int) 'R', "\u0154");
            table.put((int) 'S', "\u0160");
            table.put((int) 'T', "\u0162");
            table.put((int) 'U', "\u00db");
            table.put((int) 'V', "\u1e7c");
            table.put((int) 'W', "\u0174");
            table.put((int) 'X', "\u1e8a");
            table.put((int) 'Y', "\u00dd");
            table.put((int) 'Z', "\u017d");
            table.put((int) '[', "\u2045");
            table.put((int) '\\', "\u2216");
            table.put((int) ']', "\u2046");
            table.put((int) '^', "\u02c4");
            table.put((int) '_', "\u203f");
            table.put((int) '`', "\u2035");
            table.put((int) 'a', "\u00e5");
            table.put((int) 'b', "\u0180");
            table.put((int) 'c', "\u00e7");
            table.put((int) 'd', "\u00f0");
            table.put((int) 'e', "\u00e9");
            table.put((int) 'f', "\u0192");
            table.put((int) 'g', "\u011d");
            table.put((int) 'h', "\u0125");
            table.put((int) 'i', "\u00ee");
            table.put((int) 'j', "\u0135");
            table.put((int) 'k', "\u0137");
            table.put((int) 'l', "\u013c");
            table.put((int) 'm', "\u0271");
            table.put((int) 'n', "\u00f1");
            table.put((int) 'o', "\u00f6");
            table.put((int) 'p', "\u00fe");
            table.put((int) 'q', "\u01eb");
            table.put((int) 'r', "\u0155");
            table.put((int) 's', "\u0161");
            table.put((int) 't', "\u0163");
            table.put((int) 'u', "\u00fb");
            table.put((int) 'v', "\u1e7d");
            table.put((int) 'w', "\u0175");
            table.put((int) 'x', "\u1e8b");
            table.put((int) 'y', "\u00fd");
            table.put((int) 'z', "\u017e");
            table.put((int) '|', "\u00a6");
            table.put((int) '~', "\u02de");
            return table;
        }

        /** Resets the per-message character count and opens the bracket. */
        @Override
        public String start() {
            charCount = 0;
            return "[";
        }

        /**
         * Builds the expansion suffix and closes the bracket.
         *
         * <p>The expansion budget is half the number of code points seen by
         * {@link #fragment(String)} (rounded up). Filler words are appended, each preceded
         * by a space, until the budget is used up; every word costs its length plus one.
         */
        @Override
        public String end() {
            StringBuilder expansionText = new StringBuilder();
            int expansion = (charCount + 1) / 2;
            int wordIndex = 0;
            while (expansion > 0) {
                String word = NUMBERS[wordIndex++ % NUMBERS.length];
                expansionText.append(' ');
                // Protect expansion strings with single quotes for patterns.
                if (getPattern()) {
                    expansionText.append('\'');
                }
                expansionText.append(word);
                if (getPattern()) {
                    expansionText.append('\'');
                }
                expansion -= word.length() + 1;
            }
            expansionText.append(']');
            return expansionText.toString();
        }

        /**
         * Replaces every code point that has an entry in the substitution table and counts
         * all code points (replaced or not) towards the expansion budget.
         */
        @Override
        public String fragment(String text) {
            StringBuilder buf = new StringBuilder();
            int index = 0;
            while (index < text.length()) {
                int codePoint = text.codePointAt(index);
                charCount++;
                index += Character.charCount(codePoint);
                String replacement = REPLACEMENTS.get(codePoint);
                if (replacement != null) {
                    buf.append(replacement);
                } else {
                    buf.appendCodePoint(codePoint);
                }
            }
            return buf.toString();
        }
    }

    /**
     * Pseudolocalizer for {@code ar_XB}: surrounds every run of left-to-right characters
     * with Bidi control characters.
     */
    private static class PseudolocalizerXB extends Pseudolocalizer {
        /** Right-to-left override character. */
        private static final String RLO = "\u202e";
        // Android patch (b/37512961) begin.
        /** Arabic letter mark character. */
        private static final String ALM = "\u061C";
        /** Pop direction formatting character. */
        private static final String PDF = "\u202c";
        /** Prefix to add before each LTR word */
        private static final String BIDI_PREFIX = ALM + RLO;
        /** Postfix to add after each LTR word */
        private static final String BIDI_POSTFIX = PDF + ALM;
        // Android patch (b/37512961) end.

        /**
         * Inserts {@code BIDI_PREFIX} before and {@code BIDI_POSTFIX} after each maximal run
         * of code points whose directionality is
         * {@link Character#DIRECTIONALITY_LEFT_TO_RIGHT}; other code points are copied as is.
         */
        @Override
        public String fragment(String text) {
            StringBuilder output = new StringBuilder();
            boolean wrapping = false;
            for (int index = 0; index < text.length();) {
                int codePoint = text.codePointAt(index);
                index += Character.charCount(codePoint);
                byte directionality = Character.getDirectionality(codePoint);
                boolean needsWrap = (directionality == Character.DIRECTIONALITY_LEFT_TO_RIGHT);
                if (needsWrap != wrapping) {
                    // A run boundary: open the wrapper when entering an LTR run, close it when leaving.
                    wrapping = needsWrap;
                    output.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX);
                }
                output.appendCodePoint(codePoint);
            }
            // Close a run that extends to the end of the text.
            if (wrapping) {
                output.append(BIDI_POSTFIX);
            }
            return output.toString();
        }
    }

    private final String outputLocale;
    private final Pseudolocalizer pseudolocalizer;

    /**
     * Construct new CLDRPseudolocalization object.
     *
     * @param outputLocale
     *            name of target locale
     * @param pseudolocalizer
     *            pseudolocalizer used to generate target locale data
     */
    public CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer) {
        this.outputLocale = outputLocale;
        this.pseudolocalizer = pseudolocalizer;
    }

    /** @return a generator for the {@code en_XA} pseudolocale. */
    public static CLDRFilePseudolocalizer createInstanceXA() {
        return new CLDRFilePseudolocalizer("en_XA", new PseudolocalizerXA());
    }

    /** @return a generator for the {@code ar_XB} pseudolocale. */
    public static CLDRFilePseudolocalizer createInstanceXB() {
        return new CLDRFilePseudolocalizer("ar_XB", new PseudolocalizerXB());
    }

    /**
     * Transforms a CLDRFile value into another form.
     *
     * <p>Values on excluded paths are returned unchanged. On pattern paths only
     * single-quoted text is pseudolocalized; on all other paths everything except
     * numeric placeholders is pseudolocalized.
     *
     * @param path xpath of the value, used to select the treatment
     * @param value original value
     * @return pseudolocalized value.
     */
    private String transformValue(String path, String value) {
        if (containsOneOf(path, EXCLUDE_LIST)) {
            return value;
        }
        if (containsOneOf(path, PATTERN_LIST)) {
            return createMessage(value, QUOTED_TEXT, true);
        } else {
            return createMessage(value, NUMERIC_PLACEHOLDER, false);
        }
    }

    /**
     * Check if string contains any substring from the provided list.
     */
    private boolean containsOneOf(String string, String[] substrings) {
        for (String substring : substrings) {
            if (string.contains(substring)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Create either localizable or non-localizable text fragment depending on flag value.
     */
    private String pseudolocalizeFragment(String text, boolean localizable) {
        return localizable ? pseudolocalizer.fragment(text) : text;
    }

    /**
     * Create a message that can contain localizable and non-localizable parts.
     *
     * @param text original message
     * @param pattern splits the message into matching and non-matching pieces
     * @param matchIsLocalizable
     *            if true, the pieces matching {@code pattern} are pseudolocalized and the rest
     *            is copied; if false, it is the other way round. The flag is also passed to
     *            the pseudolocalizer as its "pattern" state.
     * @return the pseudolocalizer's start text, the processed pieces in order, and its end text
     */
    private String createMessage(String text, Pattern pattern,
        boolean matchIsLocalizable) {
        pseudolocalizer.setPattern(matchIsLocalizable);
        StringBuilder buffer = new StringBuilder(pseudolocalizer.start());
        Matcher match = pattern.matcher(text);
        int start = 0;
        for (; match.find(); start = match.end()) {
            // Text between the previous match and this one.
            if (match.start() > start) {
                buffer.append(pseudolocalizeFragment(
                    text.substring(start, match.start()), !matchIsLocalizable));
            }
            buffer.append(pseudolocalizeFragment(match.group(), matchIsLocalizable));
        }
        // Text after the last match (or the whole text when nothing matched).
        if (start < text.length()) {
            buffer.append(pseudolocalizeFragment(text.substring(start), !matchIsLocalizable));
        }
        buffer.append(pseudolocalizer.end());
        return buffer.toString();
    }

    /**
     * Add pseudolocale characters to the given exemplarCharacters value.
     *
     * <p>The value is pseudolocalized and every resulting character that is not yet present
     * is appended before the closing bracket. Alphabetic characters are appended literally,
     * all others as a backslash-u hex escape.
     *
     * @param value
     *            original exemplar set; its last character is dropped and replaced by a
     *            closing bracket, so it is presumably expected to be non-empty and to end
     *            with {@code ]} (TODO confirm against the en locale data)
     * @return the extended exemplar set
     */
    private String mergeExemplars(String value) {
        String pseudolocalized = createMessage(value, NUMERIC_PLACEHOLDER, false);
        StringBuilder result = new StringBuilder(value.substring(0, value.length() - 1));
        final char CLOSING_BRACKET = ']';
        for (int i = 0; i < pseudolocalized.length(); i++) {
            char c = pseudolocalized.charAt(i);
            if (c != CLOSING_BRACKET) {
                String chunk;
                if (Character.isAlphabetic(c)) {
                    chunk = String.valueOf(c);
                } else {
                    chunk = String.format("\\u%04X", (int) c);
                }
                // Skip characters already present, either literally or in escaped form.
                if (result.indexOf(chunk) == -1
                    && result.indexOf(String.valueOf(c)) == -1) {
                    result.append(' ');
                    result.append(chunk);
                }
            }
        }
        result.append(CLOSING_BRACKET);
        return result.toString();
    }

    /**
     * Generate CLDRFile object. Original CLDRFile is created from .xml file and its
     * content is passed through pseudolocalization pipeline.
     *
     * <p>Only values that actually change are written to the output. In addition, the
     * auxiliary exemplar set, the XA and XB territories and the default numbering system
     * are always written.
     *
     * @return the pseudolocalized CLDRFile for the configured output locale
     */
    public CLDRFile generate() {
        Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        // Create input CLDRFile object.
        // NOTE(review): the original comment said "resolving inherited data" but the flag
        // passed is false - confirm which behavior is intended.
        CLDRFile input = factory.make(ORIGINAL_LOCALE, false);
        XMLSource outputSource = new SimpleXMLSource(outputLocale);
        for (String xpath : input) {
            String fullPath = input.getFullXPath(xpath);
            String value = input.getStringValue(xpath);
            // Nothing to pseudolocalize for missing or empty values.
            if (value != null && !value.isEmpty()) {
                String newValue = transformValue(xpath, value);
                if (!newValue.equals(value)) {
                    outputSource.putValueAtPath(fullPath, newValue);
                }
            }
        }
        // Pseudolocalize exemplar characters and put them into auxiliary set.
        outputSource.putValueAtPath(EXEMPLAR_AUX_PATH,
            mergeExemplars(input.getStringValue(EXEMPLAR_PATH)));
        // Create fake pseudolocales territories.
        addTerritory(outputSource, "XA");
        addTerritory(outputSource, "XB");
        // Android patch (b/37512961) begin.
        // Use latin numbers for pseudolocales.
        outputSource.putValueAtPath(NUMBERS_PATH, "latn");
        // Android patch (b/37512961) end.
        return new CLDRFile(outputSource);
    }

    /**
     * Add a territory into output xml. The display name is the territory code in square
     * brackets.
     */
    private void addTerritory(XMLSource outputSource, String territory) {
        String territoryPath = String.format(TERRITORY_PATTERN, territory);
        outputSource.putValueAtPath(territoryPath, String.format("[%s]", territory));
    }

    /**
     * Generate CLDRFile object and save it into .xml file.
     *
     * @return path of the written file
     * @throws Exception if generating or writing the file fails
     */
    public String generateAndSave() throws Exception {
        CLDRFile output = generate();
        String outputDir = CLDRPaths.GEN_DIRECTORY + "main" + File.separator + PSEUDOLOCALES_DIRECTORY + File.separator;
        String outputFile = output.getLocaleID() + ".xml";
        // try-with-resources closes the writer even when write() throws.
        try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
            output.write(out);
        }
        return (outputDir + outputFile);
    }

    public static void main(String[] args) throws Exception {
        // Generate en-XA locale (accents, brackets and expansion),
        // dump resulting file name to stdout.
        System.out.println(createInstanceXA().generateAndSave());
        // Generate ar-XB (fake Bidi) locale.
        System.out.println(createInstanceXB().generateAndSave());
    }
}