1 // Copyright 2011-2017 Google Inc. All Rights Reserved. 2 package org.unicode.cldr.tool; 3 4 import java.io.File; 5 import java.io.PrintWriter; 6 import java.util.HashMap; 7 import java.util.Map; 8 import java.util.regex.Matcher; 9 import java.util.regex.Pattern; 10 11 import org.unicode.cldr.draft.FileUtilities; 12 import org.unicode.cldr.util.CLDRFile; 13 import org.unicode.cldr.util.CLDRPaths; 14 import org.unicode.cldr.util.Factory; 15 import org.unicode.cldr.util.SimpleXMLSource; 16 import org.unicode.cldr.util.XMLSource; 17 18 /** 19 * Generates pseudolocalized contents of a CLDRFile. 20 * 21 * @author viarheichyk@google.com (Igor Viarheichyk) 22 */ 23 public class CLDRFilePseudolocalizer { 24 private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}"); 25 private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'"); 26 // Android patch (b/37077221) begin. 27 private static final String PSEUDOLOCALES_DIRECTORY = "."; 28 // Android patch (b/37077221) end. 29 private static final String ORIGINAL_LOCALE = "en"; 30 // Android patch (b/37512961) begin. 31 private static final String NUMBERS_PATH = "//ldml/numbers/defaultNumberingSystem"; 32 // Android patch (b/37512961) end. 33 private static final String EXEMPLAR_PATH = "//ldml/characters/exemplarCharacters"; 34 private static final String EXEMPLAR_AUX_PATH = "//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]"; 35 private static final String TERRITORY_PATTERN = "//ldml/localeDisplayNames/territories/territory[@type=\"%s\"]"; 36 private static final String[] EXCLUDE_LIST = { "/exemplarCharacters", "/delimiters", 37 "/contextTransforms", "/numbers", 38 "/units", // [ and ] are not allowed in units 39 "narrow", "localeDisplayPattern", "timeZoneNames/fallbackFormat", // Expansion limits 40 }; 41 private static final String[] PATTERN_LIST = { "/pattern", "FormatItem", "hourFormat" }; 42 43 private static class Pseudolocalizer { 44 private boolean pattern; 45 Pseudolocalizer()46 public Pseudolocalizer() { 47 pattern = false; 48 } 49 getPattern()50 public boolean getPattern() { 51 return pattern; 52 } 53 start()54 public String start() { 55 return ""; 56 } 57 end()58 public String end() { 59 return ""; 60 } 61 fragment(String text)62 public String fragment(String text) { 63 return text; 64 } 65 setPattern(boolean pattern)66 protected void setPattern(boolean pattern) { 67 this.pattern = pattern; 68 } 69 } 70 71 private static class PseudolocalizerXA extends Pseudolocalizer { 72 private static final String[] NUMBERS = { 73 "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", 74 "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", 75 "seventeen", "eighteen", "nineteen", "twenty", "twentyone", "twentytwo", 76 "twentythree", "twentyfour", "twentyfive", "twentysix", "twentyseven", 77 "twentyeight", "twentynine", "thirty", "thirtyone", "thirtytwo", 78 "thirtythree", "thirtyfour", "thirtyfive", "thirtysix", "thirtyseven", 79 "thirtyeight", "thirtynine", "forty" 80 }; 81 private static final Map<Integer, String> REPLACEMENTS = buildReplacementsTable(); 82 private int charCount = 0; 83 buildReplacementsTable()84 private static Map<Integer, String> buildReplacementsTable() { 85 Map<Integer, String> table = new HashMap<>(); 86 table.put((int) ' ', "\u2003"); 87 table.put((int) '!', "\u00a1"); 88 table.put((int) '"', "\u2033"); 89 table.put((int) '#', "\u266f"); 90 table.put((int) '$', "\u20ac"); 91 table.put((int) '%', "\u2030"); 92 table.put((int) '&', "\u214b"); 93 table.put((int) '*', "\u204e"); 94 table.put((int) '+', "\u207a"); 95 table.put((int) ',', "\u060c"); 96 table.put((int) '-', "\u2010"); 97 table.put((int) '.', "\u00b7"); 98 table.put((int) '/', "\u2044"); 99 table.put((int) '0', "\u24ea"); 100 table.put((int) '1', "\u2460"); 101 table.put((int) '2', "\u2461"); 102 table.put((int) '3', "\u2462"); 103 table.put((int) '4', "\u2463"); 104 table.put((int) '5', "\u2464"); 105 table.put((int) '6', "\u2465"); 106 table.put((int) '7', "\u2466"); 107 table.put((int) '8', "\u2467"); 108 table.put((int) '9', "\u2468"); 109 table.put((int) ':', "\u2236"); 110 table.put((int) ';', "\u204f"); 111 table.put((int) '<', "\u2264"); 112 table.put((int) '=', "\u2242"); 113 table.put((int) '>', "\u2265"); 114 table.put((int) '?', "\u00bf"); 115 table.put((int) '@', "\u055e"); 116 table.put((int) 'A', "\u00c5"); 117 table.put((int) 'B', "\u0181"); 118 table.put((int) 'C', "\u00c7"); 119 table.put((int) 'D', "\u00d0"); 120 table.put((int) 'E', "\u00c9"); 121 table.put((int) 'F', "\u0191"); 122 table.put((int) 'G', "\u011c"); 123 table.put((int) 'H', "\u0124"); 124 table.put((int) 'I', "\u00ce"); 125 table.put((int) 'J', "\u0134"); 126 table.put((int) 'K', "\u0136"); 127 table.put((int) 'L', "\u013b"); 128 table.put((int) 'M', "\u1e40"); 129 table.put((int) 'N', "\u00d1"); 130 table.put((int) 'O', "\u00d6"); 131 table.put((int) 'P', "\u00de"); 132 table.put((int) 'Q', "\u01ea"); 133 table.put((int) 'R', "\u0154"); 134 table.put((int) 'S', "\u0160"); 135 table.put((int) 'T', "\u0162"); 136 table.put((int) 'U', "\u00db"); 137 table.put((int) 'V', "\u1e7c"); 138 table.put((int) 'W', "\u0174"); 139 table.put((int) 'X', "\u1e8a"); 140 table.put((int) 'Y', "\u00dd"); 141 table.put((int) 'Z', "\u017d"); 142 table.put((int) '[', "\u2045"); 143 table.put((int) '\\', "\u2216"); 144 table.put((int) ']', "\u2046"); 145 table.put((int) '^', "\u02c4"); 146 table.put((int) '_', "\u203f"); 147 table.put((int) '`', "\u2035"); 148 table.put((int) 'a', "\u00e5"); 149 table.put((int) 'b', "\u0180"); 150 table.put((int) 'c', "\u00e7"); 151 table.put((int) 'd', "\u00f0"); 152 table.put((int) 'e', "\u00e9"); 153 table.put((int) 'f', "\u0192"); 154 table.put((int) 'g', "\u011d"); 155 table.put((int) 'h', "\u0125"); 156 table.put((int) 'i', "\u00ee"); 157 table.put((int) 'j', "\u0135"); 158 table.put((int) 'k', "\u0137"); 159 table.put((int) 'l', "\u013c"); 160 table.put((int) 'm', "\u0271"); 161 table.put((int) 'n', "\u00f1"); 162 table.put((int) 'o', "\u00f6"); 163 table.put((int) 'p', "\u00fe"); 164 table.put((int) 'q', "\u01eb"); 165 table.put((int) 'r', "\u0155"); 166 table.put((int) 's', "\u0161"); 167 table.put((int) 't', "\u0163"); 168 table.put((int) 'u', "\u00fb"); 169 table.put((int) 'v', "\u1e7d"); 170 table.put((int) 'w', "\u0175"); 171 table.put((int) 'x', "\u1e8b"); 172 table.put((int) 'y', "\u00fd"); 173 table.put((int) 'z', "\u017e"); 174 table.put((int) '|', "\u00a6"); 175 table.put((int) '~', "\u02de"); 176 return table; 177 } 178 179 @Override start()180 public String start() { 181 charCount = 0; 182 return "["; 183 } 184 185 @Override end()186 public String end() { 187 StringBuilder expansionText = new StringBuilder(); 188 int expansion = (charCount + 1) / 2; 189 int wordIndex = 0; 190 while (expansion > 0) { 191 String word = NUMBERS[wordIndex++ % NUMBERS.length]; 192 expansionText.append(' '); 193 // Protect expansion strings with single quotes for patterns. 194 if (getPattern()) { 195 expansionText.append('\''); 196 } 197 expansionText.append(word); 198 if (getPattern()) { 199 expansionText.append('\''); 200 } 201 expansion -= word.length() + 1; 202 } 203 expansionText.append(']'); 204 return expansionText.toString(); 205 } 206 207 @Override fragment(String text)208 public String fragment(String text) { 209 StringBuilder buf = new StringBuilder(); 210 int index = 0; 211 while (index < text.length()) { 212 int codePoint = text.codePointAt(index); 213 charCount++; 214 index += Character.charCount(codePoint); 215 String replacement = REPLACEMENTS.get(codePoint); 216 if (replacement != null) { 217 buf.append(replacement); 218 } else { 219 buf.appendCodePoint(codePoint); 220 } 221 } 222 return buf.toString(); 223 } 224 } 225 226 private static class PseudolocalizerXB extends Pseudolocalizer { 227 /** Right-to-left override character. */ 228 private static final String RLO = "\u202e"; 229 // Android patch (b/37512961) begin. 230 /** Arabic letter mark character. */ 231 private static final String ALM = "\u061C"; 232 /** Pop direction formatting character. */ 233 private static final String PDF = "\u202c"; 234 /** Prefix to add before each LTR word */ 235 private static final String BIDI_PREFIX = ALM + RLO; 236 /** Postfix to add after each LTR word */ 237 private static final String BIDI_POSTFIX = PDF + ALM; 238 // Android patch (b/37512961) end. 239 240 @Override fragment(String text)241 public String fragment(String text) { 242 StringBuilder output = new StringBuilder(); 243 boolean wrapping = false; 244 for (int index = 0; index < text.length();) { 245 int codePoint = text.codePointAt(index); 246 index += Character.charCount(codePoint); 247 byte directionality = Character.getDirectionality(codePoint); 248 boolean needsWrap = (directionality == Character.DIRECTIONALITY_LEFT_TO_RIGHT); 249 if (needsWrap != wrapping) { 250 wrapping = needsWrap; 251 output.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX); 252 } 253 output.appendCodePoint(codePoint); 254 } 255 if (wrapping) { 256 output.append(BIDI_POSTFIX); 257 } 258 return output.toString(); 259 } 260 } 261 262 private String outputLocale; 263 private Pseudolocalizer pseudolocalizer; 264 265 /** 266 * Construct new CLDRPseudolocalization object. 267 * 268 * @param outputLocale 269 * name of target locale 270 * @param pipeline 271 * pseudolocalization pipeline to generate target locale data 272 */ CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer)273 public CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer) { 274 this.outputLocale = outputLocale; 275 this.pseudolocalizer = pseudolocalizer; 276 } 277 createInstanceXA()278 public static CLDRFilePseudolocalizer createInstanceXA() { 279 return new CLDRFilePseudolocalizer("en_XA", new PseudolocalizerXA()); 280 } 281 createInstanceXB()282 public static CLDRFilePseudolocalizer createInstanceXB() { 283 return new CLDRFilePseudolocalizer("ar_XB", new PseudolocalizerXB()); 284 } 285 286 /** 287 * Transforms a CLDRFile value into another form. 288 * 289 * @return pseudolocalized value. 290 */ transformValue(String path, String value)291 private String transformValue(String path, String value) { 292 if (containsOneOf(path, EXCLUDE_LIST)) { 293 return value; 294 } 295 if (containsOneOf(path, PATTERN_LIST)) { 296 return createMessage(value, QUOTED_TEXT, true); 297 } else { 298 return createMessage(value, NUMERIC_PLACEHOLDER, false); 299 } 300 } 301 302 /** 303 * Check if string contains any substring from the provided list. 304 */ containsOneOf(String string, String[] substrings)305 private boolean containsOneOf(String string, String[] substrings) { 306 for (String substring : substrings) { 307 if (string.contains(substring)) { 308 return true; 309 } 310 } 311 return false; 312 } 313 314 /** 315 * Create either localizable or non-localizable text fragment depending on flag value. 316 */ pseudolocalizeFragment(String text, boolean localizable)317 private String pseudolocalizeFragment(String text, boolean localizable) { 318 return localizable ? pseudolocalizer.fragment(text) : text; 319 } 320 321 /** 322 * Create a message that can contain localizable and non-localizable parts. 323 */ createMessage(String text, Pattern pattern, boolean matchIsLocalizable)324 private String createMessage(String text, Pattern pattern, 325 boolean matchIsLocalizable) { 326 StringBuffer buffer = new StringBuffer(pseudolocalizer.start()); 327 Matcher match = pattern.matcher(text); 328 int start = 0; 329 pseudolocalizer.setPattern(matchIsLocalizable); 330 for (; match.find(); start = match.end()) { 331 if (match.start() > start) { 332 buffer.append(pseudolocalizeFragment( 333 text.substring(start, match.start()), !matchIsLocalizable)); 334 } 335 buffer.append(pseudolocalizeFragment(match.group(), matchIsLocalizable)); 336 } 337 if (start < text.length()) { 338 buffer.append(pseudolocalizeFragment(text.substring(start), !matchIsLocalizable)); 339 } 340 buffer.append(pseudolocalizer.end()); 341 return buffer.toString(); 342 } 343 344 /** 345 * Add pseudolocale characters to exemplarCharacters entry pointed by xpath. 346 */ mergeExemplars(String value)347 private String mergeExemplars(String value) { 348 String pseudolocalized = createMessage(value, NUMERIC_PLACEHOLDER, false); 349 StringBuffer result = new StringBuffer(value.substring(0, value.length() - 1)); 350 final char CLOSING_BRACKET = ']'; 351 for (int i = 0; i < pseudolocalized.length(); i++) { 352 char c = pseudolocalized.charAt(i); 353 if (c != CLOSING_BRACKET) { 354 String chunk; 355 if (Character.isAlphabetic(c)) { 356 chunk = String.valueOf(c); 357 } else { 358 chunk = String.format("\\u%04X", (int) c); 359 } 360 if (result.indexOf(chunk) == -1 361 && result.indexOf(String.valueOf(c)) == -1) { 362 result.append(' '); 363 result.append(chunk); 364 } 365 } 366 } 367 result.append(CLOSING_BRACKET); 368 return result.toString(); 369 } 370 371 /** 372 * Generate CLDRFile object. Original CLDRFile is created from .xml file and its 373 * content is passed through pseudolocalization pipeline. 374 */ generate()375 public CLDRFile generate() { 376 Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 377 // Create input CLDRFile object resolving inherited data. 378 CLDRFile input = factory.make(ORIGINAL_LOCALE, false); 379 XMLSource outputSource = new SimpleXMLSource(outputLocale); 380 for (String xpath : input) { 381 String fullPath = input.getFullXPath(xpath); 382 String value = input.getStringValue(xpath); 383 if (!value.isEmpty()) { 384 String newValue = transformValue(xpath, value); 385 if (!newValue.equals(value)) { 386 outputSource.putValueAtPath(fullPath, newValue); 387 } 388 } 389 } 390 // Pseudolocalize exemplar characters and put them into auxiliary set. 391 outputSource.putValueAtPath(EXEMPLAR_AUX_PATH, 392 mergeExemplars(input.getStringValue(EXEMPLAR_PATH))); 393 // Create fake pseudolocales territories. 394 addTerritory(outputSource, "XA"); 395 addTerritory(outputSource, "XB"); 396 // Android patch (b/37512961) begin. 397 // Use latin numbers for pseudolocales. 398 outputSource.putValueAtPath(NUMBERS_PATH, "latn"); 399 // Android patch (b/37512961) end. 400 return new CLDRFile(outputSource); 401 } 402 403 /** 404 * Add a territory into output xml. 405 */ addTerritory(XMLSource outputSource, String territory)406 private void addTerritory(XMLSource outputSource, String territory) { 407 String territoryPath = String.format(TERRITORY_PATTERN, territory); 408 outputSource.putValueAtPath(territoryPath, String.format("[%s]", territory)); 409 } 410 411 /** 412 * Generate CLDRFile object and save it into .xml file. 413 */ generateAndSave()414 public String generateAndSave() throws Exception { 415 CLDRFile output = generate(); 416 String outputDir = CLDRPaths.GEN_DIRECTORY + "main" + File.separator + PSEUDOLOCALES_DIRECTORY + File.separator; 417 String outputFile = output.getLocaleID() + ".xml"; 418 PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile); 419 output.write(out); 420 out.close(); 421 return (outputDir + outputFile); 422 } 423 main(String[] args)424 public static void main(String[] args) throws Exception { 425 // Generate en-XA locale (accents, brackets and expansion), 426 // dump resulting file name to stdout. 427 System.out.println(createInstanceXA().generateAndSave()); 428 // Generate ar-XB (fake Bidi) locale. 429 System.out.println(createInstanceXB().generateAndSave()); 430 } 431 } 432