1 // Copyright 2011-2017 Google Inc. All Rights Reserved.
2 package org.unicode.cldr.tool;
3 
4 import java.io.File;
5 import java.io.PrintWriter;
6 import java.util.HashMap;
7 import java.util.Map;
8 import java.util.regex.Matcher;
9 import java.util.regex.Pattern;
10 
11 import org.unicode.cldr.draft.FileUtilities;
12 import org.unicode.cldr.util.CLDRFile;
13 import org.unicode.cldr.util.CLDRPaths;
14 import org.unicode.cldr.util.Factory;
15 import org.unicode.cldr.util.SimpleXMLSource;
16 import org.unicode.cldr.util.XMLSource;
17 
18 /**
19  * Generates pseudolocalized contents of a CLDRFile.
20  *
21  * @author viarheichyk@google.com (Igor Viarheichyk)
22  */
23 public class CLDRFilePseudolocalizer {
    /** Matches a numeric placeholder such as {0}. */
    private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}");
    /** Matches a single-quoted run of text (reluctant, so it stops at the next quote). */
    private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'");
    // Android patch (b/37077221) begin.
    /** Directory component appended after "main" when the output path is built. */
    private static final String PSEUDOLOCALES_DIRECTORY = ".";
    // Android patch (b/37077221) end.
    /** Locale whose data is read as input for pseudolocalization. */
    private static final String ORIGINAL_LOCALE = "en";
    // Android patch (b/37512961) begin.
    /** Path whose value is set to "latn" in the generated file. */
    private static final String NUMBERS_PATH = "//ldml/numbers/defaultNumberingSystem";
    // Android patch (b/37512961) end.
    /** Path of the exemplar characters read from the input locale. */
    private static final String EXEMPLAR_PATH = "//ldml/characters/exemplarCharacters";
    /** Path that receives the merged exemplar characters in the generated file. */
    private static final String EXEMPLAR_AUX_PATH = "//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]";
    /** Format string for a territory display-name path; %s is the territory code. */
    private static final String TERRITORY_PATTERN = "//ldml/localeDisplayNames/territories/territory[@type=\"%s\"]";
    /** Values whose path contains any of these substrings are left untransformed. */
    private static final String[] EXCLUDE_LIST = { "/exemplarCharacters", "/delimiters",
        "/contextTransforms", "/numbers",
        "/units", // [ and ] are not allowed in units
        "narrow", "localeDisplayPattern", "timeZoneNames/fallbackFormat", // Expansion limits
    };
    /**
     * Values whose path contains any of these substrings are treated as patterns:
     * only their quoted text is passed to the pseudolocalizer.
     */
    private static final String[] PATTERN_LIST = { "/pattern", "FormatItem", "hourFormat" };
42 
43     private static class Pseudolocalizer {
44         private boolean pattern;
45 
Pseudolocalizer()46         public Pseudolocalizer() {
47             pattern = false;
48         }
49 
getPattern()50         public boolean getPattern() {
51             return pattern;
52         }
53 
start()54         public String start() {
55             return "";
56         }
57 
end()58         public String end() {
59             return "";
60         }
61 
fragment(String text)62         public String fragment(String text) {
63             return text;
64         }
65 
setPattern(boolean pattern)66         protected void setPattern(boolean pattern) {
67             this.pattern = pattern;
68         }
69     }
70 
71     private static class PseudolocalizerXA extends Pseudolocalizer {
72         private static final String[] NUMBERS = {
73             "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
74             "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
75             "seventeen", "eighteen", "nineteen", "twenty", "twentyone", "twentytwo",
76             "twentythree", "twentyfour", "twentyfive", "twentysix", "twentyseven",
77             "twentyeight", "twentynine", "thirty", "thirtyone", "thirtytwo",
78             "thirtythree", "thirtyfour", "thirtyfive", "thirtysix", "thirtyseven",
79             "thirtyeight", "thirtynine", "forty"
80         };
81         private static final Map<Integer, String> REPLACEMENTS = buildReplacementsTable();
82         private int charCount = 0;
83 
buildReplacementsTable()84         private static Map<Integer, String> buildReplacementsTable() {
85             Map<Integer, String> table = new HashMap<Integer, String>();
86             table.put((int) ' ', "\u2003");
87             table.put((int) '!', "\u00a1");
88             table.put((int) '"', "\u2033");
89             table.put((int) '#', "\u266f");
90             table.put((int) '$', "\u20ac");
91             table.put((int) '%', "\u2030");
92             table.put((int) '&', "\u214b");
93             table.put((int) '*', "\u204e");
94             table.put((int) '+', "\u207a");
95             table.put((int) ',', "\u060c");
96             table.put((int) '-', "\u2010");
97             table.put((int) '.', "\u00b7");
98             table.put((int) '/', "\u2044");
99             table.put((int) '0', "\u24ea");
100             table.put((int) '1', "\u2460");
101             table.put((int) '2', "\u2461");
102             table.put((int) '3', "\u2462");
103             table.put((int) '4', "\u2463");
104             table.put((int) '5', "\u2464");
105             table.put((int) '6', "\u2465");
106             table.put((int) '7', "\u2466");
107             table.put((int) '8', "\u2467");
108             table.put((int) '9', "\u2468");
109             table.put((int) ':', "\u2236");
110             table.put((int) ';', "\u204f");
111             table.put((int) '<', "\u2264");
112             table.put((int) '=', "\u2242");
113             table.put((int) '>', "\u2265");
114             table.put((int) '?', "\u00bf");
115             table.put((int) '@', "\u055e");
116             table.put((int) 'A', "\u00c5");
117             table.put((int) 'B', "\u0181");
118             table.put((int) 'C', "\u00c7");
119             table.put((int) 'D', "\u00d0");
120             table.put((int) 'E', "\u00c9");
121             table.put((int) 'F', "\u0191");
122             table.put((int) 'G', "\u011c");
123             table.put((int) 'H', "\u0124");
124             table.put((int) 'I', "\u00ce");
125             table.put((int) 'J', "\u0134");
126             table.put((int) 'K', "\u0136");
127             table.put((int) 'L', "\u013b");
128             table.put((int) 'M', "\u1e40");
129             table.put((int) 'N', "\u00d1");
130             table.put((int) 'O', "\u00d6");
131             table.put((int) 'P', "\u00de");
132             table.put((int) 'Q', "\u01ea");
133             table.put((int) 'R', "\u0154");
134             table.put((int) 'S', "\u0160");
135             table.put((int) 'T', "\u0162");
136             table.put((int) 'U', "\u00db");
137             table.put((int) 'V', "\u1e7c");
138             table.put((int) 'W', "\u0174");
139             table.put((int) 'X', "\u1e8a");
140             table.put((int) 'Y', "\u00dd");
141             table.put((int) 'Z', "\u017d");
142             table.put((int) '[', "\u2045");
143             table.put((int) '\\', "\u2216");
144             table.put((int) ']', "\u2046");
145             table.put((int) '^', "\u02c4");
146             table.put((int) '_', "\u203f");
147             table.put((int) '`', "\u2035");
148             table.put((int) 'a', "\u00e5");
149             table.put((int) 'b', "\u0180");
150             table.put((int) 'c', "\u00e7");
151             table.put((int) 'd', "\u00f0");
152             table.put((int) 'e', "\u00e9");
153             table.put((int) 'f', "\u0192");
154             table.put((int) 'g', "\u011d");
155             table.put((int) 'h', "\u0125");
156             table.put((int) 'i', "\u00ee");
157             table.put((int) 'j', "\u0135");
158             table.put((int) 'k', "\u0137");
159             table.put((int) 'l', "\u013c");
160             table.put((int) 'm', "\u0271");
161             table.put((int) 'n', "\u00f1");
162             table.put((int) 'o', "\u00f6");
163             table.put((int) 'p', "\u00fe");
164             table.put((int) 'q', "\u01eb");
165             table.put((int) 'r', "\u0155");
166             table.put((int) 's', "\u0161");
167             table.put((int) 't', "\u0163");
168             table.put((int) 'u', "\u00fb");
169             table.put((int) 'v', "\u1e7d");
170             table.put((int) 'w', "\u0175");
171             table.put((int) 'x', "\u1e8b");
172             table.put((int) 'y', "\u00fd");
173             table.put((int) 'z', "\u017e");
174             table.put((int) '|', "\u00a6");
175             table.put((int) '~', "\u02de");
176             return table;
177         }
178 
start()179         public String start() {
180             charCount = 0;
181             return "[";
182         }
183 
end()184         public String end() {
185             StringBuilder expansionText = new StringBuilder();
186             int expansion = (charCount + 1) / 2;
187             int wordIndex = 0;
188             while (expansion > 0) {
189                 String word = NUMBERS[wordIndex++ % NUMBERS.length];
190                 expansionText.append(' ');
191                 // Protect expansion strings with single quotes for patterns.
192                 if (getPattern()) {
193                     expansionText.append('\'');
194                 }
195                 expansionText.append(word);
196                 if (getPattern()) {
197                     expansionText.append('\'');
198                 }
199                 expansion -= word.length() + 1;
200             }
201             expansionText.append(']');
202             return expansionText.toString();
203         }
204 
fragment(String text)205         public String fragment(String text) {
206             StringBuilder buf = new StringBuilder();
207             int index = 0;
208             while (index < text.length()) {
209                 int codePoint = text.codePointAt(index);
210                 charCount++;
211                 index += Character.charCount(codePoint);
212                 String replacement = REPLACEMENTS.get(codePoint);
213                 if (replacement != null) {
214                     buf.append(replacement);
215                 } else {
216                     buf.appendCodePoint(codePoint);
217                 }
218             }
219             return buf.toString();
220         }
221     }
222 
223     private static class PseudolocalizerXB extends Pseudolocalizer {
224         /** Right-to-left override character. */
225         private static final String RLO = "\u202e";
226         // Android patch (b/37512961) begin.
227         /** Arabic letter mark character. */
228         private static final String ALM = "\u061C";
229         /** Pop direction formatting character. */
230         private static final String PDF = "\u202c";
231         /** Prefix to add before each LTR word */
232         private static final String BIDI_PREFIX = ALM + RLO;
233         /** Postfix to add after each LTR word */
234         private static final String BIDI_POSTFIX = PDF + ALM;
235         // Android patch (b/37512961) end.
236 
fragment(String text)237         public String fragment(String text) {
238             StringBuilder output = new StringBuilder();
239             boolean wrapping = false;
240             for (int index = 0; index < text.length();) {
241                 int codePoint = text.codePointAt(index);
242                 index += Character.charCount(codePoint);
243                 byte directionality = Character.getDirectionality(codePoint);
244                 boolean needsWrap = (directionality == Character.DIRECTIONALITY_LEFT_TO_RIGHT);
245                 if (needsWrap != wrapping) {
246                     wrapping = needsWrap;
247                     output.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX);
248                 }
249                 output.appendCodePoint(codePoint);
250             }
251             if (wrapping) {
252                 output.append(BIDI_POSTFIX);
253             }
254             return output.toString();
255         }
256     }
257 
258     private String outputLocale;
259     private Pseudolocalizer pseudolocalizer;
260 
261     /**
262      * Construct new CLDRPseudolocalization object.
263      *
264      * @param outputLocale
265      *             name of target locale
266      * @param pipeline
267      *             pseudolocalization pipeline to generate target locale data
268      */
CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer)269     public CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer) {
270         this.outputLocale = outputLocale;
271         this.pseudolocalizer = pseudolocalizer;
272     }
273 
createInstanceXA()274     public static CLDRFilePseudolocalizer createInstanceXA() {
275         return new CLDRFilePseudolocalizer("en_XA", new PseudolocalizerXA());
276     }
277 
createInstanceXB()278     public static CLDRFilePseudolocalizer createInstanceXB() {
279         return new CLDRFilePseudolocalizer("ar_XB", new PseudolocalizerXB());
280     }
281 
282     /**
283      * Transforms a CLDRFile value into another form.
284      *
285      * @return pseudolocalized value.
286      */
transformValue(String path, String value)287     private String transformValue(String path, String value) {
288         if (containsOneOf(path, EXCLUDE_LIST)) {
289             return value;
290         }
291         if (containsOneOf(path, PATTERN_LIST)) {
292             return createMessage(value, QUOTED_TEXT, true);
293         } else {
294             return createMessage(value, NUMERIC_PLACEHOLDER, false);
295         }
296     }
297 
298     /**
299      * Check if string contains any substring from the provided list.
300      */
containsOneOf(String string, String[] substrings)301     private boolean containsOneOf(String string, String[] substrings) {
302         for (String substring : substrings) {
303             if (string.contains(substring)) {
304                 return true;
305             }
306         }
307         return false;
308     }
309 
310     /**
311      * Create either localizable or non-localizable text fragment depending on flag value.
312      */
pseudolocalizeFragment(String text, boolean localizable)313     private String pseudolocalizeFragment(String text, boolean localizable) {
314         return localizable ? pseudolocalizer.fragment(text) : text;
315     }
316 
317     /**
318      * Create a message that can contain localizable and non-localizable parts.
319      */
createMessage(String text, Pattern pattern, boolean matchIsLocalizable)320     private String createMessage(String text, Pattern pattern,
321         boolean matchIsLocalizable) {
322         StringBuffer buffer = new StringBuffer(pseudolocalizer.start());
323         Matcher match = pattern.matcher(text);
324         int start = 0;
325         pseudolocalizer.setPattern(matchIsLocalizable);
326         for (; match.find(); start = match.end()) {
327             if (match.start() > start) {
328                 buffer.append(pseudolocalizeFragment(
329                     text.substring(start, match.start()), !matchIsLocalizable));
330             }
331             buffer.append(pseudolocalizeFragment(match.group(), matchIsLocalizable));
332         }
333         if (start < text.length()) {
334             buffer.append(pseudolocalizeFragment(text.substring(start), !matchIsLocalizable));
335         }
336         buffer.append(pseudolocalizer.end());
337         return buffer.toString();
338     }
339 
340     /**
341      * Add pseudolocale characters to exemplarCharacters entry pointed by xpath.
342      */
mergeExemplars(String value)343     private String mergeExemplars(String value) {
344         String pseudolocalized = createMessage(value, NUMERIC_PLACEHOLDER, false);
345         StringBuffer result = new StringBuffer(value.substring(0, value.length() - 1));
346         final char CLOSING_BRACKET = ']';
347         for (int i = 0; i < pseudolocalized.length(); i++) {
348             char c = pseudolocalized.charAt(i);
349             if (c != CLOSING_BRACKET) {
350                 String chunk;
351                 if (Character.isAlphabetic(c)) {
352                     chunk = String.valueOf(c);
353                 } else {
354                     chunk = String.format("\\u%04X", (int) c);
355                 }
356                 if (result.indexOf(chunk) == -1
357                     && result.indexOf(String.valueOf(c)) == -1) {
358                     result.append(' ');
359                     result.append(chunk);
360                 }
361             }
362         }
363         result.append(CLOSING_BRACKET);
364         return result.toString();
365     }
366 
367     /**
368      * Generate CLDRFile object. Original CLDRFile is created from .xml file and its
369      * content is passed through pseudolocalization pipeline.
370      */
generate()371     public CLDRFile generate() {
372         Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
373         // Create input CLDRFile object resolving inherited data.
374         CLDRFile input = factory.make(ORIGINAL_LOCALE, false);
375         XMLSource outputSource = new SimpleXMLSource(outputLocale);
376         for (String xpath : input) {
377             String fullPath = input.getFullXPath(xpath);
378             String value = input.getStringValue(xpath);
379             if (!value.isEmpty()) {
380                 String newValue = transformValue(xpath, value);
381                 if (!newValue.equals(value)) {
382                     outputSource.putValueAtPath(fullPath, newValue);
383                 }
384             }
385         }
386         // Pseudolocalize exemplar characters and put them into auxiliary set.
387         outputSource.putValueAtPath(EXEMPLAR_AUX_PATH,
388             mergeExemplars(input.getStringValue(EXEMPLAR_PATH)));
389         // Create fake pseudolocales territories.
390         addTerritory(outputSource, "XA");
391         addTerritory(outputSource, "XB");
392         // Android patch (b/37512961) begin.
393         // Use latin numbers for pseudolocales.
394         outputSource.putValueAtPath(NUMBERS_PATH, "latn");
395         // Android patch (b/37512961) end.
396         return new CLDRFile(outputSource);
397     }
398 
399     /**
400      * Add a territory into output xml.
401      */
addTerritory(XMLSource outputSource, String territory)402     private void addTerritory(XMLSource outputSource, String territory) {
403         String territoryPath = String.format(TERRITORY_PATTERN, territory);
404         outputSource.putValueAtPath(territoryPath, String.format("[%s]", territory));
405     }
406 
407     /**
408      * Generate CLDRFile object and save it into .xml file.
409      */
generateAndSave()410     public String generateAndSave() throws Exception {
411         CLDRFile output = generate();
412         String outputDir = CLDRPaths.GEN_DIRECTORY + "main" + File.separator + PSEUDOLOCALES_DIRECTORY + File.separator;
413         String outputFile = output.getLocaleID() + ".xml";
414         PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile);
415         output.write(out);
416         out.close();
417         return (outputDir + outputFile);
418     }
419 
main(String[] args)420     public static void main(String[] args) throws Exception {
421         // Generate en-XA locale (accents, brackets and expansion),
422         // dump resulting file name to stdout.
423         System.out.println(createInstanceXA().generateAndSave());
424         // Generate ar-XB (fake Bidi) locale.
425         System.out.println(createInstanceXB().generateAndSave());
426     }
427 }
428