1 // Copyright 2011-2017 Google Inc. All Rights Reserved.
2 package org.unicode.cldr.tool;
3 
4 import java.io.File;
5 import java.io.PrintWriter;
6 import java.util.HashMap;
7 import java.util.Map;
8 import java.util.regex.Matcher;
9 import java.util.regex.Pattern;
10 
11 import org.unicode.cldr.draft.FileUtilities;
12 import org.unicode.cldr.util.CLDRFile;
13 import org.unicode.cldr.util.CLDRPaths;
14 import org.unicode.cldr.util.Factory;
15 import org.unicode.cldr.util.SimpleXMLSource;
16 import org.unicode.cldr.util.XMLSource;
17 
18 /**
19  * Generates pseudolocalized contents of a CLDRFile.
20  *
21  * @author viarheichyk@google.com (Igor Viarheichyk)
22  */
23 public class CLDRFilePseudolocalizer {
24     private static final Pattern NUMERIC_PLACEHOLDER = Pattern.compile("\\{\\d+\\}");
25     private static final Pattern QUOTED_TEXT = Pattern.compile("'.*?'");
26     // Android patch (b/37077221) begin.
27     private static final String PSEUDOLOCALES_DIRECTORY = ".";
28     // Android patch (b/37077221) end.
29     private static final String ORIGINAL_LOCALE = "en";
30     // Android patch (b/37512961) begin.
31     private static final String NUMBERS_PATH = "//ldml/numbers/defaultNumberingSystem";
32     // Android patch (b/37512961) end.
33     private static final String EXEMPLAR_PATH = "//ldml/characters/exemplarCharacters";
34     private static final String EXEMPLAR_AUX_PATH = "//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]";
35     private static final String TERRITORY_PATTERN = "//ldml/localeDisplayNames/territories/territory[@type=\"%s\"]";
36     private static final String[] EXCLUDE_LIST = { "/exemplarCharacters", "/delimiters",
37         "/contextTransforms", "/numbers",
38         "/units", // [ and ] are not allowed in units
39         "narrow", "localeDisplayPattern", "timeZoneNames/fallbackFormat", // Expansion limits
40     };
41     private static final String[] PATTERN_LIST = { "/pattern", "FormatItem", "hourFormat" };
42 
43     private static class Pseudolocalizer {
44         private boolean pattern;
45 
Pseudolocalizer()46         public Pseudolocalizer() {
47             pattern = false;
48         }
49 
getPattern()50         public boolean getPattern() {
51             return pattern;
52         }
53 
start()54         public String start() {
55             return "";
56         }
57 
end()58         public String end() {
59             return "";
60         }
61 
fragment(String text)62         public String fragment(String text) {
63             return text;
64         }
65 
setPattern(boolean pattern)66         protected void setPattern(boolean pattern) {
67             this.pattern = pattern;
68         }
69     }
70 
71     private static class PseudolocalizerXA extends Pseudolocalizer {
72         private static final String[] NUMBERS = {
73             "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
74             "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
75             "seventeen", "eighteen", "nineteen", "twenty", "twentyone", "twentytwo",
76             "twentythree", "twentyfour", "twentyfive", "twentysix", "twentyseven",
77             "twentyeight", "twentynine", "thirty", "thirtyone", "thirtytwo",
78             "thirtythree", "thirtyfour", "thirtyfive", "thirtysix", "thirtyseven",
79             "thirtyeight", "thirtynine", "forty"
80         };
81         private static final Map<Integer, String> REPLACEMENTS = buildReplacementsTable();
82         private int charCount = 0;
83 
buildReplacementsTable()84         private static Map<Integer, String> buildReplacementsTable() {
85             Map<Integer, String> table = new HashMap<>();
86             table.put((int) ' ', "\u2003");
87             table.put((int) '!', "\u00a1");
88             table.put((int) '"', "\u2033");
89             table.put((int) '#', "\u266f");
90             table.put((int) '$', "\u20ac");
91             table.put((int) '%', "\u2030");
92             table.put((int) '&', "\u214b");
93             table.put((int) '*', "\u204e");
94             table.put((int) '+', "\u207a");
95             table.put((int) ',', "\u060c");
96             table.put((int) '-', "\u2010");
97             table.put((int) '.', "\u00b7");
98             table.put((int) '/', "\u2044");
99             table.put((int) '0', "\u24ea");
100             table.put((int) '1', "\u2460");
101             table.put((int) '2', "\u2461");
102             table.put((int) '3', "\u2462");
103             table.put((int) '4', "\u2463");
104             table.put((int) '5', "\u2464");
105             table.put((int) '6', "\u2465");
106             table.put((int) '7', "\u2466");
107             table.put((int) '8', "\u2467");
108             table.put((int) '9', "\u2468");
109             table.put((int) ':', "\u2236");
110             table.put((int) ';', "\u204f");
111             table.put((int) '<', "\u2264");
112             table.put((int) '=', "\u2242");
113             table.put((int) '>', "\u2265");
114             table.put((int) '?', "\u00bf");
115             table.put((int) '@', "\u055e");
116             table.put((int) 'A', "\u00c5");
117             table.put((int) 'B', "\u0181");
118             table.put((int) 'C', "\u00c7");
119             table.put((int) 'D', "\u00d0");
120             table.put((int) 'E', "\u00c9");
121             table.put((int) 'F', "\u0191");
122             table.put((int) 'G', "\u011c");
123             table.put((int) 'H', "\u0124");
124             table.put((int) 'I', "\u00ce");
125             table.put((int) 'J', "\u0134");
126             table.put((int) 'K', "\u0136");
127             table.put((int) 'L', "\u013b");
128             table.put((int) 'M', "\u1e40");
129             table.put((int) 'N', "\u00d1");
130             table.put((int) 'O', "\u00d6");
131             table.put((int) 'P', "\u00de");
132             table.put((int) 'Q', "\u01ea");
133             table.put((int) 'R', "\u0154");
134             table.put((int) 'S', "\u0160");
135             table.put((int) 'T', "\u0162");
136             table.put((int) 'U', "\u00db");
137             table.put((int) 'V', "\u1e7c");
138             table.put((int) 'W', "\u0174");
139             table.put((int) 'X', "\u1e8a");
140             table.put((int) 'Y', "\u00dd");
141             table.put((int) 'Z', "\u017d");
142             table.put((int) '[', "\u2045");
143             table.put((int) '\\', "\u2216");
144             table.put((int) ']', "\u2046");
145             table.put((int) '^', "\u02c4");
146             table.put((int) '_', "\u203f");
147             table.put((int) '`', "\u2035");
148             table.put((int) 'a', "\u00e5");
149             table.put((int) 'b', "\u0180");
150             table.put((int) 'c', "\u00e7");
151             table.put((int) 'd', "\u00f0");
152             table.put((int) 'e', "\u00e9");
153             table.put((int) 'f', "\u0192");
154             table.put((int) 'g', "\u011d");
155             table.put((int) 'h', "\u0125");
156             table.put((int) 'i', "\u00ee");
157             table.put((int) 'j', "\u0135");
158             table.put((int) 'k', "\u0137");
159             table.put((int) 'l', "\u013c");
160             table.put((int) 'm', "\u0271");
161             table.put((int) 'n', "\u00f1");
162             table.put((int) 'o', "\u00f6");
163             table.put((int) 'p', "\u00fe");
164             table.put((int) 'q', "\u01eb");
165             table.put((int) 'r', "\u0155");
166             table.put((int) 's', "\u0161");
167             table.put((int) 't', "\u0163");
168             table.put((int) 'u', "\u00fb");
169             table.put((int) 'v', "\u1e7d");
170             table.put((int) 'w', "\u0175");
171             table.put((int) 'x', "\u1e8b");
172             table.put((int) 'y', "\u00fd");
173             table.put((int) 'z', "\u017e");
174             table.put((int) '|', "\u00a6");
175             table.put((int) '~', "\u02de");
176             return table;
177         }
178 
179         @Override
start()180         public String start() {
181             charCount = 0;
182             return "[";
183         }
184 
185         @Override
end()186         public String end() {
187             StringBuilder expansionText = new StringBuilder();
188             int expansion = (charCount + 1) / 2;
189             int wordIndex = 0;
190             while (expansion > 0) {
191                 String word = NUMBERS[wordIndex++ % NUMBERS.length];
192                 expansionText.append(' ');
193                 // Protect expansion strings with single quotes for patterns.
194                 if (getPattern()) {
195                     expansionText.append('\'');
196                 }
197                 expansionText.append(word);
198                 if (getPattern()) {
199                     expansionText.append('\'');
200                 }
201                 expansion -= word.length() + 1;
202             }
203             expansionText.append(']');
204             return expansionText.toString();
205         }
206 
207         @Override
fragment(String text)208         public String fragment(String text) {
209             StringBuilder buf = new StringBuilder();
210             int index = 0;
211             while (index < text.length()) {
212                 int codePoint = text.codePointAt(index);
213                 charCount++;
214                 index += Character.charCount(codePoint);
215                 String replacement = REPLACEMENTS.get(codePoint);
216                 if (replacement != null) {
217                     buf.append(replacement);
218                 } else {
219                     buf.appendCodePoint(codePoint);
220                 }
221             }
222             return buf.toString();
223         }
224     }
225 
226     private static class PseudolocalizerXB extends Pseudolocalizer {
227         /** Right-to-left override character. */
228         private static final String RLO = "\u202e";
229         // Android patch (b/37512961) begin.
230         /** Arabic letter mark character. */
231         private static final String ALM = "\u061C";
232         /** Pop direction formatting character. */
233         private static final String PDF = "\u202c";
234         /** Prefix to add before each LTR word */
235         private static final String BIDI_PREFIX = ALM + RLO;
236         /** Postfix to add after each LTR word */
237         private static final String BIDI_POSTFIX = PDF + ALM;
238         // Android patch (b/37512961) end.
239 
240         @Override
fragment(String text)241         public String fragment(String text) {
242             StringBuilder output = new StringBuilder();
243             boolean wrapping = false;
244             for (int index = 0; index < text.length();) {
245                 int codePoint = text.codePointAt(index);
246                 index += Character.charCount(codePoint);
247                 byte directionality = Character.getDirectionality(codePoint);
248                 boolean needsWrap = (directionality == Character.DIRECTIONALITY_LEFT_TO_RIGHT);
249                 if (needsWrap != wrapping) {
250                     wrapping = needsWrap;
251                     output.append(wrapping ? BIDI_PREFIX : BIDI_POSTFIX);
252                 }
253                 output.appendCodePoint(codePoint);
254             }
255             if (wrapping) {
256                 output.append(BIDI_POSTFIX);
257             }
258             return output.toString();
259         }
260     }
261 
262     private String outputLocale;
263     private Pseudolocalizer pseudolocalizer;
264 
265     /**
266      * Construct new CLDRPseudolocalization object.
267      *
268      * @param outputLocale
269      *             name of target locale
270      * @param pipeline
271      *             pseudolocalization pipeline to generate target locale data
272      */
CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer)273     public CLDRFilePseudolocalizer(String outputLocale, Pseudolocalizer pseudolocalizer) {
274         this.outputLocale = outputLocale;
275         this.pseudolocalizer = pseudolocalizer;
276     }
277 
createInstanceXA()278     public static CLDRFilePseudolocalizer createInstanceXA() {
279         return new CLDRFilePseudolocalizer("en_XA", new PseudolocalizerXA());
280     }
281 
createInstanceXB()282     public static CLDRFilePseudolocalizer createInstanceXB() {
283         return new CLDRFilePseudolocalizer("ar_XB", new PseudolocalizerXB());
284     }
285 
286     /**
287      * Transforms a CLDRFile value into another form.
288      *
289      * @return pseudolocalized value.
290      */
transformValue(String path, String value)291     private String transformValue(String path, String value) {
292         if (containsOneOf(path, EXCLUDE_LIST)) {
293             return value;
294         }
295         if (containsOneOf(path, PATTERN_LIST)) {
296             return createMessage(value, QUOTED_TEXT, true);
297         } else {
298             return createMessage(value, NUMERIC_PLACEHOLDER, false);
299         }
300     }
301 
302     /**
303      * Check if string contains any substring from the provided list.
304      */
containsOneOf(String string, String[] substrings)305     private boolean containsOneOf(String string, String[] substrings) {
306         for (String substring : substrings) {
307             if (string.contains(substring)) {
308                 return true;
309             }
310         }
311         return false;
312     }
313 
314     /**
315      * Create either localizable or non-localizable text fragment depending on flag value.
316      */
pseudolocalizeFragment(String text, boolean localizable)317     private String pseudolocalizeFragment(String text, boolean localizable) {
318         return localizable ? pseudolocalizer.fragment(text) : text;
319     }
320 
321     /**
322      * Create a message that can contain localizable and non-localizable parts.
323      */
createMessage(String text, Pattern pattern, boolean matchIsLocalizable)324     private String createMessage(String text, Pattern pattern,
325         boolean matchIsLocalizable) {
326         StringBuffer buffer = new StringBuffer(pseudolocalizer.start());
327         Matcher match = pattern.matcher(text);
328         int start = 0;
329         pseudolocalizer.setPattern(matchIsLocalizable);
330         for (; match.find(); start = match.end()) {
331             if (match.start() > start) {
332                 buffer.append(pseudolocalizeFragment(
333                     text.substring(start, match.start()), !matchIsLocalizable));
334             }
335             buffer.append(pseudolocalizeFragment(match.group(), matchIsLocalizable));
336         }
337         if (start < text.length()) {
338             buffer.append(pseudolocalizeFragment(text.substring(start), !matchIsLocalizable));
339         }
340         buffer.append(pseudolocalizer.end());
341         return buffer.toString();
342     }
343 
344     /**
345      * Add pseudolocale characters to exemplarCharacters entry pointed by xpath.
346      */
mergeExemplars(String value)347     private String mergeExemplars(String value) {
348         String pseudolocalized = createMessage(value, NUMERIC_PLACEHOLDER, false);
349         StringBuffer result = new StringBuffer(value.substring(0, value.length() - 1));
350         final char CLOSING_BRACKET = ']';
351         for (int i = 0; i < pseudolocalized.length(); i++) {
352             char c = pseudolocalized.charAt(i);
353             if (c != CLOSING_BRACKET) {
354                 String chunk;
355                 if (Character.isAlphabetic(c)) {
356                     chunk = String.valueOf(c);
357                 } else {
358                     chunk = String.format("\\u%04X", (int) c);
359                 }
360                 if (result.indexOf(chunk) == -1
361                     && result.indexOf(String.valueOf(c)) == -1) {
362                     result.append(' ');
363                     result.append(chunk);
364                 }
365             }
366         }
367         result.append(CLOSING_BRACKET);
368         return result.toString();
369     }
370 
371     /**
372      * Generate CLDRFile object. Original CLDRFile is created from .xml file and its
373      * content is passed through pseudolocalization pipeline.
374      */
generate()375     public CLDRFile generate() {
376         Factory factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
377         // Create input CLDRFile object resolving inherited data.
378         CLDRFile input = factory.make(ORIGINAL_LOCALE, false);
379         XMLSource outputSource = new SimpleXMLSource(outputLocale);
380         for (String xpath : input) {
381             String fullPath = input.getFullXPath(xpath);
382             String value = input.getStringValue(xpath);
383             if (!value.isEmpty()) {
384                 String newValue = transformValue(xpath, value);
385                 if (!newValue.equals(value)) {
386                     outputSource.putValueAtPath(fullPath, newValue);
387                 }
388             }
389         }
390         // Pseudolocalize exemplar characters and put them into auxiliary set.
391         outputSource.putValueAtPath(EXEMPLAR_AUX_PATH,
392             mergeExemplars(input.getStringValue(EXEMPLAR_PATH)));
393         // Create fake pseudolocales territories.
394         addTerritory(outputSource, "XA");
395         addTerritory(outputSource, "XB");
396         // Android patch (b/37512961) begin.
397         // Use latin numbers for pseudolocales.
398         outputSource.putValueAtPath(NUMBERS_PATH, "latn");
399         // Android patch (b/37512961) end.
400         return new CLDRFile(outputSource);
401     }
402 
403     /**
404      * Add a territory into output xml.
405      */
addTerritory(XMLSource outputSource, String territory)406     private void addTerritory(XMLSource outputSource, String territory) {
407         String territoryPath = String.format(TERRITORY_PATTERN, territory);
408         outputSource.putValueAtPath(territoryPath, String.format("[%s]", territory));
409     }
410 
411     /**
412      * Generate CLDRFile object and save it into .xml file.
413      */
generateAndSave()414     public String generateAndSave() throws Exception {
415         CLDRFile output = generate();
416         String outputDir = CLDRPaths.GEN_DIRECTORY + "main" + File.separator + PSEUDOLOCALES_DIRECTORY + File.separator;
417         String outputFile = output.getLocaleID() + ".xml";
418         PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile);
419         output.write(out);
420         out.close();
421         return (outputDir + outputFile);
422     }
423 
main(String[] args)424     public static void main(String[] args) throws Exception {
425         // Generate en-XA locale (accents, brackets and expansion),
426         // dump resulting file name to stdout.
427         System.out.println(createInstanceXA().generateAndSave());
428         // Generate ar-XB (fake Bidi) locale.
429         System.out.println(createInstanceXB().generateAndSave());
430     }
431 }
432