1 package org.unicode.cldr.draft;
2 
3 import java.util.Arrays;
4 import java.util.Collections;
5 import java.util.HashMap;
6 import java.util.HashSet;
7 import java.util.LinkedHashSet;
8 import java.util.List;
9 import java.util.Locale;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeSet;
14 
15 import org.unicode.cldr.tool.CountryCodeConverter;
16 import org.unicode.cldr.util.CldrUtility;
17 import org.unicode.cldr.util.Containment;
18 import org.unicode.cldr.util.SemiFileReader;
19 import org.unicode.cldr.util.StandardCodes;
20 import org.unicode.cldr.util.With;
21 
22 import com.ibm.icu.dev.util.CollectionUtilities;
23 import com.ibm.icu.impl.Relation;
24 import com.ibm.icu.lang.UScript;
25 import com.ibm.icu.text.Transform;
26 import com.ibm.icu.text.UTF16;
27 import com.ibm.icu.util.ICUException;
28 import com.ibm.icu.util.VersionInfo;
29 
30 public class ScriptMetadata {
31     private static final int MAX_RANK = 33;
32     private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";
33     private static final VersionInfo UNICODE_VERSION = VersionInfo.getInstance(CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "11"));
34 
    // To get the data, go to the Script MetaData spreadsheet
36     // Download As Comma Separated Items into DATA_FILE
37     // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for Unicode beta).
38     // Run TestScriptMetadata.
39     // Then run GenerateScriptMetadata.
40     // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata
41     private enum Column {
42         // must match the spreadsheet header (caseless compare) or have the alternate header as an argument.
43         // doesn't have to be in order
44         WR, AGE, SAMPLE_CODE, ID_USAGE("ID Usage (UAX31)"), RTL("RTL?"), LB_LETTERS("LB letters?"), SHAPING_REQ("Shaping Req?"), IME("IME?"), ORIGIN_COUNTRY(
45             "Origin Country"), DENSITY("~Density"), LANG_CODE, HAS_CASE("Has Case?");
46 
47         int columnNumber = -1;
48         final Set<String> names = new HashSet<String>();
49 
Column(String... alternateNames)50         Column(String... alternateNames) {
51             names.add(this.name());
52             for (String name : alternateNames) {
53                 names.add(name.toUpperCase(Locale.ENGLISH));
54             }
55         }
56 
setColumns(String[] headers)57         static void setColumns(String[] headers) {
58             for (int i = 0; i < headers.length; ++i) {
59                 String header = headers[i].toUpperCase(Locale.ENGLISH);
60                 for (Column v : values()) {
61                     if (v.names.contains(header)) {
62                         v.columnNumber = i;
63                     }
64                 }
65             }
66             for (Column v : values()) {
67                 if (v.columnNumber == -1) {
68                     throw new IllegalArgumentException("Missing field for " + v
69                         + ", may need to add additional column alias");
70                 }
71             }
72         }
73 
getItem(String[] items)74         String getItem(String[] items) {
75             return items[columnNumber];
76         }
77 
getInt(String[] items, int defaultValue)78         int getInt(String[] items, int defaultValue) {
79             final String item = getItem(items);
80             return item.isEmpty() || item.equalsIgnoreCase("n/a") ? defaultValue : Integer.parseInt(item);
81         }
82     }
83 
84     public enum IdUsage {
85         UNKNOWN("Other"), EXCLUSION("Historic"), LIMITED_USE("Limited Use"), ASPIRATIONAL("Aspirational"), RECOMMENDED("Major Use");
86 
87         public final String name;
88 
IdUsage(String name)89         private IdUsage(String name) {
90             this.name = name;
91         }
92     }
93 
94     public enum Trinary {
95         UNKNOWN, NO, YES
96     }
97 
98     public enum Shaping {
99         UNKNOWN, NO, MIN, YES
100     }
101 
102     static StandardCodes SC = StandardCodes.make();
103     // static HashMap<String,String> NAME_TO_REGION_CODE = new HashMap<String,String>();
104     // static HashMap<String,String> NAME_TO_LANGUAGE_CODE = new HashMap<String,String>();
105     static EnumLookup<Shaping> shapingLookup = EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
106     static EnumLookup<Trinary> trinaryLookup = EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
107     static EnumLookup<IdUsage> idUsageLookup = EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);
108     static {
109         // addNameToCode("language", NAME_TO_LANGUAGE_CODE);
110         // // NAME_TO_LANGUAGE_CODE.put("", "und");
111         // NAME_TO_LANGUAGE_CODE.put("N/A", "und");
112         // addSynonym(NAME_TO_LANGUAGE_CODE, "Ancient Greek", "Ancient Greek (to 1453)");
113         // //addSynonym(NAME_TO_LANGUAGE_CODE, "Khmer", "Cambodian");
114         // addSynonym(NAME_TO_LANGUAGE_CODE, "Old Irish", "Old Irish (to 900)");
115 
116         // addNameToCode("region", NAME_TO_REGION_CODE);
117         // // NAME_TO_REGION_CODE.put("UNKNOWN", "ZZ");
118         // // NAME_TO_REGION_CODE.put("", "ZZ");
119         // NAME_TO_REGION_CODE.put("N/A", "ZZ");
120         // addSynonym(NAME_TO_REGION_CODE, "Laos", "Lao People's Democratic Republic");
121     }
122 
addNameToCode(String type, Map<String, String> hashMap)123     public static void addNameToCode(String type, Map<String, String> hashMap) {
124         for (String language : SC.getAvailableCodes(type)) {
125             Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
126             String name = fullData.get("Description");
127             hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
128         }
129     }
130 
addSynonym(Map<String, String> map, String newTerm, String oldTerm)131     public static void addSynonym(Map<String, String> map, String newTerm, String oldTerm) {
132         String code = map.get(oldTerm.toUpperCase(Locale.ENGLISH));
133         map.put(newTerm.toUpperCase(Locale.ENGLISH), code);
134     }
135 
136     public static final class SkipNewUnicodeException extends ICUException {
137     }
138 
139     public static class Info implements Comparable<Info> {
140         public final int rank;
141         public final VersionInfo age;
142         public final String sampleChar;
143         public final IdUsage idUsage;
144         public final Trinary rtl;
145         public final Trinary lbLetters;
146         public final Trinary hasCase;
147         public final Shaping shapingReq;
148         public final Trinary ime;
149         public final int density;
150         public final String originCountry;
151         public final String likelyLanguage;
152 
Info(String[] items)153         private Info(String[] items) {
154             // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
155             rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
156             age = VersionInfo.getInstance(Column.AGE.getItem(items));
157             if (age.compareTo(UNICODE_VERSION) > 0) {
158                 throw new SkipNewUnicodeException();
159             }
160             // Parse the code point of the sample character, rather than the sample character itself.
161             // The code point is more reliable, especially when the spreadsheet has a bug
162             // for supplementary characters.
163             int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
164             sampleChar = UTF16.valueOf(sampleCode);
165             idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
166             rtl = trinaryLookup.forString(Column.RTL.getItem(items));
167             lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
168             shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
169             ime = trinaryLookup.forString(Column.IME.getItem(items));
170             hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
171             density = Column.DENSITY.getInt(items, -1);
172 
173             final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
174             String country = CountryCodeConverter.getCodeFromName(countryRaw);
175             // NAME_TO_REGION_CODE.get(countryRaw.toUpperCase(Locale.ENGLISH));
176             if (country == null) {
177                 errors.add("Can't map " + countryRaw + " to country/region");
178             }
179             originCountry = country == null ? "ZZ" : country;
180 
181             String langCode = Column.LANG_CODE.getItem(items);
182             if (langCode.equals("n/a")) {
183                 langCode = null;
184             }
185             likelyLanguage = langCode == null ? "und" : langCode;
186         }
187 
Info(Info other, String string, String sampleCharacter)188         public Info(Info other, String string, String sampleCharacter) {
189             rank = other.rank;
190             age = other.age;
191             sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
192             idUsage = other.idUsage;
193             rtl = other.rtl;
194             lbLetters = other.lbLetters;
195             hasCase = other.hasCase;
196             shapingReq = other.shapingReq;
197             ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
198             density = other.density;
199             originCountry = other.originCountry;
200             likelyLanguage = other.likelyLanguage;
201         }
202 
203         // public Trinary parseTrinary(Column title, String[] items) {
204         // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
205         // }
fix(String in)206         String fix(String in) {
207             return in.toUpperCase(Locale.ENGLISH).replace("N/A", "UNKNOWN").replace("?", "UNKNOWN")
208                 .replace("RTL", "YES");
209         }
210 
toString()211         public String toString() {
212             return rank
213                 + "\tSample: " + sampleChar
214                 + "\tCountry: " + getName("territory", originCountry) + " (" + originCountry + ")"
215                 + "\tLanguage: " + getName("language", likelyLanguage) + " (" + likelyLanguage + ")"
216                 + "\tId: " + idUsage
217                 + "\tRtl: " + rtl
218                 + "\tLb: " + lbLetters
219                 + "\tShape: " + shapingReq
220                 + "\tIme: " + ime
221                 + "\tCase: " + hasCase
222                 + "\tDensity: " + density;
223         }
224 
getName(String type, String code)225         public Object getName(String type, String code) {
226             List<String> fullData = SC.getFullData(type, code);
227             if (fullData == null) {
228                 return "unavailable";
229             }
230             return fullData.get(0);
231         }
232 
233         @Override
compareTo(Info o)234         public int compareTo(Info o) {
235             // we don't actually care what the comparison value is, as long as it is transitive and consistent with equals.
236             return toString().compareTo(o.toString());
237         }
238     }
239 
240     public static Set<String> errors = new LinkedHashSet<String>();
241     static HashMap<String, Integer> titleToColumn = new HashMap<String, Integer>();
242 
243     private static class MyFileReader extends SemiFileReader {
244         private Map<String, Info> data = new HashMap<String, Info>();
245 
246         @Override
isCodePoint()247         protected boolean isCodePoint() {
248             return false;
249         }
250 
251         @Override
splitLine(String line)252         protected String[] splitLine(String line) {
253             return CldrUtility.splitCommaSeparated(line);
254         };
255 
256         @Override
handleLine(int lineCount, int start, int end, String[] items)257         protected boolean handleLine(int lineCount, int start, int end, String[] items) {
258             if (items[0].startsWith("For help") || items[0].isEmpty()) {
259                 return true; // header lines
260             }
261             if (items[0].equals("WR")) {
262                 Column.setColumns(items);
263                 return true;
264             }
265             Info info;
266             try {
267                 info = new Info(items);
268             } catch (SkipNewUnicodeException e) {
269                 return true;
270             } catch (Exception e) {
271                 errors.add(e.getClass().getName() + "\t" + e.getMessage() + "\t" + Arrays.asList(items));
272                 return true;
273             }
274 
275             String script = items[2];
276             data.put(script, info);
277             Set<String> extras = EXTRAS.get(script);
278             if (extras != null) {
279                 for (String script2 : extras) {
280                     Info info2 = info;
281                     if (script2.equals("Jpan")) {
282                         // HACK
283                         info2 = new Info(info, "IME:YES", null);
284                     } else if (script2.equals("Jamo")) {
285                         info2 = new Info(info, null, "ᄒ");
286                     }
287                     data.put(script2, info2);
288                 }
289             }
290             return true;
291         }
292 
293         @Override
process(Class<?> classLocation, String fileName)294         public MyFileReader process(Class<?> classLocation, String fileName) {
295             super.process(classLocation, fileName);
296             return this;
297         }
298 
getData()299         private Map<String, Info> getData() {
300             if (!errors.isEmpty()) {
301                 throw new RuntimeException(CollectionUtilities.join(errors, "\n\t"));
302             }
303             return Collections.unmodifiableMap(data);
304         }
305     }
306 
307     public enum Groupings {
308         EUROPEAN("150"),
309         MIDDLE_EASTERN("145"),
310         CENTRAL_ASIAN("143"),
311         SOUTH_ASIAN("034"),
312         SOUTHEAST_ASIAN("035"),
313         EAST_ASIAN("030"),
314         AFRICAN("002"),
315         AMERICAN("019"),;
316         public final Set<String> scripts;
317 
Groupings(String... regions)318         private Groupings(String... regions) {
319             scripts = With
320                 .in(getScripts())
321                 .toUnmodifiableCollection(
322                     new ScriptMetadata.RegionFilter(regions), new TreeSet<String>());
323         }
324     }
325 
326     static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
327         final String[] containingRegion;
328 
RegionFilter(String... containingRegion)329         RegionFilter(String... containingRegion) {
330             this.containingRegion = containingRegion;
331         }
332 
333         @Override
transform(String script)334         public String transform(String script) {
335             String currentRegion = getInfo(script).originCountry;
336             while (true) {
337                 for (String s : containingRegion) {
338                     if (s.equals(currentRegion)) {
339                         return script;
340                     }
341                 }
342                 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
343                     return null;
344                 }
345                 currentRegion = Containment.getContainer(currentRegion);
346             }
347         }
348     }
349 
350     static Relation<String, String> EXTRAS = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
351     static {
352         EXTRAS.put("Hani", "Hans");
353         EXTRAS.put("Hani", "Hant");
354         EXTRAS.put("Hani", "Hanb");
355         EXTRAS.put("Hang", "Kore");
356         EXTRAS.put("Hang", "Jamo");
357         EXTRAS.put("Hira", "Jpan");
EXTRAS.freeze()358         EXTRAS.freeze();
359     }
360     static final Map<String, Info> data = new MyFileReader()
361         .process(ScriptMetadata.class, DATA_FILE).getData();
362 
getInfo(String s)363     public static Info getInfo(String s) {
364         Info result = data.get(s);
365         if (result == null) {
366             try {
367                 String name2 = UScript.getShortName(UScript.getCodeFromName(s));
368                 result = data.get(name2);
369             } catch (Exception e) {
370             }
371         }
372         return result;
373     }
374 
getScripts()375     public static Set<String> getScripts() {
376         return data.keySet();
377     }
378 
getInfo(int i)379     public static Info getInfo(int i) {
380         return data.get(UScript.getShortName(i));
381     }
382 
iterable()383     public static Set<Entry<String, Info>> iterable() {
384         return data.entrySet();
385     }
386 
387     /**
388      * Specialized scripts
389      * @return
390      */
getExtras()391     public static Set<String> getExtras() {
392         return EXTRAS.values();
393     }
394 
395     public static Transform<String, String> TO_SHORT_SCRIPT = new Transform<String, String>() {
396         @Override
397         public String transform(String source) {
398             return UScript.getShortName(UScript.getCodeFromName(source));
399         }
400     };
401     public static Transform<String, String> TO_LONG_SCRIPT = new Transform<String, String>() {
402         @Override
403         public String transform(String source) {
404             return UScript.getName(UScript.getCodeFromName(source));
405         }
406     };
407 }
408