1 package org.unicode.cldr.util;
2 
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.util.Arrays;
6 import java.util.Collections;
7 import java.util.HashMap;
8 import java.util.LinkedHashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.TreeMap;
13 import java.util.regex.Pattern;
14 
15 import com.google.common.base.Splitter;
16 import com.ibm.icu.impl.Relation;
17 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
18 import com.ibm.icu.util.ICUUncheckedIOException;
19 
20 public class Iso639Data {
21 
    /**
     * Language subtag to its three-letter ISO 639-3 identifier. Filled for rows of the ISO 639-3 table that have a
     * Part1 code, and for two-letter subtags that are first seen in the ISO 639-2 file.
     */
    static Map<String, String> toAlpha3;

    /** Inverse of {@link #toAlpha3}: three-letter identifier to the shorter language subtag. */
    static Map<String, String> fromAlpha3;

    /** Language subtag to its bibliographic (Part2B) code; only stored when that code differs from the Id. */
    static Map<String, String> toBiblio3;

    /** Inverse of {@link #toBiblio3}: bibliographic code to language subtag. */
    static Map<String, String> fromBiblio3;

    /** Language subtag to all names collected for it from the data files. */
    static Relation<String, String> toNames;

    /** Retired three-letter code to the raw line(s) of the retirements file that mention it. */
    static Relation<String, String> toRetirements;

    /** Retired three-letter code to its replacement, for retirement rows whose Change_To column is not empty. */
    static Map<String, String> toChangeTo;

    /** Language subtag to scope; {@link Scope#Individual} is not stored because getScope() uses it as default. */
    static Map<String, Scope> toScope;

    /** Code to its hierarchy chain, split on ':' from the hierarchy column of the ISO 639-5 page. */
    static Map<String, List<String>> toHeirarchy;

    /** Language subtag to type; {@link Type#Living} is not stored because getType() uses it as default. */
    static Map<String, Type> toType;

    /** Encompassed language to the macrolanguage that contains it. */
    static Map<String, String> encompassed_macro;

    /** Macrolanguage to the languages it encompasses. */
    static Relation<String, String> macro_encompassed;

    /** Language subtag to the standard it was taken from, where that was recorded explicitly. */
    static Map<String, Source> toSource;

    /** Trimmed first line of the version data file. */
    private static String version;
49 
50     /**
51      * <h3><a NAME="I">Individual</a> languages</h3>
52      * <p>
53      * Judgments regarding when two varieties are considered to be the same or different languages are based on a number
54      * of factors, including linguistic similarity, intelligibility, a common literature, the views of speakers
55      * concerning the relationship between language and identity, and other factors.
56      * </p>
57      * <h3><a NAME="M">Macrolanguages</a></h3>
58      * <p>
59      * In various parts of the world, there are clusters of closely-related language varieties that, based on the
60      * criteria discussed above, can be considered distinct individual languages, yet in certain usage contexts a single
61      * language identity for all is needed.
62      * </p>
63      * <p>
64      * Macrolanguages are distinguished from language collections in that the individual languages that correspond to a
65      * macrolanguage must be very closely related, and there must be some domain in which only a single language
66      * identity is recognized.
67      * </p>
68      *
69      * <h3><a NAME="C">Collections</a> of languages</h3>
70      * <p>
71      * A collective language code element is an identifier that represents a group of individual languages that are not
72      * deemed to be one language in any usage context.
73      * </p>
74      * </p> <h3><a NAME="R">Private Use</a></h3>
75      * <p>
76      * Identifiers <tt>qaa</tt> through <tt>qtz</tt> are reserved for local use, to be used in cases in which there is
77      * no suitable existing code in ISO 639. There are no constraints as to scope of denotation. These identifiers may
78      * only be used locally, and may not be used in interchange without a private agreement.
79      * </p>
80      * <h3><a NAME="S">Special situations</a></h3>
81      * <p>
82      * A few code elements are defined for other special situations.
83      * </p>
84      * For more information, see http://www.sil.org/iso639-3/scope.asp
85      * <p>
86      * Note that the casing on these enum values is chosen to match standard usage.
87      * </p>
88      */
89     public enum Scope {
90         Individual, Macrolanguage, Special, Collection, PrivateUse, Unknown;
fromString(String input)91         public static Scope fromString(String input) {
92             input = input.replace("-", "");
93             for (Scope item : Scope.values()) {
94                 if (item.toString().equalsIgnoreCase(input)) {
95                     return item;
96                 }
97             }
98             return Scope.valueOf(input); // to get exception
99         }
100     };
101 
102     /**
103      * <h3><a NAME="L"></a>Living languages</h3>
104      * <p>
105      * A language is listed as <i>living</i> when there are people still living who learned it as a first language.
106      * </p>
107      * <h3><a NAME="E"></a>Extinct languages</h3>
108      *
109      * <p>
110      * A language is listed as <i>extinct</i> if it has gone extinct in recent times. (e.g. in the last few centuries).
111      * </p>
112      * <h3><a NAME="A"></a>Ancient languages</h3>
113      * <p>
114      * A language is listed as <i>ancient</i> if it went extinct in ancient times (e.g. more than a millennium ago).
115      * </p>
116      * <h3><a NAME="H"></a>Historic languages</h3>
117      * <p>
118      * A language is listed as <i>historic</i> when it is considered to be distinct from any modern languages that are
119      * descended from it; for instance, Old English and Middle English.
120      * </p>
121      *
122      * <h3><a NAME="C"></a>Constructed languages</h3>
123      * <p>
124      * Artificial languages are those like Esperanto: it excludes programming languages.
125      * </p>
126      * <p>
127      * Note that the casing on these enum values is chosen to match standard usage. <i>For more information, see
128      * http://www.sil.org/iso639-3/scope.asp</i>
129      * </p>
130      */
131     public enum Type {
132         Ancient, Constructed, Extinct, Historical, Living, Special, Collection, Unknown
133     };
134 
135     /**
136      * This indicates the source of the language subtag.
137      *
138      * @author markdavis
139      *
140      */
141     public enum Source {
142         ISO_639_1, ISO_639_2, ISO_639_3, BCP47, CLDR
143     };
144 
getVersion()145     public static String getVersion() {
146         return version;
147     }
148 
getSource(String languageSubtag)149     public static Source getSource(String languageSubtag) {
150         if (toAlpha3 == null) {
151             getData();
152         }
153         if (!isValid(languageSubtag)) {
154             return null;
155         }
156         Source result = toSource.get(languageSubtag);
157         if (result == null)
158             return Source.ISO_639_3;
159         return result;
160     }
161 
toAlpha3(String languageSubtag)162     public static String toAlpha3(String languageSubtag) {
163         if (toAlpha3 == null) {
164             getData();
165         }
166         if (!isValid(languageSubtag)) {
167             return null;
168         }
169         return toAlpha3.get(languageSubtag);
170     }
171 
fromAlpha3(String alpha3)172     public static String fromAlpha3(String alpha3) {
173         if (fromAlpha3 == null) {
174             getData();
175         }
176         String alpha2 = fromAlpha3.get(alpha3);
177         if (alpha2 != null) {
178             return alpha2;
179         }
180         // it only exists if it has a name
181         if (isValid(alpha3)) {
182             return alpha3;
183         }
184         return null;
185     }
186 
    /**
     * Reports whether the given code is known, that is, whether at least one name is recorded for it.
     * <p>
     * This method reads {@code toNames} directly and does not trigger loading, so callers must make sure the data
     * has been loaded first.
     * </p>
     *
     * @param alpha3 the code to check
     * @return true if {@code toNames} has the code as a key
     */
    private static boolean isValid(String alpha3) {
        return toNames.containsKey(alpha3);
    }
190 
fromBiblio3(String biblio3)191     public static String fromBiblio3(String biblio3) {
192         if (toNames == null) {
193             getData();
194         }
195         String result = fromBiblio3.get(biblio3);
196         if (result != null) {
197             return result;
198         }
199         return fromAlpha3(biblio3);
200     }
201 
toBiblio3(String languageTag)202     public static String toBiblio3(String languageTag) {
203         if (toNames == null) {
204             getData();
205         }
206         String result = toBiblio3.get(languageTag);
207         if (result != null) {
208             return result;
209         }
210         return toAlpha3(languageTag);
211     }
212 
hasBiblio3()213     public static Set<String> hasBiblio3() {
214         return toBiblio3.keySet();
215     }
216 
getNames(String languageSubtag)217     public static Set<String> getNames(String languageSubtag) {
218         if (toNames == null) {
219             getData();
220         }
221         return toNames.getAll(languageSubtag);
222     }
223 
getScope(String languageSubtag)224     public static Scope getScope(String languageSubtag) {
225         if (toScope == null) {
226             getData();
227         }
228         if (!isValid(languageSubtag))
229             return Scope.Unknown;
230         Scope result = toScope.get(languageSubtag);
231         if (result != null)
232             return result;
233         return Scope.Individual;
234     }
235 
236     /**
237      * Returns the ISO 639-5 heirarchy if available, otherwise null.
238      */
getHeirarchy(String languageSubtag)239     public static List<String> getHeirarchy(String languageSubtag) {
240         if (toHeirarchy == null) {
241             getData();
242         }
243         return toHeirarchy.get(languageSubtag);
244     }
245 
getType(String languageSubtag)246     public static Type getType(String languageSubtag) {
247         if (toAlpha3 == null) {
248             getData();
249         }
250         if (!isValid(languageSubtag))
251             return Type.Unknown;
252         Type result = toType.get(languageSubtag);
253         if (result != null)
254             return result;
255         return Type.Living;
256     }
257 
258     /**
259      * Id char(3) NOT NULL, -- The three-letter 639-3 identifier Part2B char(3)
260      * NULL, -- Equivalent 639-2 identifier of the bibliographic applications code
261      * set, if there is one Part2T char(3) NULL, -- Equivalent 639-2 identifier of
262      * the terminology applications code set, if there is one Part1 char(2) NULL, --
263      * Equivalent 639-1 identifier, if there is one Scope char(1) NOT NULL, --
264      * I(ndividual), M(acrolanguage), S(pecial) Type char(1) NOT NULL, --
265      * A(ncient), C(onstructed), -- E(xtinct), H(istorical), L(iving), S(pecial)
266      * Ref_Name varchar(150) NOT NULL) -- Reference language name
267      *
268      * @throws IOException
269      */
270     enum IsoColumn {
271         Id, Part2B, Part2T, Part1, Scope, Type, Ref_Name
272     };
273 
274     /**
275      * Id char(3) NOT NULL, -- The three-letter 639-3 identifier Print_Name
276      * varchar(75) NOT NULL, -- One of the names associated with this identifier
277      * Inverted_Name varchar(75) NOT NULL) -- The inverted form of this Print_Name
278      * form
279      */
280     enum IsoNamesColumn {
281         Id, Print_Name, Inverted_Name
282     };
283 
getData()284     private static void getData() {
285         try {
286             BufferedReader in = CldrUtility.getUTF8Data("iso-639-3-version.tab");
287             version = in.readLine().trim();
288             in.close();
289 
290             in = CldrUtility.getUTF8Data("iso-639-3.tab");
291             SplitToArray tabs = new SplitToArray(Splitter.on('\t').trimResults());
292             toAlpha3 = new HashMap<String, String>();
293             fromAlpha3 = new HashMap<String, String>();
294             toBiblio3 = new HashMap<String, String>();
295             fromBiblio3 = new HashMap<String, String>();
296             toScope = new HashMap<String, Scope>();
297             toType = new HashMap<String, Type>();
298             toNames = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
299             toRetirements = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
300             toChangeTo = new TreeMap<String, String>();
301             macro_encompassed = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
302             encompassed_macro = new HashMap<String, String>();
303             toSource = new HashMap<String, Source>();
304             toSource.put("sh", Source.ISO_639_1); // add deprecated language
305 
306             int count = 0; // line count for debugging
307             while (true) {
308                 ++count;
309                 String line = in.readLine();
310                 if (line == null) {
311                     break;
312                 }
313                 if (line.startsWith("\uFEFF")) {
314                     line = line.substring(1);
315                 }
316                 line = line.trim();
317                 if (line.isEmpty()) {
318                     continue;
319                 }
320                 String[] parts = tabs.split(line);
321                 String alpha3 = parts[IsoColumn.Id.ordinal()];
322                 if (alpha3.equals("Id"))
323                     continue;
324                 String languageSubtag = alpha3;
325                 if (parts[IsoColumn.Part1.ordinal()].length() != 0) { // parts.length >
326                     // IsoColumn.Part1.ordinal()
327                     // &&
328                     languageSubtag = parts[IsoColumn.Part1.ordinal()];
329                     toAlpha3.put(languageSubtag, alpha3);
330                     fromAlpha3.put(alpha3, languageSubtag);
331                 }
332 
333                 if (parts[IsoColumn.Part2B.ordinal()].length() != 0) { // parts.length >
334                     // IsoColumn.Part1.ordinal()
335                     // &&
336                     String biblio = parts[IsoColumn.Part2B.ordinal()];
337                     if (!biblio.equals(alpha3)) {
338                         toBiblio3.put(languageSubtag, biblio);
339                         fromBiblio3.put(biblio, languageSubtag);
340                     }
341                 }
342 
343                 toNames.put(languageSubtag, parts[IsoColumn.Ref_Name.ordinal()]);
344                 Scope scope = findMatchToPrefix(parts[IsoColumn.Scope.ordinal()], Scope.values());
345                 if (scope != Scope.Individual)
346                     toScope.put(languageSubtag, scope);
347                 Type type = findMatchToPrefix(parts[IsoColumn.Type.ordinal()], Type.values());
348                 if (type != Type.Living)
349                     toType.put(languageSubtag, type);
350             }
351             // System.out.println("Size:\t" + toNames.size());
352             in.close();
353 
354             // Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective
355             in = CldrUtility.getUTF8Data("iso-639-3_Retirements.tab");
356             while (true) {
357                 String line = in.readLine();
358                 if (line == null)
359                     break;
360                 if (line.startsWith("\uFEFF"))
361                     line = line.substring(1);
362                 String[] parts = tabs.split(line);
363                 String alpha3 = parts[0];
364                 if (alpha3.equals("Id"))
365                     continue;
366                 // Id   Ref_Name    Ret_Reason  Change_To   Ret_Remedy  Effective
367                 // fri  Western Frisian C   fry     2007-02-01
368 
369                 toNames.put(alpha3, parts[1]);
370                 if (!parts[3].isEmpty()) {
371                     toChangeTo.put(alpha3, parts[3]);
372                 }
373                 toRetirements.put(alpha3, line);
374                 // skip inverted name for now
375             }
376             // System.out.println("Size:\t" + toNames.size());
377             in.close();
378 
379             // Id Print_Name Inverted_Name
380             in = CldrUtility.getUTF8Data("iso-639-3-macrolanguages.tab");
381             while (true) {
382                 String line = in.readLine();
383                 if (line == null)
384                     break;
385                 if (line.startsWith("\uFEFF"))
386                     line = line.substring(1);
387                 String[] parts = tabs.split(line);
388                 String prefix = parts[0];
389                 if (prefix.equals("M_Id"))
390                     continue;
391                 prefix = fromAlpha3(prefix);
392                 String suffix = fromAlpha3(parts[1]);
393                 if (suffix == null || prefix == null) {
394                     throw new IllegalArgumentException();
395                 }
396                 encompassed_macro.put(suffix, prefix);
397                 macro_encompassed.put(prefix, suffix);
398                 // skip inverted name for now
399             }
400             // System.out.println("Size:\t" + toNames.size());
401             in.close();
402 
403             // Id Print_Name Inverted_Name
404             in = CldrUtility.getUTF8Data("iso-639-3_Name_Index.tab");
405             while (true) {
406                 String line = in.readLine();
407                 if (line == null)
408                     break;
409                 if (line.startsWith("\uFEFF"))
410                     line = line.substring(1);
411                 String[] parts = tabs.split(line);
412                 String alpha3 = parts[IsoColumn.Id.ordinal()];
413                 if (alpha3.equals("Id"))
414                     continue;
415                 String languageSubTag = fromAlpha3(alpha3);
416                 toNames.put(languageSubTag, parts[IsoNamesColumn.Print_Name.ordinal()]);
417                 // skip inverted name for now
418             }
419             // System.out.println("Size:\t" + toNames.size());
420             in.close();
421 
422             in = CldrUtility.getUTF8Data("ISO-639-2_values_8bits.txt");
423             // An alpha-3 (bibliographic) code,
424             // an alpha-3 (terminologic) code (when given),
425             // an alpha-2 code (when given),
426             // an English name,
427             // and a French name of a language are all separated by pipe (|)
428             // characters.
429             while (true) {
430                 String line = in.readLine();
431                 if (line == null)
432                     break;
433                 if (line.startsWith("\uFEFF"))
434                     line = line.substring(1);
435                 String[] parts = line.split("\\s*\\|\\s*");
436                 String alpha3 = parts[0];
437                 if (alpha3.equals("qaa-qtz")) {
438                     for (char second = 'a'; second <= 't'; ++second) {
439                         for (char third = 'a'; third <= 'z'; ++third) {
440                             String languageSubtag = (("q" + second) + third);
441                             toScope.put(languageSubtag, Scope.PrivateUse);
442                             toType.put(languageSubtag, Type.Special);
443                             toNames.put(languageSubtag, "private-use");
444                             toSource.put(languageSubtag, Source.ISO_639_2);
445                         }
446                     }
447                     continue;
448                 }
449                 if (parts[1].length() != 0)
450                     alpha3 = parts[1];
451                 String languageSubtag = parts[2];
452                 if (languageSubtag.length() == 0) {
453                     languageSubtag = alpha3;
454                 }
455                 String[] english = parts[3].split(";");
456                 toSource.put(languageSubtag, languageSubtag.length() == 2 ? Source.ISO_639_1 : Source.ISO_639_2);
457                 if (!isValid(languageSubtag)) {
458                     // we don't have it already,
459                     // System.out.println("Adding2: " + alpha3 + "\t" + languageSubtag + "\t" + Arrays.asList(english));
460                     if (languageSubtag.length() == 2) {
461                         toAlpha3.put(languageSubtag, alpha3);
462                         fromAlpha3.put(alpha3, languageSubtag);
463                     }
464                     toScope.put(languageSubtag, Scope.Collection);
465                     toType.put(languageSubtag, Type.Special);
466                     toNames.putAll(languageSubtag, Arrays.asList(english));
467                 }
468                 // skip inverted name for now
469             }
470             in.close();
471 
472             Map<String, String> toHeirarchyTemp = new TreeMap<String, String>();
473             in = CldrUtility.getUTF8Data("external/Iso639-5.html");
474             String lastCode = null;
475             int column = 0;
476             boolean lastAttributeIsScope = false;
477             boolean lastElementIsTD = false;
478             boolean hadPop = true;
479             // if the table level is 1 (we are in the main table), then we look for <td>...</td><td>...</td>. That means
480             // that we have column 1 and column 2.
481 
482             SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in);
483             StringBuilder result = new StringBuilder();
484 
485             main: while (true) {
486                 SimpleHtmlParser.Type x = simple.next(result);
487                 // System.out.println(column + "\t" + x + "\t" + result);
488                 switch (x) {
489                 case ELEMENT_START:
490                     hadPop = false;
491                     lastElementIsTD = false;
492                     break;
493                 case ELEMENT:
494                     if (SimpleHtmlParser.equals("tr", result)) {
495                         column = 0;
496                     } else if (SimpleHtmlParser.equals("td", result)) {
497                         lastElementIsTD = true;
498                     }
499                     break;
500                 case ELEMENT_POP:
501                     hadPop = true;
502                     break;
503                 case ELEMENT_END:
504                     // if we get a POP and a TD, and we have column > 0, we increment
505                     if (lastElementIsTD && hadPop && column > 0) {
506                         ++column;
507                     }
508                     break;
509                 case ELEMENT_CONTENT:
510                     /*
511                      * <th scope="col">Identifier<br />Indicatif</th>
512                      * <th scope="col">English name<br />Nom anglais</th>
513                      * <th scope="col">French name<br />Nom français</th>
514                      * <th scope="col">639-2</th>
515                      * <th scope="col">Hierarchy<br />Hiérarchie</th>
516                      * <th scope="col">Notes<br />Notes</th>
517                      *
518                      * <td scope="row">apa</td>
519                      * <td>Apache languages</td>
520                      * <td>apaches, langues</td>
521                      * <td>language group<br />groupe de langues</td>
522                      * <td>nai : xnd : ath : apa</td>
523                      * <td>
524                      * <br />
525                      * </td>
526                      */
527                     switch (column) {
528                     case 1:
529                         lastCode = result.toString();
530                         break;
531                     case 5:
532                         String old = toHeirarchyTemp.get(lastCode);
533                         toHeirarchyTemp.put(lastCode, old == null || old.length() == 0 ? result.toString().trim()
534                             : old + " " + result.toString().trim());
535                         break;
536                     case 2:
537                         break;
538                     case 3:
539                         break;
540                     case 4:
541                         break;
542                     case 0:
543                         break;
544                     default:
545                         break;
546                     }
547                     break;
548                 case ATTRIBUTE:
549                     lastAttributeIsScope = SimpleHtmlParser.equals("scope", result);
550                     break;
551                 case ATTRIBUTE_CONTENT:
552                     if (lastAttributeIsScope && SimpleHtmlParser.equals("row", result)) {
553                         column = 1;
554                     }
555                     break;
556                 case QUOTE:
557                     break;
558                 case DONE:
559                     break main;
560                 }
561             }
562 
563             in.close();
564 
565             Pattern SPLIT_HEIRARCHY = PatternCache.get("\\s*:\\s*");
566             toHeirarchy = new TreeMap<String, List<String>>();
567             // for (String code : toHeirarchyTemp.keySet()) {
568             // System.out.println(code + " => " + toHeirarchyTemp.get(code));
569             // }
570             for (String code : toHeirarchyTemp.keySet()) {
571                 String valueString = toHeirarchyTemp.get(code);
572                 String[] values = SPLIT_HEIRARCHY.split(valueString);
573                 for (String value : values) {
574                     if (toScope.get(value) == null && toHeirarchyTemp.get(value) == null) {
575                         throw new IllegalArgumentException("Unexpected value in heirarchy:\t" + value + "\t" + code
576                             + "\t" + valueString);
577                     }
578                 }
579                 toHeirarchy.put(code, Arrays.asList(values));
580             }
581             // System.out.println("Size:\t" + toNames.size());
582 
583             // make data unmodifiable, just to prevent mistakes
584 
585             toAlpha3 = Collections.unmodifiableMap(toAlpha3);
586             fromAlpha3 = Collections.unmodifiableMap(fromAlpha3);
587             toBiblio3 = Collections.unmodifiableMap(toBiblio3);
588             fromBiblio3 = Collections.unmodifiableMap(fromBiblio3);
589             toScope = Collections.unmodifiableMap(toScope);
590             toType = Collections.unmodifiableMap(toType);
591             toHeirarchy = Collections.unmodifiableMap(toHeirarchy);
592 
593             toNames.freeze();
594             toRetirements.freeze();
595             macro_encompassed.freeze();
596             toChangeTo = ImmutableMap.copyOf(toChangeTo);
597 
598         } catch (IOException e) {
599             throw new ICUUncheckedIOException("Cannot parse file", e);
600         }
601     }
602 
findMatchToPrefix(String prefix, T[] values)603     public static <T> T findMatchToPrefix(String prefix, T[] values) {
604         for (T x : values) {
605             if (x.toString().startsWith(prefix)) {
606                 return x;
607             }
608         }
609         throw new IllegalArgumentException("Prefix <" + prefix + "> not found in " + Arrays.asList(values));
610     }
611 
getAvailable()612     public static Set<String> getAvailable() {
613         if (toAlpha3 == null) {
614             getData();
615         }
616         return toNames.keySet();
617     }
618 
getMacroForEncompassed(String suffix)619     public static String getMacroForEncompassed(String suffix) {
620         String prefix = encompassed_macro.get(suffix);
621         if (prefix != null)
622             return prefix;
623         if (suffix.equals("sgn"))
624             return null;
625         Set<String> names = toNames.getAll(suffix);
626         if (names == null)
627             return null;
628         for (String name : names) {
629             if (name.contains("Sign Language"))
630                 return "sgn";
631         }
632         return null;
633     }
634 
getEncompassedForMacro(String prefix)635     public static Set<String> getEncompassedForMacro(String prefix) {
636         return macro_encompassed.getAll(prefix);
637     }
638 
getMacros()639     public static Set<String> getMacros() {
640         return macro_encompassed.keySet();
641     }
642 
getEncompassed()643     public static Set<String> getEncompassed() {
644         return encompassed_macro.keySet();
645     }
646 
getChangeTo(String subtag)647     public static String getChangeTo(String subtag) {
648         return getChangeToMap().get(subtag);
649     }
650 
getChangeToMap()651     public static Map<String, String> getChangeToMap() {
652         if (toChangeTo == null) {
653             getData();
654         }
655         return toChangeTo;
656     }
657 }