1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.Iterator;
7 import java.util.List;
8 import java.util.Locale;
9 import java.util.Set;
10 import java.util.TreeSet;
11 
12 import org.unicode.cldr.util.CldrUtility;
13 import org.unicode.cldr.util.CldrUtility.LineHandler;
14 import org.unicode.cldr.util.Counter2;
15 import org.unicode.cldr.util.StandardCodes;
16 
17 import com.ibm.icu.text.NumberFormat;
18 import com.ibm.icu.text.UnicodeSet;
19 import com.ibm.icu.util.ULocale;
20 
21 public class AddPopulationData {
22     static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
23     static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
24 
25     enum WBLine {
26         // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
27 
28         Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017, YR2018, YR2019;
get(String[] pieces)29         String get(String[] pieces) {
30             return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY;
31         }
32     }
33 
/**
 * Columns of one data line of a factbook ranking file, declared in file order
 * so that a constant's ordinal is the index of its cell in the split line.
 */
enum FBLine {
    Rank, Country, Value, Year;

    /**
     * Picks this column's cell out of a split line.
     *
     * @param pieces the cells of one line, in file order
     * @return the cell for this column
     * @throws ArrayIndexOutOfBoundsException if the line has too few cells
     */
    String get(String[] pieces) {
        final int column = ordinal();
        return pieces[column];
    }
}
40 
/**
 * Columns of one line of the factbook literacy file, declared in file order
 * so that a constant's ordinal is the index of its cell in the split line.
 */
enum FBLiteracy {
    Rank, Country, Percent;

    /**
     * Picks this column's cell out of a split line.
     *
     * @param pieces the cells of one line, in file order
     * @return the cell for this column
     * @throws ArrayIndexOutOfBoundsException if the line has too few cells
     */
    String get(String[] pieces) {
        final int column = ordinal();
        return pieces[column];
    }
}
47 
48     private static final String GCP = "NY.GNP.MKTP.PP.CD";
49     private static final String POP = "SP.POP.TOTL";
50     private static final String EMPTY = "..";
51     private static Counter2<String> worldbank_gdp = new Counter2<>();
52     private static Counter2<String> worldbank_population = new Counter2<>();
53     private static Counter2<String> un_literacy = new Counter2<>();
54 
55     private static Counter2<String> factbook_gdp = new Counter2<>();
56     private static Counter2<String> factbook_population = new Counter2<>();
57     private static Counter2<String> factbook_literacy = new Counter2<>();
58 
59     private static CountryData other = new CountryData();
60 
61     static class CountryData {
62         private static Counter2<String> population = new Counter2<>();
63         private static Counter2<String> gdp = new Counter2<>();
64         private static Counter2<String> literacy = new Counter2<>();
65     }
66 
main(String[] args)67     public static void main(String[] args) throws IOException {
68 
69         System.out.println("Code"
70             + "\t" + "Name"
71             + "\t" + "Pop"
72             + "\t" + "GDP-PPP"
73             + "\t" + "UN Literacy");
74 
75         for (String country : StandardCodes.make().getGoodCountries()) {
76             showCountryData(country);
77         }
78         Set<String> outliers = new TreeSet<>();
79         outliers.addAll(factbook_population.keySet());
80         outliers.addAll(worldbank_population.keySet());
81         outliers.addAll(factbook_gdp.keySet());
82         outliers.addAll(worldbank_gdp.keySet());
83         outliers.addAll(un_literacy.keySet());
84         for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
85             if (StandardCodes.isCountry(it.next())) {
86                 it.remove();
87             }
88         }
89         // outliers.remove("AN");
90         if (outliers.size() != 0) {
91             System.out.println("Mistakes: data for non-UN codes");
92             for (String country : outliers) {
93                 showCountryData(country);
94             }
95             throw new IllegalArgumentException("Mistakes: data for non-country codes");
96         }
97         Set<String> altNames = new TreeSet<>();
98         String oldCode = "";
99         for (String display : CountryCodeConverter.names()) {
100             String code = CountryCodeConverter.getCodeFromName(display, true);
101             String icu = ULocale.getDisplayCountry("und-" + code, "en");
102             if (!display.equalsIgnoreCase(icu)) {
103                 altNames.add(code + "\t" + display + "\t" + icu);
104             }
105         }
106         oldCode = "";
107         if (SHOW_ALTERNATE_NAMES) {
108             for (String altName : altNames) {
109                 String[] pieces = altName.split("\t");
110                 String code = pieces[0];
111                 if (code.equals("ZZ")) continue;
112                 if (!code.equals(oldCode)) {
113                     oldCode = code;
114                     System.out.println();
115                 }
116                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
117                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
118                 // "</territory> <!-- " + pieces[2] + " -->");
119             }
120         }
121     }
122 
showCountryData(String country)123     private static void showCountryData(String country) {
124         number.setMaximumFractionDigits(0);
125         System.out.println(country
126             + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
127             + "\t" + number.format(getPopulation(country))
128             + "\t" + number.format(getGdp(country))
129             + "\t" + percent.format(getLiteracy(country) / 100));
130     }
131 
getLiteracy(String country)132     public static Double getLiteracy(String country) {
133         return firstNonZero(factbook_literacy.getCount(country),
134             un_literacy.getCount(country),
135             CountryData.literacy.getCount(country));
136     }
137 
getGdp(String country)138     public static Double getGdp(String country) {
139         return firstNonZero(factbook_gdp.getCount(country),
140             worldbank_gdp.getCount(country),
141             CountryData.gdp.getCount(country));
142     }
143 
getPopulation(String country)144     public static Double getPopulation(String country) {
145         return firstNonZero(factbook_population.getCount(country),
146             worldbank_population.getCount(country),
147             CountryData.population.getCount(country));
148     }
149 
firstNonZero(Double... items)150     private static Double firstNonZero(Double... items) {
151         for (Double item : items) {
152             if (item.doubleValue() != 0) {
153                 return item;
154             }
155         }
156         return 0.0;
157     }
158 
splitCommaSeparated(String line)159     static String[] splitCommaSeparated(String line) {
160         // items are separated by ','
161         // each item is of the form abc...
162         // or "..." (required if a comma or quote is contained)
163         // " in a field is represented by ""
164         List<String> result = new ArrayList<>();
165         StringBuilder item = new StringBuilder();
166         boolean inQuote = false;
167         for (int i = 0; i < line.length(); ++i) {
168             char ch = line.charAt(i); // don't worry about supplementaries
169             switch (ch) {
170             case '"':
171                 inQuote = !inQuote;
172                 // at start or end, that's enough
173                 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote
174                 if (inQuote && item.length() != 0) {
175                     item.append('"');
176                     inQuote = true;
177                 }
178                 break;
179             case ',':
180                 if (!inQuote) {
181                     result.add(item.toString());
182                     item.setLength(0);
183                 } else {
184                     item.append(ch);
185                 }
186                 break;
187             default:
188                 item.append(ch);
189                 break;
190             }
191         }
192         result.add(item.toString());
193         return result.toArray(new String[result.size()]);
194     }
195 
loadFactbookInfo(String filename, final Counter2<String> factbookGdp)196     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
197         CldrUtility.handleFile(filename, new LineHandler() {
198             @Override
199             public boolean handle(String line) {
200                 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
201                     || line.startsWith(" This file")) {
202                     return false;
203                 }
204                 String[] pieces = line.split("\\s{2,}");
205                 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces), true);
206                 if (code == null) {
207                     return false;
208                 }
209                 if (!StandardCodes.isCountry(code)) {
210                     if (ADD_POP) {
211                         System.out.println("Skipping factbook info for: " + code);
212                     }
213                     return false;
214                 }
215                 code = code.toUpperCase(Locale.ENGLISH);
216                 String valueString = FBLine.Value.get(pieces).trim();
217                 if (valueString.startsWith("$")) {
218                     valueString = valueString.substring(1);
219                 }
220                 valueString = valueString.replace(",", "");
221                 double value = Double.parseDouble(valueString.trim());
222                 factbookGdp.add(code, value);
223                 if (ADD_POP) {
224                     System.out.println("Factbook gdp:\t" + code + "\t" + value);
225                 }
226                 return true;
227             }
228         });
229     }
230 
231     static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
232     static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
233     static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
234 
235     static class MyLineHandler implements LineHandler {
236         CountryData countryData;
237 
MyLineHandler(CountryData countryData)238         public MyLineHandler(CountryData countryData) {
239             super();
240             this.countryData = countryData;
241         }
242 
243         @Override
handle(String line)244         public boolean handle(String line) throws ParseException {
245             if (line.startsWith("#")) return true;
246             if (line.length() == 0) {
247                 return true;
248             }
249             String[] pieces = line.split(";");
250             final String code = pieces[0].trim();
251             if (code.equals("Code")) {
252                 return false;
253             }
254             // Code;Name;Type;Data;Source
255             final String typeString = pieces[2].trim();
256             final String data = pieces[3].trim();
257             if (typeString.equals("gdp-ppp")) {
258                 if (StandardCodes.isCountry(data)) {
259                     Double otherPop = getPopulation(data);
260                     Double otherGdp = getGdp(data);
261                     Double myPop = getPopulation(code);
262                     if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
263                         otherPop = getPopulation(data);
264                         otherGdp = getPopulation(data);
265                         myPop = getPopulation(code);
266                         throw new IllegalArgumentException("Zero population");
267                     }
268                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
269                 } else {
270                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
271                 }
272             } else if (typeString.equals("population")) {
273                 if (StandardCodes.isCountry(data)) {
274                     throw new IllegalArgumentException("Population can't use other country's");
275                 }
276                 CountryData.population.add(code, number.parse(data).doubleValue());
277             } else if (typeString.equals("literacy")) {
278                 if (StandardCodes.isCountry(data)) {
279                     Double otherPop = getLiteracy(data);
280                     CountryData.literacy.add(code, otherPop);
281                 } else {
282                     CountryData.literacy.add(code, number.parse(data).doubleValue());
283                 }
284             } else {
285                 throw new IllegalArgumentException("Illegal type");
286             }
287             return true;
288         }
289     }
290 
291     static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze();
292 
loadFactbookLiteracy()293     private static void loadFactbookLiteracy() throws IOException {
294         final String filename = "external/factbook_literacy.txt";
295         CldrUtility.handleFile(filename, new LineHandler() {
296             @Override
297             public boolean handle(String line) {
298                 String[] pieces = line.split("\\t");
299                 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces), true);
300                 if (code == null) {
301                     return false;
302                 }
303                 if (!StandardCodes.isCountry(code)) {
304                     if (ADD_POP) {
305                         System.out.println("Skipping factbook literacy for: " + code);
306                     }
307                     return false;
308                 }
309                 code = code.toUpperCase(Locale.ENGLISH);
310                 String valueString = FBLiteracy.Percent.get(pieces).trim();
311                 double percent = Double.parseDouble(valueString);
312                 factbook_literacy.put(code, percent);
313                 if (ADD_POP) {
314                     System.out.println("Factbook literacy:\t" + code + "\t" + percent);
315                 }
316                 code = null;
317                 return true;
318             }
319         });
320     }
321 
loadWorldBankInfo()322     private static void loadWorldBankInfo() throws IOException {
323         final String filename = "external/world_bank_data.csv";
324 
325         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
326 
327         CldrUtility.handleFile(filename, new LineHandler() {
328             @Override
329             public boolean handle(String line) {
330                 if (line.contains("Series Code")) {
331                     return false;
332                 }
333                 String[] pieces = splitCommaSeparated(line);
334 
335                 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
336 
337                 final String seriesCode = WBLine.Series_Code.get(pieces);
338 
339                 String last = null;
340                 for (WBLine i : WBLine.values()) {
341                     if (i.compareTo(WBLine.YR2000) >= 0) {
342                         String current = i.get(pieces);
343                         if (current.length() != 0 && !current.equals(EMPTY)) {
344                             last = current;
345                         }
346                     }
347                 }
348                 if (last == null) {
349                     return false;
350                 }
351                 String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces), true);
352                 if (country == null) {
353                     return false;
354                 }
355                 if (!StandardCodes.isCountry(country)) {
356                     if (ADD_POP) {
357                         System.out.println("Skipping worldbank info for: " + country);
358                     }
359                     return false;
360                 }
361                 double value;
362                 try {
363                     value = Double.parseDouble(last);
364                 } catch (NumberFormatException e) {
365                     throw new IllegalArgumentException("File changed format: need to modify code");
366                 }
367                 if (seriesCode.equals(GCP)) {
368                     worldbank_gdp.add(country, value);
369                 } else if (seriesCode.equals(POP)) {
370                     worldbank_population.add(country, value);
371                 } else {
372                     throw new IllegalArgumentException();
373                 }
374                 return true;
375             }
376         });
377     }
378 
loadUnLiteracy()379     private static void loadUnLiteracy() throws IOException {
380         CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
381             @Override
382             public boolean handle(String line) {
383                 // Afghanistan,2000, ,28,43,13,,34,51,18
384                 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,,"         Youth (15-24) literacy rate",,,,
385                 // ,,,Total,Men,Women,,Total,Men,Women
386                 // "Albania",2008,,96,,97,,95,,99,,99,,99
387                 String[] pieces = splitCommaSeparated(line);
388                 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
389                     return false;
390                 }
391                 String code = CountryCodeConverter.getCodeFromName(pieces[0], true);
392                 if (code == null) {
393                     return false;
394                 }
395                 if (!StandardCodes.isCountry(code)) {
396                     if (ADD_POP) {
397                         System.out.println("Skipping UN info for: " + code);
398                     }
399                     return false;
400                 }
401                 String totalLiteracy = pieces[3];
402                 if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) {
403                     return true;
404                 }
405                 double percent = Double.parseDouble(totalLiteracy);
406                 un_literacy.add(code, percent);
407                 return true;
408             }
409         });
410     }
411 
412     static {
413         try {
loadFactbookLiteracy()414             loadFactbookLiteracy();
loadUnLiteracy()415             loadUnLiteracy();
416 
417             loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
418             loadFactbookInfo("external/factbook_population.txt", factbook_population);
419             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
420 
loadWorldBankInfo()421             loadWorldBankInfo();
422             StandardCodes sc = StandardCodes.make();
423             StringBuilder myErrors = new StringBuilder();
424             for (String territory : sc.getGoodAvailableCodes("territory")) {
425                 if (!StandardCodes.isCountry(territory)) {
426                     continue;
427                 }
428                 double gdp = getGdp(territory);
429                 double literacy = getLiteracy(territory);
430                 double population = getPopulation(territory);
431                 if (gdp == 0) {
432                     // AX;Aland Islands;population;26,200;www.aland.ax
433                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
434                 }
435                 if (literacy == 0) {
436                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
437                 }
438                 if (population == 0) {
439                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
440                         + ";population;0;reason");
441                 }
442             }
443             if (myErrors.length() != 0) {
444                 throw new IllegalArgumentException(
445                     "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:"
446                         + myErrors);
447             }
448         } catch (IOException e) {
449         }
450     }
451 }
452