1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.Iterator;
7 import java.util.List;
8 import java.util.Locale;
9 import java.util.Set;
10 import java.util.TreeSet;
11 
12 import org.unicode.cldr.util.CldrUtility;
13 import org.unicode.cldr.util.CldrUtility.LineHandler;
14 import org.unicode.cldr.util.Counter2;
15 import org.unicode.cldr.util.StandardCodes;
16 
17 import com.ibm.icu.text.NumberFormat;
18 import com.ibm.icu.text.UnicodeSet;
19 import com.ibm.icu.util.ULocale;
20 
21 public class AddPopulationData {
    // Read from the "ADD_POP" property (second argument is presumably the default, false).
    // When set, the loaders in this class print each value they accept and each code they skip.
    static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
    // Read from the "SHOW_ALTERNATE_NAMES" property (presumably defaulting to false).
    // When set, main() prints the names from CountryCodeConverter that differ from the
    // display name returned by ULocale.getDisplayCountry for "en".
    static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
24 
25     enum WBLine {
26         // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
27 
28         Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017;
get(String[] pieces)29         String get(String[] pieces) {
30             return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY;
31         }
32     }
33 
34     enum FBLine {
35         Rank, Country, Value, Year;
get(String[] pieces)36         String get(String[] pieces) {
37             return pieces[ordinal()];
38         }
39     }
40 
41     enum FBLiteracy {
42         Rank, Country, Percent;
get(String[] pieces)43         String get(String[] pieces) {
44             return pieces[ordinal()];
45         }
46     }
47 
    // World Bank series code whose values loadWorldBankInfo stores as GDP. In the sample
    // record on WBLine it labels the series "GNI, PPP (current international $)".
    private static final String GCP = "NY.GNP.MKTP.PP.CD";
    // World Bank series code whose values loadWorldBankInfo stores as population.
    private static final String POP = "SP.POP.TOTL";
    // Marker treated as "no value" in World Bank records; also returned by WBLine.get
    // for a column that is missing from a short record.
    private static final String EMPTY = "..";
    // Per-code values filled by loadWorldBankInfo (gdp, population) and loadUnLiteracy.
    private static Counter2<String> worldbank_gdp = new Counter2<String>();
    private static Counter2<String> worldbank_population = new Counter2<String>();
    private static Counter2<String> un_literacy = new Counter2<String>();

    // Per-code values filled by loadFactbookInfo (gdp, population) and loadFactbookLiteracy.
    // The getters consult these before the World Bank / UN counters.
    private static Counter2<String> factbook_gdp = new Counter2<String>();
    private static Counter2<String> factbook_population = new Counter2<String>();
    private static Counter2<String> factbook_literacy = new Counter2<String>();

    // Instance handed to MyLineHandler in the static initializer. Note that CountryData's
    // counters are static, so the data is shared regardless of instance.
    private static CountryData other = new CountryData();
60 
    /**
     * Holder for the manually supplied values that MyLineHandler reads from
     * external/other_country_data.txt. All three counters are static, so they are shared
     * by every instance; the getters use them as the last fallback.
     */
    static class CountryData {
        private static Counter2<String> population = new Counter2<String>();
        private static Counter2<String> gdp = new Counter2<String>();
        private static Counter2<String> literacy = new Counter2<String>();
    }
66 
main(String[] args)67     public static void main(String[] args) throws IOException {
68 
69         System.out.println("Code"
70             + "\t" + "Name"
71             + "\t" + "Pop"
72             + "\t" + "GDP-PPP"
73             + "\t" + "UN Literacy");
74 
75         for (String country : StandardCodes.make().getGoodCountries()) {
76             showCountryData(country);
77         }
78         Set<String> outliers = new TreeSet<String>();
79         outliers.addAll(factbook_population.keySet());
80         outliers.addAll(worldbank_population.keySet());
81         outliers.addAll(factbook_gdp.keySet());
82         outliers.addAll(worldbank_gdp.keySet());
83         outliers.addAll(un_literacy.keySet());
84         for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
85             if (StandardCodes.isCountry(it.next())) {
86                 it.remove();
87             }
88         }
89         // outliers.remove("AN");
90         if (outliers.size() != 0) {
91             System.out.println("Mistakes: data for non-UN codes");
92             for (String country : outliers) {
93                 showCountryData(country);
94             }
95             throw new IllegalArgumentException("Mistakes: data for non-country codes");
96         }
97         Set<String> altNames = new TreeSet<String>();
98         String oldCode = "";
99         for (String display : CountryCodeConverter.names()) {
100             String code = CountryCodeConverter.getCodeFromName(display);
101             String icu = ULocale.getDisplayCountry("und-" + code, "en");
102             if (!display.equalsIgnoreCase(icu)) {
103                 altNames.add(code + "\t" + display + "\t" + icu);
104             }
105         }
106         oldCode = "";
107         if (SHOW_ALTERNATE_NAMES) {
108             for (String altName : altNames) {
109                 String[] pieces = altName.split("\t");
110                 String code = pieces[0];
111                 if (code.equals("ZZ")) continue;
112                 if (!code.equals(oldCode)) {
113                     oldCode = code;
114                     System.out.println();
115                 }
116                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
117                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
118                 // "</territory> <!-- " + pieces[2] + " -->");
119             }
120         }
121     }
122 
showCountryData(String country)123     private static void showCountryData(String country) {
124         number.setMaximumFractionDigits(0);
125         System.out.println(country
126             + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
127             + "\t" + number.format(getPopulation(country))
128             + "\t" + number.format(getGdp(country))
129             + "\t" + percent.format(getLiteracy(country) / 100));
130     }
131 
getLiteracy(String country)132     public static Double getLiteracy(String country) {
133         return firstNonZero(factbook_literacy.getCount(country),
134             un_literacy.getCount(country),
135             CountryData.literacy.getCount(country));
136     }
137 
getGdp(String country)138     public static Double getGdp(String country) {
139         return firstNonZero(factbook_gdp.getCount(country),
140             worldbank_gdp.getCount(country),
141             CountryData.gdp.getCount(country));
142     }
143 
getPopulation(String country)144     public static Double getPopulation(String country) {
145         return firstNonZero(factbook_population.getCount(country),
146             worldbank_population.getCount(country),
147             CountryData.population.getCount(country));
148     }
149 
firstNonZero(Double... items)150     private static Double firstNonZero(Double... items) {
151         for (Double item : items) {
152             if (item.doubleValue() != 0) {
153                 return item;
154             }
155         }
156         return 0.0;
157     }
158 
splitCommaSeparated(String line)159     static String[] splitCommaSeparated(String line) {
160         // items are separated by ','
161         // each item is of the form abc...
162         // or "..." (required if a comma or quote is contained)
163         // " in a field is represented by ""
164         List<String> result = new ArrayList<String>();
165         StringBuilder item = new StringBuilder();
166         boolean inQuote = false;
167         for (int i = 0; i < line.length(); ++i) {
168             char ch = line.charAt(i); // don't worry about supplementaries
169             switch (ch) {
170             case '"':
171                 inQuote = !inQuote;
172                 // at start or end, that's enough
173                 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote
174                 if (inQuote && item.length() != 0) {
175                     item.append('"');
176                     inQuote = true;
177                 }
178                 break;
179             case ',':
180                 if (!inQuote) {
181                     result.add(item.toString());
182                     item.setLength(0);
183                 } else {
184                     item.append(ch);
185                 }
186                 break;
187             default:
188                 item.append(ch);
189                 break;
190             }
191         }
192         result.add(item.toString());
193         return result.toArray(new String[result.size()]);
194     }
195 
loadFactbookInfo(String filename, final Counter2<String> factbookGdp)196     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
197         CldrUtility.handleFile(filename, new LineHandler() {
198             public boolean handle(String line) {
199                 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
200                     || line.startsWith(" This file")) {
201                     return false;
202                 }
203                 String[] pieces = line.split("\\s{2,}");
204                 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces));
205                 if (code == null) {
206                     return false;
207                 }
208                 if (!StandardCodes.isCountry(code)) {
209                     if (ADD_POP) {
210                         System.out.println("Skipping factbook info for: " + code);
211                     }
212                     return false;
213                 }
214                 code = code.toUpperCase(Locale.ENGLISH);
215                 String valueString = FBLine.Value.get(pieces).trim();
216                 if (valueString.startsWith("$")) {
217                     valueString = valueString.substring(1);
218                 }
219                 valueString = valueString.replace(",", "");
220                 double value = Double.parseDouble(valueString.trim());
221                 factbookGdp.add(code, value);
222                 if (ADD_POP) {
223                     System.out.println("Factbook gdp:\t" + code + "\t" + value);
224                 }
225                 return true;
226             }
227         });
228     }
229 
    // Shared formatters for ULocale.US. MyLineHandler parses gdp-ppp values with `dollars`
    // and population/literacy values with `number`; showCountryData prints with `number`
    // (after setting its maximum fraction digits to 0) and `percent`.
    static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
    static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
    static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
233 
234     static class MyLineHandler implements LineHandler {
235         CountryData countryData;
236 
MyLineHandler(CountryData countryData)237         public MyLineHandler(CountryData countryData) {
238             super();
239             this.countryData = countryData;
240         }
241 
handle(String line)242         public boolean handle(String line) throws ParseException {
243             if (line.startsWith("#")) return true;
244             if (line.length() == 0) {
245                 return true;
246             }
247             String[] pieces = line.split(";");
248             final String code = pieces[0].trim();
249             if (code.equals("Code")) {
250                 return false;
251             }
252             // Code;Name;Type;Data;Source
253             final String typeString = pieces[2].trim();
254             final String data = pieces[3].trim();
255             if (typeString.equals("gdp-ppp")) {
256                 if (StandardCodes.isCountry(data)) {
257                     Double otherPop = getPopulation(data);
258                     Double otherGdp = getGdp(data);
259                     Double myPop = getPopulation(code);
260                     if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
261                         otherPop = getPopulation(data);
262                         otherGdp = getPopulation(data);
263                         myPop = getPopulation(code);
264                         throw new IllegalArgumentException("Zero population");
265                     }
266                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
267                 } else {
268                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
269                 }
270             } else if (typeString.equals("population")) {
271                 if (StandardCodes.isCountry(data)) {
272                     throw new IllegalArgumentException("Population can't use other country's");
273                 }
274                 CountryData.population.add(code, number.parse(data).doubleValue());
275             } else if (typeString.equals("literacy")) {
276                 if (StandardCodes.isCountry(data)) {
277                     Double otherPop = getLiteracy(data);
278                     CountryData.literacy.add(code, otherPop);
279                 } else {
280                     CountryData.literacy.add(code, number.parse(data).doubleValue());
281                 }
282             } else {
283                 throw new IllegalArgumentException("Illegal type");
284             }
285             return true;
286         }
287     }
288 
    // Frozen set built from the pattern "[:Nd:]". loadUnLiteracy uses it to accept only rows
    // whose second column consists entirely of characters in this set (the "Year" column in
    // the sample rows shown there).
    static final UnicodeSet DIGITS = (UnicodeSet) new UnicodeSet("[:Nd:]").freeze();
290 
loadFactbookLiteracy()291     private static void loadFactbookLiteracy() throws IOException {
292         final String filename = "external/factbook_literacy.txt";
293         CldrUtility.handleFile(filename, new LineHandler() {
294             public boolean handle(String line) {
295                 String[] pieces = line.split("\\t");
296                 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces));
297                 if (code == null) {
298                     return false;
299                 }
300                 if (!StandardCodes.isCountry(code)) {
301                     if (ADD_POP) {
302                         System.out.println("Skipping factbook literacy for: " + code);
303                     }
304                     return false;
305                 }
306                 code = code.toUpperCase(Locale.ENGLISH);
307                 String valueString = FBLiteracy.Percent.get(pieces).trim();
308                 double percent = Double.parseDouble(valueString);
309                 factbook_literacy.put(code, percent);
310                 if (ADD_POP) {
311                     System.out.println("Factbook literacy:\t" + code + "\t" + percent);
312                 }
313                 code = null;
314                 return true;
315             }
316         });
317     }
318 
loadWorldBankInfo()319     private static void loadWorldBankInfo() throws IOException {
320         final String filename = "external/world_bank_data.csv";
321 
322         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
323 
324         CldrUtility.handleFile(filename, new LineHandler() {
325             public boolean handle(String line) {
326                 if (line.contains("Series Code")) {
327                     return false;
328                 }
329                 String[] pieces = splitCommaSeparated(line);
330 
331                 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
332 
333                 final String seriesCode = WBLine.Series_Code.get(pieces);
334 
335                 String last = null;
336                 for (WBLine i : WBLine.values()) {
337                     if (i.compareTo(WBLine.YR2000) >= 0) {
338                         String current = i.get(pieces);
339                         if (current.length() != 0 && !current.equals(EMPTY)) {
340                             last = current;
341                         }
342                     }
343                 }
344                 if (last == null) {
345                     return false;
346                 }
347                 String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces));
348                 if (country == null) {
349                     return false;
350                 }
351                 if (!StandardCodes.isCountry(country)) {
352                     if (ADD_POP) {
353                         System.out.println("Skipping worldbank info for: " + country);
354                     }
355                     return false;
356                 }
357                 double value;
358                 try {
359                     value = Double.parseDouble(last);
360                 } catch (NumberFormatException e) {
361                     throw new IllegalArgumentException("File changed format: need to modify code");
362                 }
363                 if (seriesCode.equals(GCP)) {
364                     worldbank_gdp.add(country, value);
365                 } else if (seriesCode.equals(POP)) {
366                     worldbank_population.add(country, value);
367                 } else {
368                     throw new IllegalArgumentException();
369                 }
370                 return true;
371             }
372         });
373     }
374 
loadUnLiteracy()375     private static void loadUnLiteracy() throws IOException {
376         CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
377             public boolean handle(String line) {
378                 // Afghanistan,2000, ,28,43,13,,34,51,18
379                 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,,"         Youth (15-24) literacy rate",,,,
380                 // ,,,Total,Men,Women,,Total,Men,Women
381                 // "Albania",2008,,96,,97,,95,,99,,99,,99
382                 String[] pieces = splitCommaSeparated(line);
383                 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
384                     return false;
385                 }
386                 String code = CountryCodeConverter.getCodeFromName(pieces[0]);
387                 if (code == null) {
388                     return false;
389                 }
390                 if (!StandardCodes.isCountry(code)) {
391                     if (ADD_POP) {
392                         System.out.println("Skipping UN info for: " + code);
393                     }
394                     return false;
395                 }
396                 String totalLiteracy = pieces[3];
397                 if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) {
398                     return true;
399                 }
400                 double percent = Double.parseDouble(totalLiteracy);
401                 un_literacy.add(code, percent);
402                 return true;
403             }
404         });
405     }
406 
407     static {
408         try {
loadFactbookLiteracy()409             loadFactbookLiteracy();
loadUnLiteracy()410             loadUnLiteracy();
411 
412             loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
413             loadFactbookInfo("external/factbook_population.txt", factbook_population);
414             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
415 
loadWorldBankInfo()416             loadWorldBankInfo();
417             StandardCodes sc = StandardCodes.make();
418             StringBuilder myErrors = new StringBuilder();
419             for (String territory : sc.getGoodAvailableCodes("territory")) {
420                 if (!StandardCodes.isCountry(territory)) {
421                     continue;
422                 }
423                 double gdp = getGdp(territory);
424                 double literacy = getLiteracy(territory);
425                 double population = getPopulation(territory);
426                 if (gdp == 0) {
427                     // AX;Aland Islands;population;26,200;www.aland.ax
428                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
429                 }
430                 if (literacy == 0) {
431                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
432                 }
433                 if (population == 0) {
434                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
435                         + ";population;0;reason");
436                 }
437             }
438             if (myErrors.length() != 0) {
439                 throw new IllegalArgumentException(
440                     "Missing Country values, the following and add to external/other_country_data to fix:"
441                         + myErrors);
442             }
443         } catch (IOException e) {
444         }
445     }
446 }
447