1 package org.unicode.cldr.tool; 2 3 import java.io.IOException; 4 import java.text.ParseException; 5 import java.util.ArrayList; 6 import java.util.Iterator; 7 import java.util.List; 8 import java.util.Locale; 9 import java.util.Set; 10 import java.util.TreeSet; 11 12 import org.unicode.cldr.util.CldrUtility; 13 import org.unicode.cldr.util.CldrUtility.LineHandler; 14 import org.unicode.cldr.util.Counter2; 15 import org.unicode.cldr.util.StandardCodes; 16 17 import com.ibm.icu.text.NumberFormat; 18 import com.ibm.icu.text.UnicodeSet; 19 import com.ibm.icu.util.ULocale; 20 21 public class AddPopulationData { 22 static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false); 23 static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false); 24 25 enum WBLine { 26 // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..", 27 28 Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017, YR2018, YR2019; get(String[] pieces)29 String get(String[] pieces) { 30 return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY; 31 } 32 } 33 34 enum FBLine { 35 Rank, Country, Value, Year; get(String[] pieces)36 String get(String[] pieces) { 37 return pieces[ordinal()]; 38 } 39 } 40 41 enum FBLiteracy { 42 Rank, Country, Percent; get(String[] pieces)43 String get(String[] pieces) { 44 return pieces[ordinal()]; 45 } 46 } 47 48 private static final String GCP = "NY.GNP.MKTP.PP.CD"; 49 private static final String POP = "SP.POP.TOTL"; 50 private static final String EMPTY = ".."; 51 private static Counter2<String> worldbank_gdp = new Counter2<>(); 52 private static Counter2<String> worldbank_population = new Counter2<>(); 53 private static Counter2<String> un_literacy = new Counter2<>(); 54 55 private static Counter2<String> factbook_gdp = new Counter2<>(); 56 private static Counter2<String> factbook_population = new Counter2<>(); 57 private static Counter2<String> factbook_literacy = new Counter2<>(); 58 59 private static CountryData other = new CountryData(); 60 61 static class CountryData { 62 private static Counter2<String> population = new Counter2<>(); 63 private static Counter2<String> gdp = new Counter2<>(); 64 private static Counter2<String> literacy = new Counter2<>(); 65 } 66 main(String[] args)67 public static void main(String[] args) throws IOException { 68 69 System.out.println("Code" 70 + "\t" + "Name" 71 + "\t" + "Pop" 72 + "\t" + "GDP-PPP" 73 + "\t" + "UN Literacy"); 74 75 for (String country : StandardCodes.make().getGoodCountries()) { 76 showCountryData(country); 77 } 78 Set<String> outliers = new TreeSet<>(); 79 outliers.addAll(factbook_population.keySet()); 80 outliers.addAll(worldbank_population.keySet()); 81 outliers.addAll(factbook_gdp.keySet()); 82 outliers.addAll(worldbank_gdp.keySet()); 83 outliers.addAll(un_literacy.keySet()); 84 for (Iterator<String> it = outliers.iterator(); it.hasNext();) { 85 if (StandardCodes.isCountry(it.next())) { 86 it.remove(); 87 } 88 } 89 // outliers.remove("AN"); 90 if (outliers.size() != 0) { 91 System.out.println("Mistakes: data for non-UN codes"); 92 for (String country : outliers) { 93 showCountryData(country); 94 } 95 throw new IllegalArgumentException("Mistakes: data for non-country codes"); 96 } 97 Set<String> altNames = new TreeSet<>(); 98 String oldCode = ""; 99 for (String display : CountryCodeConverter.names()) { 100 String code = CountryCodeConverter.getCodeFromName(display, true); 101 String icu = ULocale.getDisplayCountry("und-" + code, "en"); 102 if (!display.equalsIgnoreCase(icu)) { 103 altNames.add(code + "\t" + display + "\t" + icu); 104 } 105 } 106 oldCode = ""; 107 if (SHOW_ALTERNATE_NAMES) { 108 for (String altName : altNames) { 109 String[] pieces = altName.split("\t"); 110 String code = pieces[0]; 111 if (code.equals("ZZ")) continue; 112 if (!code.equals(oldCode)) { 113 oldCode = code; 114 System.out.println(); 115 } 116 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]); 117 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] + 118 // "</territory> <!-- " + pieces[2] + " -->"); 119 } 120 } 121 } 122 showCountryData(String country)123 private static void showCountryData(String country) { 124 number.setMaximumFractionDigits(0); 125 System.out.println(country 126 + "\t" + ULocale.getDisplayCountry("und-" + country, "en") 127 + "\t" + number.format(getPopulation(country)) 128 + "\t" + number.format(getGdp(country)) 129 + "\t" + percent.format(getLiteracy(country) / 100)); 130 } 131 getLiteracy(String country)132 public static Double getLiteracy(String country) { 133 return firstNonZero(factbook_literacy.getCount(country), 134 un_literacy.getCount(country), 135 CountryData.literacy.getCount(country)); 136 } 137 getGdp(String country)138 public static Double getGdp(String country) { 139 return firstNonZero(factbook_gdp.getCount(country), 140 worldbank_gdp.getCount(country), 141 CountryData.gdp.getCount(country)); 142 } 143 getPopulation(String country)144 public static Double getPopulation(String country) { 145 return firstNonZero(factbook_population.getCount(country), 146 worldbank_population.getCount(country), 147 CountryData.population.getCount(country)); 148 } 149 firstNonZero(Double... items)150 private static Double firstNonZero(Double... items) { 151 for (Double item : items) { 152 if (item.doubleValue() != 0) { 153 return item; 154 } 155 } 156 return 0.0; 157 } 158 splitCommaSeparated(String line)159 static String[] splitCommaSeparated(String line) { 160 // items are separated by ',' 161 // each item is of the form abc... 162 // or "..." (required if a comma or quote is contained) 163 // " in a field is represented by "" 164 List<String> result = new ArrayList<>(); 165 StringBuilder item = new StringBuilder(); 166 boolean inQuote = false; 167 for (int i = 0; i < line.length(); ++i) { 168 char ch = line.charAt(i); // don't worry about supplementaries 169 switch (ch) { 170 case '"': 171 inQuote = !inQuote; 172 // at start or end, that's enough 173 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote 174 if (inQuote && item.length() != 0) { 175 item.append('"'); 176 inQuote = true; 177 } 178 break; 179 case ',': 180 if (!inQuote) { 181 result.add(item.toString()); 182 item.setLength(0); 183 } else { 184 item.append(ch); 185 } 186 break; 187 default: 188 item.append(ch); 189 break; 190 } 191 } 192 result.add(item.toString()); 193 return result.toArray(new String[result.size()]); 194 } 195 loadFactbookInfo(String filename, final Counter2<String> factbookGdp)196 private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException { 197 CldrUtility.handleFile(filename, new LineHandler() { 198 @Override 199 public boolean handle(String line) { 200 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank") 201 || line.startsWith(" This file")) { 202 return false; 203 } 204 String[] pieces = line.split("\\s{2,}"); 205 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces), true); 206 if (code == null) { 207 return false; 208 } 209 if (!StandardCodes.isCountry(code)) { 210 if (ADD_POP) { 211 System.out.println("Skipping factbook info for: " + code); 212 } 213 return false; 214 } 215 code = code.toUpperCase(Locale.ENGLISH); 216 String valueString = FBLine.Value.get(pieces).trim(); 217 if (valueString.startsWith("$")) { 218 valueString = valueString.substring(1); 219 } 220 valueString = valueString.replace(",", ""); 221 double value = Double.parseDouble(valueString.trim()); 222 factbookGdp.add(code, value); 223 if (ADD_POP) { 224 System.out.println("Factbook gdp:\t" + code + "\t" + value); 225 } 226 return true; 227 } 228 }); 229 } 230 231 static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US); 232 static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US); 233 static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US); 234 235 static class MyLineHandler implements LineHandler { 236 CountryData countryData; 237 MyLineHandler(CountryData countryData)238 public MyLineHandler(CountryData countryData) { 239 super(); 240 this.countryData = countryData; 241 } 242 243 @Override handle(String line)244 public boolean handle(String line) throws ParseException { 245 if (line.startsWith("#")) return true; 246 if (line.length() == 0) { 247 return true; 248 } 249 String[] pieces = line.split(";"); 250 final String code = pieces[0].trim(); 251 if (code.equals("Code")) { 252 return false; 253 } 254 // Code;Name;Type;Data;Source 255 final String typeString = pieces[2].trim(); 256 final String data = pieces[3].trim(); 257 if (typeString.equals("gdp-ppp")) { 258 if (StandardCodes.isCountry(data)) { 259 Double otherPop = getPopulation(data); 260 Double otherGdp = getGdp(data); 261 Double myPop = getPopulation(code); 262 if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) { 263 otherPop = getPopulation(data); 264 otherGdp = getPopulation(data); 265 myPop = getPopulation(code); 266 throw new IllegalArgumentException("Zero population"); 267 } 268 CountryData.gdp.add(code, otherGdp * myPop / otherPop); 269 } else { 270 CountryData.gdp.add(code, dollars.parse(data).doubleValue()); 271 } 272 } else if (typeString.equals("population")) { 273 if (StandardCodes.isCountry(data)) { 274 throw new IllegalArgumentException("Population can't use other country's"); 275 } 276 CountryData.population.add(code, number.parse(data).doubleValue()); 277 } else if (typeString.equals("literacy")) { 278 if (StandardCodes.isCountry(data)) { 279 Double otherPop = getLiteracy(data); 280 CountryData.literacy.add(code, otherPop); 281 } else { 282 CountryData.literacy.add(code, number.parse(data).doubleValue()); 283 } 284 } else { 285 throw new IllegalArgumentException("Illegal type"); 286 } 287 return true; 288 } 289 } 290 291 static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze(); 292 loadFactbookLiteracy()293 private static void loadFactbookLiteracy() throws IOException { 294 final String filename = "external/factbook_literacy.txt"; 295 CldrUtility.handleFile(filename, new LineHandler() { 296 @Override 297 public boolean handle(String line) { 298 String[] pieces = line.split("\\t"); 299 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces), true); 300 if (code == null) { 301 return false; 302 } 303 if (!StandardCodes.isCountry(code)) { 304 if (ADD_POP) { 305 System.out.println("Skipping factbook literacy for: " + code); 306 } 307 return false; 308 } 309 code = code.toUpperCase(Locale.ENGLISH); 310 String valueString = FBLiteracy.Percent.get(pieces).trim(); 311 double percent = Double.parseDouble(valueString); 312 factbook_literacy.put(code, percent); 313 if (ADD_POP) { 314 System.out.println("Factbook literacy:\t" + code + "\t" + percent); 315 } 316 code = null; 317 return true; 318 } 319 }); 320 } 321 loadWorldBankInfo()322 private static void loadWorldBankInfo() throws IOException { 323 final String filename = "external/world_bank_data.csv"; 324 325 // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename)); 326 327 CldrUtility.handleFile(filename, new LineHandler() { 328 @Override 329 public boolean handle(String line) { 330 if (line.contains("Series Code")) { 331 return false; 332 } 333 String[] pieces = splitCommaSeparated(line); 334 335 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\""); 336 337 final String seriesCode = WBLine.Series_Code.get(pieces); 338 339 String last = null; 340 for (WBLine i : WBLine.values()) { 341 if (i.compareTo(WBLine.YR2000) >= 0) { 342 String current = i.get(pieces); 343 if (current.length() != 0 && !current.equals(EMPTY)) { 344 last = current; 345 } 346 } 347 } 348 if (last == null) { 349 return false; 350 } 351 String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces), true); 352 if (country == null) { 353 return false; 354 } 355 if (!StandardCodes.isCountry(country)) { 356 if (ADD_POP) { 357 System.out.println("Skipping worldbank info for: " + country); 358 } 359 return false; 360 } 361 double value; 362 try { 363 value = Double.parseDouble(last); 364 } catch (NumberFormatException e) { 365 throw new IllegalArgumentException("File changed format: need to modify code"); 366 } 367 if (seriesCode.equals(GCP)) { 368 worldbank_gdp.add(country, value); 369 } else if (seriesCode.equals(POP)) { 370 worldbank_population.add(country, value); 371 } else { 372 throw new IllegalArgumentException(); 373 } 374 return true; 375 } 376 }); 377 } 378 loadUnLiteracy()379 private static void loadUnLiteracy() throws IOException { 380 CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() { 381 @Override 382 public boolean handle(String line) { 383 // Afghanistan,2000, ,28,43,13,,34,51,18 384 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,," Youth (15-24) literacy rate",,,, 385 // ,,,Total,Men,Women,,Total,Men,Women 386 // "Albania",2008,,96,,97,,95,,99,,99,,99 387 String[] pieces = splitCommaSeparated(line); 388 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) { 389 return false; 390 } 391 String code = CountryCodeConverter.getCodeFromName(pieces[0], true); 392 if (code == null) { 393 return false; 394 } 395 if (!StandardCodes.isCountry(code)) { 396 if (ADD_POP) { 397 System.out.println("Skipping UN info for: " + code); 398 } 399 return false; 400 } 401 String totalLiteracy = pieces[3]; 402 if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) { 403 return true; 404 } 405 double percent = Double.parseDouble(totalLiteracy); 406 un_literacy.add(code, percent); 407 return true; 408 } 409 }); 410 } 411 412 static { 413 try { loadFactbookLiteracy()414 loadFactbookLiteracy(); loadUnLiteracy()415 loadUnLiteracy(); 416 417 loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp); 418 loadFactbookInfo("external/factbook_population.txt", factbook_population); 419 CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other)); 420 loadWorldBankInfo()421 loadWorldBankInfo(); 422 StandardCodes sc = StandardCodes.make(); 423 StringBuilder myErrors = new StringBuilder(); 424 for (String territory : sc.getGoodAvailableCodes("territory")) { 425 if (!StandardCodes.isCountry(territory)) { 426 continue; 427 } 428 double gdp = getGdp(territory); 429 double literacy = getLiteracy(territory); 430 double population = getPopulation(territory); 431 if (gdp == 0) { 432 // AX;Aland Islands;population;26,200;www.aland.ax 433 myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason"); 434 } 435 if (literacy == 0) { 436 myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason"); 437 } 438 if (population == 0) { 439 myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) 440 + ";population;0;reason"); 441 } 442 } 443 if (myErrors.length() != 0) { 444 throw new IllegalArgumentException( 445 "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:" 446 + myErrors); 447 } 448 } catch (IOException e) { 449 } 450 } 451 } 452