package org.unicode.cldr.tool;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.CldrUtility.LineHandler;
import org.unicode.cldr.util.Counter2;
import org.unicode.cldr.util.StandardCodes;

import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

/**
 * Loads per-country population, GDP (PPP) and literacy figures from the data files under
 * {@code external/} and exposes them through {@link #getPopulation(String)},
 * {@link #getGdp(String)} and {@link #getLiteracy(String)}.
 *
 * <p>All data is loaded once, in the static initializer at the bottom of this class. For each
 * lookup the sources are consulted in a fixed order (factbook first, then World Bank / UN, then
 * {@code external/other_country_data.txt}) and the first non-zero value wins.
 */
public class AddPopulationData {
    static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
    static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);

    /** Columns of a World Bank CSV line, in file order. */
    enum WBLine {
        // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",

        Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017;

        /**
         * Returns this column's value from a split line, or {@link AddPopulationData#EMPTY} when
         * the line has fewer columns than this constant's position.
         */
        String get(String[] pieces) {
            return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY;
        }
    }

    /** Columns of a factbook GDP / population line, in file order. */
    enum FBLine {
        Rank, Country, Value, Year;

        /** Returns this column's value from a split line (no bounds check). */
        String get(String[] pieces) {
            return pieces[ordinal()];
        }
    }

    /** Columns of a factbook literacy line, in file order. */
    enum FBLiteracy {
        Rank, Country, Percent;

        /** Returns this column's value from a split line (no bounds check). */
        String get(String[] pieces) {
            return pieces[ordinal()];
        }
    }

    private static final String GCP = "NY.GNP.MKTP.PP.CD";
    private static final String POP = "SP.POP.TOTL";
    /** Marker used in the World Bank file for "no value". */
    private static final String EMPTY = "..";
    private static Counter2<String> worldbank_gdp = new Counter2<String>();
    private static Counter2<String> worldbank_population = new Counter2<String>();
    private static Counter2<String> un_literacy = new Counter2<String>();

    private static Counter2<String> factbook_gdp = new Counter2<String>();
    private static Counter2<String> factbook_population = new Counter2<String>();
    private static Counter2<String> factbook_literacy = new Counter2<String>();

    private static CountryData other = new CountryData();

    /**
     * Values read from {@code external/other_country_data.txt}.
     *
     * <p>NOTE(review): the counters are {@code static}, so every instance shares the same data and
     * the rest of this file accesses them through the class name, not through an instance.
     */
    static class CountryData {
        private static Counter2<String> population = new Counter2<String>();
        private static Counter2<String> gdp = new Counter2<String>();
        private static Counter2<String> literacy = new Counter2<String>();
    }

    /**
     * Prints a tab-separated table of the loaded data for every "good" country, fails if any
     * source contains data keyed by a code that is not a country, and optionally (when
     * {@code SHOW_ALTERNATE_NAMES} is set) lists source names that differ from ICU's English
     * display name.
     *
     * @param args ignored
     * @throws IOException declared for callers; nothing in this method body throws it directly
     * @throws IllegalArgumentException if any source has data for a non-country code
     */
    public static void main(String[] args) throws IOException {

        System.out.println("Code"
            + "\t" + "Name"
            + "\t" + "Pop"
            + "\t" + "GDP-PPP"
            + "\t" + "UN Literacy");

        for (String country : StandardCodes.make().getGoodCountries()) {
            showCountryData(country);
        }

        // Collect every key seen in any source, then drop the ones that are real countries;
        // whatever remains is data attached to an unexpected code.
        Set<String> outliers = new TreeSet<String>();
        outliers.addAll(factbook_population.keySet());
        outliers.addAll(worldbank_population.keySet());
        outliers.addAll(factbook_gdp.keySet());
        outliers.addAll(worldbank_gdp.keySet());
        outliers.addAll(un_literacy.keySet());
        for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
            if (StandardCodes.isCountry(it.next())) {
                it.remove();
            }
        }
        // outliers.remove("AN");
        if (outliers.size() != 0) {
            System.out.println("Mistakes: data for non-UN codes");
            for (String country : outliers) {
                showCountryData(country);
            }
            throw new IllegalArgumentException("Mistakes: data for non-country codes");
        }

        // Each entry is "code<TAB>source name<TAB>ICU name"; the TreeSet groups entries by code.
        Set<String> altNames = new TreeSet<String>();
        for (String display : CountryCodeConverter.names()) {
            String code = CountryCodeConverter.getCodeFromName(display);
            String icu = ULocale.getDisplayCountry("und-" + code, "en");
            if (!display.equalsIgnoreCase(icu)) {
                altNames.add(code + "\t" + display + "\t" + icu);
            }
        }
        if (SHOW_ALTERNATE_NAMES) {
            String oldCode = "";
            for (String altName : altNames) {
                String[] pieces = altName.split("\t");
                String code = pieces[0];
                if (code.equals("ZZ")) {
                    continue;
                }
                if (!code.equals(oldCode)) {
                    // blank line between groups of alternate names for different codes
                    oldCode = code;
                    System.out.println();
                }
                System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
                // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
                // "</territory> <!-- " + pieces[2] + " -->");
            }
        }
    }

    /** Prints one tab-separated row (code, English name, population, GDP, literacy) to stdout. */
    private static void showCountryData(String country) {
        number.setMaximumFractionDigits(0);
        System.out.println(country
            + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
            + "\t" + number.format(getPopulation(country))
            + "\t" + number.format(getGdp(country))
            + "\t" + percent.format(getLiteracy(country) / 100));
    }

    /**
     * Returns the literacy value for a country code: the first non-zero value among the factbook,
     * UN and other-country-data sources, or 0.0 if all are zero.
     */
    public static Double getLiteracy(String country) {
        return firstNonZero(factbook_literacy.getCount(country),
            un_literacy.getCount(country),
            CountryData.literacy.getCount(country));
    }

    /**
     * Returns the GDP value for a country code: the first non-zero value among the factbook,
     * World Bank and other-country-data sources, or 0.0 if all are zero.
     */
    public static Double getGdp(String country) {
        return firstNonZero(factbook_gdp.getCount(country),
            worldbank_gdp.getCount(country),
            CountryData.gdp.getCount(country));
    }

    /**
     * Returns the population for a country code: the first non-zero value among the factbook,
     * World Bank and other-country-data sources, or 0.0 if all are zero.
     */
    public static Double getPopulation(String country) {
        return firstNonZero(factbook_population.getCount(country),
            worldbank_population.getCount(country),
            CountryData.population.getCount(country));
    }

    /** Returns the first argument whose value is not zero, or 0.0 if there is none. */
    private static Double firstNonZero(Double... items) {
        for (Double item : items) {
            if (item.doubleValue() != 0) {
                return item;
            }
        }
        return 0.0;
    }

    /**
     * Splits one CSV line into its fields.
     *
     * <ul>
     * <li>items are separated by ','</li>
     * <li>each item is of the form abc... or "..." (required if a comma or quote is contained)</li>
     * <li>" in a quoted field is represented by ""</li>
     * </ul>
     *
     * @param line a single line of comma-separated text
     * @return the fields in order, with enclosing quotes removed; always at least one element
     */
    static String[] splitCommaSeparated(String line) {
        List<String> result = new ArrayList<String>();
        StringBuilder item = new StringBuilder();
        boolean inQuote = false;
        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i); // don't worry about supplementaries
            switch (ch) {
            case '"':
                if (inQuote && i + 1 < line.length() && line.charAt(i + 1) == '"') {
                    // A doubled quote inside a quoted field is an escaped literal quote.
                    // Handling it by lookahead keeps it even when it is the first character
                    // of the field (e.g. """a" is the field "a).
                    item.append('"');
                    ++i;
                    break;
                }
                inQuote = !inQuote;
                // At the start or end of a field, toggling is enough.
                // A quote that opens in the middle of an unquoted field is kept literally.
                if (inQuote && item.length() != 0) {
                    item.append('"');
                }
                break;
            case ',':
                if (!inQuote) {
                    result.add(item.toString());
                    item.setLength(0);
                } else {
                    item.append(ch);
                }
                break;
            default:
                item.append(ch);
                break;
            }
        }
        result.add(item.toString());
        return result.toArray(new String[result.size()]);
    }

    /**
     * Reads a factbook file whose columns are separated by runs of two or more whitespace
     * characters and adds each country's value to the given counter.
     *
     * <p>Despite the parameter name and the "Factbook gdp" log text, this is called for both the
     * GDP and the population file.
     *
     * @param filename file passed to {@link CldrUtility#handleFile}
     * @param factbookGdp counter that receives (country code, value)
     * @throws IOException if the file cannot be read
     */
    private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
        CldrUtility.handleFile(filename, new LineHandler() {
            @Override
            public boolean handle(String line) {
                // skip blank lines and the header / boilerplate lines of the file
                if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
                    || line.startsWith(" This file")) {
                    return false;
                }
                String[] pieces = line.split("\\s{2,}");
                String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces));
                if (code == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(code)) {
                    if (ADD_POP) {
                        System.out.println("Skipping factbook info for: " + code);
                    }
                    return false;
                }
                code = code.toUpperCase(Locale.ENGLISH);
                // values look like "$1,234,567" or "1,234,567"
                String valueString = FBLine.Value.get(pieces).trim();
                if (valueString.startsWith("$")) {
                    valueString = valueString.substring(1);
                }
                valueString = valueString.replace(",", "");
                double value = Double.parseDouble(valueString.trim());
                factbookGdp.add(code, value);
                if (ADD_POP) {
                    System.out.println("Factbook gdp:\t" + code + "\t" + value);
                }
                return true;
            }
        });
    }

    static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
    static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
    static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);

    /**
     * Handles the lines of {@code external/other_country_data.txt}, whose format is
     * {@code Code;Name;Type;Data;Source}. The Type is one of {@code gdp-ppp}, {@code population}
     * or {@code literacy}. For gdp-ppp and literacy the Data field may be another country's code,
     * in which case the value is derived from that country's data.
     */
    static class MyLineHandler implements LineHandler {
        CountryData countryData;

        public MyLineHandler(CountryData countryData) {
            super();
            this.countryData = countryData;
        }

        /**
         * Processes one line.
         *
         * @return {@code true} for comment lines, blank lines and data lines; {@code false} for
         *         the header line whose first field is "Code"
         * @throws ParseException if a numeric Data field cannot be parsed
         * @throws IllegalArgumentException on an unknown Type, on a population that refers to
         *         another country, or when a derived GDP would need a zero population or GDP
         */
        @Override
        public boolean handle(String line) throws ParseException {
            if (line.startsWith("#")) {
                return true;
            }
            if (line.length() == 0) {
                return true;
            }
            String[] pieces = line.split(";");
            final String code = pieces[0].trim();
            if (code.equals("Code")) {
                return false;
            }
            // Code;Name;Type;Data;Source
            final String typeString = pieces[2].trim();
            final String data = pieces[3].trim();
            if (typeString.equals("gdp-ppp")) {
                if (StandardCodes.isCountry(data)) {
                    // Data names another country: scale that country's GDP by the population ratio.
                    Double otherPop = getPopulation(data);
                    Double otherGdp = getGdp(data);
                    Double myPop = getPopulation(code);
                    if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
                        throw new IllegalArgumentException("Zero population or GDP while deriving gdp-ppp for "
                            + code + " from " + data
                            + ": myPop=" + myPop + ", otherPop=" + otherPop + ", otherGdp=" + otherGdp);
                    }
                    CountryData.gdp.add(code, otherGdp * myPop / otherPop);
                } else {
                    CountryData.gdp.add(code, dollars.parse(data).doubleValue());
                }
            } else if (typeString.equals("population")) {
                if (StandardCodes.isCountry(data)) {
                    throw new IllegalArgumentException("Population can't use other country's");
                }
                CountryData.population.add(code, number.parse(data).doubleValue());
            } else if (typeString.equals("literacy")) {
                if (StandardCodes.isCountry(data)) {
                    // Data names another country: reuse its literacy value unchanged.
                    Double otherLiteracy = getLiteracy(data);
                    CountryData.literacy.add(code, otherLiteracy);
                } else {
                    CountryData.literacy.add(code, number.parse(data).doubleValue());
                }
            } else {
                throw new IllegalArgumentException("Illegal type: " + typeString);
            }
            return true;
        }
    }

    static final UnicodeSet DIGITS = (UnicodeSet) new UnicodeSet("[:Nd:]").freeze();

    /**
     * Reads the tab-separated {@code external/factbook_literacy.txt} file into
     * {@code factbook_literacy}.
     *
     * @throws IOException if the file cannot be read
     */
    private static void loadFactbookLiteracy() throws IOException {
        final String filename = "external/factbook_literacy.txt";
        CldrUtility.handleFile(filename, new LineHandler() {
            @Override
            public boolean handle(String line) {
                String[] pieces = line.split("\\t");
                String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces));
                if (code == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(code)) {
                    if (ADD_POP) {
                        System.out.println("Skipping factbook literacy for: " + code);
                    }
                    return false;
                }
                code = code.toUpperCase(Locale.ENGLISH);
                String valueString = FBLiteracy.Percent.get(pieces).trim();
                double percent = Double.parseDouble(valueString);
                factbook_literacy.put(code, percent);
                if (ADD_POP) {
                    System.out.println("Factbook literacy:\t" + code + "\t" + percent);
                }
                return true;
            }
        });
    }

    /**
     * Reads {@code external/world_bank_data.csv} and records, for each country and series, the
     * last non-empty yearly value on the line (the columns from YR2000 onward are scanned in
     * order and the latest one that has a value wins).
     *
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException if a value is not numeric or the series code is unknown
     */
    private static void loadWorldBankInfo() throws IOException {
        final String filename = "external/world_bank_data.csv";

        // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));

        CldrUtility.handleFile(filename, new LineHandler() {
            @Override
            public boolean handle(String line) {
                if (line.contains("Series Code")) {
                    return false; // header line
                }
                String[] pieces = splitCommaSeparated(line);

                // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");

                final String seriesCode = WBLine.Series_Code.get(pieces);

                // Find the most recent year column that actually has a value.
                String last = null;
                for (WBLine i : WBLine.values()) {
                    if (i.compareTo(WBLine.YR2000) >= 0) {
                        String current = i.get(pieces);
                        if (current.length() != 0 && !current.equals(EMPTY)) {
                            last = current;
                        }
                    }
                }
                if (last == null) {
                    return false;
                }
                String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces));
                if (country == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(country)) {
                    if (ADD_POP) {
                        System.out.println("Skipping worldbank info for: " + country);
                    }
                    return false;
                }
                double value;
                try {
                    value = Double.parseDouble(last);
                } catch (NumberFormatException e) {
                    // keep the cause so the offending text is visible in the stack trace
                    throw new IllegalArgumentException("File changed format: need to modify code", e);
                }
                if (seriesCode.equals(GCP)) {
                    worldbank_gdp.add(country, value);
                } else if (seriesCode.equals(POP)) {
                    worldbank_population.add(country, value);
                } else {
                    throw new IllegalArgumentException("Unexpected World Bank series code: " + seriesCode);
                }
                return true;
            }
        });
    }

    /**
     * Reads {@code external/un_literacy.csv} and records the value in the fourth column (the
     * adult "Total" column, per the sample header below) for each country into
     * {@code un_literacy}.
     *
     * @throws IOException if the file cannot be read
     */
    private static void loadUnLiteracy() throws IOException {
        CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
            @Override
            public boolean handle(String line) {
                // Afghanistan,2000, ,28,43,13,,34,51,18
                // "Country or area","Year",,"Adult (15+) literacy rate",,,,,," Youth (15-24) literacy rate",,,,
                // ,,,Total,Men,Women,,Total,Men,Women
                // "Albania",2008,,96,,97,,95,,99,,99,,99
                String[] pieces = splitCommaSeparated(line);
                // Data lines have exactly 14 fields and an all-digit year in the second field.
                if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
                    return false;
                }
                String code = CountryCodeConverter.getCodeFromName(pieces[0]);
                if (code == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(code)) {
                    if (ADD_POP) {
                        System.out.println("Skipping UN info for: " + code);
                    }
                    return false;
                }
                String totalLiteracy = pieces[3];
                // placeholder characters or an empty field mean "no value"
                if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) {
                    return true;
                }
                double percent = Double.parseDouble(totalLiteracy);
                un_literacy.add(code, percent);
                return true;
            }
        });
    }

    // Load every source once, then verify that each country territory has a non-zero GDP,
    // literacy and population. This block must stay after the static fields it uses
    // (number, dollars, DIGITS, ...) because static initialization runs in textual order.
    static {
        try {
            loadFactbookLiteracy();
            loadUnLiteracy();

            loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
            loadFactbookInfo("external/factbook_population.txt", factbook_population);
            CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));

            loadWorldBankInfo();
            StandardCodes sc = StandardCodes.make();
            StringBuilder myErrors = new StringBuilder();
            for (String territory : sc.getGoodAvailableCodes("territory")) {
                if (!StandardCodes.isCountry(territory)) {
                    continue;
                }
                double gdp = getGdp(territory);
                double literacy = getLiteracy(territory);
                double population = getPopulation(territory);
                // Each message is a template line in the other_country_data format, e.g.
                // AX;Aland Islands;population;26,200;www.aland.ax
                if (gdp == 0) {
                    myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
                }
                if (literacy == 0) {
                    myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
                }
                if (population == 0) {
                    myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
                        + ";population;0;reason");
                }
            }
            if (myErrors.length() != 0) {
                throw new IllegalArgumentException(
                    "Missing Country values, the following and add to external/other_country_data to fix:"
                        + myErrors);
            }
        } catch (IOException e) {
            // Do not swallow: a missing or unreadable data file would otherwise leave every
            // lookup silently returning 0 and skip the missing-data check above.
            throw new IllegalStateException("Failed to load country data files", e);
        }
    }
}