1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.IOException; 5 import java.io.PrintWriter; 6 import java.util.Arrays; 7 import java.util.regex.Matcher; 8 import java.util.regex.Pattern; 9 10 import org.unicode.cldr.draft.FileUtilities; 11 import org.unicode.cldr.util.CLDRPaths; 12 import org.unicode.cldr.util.CldrUtility; 13 import org.unicode.cldr.util.SimpleHtmlParser; 14 import org.unicode.cldr.util.SimpleHtmlParser.Type; 15 16 import com.ibm.icu.util.ICUUncheckedIOException; 17 18 /** 19 * Run this code to extract the ISO currency data from a file. 20 * Use -Dinput=xxx for the input file, and -Doutput=xxx for the output file 21 * 22 * @author markdavis 23 */ 24 public class ExtractIsoCurrencyData { 25 private static final boolean VERBOSE = true; 26 27 private static final Matcher HAS_DATE = Pattern.compile( 28 "last modified.*([0-9]{4}-[0-9]{2}-[0-9]{2})", 29 Pattern.DOTALL).matcher(""); 30 main(String[] args)31 public static void main(String[] args) throws IOException { 32 final String inputFile = CldrUtility.getProperty("input"); 33 BufferedReader in; 34 if (inputFile == null) { 35 in = CldrUtility.getUTF8Data("currency_codes_list-1.htm"); 36 } else { 37 in = FileUtilities.openUTF8Reader("", inputFile); 38 } 39 // NOTE: UTIL_DATA_DIR is required here because it is used as an output directory. 40 final String outputFile = CldrUtility.getProperty("output", CLDRPaths.UTIL_DATA_DIR 41 + "/currencycodeslist.txt"); 42 PrintWriter out = FileUtilities.openUTF8Writer(null, outputFile); 43 try { 44 String version = null; 45 String[][] parts = new String[5][5]; 46 int count = 0; 47 48 //boolean inContent = false; 49 // if the table level is 1 (we are in the main table), then we look for <td>...</td><td>...</td>. That means 50 // that we have column 1 and column 2. 51 52 SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in); 53 StringBuilder result = new StringBuilder(); 54 boolean hadPop = false; 55 int column = -1; 56 int row = -1; 57 main: while (true) { 58 Type x = simple.next(result); 59 // System.out.println(x + "\t" + result); 60 switch (x) { 61 case ELEMENT: // with /table we pop the count 62 if (SimpleHtmlParser.equals("tr", result)) { 63 if (hadPop) { 64 for (int i = 0; i < parts.length; ++i) { 65 boolean empty = true; 66 for (int j = 0; j < parts[i].length; ++j) { 67 parts[i][j] = parts[i][j].replace(" ", " "); 68 parts[i][j] = parts[i][j].replace("\u2020", " "); 69 parts[i][j] = parts[i][j].replace("\u2021", " "); 70 parts[i][j] = parts[i][j].replace("\u00A0", " "); 71 parts[i][j] = parts[i][j].trim(); 72 empty &= parts[i][j].length() == 0; 73 } 74 if (empty) { 75 continue; 76 } 77 if (parts[i][0].length() == 0) { 78 parts[i][0] = i == 0 ? "ZZ" : parts[0][0]; // hack because of iso format 79 } else if (parts[i][0].equals("Entity")) { 80 continue; 81 } 82 if (parts[i][1].equals("Special settlement currencies")) { 83 continue; 84 } else if (parts[i][1].equals("No universal currency")) { 85 parts[i][2] = "XXX"; 86 parts[i][3] = "999"; 87 } 88 // fix numbers to match old style 89 if (VERBOSE) 90 System.out.println("\tDATA: " + Arrays.asList(parts[i])); 91 int num = parts[i][3].equals("Nil") ? -1 : Integer.parseInt(parts[i][3]); 92 parts[i][3] = String.valueOf(num); 93 out.println(CldrUtility.join(parts[i], "\t").trim()); 94 count++; 95 // Data data = new Data(country, parts[i][1], parts[i][3]); 96 // codeList.put(parts[i][2], data); 97 } 98 column = -1; 99 row = -1; 100 } else { 101 column = 0; 102 row = 0; 103 for (int i = 0; i < parts.length; ++i) { 104 for (int j = 0; j < parts[i].length; ++j) { 105 parts[i][j] = ""; 106 } 107 } 108 } 109 } else if (SimpleHtmlParser.equals("td", result) 110 || SimpleHtmlParser.equals("th", result)) { 111 if (hadPop) { 112 column++; 113 row = 0; 114 } 115 } else if (SimpleHtmlParser.equals("br", result)) { // because ISO has screwy format 116 row++; 117 } 118 break; 119 case ELEMENT_CONTENT: 120 if (column >= 0) { 121 parts[row][column] += result; 122 } 123 break; 124 case QUOTE: 125 if (HAS_DATE.reset(result).find()) { 126 version = HAS_DATE.group(1); 127 } 128 break; 129 case ELEMENT_POP: 130 hadPop = true; 131 break; 132 case ELEMENT_START: 133 hadPop = false; 134 break; 135 case DONE: 136 break main; 137 case ELEMENT_END: 138 case ATTRIBUTE: 139 case ATTRIBUTE_CONTENT: 140 break; // for debugging 141 } 142 } 143 in.close(); 144 if (version == null) { 145 throw new IllegalArgumentException("Missing version; ISO file format probably changed."); 146 } 147 if (count < 50) { 148 throw new IllegalArgumentException("Data too small; ISO file format probably changed."); 149 } 150 out.println("Last modified " + version); 151 } catch (IOException e) { 152 throw new ICUUncheckedIOException("Can't read currency file " + e.getMessage(), e); 153 } 154 out.close(); 155 } 156 157 /** 158 * Was code to check when we moved from flat file to html to alert on differences. Not necessary any more. 159 * 160 * @throws IOException 161 */ 162 // public void CheckISOCurrencyParser() throws IOException { 163 // Relation<String, Data> codeList = new Relation(new TreeMap(), TreeSet.class, null); 164 // Relation<String, Data> codeListHtml = new Relation(new TreeMap(), TreeSet.class, null); 165 // 166 // String version = IsoCurrencyParser.getFlatList(codeList); 167 // String versionHtml = IsoCurrencyParser.getHtmlList(codeListHtml); // getFlatList 168 // assertEquals("Versions don't match", version, versionHtml); 169 // Set<String> keys = new TreeSet(codeList.keySet()); 170 // keys.addAll(codeListHtml.keySet()); 171 // for (String key : keys) { 172 // Set<Data> flat = codeList.getAll(key); 173 // Set<Data> html = codeListHtml.getAll(key); 174 // if (flat == null || !flat.equals(html)) { 175 // if (flat != null) { 176 // Set inFlatOnly = new TreeSet(flat); 177 // if (html != null) inFlatOnly.removeAll(html); 178 // if (inFlatOnly.size() != 0) errln(key + "\t\tflat: " + inFlatOnly); 179 // } 180 // if (html != null) { 181 // Set inHtmlOnly = new TreeSet(html); 182 // if (flat != null) inHtmlOnly.removeAll(flat); 183 // if (inHtmlOnly.size() != 0) errln("\t" + key + "\thtml: " + inHtmlOnly); 184 // } 185 // } 186 // } 187 // System.out.println(codeList); 188 // } 189 } 190