1 package org.unicode.cldr.tool;
2 
3 import java.io.BufferedReader;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.util.Arrays;
7 import java.util.regex.Matcher;
8 import java.util.regex.Pattern;
9 
10 import org.unicode.cldr.draft.FileUtilities;
11 import org.unicode.cldr.util.CLDRPaths;
12 import org.unicode.cldr.util.CldrUtility;
13 import org.unicode.cldr.util.SimpleHtmlParser;
14 import org.unicode.cldr.util.SimpleHtmlParser.Type;
15 
16 import com.ibm.icu.util.ICUUncheckedIOException;
17 
18 /**
19  * Run this code to extract the ISO currency data from a file.
20  * Use -Dinput=xxx for the input file, and -Doutput=xxx for the output file
21  *
22  * @author markdavis
23  */
24 public class ExtractIsoCurrencyData {
25     private static final boolean VERBOSE = true;
26 
27     private static final Matcher HAS_DATE = Pattern.compile(
28         "last modified.*([0-9]{4}-[0-9]{2}-[0-9]{2})",
29         Pattern.DOTALL).matcher("");
30 
main(String[] args)31     public static void main(String[] args) throws IOException {
32         final String inputFile = CldrUtility.getProperty("input");
33         BufferedReader in;
34         if (inputFile == null) {
35             in = CldrUtility.getUTF8Data("currency_codes_list-1.htm");
36         } else {
37             in = FileUtilities.openUTF8Reader("", inputFile);
38         }
39         // NOTE: UTIL_DATA_DIR is required here because it is used as an output directory.
40         final String outputFile = CldrUtility.getProperty("output", CLDRPaths.UTIL_DATA_DIR
41             + "/currencycodeslist.txt");
42         PrintWriter out = FileUtilities.openUTF8Writer(null, outputFile);
43         try {
44             String version = null;
45             String[][] parts = new String[5][5];
46             int count = 0;
47 
48             //boolean inContent = false;
49             // if the table level is 1 (we are in the main table), then we look for <td>...</td><td>...</td>. That means
50             // that we have column 1 and column 2.
51 
52             SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in);
53             StringBuilder result = new StringBuilder();
54             boolean hadPop = false;
55             int column = -1;
56             int row = -1;
57             main: while (true) {
58                 Type x = simple.next(result);
59                 // System.out.println(x + "\t" + result);
60                 switch (x) {
61                 case ELEMENT: // with /table we pop the count
62                     if (SimpleHtmlParser.equals("tr", result)) {
63                         if (hadPop) {
64                             for (int i = 0; i < parts.length; ++i) {
65                                 boolean empty = true;
66                                 for (int j = 0; j < parts[i].length; ++j) {
67                                     parts[i][j] = parts[i][j].replace("&nbsp;", " ");
68                                     parts[i][j] = parts[i][j].replace("\u2020", " ");
69                                     parts[i][j] = parts[i][j].replace("\u2021", " ");
70                                     parts[i][j] = parts[i][j].replace("\u00A0", " ");
71                                     parts[i][j] = parts[i][j].trim();
72                                     empty &= parts[i][j].length() == 0;
73                                 }
74                                 if (empty) {
75                                     continue;
76                                 }
77                                 if (parts[i][0].length() == 0) {
78                                     parts[i][0] = i == 0 ? "ZZ" : parts[0][0]; // hack because of iso format
79                                 } else if (parts[i][0].equals("Entity")) {
80                                     continue;
81                                 }
82                                 if (parts[i][1].equals("Special settlement currencies")) {
83                                     continue;
84                                 } else if (parts[i][1].equals("No universal currency")) {
85                                     parts[i][2] = "XXX";
86                                     parts[i][3] = "999";
87                                 }
88                                 // fix numbers to match old style
89                                 if (VERBOSE)
90                                     System.out.println("\tDATA: " + Arrays.asList(parts[i]));
91                                 int num = parts[i][3].equals("Nil") ? -1 : Integer.parseInt(parts[i][3]);
92                                 parts[i][3] = String.valueOf(num);
93                                 out.println(CldrUtility.join(parts[i], "\t").trim());
94                                 count++;
95                                 // Data data = new Data(country, parts[i][1], parts[i][3]);
96                                 // codeList.put(parts[i][2], data);
97                             }
98                             column = -1;
99                             row = -1;
100                         } else {
101                             column = 0;
102                             row = 0;
103                             for (int i = 0; i < parts.length; ++i) {
104                                 for (int j = 0; j < parts[i].length; ++j) {
105                                     parts[i][j] = "";
106                                 }
107                             }
108                         }
109                     } else if (SimpleHtmlParser.equals("td", result)
110                         || SimpleHtmlParser.equals("th", result)) {
111                         if (hadPop) {
112                             column++;
113                             row = 0;
114                         }
115                     } else if (SimpleHtmlParser.equals("br", result)) { // because ISO has screwy format
116                         row++;
117                     }
118                     break;
119                 case ELEMENT_CONTENT:
120                     if (column >= 0) {
121                         parts[row][column] += result;
122                     }
123                     break;
124                 case QUOTE:
125                     if (HAS_DATE.reset(result).find()) {
126                         version = HAS_DATE.group(1);
127                     }
128                     break;
129                 case ELEMENT_POP:
130                     hadPop = true;
131                     break;
132                 case ELEMENT_START:
133                     hadPop = false;
134                     break;
135                 case DONE:
136                     break main;
137                 case ELEMENT_END:
138                 case ATTRIBUTE:
139                 case ATTRIBUTE_CONTENT:
140                     break; // for debugging
141                 }
142             }
143             in.close();
144             if (version == null) {
145                 throw new IllegalArgumentException("Missing version; ISO file format probably changed.");
146             }
147             if (count < 50) {
148                 throw new IllegalArgumentException("Data too small; ISO file format probably changed.");
149             }
150             out.println("Last modified " + version);
151         } catch (IOException e) {
152             throw new ICUUncheckedIOException("Can't read currency file " + e.getMessage(), e);
153         }
154         out.close();
155     }
156 
157     /**
158      * Was code to check when we moved from flat file to html to alert on differences. Not necessary any more.
159      *
160      * @throws IOException
161      */
162     // public void CheckISOCurrencyParser() throws IOException {
163     // Relation<String, Data> codeList = new Relation(new TreeMap(), TreeSet.class, null);
164     // Relation<String, Data> codeListHtml = new Relation(new TreeMap(), TreeSet.class, null);
165     //
166     // String version = IsoCurrencyParser.getFlatList(codeList);
167     // String versionHtml = IsoCurrencyParser.getHtmlList(codeListHtml); // getFlatList
168     // assertEquals("Versions don't match", version, versionHtml);
169     // Set<String> keys = new TreeSet(codeList.keySet());
170     // keys.addAll(codeListHtml.keySet());
171     // for (String key : keys) {
172     // Set<Data> flat = codeList.getAll(key);
173     // Set<Data> html = codeListHtml.getAll(key);
174     // if (flat == null || !flat.equals(html)) {
175     // if (flat != null) {
176     // Set inFlatOnly = new TreeSet(flat);
177     // if (html != null) inFlatOnly.removeAll(html);
178     // if (inFlatOnly.size() != 0) errln(key + "\t\tflat: " + inFlatOnly);
179     // }
180     // if (html != null) {
181     // Set inHtmlOnly = new TreeSet(html);
182     // if (flat != null) inHtmlOnly.removeAll(flat);
183     // if (inHtmlOnly.size() != 0) errln("\t" + key + "\thtml: " + inHtmlOnly);
184     // }
185     // }
186     // }
187     // System.out.println(codeList);
188     // }
189 }
190