1 /*
2  *******************************************************************************
3  * Copyright (C) 2002-2012, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package org.unicode.cldr.util;
8 
9 import java.text.ParsePosition;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.Map;
13 import java.util.Set;
14 
15 import org.unicode.cldr.util.props.BagFormatter;
16 
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.SymbolTable;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.text.UnicodeMatcher;
21 import com.ibm.icu.text.UnicodeSet;
22 
23 public class Tokenizer {
    /** Text being tokenized; assigned by {@link #setSource(String)}. */
    protected String source;

    /** Characters of the most recent string token; read through {@link #getString()}. */
    protected StringBuffer buffer = new StringBuffer();
    /** Value of the most recent NUMBER token; read through {@link #getNumber()}. */
    protected long number;
    /** Set parsed for the most recent UNICODESET token; read through {@link #getUnicodeSet()}. */
    protected UnicodeSet unicodeSet = null;
    /** Current read position in {@link #source}, counted in UTF-16 code units. */
    protected int index;
    /** True between a call to {@link #backup()} and the following {@link #next()}. */
    boolean backedup = false;
    /**
     * Value of {@link #index} that next() records right after consuming the first
     * code point of a token; stays -1 until a token has been read.
     */
    protected int lastIndex = -1;
    /** Position saved by backup(); next() restores it when it replays the backed-up token. */
    protected int nextIndex;
    /** Result of the most recent next() call; this is what gets replayed after backup(). */
    int lastValue = BACKEDUP_TOO_FAR;
    /** Variable table handed to the UnicodeSet parser in next(); filled through addSymbol(). */
    TokenSymbolTable symbolTable = new TokenSymbolTable();
35 
    /** The two quoting characters: apostrophe opens/closes a quoted run, backslash escapes one character. */
    private static final char QUOTE = '\'',
        BSLASH = '\\';
    /** Both quoting characters as a set; they are always excluded from the syntax and whitespace sets. */
    private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
    /** Default set of characters skipped between tokens. */
    private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +
        "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +
        "]");
    /**
     * Default set of characters returned by next() as single-character tokens.
     * The quoting characters and '$' are removed from the listed ranges
     * (presumably so that $variable names tokenize as strings -- confirm).
     */
    private static final UnicodeSet SYNTAX = new UnicodeSet("[" +
        "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +
        "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +
        "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +
        "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +
        "\\u3001\\u3003\\u3008-\\u3020\\u3030" +
        "\\uFD3E\\uFD3F\\uFE45\\uFE46" +
        "]").removeAll(QUOTERS).remove('$');
    /** Characters that end a '#' comment in next(). */
    private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
    //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
    /** Union of the default whitespace and syntax sets: the characters that end an unquoted string. */
    private static final UnicodeSet NON_STRING = new UnicodeSet()
        .addAll(WHITESPACE)
        .addAll(SYNTAX);

    // Per-instance sets. They start out as the shared static defaults above and are
    // replaced through setWhiteSpace()/setSyntax(), which then call fixSets().
    // NOTE(review): getWhiteSpace()/getSyntax() hand out these references directly, so
    // mutating a returned default set would affect every tokenizer still using the
    // defaults -- confirm whether the defaults should be frozen.
    protected UnicodeSet whiteSpace = WHITESPACE;
    protected UnicodeSet syntax = SYNTAX;
    private UnicodeSet non_string = NON_STRING;
59 
fixSets()60     private void fixSets() {
61         if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {
62             syntax = ((UnicodeSet) syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);
63         }
64         if (whiteSpace.containsSome(QUOTERS)) {
65             whiteSpace = ((UnicodeSet) whiteSpace.clone()).removeAll(QUOTERS);
66         }
67         non_string = new UnicodeSet(syntax)
68             .addAll(whiteSpace);
69     }
70 
setSource(String source)71     public Tokenizer setSource(String source) {
72         this.source = source;
73         this.index = 0;
74         return this; // for chaining
75     }
76 
    /**
     * Moves the read position within the current source.
     * <p>
     * NOTE(review): while a backup() is pending, the following next() call
     * overwrites the index with the position saved by backup(), so a value set
     * here is discarded in that situation.
     *
     * @param index new read position, in UTF-16 code units; not range-checked here
     * @return this tokenizer, for chaining
     */
    public Tokenizer setIndex(int index) {
        this.index = index;
        return this; // for chaining
    }
81 
    /**
     * Token types returned by next(). All are negative; any other value returned
     * by next() is the code point of a syntax character.
     */
    public static final int DONE = -1, // end of source reached
        NUMBER = -2, // run of decimal digits; value via getNumber()
        STRING = -3, // quoted and/or unquoted text; value via getString()
        UNICODESET = -4, // bracketed set; value via getUnicodeSet()
        UNTERMINATED_QUOTE = -5, // source ended inside a quote or right after a backslash
        BACKEDUP_TOO_FAR = -6; // initial "last token" value, replayed if backup() precedes any next()

    /**
     * States of the string scanner in next(). The numeric order matters: next()
     * reports UNTERMINATED_QUOTE when the final state is greater than IN_STRING,
     * i.e. for AFTER_BSLASH and IN_QUOTE only.
     */
    private static final int
    //FIRST = 0,
    //IN_NUMBER = 1,
    //IN_SPACE = 2,
    AFTER_QUOTE = 3, // warning: order is important for switch statement
        IN_STRING = 4,
        AFTER_BSLASH = 5,
        IN_QUOTE = 6;
97 
toString(int type, boolean backedupBefore)98     public String toString(int type, boolean backedupBefore) {
99         String s = backedup ? "@" : "*";
100         switch (type) {
101         case DONE:
102             return s + "Done" + s;
103         case BACKEDUP_TOO_FAR:
104             return s + "Illegal Backup" + s;
105         case UNTERMINATED_QUOTE:
106             return s + "Unterminated Quote=" + getString() + s;
107         case STRING:
108             return s + "s=" + getString() + s;
109         case NUMBER:
110             return s + "n=" + getNumber() + s;
111         case UNICODESET:
112             return s + "n=" + getUnicodeSet() + s;
113         default:
114             return s + "c=" + usf.getName(type, true) + s;
115         }
116     }
117 
    /** Formatter used by toString(int, boolean) to produce a name for a syntax code point. */
    private static final BagFormatter usf = new BagFormatter();
119 
backup()120     public void backup() {
121         if (backedup) throw new IllegalArgumentException("backup too far");
122         backedup = true;
123         nextIndex = index;
124         index = lastIndex;
125     }
126 
127     /*
128     public int next2() {
129         boolean backedupBefore = backedup;
130         int result = next();
131         System.out.println(toString(result, backedupBefore));
132         return result;
133     }
134     */
135 
next()136     public int next() {
137         if (backedup) {
138             backedup = false;
139             index = nextIndex;
140             return lastValue;
141         }
142         int cp = 0;
143         boolean inComment = false;
144         // clean off any leading whitespace or comments
145         while (true) {
146             if (index >= source.length()) return lastValue = DONE;
147             cp = nextChar();
148             if (inComment) {
149                 if (NEWLINE.contains(cp)) inComment = false;
150             } else {
151                 if (cp == '#')
152                     inComment = true;
153                 else if (!whiteSpace.contains(cp)) break;
154             }
155         }
156         // record the last index in case we have to backup
157         lastIndex = index;
158 
159         if (cp == '[') {
160             ParsePosition pos = new ParsePosition(index - 1);
161             unicodeSet = new UnicodeSet(source, pos, symbolTable);
162             index = pos.getIndex();
163             return lastValue = UNICODESET;
164         }
165         // get syntax character
166         if (syntax.contains(cp)) return lastValue = cp;
167 
168         // get number, if there is one
169         if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
170             number = UCharacter.getNumericValue(cp);
171             while (index < source.length()) {
172                 cp = nextChar();
173                 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
174                     index -= UTF16.getCharCount(cp); // BACKUP!
175                     break;
176                 }
177                 number *= 10;
178                 number += UCharacter.getNumericValue(cp);
179             }
180             return lastValue = NUMBER;
181         }
182         buffer.setLength(0);
183         int status = IN_STRING;
184         main: while (true) {
185             switch (status) {
186             case AFTER_QUOTE: // check for double ''?
187                 if (cp == QUOTE) {
188                     UTF16.append(buffer, QUOTE);
189                     status = IN_QUOTE;
190                     break;
191                 }
192                 // OTHERWISE FALL THROUGH!!!
193             case IN_STRING:
194                 if (cp == QUOTE)
195                     status = IN_QUOTE;
196                 else if (cp == BSLASH)
197                     status = AFTER_BSLASH;
198                 else if (non_string.contains(cp)) {
199                     index -= UTF16.getCharCount(cp); // BACKUP!
200                     break main;
201                 } else
202                     UTF16.append(buffer, cp);
203                 break;
204             case IN_QUOTE:
205                 if (cp == QUOTE)
206                     status = AFTER_QUOTE;
207                 else
208                     UTF16.append(buffer, cp);
209                 break;
210             case AFTER_BSLASH:
211                 switch (cp) {
212                 case 'n':
213                     cp = '\n';
214                     break;
215                 case 'r':
216                     cp = '\r';
217                     break;
218                 case 't':
219                     cp = '\t';
220                     break;
221                 }
222                 UTF16.append(buffer, cp);
223                 status = IN_STRING;
224                 break;
225             default:
226                 throw new IllegalArgumentException("Internal Error");
227             }
228             if (index >= source.length()) break;
229             cp = nextChar();
230         }
231         if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;
232         return lastValue = STRING;
233     }
234 
getString()235     public String getString() {
236         return buffer.toString();
237     }
238 
toString()239     public String toString() {
240         return source.substring(0, index) + "$$$" + source.substring(index);
241     }
242 
getNumber()243     public long getNumber() {
244         return number;
245     }
246 
getUnicodeSet()247     public UnicodeSet getUnicodeSet() {
248         return unicodeSet;
249     }
250 
nextChar()251     private int nextChar() {
252         int cp = UTF16.charAt(source, index);
253         index += UTF16.getCharCount(cp);
254         return cp;
255     }
256 
getIndex()257     public int getIndex() {
258         return index;
259     }
260 
getSource()261     public String getSource() {
262         return source;
263     }
264 
getSyntax()265     public UnicodeSet getSyntax() {
266         return syntax;
267     }
268 
getWhiteSpace()269     public UnicodeSet getWhiteSpace() {
270         return whiteSpace;
271     }
272 
setSyntax(UnicodeSet set)273     public void setSyntax(UnicodeSet set) {
274         syntax = set;
275         fixSets();
276     }
277 
setWhiteSpace(UnicodeSet set)278     public void setWhiteSpace(UnicodeSet set) {
279         whiteSpace = set;
280         fixSets();
281     }
282 
getLookedUpItems()283     public Set getLookedUpItems() {
284         return symbolTable.itemsLookedUp;
285     }
286 
addSymbol(String var, String value, int start, int limit)287     public void addSymbol(String var, String value, int start, int limit) {
288         // the limit is after the ';', so remove it
289         --limit;
290         char[] body = new char[limit - start];
291         value.getChars(start, limit, body, 0);
292         symbolTable.add(var, body);
293     }
294 
295     public class TokenSymbolTable implements SymbolTable {
296         Map contents = new HashMap();
297         Set itemsLookedUp = new HashSet();
298 
add(String var, char[] body)299         public void add(String var, char[] body) {
300             // start from 1 to avoid the $
301             contents.put(var.substring(1), body);
302         }
303 
304         /* (non-Javadoc)
305          * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
306          */
lookup(String s)307         public char[] lookup(String s) {
308             itemsLookedUp.add('$' + s);
309             return (char[]) contents.get(s);
310         }
311 
312         /* (non-Javadoc)
313          * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
314          */
lookupMatcher(int ch)315         public UnicodeMatcher lookupMatcher(int ch) {
316             // TODO Auto-generated method stub
317             return null;
318         }
319 
320         /* (non-Javadoc)
321          * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
322          */
parseReference(String text, ParsePosition pos, int limit)323         public String parseReference(String text, ParsePosition pos, int limit) {
324             int cp;
325             int start = pos.getIndex();
326             int i;
327             for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
328                 cp = UTF16.charAt(text, i);
329                 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
330                     break;
331                 }
332             }
333             pos.setIndex(i);
334             return text.substring(start, i);
335         }
336 
337     }
338 }
339