1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 **********************************************************************
5 * Copyright (c) 2004-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 * Author: Alan Liu
9 * Created: March 16 2004
10 * Since: ICU 3.0
11 **********************************************************************
12 */
13 package com.ibm.icu.impl.data;
14 
15 import java.io.IOException;
16 
17 import com.ibm.icu.impl.PatternProps;
18 import com.ibm.icu.impl.Utility;
19 import com.ibm.icu.text.UTF16;
20 
21 /**
22  * An iterator class that returns successive string tokens from some
23  * source.  String tokens are, in general, separated by Pattern_White_Space
24  * in the source test.  Furthermore, they may be delimited by
25  * either single or double quotes (opening and closing quotes must
26  * match).  Escapes are processed using standard ICU unescaping.
27  *
28  * <p>2015-sep-03 TODO: Only used in com.ibm.icu.dev.test.format, move there.
29  */
30 public class TokenIterator {
31 
32     private ResourceReader reader;
33     private String line;
34     private StringBuffer buf;
35     private boolean done;
36     private int pos;
37     private int lastpos;
38 
39     /**
40      * Construct an iterator over the tokens returned by the given
41      * ResourceReader, ignoring blank lines and comment lines (first
42      * non-blank character is '#').  Note that trailing comments on a
43      * line, beginning with the first unquoted '#', are recognized.
44      */
TokenIterator(ResourceReader r)45     public TokenIterator(ResourceReader r) {
46         reader = r;
47         line = null;
48         done = false;
49         buf = new StringBuffer();
50         pos = lastpos = -1;
51     }
52 
53     /**
54      * Return the next token from this iterator, or null if the last
55      * token has been returned.
56      */
next()57     public String next() throws IOException {
58         if (done) {
59             return null;
60         }
61         for (;;) {
62             if (line == null) {
63                 line = reader.readLineSkippingComments();
64                 if (line == null) {
65                     done = true;
66                     return null;
67                 }
68                 pos = 0;
69             }
70             buf.setLength(0);
71             lastpos = pos;
72             pos = nextToken(pos);
73             if (pos < 0) {
74                 line = null;
75                 continue;
76             }
77             return buf.toString();
78         }
79     }
80 
81     /**
82      * Return the one-based line number of the line of the last token returned by
83      * next(). Should only be called
84      * after a call to next(); otherwise the return
85      * value is undefined.
86      */
getLineNumber()87     public int getLineNumber() {
88         return reader.getLineNumber();
89     }
90 
91     /**
92      * Return a string description of the position of the last line
93      * returned by readLine() or readLineSkippingComments().
94      */
describePosition()95     public String describePosition() {
96         return reader.describePosition() + ':' + (lastpos+1);
97     }
98 
99     /**
100      * Read the next token from 'this.line' and append it to
101      * 'this.buf'.  Tokens are separated by Pattern_White_Space.  Tokens
102      * may also be delimited by double or single quotes.  The closing
103      * quote must match the opening quote.  If a '#' is encountered,
104      * the rest of the line is ignored, unless it is backslash-escaped
105      * or within quotes.
106      * @param position the offset into the string
107      * @return offset to the next character to read from line, or if
108      * the end of the line is reached without scanning a valid token,
109      * -1
110      */
nextToken(int position)111     private int nextToken(int position) {
112         position = PatternProps.skipWhiteSpace(line, position);
113         if (position == line.length()) {
114             return -1;
115         }
116         int startpos = position;
117         char c = line.charAt(position++);
118         char quote = 0;
119         switch (c) {
120         case '"':
121         case '\'':
122             quote = c;
123             break;
124         case '#':
125             return -1;
126         default:
127             buf.append(c);
128             break;
129         }
130         int[] posref = null;
131         while (position < line.length()) {
132             c = line.charAt(position); // 16-bit ok
133             if (c == '\\') {
134                 if (posref == null) {
135                     posref = new int[1];
136                 }
137                 posref[0] = position+1;
138                 int c32 = Utility.unescapeAt(line, posref);
139                 if (c32 < 0) {
140                     throw new RuntimeException("Invalid escape at " +
141                                                reader.describePosition() + ':' +
142                                                position);
143                 }
144                 UTF16.append(buf, c32);
145                 position = posref[0];
146             } else if ((quote != 0 && c == quote) ||
147                        (quote == 0 && PatternProps.isWhiteSpace(c))) {
148                 return ++position;
149             } else if (quote == 0 && c == '#') {
150                 return position; // do NOT increment
151             } else {
152                 buf.append(c);
153                 ++position;
154             }
155         }
156         if (quote != 0) {
157             throw new RuntimeException("Unterminated quote at " +
158                                        reader.describePosition() + ':' +
159                                        startpos);
160         }
161         return position;
162     }
163 }
164