1 package org.unicode.cldr.util;
2 
3 import java.io.IOException;
4 import java.io.Reader;
5 
/**
 * Extremely simple, extremely lenient, pull-style HTML tokenizer. Call
 * {@link #setReader(Reader)} once, then call {@link #next(StringBuilder)}
 * repeatedly until {@link Type#DONE} is returned.
 * <p>
 * Element content will be returned in the following sequence:
 *
 * <pre>
 *  ELEMENT_START
 *  ELEMENT strong
 *  ELEMENT_END
 *  ELEMENT_CONTENT Alphabetic code
 *  ELEMENT_START
 *  ELEMENT_POP
 *  ELEMENT strong
 *  ELEMENT_END
 * </pre>
 *
 * while attributes will be returned as:
 *
 * <pre>
 *  ELEMENT_START
 *  ELEMENT div
 *  ATTRIBUTE id
 *  ATTRIBUTE_CONTENT mainContent
 *  ELEMENT_END
 * </pre>
 *
 * Notes on leniency:
 * <ul>
 * <li>An ELEMENT_CONTENT event is reported every time a '&lt;' or the end of
 * the input is reached while reading text, even when no text was collected,
 * so empty ELEMENT_CONTENT events occur (for example before a leading tag).</li>
 * <li>A comment is only recognized when "&lt;!--" is immediately followed by
 * whitespace; otherwise the text is reported as an ordinary element.</li>
 * <li>Truncated input never blocks: when the input ends inside a tag, the
 * pending token (if any) is reported and then DONE.</li>
 * </ul>
 *
 * @author markdavis
 */
public class SimpleHtmlParser {
    /** The kind of token reported by {@link SimpleHtmlParser#next(StringBuilder)}. */
    public enum Type {
        /** End of input. Once reported, every later call reports it again. */
        DONE,
        /**
         * No contents; reported after a '&lt;' has been read.
         */
        ELEMENT_START,
        /**
         * The element name: the characters after '&lt;' (and after any '/')
         * up to whitespace or '&gt;'.
         */
        ELEMENT,
        /**
         * An attribute name: the characters inside a tag up to whitespace,
         * '=' or '&gt;'.
         */
        ATTRIBUTE,
        /**
         * An attribute value, with surrounding single or double quotes
         * removed: attribute=['"]contents['"]
         */
        ATTRIBUTE_CONTENT,
        /**
         * No contents; reported when the closing '&gt;' of a tag is read.
         */
        ELEMENT_END,
        /**
         * Reported when a '/' is read while reading the element name. For a
         * normal closing tag the buffer is empty; for a form such as
         * "&lt;br/&gt;" the buffer holds the name characters read before the '/'.
         */
        ELEMENT_POP,
        /**
         * Comment text: '&lt;!--' contents '--&gt;' (delimiters removed).
         */
        QUOTE,
        /**
         * Text between tags: '&lt;element&gt;' contents '&lt;/element&gt;'
         */
        ELEMENT_CONTENT
    }

    /** Internal tokenizer states. BASE and ELEMENT_STOP are never assigned by this class. */
    private enum State {
        BASE, IN_ELEMENT, AFTER_ELEMENT, IN_CONTENT, IN_ATTRIBUTE, IN_ATTRIBUTE_CONTENT, IN_ATTRIBUTE_CONTENT1, IN_ATTRIBUTE_CONTENT2, ELEMENT_STOP, IN_QUOTE
    }

    private Reader input;

    private State state;

    /** A token that must be reported by the next call before any more input is read. */
    private Type bufferedReturn;

    /** Number of '\n' characters read since the last setReader call. */
    private int lineCount;

    /**
     * Sets the input and resets all parsing state, including the line count.
     *
     * @param input the source of characters to tokenize
     * @return this parser, for chaining
     */
    public SimpleHtmlParser setReader(Reader input) {
        this.input = input;
        state = State.IN_CONTENT;
        bufferedReturn = null;
        lineCount = 0;
        return this;
    }

    /**
     * @return the number of newline characters read so far from the current reader
     */
    public int getLineCount() {
        return lineCount;
    }

    /**
     * Reads the next token.
     *
     * @param result buffer that is cleared on entry and, for token types that
     *               carry text, filled with that text
     * @return the type of the token read; {@link Type#DONE} at end of input
     * @throws IOException if the underlying reader fails
     */
    public Type next(StringBuilder result) throws IOException {
        result.setLength(0);
        if (bufferedReturn != null) {
            if (bufferedReturn == Type.DONE) { // once DONE, stay DONE
                return Type.DONE;
            }
            Type pending = bufferedReturn;
            bufferedReturn = null;
            return pending;
        }
        while (true) {
            final int chi = input.read();
            // Track end of input explicitly: the states that skip whitespace
            // must not mistake the 0 sentinel for a skippable character,
            // otherwise they would spin forever on truncated input.
            final boolean atEnd = chi < 0;
            if (atEnd) {
                bufferedReturn = Type.DONE;
            }
            final char ch = (char) (atEnd ? 0 : chi);
            if (ch == '\n') {
                ++lineCount;
            }

            switch (state) {
            case BASE:
                if (ch == 0xFEFF) {
                    break;
                }
                // fall through!

            case IN_CONTENT:
                if (ch == '<') {
                    state = State.IN_ELEMENT;
                    bufferedReturn = Type.ELEMENT_START;
                    return Type.ELEMENT_CONTENT;
                }
                if (ch == 0) {
                    // End of input (or a NUL character): flush the text so far.
                    return Type.ELEMENT_CONTENT;
                }
                result.append(ch);
                break;

            case IN_ELEMENT:
                if (ch <= ' ') {
                    if (equals(result, "!--")) {
                        state = State.IN_QUOTE;
                        result.setLength(0);
                        break;
                    }
                    state = State.AFTER_ELEMENT;
                    return Type.ELEMENT;
                }
                if (ch == '>') {
                    state = State.IN_CONTENT;
                    bufferedReturn = Type.ELEMENT_END;
                    return Type.ELEMENT;
                }
                if (ch == '/') {
                    return Type.ELEMENT_POP;
                }
                result.append(ch);
                break;

            case AFTER_ELEMENT:
                if (atEnd) {
                    // Input ended inside a tag: nothing is pending, so finish.
                    return Type.DONE;
                }
                if (ch <= ' ') {
                    break;
                }
                if (ch == '>') {
                    state = State.IN_CONTENT;
                    return Type.ELEMENT_END;
                }
                result.append(ch);
                state = State.IN_ATTRIBUTE;
                break;

            case IN_ATTRIBUTE:
                if (ch <= ' ') {
                    state = State.AFTER_ELEMENT;
                    return Type.ATTRIBUTE;
                }
                if (ch == '>') {
                    state = State.IN_CONTENT;
                    bufferedReturn = Type.ELEMENT_END;
                    return Type.ATTRIBUTE;
                }
                if (ch == '=') {
                    state = State.IN_ATTRIBUTE_CONTENT;
                    return Type.ATTRIBUTE;
                }
                result.append(ch);
                break;

            case IN_ATTRIBUTE_CONTENT:
                if (atEnd) {
                    // Input ended inside a value: report what was collected.
                    state = State.AFTER_ELEMENT;
                    return Type.ATTRIBUTE_CONTENT;
                }
                if (ch <= ' ') {
                    if (result.length() == 0) {
                        break; // whitespace between '=' and the value
                    }
                    // Whitespace ends an unquoted value.
                    state = State.AFTER_ELEMENT;
                    return Type.ATTRIBUTE_CONTENT;
                }
                if (ch == '>') {
                    state = State.IN_CONTENT;
                    bufferedReturn = Type.ELEMENT_END;
                    return Type.ATTRIBUTE_CONTENT;
                }
                if (ch == '\'') {
                    state = State.IN_ATTRIBUTE_CONTENT1;
                    break;
                }
                if (ch == '"') {
                    state = State.IN_ATTRIBUTE_CONTENT2;
                    break;
                }
                result.append(ch);
                break;

            case IN_ATTRIBUTE_CONTENT1: // inside '...'
                if (ch == 0 || ch == '\'') {
                    state = State.AFTER_ELEMENT;
                    return Type.ATTRIBUTE_CONTENT;
                }
                result.append(ch);
                break;

            case IN_ATTRIBUTE_CONTENT2: // inside "..."
                if (ch == 0 || ch == '"') {
                    state = State.AFTER_ELEMENT;
                    return Type.ATTRIBUTE_CONTENT;
                }
                result.append(ch);
                break;

            case IN_QUOTE:
                if (ch == 0) {
                    state = State.IN_CONTENT;
                    return Type.QUOTE;
                }
                if (ch == '>' && endsWith(result, "--")) {
                    // Drop the trailing "--" of the comment terminator.
                    result.setLength(result.length() - 2);
                    state = State.IN_CONTENT;
                    return Type.QUOTE;
                }
                result.append(ch);
                break;
            default:
            }
        }
    }

    /**
     * @return true if {@code a} ends with the characters of {@code b}
     */
    public static final boolean endsWith(CharSequence a, CharSequence b) {
        int aStart = a.length() - b.length();
        if (aStart < 0) {
            return false;
        }
        return regionEquals(a, aStart, b, 0, b.length());
    }

    /**
     * @return true if {@code a} and {@code b} have the same length and the same characters
     */
    public static final boolean equals(CharSequence a, CharSequence b) {
        int len = a.length();
        if (len != b.length()) {
            return false;
        }
        return regionEquals(a, 0, b, 0, len);
    }

    /**
     * Compares {@code len} characters of {@code a} starting at {@code i} with
     * {@code len} characters of {@code b} starting at {@code j}. No bounds
     * checking is done beyond what {@code charAt} performs.
     *
     * @return true if all compared characters match (always true when {@code len <= 0})
     */
    public static boolean regionEquals(CharSequence a, int i, CharSequence b, int j, int len) {
        for (; --len >= 0; ++i, ++j) {
            if (a.charAt(i) != b.charAt(j)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes a token, as returned by {@link #next(StringBuilder)}, back out in
     * HTML syntax. Attribute values are always written in double quotes; QUOTE
     * text is written without comment delimiters.
     *
     * @param type   the token type
     * @param result the token text (ignored for types without contents)
     * @param writer the destination
     * @throws IOException if the writer fails
     * @throws IllegalArgumentException if the type is not handled
     */
    public static void writeResult(Type type, StringBuilder result, Appendable writer) throws IOException {
        switch (type) {
        case ELEMENT:
            writer.append(result);
            break;
        case ELEMENT_START:
            writer.append('<');
            break;
        case ELEMENT_END:
            writer.append('>');
            break;
        case ATTRIBUTE:
            writer.append(' ').append(result);
            break;
        case ATTRIBUTE_CONTENT:
            writer.append("=\"").append(result).append('"');
            break;
        case ELEMENT_CONTENT:
            writer.append(result);
            break;
        case ELEMENT_POP:
            writer.append('/');
            break;
        case QUOTE:
            writer.append(result);
            break;
        case DONE:
            break;
        default:
            throw new IllegalArgumentException("Missing case: " + type);
        }
    }
}
300