1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser.impl;
18 
19 import com.google.common.base.Preconditions;
20 import com.google.streamhtmlparser.ExternalState;
21 import com.google.streamhtmlparser.Parser;
22 import com.google.streamhtmlparser.ParseException;
23 import com.google.streamhtmlparser.util.HtmlUtils;
24 
25 import java.util.Map;
26 
27 /**
28  * An implementation of the {@code Parser} interface that is common to both
29  * {@code HtmlParser} and {@code JavascriptParser}.
30  *
31  * <p>Provides methods for parsing input and ensuring that all in-state,
32  * entering-a-state and exiting-a-state callbacks are invoked as appropriate.
33  *
34  * <p>This class started as abstract but it was found better for testing to
35  * make it instantiatable so that the parsing logic can be tested with dummy
36  * state transitions.
37  */
38 public class GenericParser implements Parser {
39 
40   protected final ParserStateTable parserStateTable;
41   protected final Map<InternalState, ExternalState> intToExtStateTable;
42   protected final InternalState initialState;
43   protected InternalState currentState;
44   protected int lineNumber;
45   protected int columnNumber;
46 
GenericParser(ParserStateTable parserStateTable, Map<InternalState, ExternalState> intToExtStateTable, InternalState initialState)47   protected GenericParser(ParserStateTable parserStateTable,
48                           Map<InternalState, ExternalState> intToExtStateTable,
49                           InternalState initialState) {
50     this.parserStateTable = parserStateTable;
51     this.intToExtStateTable = intToExtStateTable;
52     this.initialState = initialState;
53     this.currentState = initialState;
54     this.lineNumber = 1;
55     this.columnNumber = 1;
56   }
57 
58   /**
59    * Constructs a generic parser that is an exact copy of the
60    * one given. Note that here too, data structures that do not
61    * change are shallow-copied (parser state table and state mappings).
62    *
63    * @param aGenericParser the {@code GenericParser} to copy
64    */
GenericParser(GenericParser aGenericParser)65   protected GenericParser(GenericParser aGenericParser) {
66     parserStateTable = aGenericParser.parserStateTable;
67     intToExtStateTable = aGenericParser.intToExtStateTable;
68     initialState = aGenericParser.initialState;
69     currentState = aGenericParser.currentState;
70     lineNumber = aGenericParser.lineNumber;
71     columnNumber = aGenericParser.columnNumber;
72   }
73 
74   /**
75    * Tell the parser to process the provided {@code String}. This is just a
76    * convenience method that wraps over {@link Parser#parse(char)}.
77    * @param input the {@code String} to parse
78    * @throws ParseException if an unrecoverable error occurred during parsing
79    */
80   @Override
parse(String input)81   public void parse(String input) throws ParseException {
82     for (int i = 0; i < input.length(); i++)
83       parse(input.charAt(i));
84   }
85 
86   /**
87    * Main loop for parsing of input.
88    *
89    * <p>Absent any callbacks defined, this function simply determines the
90    * next state to switch to based on the <code>ParserStateTable</code> which is
91    * derived from a state-machine configuration file in the original C++ parser.
92    *
93    * <p>However some states have specific callbacks defined which when
94    * receiving specific characters may decide to overwrite the next state to
95    * go to. Hence the next state is a function both of the main state table
96    * in {@code ParserStateTable} as well as specific run-time information
97    * from the callback functions.
98    *
99    * <p>Also note that the callbacks are called in a proper sequence,
100    * first the exit-state one then the enter-state one and finally the
101    * in-state one. Changing the order may result in a functional change.
102    *
103    * @param input the input character to parse (process)
104    * @throws ParseException if an unrecoverable error occurred during parsing
105    */
106   @Override
parse(char input)107   public void parse(char input) throws ParseException {
108     InternalState nextState =
109         parserStateTable.getNextState(currentState, input);
110 
111     if (nextState == InternalState.INTERNAL_ERROR_STATE) {
112         String errorMsg =
113             String.format("Unexpected character '%s' in int_state '%s' " +
114                           "(ext_state '%s')",
115                           HtmlUtils.encodeCharForAscii(input),
116                           currentState.getName(), getState().getName());
117       currentState = InternalState.INTERNAL_ERROR_STATE;
118       throw new ParseException(this, errorMsg);
119     }
120 
121     if (currentState != nextState) {
122       nextState = handleExitState(currentState, nextState, input);
123     }
124     if (currentState != nextState) {
125       nextState = handleEnterState(nextState, nextState, input);
126     }
127     nextState = handleInState(nextState, input);
128     currentState = nextState;
129     record(input);
130 
131     columnNumber++;
132     if (input == '\n') {
133       lineNumber++;
134       columnNumber = 1;
135     }
136   }
137 
138   /**
139    * Return the current state of the parser.
140    */
141   @Override
getState()142   public ExternalState getState() {
143     if (!intToExtStateTable.containsKey(currentState)) {
144       throw new NullPointerException("Did not find external state mapping " +
145                                      "For internal state: " + currentState);
146     }
147     return intToExtStateTable.get(currentState);
148   }
149 
150   /**
151    * Reset the parser back to its initial default state.
152    */
153   @Override
reset()154   public void reset() {
155     currentState = initialState;
156     lineNumber = 1;
157     columnNumber = 1;
158   }
159 
160   /**
161    * Sets the current line number which is returned during error messages.
162    */
163   @Override
setLineNumber(int lineNumber)164   public void setLineNumber(int lineNumber) {
165     this.lineNumber = lineNumber;
166   }
167 
168   /**
169    * Returns the current line number.
170    */
171   @Override
getLineNumber()172   public int getLineNumber() {
173     return lineNumber;
174   }
175 
176   /**
177    * Sets the current column number which is returned during error messages.
178    */
179   @Override
setColumnNumber(int columnNumber)180   public void setColumnNumber(int columnNumber) {
181     this.columnNumber = columnNumber;
182   }
183 
184   /**
185    * Returns the current column number.
186    */
187   @Override
getColumnNumber()188   public int getColumnNumber() {
189     return columnNumber;
190   }
191 
getCurrentInternalState()192   InternalState getCurrentInternalState() {
193     return currentState;
194   }
195 
setNextState(InternalState nextState)196   protected void setNextState(InternalState nextState) throws ParseException {
197     Preconditions.checkNotNull(nextState);   // Developer error if it triggers.
198 
199     /* We are not actually parsing hence providing
200      * a null char to the event handlers.
201      */
202     // TODO: Complicated logic to follow in C++ but clean it up.
203     final char nullChar = '\0';
204 
205     if (currentState != nextState) {
206       nextState = handleExitState(currentState, nextState, nullChar);
207     }
208     if (currentState != nextState) {
209       handleEnterState(nextState, nextState, nullChar);
210     }
211     currentState = nextState;
212   }
213 
214   /**
215    * Invoked when the parser enters a new state.
216    *
217    * @param currentState the current state of the parser
218    * @param expectedNextState the next state according to the
219    *        state table definition
220    * @param input the last character parsed
221    * @return the state to change to, could be the same as the
222    *         {@code expectedNextState} provided
223    * @throws ParseException if an unrecoverable error occurred during parsing
224    */
handleEnterState(InternalState currentState, InternalState expectedNextState, char input)225   protected InternalState handleEnterState(InternalState currentState,
226                                            InternalState expectedNextState,
227                                            char input) throws ParseException {
228     return expectedNextState;
229   }
230 
231   /**
232    * Invoked when the parser exits a state.
233    *
234    * @param currentState the current state of the parser
235    * @param expectedNextState the next state according to the
236    *        state table definition
237    * @param input the last character parsed
238    * @return the state to change to, could be the same as the
239    *         {@code expectedNextState} provided
240    * @throws ParseException if an unrecoverable error occurred during parsing
241    */
handleExitState(InternalState currentState, InternalState expectedNextState, char input)242   protected InternalState handleExitState(InternalState currentState,
243                                           InternalState expectedNextState,
244                                           char input) throws ParseException {
245     return expectedNextState;
246   }
247 
248   /**
249    * Invoked for each character read when no state change occured.
250    *
251    * @param currentState the current state of the parser
252    * @param input the last character parsed
253    * @return the state to change to, could be the same as the
254    *         {@code expectedNextState} provided
255    * @throws ParseException if an unrecoverable error occurred during parsing
256    */
handleInState(InternalState currentState, char input)257   protected InternalState handleInState(InternalState currentState,
258                                         char input) throws ParseException {
259     return currentState;
260   }
261 
262   /**
263    * Perform some processing on the given character. Derived classes
264    * may override this method in order to perform additional logic
265    * on every processed character beyond the logic defined in
266    * state transitions.
267    *
268    * @param input the input character to operate on
269    */
record(char input)270   protected void record(char input) { }
271 }
272