1 /* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.streamhtmlparser.impl; 18 19 import com.google.common.base.Preconditions; 20 import com.google.streamhtmlparser.ExternalState; 21 import com.google.streamhtmlparser.Parser; 22 import com.google.streamhtmlparser.ParseException; 23 import com.google.streamhtmlparser.util.HtmlUtils; 24 25 import java.util.Map; 26 27 /** 28 * An implementation of the {@code Parser} interface that is common to both 29 * {@code HtmlParser} and {@code JavascriptParser}. 30 * 31 * <p>Provides methods for parsing input and ensuring that all in-state, 32 * entering-a-state and exiting-a-state callbacks are invoked as appropriate. 33 * 34 * <p>This class started as abstract but it was found better for testing to 35 * make it instantiatable so that the parsing logic can be tested with dummy 36 * state transitions. 37 */ 38 public class GenericParser implements Parser { 39 40 protected final ParserStateTable parserStateTable; 41 protected final Map<InternalState, ExternalState> intToExtStateTable; 42 protected final InternalState initialState; 43 protected InternalState currentState; 44 protected int lineNumber; 45 protected int columnNumber; 46 GenericParser(ParserStateTable parserStateTable, Map<InternalState, ExternalState> intToExtStateTable, InternalState initialState)47 protected GenericParser(ParserStateTable parserStateTable, 48 Map<InternalState, ExternalState> intToExtStateTable, 49 InternalState initialState) { 50 this.parserStateTable = parserStateTable; 51 this.intToExtStateTable = intToExtStateTable; 52 this.initialState = initialState; 53 this.currentState = initialState; 54 this.lineNumber = 1; 55 this.columnNumber = 1; 56 } 57 58 /** 59 * Constructs a generic parser that is an exact copy of the 60 * one given. Note that here too, data structures that do not 61 * change are shallow-copied (parser state table and state mappings). 62 * 63 * @param aGenericParser the {@code GenericParser} to copy 64 */ GenericParser(GenericParser aGenericParser)65 protected GenericParser(GenericParser aGenericParser) { 66 parserStateTable = aGenericParser.parserStateTable; 67 intToExtStateTable = aGenericParser.intToExtStateTable; 68 initialState = aGenericParser.initialState; 69 currentState = aGenericParser.currentState; 70 lineNumber = aGenericParser.lineNumber; 71 columnNumber = aGenericParser.columnNumber; 72 } 73 74 /** 75 * Tell the parser to process the provided {@code String}. This is just a 76 * convenience method that wraps over {@link Parser#parse(char)}. 77 * @param input the {@code String} to parse 78 * @throws ParseException if an unrecoverable error occurred during parsing 79 */ 80 @Override parse(String input)81 public void parse(String input) throws ParseException { 82 for (int i = 0; i < input.length(); i++) 83 parse(input.charAt(i)); 84 } 85 86 /** 87 * Main loop for parsing of input. 88 * 89 * <p>Absent any callbacks defined, this function simply determines the 90 * next state to switch to based on the <code>ParserStateTable</code> which is 91 * derived from a state-machine configuration file in the original C++ parser. 92 * 93 * <p>However some states have specific callbacks defined which when 94 * receiving specific characters may decide to overwrite the next state to 95 * go to. Hence the next state is a function both of the main state table 96 * in {@code ParserStateTable} as well as specific run-time information 97 * from the callback functions. 98 * 99 * <p>Also note that the callbacks are called in a proper sequence, 100 * first the exit-state one then the enter-state one and finally the 101 * in-state one. Changing the order may result in a functional change. 102 * 103 * @param input the input character to parse (process) 104 * @throws ParseException if an unrecoverable error occurred during parsing 105 */ 106 @Override parse(char input)107 public void parse(char input) throws ParseException { 108 InternalState nextState = 109 parserStateTable.getNextState(currentState, input); 110 111 if (nextState == InternalState.INTERNAL_ERROR_STATE) { 112 String errorMsg = 113 String.format("Unexpected character '%s' in int_state '%s' " + 114 "(ext_state '%s')", 115 HtmlUtils.encodeCharForAscii(input), 116 currentState.getName(), getState().getName()); 117 currentState = InternalState.INTERNAL_ERROR_STATE; 118 throw new ParseException(this, errorMsg); 119 } 120 121 if (currentState != nextState) { 122 nextState = handleExitState(currentState, nextState, input); 123 } 124 if (currentState != nextState) { 125 nextState = handleEnterState(nextState, nextState, input); 126 } 127 nextState = handleInState(nextState, input); 128 currentState = nextState; 129 record(input); 130 131 columnNumber++; 132 if (input == '\n') { 133 lineNumber++; 134 columnNumber = 1; 135 } 136 } 137 138 /** 139 * Return the current state of the parser. 140 */ 141 @Override getState()142 public ExternalState getState() { 143 if (!intToExtStateTable.containsKey(currentState)) { 144 throw new NullPointerException("Did not find external state mapping " + 145 "For internal state: " + currentState); 146 } 147 return intToExtStateTable.get(currentState); 148 } 149 150 /** 151 * Reset the parser back to its initial default state. 152 */ 153 @Override reset()154 public void reset() { 155 currentState = initialState; 156 lineNumber = 1; 157 columnNumber = 1; 158 } 159 160 /** 161 * Sets the current line number which is returned during error messages. 162 */ 163 @Override setLineNumber(int lineNumber)164 public void setLineNumber(int lineNumber) { 165 this.lineNumber = lineNumber; 166 } 167 168 /** 169 * Returns the current line number. 170 */ 171 @Override getLineNumber()172 public int getLineNumber() { 173 return lineNumber; 174 } 175 176 /** 177 * Sets the current column number which is returned during error messages. 178 */ 179 @Override setColumnNumber(int columnNumber)180 public void setColumnNumber(int columnNumber) { 181 this.columnNumber = columnNumber; 182 } 183 184 /** 185 * Returns the current column number. 186 */ 187 @Override getColumnNumber()188 public int getColumnNumber() { 189 return columnNumber; 190 } 191 getCurrentInternalState()192 InternalState getCurrentInternalState() { 193 return currentState; 194 } 195 setNextState(InternalState nextState)196 protected void setNextState(InternalState nextState) throws ParseException { 197 Preconditions.checkNotNull(nextState); // Developer error if it triggers. 198 199 /* We are not actually parsing hence providing 200 * a null char to the event handlers. 201 */ 202 // TODO: Complicated logic to follow in C++ but clean it up. 203 final char nullChar = '\0'; 204 205 if (currentState != nextState) { 206 nextState = handleExitState(currentState, nextState, nullChar); 207 } 208 if (currentState != nextState) { 209 handleEnterState(nextState, nextState, nullChar); 210 } 211 currentState = nextState; 212 } 213 214 /** 215 * Invoked when the parser enters a new state. 216 * 217 * @param currentState the current state of the parser 218 * @param expectedNextState the next state according to the 219 * state table definition 220 * @param input the last character parsed 221 * @return the state to change to, could be the same as the 222 * {@code expectedNextState} provided 223 * @throws ParseException if an unrecoverable error occurred during parsing 224 */ handleEnterState(InternalState currentState, InternalState expectedNextState, char input)225 protected InternalState handleEnterState(InternalState currentState, 226 InternalState expectedNextState, 227 char input) throws ParseException { 228 return expectedNextState; 229 } 230 231 /** 232 * Invoked when the parser exits a state. 233 * 234 * @param currentState the current state of the parser 235 * @param expectedNextState the next state according to the 236 * state table definition 237 * @param input the last character parsed 238 * @return the state to change to, could be the same as the 239 * {@code expectedNextState} provided 240 * @throws ParseException if an unrecoverable error occurred during parsing 241 */ handleExitState(InternalState currentState, InternalState expectedNextState, char input)242 protected InternalState handleExitState(InternalState currentState, 243 InternalState expectedNextState, 244 char input) throws ParseException { 245 return expectedNextState; 246 } 247 248 /** 249 * Invoked for each character read when no state change occured. 250 * 251 * @param currentState the current state of the parser 252 * @param input the last character parsed 253 * @return the state to change to, could be the same as the 254 * {@code expectedNextState} provided 255 * @throws ParseException if an unrecoverable error occurred during parsing 256 */ handleInState(InternalState currentState, char input)257 protected InternalState handleInState(InternalState currentState, 258 char input) throws ParseException { 259 return currentState; 260 } 261 262 /** 263 * Perform some processing on the given character. Derived classes 264 * may override this method in order to perform additional logic 265 * on every processed character beyond the logic defined in 266 * state transitions. 267 * 268 * @param input the input character to operate on 269 */ record(char input)270 protected void record(char input) { } 271 } 272