1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser.impl;
18 
19 import com.google.common.collect.Maps;
20 import com.google.streamhtmlparser.ExternalState;
21 import com.google.streamhtmlparser.JavascriptParser;
22 import com.google.streamhtmlparser.util.HtmlUtils;
23 import com.google.streamhtmlparser.util.JavascriptTokenBuffer;
24 
25 import java.util.Map;
26 
27 /**
28  * <p>Many comments copied almost verbatim from the original C version.
29  */
30 public class JavascriptParserImpl extends GenericParser
31     implements JavascriptParser {
32 
33   final static InternalState JS_TEXT;
34   final static InternalState JS_Q;
35   final static InternalState JS_Q_E;
36   final static InternalState JS_DQ;
37   final static InternalState JS_DQ_E;
38   final static InternalState JS_SLASH;
39   final static InternalState JS_REGEXP_SLASH;
40   final static InternalState JS_REGEXP;
41   final static InternalState JS_REGEXP_BRK;
42   final static InternalState JS_REGEXP_BRK_E;
43   final static InternalState JS_REGEXP_E;
44   final static InternalState JS_COM_LN;
45   final static InternalState JS_COM_ML;
46   final static InternalState JS_COM_ML_CLOSE;
47   final static InternalState JS_COM_AFTER;
48 
49   static {
50     JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT");
51     JS_Q  = InternalState.getInstanceJavascript("JS_Q");
52     JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E");
53     JS_DQ = InternalState.getInstanceJavascript("JS_DQ");
54     JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E");
55     JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH");
56     JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP");
57     JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH");
58     JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E");
59     JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK");
60     JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E");
61     JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN");
62     JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML");
63     JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE");
64     JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER");
65   }
66 
67   private static final Map<InternalState, ExternalState> STATE_MAPPING =
68       Maps.newHashMap();
69   static {
initializeStateMapping()70     initializeStateMapping();
71   }
72 
73   private static final ParserStateTable STATE_TABLE = new ParserStateTable();
74   static {
initializeParserStateTable()75     initializeParserStateTable();
76   }
77 
78   private final JavascriptTokenBuffer ccBuffer;
79 
80   /**
81    * Creates a {@code JavascriptParserImpl} object.
82    */
JavascriptParserImpl()83   public JavascriptParserImpl() {
84     super(STATE_TABLE, STATE_MAPPING, JS_TEXT);
85     ccBuffer = new JavascriptTokenBuffer();
86   }
87 
88   /**
89    * Creates a {@code JavascriptParserImpl} object that is a copy
90    * of the one provided.
91    *
92    * @param aJavascriptParserImpl the {@code JavascriptParserImpl} to copy
93    */
JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl)94   public JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl) {
95     super(aJavascriptParserImpl);
96     ccBuffer = new JavascriptTokenBuffer(aJavascriptParserImpl.ccBuffer);
97   }
98 
99   @Override
reset()100   public void reset() {
101     super.reset();
102     currentState = JS_TEXT;
103   }
104 
105   @Override
handleEnterState(InternalState currentState, InternalState expectedNextState, char input)106   protected InternalState handleEnterState(InternalState currentState,
107                                            InternalState expectedNextState,
108                                            char input) {
109     InternalState nextState = expectedNextState;
110     if (currentState == JS_SLASH) {
111       nextState = enterStateJsSlash(currentState, input);
112     } else if (currentState == JS_COM_AFTER) {
113       enterStateJsCommentAfter();
114     }
115     return nextState;
116   }
117 
118   @Override
handleExitState(InternalState currentState, InternalState expectedNextState, char input)119   protected InternalState handleExitState(InternalState currentState,
120                                           InternalState expectedNextState,
121                                           char input) {
122     // Nothing to do - no handlers for exit states
123     return expectedNextState;
124   }
125 
126   @Override
handleInState(InternalState currentState, char input)127   protected InternalState handleInState(InternalState currentState,
128                                         char input) {
129     if (currentState == JS_TEXT) {
130       inStateJsText(input);
131     }
132     return currentState;
133   }
134 
135   /**
136    * Called every time we find a slash ('/') character in the javascript
137    * text (except for slashes that close comments or regexp literals).
138    *
139    * <p>Comment copied verbatim from the corresponding C-version.
140    *
141    * <p>Implements the logic to figure out if this slash character is a
142    * division operator or if it opens a regular expression literal.
143    * This is heavily inspired by the syntactic resynchronization
144    * for javascript 2.0:
145    *
146    * <p>When we receive a '/', we look at the previous non space character
147    * to figure out if it's the ending of a punctuator that can precede a
148    * regexp literal, in which case we assume the current '/' is part of a
149    * regular expression literal (or the opening of a javascript comment,
150    * but that part is dealt with in the state machine). The exceptions to
151    * this are unary operators, so we look back a second character to rule
152    * out '++' and '--'.
153    *
154    * <p> Although it is not straightforward to figure out if the binary
155    * operator is a postfix of the previous expression or a prefix of the
156    * regular expression, we rule out the later as it is an uncommon practice.
157    *
158    * <p>If we ruled out the previous token to be a valid regexp preceding
159    * punctuator, we extract the last identifier in the buffer and match
160    * against a list of keywords that are known to precede expressions in
161    * the grammar. If we get a match on any of these keywords, then we are
162    * opening a regular expression, if not, then we have a division operator.
163    *
164    * <p>Known cases that are accepted by the grammar but we handle
165    * differently, although I (falmeida) don't believe there is a
166    * legitimate usage for those:
167    *   Division of a regular expression: var result = /test/ / 5;
168    *   Prefix unary increment of a regular expression: var result = ++/test/;
169    *   Division of an object literal: { a: 1 } /x/.exec('x');
170    *
171    * @param state being entered to
172    * @param input character being processed
173    * @return state next state to go to, may be the same as the one we
174    *     were called with
175    *
176    * <a>http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html>
177    * Syntactic Resynchronization</a>
178    */
enterStateJsSlash(InternalState state, char input)179   private InternalState enterStateJsSlash(InternalState state, char input) {
180 
181     InternalState nextState = state;
182     int position = -1;
183 
184     // Consume the last whitespace
185     if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(position))) {
186       --position;
187     }
188 
189     switch (ccBuffer.getChar(position)) {
190       // Ignore unary increment
191       case '+':
192         if (ccBuffer.getChar(position - 1) != '+') {
193           nextState = JS_REGEXP_SLASH;
194         }
195         break;
196       case '-':
197         // Ignore unary decrement
198         if (ccBuffer.getChar(position - 1) != '-') {
199           nextState = JS_REGEXP_SLASH;
200         }
201         break;
202         // List of punctuator endings except ), ], }, + and - *
203       case '=':
204       case '<':
205       case '>':
206       case '&':
207       case '|':
208       case '!':
209       case '%':
210       case '*':
211       case '/':
212       case ',':
213       case ';':
214       case '?':
215       case ':':
216       case '^':
217       case '~':
218       case '{':
219       case '(':
220       case '[':
221       case '}':
222       case '\0':
223         nextState = JS_REGEXP_SLASH;
224         break;
225       default:
226         String lastIdentifier = ccBuffer.getLastIdentifier();
227         if (lastIdentifier != null && HtmlUtils
228             .isJavascriptRegexpPrefix(lastIdentifier)) {
229           nextState = JS_REGEXP_SLASH;
230         }
231     }
232     ccBuffer.appendChar(input);
233     return nextState;
234   }
235 
236   /**
237    * Called at the end of a javascript comment.
238    *
239    * <p>When we open a comment, the initial '/' was inserted into the ring
240    * buffer, but it is not a token and should be considered whitespace
241    * for parsing purposes.
242    *
243    * <p>When we first saw the '/' character, we didn't yet know if it was
244    * the beginning of a comment, a division operator, or a regexp.
245    *
246    * <p>In this function we just replace the inital '/' with a whitespace
247    * character, unless we had a preceding whitespace character, in which
248    * case we just remove the '/'. This is needed to ensure all spaces in
249    * the buffer are correctly folded.
250    */
enterStateJsCommentAfter()251   private void enterStateJsCommentAfter() {
252     if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2))) {
253       ccBuffer.popChar();
254     } else {
255       ccBuffer.setChar(-1, ' ');
256     }
257   }
258 
inStateJsText(char input)259   private void inStateJsText(char input) {
260     ccBuffer.appendChar(input);
261   }
262 
263 // ======================================================= //
264 // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
265 // ======================================================= //
266 
registerMapping(InternalState internalState, ExternalState externalState)267   private static void registerMapping(InternalState internalState,
268                                       ExternalState externalState) {
269     STATE_MAPPING.put(internalState, externalState);
270   }
271 
initializeStateMapping()272   private static void initializeStateMapping() {
273     // Each parser implementation must map the error state appropriately.
274     registerMapping(InternalState.INTERNAL_ERROR_STATE,
275                     JavascriptParser.STATE_ERROR);
276 
277     registerMapping(JS_TEXT, JavascriptParser.STATE_TEXT);
278     registerMapping(JS_Q, JavascriptParser.STATE_Q);
279     registerMapping(JS_Q_E, JavascriptParser.STATE_Q);
280     registerMapping(JS_DQ, JavascriptParser.STATE_DQ);
281     registerMapping(JS_DQ_E, JavascriptParser.STATE_DQ);
282     registerMapping(JS_SLASH, JavascriptParser.STATE_TEXT);
283     registerMapping(JS_REGEXP_SLASH, JavascriptParser.STATE_TEXT);
284     registerMapping(JS_REGEXP, JavascriptParser.STATE_REGEXP);
285     registerMapping(JS_REGEXP_BRK,JavascriptParser.STATE_REGEXP);
286     registerMapping(JS_REGEXP_BRK_E, JavascriptParser.STATE_REGEXP);
287     registerMapping(JS_REGEXP_E,JavascriptParser.STATE_REGEXP);
288     registerMapping(JS_COM_LN, JavascriptParser.STATE_COMMENT);
289     registerMapping(JS_COM_ML, JavascriptParser.STATE_COMMENT);
290     registerMapping(JS_COM_ML_CLOSE, JavascriptParser.STATE_COMMENT);
291     registerMapping(JS_COM_AFTER, JavascriptParser.STATE_TEXT);
292   }
293 
registerTransition(String expression, InternalState source, InternalState to)294   private static void registerTransition(String expression,
295                                          InternalState source,
296                                          InternalState to) {
297     // It seems to silly to go through a StateTableTransition here
298     // but it adds extra data checking.
299     StateTableTransition stt = new StateTableTransition(expression,
300                                                         source, to);
301     STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
302                               stt.getTo());
303   }
304 
initializeParserStateTable()305   private static void initializeParserStateTable() {
306     registerTransition("[:default:]", JS_COM_AFTER, JS_TEXT);
307     registerTransition("/", JS_COM_AFTER, JS_SLASH);
308     registerTransition("\"", JS_COM_AFTER, JS_DQ);
309     registerTransition("\'", JS_COM_AFTER, JS_Q);
310     registerTransition("[:default:]", JS_COM_ML_CLOSE, JS_COM_ML);
311     registerTransition("/", JS_COM_ML_CLOSE,JS_COM_AFTER);
312     registerTransition("[:default:]", JS_COM_ML, JS_COM_ML);
313     registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE);
314     registerTransition("[:default:]", JS_COM_LN,JS_COM_LN);
315     registerTransition("\n", JS_COM_LN,JS_COM_AFTER);
316     registerTransition("[:default:]", JS_REGEXP_E, JS_REGEXP);
317     registerTransition("[:default:]", JS_REGEXP_BRK_E, JS_REGEXP_BRK);
318     registerTransition("[:default:]", JS_REGEXP_BRK, JS_REGEXP_BRK);
319     registerTransition("]", JS_REGEXP_BRK, JS_REGEXP);
320     registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E);
321     registerTransition("[:default:]", JS_REGEXP, JS_REGEXP);
322     registerTransition("/", JS_REGEXP, JS_TEXT);
323     registerTransition("[", JS_REGEXP, JS_REGEXP_BRK);
324     registerTransition("\\", JS_REGEXP, JS_REGEXP_E);
325     registerTransition("[:default:]", JS_REGEXP_SLASH, JS_REGEXP);
326     registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK);
327     registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E);
328     registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML);
329     registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN);
330     registerTransition("[:default:]", JS_SLASH, JS_TEXT);
331     registerTransition("*", JS_SLASH, JS_COM_ML);
332     registerTransition("/", JS_SLASH, JS_COM_LN);
333     registerTransition("[:default:]", JS_DQ_E,JS_DQ);
334     registerTransition("[:default:]", JS_DQ,JS_DQ);
335     registerTransition("\"", JS_DQ, JS_TEXT);
336     registerTransition("\\", JS_DQ, JS_DQ_E);
337     registerTransition("[:default:]", JS_Q_E,JS_Q);
338     registerTransition("[:default:]", JS_Q,JS_Q);
339     registerTransition("\'", JS_Q, JS_TEXT);
340     registerTransition("\\", JS_Q, JS_Q_E);
341     registerTransition("[:default:]", JS_TEXT, JS_TEXT);
342     registerTransition("/", JS_TEXT, JS_SLASH);
343     registerTransition("\"", JS_TEXT, JS_DQ);
344     registerTransition("\'", JS_TEXT, JS_Q);
345   }
346 }