/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.impl; import com.google.common.collect.Maps; import com.google.streamhtmlparser.ExternalState; import com.google.streamhtmlparser.JavascriptParser; import com.google.streamhtmlparser.util.HtmlUtils; import com.google.streamhtmlparser.util.JavascriptTokenBuffer; import java.util.Map; /** *
Many comments copied almost verbatim from the original C version.
*/
public class JavascriptParserImpl extends GenericParser
implements JavascriptParser {
final static InternalState JS_TEXT;
final static InternalState JS_Q;
final static InternalState JS_Q_E;
final static InternalState JS_DQ;
final static InternalState JS_DQ_E;
final static InternalState JS_SLASH;
final static InternalState JS_REGEXP_SLASH;
final static InternalState JS_REGEXP;
final static InternalState JS_REGEXP_BRK;
final static InternalState JS_REGEXP_BRK_E;
final static InternalState JS_REGEXP_E;
final static InternalState JS_COM_LN;
final static InternalState JS_COM_ML;
final static InternalState JS_COM_ML_CLOSE;
final static InternalState JS_COM_AFTER;
static {
JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT");
JS_Q = InternalState.getInstanceJavascript("JS_Q");
JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E");
JS_DQ = InternalState.getInstanceJavascript("JS_DQ");
JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E");
JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH");
JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP");
JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH");
JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E");
JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK");
JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E");
JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN");
JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML");
JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE");
JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER");
}
private static final Map Comment copied verbatim from the corresponding C-version.
*
* Implements the logic to figure out if this slash character is a
* division operator or if it opens a regular expression literal.
* This is heavily inspired by the syntactic resynchronization
* for javascript 2.0:
*
* When we receive a '/', we look at the previous non space character
* to figure out if it's the ending of a punctuator that can precede a
* regexp literal, in which case we assume the current '/' is part of a
* regular expression literal (or the opening of a javascript comment,
* but that part is dealt with in the state machine). The exceptions to
* this are unary operators, so we look back a second character to rule
* out '++' and '--'.
*
* Although it is not straightforward to figure out if the binary
* operator is a postfix of the previous expression or a prefix of the
* regular expression, we rule out the later as it is an uncommon practice.
*
* If we ruled out the previous token to be a valid regexp preceding
* punctuator, we extract the last identifier in the buffer and match
* against a list of keywords that are known to precede expressions in
* the grammar. If we get a match on any of these keywords, then we are
* opening a regular expression, if not, then we have a division operator.
*
* Known cases that are accepted by the grammar but we handle
* differently, although I (falmeida) don't believe there is a
* legitimate usage for those:
* Division of a regular expression: var result = /test/ / 5;
* Prefix unary increment of a regular expression: var result = ++/test/;
* Division of an object literal: { a: 1 } /x/.exec('x');
*
* @param state being entered to
* @param input character being processed
* @return state next state to go to, may be the same as the one we
* were called with
*
* http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html>
* Syntactic Resynchronization
*/
private InternalState enterStateJsSlash(InternalState state, char input) {
InternalState nextState = state;
int position = -1;
// Consume the last whitespace
if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(position))) {
--position;
}
switch (ccBuffer.getChar(position)) {
// Ignore unary increment
case '+':
if (ccBuffer.getChar(position - 1) != '+') {
nextState = JS_REGEXP_SLASH;
}
break;
case '-':
// Ignore unary decrement
if (ccBuffer.getChar(position - 1) != '-') {
nextState = JS_REGEXP_SLASH;
}
break;
// List of punctuator endings except ), ], }, + and - *
case '=':
case '<':
case '>':
case '&':
case '|':
case '!':
case '%':
case '*':
case '/':
case ',':
case ';':
case '?':
case ':':
case '^':
case '~':
case '{':
case '(':
case '[':
case '}':
case '\0':
nextState = JS_REGEXP_SLASH;
break;
default:
String lastIdentifier = ccBuffer.getLastIdentifier();
if (lastIdentifier != null && HtmlUtils
.isJavascriptRegexpPrefix(lastIdentifier)) {
nextState = JS_REGEXP_SLASH;
}
}
ccBuffer.appendChar(input);
return nextState;
}
/**
* Called at the end of a javascript comment.
*
* When we open a comment, the initial '/' was inserted into the ring
* buffer, but it is not a token and should be considered whitespace
* for parsing purposes.
*
* When we first saw the '/' character, we didn't yet know if it was
* the beginning of a comment, a division operator, or a regexp.
*
* In this function we just replace the inital '/' with a whitespace
* character, unless we had a preceding whitespace character, in which
* case we just remove the '/'. This is needed to ensure all spaces in
* the buffer are correctly folded.
*/
private void enterStateJsCommentAfter() {
if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2))) {
ccBuffer.popChar();
} else {
ccBuffer.setChar(-1, ' ');
}
}
private void inStateJsText(char input) {
ccBuffer.appendChar(input);
}
// ======================================================= //
// SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. //
// ======================================================= //
private static void registerMapping(InternalState internalState,
ExternalState externalState) {
STATE_MAPPING.put(internalState, externalState);
}
private static void initializeStateMapping() {
// Each parser implementation must map the error state appropriately.
registerMapping(InternalState.INTERNAL_ERROR_STATE,
JavascriptParser.STATE_ERROR);
registerMapping(JS_TEXT, JavascriptParser.STATE_TEXT);
registerMapping(JS_Q, JavascriptParser.STATE_Q);
registerMapping(JS_Q_E, JavascriptParser.STATE_Q);
registerMapping(JS_DQ, JavascriptParser.STATE_DQ);
registerMapping(JS_DQ_E, JavascriptParser.STATE_DQ);
registerMapping(JS_SLASH, JavascriptParser.STATE_TEXT);
registerMapping(JS_REGEXP_SLASH, JavascriptParser.STATE_TEXT);
registerMapping(JS_REGEXP, JavascriptParser.STATE_REGEXP);
registerMapping(JS_REGEXP_BRK,JavascriptParser.STATE_REGEXP);
registerMapping(JS_REGEXP_BRK_E, JavascriptParser.STATE_REGEXP);
registerMapping(JS_REGEXP_E,JavascriptParser.STATE_REGEXP);
registerMapping(JS_COM_LN, JavascriptParser.STATE_COMMENT);
registerMapping(JS_COM_ML, JavascriptParser.STATE_COMMENT);
registerMapping(JS_COM_ML_CLOSE, JavascriptParser.STATE_COMMENT);
registerMapping(JS_COM_AFTER, JavascriptParser.STATE_TEXT);
}
private static void registerTransition(String expression,
InternalState source,
InternalState to) {
// It seems to silly to go through a StateTableTransition here
// but it adds extra data checking.
StateTableTransition stt = new StateTableTransition(expression,
source, to);
STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
stt.getTo());
}
private static void initializeParserStateTable() {
registerTransition("[:default:]", JS_COM_AFTER, JS_TEXT);
registerTransition("/", JS_COM_AFTER, JS_SLASH);
registerTransition("\"", JS_COM_AFTER, JS_DQ);
registerTransition("\'", JS_COM_AFTER, JS_Q);
registerTransition("[:default:]", JS_COM_ML_CLOSE, JS_COM_ML);
registerTransition("/", JS_COM_ML_CLOSE,JS_COM_AFTER);
registerTransition("[:default:]", JS_COM_ML, JS_COM_ML);
registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE);
registerTransition("[:default:]", JS_COM_LN,JS_COM_LN);
registerTransition("\n", JS_COM_LN,JS_COM_AFTER);
registerTransition("[:default:]", JS_REGEXP_E, JS_REGEXP);
registerTransition("[:default:]", JS_REGEXP_BRK_E, JS_REGEXP_BRK);
registerTransition("[:default:]", JS_REGEXP_BRK, JS_REGEXP_BRK);
registerTransition("]", JS_REGEXP_BRK, JS_REGEXP);
registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E);
registerTransition("[:default:]", JS_REGEXP, JS_REGEXP);
registerTransition("/", JS_REGEXP, JS_TEXT);
registerTransition("[", JS_REGEXP, JS_REGEXP_BRK);
registerTransition("\\", JS_REGEXP, JS_REGEXP_E);
registerTransition("[:default:]", JS_REGEXP_SLASH, JS_REGEXP);
registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK);
registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E);
registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML);
registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN);
registerTransition("[:default:]", JS_SLASH, JS_TEXT);
registerTransition("*", JS_SLASH, JS_COM_ML);
registerTransition("/", JS_SLASH, JS_COM_LN);
registerTransition("[:default:]", JS_DQ_E,JS_DQ);
registerTransition("[:default:]", JS_DQ,JS_DQ);
registerTransition("\"", JS_DQ, JS_TEXT);
registerTransition("\\", JS_DQ, JS_DQ_E);
registerTransition("[:default:]", JS_Q_E,JS_Q);
registerTransition("[:default:]", JS_Q,JS_Q);
registerTransition("\'", JS_Q, JS_TEXT);
registerTransition("\\", JS_Q, JS_Q_E);
registerTransition("[:default:]", JS_TEXT, JS_TEXT);
registerTransition("/", JS_TEXT, JS_SLASH);
registerTransition("\"", JS_TEXT, JS_DQ);
registerTransition("\'", JS_TEXT, JS_Q);
}
}