1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser.impl;
18 
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;

import java.util.Locale;
import java.util.Map;
29 
30 /**
31  * A custom specialized parser - ported from the main C++ version - used to
32  * implement context-aware escaping of run-time data in web-application
33  * templates.
34  *
35  * <p>This is the main class in the package. It implements the
36  * {@code HtmlParser} interface.
37  *
 * <p>This class is not thread-safe, in particular you cannot invoke any
 * state changing operations (such as {@code parse}) from multiple threads
 * on the same object.
41  *
42  * <p>If you are looking at this class, chances are very high you are
43  * implementing Auto-Escaping for a new template system. Please see the
44  * landing page including a design document at
45  * <a href="http://go/autoescape">Auto-Escape Landing Page</a>.
46  */
47 public class HtmlParserImpl extends GenericParser implements HtmlParser {
48 
49   /*
50    * Internal representation of the parser state, which is at a
51    * finer-granularity than the external state as given to callers.
52    * The relationship between <code>InternalState</code> and
53    * <code>ExternalState</code> is a many-to-one relationship.
54    */
55   private static final InternalState TEXT;
56   private static final InternalState TAG_START;
57   private static final InternalState TAG_NAME;
58   private static final InternalState DECL_START;
59   private static final InternalState DECL_BODY;
60   private static final InternalState COM_OPEN;
61   private static final InternalState COM_BODY;
62   private static final InternalState COM_DASH;
63   private static final InternalState COM_DASH_DASH;
64   private static final InternalState PI;
65   private static final InternalState PI_MAY_END;
66   private static final InternalState TAG_SPACE;
67   private static final InternalState TAG_CLOSE;
68   private static final InternalState ATTR;
69   private static final InternalState ATTR_SPACE;
70   private static final InternalState VALUE;
71   private static final InternalState VALUE_TEXT;
72   private static final InternalState VALUE_Q_START;
73   private static final InternalState VALUE_Q;
74   private static final InternalState VALUE_DQ_START;
75   private static final InternalState VALUE_DQ;
76   private static final InternalState CDATA_COM_START;
77   private static final InternalState CDATA_COM_START_DASH;
78   private static final InternalState CDATA_COM_BODY;
79   private static final InternalState CDATA_COM_DASH;
80   private static final InternalState CDATA_COM_DASH_DASH;
81   private static final InternalState CDATA_TEXT;
82   private static final InternalState CDATA_LT;
83   private static final InternalState CDATA_MAY_CLOSE;
84   private static final InternalState JS_FILE;
85   private static final InternalState CSS_FILE;
86 
87   static {
88     TEXT = InternalState.getInstanceHtml("TEXT");
89     TAG_START = InternalState.getInstanceHtml("TAG_START");
90     TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
91     DECL_START = InternalState.getInstanceHtml("DECL_START");
92     DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
93     COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
94     COM_BODY = InternalState.getInstanceHtml("COM_BODY");
95     COM_DASH = InternalState.getInstanceHtml("COM_DASH");
96     COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
97     PI =InternalState.getInstanceHtml("PI");
98     PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
99     TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
100     TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
101     ATTR = InternalState.getInstanceHtml("ATTR");
102     ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
103     VALUE = InternalState.getInstanceHtml("VALUE");
104     VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
105     VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
106     VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
107     VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
108     VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
109     CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
110     CDATA_COM_START_DASH =
111         InternalState.getInstanceHtml("CDATA_COM_START_DASH");
112     CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
113     CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
114     CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
115     CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
116     CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
117     CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
118     JS_FILE = InternalState.getInstanceHtml("JS_FILE");
119     CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
120   }
121 
  // Lookup from each InternalState to the ExternalState reported to callers.
  // Filled once by initializeStateMapping() during class initialization and
  // passed to the superclass by the no-argument constructor.
  private static final Map<InternalState, ExternalState> STATE_MAPPING =
      Maps.newHashMap();
  static {
    initializeStateMapping();
  }

  // Transition table for the HTML state machine. Filled once by
  // initializeParserStateTable() during class initialization and passed to
  // the superclass by the no-argument constructor.
  private static final ParserStateTable STATE_TABLE = new ParserStateTable();
  static {
    initializeParserStateTable();
  }
132 
  // Records the name of the tag being parsed; read back by getTag().
  private final CharacterRecorder tag;
  // Records the name of the attribute being parsed; read by getAttribute().
  private final CharacterRecorder attr;
  // Records the attribute value being parsed; read back by getValue().
  private final CharacterRecorder value;
  // Records a potential closing tag (starting with its '/') met inside a
  // CDATA section; compared against the open tag when that state is left.
  private final CharacterRecorder cdataCloseTag;
  // Resolves HTML entities inside javascript attribute values before the
  // decoded text is handed to jsParser.
  private final EntityResolver entityResolver;
  // Nested parser fed with the javascript found in CDATA sections, in
  // javascript attribute values and in JS-file mode.
  private final JavascriptParserImpl jsParser;
  // True while the content being consumed is to be treated as javascript.
  private boolean insideJavascript;
  // Number of characters consumed so far inside the current attribute value.
  private int valueIndex;
  // True iff InsertText() was called at the start of a URL attribute value.
  private boolean textInsideUrlValue;
143 
144   /**
145    * Creates an {@code HtmlParserImpl} object.
146    *
147    * <p>Both for performance reasons and to leverage code a state-flow machine
148    * that is automatically generated from Python for multiple target
149    * languages, this object uses a static {@code ParserStateTable} that
150    * is read-only and obtained from the generated code in {@code HtmlParserFsm}.
151    * That code also maintains the mapping from internal states
152    * ({@code InternalState}) to external states ({@code ExternalState}).
153    */
HtmlParserImpl()154   public HtmlParserImpl() {
155     super(STATE_TABLE, STATE_MAPPING, TEXT);
156     tag = new CharacterRecorder();
157     attr = new CharacterRecorder();
158     value = new CharacterRecorder();
159     cdataCloseTag = new CharacterRecorder();
160     entityResolver = new EntityResolver();
161     jsParser = new JavascriptParserImpl();
162     insideJavascript = false;
163     valueIndex = 0;
164     textInsideUrlValue = false;
165   }
166 
167   /**
168    * Creates an {@code HtmlParserImpl} that is a copy of the one provided.
169    *
170    * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
171    */
HtmlParserImpl(HtmlParserImpl aHtmlParserImpl)172   public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
173     super(aHtmlParserImpl);
174     tag = new CharacterRecorder(aHtmlParserImpl.tag);
175     attr = new CharacterRecorder(aHtmlParserImpl.attr);
176     value = new CharacterRecorder(aHtmlParserImpl.value);
177     cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
178     entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
179     jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);
180     insideJavascript = aHtmlParserImpl.insideJavascript;
181     valueIndex = aHtmlParserImpl.valueIndex;
182     textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
183   }
184 
185   @Override
inJavascript()186   public boolean inJavascript() {
187     return (insideJavascript
188             && ( (getState() == STATE_VALUE)
189                  || (currentState == CDATA_TEXT)
190                  || (currentState == CDATA_COM_START)
191                  || (currentState == CDATA_COM_START_DASH)
192                  || (currentState == CDATA_COM_BODY)
193                  || (currentState == CDATA_COM_DASH)
194                  || (currentState == CDATA_COM_DASH_DASH)
195                  || (currentState == CDATA_LT)
196                  || (currentState == CDATA_MAY_CLOSE)
197                  || (currentState == JS_FILE) ));
198   }
199 
200   @Override
isJavascriptQuoted()201   public boolean isJavascriptQuoted() {
202     if (inJavascript()) {
203       ExternalState jsParserState = jsParser.getState();
204       return (jsParserState == JavascriptParserImpl.STATE_Q
205               || jsParserState == JavascriptParserImpl.STATE_DQ);
206     }
207     return false;
208   }
209 
210   @Override
inAttribute()211   public boolean inAttribute() {
212     ExternalState extState = getState();
213     return (extState != null && (extState == STATE_ATTR
214                                  || extState == STATE_VALUE));
215   }
216 
217   /**
218    * Returns {@code true} if and only if the parser is currently within
219    * a CSS context. A CSS context is one of the below:
220    * <ul>
221    * <li>Inside a STYLE tag.
222    * <li>Inside a STYLE attribute.
223    * <li>Inside a CSS file when the parser was reset in the CSS mode.
224    * </ul>
225    *
226    * @return {@code true} if and only if the parser is inside CSS
227    */
228   @Override
inCss()229   public boolean inCss() {
230     return (currentState == CSS_FILE
231             || (getState() == STATE_VALUE
232                 && (getAttributeType() == ATTR_TYPE.STYLE))
233             || ("style".equals(getTag())));
234   }
235 
236   @Override
getAttributeType()237   public ATTR_TYPE getAttributeType() {
238     String attribute = getAttribute();
239     if (!inAttribute()) {
240       return ATTR_TYPE.NONE;
241     }
242     if (HtmlUtils.isAttributeJavascript(attribute)) {
243       return ATTR_TYPE.JS;
244     }
245     if (HtmlUtils.isAttributeUri(attribute)) {
246       return ATTR_TYPE.URI;
247     }
248     if (HtmlUtils.isAttributeStyle(attribute)) {
249       return ATTR_TYPE.STYLE;
250     }
251 
252     // Special logic to handle the "content" attribute of the "meta" tag.
253     if ("meta".equals(getTag()) && "content".equals(getAttribute())) {
254       HtmlUtils.META_REDIRECT_TYPE redirectType =
255           HtmlUtils.parseContentAttributeForUrl(getValue());
256       if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START ||
257           redirectType == HtmlUtils.META_REDIRECT_TYPE.URL)
258         return ATTR_TYPE.URI;
259     }
260 
261     return ATTR_TYPE.REGULAR;
262   }
263 
  /**
   * Returns the current state of the nested javascript parser.
   *
   * <p>The state is returned unconditionally; this method does not check
   * whether the HTML parser is currently inside javascript (see
   * {@link #inJavascript()}).
   *
   * @return the state reported by the nested javascript parser
   */
  @Override
  public ExternalState getJavascriptState() {
    return jsParser.getState();
  }
268 
269   @Override
isAttributeQuoted()270   public boolean isAttributeQuoted() {
271     return (currentState == VALUE_Q_START
272             || currentState == VALUE_Q
273             || currentState == VALUE_DQ_START
274             || currentState == VALUE_DQ);
275   }
276 
277   @Override
getTag()278   public String getTag() {
279     return tag.getContent().toLowerCase();
280   }
281 
282   @Override
getAttribute()283   public String getAttribute() {
284     return inAttribute() ? attr.getContent().toLowerCase() : "";
285   }
286 
287   @Override
getValue()288   public String getValue() {
289     return (getState() == STATE_VALUE) ? value.getContent() : "";
290   }
291 
292   @Override
getValueIndex()293   public int getValueIndex() {
294     if (getState() != STATE_VALUE) {
295       return 0;
296     }
297     return valueIndex;
298   }
299 
300   @Override
isUrlStart()301   public boolean isUrlStart() {
302     // False when not inside an HTML attribute value
303     if (getState() != STATE_VALUE) {
304       return false;
305     }
306 
307     //  Or when the HTML attribute is not of URI type.
308     if (getAttributeType() != ATTR_TYPE.URI) {
309       return false;
310     }
311 
312     // Or when we received an InsertText() directive at the start of a URL.
313     if (textInsideUrlValue) {
314       return false;
315     }
316 
317     if ("meta".equals(getTag())) {
318       // At this point, we know we are in the "content" attribute
319       // or we would not have the URI attribute type.
320       return (HtmlUtils.parseContentAttributeForUrl(getValue()) ==
321               HtmlUtils.META_REDIRECT_TYPE.URL_START);
322     }
323 
324     // For all other URI attributes, check if we are at index 0.
325     return (getValueIndex() == 0);
326 }
327 
328   /**
329    * {@inheritDoc}
330    *
331    * Resets the state of the parser to a state consistent with the
332    * {@code Mode} provided. This will reset finer-grained state
333    * information back to a default value, hence use only when
334    * you want to parse text from a very clean slate.
335    *
336    * <p>See the {@link HtmlParser.Mode} enum for information on all
337    * the valid modes.
338    *
339    * @param mode is an enum representing the high-level state of the parser
340    */
341   @Override
resetMode(Mode mode)342   public void resetMode(Mode mode) {
343     insideJavascript = false;
344     tag.reset();
345     attr.reset();
346     value.reset();
347     cdataCloseTag.reset();
348     valueIndex = 0;
349     textInsideUrlValue = false;
350     jsParser.reset();
351 
352     switch (mode) {
353       case HTML:
354         currentState = TEXT;
355         break;
356       case JS:
357         currentState = JS_FILE;
358         insideJavascript = true;
359         break;
360       case CSS:
361         currentState = CSS_FILE;
362         break;
363       case HTML_IN_TAG:
364         currentState = TAG_SPACE;
365         break;
366       default:
367         throw new IllegalArgumentException("Did not recognize Mode: " +
368                                            mode.toString());
369     }
370   }
371 
372   /**
373    * Resets the state of the parser to the initial state of parsing HTML.
374    */
reset()375   public void reset() {
376     super.reset();
377     resetMode(Mode.HTML);
378   }
379 
380   /**
381    * A specialized directive to tell the parser there is some content
382    * that will be inserted here but that it will not get to parse. Used
383    * by the template system that may not be able to give some content
384    * to the parser but wants it to know there typically will be content
385    * inserted at that point.  This is a hint used in corner cases within
386    * parsing of HTML attribute names and values where content we do not
387    * get to see could affect our parsing and alter our current state.
388    *
389    * <p>The two cases where {@code #insertText()} affects our parsing are:
390    * <ul>
391    * <li>We are at the start of the value of a URL-accepting HTML attribute. In
392    * that case, we change internal state to no longer be considered at the
393    * start of the URL. This may affect what escaping template systems may want
394    * to perform on the HTML attribute value. We avoid injecting fake data and
395    * hence not modify the current index of the value as determined by
396    * {@link #getValueIndex()}</li>
397    * <li>We just transitioned from an attribute name to an attribute value
398    * (by parsing the separating {@code '='} character). In that case, we
399    * change internal state to be now inside a non-quoted HTML attribute
400    * value.</li>
401    * </ul>
402    *
403    * @throws ParseException if an unrecoverable error occurred during parsing
404    */
405   @Override
insertText()406   public void insertText() throws ParseException {
407     // Case: Inside URL attribute value.
408     if (getState() == STATE_VALUE
409         && getAttributeType() == ATTR_TYPE.URI
410         && isUrlStart()) {
411       textInsideUrlValue = true;
412     }
413     // Case: Before parsing any attribute value.
414     if (currentState == VALUE) {
415       setNextState(VALUE_TEXT);
416     }
417   }
418 
419   @Override
handleEnterState(InternalState currentState, InternalState expectedNextState, char input)420   protected InternalState handleEnterState(InternalState currentState,
421                                            InternalState expectedNextState,
422                                            char input) {
423     InternalState nextState = expectedNextState;
424     if (currentState == TAG_NAME) {
425       enterTagName();
426     } else if (currentState == ATTR) {
427       enterAttribute();
428     } else if (currentState == TAG_CLOSE) {
429       nextState = tagClose(currentState);
430     } else if (currentState == CDATA_MAY_CLOSE) {
431       enterStateCdataMayClose();
432     } else if (currentState == VALUE) {
433       enterValue();
434     } else
435     if (currentState == VALUE_TEXT || currentState == VALUE_Q
436         || currentState == VALUE_DQ) {
437       enterValueContent();
438     }
439     return nextState;
440   }
441 
442   @Override
handleExitState(InternalState currentState, InternalState expectedNextState, char input)443   protected InternalState handleExitState(InternalState currentState,
444                                           InternalState expectedNextState,
445                                           char input) {
446     InternalState nextState = expectedNextState;
447     if (currentState == TAG_NAME) {
448       exitTagName();
449     } else if (currentState == ATTR) {
450       exitAttribute();
451     } else if (currentState == CDATA_MAY_CLOSE) {
452       nextState = exitStateCdataMayClose(nextState, input);
453     } else
454     if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q)
455         || (currentState == VALUE_DQ)) {
456       exitValueContent();
457     }
458     return nextState;
459   }
460 
461   @Override
handleInState(InternalState currentState, char input)462   protected InternalState handleInState(InternalState currentState,
463                                         char input) throws ParseException {
464     if ((currentState == CDATA_TEXT)
465         || (currentState == CDATA_COM_START)
466         || (currentState == CDATA_COM_START_DASH)
467         || (currentState == CDATA_COM_BODY)
468         || (currentState == CDATA_COM_DASH)
469         || (currentState == CDATA_COM_DASH_DASH)
470         || (currentState == CDATA_LT)
471         || (currentState == CDATA_MAY_CLOSE)
472         || (currentState == JS_FILE)) {
473       inStateCdata(input);
474     } else if ((currentState == VALUE_TEXT)
475                || (currentState == VALUE_Q)
476                || (currentState == VALUE_DQ)) {
477       inStateValue(input);
478     }
479     return currentState;
480   }
481 
  /**
   * Offers the character to every {@code CharacterRecorder} owned by this
   * parser through {@code maybeRecord}; presumably only recorders that are
   * currently recording keep it.
   *
   * <p>No check is made that one and only one recorder is recording at a
   * time. Testing of the C++ parser this class was ported from did not
   * establish such a guarantee.
   *
   * @param input the character read
   */
  @Override
  protected void record(char input) {
    attr.maybeRecord(input);
    tag.maybeRecord(input);
    value.maybeRecord(input);
    cdataCloseTag.maybeRecord(input);
  }
495 
  /**
   * Starts recording the name of the HTML tag. Called when the parser
   * enters a new tag (the enter hook of the {@code TAG_NAME} state).
   */
  private void enterTagName() {
    tag.startRecording();
  }
503 
exitTagName()504   private void exitTagName() {
505     tag.stopRecording();
506     String tagString = tag.getContent();
507     if (!tagString.isEmpty() && tagString.charAt(0) == '/') {
508       tag.reset();
509     }
510   }
511 
  /**
   * Starts recording the name of the HTML attribute. Called when the parser
   * enters a new HTML attribute (the enter hook of the {@code ATTR} state).
   */
  private void enterAttribute() {
    attr.startRecording();
  }
519 
  /**
   * Stops recording the name of the HTML attribute. Called when the parser
   * leaves the {@code ATTR} state.
   */
  private void exitAttribute() {
    attr.stopRecording();
  }
523 
524   /**
525    * Tracks the index within the HTML attribute value and initializes
526    * the javascript parser for attributes that take javascript.
527    *
528    * Called when the parser enters a new HTML attribute value.
529    */
enterValue()530   private void enterValue() {
531     valueIndex = 0;
532     textInsideUrlValue = false;
533     if (HtmlUtils.isAttributeJavascript(getAttribute())) {
534       entityResolver.reset();
535       jsParser.reset();
536       insideJavascript = true;
537     } else {
538       insideJavascript = false;
539     }
540   }
541 
  /**
   * Starts recording the contents of the attribute value.
   *
   * <p>Called when entering the content of an attribute value (the enter
   * hook of the {@code VALUE_TEXT}, {@code VALUE_Q} and {@code VALUE_DQ}
   * states).
   */
  private void enterValueContent() {
    value.startRecording();
  }
550 
  /**
   * Stops the recording of the attribute value and exits javascript
   * (in case we were inside it).
   *
   * <p>Called when leaving the {@code VALUE_TEXT}, {@code VALUE_Q} or
   * {@code VALUE_DQ} state.
   */
  private void exitValueContent() {
    value.stopRecording();
    insideJavascript = false;
  }
559 
560   /**
561    * Processes javascript after performing entity resolution and updates
562    * the position within the attribute value.
563    * If the status of the entity resolution is <code>IN_PROGRESS</code>,
564    * we don't invoke the javascript parser.
565    *
566    * <p>Called for every character inside an attribute value.
567    *
568    * @param input character read
569    * @throws ParseException if an unrecoverable error occurred during parsing
570    */
inStateValue(char input)571   private void inStateValue(char input) throws ParseException {
572     valueIndex++;
573     if (insideJavascript) {
574       EntityResolver.Status status = entityResolver.processChar(input);
575       if (status == EntityResolver.Status.COMPLETED) {
576         jsParser.parse(entityResolver.getEntity());
577         entityResolver.reset();
578       } else if (status == EntityResolver.Status.NOT_STARTED) {
579         jsParser.parse(input);
580       }
581     }
582   }
583 
584   /**
585    * Handles the tag it finished reading.
586    *
587    * <p>For a script tag, it initializes the javascript parser. For all
588    * tags that are recognized to have CDATA values
589    * (including the script tag), it switches the CDATA state to handle them
590    * properly. For code simplification, CDATA and RCDATA sections are
591    * treated the same.
592    *
593    * <p>Called when the parser leaves a tag definition.
594    *
595    * @param state current state
596    * @return state next state, could be the same as current state
597    */
tagClose(InternalState state)598   private InternalState tagClose(InternalState state) {
599     InternalState nextState = state;
600     String tagName = getTag();
601     if ("script".equals(tagName)) {
602       nextState = CDATA_TEXT;
603       jsParser.reset();
604       insideJavascript = true;
605     } else if ("style".equals(tagName)
606                  || "title".equals(tagName)
607                  || "textarea".equals(tagName)) {
608       nextState = CDATA_TEXT;
609       insideJavascript = false;
610     }
611     return nextState;
612   }
613 
614   /**
615    * Feeds the character to the javascript parser for processing.
616    *
617    * <p>Called inside CDATA blocks to parse javascript.
618    *
619    * @param input character read
620    * @throws ParseException if an unrecoverable error occurred during parsing
621    */
inStateCdata(char input)622   private void inStateCdata(char input) throws ParseException {
623     if (insideJavascript) {
624       jsParser.parse(input);
625     }
626   }
627 
  /**
   * Starts recording the potential closing tag name, so that it can later
   * be compared against the open tag to decide whether the tag is being
   * closed or not.
   *
   * <p>Called when entering the {@code CDATA_MAY_CLOSE} state, which the
   * state table reaches on a {@code '/'} that follows a {@code '<'}
   * character in a CDATA section.
   */
  private void enterStateCdataMayClose() {
    cdataCloseTag.startRecording();
  }
637 
638   /**
639    * Determines whether to close the tag element, It closes it if it finds
640    * the corresponding end tag. Called when reading what could be a
641    * closing CDATA tag.
642    *
643    * @param input the character read
644    * @param expectedNextState the expected state to go to next
645    *        unless we want to change it here
646    * @return the next state to go to
647    */
exitStateCdataMayClose( InternalState expectedNextState, char input)648   private InternalState exitStateCdataMayClose(
649       InternalState expectedNextState,
650       char input) {
651     InternalState nextState = expectedNextState;
652     cdataCloseTag.stopRecording();
653     String cdataCloseTagString = cdataCloseTag.getContent();
654     Preconditions.checkState(!cdataCloseTagString.isEmpty()
655         && cdataCloseTagString.charAt(0) == '/');  // Developer error.
656 
657     if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag())
658         && (input == '>' || HtmlUtils.isHtmlSpace(input))) {
659       tag.clear();
660       insideJavascript = false;
661     } else {
662       nextState = CDATA_TEXT;
663     }
664     return nextState;
665   }
666 
667 
668   // ======================================================= //
669   // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
670   // ======================================================= //
671 
  /**
   * Records in {@code STATE_MAPPING} that the given internal state is
   * reported to callers as the given external state.
   *
   * @param internalState the fine-grained internal state
   * @param externalState the external state it maps to
   */
  private static void registerMapping(InternalState internalState,
                                      ExternalState externalState) {
    STATE_MAPPING.put(internalState, externalState);
  }
676 
initializeStateMapping()677   private static void initializeStateMapping() {
678     // Each parser implementation must map the error state appropriately.
679     registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);
680 
681     registerMapping(TEXT, HtmlParser.STATE_TEXT);
682     registerMapping(TAG_START, HtmlParser.STATE_TAG);
683     registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
684     registerMapping(DECL_START, HtmlParser.STATE_TEXT);
685     registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
686     registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
687     registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
688     registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
689     registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
690     registerMapping(PI, HtmlParser.STATE_TEXT);
691     registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
692     registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
693     registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
694     registerMapping(ATTR, HtmlParser.STATE_ATTR);
695     registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
696     registerMapping(VALUE, HtmlParser.STATE_VALUE);
697     registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
698     registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
699     registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
700     registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
701     registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
702     registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
703     registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
704     registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
705     registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
706     registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
707     registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
708     registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
709     registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
710     registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
711     registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
712   }
713 
registerTransition(String expression, InternalState source, InternalState to)714   private static void registerTransition(String expression,
715                                          InternalState source,
716                                          InternalState to) {
717     // It seems to silly to go through a StateTableTransition here
718     // but it adds extra data checking.
719     StateTableTransition stt = new StateTableTransition(expression,
720                                                         source, to);
721     STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
722                               stt.getTo());
723   }
724 
725   // NOTE: The "[:default:]" transition should be registered before any
726   //   other transitions for a given state or it will over-write them.
initializeParserStateTable()727   private static void initializeParserStateTable() {
728     registerTransition("[:default:]", CSS_FILE, CSS_FILE);
729     registerTransition("[:default:]", JS_FILE, JS_FILE);
730     registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
731     registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
732     registerTransition(">", CDATA_MAY_CLOSE, TEXT);
733     registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
734     registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
735     registerTransition("!", CDATA_LT, CDATA_COM_START);
736     registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
737     registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
738     registerTransition("<", CDATA_TEXT, CDATA_LT);
739     registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
740     registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
741     registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
742     registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
743     registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
744     registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
745     registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
746     registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
747     registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
748     registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
749     registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
750     registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
751     registerTransition("\"", VALUE_DQ, TAG_SPACE);
752     registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
753     registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
754     registerTransition("[:default:]", VALUE_Q, VALUE_Q);
755     registerTransition("\'", VALUE_Q, TAG_SPACE);
756     registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
757     registerTransition("\'", VALUE_Q_START, TAG_SPACE);
758     registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
759     registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
760     registerTransition(">", VALUE_TEXT, TAG_CLOSE);
761     registerTransition("[:default:]", VALUE, VALUE_TEXT);
762     registerTransition(">", VALUE, TAG_CLOSE);
763     registerTransition(" \t\n\r", VALUE, VALUE);
764     registerTransition("\"", VALUE, VALUE_DQ_START);
765     registerTransition("\'", VALUE, VALUE_Q_START);
766     registerTransition("=", ATTR_SPACE, VALUE);
767     registerTransition("/", ATTR_SPACE, TAG_SPACE);
768     registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
769     registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
770     registerTransition(">", ATTR_SPACE, TAG_CLOSE);
771     registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
772     registerTransition("=", ATTR, VALUE);
773     registerTransition("/", ATTR, TAG_SPACE);
774     registerTransition(">", ATTR, TAG_CLOSE);
775     registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
776     registerTransition("[:default:]", TAG_CLOSE, TEXT);
777     registerTransition("<", TAG_CLOSE, TAG_START);
778     registerTransition("/", TAG_SPACE, TAG_SPACE);
779     registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
780     registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
781     registerTransition(">", TAG_SPACE, TAG_CLOSE);
782     registerTransition("[:default:]", PI_MAY_END, PI);
783     registerTransition(">", PI_MAY_END, TEXT);
784     registerTransition("[:default:]", PI, PI);
785     registerTransition("?", PI, PI_MAY_END);
786     registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
787     registerTransition(">", COM_DASH_DASH, TEXT);
788     registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
789     registerTransition("[:default:]", COM_DASH, COM_BODY);
790     registerTransition("-", COM_DASH, COM_DASH_DASH);
791     registerTransition("[:default:]", COM_BODY, COM_BODY);
792     registerTransition("-", COM_BODY, COM_DASH);
793     registerTransition("[:default:]", COM_OPEN, TEXT);
794     registerTransition("-", COM_OPEN, COM_BODY);
795     registerTransition("[:default:]", DECL_BODY, DECL_BODY);
796     registerTransition(">", DECL_BODY, TEXT);
797     registerTransition("[:default:]", DECL_START, DECL_BODY);
798     registerTransition(">", DECL_START, TEXT);
799     registerTransition("-", DECL_START, COM_OPEN);
800     registerTransition(">", TAG_NAME, TAG_CLOSE);
801     registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
802     registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);
803 
804     // Manual change to remain in-sync with CL 10597850 in C HtmlParser.
805     registerTransition("[:default:]", TAG_START, TEXT);
806     registerTransition("<", TAG_START, TAG_START);
807     // End of manual change.
808 
809     registerTransition("!", TAG_START, DECL_START);
810     registerTransition("?", TAG_START, PI);
811     registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
812     registerTransition("[:default:]", TEXT, TEXT);
813     registerTransition("<", TEXT, TAG_START);
814   }
815 }
816