1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser;
18 
19 /**
20  * Methods exposed for HTML parsing of text to facilitate implementation
21  * of Automatic context-aware escaping. The HTML parser also embeds a
22  * Javascript parser for processing Javascript fragments. In the future,
23  * it will also embed other specific parsers and hence most likely remain
24  * the main interface to callers of this package.
25  *
26  * <p>Note: These are the exact methods exposed in the original C++ Parser. The
27  * names are simply modified to conform to Java.
28  */
29 public interface HtmlParser extends Parser {
30 
31   /**
32    * The Parser Mode requested for parsing a given template.
33    * Currently we support:
34    * <ul>
35    * <li>{@code HTML} for HTML templates.
36    * <li>{@code JS} for javascript templates.
37    * <li>{@code CSS} for Cascading Style-Sheets templates.
38    * <li>{@code HTML_IN_TAG} for HTML templates that consist only of
39    *     HTML attribute name and value pairs. This is typically the case for
40    *     a template that is being included from a parent template where the
41    *     parent template contains the start and the closing of the HTML tag.
42    *     This is a special mode, for standard HTML templates please use
43    *     {@link #HTML}.
44    *     An example of such as template is:
45    *     <p><code>class="someClass" target="_blank"</code></p>
46    *     <p>Which could be included from a parent template that contains
47    *     an anchor tag, say:</p>
48    *     <p><code>&lt;a href="/bla" ["INCLUDED_TEMPLATE"]&gt;</code></p>
49    * </ul>
50    */
51   public enum Mode {
52     HTML,
53     JS,
54     CSS,
55     HTML_IN_TAG
56   }
57 
58   /**
59    * Indicates the type of HTML attribute that the parser is currently in or
60    * {@code NONE} if the parser is not currently in an attribute.
61    * {@code URI} is for attributes taking a URI such as "href" and "src".
62    * {@code JS} is for attributes taking javascript such as "onclick".
63    * {@code STYLE} is for the "style" attribute.
64    * All other attributes fall under {@code REGULAR}.
65    *
66    * Returned by {@link HtmlParser#getAttributeType()}
67    */
68   public enum ATTR_TYPE {
69     NONE,
70     REGULAR,
71     URI,
72     JS,
73     STYLE
74   }
75 
76   /**
77    * All the states in which the parser can be. These are external states.
78    * The parser has many more internal states that are not exposed and which
79    * are instead mapped to one of these external ones.
80    * {@code STATE_TEXT} the parser is in HTML proper.
81    * {@code STATE_TAG} the parser is inside an HTML tag name.
82    * {@code STATE_COMMENT} the parser is inside an HTML comment.
83    * {@code STATE_ATTR} the parser is inside an HTML attribute name.
84    * {@code STATE_VALUE} the parser is inside an HTML attribute value.
85    * {@code STATE_JS_FILE} the parser is inside javascript code.
86    * {@code STATE_CSS_FILE} the parser is inside CSS code.
87    *
88    * <p>All these states map exactly to those exposed in the C++ (original)
89    * version of the HtmlParser.
90    */
91   public final static ExternalState STATE_TEXT =
92       new ExternalState("STATE_TEXT");
93   public final static ExternalState STATE_TAG =
94       new ExternalState("STATE_TAG");
95   public final static ExternalState STATE_COMMENT =
96       new ExternalState("STATE_COMMENT");
97   public final static ExternalState STATE_ATTR =
98       new ExternalState("STATE_ATTR");
99   public final static ExternalState STATE_VALUE =
100       new ExternalState("STATE_VALUE");
101   public final static ExternalState STATE_JS_FILE =
102       new ExternalState("STATE_JS_FILE");
103   public final static ExternalState STATE_CSS_FILE =
104       new ExternalState("STATE_CSS_FILE");
105 
106   /**
107    * Returns {@code true} if the parser is currently processing Javascript.
108    * Such is the case if and only if, the parser is processing an attribute
109    * that takes Javascript, a Javascript script block or the parser
110    * is (re)set with {@link Mode#JS}.
111    *
112    * @return {@code true} if the parser is processing Javascript,
113    *         {@code false} otherwise
114    */
inJavascript()115   public boolean inJavascript();
116 
117   /**
118    * Returns {@code true} if the parser is currently processing
119    * a Javascript litteral that is quoted. The caller will typically
120    * invoke this method after determining that the parser is processing
121    * Javascript. Knowing whether the element is quoted or not helps
122    * determine which escaping to apply to it when needed.
123    *
124    * @return {@code true} if and only if the parser is inside a quoted
125    *         Javascript literal
126    */
isJavascriptQuoted()127   public boolean isJavascriptQuoted();
128 
129 
130   /**
131    * Returns {@code true} if and only if the parser is currently within
132    * an attribute, be it within the attribute name or the attribute value.
133    *
134    * @return {@code true} if and only if inside an attribute
135    */
inAttribute()136   public boolean inAttribute();
137 
138   /**
139    * Returns {@code true} if and only if the parser is currently within
140    * a CSS context. A CSS context is one of the below:
141    * <ul>
142    * <li>Inside a STYLE tag.
143    * <li>Inside a STYLE attribute.
144    * <li>Inside a CSS file when the parser was reset in the CSS mode.
145    * </ul>
146    *
147    * @return {@code true} if and only if the parser is inside CSS
148    */
inCss()149   public boolean inCss();
150 
151   /**
152    * Returns the type of the attribute that the parser is in
153    * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute.
154    * The caller will typically invoke this method after determining
155    * that the parser is processing an attribute.
156    *
157    * <p>This is useful to determine which escaping to apply based
158    * on the type of value this attribute expects.
159    *
160    * @return type of the attribute
161    * @see HtmlParser.ATTR_TYPE
162    */
getAttributeType()163   public ATTR_TYPE getAttributeType();
164 
165   /**
166    * Returns {@code true} if and only if the parser is currently within
167    * an attribute value and that attribute value is quoted.
168    *
169    * @return {@code true} if and only if the attribute value is quoted
170    */
isAttributeQuoted()171   public boolean isAttributeQuoted();
172 
173 
174   /**
175    * Returns the name of the HTML tag if the parser is currently within one.
176    * Note that the name may be incomplete if the parser is currently still
177    * parsing the name. Returns an empty {@code String} if the parser is not
178    * in a tag as determined by {@code getCurrentExternalState}.
179    *
180    * @return the name of the HTML tag or an empty {@code String} if we are
181    *         not within an HTML tag
182    */
getTag()183   public String getTag();
184 
185   /**
186    * Returns the name of the HTML attribute the parser is currently processing.
187    * If the parser is still parsing the name, then the returned name
188    * may be incomplete. Returns an empty {@code String} if the parser is not
189    * in an attribute as determined by {@code getCurrentExternalState}.
190    *
191    * @return the name of the HTML attribute or an empty {@code String}
192    *         if we are not within an HTML attribute
193    */
getAttribute()194   public String getAttribute();
195 
196   /**
197    * Returns the value of an HTML attribute if the parser is currently
198    * within one. If the parser is currently parsing the value, the returned
199    * value may be incomplete. The caller will typically first determine
200    * that the parser is processing a value by calling
201    * {@code getCurrentExternalState}.
202    *
203    * @return the value, could be an empty {@code String} if the parser is not
204    *         in an HTML attribute value
205    */
getValue()206   public String getValue();
207 
208   /**
209    * Returns the current position of the parser within the HTML attribute
210    * value, zero being the position of the first character in the value.
211    * The caller will typically first determine that the parser is
212    * processing a value by calling {@link #getState()}.
213    *
214    * @return the index or zero if the parser is not processing a value
215    */
getValueIndex()216   public int getValueIndex();
217 
218   /**
219    * Returns {@code true} if and only if the current position of the parser is
220    * at the start of a URL HTML attribute value. This is the case when the
221    * following three conditions are all met:
222    * <p>
223    * <ol>
224    * <li>The parser is in an HTML attribute value.
225    * <li>The HTML attribute expects a URL, as determined by
226    *     {@link #getAttributeType()} returning {@code .ATTR_TYPE#URI}.
227    * <li>The parser has not yet seen any characters from that URL.
228    * </ol>
229    *
230    * <p> This method may be used by an Html Sanitizer or an Auto-Escape system
231    * to determine whether to validate the URL for well-formedness and validate
232    * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe.
233    * In particular, it is recommended to use this method instead of
234    * checking that {@link #getValueIndex()} is {@code 0} to support attribute
235    * types where the URL does not start at index zero, such as the
236    * {@code content} attribute of the {@code meta} HTML tag.
237    *
238    * @return {@code true} if and only if the parser is at the start of the URL
239    */
isUrlStart()240   public boolean isUrlStart();
241 
242   /**
243    * Resets the state of the parser, allowing for reuse of the
244    * {@code HtmlParser} object.
245    *
246    * <p>See the {@link HtmlParser.Mode} enum for information on all
247    * the valid modes.
248    *
249    * @param mode is an enum representing the high-level state of the parser
250    */
resetMode(HtmlParser.Mode mode)251   public void resetMode(HtmlParser.Mode mode);
252 
253   /**
254    * A specialized directive to tell the parser there is some content
255    * that will be inserted here but that it will not get to parse. Used
256    * by the template system that may not be able to give some content
257    * to the parser but wants it to know there typically will be content
258    * inserted at that point. This is a hint used in corner cases within
259    * parsing of HTML attribute names and values where content we do not
260    * get to see could affect our parsing and alter our current state.
261    *
262    * <p>Returns {@code false} if and only if the parser encountered
263    * a fatal error which prevents it from continuing further parsing.
264    *
265    * <p>Note: The return value is different from the C++ Parser which
266    * always returns {@code true} but in my opinion makes more sense.
267    *
268    * @throws ParseException if an unrecoverable error occurred during parsing
269    */
insertText()270   public void insertText() throws ParseException;
271 
272   /**
273    * Returns the state the Javascript parser is in.
274    *
275    * <p>See {@link JavascriptParser} for more information on the valid
276    * external states. The caller will typically first determine that the
277    * parser is processing Javascript and then invoke this method to
278    * obtain more fine-grained state information.
279    *
280    * @return external state of the javascript parser
281    */
getJavascriptState()282   public ExternalState getJavascriptState();
283 }
284