1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser.util;
18 
19 import com.google.common.collect.ImmutableSortedSet;
20 
21 import java.util.Set;
22 import java.util.regex.Pattern;
23 import java.util.regex.Matcher;
24 
25 /**
26  * Utility functions for HTML and Javascript that are most likely
27  * not interesting to users outside this package.
28  *
29  * <p>The <code>HtmlParser</code> will be open-sourced hence we took the
30  * decision to keep these utilities in this package as well as not to
31  * leverage others that may exist in the <code>google3</code> code base.
32  *
 * <p>The functionality exposed is designed to be 100% compatible with
 * the corresponding logic in the C-version of the HtmlParser; as such,
 * we are particularly concerned with cross-language compatibility.
36  *
37  * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
38  * interchangeably unless otherwise noted.
39  */
40 public final class HtmlUtils {
41 
42   /**
43    * static utility class
44    */
HtmlUtils()45   private HtmlUtils() {
46   }  // COV_NF_LINE
47 
48   /**
49    * Indicates the type of content contained in the {@code content} HTML
50    * attribute of the {@code meta} HTML tag. Used by
51    * {@link HtmlUtils#parseContentAttributeForUrl(String)}.
52    * <p>The values are:
53    * <ul>
54    * <li>{@code NONE} if it does not contain a URL in the expected format.
55    * <li>{@code URL_START} if it contains a URL but hasn't seen any of
56    * its contents.
57    * <li>{@code URL} if it contains a URL and has seen at least some of
58    * its contents.
59    * </ul>
60    */
61   public enum META_REDIRECT_TYPE {
62     NONE,
63     URL_START,
64     URL
65   }
66 
67   /**
68    * A regular expression matching the format of a {@code content} attribute
69    * that contains a URL. Used by {@link #parseContentAttributeForUrl}.
70    */
71   private static final String META_REDIRECT_REGEX =
72       "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";
73 
74   // Safe for use by concurrent threads so we compile once.
75   private static final Pattern META_REDIRECT_PATTERN =
76       Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);
77 
78   /**
79    * Set of keywords that can precede a regular expression literal. Taken from:
80    * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
81    * Language Syntax</a>
82    *
83    * <p>The token {@code void} was added to the list. Several keywords are
84    * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
85    * simple we do not differentiate on the version and bundle them all together.
86    */
87   private static final Set<String> REGEXP_TOKEN_PREFIXS =
88       ImmutableSortedSet.of(
89           "abstract",
90           "break",
91           "case",
92           "catch",
93           "class",
94           "const",
95           "continue",
96           "debugger",
97           "default",
98           "delete",
99           "do",
100           "else",
101           "enum",
102           "eval",
103           "export",
104           "extends",
105           "field",
106           "final",
107           "finally",
108           "for",
109           "function",
110           "goto",
111           "if",
112           "implements",
113           "import",
114           "in",
115           "instanceof",
116           "native",
117           "new",
118           "package",
119           "private",
120           "protected",
121           "public",
122           "return",
123           "static",
124           "switch",
125           "synchronized",
126           "throw",
127           "throws",
128           "transient",
129           "try",
130           "typeof",
131           "var",
132           "void",
133           "volatile",
134           "while",
135           "with");
136 
137   /**
138    * Set of all HTML attributes which expect a URI (as the value).
139    * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
140    */
141   private static final Set<String> ATTRIBUTE_EXPECTS_URI =
142       ImmutableSortedSet.of(
143           "action",
144           "archive",
145           "background",
146           "cite",
147           "classid",
148           "codebase",
149           "data",
150           "dynsrc",
151           "href",
152           "longdesc",
153           "src",
154           "usemap");
155 
156   /**
157    * Set of {@code Character}s considered whitespace in Javascript.
158    * See {@link #isJavascriptWhitespace(char)}
159    */
160   private static final Set<Character> JAVASCRIPT_WHITESPACE =
161       ImmutableSortedSet.of(
162             '\u0009',         /* Tab \t */
163             '\n',             /* Line-Feed 0x0A */
164             '\u000B',         /* Vertical Tab 0x0B */
165             '\u000C',         /* Form Feed \f */
166             '\r',             /* Carriage Return 0x0D */
167             ' ',              /* Space 0x20 */
168             '\u00A0',         /* Non-breaking space 0xA0 */
169             '\u2028',         /* Line separator */
170             '\u2029');        /* Paragraph separator */
171 
172   /**
173   * Set of {@code Character}s considered whitespace in HTML.
174   * See {@link #isHtmlSpace(char)}
175   */
176  private static final Set<Character> HTML_WHITESPACE =
177       ImmutableSortedSet.of(
178           ' ',
179           '\t',
180           '\n',
181           '\r',
182           '\u200B');
183 
184 
185   /**
186    * Determines if the HTML attribute specified expects javascript
187    * for its value. Such is the case for example with the {@code onclick}
188    * attribute.
189    *
190    * <p>Currently returns {@code true} for any attribute name that starts
191    * with "on" which is not exactly correct but we trust a developer to
192    * not use non-spec compliant attribute names (e.g. onbogus).
193    *
194    * @param attribute the name of an HTML attribute
195    * @return {@code false} if the input is null or is not an attribute
196    *         that expects javascript code; {@code true}
197    */
isAttributeJavascript(String attribute)198   public static boolean isAttributeJavascript(String attribute) {
199     return ((attribute != null) && attribute.startsWith("on"));
200   }
201 
202   /**
203    * Determines if the HTML attribute specified expects a {@code style}
204    * for its value. Currently this is only true for the {@code style}
205    * HTML attribute.
206    *
207    * @param attribute the name of an HTML attribute
208    * @return {@code true} iff the attribute name is one that expects a
209    *     style for a value; otherwise {@code false}
210    */
isAttributeStyle(String attribute)211   public static boolean isAttributeStyle(String attribute) {
212     return "style".equals(attribute);
213   }
214 
215   /**
216    * Determines if the HTML attribute specified expects a {@code URI}
217    * for its value. For example, both {@code href} and {@code src}
218    * expect a {@code URI} but {@code style} does not. Returns
219    * {@code false} if the attribute given was {@code null}.
220    *
221    * @param attribute the name of an HTML attribute
222    * @return {@code true} if the attribute name is one that expects
223    *         a URI for a value; otherwise {@code null}
224    *
225    * @see #ATTRIBUTE_EXPECTS_URI
226    */
isAttributeUri(String attribute)227   public static boolean isAttributeUri(String attribute) {
228     return ATTRIBUTE_EXPECTS_URI.contains(attribute);
229   }
230 
231   /**
232    * Determines if the specified character is an HTML whitespace character.
233    * A character is an HTML whitespace character if and only if it is one
234    * of the characters below.
235    * <ul>
236    * <li>A <code>Space</code> character
237    * <li>A <code>Tab</code> character
238    * <li>A <code>Line feed</code> character
239    * <li>A <code>Carriage Return</code> character
240    * <li>A <code>Zero-Width Space</code> character
241    * </ul>
242    *
243    * Note: The list includes the zero-width space (<code>&amp;#x200B;</code>)
244    * which is not included in the C version.
245    *
246    * @param chr the {@code char} to check
247    * @return {@code true} if the character is an HTML whitespace character
248    *
249    * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
250    */
isHtmlSpace(char chr)251   public static boolean isHtmlSpace(char chr) {
252     return HTML_WHITESPACE.contains(chr);
253   }
254 
255   /**
256    * Determines if the specified character is an ECMAScript whitespace or line
257    * terminator character. A character is a whitespace or line terminator if
258    * and only if it is one of the characters below:
259    * <ul>
260    * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
261    *     <code>Form Feed</code>, <code>Space</code>,
262    *     <code>No-break space</code>)
263    * <li>A line terminator character (<code>Line Feed</code>,
264    *     <code>Carriage Return</code>, <code>Line separator</code>,
265    *     <code>Paragraph Separator</code>).
266    * </ul>
267    *
268    * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
269    * particular, this list is quite different from that in
270    * <code>Character.isWhitespace</code>.
271    * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
272    * ECMAScript Language Specification</a>
273    *
274    * @param chr the {@code char} to check
275    * @return {@code true} or {@code false}
276    *
277    */
isJavascriptWhitespace(char chr)278   public static boolean isJavascriptWhitespace(char chr) {
279     return JAVASCRIPT_WHITESPACE.contains(chr);
280   }
281 
282   /**
283    * Determines if the specified character is a valid character in an
284    * ECMAScript identifier. This determination is currently not exact,
285    * in particular:
286    * <ul>
287    * <li>It does not accept Unicode letters, only ASCII ones.
288    * <li>It does not distinguish between the first character of an identifier
289    *     (which cannot contain numbers) and subsequent characters.
290    * </li>
291    * </ul>
292    *
293    * We are considering leveraging <code>Character.isJavaIdentifierStart</code>
294    * and <code>Character.isJavaIdentifierPart</code> given that Java
295    * and Javascript follow similar identifier naming rules but we lose
296    * compatibility with the C-version.
297    *
298    * @param chr {@code char} to check
299    * @return {@code true} if the {@code chr} is a Javascript whitespace
300    *         character; otherwise {@code false}
301    */
isJavascriptIdentifier(char chr)302   public static boolean isJavascriptIdentifier(char chr) {
303     return ((chr >= 'a' && chr <= 'z')
304         || (chr >= 'A' && chr <= 'Z')
305         || (chr >= '0' && chr <= '9')
306         || chr == '_' || chr == '$');
307   }
308 
309   /**
310    * Determines if the input token provided is a valid token prefix to a
311    * javascript regular expression.  The token argument is compared against
312    * a {@code Set} of identifiers that can precede a regular expression in the
313    * javascript grammar, and returns {@code true} if the provided
314    * {@code String} is in that {@code Set}.
315    *
316    * @param input the {@code String} token to check
317    * @return {@code true} iff the token is a valid prefix of a regexp
318    */
isJavascriptRegexpPrefix(String input)319   public static boolean isJavascriptRegexpPrefix(String input) {
320     return REGEXP_TOKEN_PREFIXS.contains(input);
321   }
322 
323   /**
324    * Encodes the specified character using Ascii for convenient insertion into
325    * a single-quote enclosed {@code String}. Printable characters
326    * are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
327    * back-slash and single quote are all backslash-escaped. All other characters
328    * are returned hex-encoded.
329    *
330    * @param chr {@code char} to encode
331    * @return an Ascii-friendly encoding of the given {@code char}
332    */
encodeCharForAscii(char chr)333   public static String encodeCharForAscii(char chr) {
334     if (chr == '\'') {
335       return "\\'";
336     } else if (chr == '\\') {
337       return "\\\\";
338     } else if (chr >= 32 && chr <= 126) {
339       return String.format("%c", chr);
340     } else if (chr == '\n') {
341       return "\\n";
342     } else if (chr == '\r') {
343       return "\\r";
344     } else if (chr == '\t') {
345       return "\\t";
346     } else {
347       // Cannot apply a precision specifier for integral types. Specifying
348       // 0-padded hex-encoding with minimum width of two.
349       return String.format("\\u%04x", (int)chr);
350     }
351   }
352 
353   /**
354    * Parses the given {@code String} to determine if it contains a URL in the
355    * format followed by the {@code content} attribute of the {@code meta}
356    * HTML tag.
357    *
358    * <p>This function expects to receive the value of the {@code content} HTML
359    * attribute. This attribute takes on different meanings depending on the
360    * value of the {@code http-equiv} HTML attribute of the same {@code meta}
361    * tag. Since we may not have access to the {@code http-equiv} attribute,
362    * we instead rely on parsing the given value to determine if it contains
363    * a URL.
364    *
365    * The specification of the {@code meta} HTML tag can be found in:
366    *   http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
367    *
368    * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
369    * value contains a URL and whether we are at the start of the URL or past
370    * the start. We are at the start of the URL if and only if one of the two
371    * conditions below is true:
372    * <ul>
373    * <li>The given input does not contain any characters from the URL proper.
374    * Example "5; URL=".
375    * <li>The given input only contains the optional leading single or double
376    * quote leading the URL. Example "5; URL='".
377    * </li>
378    * </ul>
379    *
380    * <p>Examples:
381    * <ul>
382    * <li> Example of a complete {@code meta} tag where the {@code content}
383    * attribute contains a URL [we are not at the start of the URL]:
384    * <pre>
385    * &lt;meta http-equiv="refresh" content="5; URL=http://www.google.com"&gt;
386    * </pre>
387    * <li> Example of a complete {@code meta} tag where the {@code content}
388    * attribute contains a URL [we are at the start of the URL]:
389    * <pre>
390    * &lt;meta http-equiv="refresh" content="5; URL="&gt;
391    * </pre>
392    * <li>Example of a complete {@code meta} tag where the {@code content}
393    * attribute does not contain a URL:
394    * <pre>
395    * &lt;meta http-equiv="content-type" content="text/html"&gt;
396    * </pre>
397    * </ul>
398    *
399    * @param value {@code String} to parse
400    * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
401    * of a URL in the given value
402    */
parseContentAttributeForUrl(String value)403   public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
404     if (value == null)
405       return META_REDIRECT_TYPE.NONE;
406 
407     Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
408     if (!matcher.find())
409       return META_REDIRECT_TYPE.NONE;
410 
411     // We have more content.
412     if (value.length() > matcher.end())
413       return META_REDIRECT_TYPE.URL;
414 
415     return META_REDIRECT_TYPE.URL_START;
416   }
417 }
418