1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.streamhtmlparser;
18 
import com.google.streamhtmlparser.impl.HtmlParserImpl;

import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
23 
24 /**
25  * A factory class to obtain instances of an {@link HtmlParser}.
26  * Currently each instance is a new object given these are fairly
27  * light-weight.
28  *
29  * <p>In the unlikely case that this class fails to initialize properly
30  * (a developer error), an error is emitted to the error console and the logs
31  * and the specialized parser creation methods will throw
32  * an {@link AssertionError} on all invokations.
33  */
34 public class HtmlParserFactory {
35 
36   private static final Logger logger =
37       Logger.getLogger(HtmlParserFactory.class.getName());
38 
39   /**
40    * To provide additional options when creating an {@code HtmlParser} using
41    * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE,
42    *        boolean, Set)}
43    */
44   public enum AttributeOptions {
45 
46     /**
47      * Indicates that the attribute value is Javascript-quoted. Only takes
48      * effect for Javascript-accepting attributes - as identified by
49      * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also
50      * HTML quoted.
51      */
52     JS_QUOTED,
53 
54     /**
55      * Indicates the attribute value is only a part of a URL as opposed to a
56      * full URL. In particular, the value is not at the start of a URL and
57      * hence does not necessitate validation of the URL scheme.
58      * Only valid for URI-accepting attributes - as identified by
59      * {@link HtmlParser.ATTR_TYPE#URI}.
60      */
61     URL_PARTIAL,
62   }
63 
64   /**
65    * To provide additional options when creating an {@code HtmlParser} using
66    * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)}
67    */
68   public enum ModeOptions {
69 
70     /**
71      * Indicates that the parser is inside a quoted {@code String}. Only
72      * valid in the {@link HtmlParser.Mode#JS} mode.
73      */
74     JS_QUOTED
75   }
76 
77   private static final HtmlParser parserInDefaultAttr = createParser();
78   private static final HtmlParser parserInDefaultAttrQ = createParser();
79   private static final HtmlParser parserInUriAttrComplete = createParser();
80   private static final HtmlParser parserInUriAttrQComplete = createParser();
81   private static final HtmlParser parserInUriAttrPartial = createParser();
82   private static final HtmlParser parserInUriAttrQPartial = createParser();
83   private static final HtmlParser parserInJsAttr = createParser();
84   private static final HtmlParser parserInJsAttrQ = createParser();
85   private static final HtmlParser parserInQJsAttr = createParser();
86   private static final HtmlParser parserInStyleAttr = createParser();
87   private static final HtmlParser parserInStyleAttrQ = createParser();
88   private static final HtmlParser parserInJsQ = createParser();
89 
90   /**
91    * Protects all the createParserXXX methods by throwing a run-time exception
92    * if this class failed to initialize properly.
93    */
94   private static boolean initSuccess = false;
95 
96   static {
97     try {
initializeParsers()98       initializeParsers();
99       initSuccess = true;
100     } catch (ParseException e) {
101       // Log a severe error and print it to stderr along with a stack trace.
102       String error = HtmlParserFactory.class.getName() +
103                      " Failed initialization: " + e.getMessage();
104       logger.severe(error);
105       System.err.println(error);
106       e.printStackTrace();
107     }
108   }
109 
110   // Static class.
HtmlParserFactory()111   private HtmlParserFactory() {
112   }  // COV_NF_LINE
113 
114   /**
115    * Returns an {@code HtmlParser} object ready to parse HTML input.
116    *
117    * @return an {@code HtmlParser} in the provided mode
118    */
createParser()119   public static HtmlParser createParser() {
120     return new HtmlParserImpl();
121   }
122 
123   /**
124    * Returns an {@code HtmlParser} object initialized with the
125    * requested Mode. Provide non {@code null} options to provide
126    * a more precise initialization with the desired Mode.
127    *
128    * @param mode the mode to reset the parser with
129    * @param options additional options or {@code null} for none
130    * @return an {@code HtmlParser} in the provided mode
131    * @throws AssertionError when this class failed to initialize
132    */
createParserInMode(HtmlParser.Mode mode, Set<ModeOptions> options)133   public static HtmlParser createParserInMode(HtmlParser.Mode mode,
134                                               Set<ModeOptions> options) {
135     requireInitialized();
136 
137     if (options != null && options.contains(ModeOptions.JS_QUOTED))
138       return createParser(parserInJsQ);
139 
140     // With no options given, this method is just a convenience wrapper for
141     // the two calls below.
142     HtmlParser parser = new HtmlParserImpl();
143     parser.resetMode(mode);
144     return parser;
145   }
146 
147   /**
148    * Returns an {@code HtmlParser} that is a copy of the one
149    * supplied. It holds the same internal state and hence can
150    * proceed with parsing in-lieu of the supplied parser.
151    *
152    * @param aHtmlParser a {@code HtmlParser} to copy from
153    * @return an {@code HtmlParser} that is a copy of the provided one
154    * @throws AssertionError when this class failed to initialize
155    */
createParser(HtmlParser aHtmlParser)156   public static HtmlParser createParser(HtmlParser aHtmlParser) {
157     requireInitialized();
158 
159     // Should never get a ClassCastException since there is only one
160     // implementation of the HtmlParser interface.
161     return new HtmlParserImpl((HtmlParserImpl) aHtmlParser);
162   }
163 
164   /**
165    * A very specialized {@code HtmlParser} accessor that returns a parser
166    * in a state where it expects to read the value of an attribute
167    * of an HTML tag. This is only useful when the parser has not seen a
168    * certain HTML tag and an attribute name and needs to continue parsing
169    * from a state as though it has.
170    *
171    * <p>For example, to create a parser in a state akin to that
172    * after the parser has parsed "&lt;a href=\"", invoke:
173    * <pre>
174    *   createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)}
175    * </pre>
176    *
177    * <p>You must provide the proper value of quoting or the parser
178    * will go into an unexpected state.
179    * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE}
180    * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state
181    * inside an HTML tag where it expects an attribute name not an attribute
182    * value. It becomes equivalent to a parser initialized in the
183    * {@code HTML_IN_TAG} mode.
184    *
185    * @param attrtype the attribute type which the parser should be in
186    * @param quoted whether the attribute value is enclosed in double quotes
187    * @param options additional options or {@code null} for none
188    * @return an {@code HtmlParser} initialized in the given attribute type
189    *         and quoting
190    * @throws AssertionError when this class failed to initialize
191    */
createParserInAttribute( HtmlParser.ATTR_TYPE attrtype, boolean quoted, Set<AttributeOptions> options)192   public static HtmlParser createParserInAttribute(
193       HtmlParser.ATTR_TYPE attrtype,
194       boolean quoted, Set<AttributeOptions> options) {
195     requireInitialized();
196 
197     HtmlParser parser;
198     switch (attrtype) {
199       case REGULAR:
200         parser = createParser(
201             quoted ? parserInDefaultAttrQ : parserInDefaultAttr);
202         break;
203       case URI:
204         if (options != null && options.contains(AttributeOptions.URL_PARTIAL))
205           parser = createParser(
206               quoted ? parserInUriAttrQPartial : parserInUriAttrPartial);
207         else
208           parser = createParser(
209               quoted ? parserInUriAttrQComplete : parserInUriAttrComplete);
210         break;
211       case JS:
212         // Note: We currently do not support the case of the value being
213         // inside a Javascript quoted string that is in an unquoted HTML
214         // attribute, such as <a href=bla onmouseover=alert('[VALUE')>.
215         // It would be simple to add but currently we assume Javascript
216         // quoted attribute values are always HTML quoted.
217         if (quoted) {
218           if (options != null && options.contains(AttributeOptions.JS_QUOTED))
219             parser = createParser(parserInQJsAttr);
220           else
221             parser = createParser(parserInJsAttrQ);
222         } else {
223           parser = createParser(parserInJsAttr);
224         }
225         break;
226       case STYLE:
227         parser = createParser(
228             quoted ? parserInStyleAttrQ : parserInStyleAttr);
229         break;
230       case NONE:
231         parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null);
232         break;
233       default:
234         throw new IllegalArgumentException(
235             "Did not recognize ATTR_TYPE given: " + attrtype);
236     }
237     return parser;
238   }
239 
240   /**
241    * Initializes a set of static parsers to be subsequently used
242    * by the various createParserXXX methods.
243    * The parsers are set to their proper states by making them parse
244    * an appropriate HTML input fragment. This approach is the most likely
245    * to ensure all their internal state is consistent.
246    *
247    * <p>In the very unexpected case of the parsing failing (developer error),
248    * this class will fail to initialize properly.
249    *
250    * <p>In addition:
251    * <ul>
252    * <li>The HTML tag is set to a fictitious name {@code xparsertag}.
253    * <li>The attribute name is chosen to match the required attribute type.
254    *     When several possibilities exist, one is chosen arbitrarily.
255    * <li>If quoting is required, a double quote is provided after the '='.
256    * </ul>
257    *
258    * @throws ParseException if parsing failed.
259    */
initializeParsers()260   private static void initializeParsers() throws ParseException {
261     parserInDefaultAttr.parse("<xparsertag htmlparser=");
262     parserInDefaultAttrQ.parse("<xparsertag htmlparser=\"");
263 
264     // Chosing the "src" attribute, one of several possible names here
265     parserInUriAttrComplete.parse("<xparsertag src=");
266     parserInUriAttrQComplete.parse("<xparsertag src=\"");
267 
268     // To support a parser that is initialized within a URL parameter
269     // rather than at the beginning of a URL. We use a fake domain
270     // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>)
271     // and a fake query parameter.
272     final String fakeUrlPrefix = "http://example.com/fakequeryparam=";
273     parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix);
274     parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix);
275 
276     // Using onmouse= which is a fictitious attribute name that the parser
277     // understands as being a valid javascript-enabled attribute. Chosing fake
278     // names may help during debugging.
279     parserInJsAttr.parse("<xparsertag onmouse=");
280     parserInJsAttrQ.parse("<xparsertag onmouse=\"");
281     // Single quote added as the Javascript is itself quoted.
282     parserInQJsAttr.parse("<xparsertag onmouse=\"'");
283 
284     // A parser in the Javascript context within a (single) quoted string.
285     parserInJsQ.resetMode(HtmlParser.Mode.JS);
286     parserInJsQ.parse("var fakeparservar='");
287 
288     // Chosing the "style" attribute as it is the only option
289     parserInStyleAttr.parse("<xparsertag style=");
290     parserInStyleAttrQ.parse("<xparsertag style=\"");
291   }
292 
293   /**
294    * Throws an {@link AssertionError} if the class was not initialized
295    * correctly, otherwise simply returns. This is to protect against the
296    * possibility the needed parsers were not created successfully during
297    * static initialized, which can only happen due to an error during
298    * development of this library.
299    *
300    * @throws AssertionError when this class failed to initialize
301    */
requireInitialized()302   private static void requireInitialized() {
303     if (!initSuccess)
304       throw new AssertionError("HtmlParserFactory failed initialization.");
305   }
306 }
307