1 /**
2  * Copyright (c) 2004, Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.google.android.mail.common.html.parser;
17 
import com.google.android.mail.common.base.CharEscapers;
import com.google.android.mail.common.base.CharMatcher;
import com.google.android.mail.common.base.Preconditions;
import com.google.android.mail.common.base.StringUtil;
import com.google.android.mail.common.base.X;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.ByteStreams;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
35 
36 /**
37  * HtmlParser is a simple but efficient html parser.
 * - It's simple because it does not do incremental parsing like some other
 * parsers. It assumes that the entire HTML text is available.
40  * - It offers 3 levels of aggressiveness in correcting errors in HTML (see
41  * HtmlParser.ParseStyle).
42  * - HTML comments are ignored unless initialized with ParseStyle.PRESERVE_ALL.
43  */
44 public class HtmlParser {
45 
  // States of the scanner. The main loop in parse() dispatches on the current
  // state to the matching scan method (scanText, scanTag, scanComment or
  // scanCDATA).
  private enum State {
    IN_TEXT, IN_TAG, IN_COMMENT, IN_CDATA
  }
50 
  // The current state. Reset to IN_TEXT at the start of every parse().
  private State state;

  // Position limit used for clipping; see setClipLength(). The initial value
  // of Integer.MAX_VALUE effectively disables clipping.
  private int clipLength = Integer.MAX_VALUE;
  // Clipping flag of the most recent parse(); exposed through isClipped().
  private boolean clipped;

  // The html text. Only set while parse() is running; parse() resets it to
  // null before returning.
  private String html;

  // The entire array of nodes. Only set while parse() is running; parse()
  // resets it to null before returning.
  private List<HtmlDocument.Node> nodes;

  // Turn on for debug information.
  private static boolean DEBUG = false;

  // Default whitelist
  public static final HtmlWhitelist DEFAULT_WHITELIST = HTML4.getWhitelist();

  // Whitelists for looking up accepted HTML tags and attributes. Consulted
  // from the last entry to the first by lookupElement() and lookupAttribute().
  private List<HtmlWhitelist> whitelists = Lists.newArrayList(DEFAULT_WHITELIST);
71 
  /**
   * This setting controls how much of the original HTML is preserved.  In
   * ascending order of aggressiveness:
   * - PRESERVE_ALL: Preserves all of original content.
   * *** Warning - PRESERVE_ALL mode preserves invalid and unsafe HTML. ***
   * - PRESERVE_VALID: Preserves only valid visible HTML handled by
   * most browsers.  Discards comments, unknown tags and attributes, and
   * nameless end tags.  Encodes all '<' characters that aren't part of a tag.
   * - NORMALIZE: In addition to the changes made by PRESERVE_VALID, also
   *   - unescapes then reescapes text to normalize entities
   *   - normalizes whitespace and quotes around tag attributes
   *
   * Note that the constants are declared from most to least aggressive, i.e.
   * in the reverse of the order in which they are described above. NORMALIZE
   * is the style used by the no-argument constructor.
   */
  public enum ParseStyle { NORMALIZE, PRESERVE_VALID, PRESERVE_ALL }
85 
  /**
   * True only in PRESERVE_ALL mode. Assigned once, in the constructor.
   * @see HtmlParser.ParseStyle
   */
  private final boolean preserveAll;

  /**
   * True in either PRESERVE_ALL or PRESERVE_VALID mode. Assigned once, in the
   * constructor; false therefore means NORMALIZE mode.
   * @see HtmlParser.ParseStyle
   */
  private final boolean preserveValidHtml;
97 
  /**
   * Creates a parser that uses {@link ParseStyle#NORMALIZE}.
   *
   * @see HtmlParser#HtmlParser(HtmlParser.ParseStyle)
   */
  public HtmlParser() {
    // Delegate so that the mode flags are derived in exactly one place.
    this(ParseStyle.NORMALIZE);
  }
104 
105   /**
106    * @param parseStyle Level of aggressiveness for how different
107    * toHTML()/toXHTML() are from original.
108    * @see HtmlParser.ParseStyle
109    */
HtmlParser(ParseStyle parseStyle)110   public HtmlParser(ParseStyle parseStyle) {
111     preserveAll = (parseStyle == ParseStyle.PRESERVE_ALL);
112     preserveValidHtml = preserveAll || (parseStyle == ParseStyle.PRESERVE_VALID);
113   }
114 
115   /**
116    * Sets the maximum length, in characters, of an HTML message.
117    *
118    * @param clipLength must be greater than zero.
119    * (It starts as Integer.MAX_VALUE)
120    */
setClipLength(int clipLength)121   public void setClipLength(int clipLength) {
122     if (clipLength <= 0) {
123       throw new IllegalArgumentException(
124         "clipLength '" + clipLength + "' <= 0");
125     }
126     this.clipLength = clipLength;
127   }
128 
  /**
   * Returns the clipping flag recorded by the most recent call to
   * {@link #parse}; see that method for how the flag is computed. The flag is
   * false until the first parse.
   *
   * @return the clipping flag of the last parse
   */
  public boolean isClipped() {
    return clipped;
  }
132 
  /**
   * Sets the HTML whitelist. Calling this overrides any whitelist(s) that
   * the parser is configured to use. By default, the parser uses the standard
   * HTML4 whitelist.
   *
   * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode.
   *
   * @param whitelist The whitelist to use. Must not be null.
   */
  public void setWhitelist(HtmlWhitelist whitelist) {
    // Rejects null up front (presumably by throwing NullPointerException, as
    // Preconditions.checkNotNull conventionally does - TODO confirm).
    Preconditions.checkNotNull(whitelist);
    // Replace the whole list rather than appending, so that previously
    // configured whitelists (including the default one) are dropped.
    whitelists = Lists.newArrayList(whitelist);
  }
146 
147   /**
148    * Adds an HTML whitelist to the list of whitelists consulted when
149    * processing an element or attribute. By default, the parser only uses
150    * the standard HTML4 whitelist.
151    *
152    * Whitelists are consulted in reverse chronological order (starting from
153    * the most recently added whitelist). The last whitelist consulted will
154    * always be the standard HTML4 whitelist, unless this was overridden by
155    * a call to {@link #setWhitelist}.
156    *
157    * This has no effect in <code>ParseStyle.PRESERVE_ALL</code> mode.
158    *
159    * @param whitelist The whitelist to use.
160    */
addWhitelist(HtmlWhitelist whitelist)161   public void addWhitelist(HtmlWhitelist whitelist) {
162     whitelists.add(whitelist);
163   }
164 
  /**
   * These are characters that we don't want to allow unquoted in an attribute
   * value because they might be interpreted by the browser as HTML control
   * characters. They are the 5 characters that are escaped by
   * com.google.common.base.CharEscapers.HTML_ESCAPE, plus '=' and whitespace.
   * Note that it shouldn't be possible for '>' or whitespace to be parsed as
   * part of an unquoted attribute value, but we leave them here for
   * completeness.
   *
   * Concretely, the pattern matches a single occurrence of any of: double
   * quote, single quote, '&amp;', '&lt;', '&gt;', '=' or a whitespace
   * character.
   *
   * Package visibility for unit tests.
   */
  static Pattern NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX = Pattern.compile("[\"\'&<>=\\s]");
176 
177   //------------------------------------------------------------------------
178   // Parsing
179   //------------------------------------------------------------------------
180 
181   /**
182    * Parses a String as HTML.
183    *
184    * @param html String to parse
185    * @return an Html document
186    */
parse(String html)187   public HtmlDocument parse(String html) {
188     this.html = html;
189     // Use a LinkedList because we don't know the number of nodes ahead of
190     // time. This will be compacted into an ArrayList in coalesceTextNodes().
191     nodes = Lists.newLinkedList();
192     state = State.IN_TEXT;
193 
194     clipped = false;
195     int end = html.length();
196     int clipEnd = Math.min(clipLength, end);
197 
198     for (int i = 0; i < end && !clipped;) {
199 
200       // At any one time, the parser is in one of these states:
201       int pos;
202       switch (state) {
203         case IN_TEXT:
204           // text will not attempt to parse beyond the clipping length
205           pos = scanText(i, clipEnd);
206           X.assertTrue(pos > i || state != State.IN_TEXT); // Must make progress.
207           break;
208 
209         case IN_TAG:
210           pos = scanTag(i, end);
211           X.assertTrue(pos > i);        // Must make progress
212           break;
213 
214         case IN_COMMENT:
215           pos = scanComment(i, end);
216           state = State.IN_TEXT;
217           X.assertTrue(pos > i);        // Must make progress
218           break;
219 
220         case IN_CDATA:
221           pos = scanCDATA(i, end);
222           X.assertTrue(pos > i || state != State.IN_CDATA); // Must make progress
223           break;
224 
225         default:
226           throw new Error("Unknown state!");
227       }
228 
229       i = pos;
230 
231       // If we've reached or gone beyond the clipping length, stop.
232       clipped = pos >= clipLength;
233     }
234 
235     nodes = coalesceTextNodes(nodes);
236 
237     HtmlDocument doc = new HtmlDocument(nodes);
238     nodes = null;
239     html = null;
240     return doc;
241   }
242 
243   /**
244    * During the course of parsing, we may have multiple adjacent Text nodes,
245    * due to the sanitizer stripping out nodes between Text nodes. It is
246    * important to coalesce them so that later steps in the pipeline can
247    * treat the text as a single block (e.g. the step that inserts <wbr> tags).
248    * @param nodes Original nodes.
249    * @return Nodes with text nodes changed.
250    */
coalesceTextNodes( List<HtmlDocument.Node> nodes)251   static List<HtmlDocument.Node> coalesceTextNodes(
252       List<HtmlDocument.Node> nodes) {
253     List<HtmlDocument.Node> out =
254         new ArrayList<HtmlDocument.Node>(nodes.size());
255     LinkedList<HtmlDocument.Text> textNodes = Lists.newLinkedList();
256 
257     for (HtmlDocument.Node node : nodes) {
258       if (node instanceof HtmlDocument.Text) {
259         textNodes.add((HtmlDocument.Text) node);
260       } else {
261         mergeTextNodes(textNodes, out);
262         out.add(node);
263       }
264     }
265     mergeTextNodes(textNodes, out);
266     return out;
267   }
268 
269   /**
270    * Flushes any Text nodes in {@code textNodes} into a single Text node
271    * in {@code output}. {@code textNodes} is guaranteed to be empty when
272    * the function returns.
273    * @param textNodes Text nodes.
274    * @param output Destination to which results are added.
275    */
mergeTextNodes(LinkedList<HtmlDocument.Text> textNodes, List<HtmlDocument.Node> output)276   private static void mergeTextNodes(LinkedList<HtmlDocument.Text> textNodes,
277                                      List<HtmlDocument.Node> output) {
278     if (!textNodes.isEmpty()) {
279       if (textNodes.size() == 1) {
280         output.add(textNodes.removeFirst());
281       } else {
282         int combinedTextLen = 0;
283         int combinedInputLen = 0;
284         for (HtmlDocument.Text text : textNodes) {
285           combinedTextLen += text.getText().length();
286           if (text.getOriginalHTML() != null) {
287             combinedInputLen += text.getOriginalHTML().length();
288           }
289         }
290         StringBuilder combinedText = new StringBuilder(combinedTextLen);
291         StringBuilder combinedInput = new StringBuilder(combinedInputLen);
292         while (!textNodes.isEmpty()) {
293           HtmlDocument.Text text = textNodes.removeFirst();
294           combinedText.append(text.getText());
295           if (text.getOriginalHTML() != null) {
296             combinedInput.append(text.getOriginalHTML());
297           }
298         }
299         String originalInput = combinedInputLen > 0 ? combinedInput.toString() : null;
300         output.add(HtmlDocument.createText(combinedText.toString(), originalInput));
301       }
302     }
303   }
304 
305   //------------------------------------------------------------------------
306   // Text scanning
307   //------------------------------------------------------------------------
  /**
   * A truncated entity is something like <pre>&nbs or &#1a3</pre>.
   * We only want to find these at the end of a clipped text.
   *
   * The pattern matches an ampersand, an optional '#', and up to eight ASCII
   * letters or digits, anchored at the end of the input. It is compiled with
   * Pattern.COMMENTS, so the spaces inside the pattern are ignored; the '#'
   * has to be escaped because an unescaped '#' would start a comment in that
   * mode.
   */
  private static final Pattern TRUNCATED_ENTITY =
    Pattern.compile("\\& \\#? [0-9a-zA-Z]{0,8} $", Pattern.COMMENTS);
314 
315   /**
316    * In a text mode, scan for a tag
317    * @param start Position in original html.
318    * @param end Position in original html.
319    * @return End position of scanned content.
320    */
scanText(final int start, final int end)321   int scanText(final int start, final int end) {
322     int pos;
323     for (pos = start; pos < end; pos++) {
324       char ch = html.charAt(pos);
325       if (ch == '<' && pos + 1 < end) {
326         // Check the next char
327         ch = html.charAt(pos + 1);
328         if (ch == '/' || Character.isLetter(ch) || ch == '!' || ch == '?') {
329 
330           // Check if it's an html comment or tag
331           if (html.regionMatches(pos + 1, "!--", 0, 3)) {
332             state = State.IN_COMMENT;
333           } else {
334             state = State.IN_TAG;
335           }
336           break;
337         }
338       }
339     }
340 
341     if (pos > start) {
342       int finalPos = pos;
343       String htmlTail = this.html.substring(start, finalPos);
344 
345       if ((pos == clipLength) && (clipLength < html.length())) {
346         // We're clipping this HTML, not running off the end.
347         // If we're ending with what looks like a truncated entity,
348         // then clip that part off, too.
349         // If it really was a truncated entity, great.
350         // If it was a false positive, the user won't notice that we clipped
351         // an additional handful of characters.
352         Matcher matcher = TRUNCATED_ENTITY.matcher(htmlTail);
353         if (matcher.find()) {
354           int matchStart = matcher.start();
355           // The matcher matched in htmlTail, not html.
356           // htmlTail starts at html[start]
357           finalPos = start + matchStart;
358           htmlTail = htmlTail.substring(0, matchStart);
359         }
360       }
361 
362       if (finalPos > start) {
363         String originalHtml = null;
364         if (preserveAll) {
365           originalHtml = htmlTail;
366         } else if (preserveValidHtml) {
367           // the only way htmlTail can start with '<' is if it's the last character
368           // in html; otherwise, we would have entered State.IN_TAG or
369           // State.IN_COMMENT above
370 
371           // officially a '<' can be valid in a text node, but to be safe we
372           // always escape them
373           originalHtml = CharMatcher.is('<').replaceFrom(htmlTail, "&lt;");
374         }
375 
376         HtmlDocument.Text textnode = HtmlDocument.createEscapedText(htmlTail, originalHtml);
377         nodes.add(textnode);
378       }
379     }
380     return pos;
381   }
382 
383   //------------------------------------------------------------------------
384   // Tag name scanning utility class
385   //------------------------------------------------------------------------
386   private static class TagNameScanner {
387     private final String html;
388     private String tagName;
389     private int startNamePos = -1;
390     private int endNamePos = -1;
391 
TagNameScanner(String html)392     public TagNameScanner(String html) {
393       this.html = html;
394     }
395 
396     /**
397      * Scans for a tag name. Sets #startNamePos and #endNamePos.
398      * @param start Position in original html.
399      * @param end Position in original html.
400      * @return End position of scanned content.
401      */
scanName(final int start, final int end)402     public int scanName(final int start, final int end) {
403       int pos;
404       for (pos = start; pos < end; pos++) {
405         char ch = html.charAt(pos);
406 
407         // End of tag or end of name.
408         if ((ch == '>') || (ch == '/') || Character.isWhitespace(ch)) {
409           break;
410         }
411       }
412       if (pos > start) {
413         startNamePos = start;
414         endNamePos = pos;
415       }
416       return pos;
417     }
418 
419     /**
420      * @return Tag name.
421      */
getTagName()422     public String getTagName() {
423       if (tagName == null && startNamePos != -1 && endNamePos != -1) {
424         tagName = html.substring(startNamePos, endNamePos);
425       }
426       return tagName;
427     }
428   }
429 
430   //------------------------------------------------------------------------
431   // Attribute scanning utility class
432   //------------------------------------------------------------------------
433   private static class AttributeScanner {
434     private final String html;
435     private String name;
436     private String value;
437 
438     // The following have package visibility because they are accessed from
439     // HtmlParser.addAttribute() to handle preservation of original content
440     // around the attribute value, but quoting and escaping of the value itself.
441     int startNamePos = -1;
442     int endNamePos = -1;
443     int startValuePos = -1;
444     int endValuePos = -1;
445     boolean attrValueIsQuoted = false;
446 
AttributeScanner(String html)447     public AttributeScanner(String html) {
448       this.html = html;
449     }
450 
451     /**
452      * Reset to scan another attribute.
453      */
reset()454     public void reset() {
455       startNamePos = -1;
456       endNamePos = -1;
457       startValuePos = -1;
458       endValuePos = -1;
459       attrValueIsQuoted = false;
460       name = null;
461       value = null;
462     }
463 
464     /**
465      * Scans for a tag attribute name. Sets startNamePos and endNamePos. Sets
466      * 'attrName'.
467      *
468      * @param start Position in original html
469      * @param end Position in original html
470      * @return End position of scanned content
471      */
scanName(final int start, final int end)472     int scanName(final int start, final int end) {
473       X.assertTrue(html.charAt(start) != '>');
474       if (start == end) {
475         // No attribute name
476         return start;
477       }
478 
479       int pos;
480       for (pos = start + 1; pos < end; pos++) {
481         char ch = html.charAt(pos);
482 
483         // End of tag or end of name.
484         if ((ch == '>') || (ch == '=') || (ch == '/') || Character.isWhitespace(ch)) {
485           break;
486         }
487       }
488       startNamePos = start;
489       endNamePos = pos;
490       return pos;
491     }
492 
493     /**
494      * Scans for a tag attribute value. Sets startValuePos_ and endValuePos_.
495      *
496      * @param start Position in original html
497      * @param end Position in original html
498      * @return End position of scanned content
499      */
scanValue(final int start, final int end)500     int scanValue(final int start, final int end) {
501       // Skip whitespace before '='.
502       int pos = skipSpaces(start, end);
503 
504       // Handle cases with no attribute value.
505       if ((pos == end) || (html.charAt(pos) != '=')) {
506         // Return start so spaces will be parsed as part of next attribute,
507         // or end of tag.
508         return start;
509       }
510 
511       // Skip '=' and whitespace after it.
512       pos++;
513       pos = skipSpaces(pos, end);
514 
515       // Handle cases with no attribute value.
516       if (pos == end) {
517         return pos;
518       }
519 
520       // Check for quote character ' or "
521       char ch = html.charAt(pos);
522       if (ch == '\'' || ch == '\"') {
523         attrValueIsQuoted = true;
524         pos++;
525         int valueStart = pos;
526         while (pos < end && html.charAt(pos) != ch) {
527           pos++;
528         }
529         startValuePos = valueStart;
530         endValuePos = pos;
531         if (pos < end) {
532           pos++;                        // Skip the ending quote char
533         }
534       } else {
535         int valueStart = pos;
536         for (; pos < end; pos++) {
537           ch = html.charAt(pos);
538 
539           // End of tag or end of value. Not that '/' is included in the value
540           // even if it is the '/>' at the end of the tag.
541           if ((ch == '>') || Character.isWhitespace(ch)) {
542             break;
543           }
544         }
545         startValuePos = valueStart;
546         endValuePos = pos;
547       }
548 
549       X.assertTrue(startValuePos > -1);
550       X.assertTrue(endValuePos > -1);
551       X.assertTrue(startValuePos <= endValuePos);
552       X.assertTrue(pos <= end);
553 
554       return pos;
555     }
556 
557     /**
558      * Skips white spaces.
559      *
560      * @param start Position in original html
561      * @param end Position in original html
562      * @return End position of scanned content
563      */
skipSpaces(final int start, final int end)564     private int skipSpaces(final int start, final int end) {
565       int pos;
566       for (pos = start; pos < end; pos++) {
567         if (!Character.isWhitespace(html.charAt(pos))) {
568           break;
569         }
570       }
571       return pos;
572     }
573 
getName()574     public String getName() {
575       if (name == null && startNamePos != -1 && endNamePos != -1) {
576         name = html.substring(startNamePos, endNamePos);
577       }
578       return name;
579     }
580 
getValue()581     public String getValue() {
582       if (value == null && startValuePos != -1 && endValuePos != -1) {
583         value = html.substring(startValuePos, endValuePos);
584       }
585       return value;
586     }
587   }
588 
  /**
   * Holds any unrecognized elements we encounter.  Only applicable in
   * PRESERVE_ALL mode. Keyed by the lower-cased element name; filled on
   * demand by lookupUnknownElement().
   */
  private final HashMap<String,HTML.Element> unknownElements = Maps.newHashMap();

  /**
   * Holds any unrecognized attributes we encounter.  Only applicable in
   * PRESERVE_ALL mode. Keyed by the lower-cased attribute name; filled on
   * demand by lookupUnknownAttribute().
   */
  private final HashMap<String,HTML.Attribute> unknownAttributes = Maps.newHashMap();
600 
601   /**
602    * @param name Element name.
603    * @return "Dummy" element.  Not useful for any real HTML processing, but
604    * gives us a placeholder for tracking original HTML contents.
605    */
lookupUnknownElement(String name)606   private HTML.Element lookupUnknownElement(String name) {
607     name = name.toLowerCase();
608     HTML.Element result = unknownElements.get(name);
609     if (result == null) {
610       result = new HTML.Element(name,
611           HTML.Element.NO_TYPE,
612           /* empty */ false,
613           /* optional end tag */ true,
614           /* breaks flow*/ false,
615           HTML.Element.Flow.NONE);
616       unknownElements.put(name, result);
617     }
618     return result;
619   }
620 
621   /**
622    * @param name Attribute name.
623    * @return "Dummy" attribute. Not useful for any real HTML processing, but
624    *         gives us a placeholder for tracking original HTML contents.
625    */
lookupUnknownAttribute(String name)626   private HTML.Attribute lookupUnknownAttribute(String name) {
627     name = name.toLowerCase();
628     HTML.Attribute result = unknownAttributes.get(name);
629     if (result == null) {
630       result = new HTML.Attribute(name, HTML.Attribute.NO_TYPE);
631       unknownAttributes.put(name, result);
632     }
633     return result;
634   }
635 
  /**
   * Scans for an HTML tag.
   *
   * Expects {@code start} to be the position of a '<'. Reads the tag name and
   * then the attributes, and appends the resulting start or end tag to the
   * node list if the element is known to a whitelist (or if unknown elements
   * are being preserved in PRESERVE_ALL mode); otherwise the tag is consumed
   * without producing a node.
   *
   * The parser state is set to IN_TEXT, except after the start tag of a
   * SCRIPT or STYLE element, where it is set to IN_CDATA.
   *
   * Special cases:
   * - A start tag without a name is not a tag: a text node containing "<" is
   *   added and the position right after the '<' is returned.
   * - An end tag that contains a '<' is terminated at that '<', which is
   *   left unconsumed.
   * - If no closing '>' is found before {@code end}, everything from
   *   {@code start} to {@code end} is added as text and {@code end} is
   *   returned.
   *
   * @param start Position in original html.
   * @param end Position in original html.
   * @return End position of scanned content.
   */
  int scanTag(final int start, final int end) {
    X.assertTrue(html.charAt(start) == '<');

    // nameStart is where we begin scanning for the tag name and attributes,
    // so we skip '<'.
    int nameStart = start + 1;

    // Next state is Text, except the case when we see a STYLE/SCRIPT tag. See
    // code below.
    state = State.IN_TEXT;

    // End tag?
    boolean isEndTag = false;
    if (html.charAt(nameStart) == '/') {
      isEndTag = true;
      ++nameStart;
    }

    // Tag name and element
    TagNameScanner tagNameScanner = new TagNameScanner(html);
    int pos = tagNameScanner.scanName(nameStart, end);
    String tagName = tagNameScanner.getTagName();
    // Stays null when the tag is to be dropped.
    HTML.Element element = null;
    if (tagName == null) {
      // For some reason, browsers treat start and end tags differently
      // when they don't have a valid tag name - end tags are swallowed
      // (e.g., "</ >"), start tags treated as text (e.g., "< >")
      if (!isEndTag) {
        // This is not really a tag, treat the '<' as text.
        HtmlDocument.Text text = HtmlDocument.createText("<", preserveAll ? "<" : null);
        nodes.add(text);
        state = State.IN_TEXT;
        return nameStart;
      }

      // Nameless end tag: only kept (under an empty name) when preserving
      // everything.
      if (preserveAll) {
        element = lookupUnknownElement("");
      }
    } else {
      element = lookupElement(tagName);
      if (element == null) {
        if (DEBUG) {
          // Unknown element
          debug("Unknown element: " + tagName);
        }
        if (preserveAll) {
          element = lookupUnknownElement(tagName);
        }
      }
    }

    // Attributes
    boolean isSingleTag = false;
    // Created lazily, so a tag without attributes passes null along.
    ArrayList<HtmlDocument.TagAttribute> attributes = null;
    // First position after the tag name.
    int allAttributesStartPos = pos;
    // First position after the most recently scanned attribute; the original
    // text between this and the next attribute's end is handed to
    // addAttribute().
    int nextAttributeStartPos = pos;
    AttributeScanner attributeScanner = new AttributeScanner(html);
    while (pos < end) {
      int startPos = pos;
      char ch = html.charAt(pos);

      // Are we at the end of the tag?
      if ((pos + 1 < end) && (ch == '/') && (html.charAt(pos + 1) == '>')) {
        isSingleTag = true;
        // Step over the '/', leaving pos on the '>'.
        ++pos;
        break;                          // Done
      }
      if (ch == '>') {
        break;                          // Done
      }

      // See bug 870742 (Buganizer).
      if (isEndTag && ('<' == ch)) {
        // '<' not allowed in end tag, so we finish processing this tag and
        // return to State.IN_TEXT. We mimic Safari & Firefox, which both
        // terminate the tag when it contains a '<'.
        if (element != null) {
          addEndTag(element, start, allAttributesStartPos, pos);
        }
        state = State.IN_TEXT;
        return pos;
      }

      if (Character.isWhitespace(ch)) {
        // White space, skip it.
        ++pos;
      } else {
        // Scan for attribute
        attributeScanner.reset();
        pos = attributeScanner.scanName(pos, end);
        X.assertTrue(pos > startPos);

        // If it's a valid attribute, scan attribute values
        if (attributeScanner.getName() != null) {
          pos = attributeScanner.scanValue(pos, end);

          // Add the attribute to the list
          if (element != null) {
            if (attributes == null) {
              attributes = new ArrayList<HtmlDocument.TagAttribute>();
            }
            addAttribute(attributes, attributeScanner, nextAttributeStartPos, pos);
          }
          nextAttributeStartPos = pos;
        }
      }

      // Make sure that we make progress!
      X.assertTrue(pos > startPos);
    }

    // Cannot find the close tag, so we treat this as text
    if (pos == end) {
      X.assertTrue(start < end);
      String textNodeContent = html.substring(start, end);
      String originalContent = null;
      if (preserveAll) {
        originalContent = textNodeContent;
      } else if (preserveValidHtml) {
        // Officially a '<' can be valid in a text node, but to be safe we
        // always escape them.
        originalContent =
            CharMatcher.is('<').replaceFrom(html.substring(start, end), "&lt;");
      }
      nodes.add(HtmlDocument.createEscapedText(textNodeContent, originalContent));
      return end;
    }

    // Skip '>'
    X.assertTrue(html.charAt(pos) == '>');
    pos++;

    // Check if it's an element we're keeping (either an HTML4 element, or an
    // unknown element we're preserving). If not, ignore the tag.
    if (element != null) {
      if (isEndTag) {
        addEndTag(element, start, allAttributesStartPos, pos);
      } else {
        // Special case: if it's a STYLE/SCRIPT element, we go to into
        // CDATA state.
        if (HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element)) {
          state = State.IN_CDATA;
        }

        addStartTag(element, start, allAttributesStartPos,
            nextAttributeStartPos,
            pos, isSingleTag, attributes);
      }
    }

    return pos;
  }
795 
796   /**
797    * Lookups the element in our whitelist(s). Whitelists are consulted in
798    * reverse chronological order (starting from the most recently added
799    * whitelist), allowing clients to override the default behavior.
800    *
801    * @param name Element name.
802    * @return Element.
803    */
804   HTML.Element lookupElement(String name) {
805     ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size());
806     while (iter.hasPrevious()) {
807       HTML.Element elem = iter.previous().lookupElement(name);
808       if (elem != null) {
809         return elem;
810       }
811     }
812     return null;
813   }
814 
815   /**
816    * Lookups the attribute in our whitelist(s). Whitelists are consulted in
817    * reverse chronological order (starting from the most recently added
818    * whitelist), allowing clients to override the default behavior.
819    *
820    * @param name Attribute name.
821    * @return Attribute.
822    */
823   HTML.Attribute lookupAttribute(String name) {
824     ListIterator<HtmlWhitelist> iter = whitelists.listIterator(whitelists.size());
825     while (iter.hasPrevious()) {
826       HTML.Attribute attr = iter.previous().lookupAttribute(name);
827       if (attr != null) {
828         return attr;
829       }
830     }
831     return null;
832   }
833 
834   /**
835    * @param element Tag element
836    * @param startPos Start of tag, including '<'
837    * @param startAttributesPos Start of attributes. This is the first
838    * character after the tag name. If there are no attributes, this is the end
839    * of the tag.
840    * @param endAttributesPos First position after last attribute
841    * @param endPos End of tag, including '>' character
842    * @param isSingleTag True iff this is a self-terminating tag
843    * @param attributes Tag attributes
844    */
845   private void addStartTag(HTML.Element element, final int startPos,
846       final int startAttributesPos, final int endAttributesPos,
847       final int endPos, final boolean isSingleTag,
848       ArrayList<HtmlDocument.TagAttribute> attributes) {
849     X.assertTrue(startPos < startAttributesPos);
850     X.assertTrue(startAttributesPos <= endAttributesPos);
851     X.assertTrue(endAttributesPos <= endPos);
852 
853     if (preserveAll) {
854       String beforeAttrs = html.substring(startPos, startAttributesPos);
855       String afterAttrs = html.substring(endAttributesPos, endPos);
856       HtmlDocument.Tag tag = (isSingleTag)
857           ? HtmlDocument.createSelfTerminatingTag(element, attributes,
858               beforeAttrs, afterAttrs)
859           : HtmlDocument.createTag(element, attributes,
860               beforeAttrs, afterAttrs);
861       nodes.add(tag);
862     } else if (preserveValidHtml) {
863       // This is the beginning of the tag up through the tag name. It should not
864       // be possible for this to contain characters needing escaping, but we add
865       // this redundant check to avoid an XSS attack that might get past our
866       // parser but trick a browser into executing a script.
867       X.assertTrue(html.charAt(startPos) == '<');
868       StringBuilder beforeAttrs = new StringBuilder("<");
869       String tagName = html.substring(startPos + 1, startAttributesPos);
870       beforeAttrs.append(CharEscapers.asciiHtmlEscaper().escape(tagName));
871 
872       // Verify end-of-tag characters
873       int endContentPos = endPos - 1;
874       X.assertTrue(html.charAt(endContentPos) == '>');
875       if (isSingleTag) {
876         --endContentPos;
877         X.assertTrue(html.charAt(endContentPos) == '/');
878       }
879       X.assertTrue(endAttributesPos <= endContentPos);
880 
881       // This is any extra characters between the last attribute and the end of
882       // the tag.
883       X.assertTrue(endAttributesPos < endPos);
884       String afterAttrs = html.substring(endAttributesPos, endPos);
885 
886       // Strip all but preceding whitespace.
887       HtmlDocument.Tag tag = (isSingleTag)
888           ? HtmlDocument.createSelfTerminatingTag(element, attributes,
889               beforeAttrs.toString(), afterAttrs)
890           : HtmlDocument.createTag(element, attributes,
891               beforeAttrs.toString(), afterAttrs);
892       nodes.add(tag);
893     } else {
894       // Normalize.
895       HtmlDocument.Tag tag = (isSingleTag)
896           ? HtmlDocument.createSelfTerminatingTag(element, attributes)
897           : HtmlDocument.createTag(element, attributes);
898       nodes.add(tag);
899     }
900   }
901 
902  /**
903    * @param element End tag element.
904    * @param startPos Start of tag, including '<'.
905    * @param startAttributesPos Start of attributes. This is the first
906    * character after the tag name. If there are no attributes, this is the end
907    * of the tag.
908    * @param endPos End of tag. This usually contains the '>' character, but in
909    * the case where browsers force a termination of a malformed tag, it doesn't.
910    */
911   private void addEndTag(HTML.Element element, final int startPos,
912       final int startAttributesPos, final int endPos) {
913     X.assertTrue(element != null);
914     X.assertTrue(html.charAt(startPos) == '<');
915     X.assertTrue(html.charAt(startPos + 1) == '/');
916 
917     if (preserveAll) {
918       // Preserve all: keep actual content even if it's malformed.
919       X.assertTrue(startPos < endPos);
920       String content = html.substring(startPos, endPos);
921       nodes.add(HtmlDocument.createEndTag(element, content));
922     } else if (preserveValidHtml) {
923       // Preserve valid: terminate the tag.
924 
925       StringBuilder validContent = new StringBuilder("</");
926 
927       // This is the beginning of the tag up through the tag name. It should not
928       // be possible for this to contain characters needing escaping, but we add
929       // this redundant check to avoid an XSS attack that might get past our
930       // parser but trick a browser into executing a script.
931       X.assertTrue(startPos < startAttributesPos);
932       String tagName = html.substring(startPos + 2, startAttributesPos);
933       validContent.append(CharEscapers.asciiHtmlEscaper().escape(tagName));
934 
935       // This is the rest of the tag, including any attributes.
936       // See bug 874396 (Buganizer). We don't allow attributes in an end tag.
937       X.assertTrue(startAttributesPos <= endPos);
938       String endOfTag = html.substring(startAttributesPos, endPos);
939       if (endOfTag.charAt(endOfTag.length() - 1) != '>') {
940         endOfTag += '>';
941       }
942 
943       // Strip everything but leading whitespace.
944       validContent.append(endOfTag.replaceAll("\\S+.*>", ">"));
945 
946       nodes.add(HtmlDocument.createEndTag(element, validContent.toString()));
947     } else {
948       // Normalize: ignore the original content.
949       nodes.add(HtmlDocument.createEndTag(element));
950     }
951   }
952 
953   /**
954    * Creates and adds an attribute to the list.
955    *
956    * @param attributes Destination of new attribute.
957    * @param scanner Scanned attribute.
958    * @param startPos start position (inclusive) in original HTML of this
   *        attribute, including preceding separator characters (generally this
960    *        is whitespace, but it might contain other characters). This is the
961    *        end position of the tag name or previous attribute +1.
962    * @param endPos end position (exclusive) in original HTML of this attribute.
963    */
964   private void addAttribute(ArrayList<HtmlDocument.TagAttribute> attributes,
965       AttributeScanner scanner, final int startPos, final int endPos) {
966     X.assertTrue(startPos < endPos);
967 
968     String name = scanner.getName();
969     X.assertTrue(name != null);
970     HTML.Attribute htmlAttribute = lookupAttribute(name);
971 
972     // This can be null when there's no value, e.g., input.checked attribute.
973     String value = scanner.getValue();
974 
975     if (htmlAttribute == null) {
976       // Unknown attribute.
977       if (DEBUG) {
978         debug("Unknown attribute: " + name);
979       }
980       if (preserveAll) {
981         String original = html.substring(startPos, endPos);
982         attributes.add(HtmlDocument.createTagAttribute(
983             lookupUnknownAttribute(name), value, original));
984       }
985     } else {
986       String unescapedValue = (value == null) ? null : StringUtil.unescapeHTML(value);
987       if (preserveAll) {
988         attributes.add(HtmlDocument.createTagAttribute(htmlAttribute,
989             unescapedValue, html.substring(startPos, endPos)));
990       } else if (preserveValidHtml) {
991         StringBuilder original = new StringBuilder();
992 
993         // This includes any separator characters between the tag name or
994         // preceding attribute and this one.
995         // This addresses bugs 870757 and 875303 (Buganizer).
996         // Don't allow non-whitespace separators between attributes.
997         X.assertTrue(startPos <= scanner.startNamePos);
998         String originalPrefix = html.substring(
999             startPos, scanner.startNamePos).replaceAll("\\S+", "");
1000         if (originalPrefix.length() == 0) {
1001           originalPrefix = " ";
1002         }
1003         original.append(originalPrefix);
1004 
1005         if (value == null) {
1006           // This includes the name and any following whitespace. Escape in case
1007           // the name has any quotes or '<' that could confuse a browser.
1008           X.assertTrue(scanner.startNamePos < endPos);
1009           String nameEtc = html.substring(scanner.startNamePos, endPos);
1010           original.append(CharEscapers.asciiHtmlEscaper().escape(nameEtc));
1011         } else {
1012           // Escape name in case the name has any quotes or '<' that could
1013           // confuse a browser.
1014           original.append(CharEscapers.asciiHtmlEscaper().escape(name));
1015 
1016           // This includes the equal sign, and any other whitespace
1017           // between the name and value. It also contains the opening quote
1018           // character if there is one.
1019           X.assertTrue(scanner.endNamePos < scanner.startValuePos);
1020           original.append(html.substring(scanner.endNamePos, scanner.startValuePos));
1021 
1022           // This is the value, excluding any quotes.
1023           if (scanner.attrValueIsQuoted) {
1024             // Officially a '<' can be valid in an attribute value, but to be
1025             // safe we always escape them.
1026             original.append(value.replaceAll("<", "&lt;"));
1027           } else {
1028             // This addresses bug 881426 (Buganizer). Put quotes around any
1029             // dangerous characters, which is what most of the browsers do.
1030             if (NEEDS_QUOTING_ATTRIBUTE_VALUE_REGEX.matcher(value).find()) {
1031               original.append('"');
1032               original.append(value.replaceAll("\"", "&quot;"));
1033               original.append('"');
1034             } else {
1035               original.append(value);
1036             }
1037           }
1038 
1039           // This includes end quote, if applicable.
1040           X.assertTrue(scanner.endValuePos <= endPos);
1041           original.append(html.substring(scanner.endValuePos, endPos));
1042         }
1043 
1044         attributes.add(HtmlDocument.createTagAttribute(
1045             htmlAttribute, unescapedValue, original.toString()));
1046       } else {
1047         attributes.add(HtmlDocument.createTagAttribute(
1048             htmlAttribute, unescapedValue));
1049       }
1050     }
1051   }
1052 
1053   //------------------------------------------------------------------------
1054   // Comment scanning
1055   //------------------------------------------------------------------------
1056   private static final String START_COMMENT = "<!--";
1057   private static final String END_COMMENT = "-->";
1058 
1059   private int scanComment(final int start, final int end) {
1060 
1061     X.assertTrue(html.regionMatches(start, START_COMMENT, 0, START_COMMENT.length()));
1062 
1063     // Scan for end of comment
1064     int pos = html.indexOf(END_COMMENT, start + START_COMMENT.length());
1065     if (pos != -1) {
1066       pos += END_COMMENT.length();
1067     } else {
1068       // Look for '>'. If we can't find that, the rest of the text is comments.
1069       pos = html.indexOf('>', start + 4);
1070       if (pos != -1) {
1071         ++pos;
1072       } else {
1073         pos = end;
1074       }
1075     }
1076 
1077     if (preserveAll) {
1078       nodes.add(HtmlDocument.createHtmlComment(html.substring(start, pos)));
1079     }
1080 
1081     return pos;
1082   }
1083 
1084   //------------------------------------------------------------------------
1085   // CDATA scanning
1086   //------------------------------------------------------------------------
1087   int scanCDATA(final int start, final int end) {
1088 
1089     // Get the tag: must be either STYLE or SCRIPT
1090     HtmlDocument.Tag tag = (HtmlDocument.Tag) nodes.get(nodes.size() - 1);
1091     HTML.Element element = tag.getElement();
1092     X.assertTrue(HTML4.SCRIPT_ELEMENT.equals(element) || HTML4.STYLE_ELEMENT.equals(element));
1093 
1094     int pos;
1095     for (pos = start; pos < end; pos++) {
1096       if (pos + 2 < end &&
1097           html.charAt(pos) == '<' &&
1098           html.charAt(pos + 1) == '/' &&
1099           html.regionMatches(true, pos + 2, element.getName(), 0,
1100                               element.getName().length())) {
1101         break;
1102       }
1103     }
1104 
1105     // Add a CDATA node
1106     if (pos > start) {
1107       HtmlDocument.CDATA cdata =
1108         HtmlDocument.createCDATA(html.substring(start, pos));
1109       nodes.add(cdata);
1110     }
1111 
1112     state = State.IN_TAG;
1113     return pos;
1114   }
1115 
1116   //------------------------------------------------------------------------
1117   public static void main(String[] args) throws IOException {
1118 
1119     DEBUG = true;
1120 
1121     String html = new String(ByteStreams.toByteArray(System.in), "ISO-8859-1");
1122 
1123     HtmlParser parser = new HtmlParser();
1124     HtmlDocument doc = parser.parse(html);
1125     System.out.println(doc.toString());
1126   }
1127 
  // Writes a diagnostic message, followed by a line break, to standard error.
  private static void debug(String str) {
    System.err.println(str);
  }
1131 }