1 /**
2  * Copyright (c) 2004, Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.android.mail.common.html.parser;
18 
19 import com.google.android.mail.common.base.CharEscapers;
20 import com.google.android.mail.common.base.CharMatcher;
21 import com.google.android.mail.common.base.StringUtil;
22 import com.google.android.mail.common.base.X;
23 import com.google.common.collect.Lists;
24 
25 import java.io.PrintWriter;
26 import java.io.StringWriter;
27 import java.util.ArrayList;
28 import java.util.Arrays;
29 import java.util.List;
30 
31 
32 /**
33  * HtmlDocument is a container for a list of html nodes, and represents the
34  * entire html document. It contains toHTML() method which prints out the html
35  * text, toXHTML for printing out XHTML text and toString() which prints out in
36  * debug format.
37  *
38  * @author jlim@google.com (Jing Yee Lim)
39  */
40 public class HtmlDocument {
41   /** List of Node objects */
42   private final List<Node> nodes;
43 
44   /**
45    * Creates a Html document.
46    * @param nodes list of html nodes
47    */
HtmlDocument(List<Node> nodes)48   public HtmlDocument(List<Node> nodes) {
49     this.nodes = nodes;
50   }
51 
52   /** Gets the list of nodes */
getNodes()53   public List<Node> getNodes() {
54     return nodes;
55   }
56 
57   /** Returns a HTML string for the current document */
toHTML()58   public String toHTML() {
59     StringBuilder sb = new StringBuilder(nodes.size() * 10);
60     for (Node n : nodes) {
61       n.toHTML(sb);
62     }
63     return sb.toString();
64   }
65 
66   /** Returns a XHTML string for the current document */
toXHTML()67   public String toXHTML() {
68     StringBuilder sb = new StringBuilder(nodes.size() * 10);
69     for (Node n : nodes) {
70       n.toXHTML(sb);
71     }
72     return sb.toString();
73   }
74 
75   /**
76    * Returns, as much as possible, original content of preparsed nodes.  This
77    * is only different from toHTML() if the nodes were created with original
78    * content, e.g., by HtmlParser in preserve mode.
79    */
toOriginalHTML()80   public String toOriginalHTML() {
81     StringBuilder sb = new StringBuilder(nodes.size() * 10);
82     for (Node n : nodes) {
83       n.toOriginalHTML(sb);
84     }
85     return sb.toString();
86   }
87 
88   /** Returns the HTML document in debug format */
89   @Override
toString()90   public String toString() {
91     StringWriter strWriter = new StringWriter();
92     accept(new DebugPrinter(new PrintWriter(strWriter)));
93     return strWriter.toString();
94   }
95 
96   /**
97    * Creates start Tag Node.
98    * @see HtmlDocument#createTag(HTML.Element, List, String, String)
99    */
createTag(HTML.Element element, List<TagAttribute> attributes)100   public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) {
101     return createTag(element, attributes, null, null);
102   }
103 
104   /**
105    * Creates start Tag Node.
106    * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
107    */
createTag(HTML.Element element, List<TagAttribute> attributes, String originalHtmlBeforeAttributes, String originalHtmlAfterAttributes)108   public static Tag createTag(HTML.Element element,
109       List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
110       String originalHtmlAfterAttributes) {
111     return new Tag(element, attributes, false, originalHtmlBeforeAttributes,
112         originalHtmlAfterAttributes);
113   }
114 
115   /**
116    * Creates self-terminating Tag Node.
117    * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String)
118    */
createSelfTerminatingTag(HTML.Element element, List<TagAttribute> attributes)119   public static Tag createSelfTerminatingTag(HTML.Element element,
120       List<TagAttribute> attributes) {
121     return createSelfTerminatingTag(element, attributes, null, null);
122   }
123 
124   /**
125    * Creates self-terminating Tag Node.
126    * @see HtmlDocument#createTag(HTML.Element, List, String, String)
127    */
createSelfTerminatingTag(HTML.Element element, List<TagAttribute> attributes, String originalHtmlBeforeAttributes, String originalHtmlAfterAttributes)128   public static Tag createSelfTerminatingTag(HTML.Element element,
129       List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
130       String originalHtmlAfterAttributes) {
131     return new Tag(element, attributes, true, originalHtmlBeforeAttributes,
132         originalHtmlAfterAttributes);
133   }
134 
135   /**
136    * @see HtmlDocument#createEndTag(HTML.Element, String)
137    */
createEndTag(HTML.Element element)138   public static EndTag createEndTag(HTML.Element element) {
139     return createEndTag(element, null);
140   }
141 
142   /**
143    * @see HtmlDocument.EndTag#EndTag(HTML.Element, String)
144    */
createEndTag(HTML.Element element, String originalHtml)145   public static EndTag createEndTag(HTML.Element element, String originalHtml) {
146     return new EndTag(element, originalHtml);
147   }
148 
149   /**
150    * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String)
151    */
createTagAttribute(HTML.Attribute attr, String value)152   public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) {
153     return createTagAttribute(attr, value, null);
154   }
155 
156   /**
157    * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String)
158    */
createTagAttribute(HTML.Attribute attr, String value, String originalHtml)159   public static TagAttribute createTagAttribute(HTML.Attribute attr,
160       String value, String originalHtml) {
161     X.assertTrue(attr != null);
162     return new TagAttribute(attr, value, originalHtml);
163   }
164 
165   /**
166    * @see HtmlDocument#createText(String, String)
167    */
createText(String text)168   public static Text createText(String text) {
169     return createText(text, null);
170   }
171 
172   /**
173    * Creates a Text node.
174    * @see UnescapedText#UnescapedText(String, String)
175    */
createText(String text, String original)176   public static Text createText(String text, String original) {
177     return new UnescapedText(text, original);
178   }
179 
180   /**
181    * Creates a Text node where the content hasn't been unescaped yet (this will
182    * be done lazily).
183    */
createEscapedText(String htmlText, String original)184   public static Text createEscapedText(String htmlText, String original) {
185     return new EscapedText(htmlText, original);
186   }
187 
188   /**
189    * Creates an Comment node.
190    * @see Comment#Comment(String)
191    */
createHtmlComment(String content)192   public static Comment createHtmlComment(String content) {
193     return new Comment(content);
194   }
195 
196   /**
197    * Creates a CDATA node.
198    * @see CDATA#CDATA(String)
199    */
createCDATA(String text)200   public static CDATA createCDATA(String text) {
201     return new CDATA(text);
202   }
203 
204   /** Accepts a Visitor */
accept(Visitor v)205   public void accept(Visitor v) {
206     v.start();
207     for (Node node : nodes) {
208       node.accept(v);
209     }
210     v.finish();
211   }
212 
213   /**
214    * @param filter results of this filter replace the existing nodes
215    * @return new document with filtered nodes
216    */
filter(MultiplexFilter filter)217   public HtmlDocument filter(MultiplexFilter filter) {
218     filter.start();
219     List<Node> newNodes = new ArrayList<Node>();
220     for (Node node : nodes) {
221       filter.filter(node, newNodes);
222     }
223     filter.finish(newNodes);
224     return new HtmlDocument(newNodes);
225   }
226 
227   /**
228    * Html node
229    */
230   public static abstract class Node {
231 
232     /** Accepts a visitor */
accept(Visitor visitor)233     public abstract void accept(Visitor visitor);
234 
235     /** Converts to HTML */
toHTML()236     public String toHTML() {
237       StringBuilder sb = new StringBuilder();
238       toHTML(sb);
239       return sb.toString();
240     }
241 
242     /** Converts to HTML */
toHTML(StringBuilder sb)243     public abstract void toHTML(StringBuilder sb);
244 
245     /** Converts to XHTML */
toXHTML()246     public String toXHTML() {
247       StringBuilder sb = new StringBuilder();
248       toXHTML(sb);
249       return sb.toString();
250     }
251 
252     /** Converts to XHTML */
toXHTML(StringBuilder sb)253     public abstract void toXHTML(StringBuilder sb);
254 
255     /**
256      * @return Original if it's available; otherwise, returns
257      * <code>toHTML()</code>
258      */
toOriginalHTML()259     public String toOriginalHTML() {
260       StringBuilder sb = new StringBuilder();
261       toOriginalHTML(sb);
262       return sb.toString();
263     }
264 
265     /**
266      * @param sb Destination of HTML to be appended.  Appends original if it's
267      * available; otherwise, appends <code>toHTML()</code>
268      */
toOriginalHTML(StringBuilder sb)269     public abstract void toOriginalHTML(StringBuilder sb);
270   }
271 
272   /**
273    * HTML comment node.
274    */
275   public static class Comment extends Node {
276 
277     private final String content;
278 
279     /**
280      * @param content Raw comment, including "&lt;!--" and "--&gt;".
281      */
Comment(String content)282     public Comment(String content) {
283       this.content = content;
284     }
285 
286     @Override
accept(Visitor visitor)287     public void accept(Visitor visitor) {
288       visitor.visitComment(this);
289     }
290 
291     /**
292      * Emit original unchanged.
293      * @param sb Destination of result.
294      */
295     @Override
toHTML(StringBuilder sb)296     public void toHTML(StringBuilder sb) {
297       sb.append(content);
298     }
299 
300     /**
301      * Emit original unchanged.
302      * @param sb Destination of result.
303      */
304     @Override
toXHTML(StringBuilder sb)305     public void toXHTML(StringBuilder sb) {
306       sb.append(content);
307     }
308 
309     /**
310      * Emit original unchanged.
311      * @param sb Destination of result.
312      */
313     @Override
toOriginalHTML(StringBuilder sb)314     public void toOriginalHTML(StringBuilder sb) {
315       sb.append(content);
316     }
317 
318     /**
319      * @return Original unchanged.
320      */
getContent()321     public String getContent() {
322       return content;
323     }
324   }
325 
326   /**
327    * Text node
328    */
329   public static abstract class Text extends Node {
330 
331     /**
332      * unaltered original content of this node
333      */
334     private final String originalHtml;
335 
336     /**
337      * content of this node in HTML format
338      */
339     private String html;
340 
341     /**
342      * @param originalHtml Unaltered original HTML. If not null,
343      *        toOriginalHTML() will return this.
344      */
Text(String originalHtml)345     protected Text(String originalHtml) {
346       this.originalHtml = originalHtml;
347     }
348 
349     /**
350      * Gets the plain, unescaped text.
351      */
getText()352     abstract public String getText();
353 
354     // Returns true if it contains only white space
isWhitespace()355     public boolean isWhitespace() {
356       String text = getText();
357       int len = text.length();
358       for (int i = 0; i < len; i++) {
359         if (!Character.isWhitespace(text.charAt(i))) {
360           return false;
361         }
362       }
363       return true;
364     }
365 
366     @Override
equals(Object o)367     public boolean equals(Object o) {
368       if (o == this) {
369         return true;
370       }
371       if (o instanceof Text) {
372         Text that = (Text) o;
373 
374         return this.originalHtml == null ? that.originalHtml == null
375             : this.originalHtml.equals(that.originalHtml);
376       }
377       return false;
378     }
379 
380     @Override
hashCode()381     public int hashCode() {
382       return originalHtml == null ? 0 : originalHtml.hashCode();
383     }
384 
385     @Override
toString()386     public String toString() {
387       return getText();
388     }
389 
390     /** Extends Node.accept */
391     @Override
accept(Visitor visitor)392     public void accept(Visitor visitor) {
393       visitor.visitText(this);
394     }
395 
396     /**
397      * Gets the HTML, with HTML entities escaped.
398      */
399     @Override
toHTML(StringBuilder sb)400     public void toHTML(StringBuilder sb) {
401       if (html == null) {
402         html = CharEscapers.asciiHtmlEscaper().escape(getText());
403       }
404       sb.append(html);
405     }
406 
407     /**
408      * @see HtmlDocument.Text#toHTML(StringBuilder)
409      */
410     @Override
toXHTML(StringBuilder sb)411     public void toXHTML(StringBuilder sb) {
412       toHTML(sb);
413     }
414 
415     /**
416      * @param sb Appends original HTML to this if available.  Otherwise,
417      * same as toHTML().
418      */
419     @Override
toOriginalHTML(StringBuilder sb)420     public void toOriginalHTML(StringBuilder sb) {
421       if (originalHtml != null) {
422         sb.append(originalHtml);
423       } else {
424         toHTML(sb);
425       }
426     }
427 
428     /**
429      * @return the original HTML (possibly with entities unescaped if the
430      * document was malformed). May be null if original HTML was not preserved
431      * (see constructor argument of {@link HtmlParser})
432      */
getOriginalHTML()433     public String getOriginalHTML() {
434       return originalHtml;
435     }
436   }
437 
438   /**
439    * {@link Text} implementation where the given text is assumed to have been
440    * already HTML unescaped.
441    */
442   private static class UnescapedText extends Text {
443     /**
444      * content of this node as plain, unescaped text
445      */
446     protected final String text;
447 
UnescapedText(String plainText, String originalHtml)448     private UnescapedText(String plainText, String originalHtml) {
449       super(originalHtml);
450       X.assertTrue(plainText != null);
451       this.text = plainText;
452     }
453 
getText()454     @Override public String getText() {
455       return text;
456     }
457   }
458 
459   /**
460    * {@link Text} implementation where the given text is not unescaped yet, and
461    * unescaping will only be done lazily.
462    */
463   private static class EscapedText extends Text {
464     private final String htmlText;
465     private String text;
466 
EscapedText(String htmlText, String originalHtml)467     private EscapedText(String htmlText, String originalHtml) {
468       super(originalHtml);
469       this.htmlText = htmlText;
470     }
471 
getText()472     @Override public String getText() {
473       if (text == null) {
474         text = StringUtil.unescapeHTML(htmlText);
475       }
476       return text;
477     }
478   }
479 
480   /**
481    * CDATA node is a subclass of Text node.
482    */
483   public static class CDATA extends UnescapedText {
CDATA(String text)484     private CDATA(String text) {
485       super(text, text);
486     }
487 
toHTML(StringBuilder sb)488     @Override public void toHTML(StringBuilder sb) {
489       // Do not htmlescape CDATA text
490       sb.append(text);
491     }
492 
toXHTML(StringBuilder sb)493     @Override public void toXHTML(StringBuilder sb) {
494       sb.append("<![CDATA[")
495         .append(text)
496         .append("]]>");
497     }
498   }
499 
500   /**
501    * Tag is a HTML open tag.
502    */
503   public static class Tag extends Node {
504     // The element
505     private final HTML.Element element;
506 
507     // List of TagAttribute objects. This may be null.
508     private List<TagAttribute> attributes;
509 
510     private final boolean isSelfTerminating;
511 
512     private final String originalHtmlBeforeAttributes;
513 
514     private final String originalHtmlAfterAttributes;
515 
516     /**
517      * @param element the HTML4 element
518      * @param attributes list of TagAttribute objects, may be null
519      * @param isSelfTerminating
520      * @param originalHtmlBeforeAttributes Original tag's full content before
521      *        first attribute, including beginning '&lt;'. This should not
522      *        include preceeding whitespace for the first attribute, as that
523      *        should be included in the attribute node. If not null, tag will
524      *        preserve this original content. e.g., if original tag were
525      *        "&lt;foO bar='zbc'&gt;", case of foO would be preserved. This
526      *        method does not validate that
527      *        <code>originalHtmlBeforeAttributes</code> is a valid tag String.
528      * @param originalHtmlAfterAttributes Full content of original tag after
529      *        last attribute, including ending '>'. If not null, tag will
530      *        preserve this original content. e.g., if original tag were
531      *        "&lt;foo bar='zbc'  &gt;", the spaces before '&gt;' be preserved.
532      *        This method does not validate that
533      *        <code>originalHtmlAfterAttributes</code> is a valid tag String.
534      */
Tag(HTML.Element element, List<TagAttribute> attributes, boolean isSelfTerminating, String originalHtmlBeforeAttributes, String originalHtmlAfterAttributes)535     private Tag(HTML.Element element, List<TagAttribute> attributes,
536         boolean isSelfTerminating, String originalHtmlBeforeAttributes,
537         String originalHtmlAfterAttributes) {
538       X.assertTrue(element != null);
539       this.element = element;
540       this.attributes = attributes;
541       this.isSelfTerminating = isSelfTerminating;
542       this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes;
543       this.originalHtmlAfterAttributes = originalHtmlAfterAttributes;
544     }
545 
546     /** Gets the name */
getName()547     public String getName() {
548       return element.getName();
549     }
550 
551     /** Gets the element */
getElement()552     public HTML.Element getElement() {
553       return element;
554     }
555 
556     /** Adds an attribute */
addAttribute(HTML.Attribute attr, String value)557     public void addAttribute(HTML.Attribute attr, String value) {
558       X.assertTrue(attr != null);
559       addAttribute(new TagAttribute(attr, value, null));
560     }
561 
562     /** Adds an attribute */
addAttribute(TagAttribute attr)563     public void addAttribute(TagAttribute attr) {
564       X.assertTrue(attr != null);
565       if (attributes == null) {
566         attributes = new ArrayList<TagAttribute>();
567       }
568       attributes.add(attr);
569     }
570 
571     /** Gets the list of attributes, note that this maybe null. */
getAttributes()572     public List<TagAttribute> getAttributes() {
573       return attributes;
574     }
575 
576     /** Finds and returns a TagAttribute, or null if not found */
getAttribute(HTML.Attribute attr)577     public TagAttribute getAttribute(HTML.Attribute attr) {
578       if (attributes != null) {
579         for (TagAttribute attribute : attributes) {
580           if (attribute.getAttribute().equals(attr)) {
581             return attribute;
582           }
583         }
584       }
585       return null;
586     }
587 
588     /**
589      * Finds and returns list of TagAttribute of given attribute
590      * type, or empty list if not found,
591      */
getAttributes(HTML.Attribute attr)592     public List<TagAttribute> getAttributes(HTML.Attribute attr) {
593       List<TagAttribute> result = Lists.newArrayList();
594       if (attributes != null) {
595         for (TagAttribute attribute : attributes) {
596           if (attribute.getAttribute().equals(attr)) {
597             result.add(attribute);
598           }
599         }
600       }
601       return result;
602     }
603 
604     /** Returns debug string */
605     @Override
toString()606     public String toString() {
607       StringBuilder sb = new StringBuilder();
608       sb.append("Start Tag: ");
609       sb.append(element.getName());
610       if (attributes != null) {
611         for (TagAttribute attr : attributes) {
612           sb.append(' ');
613           sb.append(attr.toString());
614         }
615       }
616       return sb.toString();
617     }
618 
619     /** Implements Node.accept */
620     @Override
accept(Visitor visitor)621     public void accept(Visitor visitor) {
622       visitor.visitTag(this);
623     }
624 
625     /** Implements Node.toHTML */
626     @Override
toHTML(StringBuilder sb)627     public void toHTML(StringBuilder sb) {
628       serialize(sb, SerializeType.HTML);
629     }
630 
631     @Override
toXHTML(StringBuilder sb)632     public void toXHTML(StringBuilder sb) {
633       serialize(sb, SerializeType.XHTML);
634     }
635 
636     @Override
toOriginalHTML(StringBuilder sb)637     public void toOriginalHTML(StringBuilder sb) {
638       serialize(sb, SerializeType.ORIGINAL_HTML);
639     }
640 
641     /**
642      * Specifies format of serialized output.
643      */
644     private enum SerializeType {
645       ORIGINAL_HTML, HTML, XHTML
646     }
647 
serialize(StringBuilder sb, SerializeType type)648     private void serialize(StringBuilder sb, SerializeType type) {
649       // before attributes
650       if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) {
651         sb.append(originalHtmlBeforeAttributes);
652       } else {
653         sb.append('<');
654         sb.append(element.getName());
655       }
656 
657       // attributes
658       if (attributes != null) {
659         for (TagAttribute attr : attributes) {
660           // attribute includes leading whitespace, so we needn't add it here
661           if (type == SerializeType.ORIGINAL_HTML) {
662             attr.toOriginalHTML(sb);
663           } else if (type == SerializeType.HTML) {
664             attr.toHTML(sb);
665           } else {
666             attr.toXHTML(sb);
667           }
668         }
669       }
670 
671       // after attributes
672       if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) {
673         sb.append(originalHtmlAfterAttributes);
674       } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) {
675         sb.append(" />");
676       } else {
677         sb.append('>');
678       }
679     }
680 
isSelfTerminating()681     public boolean isSelfTerminating() {
682       return isSelfTerminating;
683     }
684 
getOriginalHtmlBeforeAttributes()685     public String getOriginalHtmlBeforeAttributes() {
686       return originalHtmlBeforeAttributes;
687     }
688 
getOriginalHtmlAfterAttributes()689     public String getOriginalHtmlAfterAttributes() {
690       return originalHtmlAfterAttributes;
691     }
692   }
693 
694   /**
695    * EndTag is a closing HTML tag.
696    */
697   public static class EndTag extends Node {
698     // The element
699     private final HTML.Element element;
700 
701     private final String originalHtml;
702 
703     /**
704      * @param element The HTML.Element element.  Can not be null.
705      * @param originalHtml Full content of original tag, including beginning
706      * and ending '<' and '>'.  If not null, tag will preserve this original
707      * content. e.g., if original tag were "&lt;/foo &gt;", the space after foo
708      * would be preserved.  This method does not validate that originalHtml is a
709      * valid tag String.
710      */
EndTag(HTML.Element element, String originalHtml)711     private EndTag(HTML.Element element, String originalHtml) {
712       X.assertTrue(element != null);
713       this.element = element;
714       this.originalHtml = originalHtml;
715     }
716 
717     /** Gets the name */
getName()718     public String getName() {
719       return element.getName();
720     }
721 
722     /** Gets the element */
getElement()723     public HTML.Element getElement() {
724       return element;
725     }
726 
727     /** Returns debug string */
728     @Override
toString()729     public String toString() {
730       return "End Tag: " + element.getName();
731     }
732 
733     /** Implements Node.accept */
734     @Override
accept(Visitor visitor)735     public void accept(Visitor visitor) {
736       visitor.visitEndTag(this);
737     }
738 
739     /** Implements Node.toHTML */
740     @Override
toHTML(StringBuilder sb)741     public void toHTML(StringBuilder sb) {
742       sb.append("</");
743       sb.append(element.getName());
744       sb.append('>');
745     }
746 
747     @Override
toXHTML(StringBuilder sb)748     public void toXHTML(StringBuilder sb) {
749       toHTML(sb);
750     }
751 
752     @Override
toOriginalHTML(StringBuilder sb)753     public void toOriginalHTML(StringBuilder sb) {
754       if (originalHtml != null) {
755         sb.append(originalHtml);
756       } else {
757         toHTML(sb);
758       }
759     }
760   }
761 
762   /**
763    * TagAttribute represents an attribute in a HTML tag.
764    */
765   public static class TagAttribute {
766     private final HTML.Attribute attribute;
767     private String value;
768     private String originalHtml;
769 
770     /**
771      * @param attribute the HTML.Attribute. Can't be null.
772      * @param value The value in plain-text format. This can be null if the
773      *        attribute has no value.
774      * @param originalHtml If not null, toOriginalHTML() will preserve original
775      *        content. This should contain any leading whitespace from the
776      *        original.
777      */
TagAttribute(HTML.Attribute attribute, String value, String originalHtml)778     private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) {
779       X.assertTrue(attribute != null);
780       this.attribute = attribute;
781       this.value = value;
782       this.originalHtml = originalHtml;
783     }
784 
785     /** Gets the name */
getName()786     public String getName() {
787       return attribute.getName();
788     }
789 
790     /** Gets the HTML.Attribute information */
getAttribute()791     public HTML.Attribute getAttribute() {
792       return attribute;
793     }
794 
795     /**
796      * Sets the attribute value.
797      * This value must be in plain-text, not html-escaped.
798      * This can be null, if the attribute has no values.
799      * This clears <code>originalHtml_</code> if it were set, so
800      * <code>toOriginalHTML()</code> might not preserve original any more.
801      */
setValue(String value)802     public void setValue(String value) {
803       this.value = value;
804       originalHtml = null;
805     }
806 
807     /** Returns the attribute value in plain-text, never null */
getValue()808     public String getValue() {
809       return value != null ? value : "";
810     }
811 
812     /** Returns true if the attribute value is not empty */
hasValue()813     public boolean hasValue() {
814       return value != null;
815     }
816 
817     /**
818      * Writes out the attribute in HTML format with all necessary preceding
819      * whitespace. Emits originalHtml_ if it were specified to the constructor.
820      * Otherwise, emits a new name="value" string with a single preceding space.
821      */
toHTML(StringBuilder sb)822     public void toHTML(StringBuilder sb) {
823       sb.append(' ');
824       sb.append(attribute.getName());
825       if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) {
826         sb.append("=\"");
827         sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
828         sb.append("\"");
829       }
830     }
831 
832     /** Returns the attribute html string */
toHTML()833     public String toHTML() {
834       StringBuilder sb = new StringBuilder();
835       toHTML(sb);
836       return sb.toString();
837     }
838 
839     /**
840      * Writes out the attribute in XHTML format (value is always appended,
841      * even if it is empty) with all necessary preceeding whitespace.
842      */
toXHTML(StringBuilder sb)843     public void toXHTML(StringBuilder sb) {
844       sb.append(' ');
845       sb.append(attribute.getName()).append("=\"");
846 
847       // Assume that value-less attribute are boolean attributes like "disabled"
848       if (hasValue()) {
849         sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
850       } else {
851         sb.append(attribute.getName());
852       }
853 
854       sb.append("\"");
855     }
856 
857     /** Returns the attribute XHTML string */
toXHTML()858     public String toXHTML() {
859       StringBuilder sb = new StringBuilder();
860       toXHTML(sb);
861       return sb.toString();
862     }
863 
864     /**
865      * @param sb Destination to which attribute is written, in its original
866      * preparsed form if possible.
867      */
toOriginalHTML(StringBuilder sb)868     public void toOriginalHTML(StringBuilder sb) {
869       if (originalHtml != null) {
870         sb.append(originalHtml);
871       } else {
872         toHTML(sb);
873       }
874     }
875 
876     /**
877      * Writes out the attribute in its original form as it was parsed..
878      */
toOriginalHTML()879     public String toOriginalHTML() {
880       StringBuilder sb = new StringBuilder();
881       toOriginalHTML(sb);
882       return sb.toString();
883     }
884 
885     @Override
toString()886     public String toString() {
887       return "{" + attribute.getName() + "=" + value + "}";
888     }
889   }
890 
891   /**
892    * Filter is like Visitor, except it implies that the nodes may be changed,
893    * whereas HtmlDocument.Visitor just implies that the nodes are iterated
894    * over. A Filter can behave just like a Visitor if it merely returns the
895    * same node that it visited. Also, methods may be called on a node to change
896    * the values it contains. Alternatively, a new node entirely can be created
897    * and returned, which will essentially replace the previous node with the
898    * new node in the document tree. A node may be removed by returning null
899    * instead of a node.
900    */
901   public static interface Filter {
902     /** This is called first */
start()903     void start();
904 
905     /** A text node */
visitText(Text n)906     Text visitText(Text n);
907 
908     /** An open tag */
visitTag(Tag n)909     Tag visitTag(Tag n);
910 
911     /** End tag */
visitEndTag(EndTag n)912     EndTag visitEndTag(EndTag n);
913 
914     /** HTML comment */
visitComment(Comment n)915     Comment visitComment(Comment n);
916 
917     /* Called at the end. */
finish()918     void finish();
919   }
920 
921   /**
922    * Like Filter, except each node may be replaced by multiple nodes.  Also,
923    * does not do double dispatch accept/visit.
924    */
925   public static interface MultiplexFilter {
926     /**
927      * Called first.
928      */
start()929     void start();
930 
931     /**
932      * @param originalNode node to filter
933      * @param out Destination to which this object appends nodes to replace
934      * originalNode.  Can not be null.
935      */
filter(Node originalNode, List<Node> out)936     void filter(Node originalNode, List<Node> out);
937 
938     /**
939      * Called at the end.
940      * @param out Destination to which this object appends nodes at the end of
941      * the document.  Can not be null.
942      */
finish(List<Node> out)943     void finish(List<Node> out);
944   }
945 
946   /**
947    * Converts a normal {@link Filter} into a {@link MultiplexFilter}.
948    */
949   public static class MultiplexFilterAdapter implements MultiplexFilter {
950 
951     private final Filter filter;
952 
MultiplexFilterAdapter(Filter filter)953     public MultiplexFilterAdapter(Filter filter) {
954       this.filter = filter;
955     }
956 
start()957     public void start() {
958       filter.start();
959     }
960 
filter(Node originalNode, List<Node> out)961     public void filter(Node originalNode, List<Node> out) {
962       if (originalNode == null) {
963         return;
964       }
965 
966       Node resultNode;
967       if (originalNode instanceof Tag) {
968         resultNode = filter.visitTag((Tag) originalNode);
969       } else if (originalNode instanceof Text) {
970         resultNode = filter.visitText((Text) originalNode);
971       } else if (originalNode instanceof EndTag) {
972         resultNode = filter.visitEndTag((EndTag) originalNode);
973       } else if (originalNode instanceof Comment) {
974         resultNode = filter.visitComment((Comment) originalNode);
975       } else {
976         throw new IllegalArgumentException("unknown node type: " + originalNode.getClass());
977       }
978 
979       if (resultNode != null) {
980         out.add(resultNode);
981       }
982     }
983 
finish(List<Node> out)984     public void finish(List<Node> out) {
985       filter.finish();
986     }
987   }
988 
989   /**
990    * Like Filter, except each node may be replaced by multiple nodes.  Also,
991    * does not do double dispatch accept/visit.  Dispatches filterNode() to
992    * node-specific methods.
993    */
994   public static abstract class SimpleMultiplexFilter implements MultiplexFilter {
995 
996     /**
997      * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
998      */
filter(Node originalNode, List<Node> out)999     public void filter(Node originalNode, List<Node> out) {
1000       if (originalNode == null) {
1001         return;
1002       }
1003 
1004       if (originalNode instanceof Tag) {
1005         filterTag((Tag) originalNode, out);
1006       } else if (originalNode instanceof Text) {
1007         filterText((Text) originalNode, out);
1008       } else if (originalNode instanceof EndTag) {
1009         filterEndTag((EndTag) originalNode, out);
1010       } else if (originalNode instanceof Comment) {
1011         filterComment((Comment) originalNode, out);
1012       } else {
1013         throw new IllegalArgumentException("unknown node type: "
1014             + originalNode.getClass());
1015       }
1016     }
1017 
filterTag(Tag originalTag, List<Node> out)1018     public abstract void filterTag(Tag originalTag, List<Node> out);
1019 
filterText(Text originalText, List<Node> out)1020     public abstract void filterText(Text originalText, List<Node> out);
1021 
filterEndTag(EndTag originalEndTag, List<Node> out)1022     public abstract void filterEndTag(EndTag originalEndTag, List<Node> out);
1023 
filterComment(Comment originalComment, List<Node> out)1024     public void filterComment(Comment originalComment, List<Node> out) {
1025     }
1026   }
1027 
1028   /**
1029    * Contains a list of filters which are applied, in order, to each Node.  The
1030    * output of each becomes the input to the next.  As soon as one returns an
1031    * empty list it breaks the chain.
1032    */
1033   public static class MultiplexFilterChain implements MultiplexFilter {
1034 
1035     private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>();
1036 
1037     /**
1038      * @param sourceFilters these filters are applied in List order
1039      */
MultiplexFilterChain(List<MultiplexFilter> sourceFilters)1040     public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) {
1041       filters.addAll(sourceFilters);
1042     }
1043 
1044     /**
1045      * @see HtmlDocument.MultiplexFilter#start()
1046      */
start()1047     public void start() {
1048       for (MultiplexFilter filter : filters) {
1049         filter.start();
1050       }
1051     }
1052 
1053     /**
1054      * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
1055      */
filter(Node originalNode, List<Node> out)1056     public void filter(Node originalNode, List<Node> out) {
1057       List<Node> result = new ArrayList<Node>();
1058       result.add(originalNode);
1059 
1060       // loop through filters until one returns nothing, or until we're out of
1061       // filters
1062       for (MultiplexFilter filter : filters) {
1063         if (result.isEmpty()) {
1064           return;
1065         }
1066 
1067         // apply filter to each node and collect results
1068         List<Node> newResult = new ArrayList<Node>();
1069         for (Node node : result) {
1070           filter.filter(node, newResult);
1071         }
1072         result = newResult;
1073       }
1074 
1075       out.addAll(result);
1076     }
1077 
1078     /**
1079      * @see HtmlDocument.MultiplexFilter#finish(List)
1080      */
finish(List<Node> out)1081     public void finish(List<Node> out) {
1082       List<Node> result = new ArrayList<Node>();
1083 
1084       // loop through filters until one returns nothing, or until we're out of
1085       // filters
1086       for (MultiplexFilter filter : filters) {
1087         // apply filter to each node and collect results
1088         List<Node> newResult = new ArrayList<Node>();
1089         for (Node node : result) {
1090           filter.filter(node, newResult);
1091         }
1092         filter.finish(newResult);
1093         result = newResult;
1094       }
1095 
1096       out.addAll(result);
1097     }
1098   }
1099 
1100   /**
1101    * Html visitor allows external code to iterate through the nodes in the
1102    * document. See HtmlDocument.accept.
1103    */
1104   public static interface Visitor {
1105     /** This is called first */
start()1106     void start();
1107 
1108     /** A text node */
visitText(Text n)1109     void visitText(Text n);
1110 
1111     /** An open tag */
visitTag(Tag n)1112     void visitTag(Tag n);
1113 
1114     /** End tag */
visitEndTag(EndTag n)1115     void visitEndTag(EndTag n);
1116 
1117     /** comment */
visitComment(Comment n)1118     void visitComment(Comment n);
1119 
1120     /* Called at the end. */
finish()1121     void finish();
1122   }
1123 
1124   /**
1125    * An implementation of the Visitor interface which simply delegates its
1126    * methods to a wrapped instance of another Visitor.
1127    *
1128    * <p>This is useful for chaining Visitors together.
1129    */
1130   public static class VisitorWrapper implements Visitor {
1131     private final Visitor wrapped;
1132 
VisitorWrapper(Visitor wrap)1133     protected VisitorWrapper(Visitor wrap) {
1134       wrapped = wrap;
1135     }
1136 
start()1137     public void start() {
1138       wrapped.start();
1139     }
1140 
visitText(Text n)1141     public void visitText(Text n) {
1142       wrapped.visitText(n);
1143     }
1144 
visitTag(Tag n)1145     public void visitTag(Tag n) {
1146       wrapped.visitTag(n);
1147     }
1148 
visitEndTag(EndTag n)1149     public void visitEndTag(EndTag n) {
1150       wrapped.visitEndTag(n);
1151     }
1152 
visitComment(Comment n)1153     public void visitComment(Comment n) {
1154       wrapped.visitComment(n);
1155     }
1156 
finish()1157     public void finish() {
1158       wrapped.finish();
1159     }
1160   }
1161 
1162   /**
1163    * A special helper Visitor that builds a HtmlDocument.
1164    */
1165   public static class Builder implements Visitor {
1166     private final boolean preserveComments;
1167     private final List<Node> nodes = new ArrayList<Node>();
1168     private HtmlDocument doc;
1169 
1170     /**
1171      * @see Builder#Builder(boolean)
1172      */
Builder()1173     public Builder() {
1174       this(false);
1175     }
1176 
1177     /**
1178      * @param preserveComments If false, ignores Comment nodes
1179      */
Builder(boolean preserveComments)1180     public Builder(boolean preserveComments) {
1181       this.preserveComments = preserveComments;
1182     }
1183 
addNode(Node node)1184     public void addNode(Node node) {
1185       nodes.add(node);
1186     }
start()1187     public void start() {
1188     }
visitText(Text t)1189     public void visitText(Text t) {
1190       addNode(t);
1191     }
visitTag(Tag t)1192     public void visitTag(Tag t) {
1193       addNode(t);
1194     }
visitComment(Comment n)1195     public void visitComment(Comment n) {
1196       if (preserveComments) {
1197         addNode(n);
1198       }
1199     }
visitEndTag(EndTag t)1200     public void visitEndTag(EndTag t) {
1201       addNode(t);
1202     }
finish()1203     public void finish() {
1204       doc = new HtmlDocument(nodes);
1205     }
1206 
1207     /** Gets the html document that has been constructed */
getDocument()1208     public HtmlDocument getDocument() {
1209       return doc;
1210     }
1211   }
1212 
1213   /**
1214    * A Visitor that prints out the html document in debug format.
1215    */
1216   public static class DebugPrinter implements Visitor {
1217 
1218     private final PrintWriter writer;
1219 
DebugPrinter(PrintWriter writer)1220     public DebugPrinter(PrintWriter writer) {
1221       this.writer = writer;
1222     }
1223 
start()1224     public void start() {
1225     }
1226 
visitText(Text t)1227     public void visitText(Text t) {
1228       writeCollapsed("TEXT", t.getText());
1229     }
1230 
visitComment(Comment n)1231     public void visitComment(Comment n) {
1232       writeCollapsed("COMMENT", n.getContent());
1233     }
1234 
writeCollapsed(String type, String s)1235     private void writeCollapsed(String type, String s) {
1236       writer.print(type);
1237       writer.print(": ");
1238       String noNewlines = s.replace("\n", " ");
1239       // Use CharMatcher#WHITESPACE?
1240       String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' ');
1241       writer.print(collapsed);
1242     }
1243 
visitTag(Tag tag)1244     public void visitTag(Tag tag) {
1245       writer.print("==<" + tag.getName() + ">");
1246       List<TagAttribute> attributes = tag.getAttributes();
1247       if (attributes != null) {
1248 
1249         // Attribute values
1250         List<String> attrs = new ArrayList<String>();
1251         for (TagAttribute a : attributes) {
1252           attrs.add("[" + a.getName() + " : " + a.getValue() + "]");
1253         }
1254         String[] array = attrs.toArray(new String[attrs.size()]);
1255 
1256         // Sort the attributes so that it's easier to read and compare
1257         Arrays.sort(array);
1258         for (int i = 0; i < array.length; i++) {
1259           writer.print(" " + array[i]);
1260         }
1261       }
1262       writer.println();
1263     }
1264 
visitEndTag(EndTag endtag)1265     public void visitEndTag(EndTag endtag) {
1266       writer.println("==</" + endtag.getName() + ">");
1267     }
1268 
finish()1269     public void finish() {
1270     }
1271   }
1272 
1273 }