/**
 * Copyright (c) 2004, Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 package com.google.android.mail.common.html.parser;
17 
18 import com.google.android.mail.common.base.X;
19 import com.google.android.mail.common.html.parser.HtmlDocument.EndTag;
20 import com.google.common.io.ByteStreams;
21 
22 import java.io.IOException;
23 import java.util.ArrayList;
24 import java.util.List;
25 import java.util.logging.Level;
26 import java.util.logging.Logger;
27 
28 /**
29  * HtmlTreeBuilder builds a well-formed HtmlTree.
30  *
31  * @see HtmlTree
32  * @author jlim@google.com (Jing Yee Lim)
33  */
34 public class HtmlTreeBuilder implements HtmlDocument.Visitor {
35 
36   private static final Logger logger = Logger.getLogger(HtmlTreeBuilder.class.getName());
37 
38   /** Stack contains HTML4.Element objects to keep track of unclosed tags */
39   private final List<HTML.Element> stack = new ArrayList<HTML.Element>();
40   private final TableFixer tableFixer = new TableFixer();
41   private HtmlTree tree;
42   private boolean built = false;
43 
44   /** Gets the built html tree */
getTree()45   public HtmlTree getTree() {
46     X.assertTrue(built);
47     return tree;
48   }
49 
50   /** Implements HtmlDocument.Visitor.start */
start()51   public void start() {
52     tree = new HtmlTree();
53     tree.start();
54   }
55 
56   /** Implements HtmlDocument.Visitor.finish */
finish()57   public void finish() {
58     // Close all tags
59     while (stack.size() > 0) {
60       addMissingEndTag();
61     }
62     tableFixer.finish();
63     tree.finish();
64 
65     built = true;
66   }
67 
68   /** Implements HtmlDocument.Visitor.visitTag */
visitTag(HtmlDocument.Tag t)69   public void visitTag(HtmlDocument.Tag t) {
70     tableFixer.seeTag(t);
71 
72     HTML.Element element = t.getElement();
73     if (element.isEmpty()) {
74       tree.addSingularTag(t);
75     } else if (t.isSelfTerminating()) {
76       // Explicitly create a non-selfterminating open tag and add it to the tree
77       // and also immediately add the corresponding close tag. This is done
78       // so that the toHTML, toXHTML and toOriginalHTML of the tree's node list
79       // will be balanced consistently.
80       // Otherwise there is a possibility of "<span /></span>" for example, if
81       // the created tree is converted to string through toXHTML.
82       tree.addStartTag(HtmlDocument.createTag(element,
83           t.getAttributes(), t.getOriginalHtmlBeforeAttributes(),
84           t.getOriginalHtmlAfterAttributes()));
85       EndTag end = HtmlDocument.createEndTag(element);
86       tableFixer.seeEndTag(end);
87       tree.addEndTag(end);
88     } else {
89       tree.addStartTag(t);
90       push(element);                       // Track the open tags
91     }
92   }
93 
94   /** Implements HtmlVisitor.visit */
visitEndTag(HtmlDocument.EndTag t)95   public void visitEndTag(HtmlDocument.EndTag t) {
96 
97     // Here we pop back to the start tag
98     HTML.Element element = t.getElement();
99     int pos = findStartTag(element);
100     if (pos >= 0) {
101 
102       // Add missing end-tags if any
103       while (pos < stack.size() - 1) {
104         addMissingEndTag();
105       }
106 
107       pop();
108       tableFixer.seeEndTag(t);
109       tree.addEndTag(t);
110 
111     } else {
112       // Not found, ignore this end tag
113       logger.finest("Ignoring end tag: " + element.getName());
114     }
115   }
116 
117   /** Implements HtmlDocument.Visitor.visitText */
visitText(HtmlDocument.Text t)118   public void visitText(HtmlDocument.Text t) {
119     tableFixer.seeText(t);
120     tree.addText(t);
121   }
122 
123   /** Implements HtmlDocument.Visitor.visitComment */
visitComment(HtmlDocument.Comment n)124   public void visitComment(HtmlDocument.Comment n) {
125     // ignore
126   }
127 
128   /** Finds the start tag from the stack, returns -1 if not found */
findStartTag(HTML.Element element)129   private int findStartTag(HTML.Element element) {
130     for (int i = stack.size() - 1; i >= 0; i--) {
131       HTML.Element e = stack.get(i);
132       if (e == element) {
133         return i;
134       }
135     }
136     return -1;
137   }
138 
139   /**
140    * Adds a close tag corresponding to a tag on the stack, if
141    * the tag needs a close tag.
142    */
addMissingEndTag()143   private void addMissingEndTag() {
144     HTML.Element element = pop();
145 
146     HtmlDocument.EndTag endTag = HtmlDocument.createEndTag(element);
147     tableFixer.seeEndTag(endTag);
148     tree.addEndTag(endTag);
149   }
150 
151   /** Pushes a tag onto the stack */
push(HTML.Element element)152   private void push(HTML.Element element) {
153     stack.add(element);
154   }
155 
156   /** Pops an elemnt from the stack */
pop()157   private HTML.Element pop() {
158     return stack.remove(stack.size() - 1);
159   }
160 
161   /**
162    * The TableFixer makes sure that a <table> structure is more or less well
163    * formed. Note that it only ensures that data within the <table> tag doesn't
164    * "leak out" of the table.
165    *
166    * For instance, all the tags here are balanced with end tags. But the
167    * 'outside' text ends up leaking out of the table.
168    * <table><tr><td bgcolor=yellow>
169    * <table><table>inside</table><td>outside</td></table>
170    * </td></tr></table>
171    *
172    * The TableFixer makes sure that
173    * 1) Within a table:, text and other elements are enclosed within a TD.
174    *    A TD tag is inserted where necessary.
175    * 2) All table structure tags are enclosed within a <table>. A TABLE tag
176    *    is inserted where necessary.
177    *
178    * Note that the TableFixer only adds open tags, it doesn't add end tags.
179    * The HtmlTreeVerifier ensures that all open tags are properly matched
180    * up and closed.
181    *
182    * @author Jing Yee Lim (jlim@google.com)
183    */
184   class TableFixer {
185 
186     private int tables = 0;             // table nesting level
187 
188     // States within a <table>
189     static final int NULL = 0;
190     static final int IN_CELL = 1;       // in a <td> or <th> tag
191     static final int IN_CAPTION = 2;    // in a <caption> tag
192 
193     private int state;
194 
seeTag(HtmlDocument.Tag tag)195     void seeTag(HtmlDocument.Tag tag) {
196       HTML.Element element = tag.getElement();
197       if (element.getType() == HTML.Element.TABLE_TYPE) {
198 
199         if (HTML4.TABLE_ELEMENT.equals(element)) {
200           if (tables > 0) {
201             ensureCellState();
202           }
203           tables++;
204           state = NULL;
205 
206         } else {
207           // Make sure that we're in a table
208           ensureTableState();
209 
210           // In cell/caption?
211           if (HTML4.TD_ELEMENT.equals(element) ||
212               HTML4.TH_ELEMENT.equals(element)) {
213             state = IN_CELL;
214 
215           } else if (HTML4.CAPTION_ELEMENT.equals(element)) {
216             state = IN_CAPTION;
217           }
218         }
219       } else {
220         if (tables > 0) {
221 
222           // Ok to have a form element outside a table cell.
223           // e.g. <TR><FORM><TD>...
224           if (!HTML4.FORM_ELEMENT.equals(element)) {
225             ensureCellState();
226           }
227         }
228       }
229     }
230 
seeEndTag(HtmlDocument.EndTag endTag)231     void seeEndTag(HtmlDocument.EndTag endTag) {
232       HTML.Element element= endTag.getElement();
233 
234       if (tables > 0 && element.getType() == HTML.Element.TABLE_TYPE) {
235 
236         if (HTML4.TD_ELEMENT.equals(element) ||
237             HTML4.TR_ELEMENT.equals(element) ||
238             HTML4.TH_ELEMENT.equals(element)) {
239           // End of a cell
240           state = NULL;
241 
242         } else if (HTML4.CAPTION_ELEMENT.equals(element)) { // End caption
243           state = NULL;
244 
245         } else if (HTML4.TABLE_ELEMENT.equals(element)) { // End table
246           X.assertTrue(tables > 0);
247           tables--;
248           state = (tables > 0) ? IN_CELL : NULL;
249         }
250       }
251     }
252 
seeText(HtmlDocument.Text textNode)253     void seeText(HtmlDocument.Text textNode) {
254       // If we're in a table, but not in a cell or caption, and the
255       // text is not whitespace, add a <TD>
256       if (tables > 0 &&
257           state == NULL &&
258           !textNode.isWhitespace()) {
259         ensureCellState();
260       }
261     }
262 
finish()263     void finish() {
264       X.assertTrue(tables == 0);
265       X.assertTrue(state == NULL);
266     }
267 
268     // Ensure that we're within a TABLE
ensureTableState()269     private void ensureTableState() {
270       if (tables == 0) {
271         push(HTML4.TABLE_ELEMENT);
272 
273         HtmlDocument.Tag tableTag =
274           HtmlDocument.createTag(HTML4.TABLE_ELEMENT, null);
275         tree.addStartTag(tableTag);
276 
277         tables++;
278       }
279     }
280 
281     // Ensure that we're within a TD or TH cell
ensureCellState()282     private void ensureCellState() {
283       if (state != IN_CELL) {
284         push(HTML4.TD_ELEMENT);
285 
286         HtmlDocument.Tag tdTag = HtmlDocument.createTag(HTML4.TD_ELEMENT, null);
287         tree.addStartTag(tdTag);
288 
289         state = IN_CELL;
290       }
291     }
292   }
293 
294   /** For testing */
main(String[] args)295   public static void main(String[] args) throws IOException {
296     logger.setLevel(Level.FINEST);
297 
298     String html = new String(ByteStreams.toByteArray(System.in));
299     HtmlParser parser = new HtmlParser();
300     HtmlDocument doc = parser.parse(html);
301 
302     HtmlTreeBuilder builder = new HtmlTreeBuilder();
303     doc.accept(builder);
304     String outputHtml = builder.getTree().getHtml();
305 
306     System.out.println(outputHtml);
307   }
308 }