001 // Copyright (c) 2011, Mike Samuel 002 // All rights reserved. 003 // 004 // Redistribution and use in source and binary forms, with or without 005 // modification, are permitted provided that the following conditions 006 // are met: 007 // 008 // Redistributions of source code must retain the above copyright 009 // notice, this list of conditions and the following disclaimer. 010 // Redistributions in binary form must reproduce the above copyright 011 // notice, this list of conditions and the following disclaimer in the 012 // documentation and/or other materials provided with the distribution. 013 // Neither the name of the OWASP nor the names of its contributors may 014 // be used to endorse or promote products derived from this software 015 // without specific prior written permission. 016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 027 // POSSIBILITY OF SUCH DAMAGE. 028 029 package org.owasp.html; 030 031 import java.util.LinkedList; 032 import java.util.List; 033 import javax.annotation.Nullable; 034 035 import com.google.common.collect.Lists; 036 037 /** 038 * Consumes an HTML stream, and dispatches events to a policy object which 039 * decides which elements and attributes to allow. 040 */ 041 public final class HtmlSanitizer { 042 043 /** 044 * Receives events based on the HTML stream, and applies a policy to decide 045 * what HTML constructs to allow. 046 * Typically, implementations use an {@link HtmlStreamRenderer} to produce 047 * the sanitized output. 048 * 049 * <p> 050 * <b>Implementations of this class are in the TCB.</b></p> 051 */ 052 @TCB 053 public interface Policy extends HtmlStreamEventReceiver { 054 /** 055 * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input. 056 * 057 * @param elementName a normalized (lower-case for non-namespaced names) 058 * element name. 059 * @param attrs a list of alternating attribute name and value pairs. 060 * For efficiency, this list may be mutated by this during this method 061 * call, but ownership reverts to the caller on method exit. 062 * The values are raw -- HTML entities have been decoded. 063 * Specifically, implementations are allowed to use a list iterator 064 * and remove all disallowed attributes, add necessary attributes, and 065 * then pass the list to an {@link HtmlStreamRenderer}. 066 */ 067 void openTag(String elementName, List<String> attrs); 068 069 /** 070 * Called when an HTML tag like {@code </foo>} is seen in the input. 071 * 072 * @param elementName a normalized (lower-case for non-namespaced names) 073 * element name. 074 */ 075 void closeTag(String elementName); 076 077 /** 078 * Called when textual content is seen. 079 * @param textChunk raw content -- HTML entities have been decoded. 080 */ 081 void text(String textChunk); 082 } 083 084 /** 085 * Sanitizes the given HTML by applying the given policy to it. 086 * 087 * <p> 088 * This method is not in the TCB. 089 * 090 * <p> 091 * This method has no return value since policies are assumed to render things 092 * they accept and do nothing on things they reject. 093 * Use {@link HtmlStreamRenderer} to render content to an output buffer. 094 * 095 * @param html A snippet of HTML to sanitize. {@code null} is treated as the 096 * empty string and will not result in a {@code NullPointerException}. 097 * @param policy The Policy that will receive events based on the tokens in 098 * HTML. Typically, this policy ends up routing the events to an 099 * {@link HtmlStreamRenderer} after filtering. 100 * {@link HtmlPolicyBuilder} provides an easy way to create policies. 101 */ 102 public static void sanitize(@Nullable String html, final Policy policy) { 103 if (html == null) { html = ""; } 104 105 TagBalancingHtmlStreamEventReceiver balancer 106 = new TagBalancingHtmlStreamEventReceiver(policy); 107 108 // According to Opera the maximum table nesting depth seen in the wild is 109 // 795, but 99.99% of documents have a table nesting depth of less than 22. 110 // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a 111 // document depth of 90 (incl. HTML & BODY). 112 // Obviously table nesting depth is not the same as whole document depth, 113 // but it is the best proxy I have available. 114 // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for 115 // the original data. 116 117 // Webkit defines the maximum HTML parser tree depth as 512. 118 // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408 119 // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512; 120 121 // The first number gives us a lower bound on the nesting depth we allow, 122 // 90, and the second gives us an upper bound: 512. 123 // We do not want to bump right up against that limit. 124 // 256 is substantially larger than the lower bound and well clear of the 125 // upper bound. 126 balancer.setNestingLimit(256); 127 128 balancer.openDocument(); 129 130 HtmlLexer lexer = new HtmlLexer(html); 131 // Use a linked list so that policies can use Iterator.remove() in an O(1) 132 // way. 133 LinkedList<String> attrs = Lists.newLinkedList(); 134 while (lexer.hasNext()) { 135 HtmlToken token = lexer.next(); 136 switch (token.type) { 137 case TEXT: 138 balancer.text( 139 Encoding.decodeHtml(html.substring(token.start, token.end))); 140 break; 141 case UNESCAPED: 142 balancer.text(Encoding.stripBannedCodeunits( 143 html.substring(token.start, token.end))); 144 break; 145 case TAGBEGIN: 146 if (html.charAt(token.start + 1) == '/') { // A close tag. 147 balancer.closeTag(HtmlLexer.canonicalName( 148 html.substring(token.start + 2, token.end))); 149 while (lexer.hasNext() 150 && lexer.next().type != HtmlTokenType.TAGEND) { 151 // skip tokens until we see a ">" 152 } 153 } else { 154 attrs.clear(); 155 156 boolean attrsReadyForName = true; 157 tagBody: 158 while (lexer.hasNext()) { 159 HtmlToken tagBodyToken = lexer.next(); 160 switch (tagBodyToken.type) { 161 case ATTRNAME: 162 if (!attrsReadyForName) { 163 // Last attribute added was valueless. 164 attrs.add(attrs.getLast()); 165 } else { 166 attrsReadyForName = false; 167 } 168 attrs.add(HtmlLexer.canonicalName( 169 html.substring(tagBodyToken.start, tagBodyToken.end))); 170 break; 171 case ATTRVALUE: 172 attrs.add(Encoding.decodeHtml(stripQuotes( 173 html.substring(tagBodyToken.start, tagBodyToken.end)))); 174 attrsReadyForName = true; 175 break; 176 case TAGEND: 177 break tagBody; 178 default: 179 // Just drop anything not recognized 180 } 181 } 182 if (!attrsReadyForName) { 183 attrs.add(attrs.getLast()); 184 } 185 balancer.openTag( 186 HtmlLexer.canonicalName( 187 html.substring(token.start + 1, token.end)), 188 attrs); 189 } 190 break; 191 default: 192 // Ignore comments, XML prologues, processing instructions, and other 193 // stuff that shouldn't show up in the output. 194 break; 195 } 196 } 197 198 balancer.closeDocument(); 199 } 200 201 private static String stripQuotes(String encodedAttributeValue) { 202 int n = encodedAttributeValue.length(); 203 if (n > 0) { 204 char last = encodedAttributeValue.charAt(n - 1); 205 if (last == '"' || last == '\'') { 206 int start = 0; 207 if (n != 1 && last == encodedAttributeValue.charAt(0)) { 208 start = 1; 209 } else { 210 // Browsers deal with missing left quotes : <img src=foo.png"> 211 // but generally do not deal with missing right : <img src="foo.png> 212 } 213 return encodedAttributeValue.substring(start, n - 1); 214 } 215 } 216 return encodedAttributeValue; 217 } 218 219 }