// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.util.LinkedList;
import java.util.List;
import javax.annotation.Nullable;

import com.google.common.collect.Lists;

/**
 * Lexes a string of HTML and dispatches the resulting events to a policy
 * object, which decides which elements and attributes to allow.
 */
public final class HtmlSanitizer {

  /**
   * Maximum element nesting depth handed to the tag balancer.
   *
   * <p>
   * According to Opera the maximum table nesting depth seen in the wild is
   * 795, but 99.99% of documents have a table nesting depth of less than 22.
   * Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
   * document depth of 90 (incl. HTML &amp; BODY).
   * Table nesting depth is not the same as whole document depth, but it is
   * the best proxy available.
   * See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
   * the original data.
   *
   * <p>
   * Webkit defines the maximum HTML parser tree depth as 512.
   * http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
   * {@code static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;}
   *
   * <p>
   * The first number gives a lower bound on the nesting depth to allow, 90,
   * and the second gives an upper bound: 512.  256 is substantially larger
   * than the lower bound and well clear of the upper bound.
   */
  private static final int NESTING_LIMIT = 256;

  /**
   * Receives events derived from the HTML stream and applies a policy to
   * decide which HTML constructs to allow.
   * Typically, implementations use an {@link HtmlStreamRenderer} to produce
   * the sanitized output.
   *
   * <p>
   * <b>Implementations of this class are in the TCB.</b></p>
   */
  @TCB
  public interface Policy extends HtmlStreamEventReceiver {
    /**
     * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     * @param attrs a list of alternating attribute names and values.
     *     For efficiency, the implementation may mutate this list during
     *     the call, but ownership reverts to the caller on method exit.
     *     The values are raw -- HTML entities have been decoded.
     *     Specifically, implementations are allowed to use a list iterator
     *     to remove all disallowed attributes, add necessary attributes, and
     *     then pass the list to an {@link HtmlStreamRenderer}.
     */
    void openTag(String elementName, List<String> attrs);

    /**
     * Called when an HTML tag like {@code </foo>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     */
    void closeTag(String elementName);

    /**
     * Called when textual content is seen.
     *
     * @param textChunk raw content -- HTML entities have been decoded.
     */
    void text(String textChunk);
  }

  /**
   * Sanitizes the given HTML by applying the given policy to it.
   *
   * <p>
   * This method is not in the TCB.
   *
   * <p>
   * Nothing is returned: policies are assumed to render the things they
   * accept and to do nothing with the things they reject.
   * Use {@link HtmlStreamRenderer} to render content to an output buffer.
   *
   * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
   *     empty string and will not result in a {@code NullPointerException}.
   * @param policy The Policy that will receive events based on the tokens in
   *     HTML.  Typically, this policy ends up routing the events to an
   *     {@link HtmlStreamRenderer} after filtering.
   *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
   */
  public static void sanitize(@Nullable String html, final Policy policy) {
    final String markup = (html != null) ? html : "";

    // Events reach the policy through the balancer rather than directly.
    TagBalancingHtmlStreamEventReceiver balancer
        = new TagBalancingHtmlStreamEventReceiver(policy);
    balancer.setNestingLimit(NESTING_LIMIT);

    balancer.openDocument();

    HtmlLexer lexer = new HtmlLexer(markup);
    // Use a linked list so that policies can use Iterator.remove() in an O(1)
    // way.  The same list instance is reused for every open tag.
    LinkedList<String> attrs = Lists.newLinkedList();
    while (lexer.hasNext()) {
      HtmlToken token = lexer.next();
      switch (token.type) {
        case TEXT: {
          String encodedText = markup.substring(token.start, token.end);
          balancer.text(Encoding.decodeHtml(encodedText));
          break;
        }
        case UNESCAPED: {
          String verbatimText = markup.substring(token.start, token.end);
          balancer.text(Encoding.stripBannedCodeunits(verbatimText));
          break;
        }
        case TAGBEGIN:
          if (markup.charAt(token.start + 1) == '/') {
            // A close tag: the name starts after the "</".
            String closedName = markup.substring(token.start + 2, token.end);
            balancer.closeTag(HtmlLexer.canonicalName(closedName));
            skipPastTagEnd(lexer);
          } else {
            // An open tag: the name starts after the "<".
            collectAttributes(markup, lexer, attrs);
            String openedName = markup.substring(token.start + 1, token.end);
            balancer.openTag(HtmlLexer.canonicalName(openedName), attrs);
          }
          break;
        default:
          // Ignore comments, XML prologues, processing instructions, and other
          // stuff that shouldn't show up in the output.
          break;
      }
    }

    balancer.closeDocument();
  }

  /**
   * Consumes tokens up to and including the next tag end (the ">"), or until
   * the lexer is exhausted.
   */
  private static void skipPastTagEnd(HtmlLexer lexer) {
    while (lexer.hasNext()) {
      if (lexer.next().type == HtmlTokenType.TAGEND) {
        return;
      }
    }
  }

  /**
   * Consumes the body of an open tag and fills {@code attrs} with alternating
   * attribute names and values, stopping after the tag end or when the lexer
   * is exhausted.
   *
   * <p>
   * A name that is not followed by a value is given its own name as its
   * value.  Names are canonicalized; values have their quotes stripped and
   * their HTML entities decoded.
   *
   * @param html the text the lexer is tokenizing.
   * @param lexer positioned just after the tag's TAGBEGIN token.
   * @param attrs cleared on entry and then populated.
   */
  private static void collectAttributes(
      String html, HtmlLexer lexer, LinkedList<String> attrs) {
    attrs.clear();

    // True while the last entry in attrs is a name that has no value yet.
    boolean nameAwaitingValue = false;
    while (lexer.hasNext()) {
      HtmlToken part = lexer.next();
      HtmlTokenType partType = part.type;
      if (partType == HtmlTokenType.TAGEND) {
        break;
      }
      if (partType == HtmlTokenType.ATTRNAME) {
        if (nameAwaitingValue) {
          // Last attribute added was valueless.
          attrs.add(attrs.getLast());
        }
        nameAwaitingValue = true;
        attrs.add(HtmlLexer.canonicalName(
            html.substring(part.start, part.end)));
      } else if (partType == HtmlTokenType.ATTRVALUE) {
        attrs.add(Encoding.decodeHtml(stripQuotes(
            html.substring(part.start, part.end))));
        nameAwaitingValue = false;
      }
      // Just drop anything not recognized.
    }
    if (nameAwaitingValue) {
      attrs.add(attrs.getLast());
    }
  }

  /**
   * Removes the quotes surrounding an attribute value that has not yet been
   * HTML-decoded.
   *
   * <p>
   * Only a value ending in {@code "} or {@code '} is changed.  The trailing
   * quote is always dropped; the leading character is dropped too when it is
   * the same quote character.
   *
   * @param encodedAttributeValue the attribute value as it appears in the
   *     markup.
   * @return the value without its quotes, or the input itself when it does
   *     not end with a quote character.
   */
  private static String stripQuotes(String encodedAttributeValue) {
    int length = encodedAttributeValue.length();
    if (length == 0) {
      return encodedAttributeValue;
    }
    char closer = encodedAttributeValue.charAt(length - 1);
    if (closer != '"' && closer != '\'') {
      return encodedAttributeValue;
    }
    // Browsers deal with missing left quotes : <img src=foo.png">
    // but generally do not deal with missing right : <img src="foo.png>
    boolean hasMatchingOpener =
        length > 1 && encodedAttributeValue.charAt(0) == closer;
    int start = hasMatchingOpener ? 1 : 0;
    return encodedAttributeValue.substring(start, length - 1);
  }

}