1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.android.mail.utils;
17 
18 import android.os.Looper;
19 import android.util.Log;
20 
21 import com.android.mail.perf.Timer;
22 import com.google.common.collect.ImmutableList;
23 import com.google.common.collect.ImmutableSet;
24 
25 import org.owasp.html.AttributePolicy;
26 import org.owasp.html.CssSchema;
27 import org.owasp.html.ElementPolicy;
28 import org.owasp.html.FilterUrlByProtocolAttributePolicy;
29 import org.owasp.html.Handler;
30 import org.owasp.html.HtmlPolicyBuilder;
31 import org.owasp.html.HtmlStreamRenderer;
32 import org.owasp.html.PolicyFactory;
33 
34 import java.util.List;
35 
36 /**
37  * This sanitizer is meant to strip all scripts and any malicious HTML from untrusted emails. It
38  * uses the <a href="https://www.owasp.org/index.php/OWASP_Java_HTML_Sanitizer_Project">OWASP Java
39  * HTML Sanitizer Project</a> to whitelist the subset of HTML elements and attributes as well as CSS
40  * properties that are considered safe. Any unmatched HTML or CSS is discarded.
41  *
42  * All URLS are scrubbed to ensure they match the blessed form of "http://the.url.here",
43  * "https://the.url.here" or "mailto:address@server.com" and cannot resemble "javascript:badness()"
44  * or comparable.
45  */
46 public final class HtmlSanitizer {
47 
48     /**
49      * This version number should be bumped each time a meaningful change is made to this sanitizer
50      * configuration which influences its output. It is compared against a minimum target version
51      * number. If it meets or exceeds the minimum target version, the result of the sanitizer is
52      * free to be shown in a standard webview. If it does not meet the minimum target version then
53      * the sanitized output is deemed untrustworthy and is shown in a sandboxed webview with
54      * javascript execution disabled.
55      */
56     public static final int VERSION = 1;
57 
58     private static final String LOG_TAG = LogTag.getLogTag();
59 
60     /**
61      * The following CSS properties do not appear in the default whitelist from OWASP, but they
62      * improve the fidelity of the HTML display without unacceptable risk.
63      */
64     private static final CssSchema ADDITIONAL_CSS = CssSchema.withProperties(ImmutableSet.of(
65             "float",
66             "display"
67     ));
68 
69     /**
70      * Translates the body tag into the div tag
71      */
72     private static final ElementPolicy TRANSLATE_BODY_TO_DIV = new ElementPolicy() {
73         public String apply(String elementName, List<String> attrs) {
74             return "div";
75         }
76     };
77 
78     /**
79      * Translates <div> tags surrounding quoted text into <div class="elided-text"> which allows
80      * quoted text collapsing in ConversationViewFragment.
81      */
82     private static final ElementPolicy TRANSLATE_DIV_CLASS = new ElementPolicy() {
83         public String apply(String elementName, List<String> attrs) {
84             boolean showHideQuotedText = false;
85 
86             // check if the class attribute is listed
87             final int classIndex = attrs.indexOf("class");
88             if (classIndex >= 0) {
89                 // remove the class attribute and its value
90                 final String value = attrs.remove(classIndex + 1);
91                 attrs.remove(classIndex);
92 
93                 // gmail and yahoo use a specific div class name to indicate quoted text
94                 showHideQuotedText = "gmail_quote".equals(value) || "yahoo_quoted".equals(value);
95             }
96 
97             // check if the id attribute is listed
98             final int idIndex = attrs.indexOf("id");
99             if (idIndex >= 0) {
100                 // remove the id attribute and its value
101                 final String value = attrs.remove(idIndex + 1);
102                 attrs.remove(idIndex);
103 
104                 // AOL uses a specific id value to indicate quoted text
105                 showHideQuotedText = value.startsWith("AOLMsgPart");
106             }
107 
108             // insert a class attribute with a value of "elided-text" to hide/show quoted text
109             if (showHideQuotedText) {
110                 attrs.add("class");
111                 attrs.add("elided-text");
112             }
113 
114             return "div";
115         }
116     };
117 
118     /**
119      * Disallow "cid:" and "mailto:" urls on all tags not &lt;a&gt; or &lt;img&gt;.
120      */
121     private static final AttributePolicy URL_PROTOCOLS =
122             new FilterUrlByProtocolAttributePolicy(ImmutableList.of("http", "https"));
123 
124     /**
125      * Disallow the "cid:" url on links. Do allow "mailto:" urls to support sending mail.
126      */
127     private static final AttributePolicy A_HREF_PROTOCOLS =
128             new FilterUrlByProtocolAttributePolicy(ImmutableList.of("mailto", "http", "https"));
129 
130     /**
131      * Disallow the "mailto:" url on images so that "Show pictures" can't be used to start composing
132      * a bajillion emails. Do allow "cid:" urls to support inline image attachments.
133      */
134     private static final AttributePolicy IMG_SRC_PROTOCOLS =
135             new FilterUrlByProtocolAttributePolicy(ImmutableList.of("cid", "http", "https"));
136 
137     /**
138      * This sanitizer policy removes these elements and the content within:
139      * <ul>
140      *     <li>APPLET</li>
141      *     <li>FRAMESET</li>
142      *     <li>OBJECT</li>
143      *     <li>SCRIPT</li>
144      *     <li>STYLE</li>
145      *     <li>TITLE</li>
146      * </ul>
147      *
148      * This sanitizer policy removes these elements but preserves the content within:
149      * <ul>
150      *     <li>BASEFONT</li>
151      *     <li>FRAME</li>
152      *     <li>HEAD</li>
153      *     <li>IFRAME</li>
154      *     <li>ISINDEX</li>
155      *     <li>LINK</li>
156      *     <li>META</li>
157      *     <li>NOFRAMES</li>
158      *     <li>PARAM</li>
159      *     <li>NOSCRIPT</li>
160      * </ul>
161      *
162      * This sanitizer policy removes these attributes from all elements:
163      * <ul>
164      *     <li>code</li>
165      *     <li>codebase</li>
166      *     <li>id</li>
167      *     <li>for</li>
168      *     <li>headers</li>
169      *     <li>onblur</li>
170      *     <li>onchange</li>
171      *     <li>onclick</li>
172      *     <li>ondblclick</li>
173      *     <li>onfocus</li>
174      *     <li>onkeydown</li>
175      *     <li>onkeypress</li>
176      *     <li>onkeyup</li>
177      *     <li>onload</li>
178      *     <li>onmousedown</li>
179      *     <li>onmousemove</li>
180      *     <li>onmouseout</li>
181      *     <li>onmouseover</li>
182      *     <li>onmouseup</li>
183      *     <li>onreset</li>
184      *     <li>onselect</li>
185      *     <li>onsubmit</li>
186      *     <li>onunload</li>
187      *     <li>tabindex</li>
188      * </ul>
189      */
190     private static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
191             .allowAttributes("dir").matching(true, "ltr", "rtl").globally()
192             .allowUrlProtocols("cid", "http", "https", "mailto")
193             .allowStyling(CssSchema.union(CssSchema.DEFAULT, ADDITIONAL_CSS))
194             .disallowTextIn("applet", "frameset", "object", "script", "style", "title")
195             .allowElements("a")
196                 .allowAttributes("coords", "name", "shape").onElements("a")
197                 .allowAttributes("href").matching(A_HREF_PROTOCOLS).onElements("a")
198             .allowElements("abbr").allowAttributes("title").onElements("abbr")
199             .allowElements("acronym").allowAttributes("title").onElements("acronym")
200             .allowElements("address")
201             .allowElements("area")
202                 .allowAttributes("alt", "coords", "nohref", "name", "shape").onElements("area")
203                 .allowAttributes("href").matching(URL_PROTOCOLS).onElements("area")
204             .allowElements("article")
205             .allowElements("aside")
206             .allowElements("b")
207             .allowElements("base")
208                 .allowAttributes("href").matching(URL_PROTOCOLS).onElements("base")
209             .allowElements("bdi").allowAttributes("dir").onElements("bdi")
210             .allowElements("bdo").allowAttributes("dir").onElements("bdo")
211             .allowElements("big")
212             .allowElements("blockquote").allowAttributes("cite").onElements("blockquote")
213             .allowElements(TRANSLATE_BODY_TO_DIV, "body")
214             .allowElements("br").allowAttributes("clear").onElements("br")
215             .allowElements("button")
216                 .allowAttributes("autofocus", "disabled", "form", "formaction", "formenctype",
217                         "formmethod", "formnovalidate", "formtarget", "name", "type", "value")
218             .onElements("button")
219             .allowElements("canvas").allowAttributes("width", "height").onElements("canvas")
220             .allowElements("caption").allowAttributes("align").onElements("caption")
221             .allowElements("center")
222             .allowElements("cite")
223             .allowElements("code")
224             .allowElements("col")
225                 .allowAttributes("align", "bgcolor", "char", "charoff", "span", "valign", "width")
226             .onElements("col")
227             .allowElements("colgroup")
228                 .allowAttributes("align", "char", "charoff", "span", "valign", "width")
229             .onElements("colgroup")
230             .allowElements("datalist")
231             .allowElements("dd")
232             .allowElements("del").allowAttributes("cite", "datetime").onElements("del")
233             .allowElements("details")
234             .allowElements("dfn")
235             .allowElements("dir").allowAttributes("compact").onElements("dir")
236             .allowElements(TRANSLATE_DIV_CLASS, "div")
237                 .allowAttributes("align", "background", "class", "id")
238             .onElements("div")
239             .allowElements("dl")
240             .allowElements("dt")
241             .allowElements("em")
242             .allowElements("fieldset")
243                 .allowAttributes("disabled", "form", "name")
244             .onElements("fieldset")
245             .allowElements("figcaption")
246             .allowElements("figure")
247             .allowElements("font").allowAttributes("color", "face", "size").onElements("font")
248             .allowElements("footer")
249             .allowElements("form")
250                 .allowAttributes("accept", "action", "accept-charset", "autocomplete", "enctype",
251                         "method", "name", "novalidate", "target")
252             .onElements("form")
253             .allowElements("header")
254             .allowElements("h1").allowAttributes("align").onElements("h1")
255             .allowElements("h2").allowAttributes("align").onElements("h2")
256             .allowElements("h3").allowAttributes("align").onElements("h3")
257             .allowElements("h4").allowAttributes("align").onElements("h4")
258             .allowElements("h5").allowAttributes("align").onElements("h5")
259             .allowElements("h6").allowAttributes("align").onElements("h6")
260             .allowElements("hr")
261                 .allowAttributes("align", "noshade", "size", "width")
262             .onElements("hr")
263             .allowElements("i")
264             .allowElements("img")
265                 .allowAttributes("src").matching(IMG_SRC_PROTOCOLS).onElements("img")
266                 .allowAttributes("longdesc").matching(URL_PROTOCOLS).onElements("img")
267                 .allowAttributes("align", "alt", "border", "crossorigin", "height", "hspace",
268                         "ismap", "usemap", "vspace", "width")
269             .onElements("img")
270             .allowElements("input")
271                 .allowAttributes("src").matching(URL_PROTOCOLS).onElements("input")
272                 .allowAttributes("formaction").matching(URL_PROTOCOLS).onElements("input")
273                 .allowAttributes("accept", "align", "alt", "autocomplete", "autofocus", "checked",
274                         "disabled", "form", "formenctype", "formmethod", "formnovalidate",
275                         "formtarget", "height", "list", "max", "maxlength", "min", "multiple",
276                         "name", "pattern", "placeholder", "readonly", "required", "size", "step",
277                         "type", "value", "width")
278             .onElements("input")
279             .allowElements("ins")
280                 .allowAttributes("cite").matching(URL_PROTOCOLS).onElements("ins")
281                 .allowAttributes("datetime").onElements("ins")
282             .allowElements("kbd")
283             .allowElements("keygen")
284                 .allowAttributes("autofocus", "challenge", "disabled", "form", "keytype", "name")
285             .onElements("keygen")
286             .allowElements("label").allowAttributes("form").onElements("label")
287             .allowElements("legend").allowAttributes("align").onElements("legend")
288             .allowElements("li").allowAttributes("type", "value").onElements("li")
289             .allowElements("main")
290             .allowElements("map").allowAttributes("name").onElements("map")
291             .allowElements("mark")
292             .allowElements("menu").allowAttributes("label", "type").onElements("menu")
293             .allowElements("menuitem")
294                 .allowAttributes("icon").matching(URL_PROTOCOLS).onElements("menuitem")
295                 .allowAttributes("checked", "command", "default", "disabled", "label", "type",
296                         "radiogroup").onElements("menuitem")
297             .allowElements("meter")
298                 .allowAttributes("form", "high", "low", "max", "min", "optimum", "value")
299             .onElements("meter")
300             .allowElements("nav")
301             .allowElements("ol")
302                 .allowAttributes("compact", "reversed", "start", "type")
303             .onElements("ol")
304             .allowElements("optgroup").allowAttributes("disabled", "label").onElements("optgroup")
305             .allowElements("option")
306                 .allowAttributes("disabled", "label", "selected", "value")
307             .onElements("option")
308             .allowElements("output").allowAttributes("form", "name").onElements("output")
309             .allowElements("p").allowAttributes("align").onElements("p")
310             .allowElements("pre").allowAttributes("width").onElements("pre")
311             .allowElements("progress").allowAttributes("max", "value").onElements("progress")
312             .allowElements("q").allowAttributes("cite").matching(URL_PROTOCOLS).onElements("q")
313             .allowElements("rp")
314             .allowElements("rt")
315             .allowElements("ruby")
316             .allowElements("s")
317             .allowElements("samp")
318             .allowElements("section")
319             .allowElements("select")
320                 .allowAttributes("autofocus", "disabled", "form", "multiple", "name", "required",
321                         "size")
322             .onElements("select")
323             .allowElements("small")
324             .allowElements("span")
325             .allowElements("strike")
326             .allowElements("strong")
327             .allowElements("sub")
328             .allowElements("summary")
329             .allowElements("sup")
330             .allowElements("table")
331                 .allowAttributes("align", "bgcolor", "border", "cellpadding", "cellspacing",
332                         "frame", "rules", "sortable", "summary", "width")
333             .onElements("table")
334             .allowElements("tbody")
335                 .allowAttributes("align", "char", "charoff", "valign").onElements("tbody")
336             .allowElements("td")
337                 .allowAttributes("abbr", "align", "axis", "bgcolor", "char", "charoff", "colspan",
338                         "height", "nowrap", "rowspan", "scope", "valign", "width")
339             .onElements("td")
340             .allowElements("textarea")
341                 .allowAttributes("autofocus", "cols", "disabled", "form", "maxlength", "name",
342                         "placeholder", "readonly", "required", "rows", "wrap")
343             .onElements("textarea")
344             .allowElements("tfoot")
345                 .allowAttributes("align", "char", "charoff", "valign").onElements("tfoot")
346             .allowElements("th")
347                 .allowAttributes("abbr", "align", "axis", "bgcolor", "char", "charoff", "colspan",
348                         "height", "nowrap", "rowspan", "scope", "sorted", "valign", "width")
349             .onElements("th")
350             .allowElements("thead")
351                 .allowAttributes("align", "char", "charoff", "valign").onElements("thead")
352             .allowElements("time").allowAttributes("datetime").onElements("time")
353             .allowElements("tr")
354                 .allowAttributes("align", "bgcolor", "char", "charoff", "valign").onElements("tr")
355             .allowElements("tt")
356             .allowElements("u")
357             .allowElements("ul").allowAttributes("compact", "type").onElements("ul")
358             .allowElements("var")
359             .allowElements("wbr")
360             .toFactory();
361 
HtmlSanitizer()362     private HtmlSanitizer() {}
363 
364     /**
365      * Sanitizing email is treated as an expensive operation; this method should be called from
366      * a background Thread.
367      *
368      * @param rawHtml the unsanitized, suspicious html
369      * @return the sanitized form of the <code>rawHtml</code>; <code>null</code> if
370      *      <code>rawHtml</code> was <code>null</code>
371      */
sanitizeHtml(final String rawHtml)372     public static String sanitizeHtml(final String rawHtml) {
373         if (Looper.getMainLooper() == Looper.myLooper()) {
374             throw new IllegalStateException("sanitizing email should not occur on the main thread");
375         }
376 
377         if (rawHtml == null) {
378             return null;
379         }
380 
381         // create the builder into which the sanitized email will be written
382         final StringBuilder htmlBuilder = new StringBuilder(rawHtml.length());
383 
384         // create the renderer that will write the sanitized HTML to the builder
385         final HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
386                 htmlBuilder,
387                 Handler.PROPAGATE,
388                 // log errors resulting from exceptionally bizarre inputs
389                 new Handler<String>() {
390                     public void handle(final String x) {
391                         Log.wtf(LOG_TAG, "Mangled HTML content cannot be parsed: " + x);
392                         throw new AssertionError(x);
393                     }
394                 }
395         );
396 
397         // create a thread-specific policy
398         final org.owasp.html.HtmlSanitizer.Policy policy = POLICY_DEFINITION.apply(renderer);
399 
400         // run the html through the sanitizer
401         Timer.startTiming("sanitizingHTMLEmail");
402         try {
403             org.owasp.html.HtmlSanitizer.sanitize(rawHtml, policy);
404         } finally {
405             Timer.stopTiming("sanitizingHTMLEmail");
406         }
407 
408         // return the resulting HTML from the builder
409         return htmlBuilder.toString();
410     }
411 }
412