1 // Copyright (c) 2011, Mike Samuel
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 // Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // Neither the name of the OWASP nor the names of its contributors may
14 // be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 // POSSIBILITY OF SUCH DAMAGE.
28 
29 package org.owasp.html.examples;
30 
31 import java.io.IOException;
32 import java.io.InputStreamReader;
33 import java.util.regex.Pattern;
34 
35 import org.owasp.html.Handler;
36 import org.owasp.html.HtmlPolicyBuilder;
37 import org.owasp.html.HtmlSanitizer;
38 import org.owasp.html.HtmlStreamRenderer;
39 import org.owasp.html.PolicyFactory;
40 
41 import com.google.common.base.Charsets;
42 import com.google.common.base.Predicate;
43 import com.google.common.base.Throwables;
44 import com.google.common.io.CharStreams;
45 
46 /**
47  * Based on the
48  * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
49  * <blockquote>
50  * eBay (http://www.ebay.com/) is the most popular online auction site in the
51  * universe, as far as I can tell. It is a public site so anyone is allowed to
52  * post listings with rich HTML content. It's not surprising that given the
53  * attractiveness of eBay as a target that it has been subject to a few complex
54  * XSS attacks. Listings are allowed to contain much more rich content than,
55  * say, Slashdot- so it's attack surface is considerably larger. The following
56  * tags appear to be accepted by eBay (they don't publish rules):
57  * {@code <a>},...
58  * </blockquote>
59  */
60 public class EbayPolicyExample {
61 
62   // Some common regular expression definitions.
63 
64   // The 16 colors defined by the HTML Spec (also used by the CSS Spec)
65   private static final Pattern COLOR_NAME = Pattern.compile(
66       "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
67       + "|red|silver|teal|white|yellow)");
68 
69   // HTML/CSS Spec allows 3 or 6 digit hex to specify color
70   private static final Pattern COLOR_CODE = Pattern.compile(
71       "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");
72 
73   private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
74       "[0-9]+%?");
75   private static final Pattern PARAGRAPH = Pattern.compile(
76       "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");
77   private static final Pattern HTML_ID = Pattern.compile(
78       "[a-zA-Z0-9\\:\\-_\\.]+");
79   // force non-empty with a '+' at the end instead of '*'
80   private static final Pattern HTML_TITLE = Pattern.compile(
81       "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");
82   private static final Pattern HTML_CLASS = Pattern.compile(
83       "[a-zA-Z0-9\\s,\\-_]+");
84 
85   private static final Pattern ONSITE_URL = Pattern.compile(
86       "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");
87   private static final Pattern OFFSITE_URL = Pattern.compile(
88       "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
89       + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*+\\s*");
90 
91   private static final Pattern NUMBER = Pattern.compile(
92       "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");
93 
94   private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");
95 
96   private static final Pattern ALIGN = Pattern.compile(
97       "(?i)center|left|right|justify|char");
98 
99   private static final Pattern VALIGN = Pattern.compile(
100       "(?i)baseline|bottom|middle|top");
101 
102   private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
103       = new Predicate<String>() {
104         public boolean apply(String s) {
105           return COLOR_NAME.matcher(s).matches()
106               || COLOR_CODE.matcher(s).matches();
107         }
108       };
109 
110   private static final Predicate<String> ONSITE_OR_OFFSITE_URL
111       = new Predicate<String>() {
112         public boolean apply(String s) {
113           return ONSITE_URL.matcher(s).matches()
114               || OFFSITE_URL.matcher(s).matches();
115         }
116       };
117 
118   private static final Pattern HISTORY_BACK = Pattern.compile(
119       "(?:javascript:)?\\Qhistory.go(-1)\\E");
120 
121   private static final Pattern ONE_CHAR = Pattern.compile(
122       ".?", Pattern.DOTALL);
123 
124 
125 
126   public static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
127           .allowAttributes("id").matching(HTML_ID).globally()
128           .allowAttributes("class").matching(HTML_CLASS).globally()
129           .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
130               .globally()
131           .allowAttributes("title").matching(HTML_TITLE).globally()
132           .allowStyling()
133           .allowAttributes("align").matching(ALIGN).onElements("p")
134           .allowAttributes("for").matching(HTML_ID).onElements("label")
135           .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
136               .onElements("font")
137           .allowAttributes("face")
138               .matching(Pattern.compile("[\\w;, \\-]+"))
139               .onElements("font")
140           .allowAttributes("size").matching(NUMBER).onElements("font")
141           .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
142               .onElements("a")
143           .allowStandardUrlProtocols()
144           .allowAttributes("nohref").onElements("a")
145           .allowAttributes("name").matching(NAME).onElements("a")
146           .allowAttributes(
147               "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
148               .matching(HISTORY_BACK).onElements("a")
149           .requireRelNofollowOnLinks()
150           .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
151               .onElements("img")
152           .allowAttributes("name").matching(NAME)
153               .onElements("img")
154           .allowAttributes("alt").matching(PARAGRAPH)
155               .onElements("img")
156           .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
157               .onElements("img")
158           .allowAttributes("border", "cellpadding", "cellspacing")
159               .matching(NUMBER).onElements("table")
160           .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
161               .onElements("table")
162           .allowAttributes("background").matching(ONSITE_URL)
163               .onElements("table")
164           .allowAttributes("align").matching(ALIGN)
165               .onElements("table")
166           .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
167               .onElements("table")
168           .allowAttributes("background").matching(ONSITE_URL)
169               .onElements("td", "th", "tr")
170           .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
171               .onElements("td", "th")
172           .allowAttributes("abbr").matching(PARAGRAPH)
173               .onElements("td", "th")
174           .allowAttributes("axis", "headers").matching(NAME)
175               .onElements("td", "th")
176           .allowAttributes("scope")
177               .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
178               .onElements("td", "th")
179           .allowAttributes("nowrap")
180               .onElements("td", "th")
181           .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
182               .onElements("table", "td", "th", "tr", "img")
183           .allowAttributes("align").matching(ALIGN)
184               .onElements("thead", "tbody", "tfoot", "img",
185                                "td", "th", "tr", "colgroup", "col")
186           .allowAttributes("valign").matching(VALIGN)
187               .onElements("thead", "tbody", "tfoot",
188                               "td", "th", "tr", "colgroup", "col")
189           .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
190               .onElements("td", "th", "tr", "colgroup", "col",
191                               "thead", "tbody", "tfoot")
192           .allowAttributes("char").matching(ONE_CHAR)
193               .onElements("td", "th", "tr", "colgroup", "col",
194                                "thead", "tbody", "tfoot")
195           .allowAttributes("colspan", "rowspan").matching(NUMBER)
196               .onElements("td", "th")
197           .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
198               .onElements("colgroup", "col")
199           .allowElements(
200               "a", "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
201               "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
202               "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
203               "hr", "br", "col", "font", "map", "span", "div", "img",
204               "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
205               "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
206           .toFactory();
207 
main(String[] args)208   public static void main(String[] args) throws IOException {
209     if (args.length != 0) {
210       System.err.println("Reads from STDIN and writes to STDOUT");
211       System.exit(-1);
212     }
213     System.err.println("[Reading from STDIN]");
214     // Fetch the HTML to sanitize.
215     String html = CharStreams.toString(
216         new InputStreamReader(System.in, Charsets.UTF_8));
217     // Set up an output channel to receive the sanitized HTML.
218     HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
219         System.out,
220         // Receives notifications on a failure to write to the output.
221         new Handler<IOException>() {
222           public void handle(IOException ex) {
223             Throwables.propagate(ex);  // System.out suppresses IOExceptions
224           }
225         },
226         // Our HTML parser is very lenient, but this receives notifications on
227         // truly bizarre inputs.
228         new Handler<String>() {
229           public void handle(String x) {
230             throw new AssertionError(x);
231           }
232         });
233     // Use the policy defined above to sanitize the HTML.
234     HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
235   }
236 }
237