1 // Copyright (c) 2012, Mike Samuel
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 // Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // Neither the name of the OWASP nor the names of its contributors may
14 // be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 // POSSIBILITY OF SUCH DAMAGE.
28 
29 package org.owasp.html;
30 
31 import java.io.IOException;
32 
33 import com.google.common.annotations.VisibleForTesting;
34 
35 /** Encoders and decoders for HTML. */
36 final class Encoding {
37 
38   /**
39    * Decodes HTML entities to produce a string containing only valid
40    * Unicode scalar values.
41    */
42   @VisibleForTesting
decodeHtml(String s)43   static String decodeHtml(String s) {
44     int firstAmp = s.indexOf('&');
45     int safeLimit = longestPrefixOfGoodCodeunits(s);
46     if ((firstAmp & safeLimit) < 0) { return s; }
47 
48     StringBuilder sb;
49     {
50       int n = s.length();
51       sb = new StringBuilder(n);
52       int pos = 0;
53       int amp = firstAmp;
54       while (amp >= 0) {
55         long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
56         int end = (int) (endAndCodepoint >>> 32);
57         int codepoint = (int) endAndCodepoint;
58         sb.append(s, pos, amp).appendCodePoint(codepoint);
59         pos = end;
60         amp = s.indexOf('&', end);
61       }
62       sb.append(s, pos, n);
63     }
64 
65     stripBannedCodeunits(
66         sb,
67         firstAmp < 0
68           ? safeLimit : safeLimit < 0
69           ? firstAmp : Math.min(firstAmp, safeLimit));
70 
71     return sb.toString();
72   }
73 
74   /**
75    * Returns the portion of its input that consists of XML safe chars.
76    * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
77    */
78   @TCB
stripBannedCodeunits(String s)79   static String stripBannedCodeunits(String s) {
80     int safeLimit = longestPrefixOfGoodCodeunits(s);
81     if (safeLimit < 0) { return s; }
82 
83     StringBuilder sb = new StringBuilder(s);
84     stripBannedCodeunits(sb, safeLimit);
85     return sb.toString();
86   }
87 
88   /**
89    * Leaves in the input buffer only code-units that comprise XML safe chars.
90    * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
91    */
92   @TCB
stripBannedCodeunits(StringBuilder sb)93   static void stripBannedCodeunits(StringBuilder sb) {
94     stripBannedCodeunits(sb, 0);
95   }
96 
97   @TCB
stripBannedCodeunits(StringBuilder sb, int start)98   private static void stripBannedCodeunits(StringBuilder sb, int start) {
99     int k = start;
100     for (int i = start, n = sb.length(); i < n; ++i) {
101       char ch = sb.charAt(i);
102       if (ch < 0x20) {
103         if (IS_BANNED_ASCII[ch]) {
104           continue;
105         }
106       } else if (0xd800 <= ch) {
107         if (ch <= 0xdfff) {
108           if (i+1 < n) {
109             char next = sb.charAt(i+1);
110             if (Character.isSurrogatePair(ch, next)) {
111               sb.setCharAt(k++, ch);
112               sb.setCharAt(k++, next);
113               ++i;
114             }
115           }
116           continue;
117         } else if ((ch & 0xfffe) == 0xfffe) {
118           continue;
119         }
120       }
121       sb.setCharAt(k++, ch);
122     }
123     sb.setLength(k);
124   }
125 
126   /**
127    * The number of code-units at the front of s that form code-points in the
128    * XML Character production.
129    * @return -1 if all of s is in the XML Character production.
130    */
131   @TCB
longestPrefixOfGoodCodeunits(String s)132   private static int longestPrefixOfGoodCodeunits(String s) {
133     int n = s.length(), i;
134     for (i = 0; i < n; ++i) {
135       char ch = s.charAt(i);
136       if (ch < 0x20) {
137         if (IS_BANNED_ASCII[ch]) {
138           return i;
139         }
140       } else if (0xd800 <= ch) {
141         if (ch <= 0xdfff) {
142           if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
143             ++i;  // Skip over low surrogate since we know it's ok.
144           } else {
145             return i;
146           }
147         } else if ((ch & 0xfffe) == 0xfffe) {
148           return i;
149         }
150       }
151     }
152     return -1;
153   }
154 
155   /**
156    * Writes the HTML equivalent of the given plain text to output.
157    * For example, {@code escapeHtmlOnto("1 < 2", w)},
158    * is equivalent to {@code w.append("1 &lt; 2")} but possibly with fewer
159    * smaller appends.
160    * Elides code-units that are not valid XML Characters.
161    * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
162    */
163   @TCB
encodeHtmlOnto(String plainText, Appendable output)164   static void encodeHtmlOnto(String plainText, Appendable output)
165       throws IOException {
166     int n = plainText.length();
167     int pos = 0;
168     for (int i = 0; i < n; ++i) {
169       char ch = plainText.charAt(i);
170       if (ch < REPLACEMENTS.length) {
171         String repl = REPLACEMENTS[ch];
172         if (repl != null) {
173           output.append(plainText, pos, i).append(repl);
174           pos = i + 1;
175         }
176       } else if (((char) 0xd800) <= ch) {
177         if (ch <= ((char) 0xdfff)) {
178           char next;
179           if (i + 1 < n
180               && Character.isSurrogatePair(
181                   ch, next = plainText.charAt(i + 1))) {
182             // Emit supplemental codepoints as entity so that they cannot
183             // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
184             // and get involved in UTF-16/UCS-2 confusion.
185             int codepoint = Character.toCodePoint(ch, next);
186             output.append(plainText, pos, i);
187             appendNumericEntity(codepoint, output);
188             ++i;
189             pos = i + 1;
190           } else {
191             output.append(plainText, pos, i);
192             // Elide the orphaned surrogate.
193             pos = i + 1;
194           }
195         } else if (0xff00 <= ch) {
196           output.append(plainText, pos, i);
197           pos = i + 1;
198           // Is a control character or possible full-width version of a
199           // special character.
200           if ((ch & 0xfffe) == 0xfffe) {
201             // Elide since not an the XML Character.
202           } else {
203             appendNumericEntity(ch, output);
204           }
205         }
206       }
207     }
208     output.append(plainText, pos, n);
209   }
210 
211   @TCB
appendNumericEntity(int codepoint, Appendable output)212   static void appendNumericEntity(int codepoint, Appendable output)
213       throws IOException {
214     if (codepoint < 100) {
215       // TODO: is this dead code due to REPLACEMENTS above.
216       output.append("&#");
217       if (codepoint < 10) {
218         output.append((char) ('0' + codepoint));
219       } else {
220         output.append((char) ('0' + (codepoint / 10)));
221         output.append((char) ('0' + (codepoint % 10)));
222       }
223       output.append(";");
224     } else {
225       int nDigits = (codepoint < 0x1000
226                      ? codepoint < 0x100 ? 2 : 3
227                      : (codepoint < 0x10000 ? 4
228                         : codepoint < 0x100000 ? 5 : 6));
229       output.append("&#x");
230       for (int digit = nDigits; --digit >= 0;) {
231         int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
232         output.append(HEX_NUMERAL[hexDigit]);
233       }
234       output.append(";");
235     }
236   }
237 
238   private static final char[] HEX_NUMERAL = {
239    '0', '1', '2', '3', '4', '5', '6', '7',
240    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
241   };
242 
243   /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
244   static final String[] REPLACEMENTS = new String[0x61];
245   static {
246     for (int i = 0; i < ' '; ++i) {
247       // We elide control characters so that we can ensure that our output is
248       // in the intersection of valid HTML5 and XML.  According to
249       // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets
250       // Char      ::=          #x9 | #xA | #xD | [#x20-#xD7FF]
251       //             |          [#xE000-#xFFFD] | [#x10000-#x10FFFF]
252       if (i != '\t' && i != '\n' && i != '\r') {
253         REPLACEMENTS[i] = "";  // Elide
254       }
255     }
256     // "&#34;" is shorter than "&quot;"
257     REPLACEMENTS['"']  = "&#" + ((int) '"')  + ";";  // Attribute delimiter.
258     REPLACEMENTS['&']  = "&amp;";                    // HTML special.
259     // We don't use &apos; since that is not in the intersection of HTML&XML.
260     REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";  // Attribute delimiter.
261     REPLACEMENTS['+']  = "&#" + ((int) '+')  + ";";  // UTF-7 special.
262     REPLACEMENTS['<']  = "&lt;";                     // HTML special.
263     REPLACEMENTS['=']  = "&#" + ((int) '=')  + ";";  // Special in attributes.
264     REPLACEMENTS['>']  = "&gt;";                     // HTML special.
265     REPLACEMENTS['@']  = "&#" + ((int) '@')  + ";";  // Conditional compilation.
266     REPLACEMENTS['`']  = "&#" + ((int) '`')  + ";";  // Attribute delimiter.
267   }
268 
269   /**
270    * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in
271    * an HTML5 text node or properly quoted attribute value.
272    */
273   private static boolean[] IS_BANNED_ASCII = new boolean[0x20];
274   static {
275     for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
276       IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
277     }
278   }
279 
280 }
281