1 /**
2  * Copyright (c) 2008, Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.android.mail.common.base;
18 
19 import static com.google.android.mail.common.base.Preconditions.checkNotNull;
20 
21 /**
22  * A {@code UnicodeEscaper} that escapes some set of Java characters using
23  * the URI percent encoding scheme. The set of safe characters (those which
24  * remain unescaped) can be specified on construction.
25  *
26  * <p>For details on escaping URIs for use in web pages, see section 2.4 of
27  * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
28  *
29  * <p>In most cases this class should not need to be used directly. If you
30  * have no special requirements for escaping your URIs, you should use either
31  * {@link CharEscapers#uriEscaper()} or
32  * {@link CharEscapers#uriEscaper(boolean)}.
33  *
34  * <p>When encoding a String, the following rules apply:
35  * <ul>
36  * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
37  * through "9" remain the same.
38  * <li>Any additionally specified safe characters remain the same.
39  * <li>If {@code plusForSpace} was specified, the space character " " is
40  * converted into a plus sign "+".
41  * <li>All other characters are converted into one or more bytes using UTF-8
42  *     encoding and each byte is then represented by the 3-character string
43  *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
44  *     of the byte value.
45  * </ul>
46  *
47  * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
48  * "~", "*", "'", "(" and ")". It goes on to state:
49  *
50  * <p><i>Unreserved characters can be escaped without changing the semantics
51  * of the URI, but this should not be done unless the URI is being used
52  * in a context that does not allow the unescaped character to appear.</i>
53  *
54  * <p>For performance reasons the only currently supported character encoding of
55  * this class is UTF-8.
56  *
57  * <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From
58  * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
59  * <i>"URI producers and normalizers should use uppercase hexadecimal digits
60  * for all percent-encodings."</i>
61  *
62  * @author dbeaumont@google.com (David Beaumont)
63  */
64 public class PercentEscaper extends UnicodeEscaper {
65   /**
66    * A string of safe characters that mimics the behavior of
67    * {@link java.net.URLEncoder}.
68    *
69    * TODO(dbeaumont): Fix escapers to be compliant with RFC 3986
70    */
71   public static final String SAFECHARS_URLENCODER = "-_.*";
72 
73   /**
74    * A string of characters that do not need to be encoded when used in URI
75    * path segments, as specified in RFC 3986. Note that some of these
76    * characters do need to be escaped when used in other parts of the URI.
77    */
78   public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
79 
80   /**
81    * A string of characters that do not need to be encoded when used in URI
82    * query strings, as specified in RFC 3986. Note that some of these
83    * characters do need to be escaped when used in other parts of the URI.
84    */
85   public static final String SAFEQUERYSTRINGCHARS_URLENCODER
86       = "-_.!~*'()@:$,;/?:";
87 
88   // In some uri escapers spaces are escaped to '+'
89   private static final char[] URI_ESCAPED_SPACE = { '+' };
90 
91   // TODO(dbeaumont): Remove this once UriEscaper uses lower case
92   private static final char[] UPPER_HEX_DIGITS =
93       "0123456789ABCDEF".toCharArray();
94 
95   /**
96    * If true we should convert space to the {@code +} character.
97    */
98   private final boolean plusForSpace;
99 
100   /**
101    * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
102    * true then {@code c} should remain unmodified in the output. If
103    * {@code c > safeOctets.length} then it should be escaped.
104    */
105   private final boolean[] safeOctets;
106 
107   /**
108    * Constructs a URI escaper with the specified safe characters and optional
109    * handling of the space character.
110    *
111    * @param safeChars a non null string specifying additional safe characters
112    *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
113    *        should not be specified here)
114    * @param plusForSpace true if ASCII space should be escaped to {@code +}
115    *        rather than {@code %20}
116    * @throws IllegalArgumentException if any of the parameters were invalid
117    */
PercentEscaper(String safeChars, boolean plusForSpace)118   public PercentEscaper(String safeChars, boolean plusForSpace) {
119     checkNotNull(safeChars);  // eager for GWT.
120 
121     // Avoid any misunderstandings about the behavior of this escaper
122     if (safeChars.matches(".*[0-9A-Za-z].*")) {
123       throw new IllegalArgumentException(
124           "Alphanumeric characters are always 'safe' and should not be " +
125           "explicitly specified");
126     }
127     // Avoid ambiguous parameters. Safe characters are never modified so if
128     // space is a safe character then setting plusForSpace is meaningless.
129     if (plusForSpace && safeChars.contains(" ")) {
130       throw new IllegalArgumentException(
131           "plusForSpace cannot be specified when space is a 'safe' character");
132     }
133     if (safeChars.contains("%")) {
134       throw new IllegalArgumentException(
135           "The '%' character cannot be specified as 'safe'");
136     }
137     this.plusForSpace = plusForSpace;
138     this.safeOctets = createSafeOctets(safeChars);
139   }
140 
141   /**
142    * Creates a boolean[] with entries corresponding to the character values
143    * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
144    * is as small as is required to hold the given character information.
145    */
createSafeOctets(String safeChars)146   private static boolean[] createSafeOctets(String safeChars) {
147     int maxChar = 'z';
148     char[] safeCharArray = safeChars.toCharArray();
149     for (char c : safeCharArray) {
150       maxChar = Math.max(c, maxChar);
151     }
152     boolean[] octets = new boolean[maxChar + 1];
153     for (int c = '0'; c <= '9'; c++) {
154       octets[c] = true;
155     }
156     for (int c = 'A'; c <= 'Z'; c++) {
157       octets[c] = true;
158     }
159     for (int c = 'a'; c <= 'z'; c++) {
160       octets[c] = true;
161     }
162     for (char c : safeCharArray) {
163       octets[c] = true;
164     }
165     return octets;
166   }
167 
168   /*
169    * Overridden for performance. For unescaped strings this improved the
170    * performance of the uri escaper from ~760ns to ~400ns as measured by
171    * {@link CharEscapersBenchmark}.
172    */
173   @Override
nextEscapeIndex(CharSequence csq, int index, int end)174   protected int nextEscapeIndex(CharSequence csq, int index, int end) {
175     for (; index < end; index++) {
176       char c = csq.charAt(index);
177       if (c >= safeOctets.length || !safeOctets[c]) {
178         break;
179       }
180     }
181     return index;
182   }
183 
184   /*
185    * Overridden for performance. For unescaped strings this improved the
186    * performance of the uri escaper from ~400ns to ~170ns as measured by
187    * {@link CharEscapersBenchmark}.
188    */
189   @Override
escape(String s)190   public String escape(String s) {
191     checkNotNull(s);
192     int slen = s.length();
193     for (int index = 0; index < slen; index++) {
194       char c = s.charAt(index);
195       if (c >= safeOctets.length || !safeOctets[c]) {
196         return escapeSlow(s, index);
197       }
198     }
199     return s;
200   }
201 
202   /**
203    * Escapes the given Unicode code point in UTF-8.
204    */
205   @Override
escape(int cp)206   protected char[] escape(int cp) {
207     // We should never get negative values here but if we do it will throw an
208     // IndexOutOfBoundsException, so at least it will get spotted.
209     if (cp < safeOctets.length && safeOctets[cp]) {
210       return null;
211     } else if (cp == ' ' && plusForSpace) {
212       return URI_ESCAPED_SPACE;
213     } else if (cp <= 0x7F) {
214       // Single byte UTF-8 characters
215       // Start with "%--" and fill in the blanks
216       char[] dest = new char[3];
217       dest[0] = '%';
218       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
219       dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
220       return dest;
221     } else if (cp <= 0x7ff) {
222       // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
223       // Start with "%--%--" and fill in the blanks
224       char[] dest = new char[6];
225       dest[0] = '%';
226       dest[3] = '%';
227       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
228       cp >>>= 4;
229       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
230       cp >>>= 2;
231       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
232       cp >>>= 4;
233       dest[1] = UPPER_HEX_DIGITS[0xC | cp];
234       return dest;
235     } else if (cp <= 0xffff) {
236       // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
237       // Start with "%E-%--%--" and fill in the blanks
238       char[] dest = new char[9];
239       dest[0] = '%';
240       dest[1] = 'E';
241       dest[3] = '%';
242       dest[6] = '%';
243       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
244       cp >>>= 4;
245       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
246       cp >>>= 2;
247       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
248       cp >>>= 4;
249       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
250       cp >>>= 2;
251       dest[2] = UPPER_HEX_DIGITS[cp];
252       return dest;
253     } else if (cp <= 0x10ffff) {
254       char[] dest = new char[12];
255       // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
256       // Start with "%F-%--%--%--" and fill in the blanks
257       dest[0] = '%';
258       dest[1] = 'F';
259       dest[3] = '%';
260       dest[6] = '%';
261       dest[9] = '%';
262       dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
263       cp >>>= 4;
264       dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
265       cp >>>= 2;
266       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
267       cp >>>= 4;
268       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
269       cp >>>= 2;
270       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
271       cp >>>= 4;
272       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
273       cp >>>= 2;
274       dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
275       return dest;
276     } else {
277       // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
278       throw new IllegalArgumentException(
279           "Invalid unicode character value " + cp);
280     }
281   }
282 }