1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 
17 package libcore.net;
18 
19 import java.io.ByteArrayOutputStream;
20 import java.net.URISyntaxException;
21 import java.nio.ByteBuffer;
22 import java.nio.CharBuffer;
23 import java.nio.charset.CharacterCodingException;
24 import java.nio.charset.Charset;
25 import java.nio.charset.CharsetDecoder;
26 import java.nio.charset.CharsetEncoder;
27 import java.nio.charset.CoderResult;
28 import java.nio.charset.CodingErrorAction;
29 import java.nio.charset.StandardCharsets;
30 
31 /**
32  * Encodes and decodes “application/x-www-form-urlencoded” content.
33  *
34  * Subclasses define “isRetained”, which decides which chars need to be escaped and which don’t.
 * Output is encoded as UTF-8 by default, i.e., each character (or surrogate pair) is converted to
 * its equivalent UTF-8 encoded byte sequence, which is then converted to its escaped form.
 * E.g., the euro sign (U+20AC) is the 3 byte sequence E2 82 AC, which is escaped as "%E2%82%AC".
38  */
public abstract class UriCodec {
    /**
     * Character to be output when there's an error decoding an input.
     */
    private static final char INVALID_INPUT_CHARACTER = '\ufffd';

    /**
     * Returns true iff. {@code c} does not need to be escaped.
     * 'a' - 'z', 'A' - 'Z' and '0' - '9' are always considered valid (i.e., they never need to be
     * escaped), regardless of what this method returns. This set is referred to as the
     * "whitelist".
     */
    protected abstract boolean isRetained(char c);

    /** Returns true iff. {@code c} is an ASCII letter or an ASCII digit. */
    private static boolean isWhitelisted(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9');
    }

    /** Returns true iff. {@code c} may appear unescaped in the output of this codec. */
    private boolean isWhitelistedOrRetained(char c) {
        return isWhitelisted(c) || isRetained(c);
    }

    /**
     * Throws {@link URISyntaxException} if any of the characters in the range [start, end) are
     * not valid according to this codec.
     *  - If a char is in the whitelist or retained, it is valid both escaped and unescaped.
     *  - All escaped octets appearing in the input must be structurally valid, i.e. a '%'
     *    followed by exactly two hex digits that lie inside [start, end).
     *
     * On success, the substring [start, end) is returned.
     * {@code name} is not used, except to generate debugging info.
     *
     * @param uri the string containing the range to validate
     * @param start index of the first char to validate (inclusive)
     * @param end index one past the last char to validate (exclusive)
     * @param name label included in the exception message; may be null
     * @return {@code uri.substring(start, end)}
     * @throws URISyntaxException on an illegal character or a truncated/invalid escape
     */
    public final String validate(String uri, int start, int end, String name)
            throws URISyntaxException {
        int i = start;
        while (i < end) {
            char c = uri.charAt(i++);
            if (isWhitelistedOrRetained(c)) {
                continue;
            }
            // c is either '%' or character not allowed in a uri.
            if (c != '%') {
                throw unexpectedCharacterException(uri, name, c, i - 1);
            }
            // Expect two characters representing a number in hex.
            for (int j = 0; j < 2; j++) {
                c = getNextCharacter(uri, i++, end, name);
                if (hexCharToValue(c) < 0) {
                    throw unexpectedCharacterException(uri, name, c, i - 1);
                }
            }
        }
        return uri.substring(start, end);
    }

    /**
     * Interprets a char as a hex digit, returning a number from -1 (invalid char) to 15 ('f').
     */
    private static int hexCharToValue(char c) {
        if ('0' <= c && c <= '9') {
            return c - '0';
        }
        if ('a' <= c && c <= 'f') {
            return 10 + c - 'a';
        }
        if ('A' <= c && c <= 'F') {
            return 10 + c - 'A';
        }
        return -1;
    }

    /**
     * Returns the " in [name]" fragment used in exception messages, or the empty string when
     * {@code name} is null.
     */
    private static String inName(String name) {
        return (name == null) ? "" : " in [" + name + "]";
    }

    /** Builds (but does not throw) the exception reporting an illegal char at {@code index}. */
    private static URISyntaxException unexpectedCharacterException(
            String uri, String name, char unexpected, int index) {
        return new URISyntaxException(
                uri, "Unexpected character" + inName(name) + ": " + unexpected, index);
    }

    /**
     * Returns {@code uri.charAt(index)}, or throws {@link URISyntaxException} if {@code index}
     * is at or beyond {@code end}.
     */
    private static char getNextCharacter(String uri, int index, int end, String name)
            throws URISyntaxException {
        if (index >= end) {
            throw new URISyntaxException(
                    uri, "Unexpected end of string" + inName(name), index);
        }
        return uri.charAt(index);
    }

    /**
     * Throws {@link URISyntaxException} if any character in {@code uri} is neither whitelisted nor
     * in {@code legal}.
     */
    public static void validateSimple(String uri, String legal) throws URISyntaxException {
        for (int i = 0; i < uri.length(); i++) {
            char c = uri.charAt(i);
            if (!isWhitelisted(c) && legal.indexOf(c) < 0) {
                throw unexpectedCharacterException(uri, null /* name */, c, i);
            }
        }
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment),
     * converting escaped characters to bytes using {@code charset}.
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars
     *         (e.g. an unpaired surrogate, or a char that {@code charset} cannot represent).
     */
    public final String encode(String s, Charset charset) {
        StringBuilder builder = new StringBuilder(s.length());
        appendEncoded(builder, s, charset, false);
        return builder.toString();
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment).
     *
     * Encoded output is appended to {@code builder}. This uses the default output encoding (UTF-8).
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final void appendEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, StandardCharsets.UTF_8, false);
    }

    /**
     * Encodes the string {@code s} as per the rules of this encoder (see class level comment).
     *
     * Encoded output is appended to {@code builder}. This uses the default output encoding (UTF-8).
     * This method produces partially encoded output. What this means is that if encoded octets
     * appear in the input string, they are passed through unmodified, instead of being double
     * escaped. Consider an encoder operating on the global whitelist dealing with a string
     * "foo%25bar". With this method, the output will be "foo%25bar", but with appendEncoded, it
     * will be double encoded into "foo%2525bar".
     *
     * Note that every '%' is passed through as is; it is not checked to be followed by two hex
     * digits.
     *
     * @throws IllegalArgumentException if the encoder is unable to encode a sequence of chars.
     */
    public final void appendPartiallyEncoded(StringBuilder builder, String s) {
        appendEncoded(builder, s, StandardCharsets.UTF_8, true);
    }

    /**
     * Shared implementation of the public encode/append methods.
     *
     * Chars that need escaping are queued in a buffer so that a run of them (in particular both
     * halves of a surrogate pair) is handed to the charset encoder together; the queue is flushed
     * as escaped octets whenever a char that is emitted verbatim is encountered.
     */
    private void appendEncoded(
            StringBuilder builder, String s, Charset charset, boolean partiallyEncoded) {
        CharsetEncoder encoder = charset.newEncoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        CharBuffer cBuffer = CharBuffer.allocate(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '%' && partiallyEncoded) {
                // In case there are characters waiting to be encoded.
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append('%');
                continue;
            }

            // A retained space is written as '+' (form encoding) rather than verbatim.
            if (c == ' ' && isRetained(' ')) {
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append('+');
                continue;
            }

            if (isWhitelistedOrRetained(c)) {
                flushEncodingCharBuffer(builder, encoder, cBuffer);
                builder.append(c);
                continue;
            }

            // Put the character in the queue for encoding.
            cBuffer.put(c);
        }
        flushEncodingCharBuffer(builder, encoder, cBuffer);
    }

    /**
     * Encodes the chars queued in {@code cBuffer}, appends each resulting byte to {@code builder}
     * as "%XX" (upper case hex) and leaves {@code cBuffer} empty and ready to be written again.
     * Does nothing if no chars are queued.
     *
     * @throws IllegalArgumentException if the chars cannot be encoded or the encoder cannot be
     *         flushed.
     */
    private static void flushEncodingCharBuffer(
            StringBuilder builder,
            CharsetEncoder encoder,
            CharBuffer cBuffer) {
        if (cBuffer.position() == 0) {
            return;
        }
        // We are reading from the buffer now.
        cBuffer.flip();
        ByteBuffer byteBuffer = ByteBuffer.allocate(
                cBuffer.remaining() * (int) Math.ceil(encoder.maxBytesPerChar()));
        CoderResult result = encoder.encode(cBuffer, byteBuffer, true /* endOfInput */);
        // According to the {@code CharsetEncoder#encode} spec, the method returns underflow
        // and leaves an empty input when all chars were processed correctly.
        if (!result.isUnderflow()) {
            throw new IllegalArgumentException(
                    "Error encoding, unexpected result ["
                            + result.toString()
                            + "] using encoder for ["
                            + encoder.charset().name()
                            + "]");
        }
        if (cBuffer.hasRemaining()) {
            throw new IllegalArgumentException(
                    "Encoder for [" + encoder.charset().name() + "] failed with underflow with "
                            + "remaining input [" + cBuffer + "]");
        }
        // Need to flush in case the encoder saves internal state. The result of the flush itself
        // must be checked: re-testing the result of encode() would never detect a failure here.
        result = encoder.flush(byteBuffer);
        if (!result.isUnderflow()) {
            throw new IllegalArgumentException(
                    "Error encoding, unexpected result ["
                            + result.toString()
                            + "] flushing encoder for ["
                            + encoder.charset().name()
                            + "]");
        }
        encoder.reset();

        byteBuffer.flip();
        // Write the encoded bytes.
        while (byteBuffer.hasRemaining()) {
            byte b = byteBuffer.get();
            builder.append('%');
            builder.append(intToHexDigit((b & 0xf0) >>> 4));
            builder.append(intToHexDigit(b & 0x0f));
        }
        // Use the character buffer to write again.
        cBuffer.clear();
    }

    /** Converts a value in the range [0, 15] to its upper case hex digit. */
    private static char intToHexDigit(int b) {
        if (b < 10) {
            return (char) ('0' + b);
        } else {
            return (char) ('A' + b - 10);
        }
    }

    /**
     * Decode a string according to the rules of this decoder.
     *
     * - if {@code convertPlus == true} all '+' chars in the input are converted to ' '
     *   (white space)
     * - if {@code throwOnFailure == true}, an {@link IllegalArgumentException} is thrown for
     *   invalid inputs (a truncated escape, a non-hex digit in an escape, or bytes that cannot be
     *   mapped to a char). Else, U+FFFD is emitted to the output in place of invalid input octets.
     *
     * Escaped octets that form a malformed byte sequence in {@code charset} are always replaced
     * with U+FFFD; they never cause an exception, even if {@code throwOnFailure == true}.
     */
    public static String decode(
            String s, boolean convertPlus, Charset charset, boolean throwOnFailure) {
        StringBuilder builder = new StringBuilder(s.length());
        appendDecoded(builder, s, convertPlus, charset, throwOnFailure);
        return builder.toString();
    }

    /**
     * Shared implementation of the decode methods; appends the decoded form of {@code s} to
     * {@code builder}.
     *
     * Consecutive escaped octets are accumulated and decoded together, so that multi-byte
     * sequences are converted into a single char (or surrogate pair).
     */
    private static void appendDecoded(
            StringBuilder builder,
            String s,
            boolean convertPlus,
            Charset charset,
            boolean throwOnFailure) {
        CharsetDecoder decoder = charset.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .replaceWith("\ufffd")
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        // Holds the bytes corresponding to the escaped chars being read (empty if the last char
        // wasn't a escaped char).
        ByteBuffer byteBuffer = ByteBuffer.allocate(s.length());
        int i = 0;
        while (i < s.length()) {
            char c = s.charAt(i);
            i++;
            switch (c) {
                case '+':
                    flushDecodingByteAccumulator(
                            builder, decoder, byteBuffer, throwOnFailure);
                    builder.append(convertPlus ? ' ' : '+');
                    break;
                case '%': {
                    // Expect two characters representing a number in hex.
                    byte hexValue = 0;
                    boolean validOctet = true;
                    for (int j = 0; j < 2; j++) {
                        try {
                            c = getNextCharacter(s, i, s.length(), null /* name */);
                        } catch (URISyntaxException e) {
                            // Unexpected end of input.
                            if (throwOnFailure) {
                                throw new IllegalArgumentException(e);
                            } else {
                                flushDecodingByteAccumulator(
                                        builder, decoder, byteBuffer, throwOnFailure);
                                builder.append(INVALID_INPUT_CHARACTER);
                                return;
                            }
                        }
                        i++;
                        int newDigit = hexCharToValue(c);
                        if (newDigit < 0) {
                            if (throwOnFailure) {
                                throw new IllegalArgumentException(
                                        unexpectedCharacterException(s, null /* name */, c, i - 1));
                            }
                            flushDecodingByteAccumulator(
                                    builder, decoder, byteBuffer, throwOnFailure);
                            builder.append(INVALID_INPUT_CHARACTER);
                            // The escape is broken: remember that, so the partially computed
                            // value is not added to the accumulator as if it were a real octet.
                            validOctet = false;
                            break;
                        }
                        hexValue = (byte) (hexValue * 0x10 + newDigit);
                    }
                    if (validOctet) {
                        byteBuffer.put(hexValue);
                    }
                    break;
                }
                default:
                    flushDecodingByteAccumulator(builder, decoder, byteBuffer, throwOnFailure);
                    builder.append(c);
            }
        }
        flushDecodingByteAccumulator(builder, decoder, byteBuffer, throwOnFailure);
    }

    /**
     * Decodes the bytes accumulated in {@code byteBuffer}, appends the resulting chars to
     * {@code builder} and leaves {@code byteBuffer} empty and ready to be written again.
     * Does nothing if no bytes are accumulated.
     *
     * If decoding fails, either an {@link IllegalArgumentException} is thrown
     * ({@code throwOnFailure == true}) or a single U+FFFD is appended instead.
     */
    private static void flushDecodingByteAccumulator(
            StringBuilder builder,
            CharsetDecoder decoder,
            ByteBuffer byteBuffer,
            boolean throwOnFailure) {
        if (byteBuffer.position() == 0) {
            return;
        }
        byteBuffer.flip();
        try {
            builder.append(decoder.decode(byteBuffer));
        } catch (CharacterCodingException e) {
            if (throwOnFailure) {
                throw new IllegalArgumentException(e);
            } else {
                builder.append(INVALID_INPUT_CHARACTER);
            }
        } finally {
            // Use the byte buffer to write again.
            byteBuffer.clear();
        }
    }

    /**
     * Equivalent to {@code decode(s, false, UTF_8, true)}
     */
    public static String decode(String s) {
        return decode(
                s, false /* convertPlus */, StandardCharsets.UTF_8, true /* throwOnFailure */);
    }
}