1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License
15  */
16 
17 package java.nio.charset;
18 
19 import java.io.UTFDataFormatException;
20 
/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
 * 0xc0 0x80. This avoids the presence of 0 bytes in the output. Every {@code char} of the
 * input is encoded on its own, using one, two or three bytes.
 *
 * @hide
 */
public class ModifiedUtf8 {

    /**
     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * @param s the string to measure
     * @param shortLength if true, the encoded size is required to fit in an unsigned
     *        16-bit value
     * @return the number of bytes needed to encode {@code s}
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded size is
     *         larger than 0xffff
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                if (c == '\u0000') {
                    // U+0000 takes the two-byte form 0xc0 0x80.
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The caller is responsible for providing a buffer with enough space after
     * {@code offset}; {@link #countBytes} gives the number of bytes that will be written.
     *
     * @param dst the destination buffer
     * @param offset the index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // U+0000 is written as 0xc0 0x80 so that the output has no 0 byte.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                // Two bytes: 110xxxxx 10xxxxxx.
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a newly allocated buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified UTF-8 output,
     * as a big endian unsigned short.
     *
     * <p>- the remainder of the buffer contains the modified UTF-8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return the length-prefixed encoding of {@code s}
     * @throws UTFDataFormatException if the encoded size cannot be represented in two bytes
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes(s, true) guarantees size <= 0xffff, so the int cast cannot truncate.
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} modified UTF-8 bytes from {@code in}, starting at offset
     * {@code offset}, into {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
     *
     * <p>If a 0 byte is encountered, it is converted to U+0000.
     *
     * @param in the buffer holding the encoded bytes
     * @param out the buffer receiving the decoded chars
     * @param offset the index in {@code in} of the first byte to decode
     * @param length the number of bytes to decode
     * @return a string made of the chars written to {@code out}
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative, or
     *         if {@code offset + length} does not fit in an {@code int}
     * @throws UTFDataFormatException if the input ends in the middle of a sequence, if a
     *         continuation byte is malformed, or if a byte that cannot start a one-, two-
     *         or three-byte sequence is found where a sequence should start
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        // The last clause rejects ranges whose end would overflow an int: the wrapped
        // (negative) limit would otherwise make the loop below silently decode nothing.
        if (offset < 0 || length < 0 || offset > Integer.MAX_VALUE - length) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                // Single byte: 0xxxxxxx.
                out[outputIndex] = (char) i;
                outputIndex++;
                continue;
            }
            if (0xc0 <= i && i < 0xe0) {
                // Two bytes: 110xxxxx 10xxxxxx. This branch covers the case 0 = 0xc080.

                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                i = (i & 0x1f) << 6;
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if (0xe0 <= i && i < 0xf0) {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.

                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                // + 6 l-s bits of next to next input byte.
                i = (i & 0x0f) << 12;
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte, with 6 bits of room
                // for the next byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                i = i | (in[offset] & 0x3f) << 6;
                offset++;
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // Either a continuation byte (0x80-0xbf) where a sequence should start, or
                // a lead byte >= 0xf0, which modified UTF-8 never produces.
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}
179