1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.modules.utils;
18 
19 import java.io.UTFDataFormatException;
20 
public class ModifiedUtf8 {
    /**
     * Turns a run of <i>modified UTF-8</i> bytes into a {@link String}.
     *
     * <p>Exactly {@code utfSize} bytes are consumed from {@code in}, beginning at index
     * {@code offset}. Decoded UTF-16 code units are accumulated in the caller-supplied
     * {@code out} buffer, starting at index 0, and the result is a new string built from the
     * filled prefix of that buffer.
     *
     * <p>Only one-, two- and three-byte sequences are accepted. A raw zero byte is decoded as
     * U+0000 rather than rejected, and overlong encodings are not diagnosed.
     *
     * @param in source bytes
     * @param out scratch buffer receiving the decoded chars; its previous contents are
     *     overwritten, and on failure the slot being decoded may hold a raw byte value
     * @param offset index in {@code in} of the first byte to decode
     * @param utfSize number of bytes to decode
     * @return the decoded string
     * @throws UTFDataFormatException if a lead byte is not a valid one-, two- or three-byte
     *     lead, if a continuation byte is malformed, or if a sequence is cut off by
     *     {@code utfSize}
     */
    public static String decode(byte[] in, char[] out, int offset, int utfSize)
            throws UTFDataFormatException {
        int consumed = 0; // bytes read so far, relative to offset
        int written = 0;  // chars produced so far
        while (consumed < utfSize) {
            // The raw byte is parked in the output slot before it is classified. The
            // byte-to-char cast sign-extends, so any byte with the high bit set shows up as
            // 0xFFxx here; the masks below only look at the low eight bits.
            out[written] = (char) in[offset + consumed++];
            final int lead = out[written];

            // 0xxxxxxx: plain ASCII, already in place.
            if (lead < 0x80) {
                written++;
                continue;
            }

            // 110xxxxx 10xxxxxx: two-byte sequence.
            if ((lead & 0xE0) == 0xC0) {
                if (consumed >= utfSize) {
                    throw new UTFDataFormatException("bad second byte at " + consumed);
                }
                final int trail = in[offset + consumed++];
                if ((trail & 0xC0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + (consumed - 1));
                }
                out[written++] = (char) (((lead & 0x1F) << 6) | (trail & 0x3F));
                continue;
            }

            // Anything that is not a 1110xxxx lead at this point is unsupported.
            if ((lead & 0xF0) != 0xE0) {
                throw new UTFDataFormatException("bad byte at " + (consumed - 1));
            }

            // 1110xxxx 10xxxxxx 10xxxxxx: three-byte sequence.
            if (consumed + 1 >= utfSize) {
                throw new UTFDataFormatException("bad third byte at " + (consumed + 1));
            }
            final int middle = in[offset + consumed++];
            final int last = in[offset + consumed++];
            final boolean bothContinuations =
                    (middle & 0xC0) == 0x80 && (last & 0xC0) == 0x80;
            if (!bothContinuations) {
                throw new UTFDataFormatException(
                        "bad second or third byte at " + (consumed - 2));
            }
            out[written++] =
                    (char) (((lead & 0x0F) << 12) | ((middle & 0x3F) << 6) | (last & 0x3F));
        }
        return new String(out, 0, written);
    }

    /**
     * Computes how many bytes {@code s} occupies when written as modified UTF-8.
     *
     * <p>The figure covers the character payload only. It excludes any length prefix, since
     * callers store that prefix in differing widths.
     *
     * @param s the string to measure
     * @param shortLength when {@code true}, fail as soon as the running total exceeds 65535,
     *     the largest size an unsigned 16-bit length can describe
     * @return the encoded size in bytes
     * @throws UTFDataFormatException if {@code shortLength} is set and the encoded form is
     *     longer than 65535 bytes
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long total = 0;
        for (int i = 0, n = s.length(); i < n; i++) {
            final char c = s.charAt(i);
            final int width;
            if (c > 2047) {
                width = 3;
            } else if (c == 0 || c > 127) {
                width = 2; // Includes U+0000, which modified UTF-8 never writes as one byte.
            } else {
                width = 1;
            }
            total += width;
            if (shortLength && total > 65535) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return total;
    }

    /**
     * Writes the <i>modified UTF-8</i> form of {@code s} into {@code dst}, with the first byte
     * landing at index {@code offset}.
     *
     * <p>No capacity check is performed up front; bytes are stored one at a time, so an
     * undersized {@code dst} fails part-way through with the usual array index exception.
     *
     * @param dst destination buffer
     * @param offset index in {@code dst} at which to start writing
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int pos = offset;
        for (int i = 0, n = s.length(); i < n; i++) {
            final int c = s.charAt(i);
            if (c > 2047) {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
                dst[pos++] = (byte) (0xe0 | (0x0f & (c >> 12)));
                dst[pos++] = (byte) (0x80 | (0x3f & (c >> 6)));
                dst[pos++] = (byte) (0x80 | (0x3f & c));
            } else if (c == 0 || c > 127) {
                // Two bytes: 110xxxxx 10xxxxxx. U+0000 takes this path as well.
                dst[pos++] = (byte) (0xc0 | (0x1f & (c >> 6)));
                dst[pos++] = (byte) (0x80 | (0x3f & c));
            } else {
                // One byte: 0xxxxxxx.
                dst[pos++] = (byte) c;
            }
        }
    }

    /** Static utility holder; not meant to be instantiated. */
    private ModifiedUtf8() {
    }
}
110