1 /* 2 * Copyright (C) 2022 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.modules.utils; 18 19 import java.io.UTFDataFormatException; 20 21 public class ModifiedUtf8 { 22 /** 23 * Decodes a byte array containing <i>modified UTF-8</i> bytes into a string. 24 * 25 * <p>Note that although this method decodes the (supposedly impossible) zero byte to U+0000, 26 * that's what the RI does too. 27 */ decode(byte[] in, char[] out, int offset, int utfSize)28 public static String decode(byte[] in, char[] out, int offset, int utfSize) 29 throws UTFDataFormatException { 30 int count = 0, s = 0, a; 31 while (count < utfSize) { 32 if ((out[s] = (char) in[offset + count++]) < '\u0080') { 33 s++; 34 } else if (((a = out[s]) & 0xe0) == 0xc0) { 35 if (count >= utfSize) { 36 throw new UTFDataFormatException("bad second byte at " + count); 37 } 38 int b = in[offset + count++]; 39 if ((b & 0xC0) != 0x80) { 40 throw new UTFDataFormatException("bad second byte at " + (count - 1)); 41 } 42 out[s++] = (char) (((a & 0x1F) << 6) | (b & 0x3F)); 43 } else if ((a & 0xf0) == 0xe0) { 44 if (count + 1 >= utfSize) { 45 throw new UTFDataFormatException("bad third byte at " + (count + 1)); 46 } 47 int b = in[offset + count++]; 48 int c = in[offset + count++]; 49 if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) { 50 throw new UTFDataFormatException("bad second or third byte at " + (count - 2)); 51 } 52 out[s++] = (char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F)); 53 } else { 54 throw new UTFDataFormatException("bad byte at " + (count - 1)); 55 } 56 } 57 return new String(out, 0, s); 58 } 59 60 /** 61 * Returns the number of bytes the modified UTF-8 representation of 's' would take. Note 62 * that this is just the space for the bytes representing the characters, not the length 63 * which precedes those bytes, because different callers represent the length differently, 64 * as two, four, or even eight bytes. If {@code shortLength} is true, we'll throw an 65 * exception if the string is too long for its length to be represented by a short. 66 */ countBytes(String s, boolean shortLength)67 public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException { 68 long result = 0; 69 final int length = s.length(); 70 for (int i = 0; i < length; ++i) { 71 char ch = s.charAt(i); 72 if (ch != 0 && ch <= 127) { // U+0000 uses two bytes. 73 ++result; 74 } else if (ch <= 2047) { 75 result += 2; 76 } else { 77 result += 3; 78 } 79 if (shortLength && result > 65535) { 80 throw new UTFDataFormatException("String more than 65535 UTF bytes long"); 81 } 82 } 83 return result; 84 } 85 86 /** 87 * Encodes the <i>modified UTF-8</i> bytes corresponding to string {@code s} into the 88 * byte array {@code dst}, starting at the given {@code offset}. 89 */ encode(byte[] dst, int offset, String s)90 public static void encode(byte[] dst, int offset, String s) { 91 final int length = s.length(); 92 for (int i = 0; i < length; i++) { 93 char ch = s.charAt(i); 94 if (ch != 0 && ch <= 127) { // U+0000 uses two bytes. 95 dst[offset++] = (byte) ch; 96 } else if (ch <= 2047) { 97 dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6))); 98 dst[offset++] = (byte) (0x80 | (0x3f & ch)); 99 } else { 100 dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12))); 101 dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6))); 102 dst[offset++] = (byte) (0x80 | (0x3f & ch)); 103 } 104 } 105 } 106 ModifiedUtf8()107 private ModifiedUtf8() { 108 } 109 } 110