1 /*
2 * Copyright (C) 2016 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "androidfw/Util.h"
18
19 #include <algorithm>
20 #include <string>
21
22 #include "utils/ByteOrder.h"
23 #include "utils/Unicode.h"
24
25 #ifdef _WIN32
26 #ifdef ERROR
27 #undef ERROR
28 #endif
29 #endif
30
31 namespace android {
32 namespace util {
33
ReadUtf16StringFromDevice(const uint16_t * src,size_t len,std::string * out)34 void ReadUtf16StringFromDevice(const uint16_t* src, size_t len, std::string* out) {
35 char buf[5];
36 while (*src && len != 0) {
37 char16_t c = static_cast<char16_t>(dtohs(*src));
38 utf16_to_utf8(&c, 1, buf, sizeof(buf));
39 out->append(buf, strlen(buf));
40 ++src;
41 --len;
42 }
43 }
44
Utf8ToUtf16(StringPiece utf8)45 std::u16string Utf8ToUtf16(StringPiece utf8) {
46 ssize_t utf16_length =
47 utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length());
48 if (utf16_length <= 0) {
49 return {};
50 }
51
52 std::u16string utf16;
53 utf16.resize(utf16_length);
54 utf8_to_utf16(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length(), &*utf16.begin(),
55 utf16_length + 1);
56 return utf16;
57 }
58
Utf16ToUtf8(StringPiece16 utf16)59 std::string Utf16ToUtf8(StringPiece16 utf16) {
60 ssize_t utf8_length = utf16_to_utf8_length(utf16.data(), utf16.length());
61 if (utf8_length <= 0) {
62 return {};
63 }
64
65 std::string utf8;
66 utf8.resize(utf8_length);
67 utf16_to_utf8(utf16.data(), utf16.length(), &*utf8.begin(), utf8_length + 1);
68 return utf8;
69 }
70
Utf8ToModifiedUtf8(std::string_view utf8)71 std::string Utf8ToModifiedUtf8(std::string_view utf8) {
72 // Java uses Modified UTF-8 which only supports the 1, 2, and 3 byte formats of UTF-8. To encode
73 // 4 byte UTF-8 codepoints, Modified UTF-8 allows the use of surrogate pairs in the same format
74 // of CESU-8 surrogate pairs. Calculate the size of the utf8 string with all 4 byte UTF-8
75 // codepoints replaced with 2 3 byte surrogate pairs
76 size_t modified_size = 0;
77 const size_t size = utf8.size();
78 for (size_t i = 0; i < size; i++) {
79 if (((uint8_t)utf8[i] >> 4) == 0xF) {
80 modified_size += 6;
81 i += 3;
82 } else {
83 modified_size++;
84 }
85 }
86
87 // Early out if no 4 byte codepoints are found
88 if (size == modified_size) {
89 return std::string(utf8);
90 }
91
92 std::string output;
93 output.reserve(modified_size);
94 for (size_t i = 0; i < size; i++) {
95 if (((uint8_t)utf8[i] >> 4) == 0xF) {
96 int32_t codepoint = utf32_from_utf8_at(utf8.data(), size, i, nullptr);
97
98 // Calculate the high and low surrogates as UTF-16 would
99 int32_t high = ((codepoint - 0x10000) / 0x400) + 0xD800;
100 int32_t low = ((codepoint - 0x10000) % 0x400) + 0xDC00;
101
102 // Encode each surrogate in UTF-8
103 output.push_back((char)(0xE4 | ((high >> 12) & 0xF)));
104 output.push_back((char)(0x80 | ((high >> 6) & 0x3F)));
105 output.push_back((char)(0x80 | (high & 0x3F)));
106 output.push_back((char)(0xE4 | ((low >> 12) & 0xF)));
107 output.push_back((char)(0x80 | ((low >> 6) & 0x3F)));
108 output.push_back((char)(0x80 | (low & 0x3F)));
109 i += 3;
110 } else {
111 output.push_back(utf8[i]);
112 }
113 }
114
115 return output;
116 }
117
ModifiedUtf8ToUtf8(std::string_view modified_utf8)118 std::string ModifiedUtf8ToUtf8(std::string_view modified_utf8) {
119 // The UTF-8 representation will have a byte length less than or equal to the Modified UTF-8
120 // representation.
121 std::string output;
122 output.reserve(modified_utf8.size());
123
124 size_t index = 0;
125 const size_t modified_size = modified_utf8.size();
126 while (index < modified_size) {
127 size_t next_index;
128 int32_t high_surrogate =
129 utf32_from_utf8_at(modified_utf8.data(), modified_size, index, &next_index);
130 if (high_surrogate < 0) {
131 return {};
132 }
133
134 // Check that the first codepoint is within the high surrogate range
135 if (high_surrogate >= 0xD800 && high_surrogate <= 0xDB7F) {
136 int32_t low_surrogate =
137 utf32_from_utf8_at(modified_utf8.data(), modified_size, next_index, &next_index);
138 if (low_surrogate < 0) {
139 return {};
140 }
141
142 // Check that the second codepoint is within the low surrogate range
143 if (low_surrogate >= 0xDC00 && low_surrogate <= 0xDFFF) {
144 const char32_t codepoint =
145 (char32_t)(((high_surrogate - 0xD800) * 0x400) + (low_surrogate - 0xDC00) + 0x10000);
146
147 // The decoded codepoint should represent a 4 byte, UTF-8 character
148 const size_t utf8_length = (size_t)utf32_to_utf8_length(&codepoint, 1);
149 if (utf8_length != 4) {
150 return {};
151 }
152
153 // Encode the UTF-8 representation of the codepoint into the string
154 const size_t start_index = output.size();
155 output.resize(start_index + utf8_length);
156 char* start = &output[start_index];
157 utf32_to_utf8((char32_t*)&codepoint, 1, start, utf8_length + 1);
158
159 index = next_index;
160 continue;
161 }
162 }
163
164 // Append non-surrogate pairs to the output string
165 for (size_t i = index; i < next_index; i++) {
166 output.push_back(modified_utf8[i]);
167 }
168 index = next_index;
169 }
170 return output;
171 }
172
173 template <class Func>
SplitAndTransform(StringPiece str,char sep,Func && f)174 static std::vector<std::string> SplitAndTransform(StringPiece str, char sep, Func&& f) {
175 std::vector<std::string> parts;
176 const StringPiece::const_iterator end = std::end(str);
177 StringPiece::const_iterator start = std::begin(str);
178 StringPiece::const_iterator current;
179 do {
180 current = std::find(start, end, sep);
181 parts.emplace_back(StringPiece(start, current - start));
182 std::string& part = parts.back();
183 std::transform(part.begin(), part.end(), part.begin(), f);
184 start = current + 1;
185 } while (current != end);
186 return parts;
187 }
188
SplitAndLowercase(StringPiece str,char sep)189 std::vector<std::string> SplitAndLowercase(StringPiece str, char sep) {
190 return SplitAndTransform(str, sep, [](char c) { return ::tolower(c); });
191 }
192
Copy(const BigBuffer & buffer)193 std::unique_ptr<uint8_t[]> Copy(const BigBuffer& buffer) {
194 auto data = std::unique_ptr<uint8_t[]>(new uint8_t[buffer.size()]);
195 uint8_t* p = data.get();
196 for (const auto& block : buffer) {
197 memcpy(p, block.buffer.get(), block.size);
198 p += block.size;
199 }
200 return data;
201 }
202
GetString16(const android::ResStringPool & pool,size_t idx)203 StringPiece16 GetString16(const android::ResStringPool& pool, size_t idx) {
204 if (auto str = pool.stringAt(idx); str.ok()) {
205 return *str;
206 }
207 return StringPiece16();
208 }
209
GetString(const android::ResStringPool & pool,size_t idx)210 std::string GetString(const android::ResStringPool& pool, size_t idx) {
211 if (auto str = pool.string8At(idx); str.ok()) {
212 return ModifiedUtf8ToUtf8(*str);
213 }
214 return Utf16ToUtf8(GetString16(pool, idx));
215 }
216
217 } // namespace util
218 } // namespace android
219