1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "descriptors_names.h"
18
19 #include <algorithm>
20
21 #include "android-base/stringprintf.h"
22 #include "android-base/strings.h"
23
24 #include "base/macros.h"
25 #include "dex/utf-inl.h"
26
27 namespace art {
28
29 using android::base::StringAppendF;
30
// Appends a human-readable rendering of the type descriptor `descriptor` to `*result`.
//
// Every leading '[' counts as one array dimension and is rendered as a trailing "[]".
// After the brackets:
//   - 'L' introduces a reference type: the rest of the string is copied with '/' replaced
//     by '.', and a final ';' (if present) is dropped.     "[[La/b/C;" -> "a.b.C[][]"
//   - a primitive type character is replaced by its keyword; only that one character is
//     examined.                                            "[[B"       -> "byte[][]"
//   - anything else is not understood, and the whole descriptor (brackets included) is
//     appended verbatim.
void AppendPrettyDescriptor(const char* descriptor, std::string* result) {
  // Count the number of '['s to get the dimensionality.
  const char* c = descriptor;
  size_t dim = 0;
  while (*c == '[') {
    dim++;
    c++;
  }

  // Reference or primitive?
  bool primitive = false;
  if (*c == 'L') {
    // "[[La/b/C;" -> "a.b.C[][]".
    c++;  // Skip the 'L'.
  } else {
    primitive = true;
    // "[[B" -> "byte[][]".
    switch (*c) {
      case 'B':
        c = "byte";
        break;
      case 'C':
        c = "char";
        break;
      case 'D':
        c = "double";
        break;
      case 'F':
        c = "float";
        break;
      case 'I':
        c = "int";
        break;
      case 'J':
        c = "long";
        break;
      case 'S':
        c = "short";
        break;
      case 'Z':
        c = "boolean";
        break;
      case 'V':
        c = "void";
        break;  // Used when decoding return types.
      default:
        // Unknown type character (or empty string): pass the input through untouched.
        result->append(descriptor);
        return;
    }
  }

  // At this point, 'c' is a string of the form "fully/qualified/Type;" or
  // "primitive". In the former case, rewrite the type with '.' instead of '/':
  std::string temp(c);
  if (!primitive) {
    std::replace(temp.begin(), temp.end(), '/', '.');
    // ...and remove the semicolon. The name may be empty (descriptor ended right after
    // the 'L'), and back() on an empty string is undefined, so check emptiness first.
    if (!temp.empty() && temp.back() == ';') {
      temp.pop_back();
    }
  }
  result->append(temp);

  // Finally, add 'dim' "[]" pairs:
  for (size_t i = 0; i < dim; ++i) {
    result->append("[]");
  }
}
97
PrettyDescriptor(const char * descriptor)98 std::string PrettyDescriptor(const char* descriptor) {
99 std::string result;
100 AppendPrettyDescriptor(descriptor, &result);
101 return result;
102 }
103
// Converts a pretty type name such as "int[][]" or "a.b.C[]" back into descriptor form
// ("[[I", "[La/b/C;").
//
// Each occurrence of "[]" contributes one leading '['. The text before the first "[]"
// (or the whole input when there is none) is the element type: a primitive keyword maps
// to its one-letter code, anything else is wrapped as "L...;" with '.' replaced by '/'.
std::string InversePrettyDescriptor(const std::string& pretty_descriptor) {
  static constexpr char kArraySuffix[] = "[]";
  static constexpr size_t kArraySuffixLength = sizeof(kArraySuffix) - 1;

  // Walk over every "[]" to learn the dimensionality; remember where the first one
  // starts, since that is where the element type name ends.
  size_t element_length = pretty_descriptor.length();
  size_t dimensions = 0;
  for (size_t at = pretty_descriptor.find(kArraySuffix);
       at != std::string::npos;
       at = pretty_descriptor.find(kArraySuffix, at + kArraySuffixLength)) {
    if (dimensions == 0) {
      element_length = at;
    }
    ++dimensions;
  }

  std::string descriptor(dimensions, '[');

  // element is now in the form of "some.pretty.Type" or "primitive".
  const std::string element = pretty_descriptor.substr(0, element_length);

  struct PrimitiveName {
    const char* keyword;
    char code;
  };
  static constexpr PrimitiveName kPrimitives[] = {
      {"byte", 'B'},
      {"char", 'C'},
      {"double", 'D'},
      {"float", 'F'},
      {"int", 'I'},
      {"long", 'J'},
      {"short", 'S'},
      {"boolean", 'Z'},
      {"void", 'V'},
  };
  for (const PrimitiveName& primitive : kPrimitives) {
    if (element == primitive.keyword) {
      descriptor += primitive.code;
      return descriptor;
    }
  }

  // Not a primitive keyword: emit a reference descriptor.
  descriptor += 'L';
  for (const char ch : element) {
    descriptor += (ch == '.') ? '/' : ch;
  }
  descriptor += ';';
  return descriptor;
}
153
GetJniShortName(const std::string & class_descriptor,const std::string & method)154 std::string GetJniShortName(const std::string& class_descriptor, const std::string& method) {
155 // Remove the leading 'L' and trailing ';'...
156 std::string class_name(class_descriptor);
157 CHECK_EQ(class_name[0], 'L') << class_name;
158 CHECK_EQ(class_name[class_name.size() - 1], ';') << class_name;
159 class_name.erase(0, 1);
160 class_name.erase(class_name.size() - 1, 1);
161
162 std::string short_name;
163 short_name += "Java_";
164 short_name += MangleForJni(class_name);
165 short_name += "_";
166 short_name += MangleForJni(method);
167 return short_name;
168 }
169
170 // See http://java.sun.com/j2se/1.5.0/docs/guide/jni/spec/design.html#wp615 for the full rules.
MangleForJni(const std::string & s)171 std::string MangleForJni(const std::string& s) {
172 std::string result;
173 size_t char_count = CountModifiedUtf8Chars(s.c_str());
174 const char* cp = &s[0];
175 for (size_t i = 0; i < char_count; ++i) {
176 uint32_t ch = GetUtf16FromUtf8(&cp);
177 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) {
178 result.push_back(ch);
179 } else if (ch == '.' || ch == '/') {
180 result += "_";
181 } else if (ch == '_') {
182 result += "_1";
183 } else if (ch == ';') {
184 result += "_2";
185 } else if (ch == '[') {
186 result += "_3";
187 } else {
188 const uint16_t leading = GetLeadingUtf16Char(ch);
189 const uint32_t trailing = GetTrailingUtf16Char(ch);
190
191 StringAppendF(&result, "_0%04x", leading);
192 if (trailing != 0) {
193 StringAppendF(&result, "_0%04x", trailing);
194 }
195 }
196 }
197 return result;
198 }
199
// Converts a dotted class name into descriptor form: every '.' becomes '/', and the
// result is wrapped in 'L' ... ';' unless it is empty or starts with '[' (array names
// are already in descriptor form).
std::string DotToDescriptor(const char* class_name) {
  std::string slashed(class_name);
  for (char& ch : slashed) {
    if (ch == '.') {
      ch = '/';
    }
  }

  if (slashed.empty() || slashed[0] == '[') {
    return slashed;
  }

  std::string descriptor;
  descriptor.reserve(slashed.size() + 2);
  descriptor += 'L';
  descriptor += slashed;
  descriptor += ';';
  return descriptor;
}
208
// Converts a descriptor into dotted form.
//
// Strings of length 0 or 1 are returned unchanged. Longer strings have every '/' replaced
// by '.'; in addition, a string bracketed by 'L' and ';' has those two characters
// stripped, while anything else (e.g. an array descriptor) keeps them.
std::string DescriptorToDot(const char* descriptor) {
  const size_t length = strlen(descriptor);
  if (length <= 1) {
    // Do nothing for non-class/array descriptors.
    return descriptor;
  }

  const bool is_class = (descriptor[0] == 'L') && (descriptor[length - 1] == ';');
  std::string dotted = is_class ? std::string(descriptor + 1, length - 2)
                                : std::string(descriptor);
  std::replace(dotted.begin(), dotted.end(), '/', '.');
  return dotted;
}
227
// Strips the surrounding 'L' and ';' from a class descriptor, leaving the '/' separators
// as they are. Any string not bracketed that way is returned unchanged.
std::string DescriptorToName(const char* descriptor) {
  const size_t length = strlen(descriptor);
  // The first test fails on an empty string, so descriptor[length - 1] is never
  // evaluated with length == 0.
  const bool is_class = (descriptor[0] == 'L') && (descriptor[length - 1] == ';');
  return is_class ? std::string(descriptor + 1, length - 2) : std::string(descriptor);
}
236
// Helper for IsValidPartOfMemberNameUtf8(): a 128-bit set, stored as four 32-bit words,
// marking which low-ASCII characters are valid. Character c is looked up as bit (c & 0x1f)
// of word (c >> 5).
static constexpr uint32_t DEX_MEMBER_VALID_LOW_ASCII[4] = {
    // 00..1f: low control characters; nothing valid.
    0x00000000u,
    // 20..3f: space, digits and symbols; valid: ' ', '0'..'9', '$', '-'.
    0x03ff2011u,
    // 40..5f: uppercase etc.; valid: 'A'..'Z', '_'.
    0x87fffffeu,
    // 60..7f: lowercase etc.; valid: 'a'..'z'.
    0x07fffffeu,
};
244
245 // Helper for IsValidPartOfMemberNameUtf8(); do not call directly.
246 COLD_ATTR
IsValidPartOfMemberNameUtf8Slow(const char ** pUtf8Ptr)247 static bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) {
248 /*
249 * It's a multibyte encoded character. Decode it and analyze. We
250 * accept anything that isn't:
251 * - an improperly encoded low value
252 * - an improper surrogate pair
253 * - an encoded '\0'
254 * - a C1 control character U+0080..U+009f
255 * - a format character U+200b..U+200f, U+2028..U+202e
256 * - a special character U+fff0..U+ffff
257 * Prior to DEX format version 040, we also excluded some of the Unicode
258 * space characters:
259 * - U+00a0, U+2000..U+200a, U+202f
260 * This is all specified in the dex format document.
261 */
262
263 const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr);
264 const uint16_t leading = GetLeadingUtf16Char(pair);
265
266 // We have a surrogate pair resulting from a valid 4 byte UTF sequence.
267 // No further checks are necessary because 4 byte sequences span code
268 // points [U+10000, U+1FFFFF], which are valid codepoints in a dex
269 // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of
270 // the surrogate halves are valid and well formed in this instance.
271 if (GetTrailingUtf16Char(pair) != 0) {
272 return true;
273 }
274
275
276 // We've encountered a one, two or three byte UTF-8 sequence. The
277 // three byte UTF-8 sequence could be one half of a surrogate pair.
278 switch (leading >> 8) {
279 case 0x00:
280 // It's in the range that has C1 control characters.
281 return (leading >= 0x00a0);
282 case 0xd8:
283 case 0xd9:
284 case 0xda:
285 case 0xdb:
286 {
287 // We found a three byte sequence encoding one half of a surrogate.
288 // Look for the other half.
289 const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr);
290 const uint16_t trailing = GetLeadingUtf16Char(pair2);
291
292 return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff);
293 }
294 case 0xdc:
295 case 0xdd:
296 case 0xde:
297 case 0xdf:
298 // It's a trailing surrogate, which is not valid at this point.
299 return false;
300 case 0x20:
301 case 0xff:
302 // It's in the range that has format characters and specials.
303 switch (leading & 0xfff8) {
304 case 0x2008:
305 return (leading <= 0x200a);
306 case 0x2028:
307 return (leading == 0x202f);
308 case 0xfff0:
309 case 0xfff8:
310 return false;
311 }
312 return true;
313 default:
314 return true;
315 }
316 }
317
318 /* Return whether the pointed-at modified-UTF-8 encoded character is
319 * valid as part of a member name, updating the pointer to point past
320 * the consumed character. This will consume two encoded UTF-16 code
321 * points if the character is encoded as a surrogate pair. Also, if
322 * this function returns false, then the given pointer may only have
323 * been partially advanced.
324 */
325 ALWAYS_INLINE
IsValidPartOfMemberNameUtf8(const char ** pUtf8Ptr)326 static bool IsValidPartOfMemberNameUtf8(const char** pUtf8Ptr) {
327 uint8_t c = (uint8_t) **pUtf8Ptr;
328 if (LIKELY(c <= 0x7f)) {
329 // It's low-ascii, so check the table.
330 uint32_t wordIdx = c >> 5;
331 uint32_t bitIdx = c & 0x1f;
332 (*pUtf8Ptr)++;
333 return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
334 }
335
336 // It's a multibyte encoded character. Call a non-inline function
337 // for the heavy lifting.
338 return IsValidPartOfMemberNameUtf8Slow(pUtf8Ptr);
339 }
340
IsValidMemberName(const char * s)341 bool IsValidMemberName(const char* s) {
342 bool angle_name = false;
343
344 switch (*s) {
345 case '\0':
346 // The empty string is not a valid name.
347 return false;
348 case '<':
349 angle_name = true;
350 s++;
351 break;
352 }
353
354 while (true) {
355 switch (*s) {
356 case '\0':
357 return !angle_name;
358 case '>':
359 return angle_name && s[1] == '\0';
360 }
361
362 if (!IsValidPartOfMemberNameUtf8(&s)) {
363 return false;
364 }
365 }
366 }
367
368 enum ClassNameType { kName, kDescriptor };
369 template<ClassNameType kType, char kSeparator>
IsValidClassName(const char * s)370 static bool IsValidClassName(const char* s) {
371 int arrayCount = 0;
372 while (*s == '[') {
373 arrayCount++;
374 s++;
375 }
376
377 if (arrayCount > 255) {
378 // Arrays may have no more than 255 dimensions.
379 return false;
380 }
381
382 ClassNameType type = kType;
383 if (type != kDescriptor && arrayCount != 0) {
384 /*
385 * If we're looking at an array of some sort, then it doesn't
386 * matter if what is being asked for is a class name; the
387 * format looks the same as a type descriptor in that case, so
388 * treat it as such.
389 */
390 type = kDescriptor;
391 }
392
393 if (type == kDescriptor) {
394 /*
395 * We are looking for a descriptor. Either validate it as a
396 * single-character primitive type, or continue on to check the
397 * embedded class name (bracketed by "L" and ";").
398 */
399 switch (*(s++)) {
400 case 'B':
401 case 'C':
402 case 'D':
403 case 'F':
404 case 'I':
405 case 'J':
406 case 'S':
407 case 'Z':
408 // These are all single-character descriptors for primitive types.
409 return (*s == '\0');
410 case 'V':
411 // Non-array void is valid, but you can't have an array of void.
412 return (arrayCount == 0) && (*s == '\0');
413 case 'L':
414 // Class name: Break out and continue below.
415 break;
416 default:
417 // Oddball descriptor character.
418 return false;
419 }
420 }
421
422 /*
423 * We just consumed the 'L' that introduces a class name as part
424 * of a type descriptor, or we are looking for an unadorned class
425 * name.
426 */
427
428 bool sepOrFirst = true; // first character or just encountered a separator.
429 for (;;) {
430 uint8_t c = (uint8_t) *s;
431 switch (c) {
432 case '\0':
433 /*
434 * Premature end for a type descriptor, but valid for
435 * a class name as long as we haven't encountered an
436 * empty component (including the degenerate case of
437 * the empty string "").
438 */
439 return (type == kName) && !sepOrFirst;
440 case ';':
441 /*
442 * Invalid character for a class name, but the
443 * legitimate end of a type descriptor. In the latter
444 * case, make sure that this is the end of the string
445 * and that it doesn't end with an empty component
446 * (including the degenerate case of "L;").
447 */
448 return (type == kDescriptor) && !sepOrFirst && (s[1] == '\0');
449 case '/':
450 case '.':
451 if (c != kSeparator) {
452 // The wrong separator character.
453 return false;
454 }
455 if (sepOrFirst) {
456 // Separator at start or two separators in a row.
457 return false;
458 }
459 sepOrFirst = true;
460 s++;
461 break;
462 default:
463 if (!IsValidPartOfMemberNameUtf8(&s)) {
464 return false;
465 }
466 sepOrFirst = false;
467 break;
468 }
469 }
470 }
471
IsValidBinaryClassName(const char * s)472 bool IsValidBinaryClassName(const char* s) {
473 return IsValidClassName<kName, '.'>(s);
474 }
475
IsValidJniClassName(const char * s)476 bool IsValidJniClassName(const char* s) {
477 return IsValidClassName<kName, '/'>(s);
478 }
479
IsValidDescriptor(const char * s)480 bool IsValidDescriptor(const char* s) {
481 return IsValidClassName<kDescriptor, '/'>(s);
482 }
483
PrettyDescriptor(Primitive::Type type)484 std::string PrettyDescriptor(Primitive::Type type) {
485 return PrettyDescriptor(Primitive::Descriptor(type));
486 }
487
488 } // namespace art
489