1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "descriptors_names.h"
18 
19 #include <algorithm>
20 
21 #include "android-base/stringprintf.h"
22 #include "android-base/strings.h"
23 
24 #include "base/macros.h"
25 #include "dex/utf-inl.h"
26 
27 namespace art {
28 
29 using android::base::StringAppendF;
30 
// Appends a human-readable rendering of the type descriptor `descriptor` to `*result`.
//
// Examples: "[[La/b/C;" -> "a.b.C[][]", "[[B" -> "byte[][]", "V" -> "void".
// If the element type is neither a class ('L...') nor one of the known primitive letters,
// the descriptor is appended verbatim. `descriptor` must be NUL-terminated and `result`
// must be non-null; existing contents of `*result` are preserved.
void AppendPrettyDescriptor(const char* descriptor, std::string* result) {
  // Count the number of '['s to get the dimensionality.
  const char* c = descriptor;
  size_t dim = 0;
  while (*c == '[') {
    ++dim;
    ++c;
  }

  // Reference or primitive?
  bool primitive = false;
  if (*c == 'L') {
    // "[[La/b/C;" -> "a.b.C[][]".
    ++c;  // Skip the 'L'.
  } else {
    primitive = true;
    // "[[B" -> "byte[][]".
    switch (*c) {
      case 'B':
        c = "byte";
        break;
      case 'C':
        c = "char";
        break;
      case 'D':
        c = "double";
        break;
      case 'F':
        c = "float";
        break;
      case 'I':
        c = "int";
        break;
      case 'J':
        c = "long";
        break;
      case 'S':
        c = "short";
        break;
      case 'Z':
        c = "boolean";
        break;
      case 'V':
        c = "void";
        break;  // Used when decoding return types.
      default:
        // Not something we know how to prettify; pass it through untouched.
        result->append(descriptor);
        return;
    }
  }

  // At this point, 'c' is a string of the form "fully/qualified/Type;" or
  // "primitive". In the former case, rewrite the type with '.' instead of '/':
  std::string temp(c);
  if (!primitive) {
    std::replace(temp.begin(), temp.end(), '/', '.');
    // ...and remove the semicolon. The name can be empty here (e.g. for the malformed
    // descriptor "L"), and calling back() on an empty string is undefined behaviour.
    if (!temp.empty() && temp.back() == ';') {
      temp.pop_back();
    }
  }
  result->append(temp);

  // Finally, add 'dim' "[]" pairs:
  for (size_t i = 0; i < dim; ++i) {
    result->append("[]");
  }
}
97 
PrettyDescriptor(const char * descriptor)98 std::string PrettyDescriptor(const char* descriptor) {
99   std::string result;
100   AppendPrettyDescriptor(descriptor, &result);
101   return result;
102 }
103 
// Converts a pretty type name back into a type descriptor, e.g. "int[][]" -> "[[I" and
// "a.b.C[]" -> "[La/b/C;". Any name that is not one of the primitive keywords (or "void")
// is treated as a class name.
std::string InversePrettyDescriptor(const std::string& pretty_descriptor) {
  constexpr char kArraySuffix[] = "[]";
  constexpr size_t kArraySuffixLength = sizeof(kArraySuffix) - 1;

  // Every "[]" occurrence adds one dimension. The element type name is whatever precedes
  // the first occurrence, or the whole input when there is none.
  size_t dimensions = 0;
  size_t element_length = pretty_descriptor.length();
  for (size_t at = pretty_descriptor.find(kArraySuffix);
       at != std::string::npos;
       at = pretty_descriptor.find(kArraySuffix, at + kArraySuffixLength)) {
    if (dimensions == 0) {
      element_length = at;
    }
    ++dimensions;
  }

  // One leading '[' per dimension.
  std::string descriptor(dimensions, '[');

  // The element name is now in the form of "some.pretty.Type" or "primitive".
  const std::string element = pretty_descriptor.substr(0, element_length);

  struct PrimitiveName {
    const char* pretty;
    char code;
  };
  static constexpr PrimitiveName kPrimitives[] = {
    { "byte", 'B' },
    { "char", 'C' },
    { "double", 'D' },
    { "float", 'F' },
    { "int", 'I' },
    { "long", 'J' },
    { "short", 'S' },
    { "boolean", 'Z' },
    { "void", 'V' },
  };
  for (const PrimitiveName& primitive : kPrimitives) {
    if (element == primitive.pretty) {
      descriptor += primitive.code;
      return descriptor;
    }
  }

  // A class: wrap in 'L' ... ';' and use '/' as the package separator.
  descriptor += 'L';
  for (const char ch : element) {
    descriptor += (ch == '.') ? '/' : ch;
  }
  descriptor += ';';
  return descriptor;
}
153 
GetJniShortName(const std::string & class_descriptor,const std::string & method)154 std::string GetJniShortName(const std::string& class_descriptor, const std::string& method) {
155   // Remove the leading 'L' and trailing ';'...
156   std::string class_name(class_descriptor);
157   CHECK_EQ(class_name[0], 'L') << class_name;
158   CHECK_EQ(class_name[class_name.size() - 1], ';') << class_name;
159   class_name.erase(0, 1);
160   class_name.erase(class_name.size() - 1, 1);
161 
162   std::string short_name;
163   short_name += "Java_";
164   short_name += MangleForJni(class_name);
165   short_name += "_";
166   short_name += MangleForJni(method);
167   return short_name;
168 }
169 
170 // See http://java.sun.com/j2se/1.5.0/docs/guide/jni/spec/design.html#wp615 for the full rules.
MangleForJni(const std::string & s)171 std::string MangleForJni(const std::string& s) {
172   std::string result;
173   size_t char_count = CountModifiedUtf8Chars(s.c_str());
174   const char* cp = &s[0];
175   for (size_t i = 0; i < char_count; ++i) {
176     uint32_t ch = GetUtf16FromUtf8(&cp);
177     if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) {
178       result.push_back(ch);
179     } else if (ch == '.' || ch == '/') {
180       result += "_";
181     } else if (ch == '_') {
182       result += "_1";
183     } else if (ch == ';') {
184       result += "_2";
185     } else if (ch == '[') {
186       result += "_3";
187     } else {
188       const uint16_t leading = GetLeadingUtf16Char(ch);
189       const uint32_t trailing = GetTrailingUtf16Char(ch);
190 
191       StringAppendF(&result, "_0%04x", leading);
192       if (trailing != 0) {
193         StringAppendF(&result, "_0%04x", trailing);
194       }
195     }
196   }
197   return result;
198 }
199 
// Converts a dotted class name into a descriptor: every '.' becomes '/', and unless the
// name is empty or already starts with '[' (an array), it is wrapped in "L" ... ";".
// E.g. "java.lang.String" -> "Ljava/lang/String;", "[Ljava.lang.String;" -> "[Ljava/lang/String;".
std::string DotToDescriptor(const char* class_name) {
  std::string slashed(class_name);
  for (char& ch : slashed) {
    if (ch == '.') {
      ch = '/';
    }
  }

  // Empty names and array names are returned without the 'L' / ';' wrapping.
  if (slashed.empty() || slashed.front() == '[') {
    return slashed;
  }

  slashed.insert(slashed.begin(), 'L');
  slashed.push_back(';');
  return slashed;
}
208 
// Converts a descriptor into a dotted name. For descriptors longer than one character,
// '/' becomes '.', and a class descriptor "L...;" additionally loses its 'L' and ';'
// (array descriptors keep theirs). Shorter inputs, such as primitive descriptors, are
// returned unchanged.
std::string DescriptorToDot(const char* descriptor) {
  std::string dotted(descriptor);

  // Do nothing for non-class/array descriptors.
  if (dotted.size() <= 1) {
    return dotted;
  }

  // Only a plain class descriptor is unwrapped; for arrays the 'L' and ';' remain intact.
  if (dotted.front() == 'L' && dotted.back() == ';') {
    dotted.pop_back();
    dotted.erase(0, 1);
  }

  std::replace(dotted.begin(), dotted.end(), '/', '.');
  return dotted;
}
227 
// Strips the leading 'L' and trailing ';' from a class descriptor, leaving the '/'
// separators in place ("Ljava/lang/String;" -> "java/lang/String"). Any input that is not
// of the form "L...;" is returned unchanged.
std::string DescriptorToName(const char* descriptor) {
  std::string name(descriptor);
  const bool is_class_descriptor =
      !name.empty() && name.front() == 'L' && name.back() == ';';
  if (is_class_descriptor) {
    name.pop_back();
    name.erase(0, 1);
  }
  return name;
}
236 
// Helper for IsValidPartOfMemberNameUtf8(): a bit vector, stored as four 32-bit words,
// indicating valid low ascii. Bit (c & 0x1f) of word (c >> 5) is set when character c is
// valid. Each word is written as shifts relative to the first character code it covers.
static constexpr uint32_t DEX_MEMBER_VALID_LOW_ASCII[4] = {
  // 00..1f low control characters; nothing valid
  0u,
  // 20..3f space, digits and symbols; valid: ' ', '$', '-', '0'..'9'
  (1u << (' ' - 0x20)) | (1u << ('$' - 0x20)) | (1u << ('-' - 0x20)) | (0x3ffu << ('0' - 0x20)),
  // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
  (0x3ffffffu << ('A' - 0x40)) | (1u << ('_' - 0x40)),
  // 60..7f lowercase etc.; valid: 'a'..'z'
  (0x3ffffffu << ('a' - 0x60))
};
244 
245 // Helper for IsValidPartOfMemberNameUtf8(); do not call directly.
246 COLD_ATTR
IsValidPartOfMemberNameUtf8Slow(const char ** pUtf8Ptr)247 static bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) {
248   /*
249    * It's a multibyte encoded character. Decode it and analyze. We
250    * accept anything that isn't:
251    *   - an improperly encoded low value
252    *   - an improper surrogate pair
253    *   - an encoded '\0'
254    *   - a C1 control character U+0080..U+009f
255    *   - a format character U+200b..U+200f, U+2028..U+202e
256    *   - a special character U+fff0..U+ffff
257    * Prior to DEX format version 040, we also excluded some of the Unicode
258    * space characters:
259    *   - U+00a0, U+2000..U+200a, U+202f
260    * This is all specified in the dex format document.
261    */
262 
263   const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr);
264   const uint16_t leading = GetLeadingUtf16Char(pair);
265 
266   // We have a surrogate pair resulting from a valid 4 byte UTF sequence.
267   // No further checks are necessary because 4 byte sequences span code
268   // points [U+10000, U+1FFFFF], which are valid codepoints in a dex
269   // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of
270   // the surrogate halves are valid and well formed in this instance.
271   if (GetTrailingUtf16Char(pair) != 0) {
272     return true;
273   }
274 
275 
276   // We've encountered a one, two or three byte UTF-8 sequence. The
277   // three byte UTF-8 sequence could be one half of a surrogate pair.
278   switch (leading >> 8) {
279     case 0x00:
280       // It's in the range that has C1 control characters.
281       return (leading >= 0x00a0);
282     case 0xd8:
283     case 0xd9:
284     case 0xda:
285     case 0xdb:
286       {
287         // We found a three byte sequence encoding one half of a surrogate.
288         // Look for the other half.
289         const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr);
290         const uint16_t trailing = GetLeadingUtf16Char(pair2);
291 
292         return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff);
293       }
294     case 0xdc:
295     case 0xdd:
296     case 0xde:
297     case 0xdf:
298       // It's a trailing surrogate, which is not valid at this point.
299       return false;
300     case 0x20:
301     case 0xff:
302       // It's in the range that has format characters and specials.
303       switch (leading & 0xfff8) {
304         case 0x2008:
305           return (leading <= 0x200a);
306         case 0x2028:
307           return (leading == 0x202f);
308         case 0xfff0:
309         case 0xfff8:
310           return false;
311       }
312       return true;
313     default:
314       return true;
315   }
316 }
317 
318 /* Return whether the pointed-at modified-UTF-8 encoded character is
319  * valid as part of a member name, updating the pointer to point past
320  * the consumed character. This will consume two encoded UTF-16 code
321  * points if the character is encoded as a surrogate pair. Also, if
322  * this function returns false, then the given pointer may only have
323  * been partially advanced.
324  */
325 ALWAYS_INLINE
IsValidPartOfMemberNameUtf8(const char ** pUtf8Ptr)326 static bool IsValidPartOfMemberNameUtf8(const char** pUtf8Ptr) {
327   uint8_t c = (uint8_t) **pUtf8Ptr;
328   if (LIKELY(c <= 0x7f)) {
329     // It's low-ascii, so check the table.
330     uint32_t wordIdx = c >> 5;
331     uint32_t bitIdx = c & 0x1f;
332     (*pUtf8Ptr)++;
333     return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
334   }
335 
336   // It's a multibyte encoded character. Call a non-inline function
337   // for the heavy lifting.
338   return IsValidPartOfMemberNameUtf8Slow(pUtf8Ptr);
339 }
340 
IsValidMemberName(const char * s)341 bool IsValidMemberName(const char* s) {
342   bool angle_name = false;
343 
344   switch (*s) {
345     case '\0':
346       // The empty string is not a valid name.
347       return false;
348     case '<':
349       angle_name = true;
350       s++;
351       break;
352   }
353 
354   while (true) {
355     switch (*s) {
356       case '\0':
357         return !angle_name;
358       case '>':
359         return angle_name && s[1] == '\0';
360     }
361 
362     if (!IsValidPartOfMemberNameUtf8(&s)) {
363       return false;
364     }
365   }
366 }
367 
368 enum ClassNameType { kName, kDescriptor };
369 template<ClassNameType kType, char kSeparator>
IsValidClassName(const char * s)370 static bool IsValidClassName(const char* s) {
371   int arrayCount = 0;
372   while (*s == '[') {
373     arrayCount++;
374     s++;
375   }
376 
377   if (arrayCount > 255) {
378     // Arrays may have no more than 255 dimensions.
379     return false;
380   }
381 
382   ClassNameType type = kType;
383   if (type != kDescriptor && arrayCount != 0) {
384     /*
385      * If we're looking at an array of some sort, then it doesn't
386      * matter if what is being asked for is a class name; the
387      * format looks the same as a type descriptor in that case, so
388      * treat it as such.
389      */
390     type = kDescriptor;
391   }
392 
393   if (type == kDescriptor) {
394     /*
395      * We are looking for a descriptor. Either validate it as a
396      * single-character primitive type, or continue on to check the
397      * embedded class name (bracketed by "L" and ";").
398      */
399     switch (*(s++)) {
400     case 'B':
401     case 'C':
402     case 'D':
403     case 'F':
404     case 'I':
405     case 'J':
406     case 'S':
407     case 'Z':
408       // These are all single-character descriptors for primitive types.
409       return (*s == '\0');
410     case 'V':
411       // Non-array void is valid, but you can't have an array of void.
412       return (arrayCount == 0) && (*s == '\0');
413     case 'L':
414       // Class name: Break out and continue below.
415       break;
416     default:
417       // Oddball descriptor character.
418       return false;
419     }
420   }
421 
422   /*
423    * We just consumed the 'L' that introduces a class name as part
424    * of a type descriptor, or we are looking for an unadorned class
425    * name.
426    */
427 
428   bool sepOrFirst = true;  // first character or just encountered a separator.
429   for (;;) {
430     uint8_t c = (uint8_t) *s;
431     switch (c) {
432     case '\0':
433       /*
434        * Premature end for a type descriptor, but valid for
435        * a class name as long as we haven't encountered an
436        * empty component (including the degenerate case of
437        * the empty string "").
438        */
439       return (type == kName) && !sepOrFirst;
440     case ';':
441       /*
442        * Invalid character for a class name, but the
443        * legitimate end of a type descriptor. In the latter
444        * case, make sure that this is the end of the string
445        * and that it doesn't end with an empty component
446        * (including the degenerate case of "L;").
447        */
448       return (type == kDescriptor) && !sepOrFirst && (s[1] == '\0');
449     case '/':
450     case '.':
451       if (c != kSeparator) {
452         // The wrong separator character.
453         return false;
454       }
455       if (sepOrFirst) {
456         // Separator at start or two separators in a row.
457         return false;
458       }
459       sepOrFirst = true;
460       s++;
461       break;
462     default:
463       if (!IsValidPartOfMemberNameUtf8(&s)) {
464         return false;
465       }
466       sepOrFirst = false;
467       break;
468     }
469   }
470 }
471 
IsValidBinaryClassName(const char * s)472 bool IsValidBinaryClassName(const char* s) {
473   return IsValidClassName<kName, '.'>(s);
474 }
475 
IsValidJniClassName(const char * s)476 bool IsValidJniClassName(const char* s) {
477   return IsValidClassName<kName, '/'>(s);
478 }
479 
IsValidDescriptor(const char * s)480 bool IsValidDescriptor(const char* s) {
481   return IsValidClassName<kDescriptor, '/'>(s);
482 }
483 
PrettyDescriptor(Primitive::Type type)484 std::string PrettyDescriptor(Primitive::Type type) {
485   return PrettyDescriptor(Primitive::Descriptor(type));
486 }
487 
488 }  // namespace art
489