1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * Validate and manipulate MUTF-8 encoded string data.
19  */
20 
21 #include "DexUtf.h"
22 
/*
 * Compare two '\0'-terminated modified UTF-8 strings, strcmp()-style.
 *
 * Both strings are decoded in lock step with dexGetUtf16FromUtf8() and
 * the decoded values are compared, so two different byte encodings of
 * the same value compare as equal. Only a literal '\0' byte ends a
 * string; the end-of-string test is made on the raw byte before any
 * decoding happens.
 *
 * Returns 0 if the strings are equivalent, a negative value if s1 sorts
 * before s2 (including when s1 is a proper prefix of s2), and a positive
 * value otherwise.
 */
int dexUtf8Cmp(const char* s1, const char* s2) {
    // Walk both strings for as long as neither one has run out.
    while (*s1 != '\0' && *s2 != '\0') {
        const int left = dexGetUtf16FromUtf8(&s1);
        const int right = dexGetUtf16FromUtf8(&s2);

        if (left != right) {
            return left - right;
        }
    }

    // At least one string is exhausted; the longer one sorts later.
    if (*s1 != '\0') {
        return 1;
    }
    if (*s2 != '\0') {
        return -1;
    }
    return 0;
}
48 
49 /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
50 u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
51     0x00000000, // 00..1f low control characters; nothing valid
52     0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
53     0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
54     0x07fffffe  // 60..7f lowercase etc.; valid: 'a'..'z'
55 };
56 
57 /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
dexIsValidMemberNameUtf8_0(const char ** pUtf8Ptr)58 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
59     /*
60      * It's a multibyte encoded character. Decode it and analyze. We
61      * accept anything that isn't (a) an improperly encoded low value,
62      * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
63      * control character, or (e) a high space, layout, or special
64      * character (U+00a0, U+2000..U+200f, U+2028..U+202f,
65      * U+fff0..U+ffff). This is all specified in the dex format
66      * document.
67      */
68 
69     u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
70 
71     // Perform follow-up tests based on the high 8 bits.
72     switch (utf16 >> 8) {
73         case 0x00: {
74             // It's only valid if it's above the ISO-8859-1 high space (0xa0).
75             return (utf16 > 0x00a0);
76         }
77         case 0xd8:
78         case 0xd9:
79         case 0xda:
80         case 0xdb: {
81             /*
82              * It's a leading surrogate. Check to see that a trailing
83              * surrogate follows.
84              */
85             utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
86             return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
87         }
88         case 0xdc:
89         case 0xdd:
90         case 0xde:
91         case 0xdf: {
92             // It's a trailing surrogate, which is not valid at this point.
93             return false;
94         }
95         case 0x20:
96         case 0xff: {
97             // It's in the range that has spaces, controls, and specials.
98             switch (utf16 & 0xfff8) {
99                 case 0x2000:
100                 case 0x2008:
101                 case 0x2028:
102                 case 0xfff0:
103                 case 0xfff8: {
104                     return false;
105                 }
106             }
107             break;
108         }
109     }
110 
111     return true;
112 }
113 
/*
 * Return whether the given string is a valid field or method name.
 *
 * A valid name is either a plain simple name, or a simple name wrapped
 * in angle brackets (for example "<init>"). '<' is allowed only as the
 * very first character, and when present the name must end with '>'.
 * In both forms the simple name itself must contain at least one
 * character, so "", "<" and "<>" are all rejected.
 *
 * Each character of the simple name is checked with
 * dexIsValidMemberNameUtf8(), which is handed &s so that the scan
 * position moves past the character it examined.
 */
bool dexIsValidMemberName(const char* s) {
    bool angleName = false;

    if (*s == '<') {
        angleName = true;
        s++;
    }

    /*
     * The simple name may not be empty. Without this check "<>" would
     * be accepted below, because the '>' terminator test would succeed
     * before any name character had been seen.
     */
    if (*s == '\0' || *s == '>') {
        return false;
    }

    for (;;) {
        switch (*s) {
            case '\0': {
                // End of string: fine unless we still owe a closing '>'.
                return !angleName;
            }
            case '>': {
                // '>' is only valid as the final character of a '<' name.
                return angleName && s[1] == '\0';
            }
        }
        if (!dexIsValidMemberNameUtf8(&s)) {
            return false;
        }
    }
}
148 
149 /* Helper for validating type descriptors and class names, which is parametric
150  * with respect to type vs. class and dot vs. slash. */
isValidTypeDescriptorOrClassName(const char * s,bool isClassName,bool dotSeparator)151 static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
152         bool dotSeparator) {
153     int arrayCount = 0;
154 
155     while (*s == '[') {
156         arrayCount++;
157         s++;
158     }
159 
160     if (arrayCount > 255) {
161         // Arrays may have no more than 255 dimensions.
162         return false;
163     }
164 
165     if (arrayCount != 0) {
166         /*
167          * If we're looking at an array of some sort, then it doesn't
168          * matter if what is being asked for is a class name; the
169          * format looks the same as a type descriptor in that case, so
170          * treat it as such.
171          */
172         isClassName = false;
173     }
174 
175     if (!isClassName) {
176         /*
177          * We are looking for a descriptor. Either validate it as a
178          * single-character primitive type, or continue on to check the
179          * embedded class name (bracketed by "L" and ";").
180          */
181         switch (*(s++)) {
182             case 'B':
183             case 'C':
184             case 'D':
185             case 'F':
186             case 'I':
187             case 'J':
188             case 'S':
189             case 'Z': {
190                 // These are all single-character descriptors for primitive types.
191                 return (*s == '\0');
192             }
193             case 'V': {
194                 // Non-array void is valid, but you can't have an array of void.
195                 return (arrayCount == 0) && (*s == '\0');
196             }
197             case 'L': {
198                 // Class name: Break out and continue below.
199                 break;
200             }
201             default: {
202                 // Oddball descriptor character.
203                 return false;
204             }
205         }
206     }
207 
208     /*
209      * We just consumed the 'L' that introduces a class name as part
210      * of a type descriptor, or we are looking for an unadorned class
211      * name.
212      */
213 
214     bool sepOrFirst = true; // first character or just encountered a separator.
215     for (;;) {
216         u1 c = (u1) *s;
217         switch (c) {
218             case '\0': {
219                 /*
220                  * Premature end for a type descriptor, but valid for
221                  * a class name as long as we haven't encountered an
222                  * empty component (including the degenerate case of
223                  * the empty string "").
224                  */
225                 return isClassName && !sepOrFirst;
226             }
227             case ';': {
228                 /*
229                  * Invalid character for a class name, but the
230                  * legitimate end of a type descriptor. In the latter
231                  * case, make sure that this is the end of the string
232                  * and that it doesn't end with an empty component
233                  * (including the degenerate case of "L;").
234                  */
235                 return !isClassName && !sepOrFirst && (s[1] == '\0');
236             }
237             case '/':
238             case '.': {
239                 if (dotSeparator != (c == '.')) {
240                     // The wrong separator character.
241                     return false;
242                 }
243                 if (sepOrFirst) {
244                     // Separator at start or two separators in a row.
245                     return false;
246                 }
247                 sepOrFirst = true;
248                 s++;
249                 break;
250             }
251             default: {
252                 if (!dexIsValidMemberNameUtf8(&s)) {
253                     return false;
254                 }
255                 sepOrFirst = false;
256                 break;
257             }
258         }
259     }
260 }
261 
/*
 * Return whether the given string is a valid type descriptor.
 * Class names embedded in the descriptor must use '/' as the package
 * separator.
 */
bool dexIsValidTypeDescriptor(const char* s) {
    const bool asClassName = false;   // validate in descriptor form
    const bool useDots = false;       // components are '/'-separated

    return isValidTypeDescriptorOrClassName(s, asClassName, useDots);
}
266 
/*
 * (documented in header)
 *
 * Validates s as a class name; dotSeparator selects '.' (true) or '/'
 * (false) as the separator between components.
 */
bool dexIsValidClassName(const char* s, bool dotSeparator) {
    const bool asClassName = true;

    return isValidTypeDescriptorOrClassName(s, asClassName, dotSeparator);
}
271 
/*
 * Return whether the given string is a valid reference descriptor:
 * it must pass dexIsValidTypeDescriptor() and describe a class or an
 * array rather than a primitive type.
 */
bool dexIsReferenceDescriptor(const char* s) {
    if (dexIsValidTypeDescriptor(s)) {
        // A valid descriptor's first character identifies its kind.
        switch (s[0]) {
            case 'L':
            case '[': {
                return true;
            }
        }
    }

    return false;
}
282 
/*
 * Return whether the given string is a valid class descriptor: it must
 * pass dexIsValidTypeDescriptor() and describe a class, not an array
 * or a primitive type.
 */
bool dexIsClassDescriptor(const char* s) {
    // Validate first; only then is the leading character meaningful.
    return dexIsValidTypeDescriptor(s) && (s[0] == 'L');
}
293 
/*
 * Return whether the given string is a valid field type descriptor: it
 * must pass dexIsValidTypeDescriptor() and be anything other than
 * "void".
 */
bool dexIsFieldDescriptor(const char* s) {
    // Among valid descriptors, only void starts with 'V'.
    return dexIsValidTypeDescriptor(s) && (s[0] != 'V');
}
304 
305