1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "util/Util.h"
18 
19 #include <utils/Unicode.h>
20 #include <algorithm>
21 #include <ostream>
22 #include <string>
23 #include <vector>
24 
25 #include "androidfw/StringPiece.h"
26 
27 #include "util/BigBuffer.h"
28 #include "util/Maybe.h"
29 
30 using android::StringPiece;
31 using android::StringPiece16;
32 
33 namespace aapt {
34 namespace util {
35 
SplitAndTransform(const StringPiece & str,char sep,const std::function<char (char)> & f)36 static std::vector<std::string> SplitAndTransform(
37     const StringPiece& str, char sep, const std::function<char(char)>& f) {
38   std::vector<std::string> parts;
39   const StringPiece::const_iterator end = std::end(str);
40   StringPiece::const_iterator start = std::begin(str);
41   StringPiece::const_iterator current;
42   do {
43     current = std::find(start, end, sep);
44     parts.emplace_back(str.substr(start, current).to_string());
45     if (f) {
46       std::string& part = parts.back();
47       std::transform(part.begin(), part.end(), part.begin(), f);
48     }
49     start = current + 1;
50   } while (current != end);
51   return parts;
52 }
53 
Split(const StringPiece & str,char sep)54 std::vector<std::string> Split(const StringPiece& str, char sep) {
55   return SplitAndTransform(str, sep, nullptr);
56 }
57 
SplitAndLowercase(const StringPiece & str,char sep)58 std::vector<std::string> SplitAndLowercase(const StringPiece& str, char sep) {
59   return SplitAndTransform(str, sep, ::tolower);
60 }
61 
StartsWith(const StringPiece & str,const StringPiece & prefix)62 bool StartsWith(const StringPiece& str, const StringPiece& prefix) {
63   if (str.size() < prefix.size()) {
64     return false;
65   }
66   return str.substr(0, prefix.size()) == prefix;
67 }
68 
EndsWith(const StringPiece & str,const StringPiece & suffix)69 bool EndsWith(const StringPiece& str, const StringPiece& suffix) {
70   if (str.size() < suffix.size()) {
71     return false;
72   }
73   return str.substr(str.size() - suffix.size(), suffix.size()) == suffix;
74 }
75 
// Returns a view of `str` with leading and trailing whitespace (as reported
// by isspace) removed. The result points into the same buffer as `str`; no
// data is copied. An empty or null-backed `str` is returned unchanged.
StringPiece TrimWhitespace(const StringPiece& str) {
  if (str.size() == 0 || str.data() == nullptr) {
    return str;
  }

  const char* start = str.data();
  const char* end = start + str.length();

  // isspace requires a value representable as unsigned char (or EOF). Bytes
  // >= 0x80 are negative where `char` is signed, so convert through
  // unsigned char to avoid undefined behaviour.
  while (start != end && isspace(static_cast<unsigned char>(*start))) {
    ++start;
  }

  while (end != start && isspace(static_cast<unsigned char>(*(end - 1)))) {
    --end;
  }

  return StringPiece(start, end - start);
}
94 
// Returns an iterator to the first character of `str` that is neither an
// ASCII letter, an ASCII digit, nor one of `allowed_chars`. Returns
// `str.end()` when every character is acceptable.
StringPiece::const_iterator FindNonAlphaNumericAndNotInSet(
    const StringPiece& str, const StringPiece& allowed_chars) {
  const auto is_acceptable = [&allowed_chars](char ch) {
    const bool is_lower = ch >= 'a' && ch <= 'z';
    const bool is_upper = ch >= 'A' && ch <= 'Z';
    const bool is_digit = ch >= '0' && ch <= '9';
    if (is_lower || is_upper || is_digit) {
      return true;
    }
    // Not alphanumeric: acceptable only if explicitly whitelisted.
    return std::find(allowed_chars.begin(), allowed_chars.end(), ch) !=
           allowed_chars.end();
  };
  return std::find_if_not(str.begin(), str.end(), is_acceptable);
}
119 
IsJavaClassName(const StringPiece & str)120 bool IsJavaClassName(const StringPiece& str) {
121   size_t pieces = 0;
122   for (const StringPiece& piece : Tokenize(str, '.')) {
123     pieces++;
124     if (piece.empty()) {
125       return false;
126     }
127 
128     // Can't have starting or trailing $ character.
129     if (piece.data()[0] == '$' || piece.data()[piece.size() - 1] == '$') {
130       return false;
131     }
132 
133     if (FindNonAlphaNumericAndNotInSet(piece, "$_") != piece.end()) {
134       return false;
135     }
136   }
137   return pieces >= 2;
138 }
139 
IsJavaPackageName(const StringPiece & str)140 bool IsJavaPackageName(const StringPiece& str) {
141   if (str.empty()) {
142     return false;
143   }
144 
145   size_t pieces = 0;
146   for (const StringPiece& piece : Tokenize(str, '.')) {
147     pieces++;
148     if (piece.empty()) {
149       return false;
150     }
151 
152     if (piece.data()[0] == '_' || piece.data()[piece.size() - 1] == '_') {
153       return false;
154     }
155 
156     if (FindNonAlphaNumericAndNotInSet(piece, "_") != piece.end()) {
157       return false;
158     }
159   }
160   return pieces >= 1;
161 }
162 
GetFullyQualifiedClassName(const StringPiece & package,const StringPiece & classname)163 Maybe<std::string> GetFullyQualifiedClassName(const StringPiece& package,
164                                               const StringPiece& classname) {
165   if (classname.empty()) {
166     return {};
167   }
168 
169   if (util::IsJavaClassName(classname)) {
170     return classname.to_string();
171   }
172 
173   if (package.empty()) {
174     return {};
175   }
176 
177   std::string result(package.data(), package.size());
178   if (classname.data()[0] != '.') {
179     result += '.';
180   }
181 
182   result.append(classname.data(), classname.size());
183   if (!IsJavaClassName(result)) {
184     return {};
185   }
186   return result;
187 }
188 
// Returns the number of consecutive ASCII digits at the front of the range
// [start, end). Neither pointer is modified.
static size_t ConsumeDigits(const char* start, const char* end) {
  size_t count = 0;
  while (start + count != end && start[count] >= '0' && start[count] <= '9') {
    ++count;
  }
  return count;
}
195 
VerifyJavaStringFormat(const StringPiece & str)196 bool VerifyJavaStringFormat(const StringPiece& str) {
197   const char* c = str.begin();
198   const char* const end = str.end();
199 
200   size_t arg_count = 0;
201   bool nonpositional = false;
202   while (c != end) {
203     if (*c == '%' && c + 1 < end) {
204       c++;
205 
206       if (*c == '%') {
207         c++;
208         continue;
209       }
210 
211       arg_count++;
212 
213       size_t num_digits = ConsumeDigits(c, end);
214       if (num_digits > 0) {
215         c += num_digits;
216         if (c != end && *c != '$') {
217           // The digits were a size, but not a positional argument.
218           nonpositional = true;
219         }
220       } else if (*c == '<') {
221         // Reusing last argument, bad idea since positions can be moved around
222         // during translation.
223         nonpositional = true;
224 
225         c++;
226 
227         // Optionally we can have a $ after
228         if (c != end && *c == '$') {
229           c++;
230         }
231       } else {
232         nonpositional = true;
233       }
234 
235       // Ignore size, width, flags, etc.
236       while (c != end && (*c == '-' || *c == '#' || *c == '+' || *c == ' ' ||
237                           *c == ',' || *c == '(' || (*c >= '0' && *c <= '9'))) {
238         c++;
239       }
240 
241       /*
242        * This is a shortcut to detect strings that are going to Time.format()
243        * instead of String.format()
244        *
245        * Comparison of String.format() and Time.format() args:
246        *
247        * String: ABC E GH  ST X abcdefgh  nost x
248        *   Time:    DEFGHKMS W Za  d   hkm  s w yz
249        *
250        * Therefore we know it's definitely Time if we have:
251        *     DFKMWZkmwyz
252        */
253       if (c != end) {
254         switch (*c) {
255           case 'D':
256           case 'F':
257           case 'K':
258           case 'M':
259           case 'W':
260           case 'Z':
261           case 'k':
262           case 'm':
263           case 'w':
264           case 'y':
265           case 'z':
266             return true;
267         }
268       }
269     }
270 
271     if (c != end) {
272       c++;
273     }
274   }
275 
276   if (arg_count > 1 && nonpositional) {
277     // Multiple arguments were specified, but some or all were non positional.
278     // Translated
279     // strings may rearrange the order of the arguments, which will break the
280     // string.
281     return false;
282   }
283   return true;
284 }
285 
ParseUnicodeCodepoint(const char ** start,const char * end)286 static Maybe<std::string> ParseUnicodeCodepoint(const char** start,
287                                                 const char* end) {
288   char32_t code = 0;
289   for (size_t i = 0; i < 4 && *start != end; i++, (*start)++) {
290     char c = **start;
291     char32_t a;
292     if (c >= '0' && c <= '9') {
293       a = c - '0';
294     } else if (c >= 'a' && c <= 'f') {
295       a = c - 'a' + 10;
296     } else if (c >= 'A' && c <= 'F') {
297       a = c - 'A' + 10;
298     } else {
299       return {};
300     }
301     code = (code << 4) | a;
302   }
303 
304   ssize_t len = utf32_to_utf8_length(&code, 1);
305   if (len < 0) {
306     return {};
307   }
308 
309   std::string result_utf8;
310   result_utf8.resize(len);
311   utf32_to_utf8(&code, 1, &*result_utf8.begin(), len + 1);
312   return result_utf8;
313 }
314 
Append(const StringPiece & str)315 StringBuilder& StringBuilder::Append(const StringPiece& str) {
316   if (!error_.empty()) {
317     return *this;
318   }
319 
320   // Where the new data will be appended to.
321   size_t new_data_index = str_.size();
322 
323   const char* const end = str.end();
324   const char* start = str.begin();
325   const char* current = start;
326   while (current != end) {
327     if (last_char_was_escape_) {
328       switch (*current) {
329         case 't':
330           str_ += '\t';
331           break;
332         case 'n':
333           str_ += '\n';
334           break;
335         case '#':
336           str_ += '#';
337           break;
338         case '@':
339           str_ += '@';
340           break;
341         case '?':
342           str_ += '?';
343           break;
344         case '"':
345           str_ += '"';
346           break;
347         case '\'':
348           str_ += '\'';
349           break;
350         case '\\':
351           str_ += '\\';
352           break;
353         case 'u': {
354           current++;
355           Maybe<std::string> c = ParseUnicodeCodepoint(&current, end);
356           if (!c) {
357             error_ = "invalid unicode escape sequence";
358             return *this;
359           }
360           str_ += c.value();
361           current -= 1;
362           break;
363         }
364 
365         default:
366           // Ignore.
367           break;
368       }
369       last_char_was_escape_ = false;
370       start = current + 1;
371     } else if (*current == '"') {
372       if (!quote_ && trailing_space_) {
373         // We found an opening quote, and we have
374         // trailing space, so we should append that
375         // space now.
376         if (trailing_space_) {
377           // We had trailing whitespace, so
378           // replace with a single space.
379           if (!str_.empty()) {
380             str_ += ' ';
381           }
382           trailing_space_ = false;
383         }
384       }
385       quote_ = !quote_;
386       str_.append(start, current - start);
387       start = current + 1;
388     } else if (*current == '\'' && !quote_) {
389       // This should be escaped.
390       error_ = "unescaped apostrophe";
391       return *this;
392     } else if (*current == '\\') {
393       // This is an escape sequence, convert to the real value.
394       if (!quote_ && trailing_space_) {
395         // We had trailing whitespace, so
396         // replace with a single space.
397         if (!str_.empty()) {
398           str_ += ' ';
399         }
400         trailing_space_ = false;
401       }
402       str_.append(start, current - start);
403       start = current + 1;
404       last_char_was_escape_ = true;
405     } else if (!quote_) {
406       // This is not quoted text, so look for whitespace.
407       if (isspace(*current)) {
408         // We found whitespace, see if we have seen some
409         // before.
410         if (!trailing_space_) {
411           // We didn't see a previous adjacent space,
412           // so mark that we did.
413           trailing_space_ = true;
414           str_.append(start, current - start);
415         }
416 
417         // Keep skipping whitespace.
418         start = current + 1;
419       } else if (trailing_space_) {
420         // We saw trailing space before, so replace all
421         // that trailing space with one space.
422         if (!str_.empty()) {
423           str_ += ' ';
424         }
425         trailing_space_ = false;
426       }
427     }
428     current++;
429   }
430   str_.append(start, end - start);
431 
432   // Accumulate the added string's UTF-16 length.
433   ssize_t len = utf8_to_utf16_length(
434       reinterpret_cast<const uint8_t*>(str_.data()) + new_data_index,
435       str_.size() - new_data_index);
436   if (len < 0) {
437     error_ = "invalid unicode code point";
438     return *this;
439   }
440   utf16_len_ += len;
441   return *this;
442 }
443 
Utf8ToUtf16(const StringPiece & utf8)444 std::u16string Utf8ToUtf16(const StringPiece& utf8) {
445   ssize_t utf16_length = utf8_to_utf16_length(
446       reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length());
447   if (utf16_length <= 0) {
448     return {};
449   }
450 
451   std::u16string utf16;
452   utf16.resize(utf16_length);
453   utf8_to_utf16(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length(),
454                 &*utf16.begin(), utf16_length + 1);
455   return utf16;
456 }
457 
Utf16ToUtf8(const StringPiece16 & utf16)458 std::string Utf16ToUtf8(const StringPiece16& utf16) {
459   ssize_t utf8_length = utf16_to_utf8_length(utf16.data(), utf16.length());
460   if (utf8_length <= 0) {
461     return {};
462   }
463 
464   std::string utf8;
465   utf8.resize(utf8_length);
466   utf16_to_utf8(utf16.data(), utf16.length(), &*utf8.begin(), utf8_length + 1);
467   return utf8;
468 }
469 
WriteAll(std::ostream & out,const BigBuffer & buffer)470 bool WriteAll(std::ostream& out, const BigBuffer& buffer) {
471   for (const auto& b : buffer) {
472     if (!out.write(reinterpret_cast<const char*>(b.buffer.get()), b.size)) {
473       return false;
474     }
475   }
476   return true;
477 }
478 
Copy(const BigBuffer & buffer)479 std::unique_ptr<uint8_t[]> Copy(const BigBuffer& buffer) {
480   std::unique_ptr<uint8_t[]> data =
481       std::unique_ptr<uint8_t[]>(new uint8_t[buffer.size()]);
482   uint8_t* p = data.get();
483   for (const auto& block : buffer) {
484     memcpy(p, block.buffer.get(), block.size);
485     p += block.size;
486   }
487   return data;
488 }
489 
operator ++()490 typename Tokenizer::iterator& Tokenizer::iterator::operator++() {
491   const char* start = token_.end();
492   const char* end = str_.end();
493   if (start == end) {
494     end_ = true;
495     token_.assign(token_.end(), 0);
496     return *this;
497   }
498 
499   start += 1;
500   const char* current = start;
501   while (current != end) {
502     if (*current == separator_) {
503       token_.assign(start, current - start);
504       return *this;
505     }
506     ++current;
507   }
508   token_.assign(start, end - start);
509   return *this;
510 }
511 
operator ==(const iterator & rhs) const512 bool Tokenizer::iterator::operator==(const iterator& rhs) const {
513   // We check equality here a bit differently.
514   // We need to know that the addresses are the same.
515   return token_.begin() == rhs.token_.begin() &&
516          token_.end() == rhs.token_.end() && end_ == rhs.end_;
517 }
518 
operator !=(const iterator & rhs) const519 bool Tokenizer::iterator::operator!=(const iterator& rhs) const {
520   return !(*this == rhs);
521 }
522 
iterator(StringPiece s,char sep,StringPiece tok,bool end)523 Tokenizer::iterator::iterator(StringPiece s, char sep, StringPiece tok,
524                               bool end)
525     : str_(s), separator_(sep), token_(tok), end_(end) {}
526 
Tokenizer(StringPiece str,char sep)527 Tokenizer::Tokenizer(StringPiece str, char sep)
528     : begin_(++iterator(str, sep, StringPiece(str.begin() - 1, 0), false)),
529       end_(str, sep, StringPiece(str.end(), 0), true) {}
530 
ExtractResFilePathParts(const StringPiece & path,StringPiece * out_prefix,StringPiece * out_entry,StringPiece * out_suffix)531 bool ExtractResFilePathParts(const StringPiece& path, StringPiece* out_prefix,
532                              StringPiece* out_entry, StringPiece* out_suffix) {
533   const StringPiece res_prefix("res/");
534   if (!StartsWith(path, res_prefix)) {
535     return false;
536   }
537 
538   StringPiece::const_iterator last_occurence = path.end();
539   for (auto iter = path.begin() + res_prefix.size(); iter != path.end();
540        ++iter) {
541     if (*iter == '/') {
542       last_occurence = iter;
543     }
544   }
545 
546   if (last_occurence == path.end()) {
547     return false;
548   }
549 
550   auto iter = std::find(last_occurence, path.end(), '.');
551   *out_suffix = StringPiece(iter, path.end() - iter);
552   *out_entry = StringPiece(last_occurence + 1, iter - last_occurence - 1);
553   *out_prefix = StringPiece(path.begin(), last_occurence - path.begin() + 1);
554   return true;
555 }
556 
GetString16(const android::ResStringPool & pool,size_t idx)557 StringPiece16 GetString16(const android::ResStringPool& pool, size_t idx) {
558   size_t len;
559   const char16_t* str = pool.stringAt(idx, &len);
560   if (str != nullptr) {
561     return StringPiece16(str, len);
562   }
563   return StringPiece16();
564 }
565 
GetString(const android::ResStringPool & pool,size_t idx)566 std::string GetString(const android::ResStringPool& pool, size_t idx) {
567   size_t len;
568   const char* str = pool.string8At(idx, &len);
569   if (str != nullptr) {
570     return std::string(str, len);
571   }
572   return Utf16ToUtf8(GetString16(pool, idx));
573 }
574 
575 }  // namespace util
576 }  // namespace aapt
577