1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dexanalyze_strings.h"
18 
19 #include <algorithm>
20 #include <iomanip>
21 #include <iostream>
22 #include <queue>
23 
24 #include "base/time_utils.h"
25 #include "dex/class_accessor-inl.h"
26 #include "dex/code_item_accessors-inl.h"
27 #include "dex/dex_instruction-inl.h"
28 
29 namespace art {
30 namespace dexanalyze {
31 
32 // Tunable parameters.
33 static const size_t kMinPrefixLen = 1;
34 static const size_t kMaxPrefixLen = 255;
35 static const size_t kPrefixConstantCost = 4;
36 static const size_t kPrefixIndexCost = 2;
37 
38 class PrefixDictionary {
39  public:
40   // Add prefix data and return the offset to the start of the added data.
AddPrefixData(const uint8_t * data,size_t len)41   size_t AddPrefixData(const uint8_t* data, size_t len) {
42     const size_t offset = prefix_data_.size();
43     prefix_data_.insert(prefix_data_.end(), data, data + len);
44     return offset;
45   }
46 
47   static constexpr size_t kLengthBits = 8;
48   static constexpr size_t kLengthMask = (1u << kLengthBits) - 1;
49 
50   // Return the prefix offset and length.
GetOffset(uint32_t prefix_index,uint32_t * offset,uint32_t * length) const51   ALWAYS_INLINE void GetOffset(uint32_t prefix_index, uint32_t* offset, uint32_t* length) const {
52     CHECK_LT(prefix_index, offsets_.size());
53     const uint32_t data = offsets_[prefix_index];
54     *length = data & kLengthMask;
55     *offset = data >> kLengthBits;
56   }
57 
AddOffset(uint32_t offset,uint32_t length)58   uint32_t AddOffset(uint32_t offset, uint32_t length) {
59     CHECK_LE(length, kLengthMask);
60     offsets_.push_back((offset << kLengthBits) | length);
61     return offsets_.size() - 1;
62   }
63 
64  public:
65   std::vector<uint32_t> offsets_;
66   std::vector<uint8_t> prefix_data_;
67 };
68 
69 class PrefixStrings {
70  public:
71   class Builder {
72    public:
Builder(PrefixStrings * output)73     explicit Builder(PrefixStrings* output) : output_(output) {}
74     void Build(const std::vector<std::string>& strings);
75 
76    private:
77     PrefixStrings* const output_;
78   };
79 
80   // Return the string index that was added.
AddString(uint16_t prefix,const std::string & str)81   size_t AddString(uint16_t prefix, const std::string& str) {
82     const size_t string_offset = chars_.size();
83     chars_.push_back(static_cast<uint8_t>(prefix >> 8));
84     chars_.push_back(static_cast<uint8_t>(prefix >> 0));
85     EncodeUnsignedLeb128(&chars_, str.length());
86     const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]);
87     chars_.insert(chars_.end(), ptr, ptr + str.length());
88     string_offsets_.push_back(string_offset);
89     return string_offsets_.size() - 1;
90   }
91 
GetString(uint32_t string_idx) const92   std::string GetString(uint32_t string_idx) const {
93     const size_t offset = string_offsets_[string_idx];
94     const uint8_t* suffix_data = &chars_[offset];
95     uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) +
96         suffix_data[1];
97     suffix_data += 2;
98     uint32_t prefix_offset;
99     uint32_t prefix_len;
100     dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len);
101     const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset];
102     std::string ret(prefix_data, prefix_data + prefix_len);
103     uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data);
104     ret.insert(ret.end(), suffix_data, suffix_data + suffix_len);
105     return ret;
106   }
107 
Equal(uint32_t string_idx,const uint8_t * data,size_t len) const108   ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const {
109     const size_t offset = string_offsets_[string_idx];
110     const uint8_t* suffix_data = &chars_[offset];
111     uint16_t prefix_idx = (static_cast<uint16_t>(suffix_data[0]) << 8) +
112         suffix_data[1];
113     suffix_data += 2;
114     uint32_t prefix_offset;
115     uint32_t prefix_len;
116     dictionary_.GetOffset(prefix_idx, &prefix_offset, &prefix_len);
117     uint32_t suffix_len = DecodeUnsignedLeb128(&suffix_data);
118     if (prefix_len + suffix_len != len) {
119       return false;
120     }
121     const uint8_t* prefix_data = &dictionary_.prefix_data_[prefix_offset];
122     if ((true)) {
123       return memcmp(prefix_data, data, prefix_len) == 0u &&
124           memcmp(suffix_data, data + prefix_len, len - prefix_len) == 0u;
125     } else {
126       len -= prefix_len;
127       while (prefix_len != 0u) {
128         if (*prefix_data++ != *data++) {
129           return false;
130         }
131         --prefix_len;
132       }
133       while (len != 0u) {
134         if (*suffix_data++ != *data++) {
135           return false;
136         }
137         --len;
138       }
139       return true;
140     }
141   }
142 
143  public:
144   PrefixDictionary dictionary_;
145   std::vector<uint8_t> chars_;
146   std::vector<uint32_t> string_offsets_;
147 };
148 
149 // Normal non prefix strings.
150 class NormalStrings {
151  public:
152   // Return the string index that was added.
AddString(const std::string & str)153   size_t AddString(const std::string& str) {
154     const size_t string_offset = chars_.size();
155     EncodeUnsignedLeb128(&chars_, str.length());
156     const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&str[0]);
157     chars_.insert(chars_.end(), ptr, ptr + str.length());
158     string_offsets_.push_back(string_offset);
159     return string_offsets_.size() - 1;
160   }
161 
GetString(uint32_t string_idx) const162   std::string GetString(uint32_t string_idx) const {
163     const size_t offset = string_offsets_[string_idx];
164     const uint8_t* data = &chars_[offset];
165     uint32_t len = DecodeUnsignedLeb128(&data);
166     return std::string(data, data + len);
167   }
168 
Equal(uint32_t string_idx,const uint8_t * data,size_t len) const169   ALWAYS_INLINE bool Equal(uint32_t string_idx, const uint8_t* data, size_t len) const {
170     const size_t offset = string_offsets_[string_idx];
171     const uint8_t* str_data = &chars_[offset];
172     uint32_t str_len = DecodeUnsignedLeb128(&str_data);
173     if (str_len != len) {
174       return false;
175     }
176     return memcmp(data, str_data, len) == 0u;
177   }
178 
179  public:
180   std::vector<uint8_t> chars_;
181   std::vector<uint32_t> string_offsets_;
182 };
183 
184 // Node value = (distance from root) * (occurrences - 1).
185 class MatchTrie {
186  public:
Add(const std::string & str)187   MatchTrie* Add(const std::string& str) {
188     MatchTrie* node = this;
189     size_t depth = 0u;
190     for (uint8_t c : str) {
191       ++depth;
192       if (node->nodes_[c] == nullptr) {
193         MatchTrie* new_node = new MatchTrie();
194         node->nodes_[c].reset(new_node);
195         new_node->parent_ = node;
196         new_node->depth_ = depth;
197         new_node->incoming_ = c;
198         node = new_node;
199       } else {
200         node = node->nodes_[c].get();
201       }
202       ++node->count_;
203     }
204     return node;
205   }
206 
207   // Returns the length of the longest prefix and if it's a leaf node.
LongestPrefix(const std::string & str)208   MatchTrie* LongestPrefix(const std::string& str) {
209     MatchTrie* node = this;
210     for (uint8_t c : str) {
211       if (node->nodes_[c] == nullptr) {
212         break;
213       }
214       node = node->nodes_[c].get();
215     }
216     return node;
217   }
218 
IsLeaf() const219   bool IsLeaf() const {
220     for (const std::unique_ptr<MatchTrie>& cur_node : nodes_) {
221       if (cur_node != nullptr) {
222         return false;
223       }
224     }
225     return true;
226   }
227 
Savings() const228   int32_t Savings() const {
229     int32_t cost = kPrefixConstantCost;
230     int32_t first_used = 0u;
231     if (chosen_suffix_count_ == 0u) {
232       cost += depth_;
233     }
234     uint32_t extra_savings = 0u;
235     for (MatchTrie* cur = parent_; cur != nullptr; cur = cur->parent_) {
236       if (cur->chosen_) {
237         first_used = cur->depth_;
238         if (cur->chosen_suffix_count_ == 0u) {
239           // First suffix for the chosen parent, remove the cost of the dictionary entry.
240           extra_savings += first_used;
241         }
242         break;
243       }
244     }
245     return count_ * (depth_ - first_used) - cost + extra_savings;
246   }
247 
248   template <typename T, typename... Args, template <typename...> class Queue>
PopRealTop(Queue<T,Args...> & queue)249   T PopRealTop(Queue<T, Args...>& queue) {
250     auto pair = queue.top();
251     queue.pop();
252     // Keep updating values until one sticks.
253     while (pair.second->Savings() != pair.first) {
254       pair.first = pair.second->Savings();
255       queue.push(pair);
256       pair = queue.top();
257       queue.pop();
258     }
259     return pair;
260   }
261 
ExtractPrefixes(size_t max)262   std::vector<std::string> ExtractPrefixes(size_t max) {
263     std::vector<std::string> ret;
264     // Make priority queue and adaptively update it. Each node value is the savings from picking
265     // it. Insert all of the interesting nodes in the queue (children != 1).
266     std::priority_queue<std::pair<int32_t, MatchTrie*>> queue;
267     // Add all of the nodes to the queue.
268     std::vector<MatchTrie*> work(1, this);
269     while (!work.empty()) {
270       MatchTrie* elem = work.back();
271       work.pop_back();
272       size_t num_childs = 0u;
273       for (const std::unique_ptr<MatchTrie>& child : elem->nodes_) {
274         if (child != nullptr) {
275           work.push_back(child.get());
276           ++num_childs;
277         }
278       }
279       if (num_childs > 1u || elem->value_ != 0u) {
280         queue.emplace(elem->Savings(), elem);
281       }
282     }
283     std::priority_queue<std::pair<int32_t, MatchTrie*>> prefixes;
284     // The savings can only ever go down for a given node, never up.
285     while (max != 0u && !queue.empty()) {
286       std::pair<int32_t, MatchTrie*> pair = PopRealTop(queue);
287       if (pair.second != this && pair.first > 0) {
288         // Pick this node.
289         uint32_t count = pair.second->count_;
290         pair.second->chosen_ = true;
291         for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) {
292           if (cur->chosen_) {
293             break;
294           }
295           cur->count_ -= count;
296         }
297         for (MatchTrie* cur = pair.second->parent_; cur != this; cur = cur->parent_) {
298           ++cur->chosen_suffix_count_;
299         }
300         prefixes.emplace(pair.first, pair.second);
301         --max;
302       } else {
303         // Negative or no EV, just delete the node.
304       }
305     }
306     while (!prefixes.empty()) {
307       std::pair<int32_t, MatchTrie*> pair = PopRealTop(prefixes);
308       if (pair.first <= 0) {
309         continue;
310       }
311       ret.push_back(pair.second->GetString());
312     }
313     return ret;
314   }
315 
GetString() const316   std::string GetString() const {
317     std::vector<uint8_t> chars;
318     for (const MatchTrie* cur = this; cur->parent_ != nullptr; cur = cur->parent_) {
319       chars.push_back(cur->incoming_);
320     }
321     return std::string(chars.rbegin(), chars.rend());
322   }
323 
324   std::unique_ptr<MatchTrie> nodes_[256];
325   MatchTrie* parent_ = nullptr;
326   uint32_t count_ = 0u;
327   uint32_t depth_ = 0u;
328   int32_t savings_ = 0u;
329   uint8_t incoming_ = 0u;
330   // Value of the current node, non zero if the node is chosen.
331   uint32_t value_ = 0u;
332   // If the current node is chosen to be a used prefix.
333   bool chosen_ = false;
334   // If the current node is a prefix of a longer chosen prefix.
335   uint32_t chosen_suffix_count_ = 0u;
336 };
337 
Build(const std::vector<std::string> & strings)338 void PrefixStrings::Builder::Build(const std::vector<std::string>& strings) {
339   std::unique_ptr<MatchTrie> prefixe_trie(new MatchTrie());
340   for (size_t i = 0; i < strings.size(); ++i) {
341     size_t len = 0u;
342     if (i > 0u) {
343       CHECK_GT(strings[i], strings[i - 1]);
344       len = std::max(len, PrefixLen(strings[i], strings[i - 1]));
345     }
346     if (i < strings.size() - 1) {
347       len = std::max(len, PrefixLen(strings[i], strings[i + 1]));
348     }
349     len = std::min(len, kMaxPrefixLen);
350     if (len >= kMinPrefixLen) {
351       prefixe_trie->Add(strings[i].substr(0, len))->value_ = 1u;
352     }
353   }
354 
355   // Build prefixes.
356   {
357     static constexpr size_t kPrefixBits = 15;
358     std::vector<std::string> prefixes(prefixe_trie->ExtractPrefixes(1 << kPrefixBits));
359     // Add longest prefixes first so that subprefixes can share data.
360     std::sort(prefixes.begin(), prefixes.end(), [](const std::string& a, const std::string& b) {
361       return a.length() > b.length();
362     });
363     prefixe_trie.reset();
364     prefixe_trie.reset(new MatchTrie());
365     uint32_t prefix_idx = 0u;
366     CHECK_EQ(output_->dictionary_.AddOffset(0u, 0u), prefix_idx++);
367     for (const std::string& str : prefixes) {
368       uint32_t prefix_offset = 0u;
369       MatchTrie* node = prefixe_trie->LongestPrefix(str);
370       if (node != nullptr && node->depth_ == str.length() && node->value_ != 0u) {
371         CHECK_EQ(node->GetString(), str);
372         uint32_t existing_len = 0u;
373         output_->dictionary_.GetOffset(node->value_, &prefix_offset, &existing_len);
374         // Make sure to register the current node.
375         prefixe_trie->Add(str)->value_ = prefix_idx;
376       } else {
377         auto add_str = [&](const std::string& s) {
378           node = prefixe_trie->Add(s);
379           node->value_ = prefix_idx;
380           while (node != nullptr) {
381             node->value_ = prefix_idx;
382             node = node->parent_;
383           }
384         };
385         static constexpr size_t kNumSubstrings = 1u;
386         // Increasing kNumSubstrings provides savings since it enables common substrings and not
387         // only prefixes to share data. The problem is that it's slow.
388         for (size_t i = 0; i < std::min(str.length(), kNumSubstrings); ++i) {
389           add_str(str.substr(i));
390         }
391         prefix_offset = output_->dictionary_.AddPrefixData(
392             reinterpret_cast<const uint8_t*>(&str[0]),
393             str.length());
394       }
395       // TODO: Validiate the prefix offset.
396       CHECK_EQ(output_->dictionary_.AddOffset(prefix_offset, str.length()), prefix_idx);
397       ++prefix_idx;
398     }
399   }
400 
401   // Add strings to the dictionary.
402   for (const std::string& str : strings) {
403     MatchTrie* node = prefixe_trie->LongestPrefix(str);
404     uint32_t prefix_idx = 0u;
405     uint32_t best_length = 0u;
406     while (node != nullptr) {
407       uint32_t offset = 0u;
408       uint32_t length = 0u;
409       output_->dictionary_.GetOffset(node->value_, &offset, &length);
410       if (node->depth_ == length) {
411         prefix_idx = node->value_;
412         best_length = node->depth_;
413         break;
414         // Actually the prefix we want.
415       }
416       node = node->parent_;
417     }
418     output_->AddString(prefix_idx, str.substr(best_length));
419   }
420 }
421 
ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>> & dex_files)422 void AnalyzeStrings::ProcessDexFiles(const std::vector<std::unique_ptr<const DexFile>>& dex_files) {
423   std::set<std::string> unique_strings;
424   // Accumulate the strings.
425   for (const std::unique_ptr<const DexFile>& dex_file : dex_files) {
426     for (size_t i = 0; i < dex_file->NumStringIds(); ++i) {
427       uint32_t length = 0;
428       const char* data = dex_file->GetStringDataAndUtf16Length(dex::StringIndex(i), &length);
429       // Analyze if the string has any UTF16 chars.
430       bool have_wide_char = false;
431       const char* ptr = data;
432       for (size_t j = 0; j < length; ++j) {
433         have_wide_char = have_wide_char || GetUtf16FromUtf8(&ptr) >= 0x100;
434       }
435       if (have_wide_char) {
436         wide_string_bytes_ += 2 * length;
437       } else {
438         ascii_string_bytes_ += length;
439       }
440       string_data_bytes_ += ptr - data;
441       unique_strings.insert(data);
442     }
443   }
444   // Unique strings only since we want to exclude savings from multi-dex duplication.
445   ProcessStrings(std::vector<std::string>(unique_strings.begin(), unique_strings.end()));
446 }
447 
ProcessStrings(const std::vector<std::string> & strings)448 void AnalyzeStrings::ProcessStrings(const std::vector<std::string>& strings) {
449   // Calculate total shared prefix.
450   size_t prefix_index_cost_ = 0u;
451   for (size_t i = 0; i < strings.size(); ++i) {
452     size_t best_len = 0;
453     if (i > 0) {
454       best_len = std::max(best_len, PrefixLen(strings[i], strings[i - 1]));
455     }
456     if (i < strings.size() - 1) {
457       best_len = std::max(best_len, PrefixLen(strings[i], strings[i + 1]));
458     }
459     best_len = std::min(best_len, kMaxPrefixLen);
460     if (best_len >= kMinPrefixLen) {
461       total_shared_prefix_bytes_ += best_len;
462     }
463     prefix_index_cost_ += kPrefixIndexCost;
464     if (strings[i].length() < 64) {
465       ++short_strings_;
466     } else {
467       ++long_strings_;
468     }
469   }
470   total_prefix_index_cost_ += prefix_index_cost_;
471 
472   PrefixStrings prefix_strings;
473   {
474     PrefixStrings::Builder prefix_builder(&prefix_strings);
475     prefix_builder.Build(strings);
476   }
477   Benchmark(prefix_strings, strings, &prefix_timings_);
478   const size_t num_prefixes = prefix_strings.dictionary_.offsets_.size();
479   total_num_prefixes_ += num_prefixes;
480   total_prefix_table_ += num_prefixes * sizeof(prefix_strings.dictionary_.offsets_[0]);
481   total_prefix_dict_ += prefix_strings.dictionary_.prefix_data_.size();
482 
483   {
484     NormalStrings normal_strings;
485     for (const std::string& s : strings) {
486       normal_strings.AddString(s);
487     }
488     const uint64_t unique_string_data_bytes = normal_strings.chars_.size();
489     total_unique_string_data_bytes_ += unique_string_data_bytes;
490     total_prefix_savings_ += unique_string_data_bytes - prefix_strings.chars_.size() +
491         prefix_index_cost_;
492     Benchmark(normal_strings, strings, &normal_timings_);
493   }
494 }
495 
496 template <typename Strings>
Benchmark(const Strings & strings,const std::vector<std::string> & reference,StringTimings * timings)497 void AnalyzeStrings::Benchmark(const Strings& strings,
498                                const std::vector<std::string>& reference,
499                                StringTimings* timings) {
500   const size_t kIterations = 100;
501   timings->num_comparisons_ += reference.size() * kIterations;
502 
503   uint64_t start = NanoTime();
504   for (size_t j = 0; j < kIterations; ++j) {
505     for (size_t i = 0; i < reference.size(); ++i) {
506       CHECK(strings.Equal(
507           i,
508           reinterpret_cast<const uint8_t*>(&reference[i][0]),
509           reference[i].length()))
510           << i << ": " << strings.GetString(i) << " vs " << reference[i];
511     }
512   }
513   timings->time_equal_comparisons_ += NanoTime() - start;
514 
515   start = NanoTime();
516   for (size_t j = 0; j < kIterations; ++j) {
517     size_t count = 0u;
518     for (size_t i = 0; i < reference.size(); ++i) {
519       count += strings.Equal(
520           reference.size() - 1 - i,
521           reinterpret_cast<const uint8_t*>(&reference[i][0]),
522           reference[i].length());
523     }
524     CHECK_LT(count, 2u);
525   }
526   timings->time_non_equal_comparisons_ += NanoTime() - start;
527 }
528 
529 template void AnalyzeStrings::Benchmark(const PrefixStrings&,
530                                         const std::vector<std::string>&,
531                                         StringTimings* timings);
532 template void AnalyzeStrings::Benchmark(const NormalStrings&,
533                                         const std::vector<std::string>&,
534                                         StringTimings* timings);
535 
Dump(std::ostream & os) const536 void StringTimings::Dump(std::ostream& os) const {
537   const double comparisons = static_cast<double>(num_comparisons_);
538   os << "Compare equal " << static_cast<double>(time_equal_comparisons_) / comparisons << "\n";
539   os << "Compare not equal " << static_cast<double>(time_non_equal_comparisons_) / comparisons << "\n";
540 }
541 
Dump(std::ostream & os,uint64_t total_size) const542 void AnalyzeStrings::Dump(std::ostream& os, uint64_t total_size) const {
543   os << "Total string data bytes " << Percent(string_data_bytes_, total_size) << "\n";
544   os << "Total unique string data bytes "
545      << Percent(total_unique_string_data_bytes_, total_size) << "\n";
546   os << "UTF-16 string data bytes " << Percent(wide_string_bytes_, total_size) << "\n";
547   os << "ASCII string data bytes " << Percent(ascii_string_bytes_, total_size) << "\n";
548 
549   os << "Prefix string timings\n";
550   prefix_timings_.Dump(os);
551   os << "Normal string timings\n";
552   normal_timings_.Dump(os);
553 
554   // Prefix based strings.
555   os << "Total shared prefix bytes " << Percent(total_shared_prefix_bytes_, total_size) << "\n";
556   os << "Prefix dictionary cost " << Percent(total_prefix_dict_, total_size) << "\n";
557   os << "Prefix table cost " << Percent(total_prefix_table_, total_size) << "\n";
558   os << "Prefix index cost " << Percent(total_prefix_index_cost_, total_size) << "\n";
559   int64_t net_savings = total_prefix_savings_;
560   net_savings -= total_prefix_dict_;
561   net_savings -= total_prefix_table_;
562   net_savings -= total_prefix_index_cost_;
563   os << "Prefix dictionary elements " << total_num_prefixes_ << "\n";
564   os << "Prefix base savings " << Percent(total_prefix_savings_, total_size) << "\n";
565   os << "Prefix net savings " << Percent(net_savings, total_size) << "\n";
566   os << "Strings using prefix "
567      << Percent(strings_used_prefixed_, total_prefix_index_cost_ / kPrefixIndexCost) << "\n";
568   os << "Short strings " << Percent(short_strings_, short_strings_ + long_strings_) << "\n";
569   if (verbose_level_ >= VerboseLevel::kEverything) {
570     std::vector<std::pair<std::string, size_t>> pairs;  // (prefixes_.begin(), prefixes_.end());
571     // Sort lexicographically.
572     std::sort(pairs.begin(), pairs.end());
573     for (const auto& pair : pairs) {
574       os << pair.first << " : " << pair.second << "\n";
575     }
576   }
577 }
578 
579 }  // namespace dexanalyze
580 }  // namespace art
581