1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_TRACE_PROCESSOR_CONTAINERS_STRING_POOL_H_
18 #define SRC_TRACE_PROCESSOR_CONTAINERS_STRING_POOL_H_
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 
23 #include <limits>
24 #include <unordered_map>
25 #include <vector>
26 
27 #include "perfetto/ext/base/optional.h"
28 #include "perfetto/ext/base/paged_memory.h"
29 #include "perfetto/protozero/proto_utils.h"
30 #include "src/trace_processor/containers/null_term_string_view.h"
31 
32 namespace perfetto {
33 namespace trace_processor {
34 
35 // Interns strings in a string pool and hands out compact StringIds which can
36 // be used to retrieve the string in O(1).
37 class StringPool {
38  public:
39   struct Id {
40     Id() = default;
41 
42     bool operator==(const Id& other) const { return other.id == id; }
43     bool operator!=(const Id& other) const { return !(other == *this); }
44     bool operator<(const Id& other) const { return id < other.id; }
45 
is_nullId46     bool is_null() const { return id == 0u; }
47 
is_large_stringId48     bool is_large_string() const { return id & kLargeStringFlagBitMask; }
49 
block_offsetId50     uint32_t block_offset() const { return id & kBlockOffsetBitMask; }
51 
block_indexId52     uint32_t block_index() const {
53       return (id & kBlockIndexBitMask) >> kNumBlockOffsetBits;
54     }
55 
large_string_indexId56     uint32_t large_string_index() const {
57       PERFETTO_DCHECK(is_large_string());
58       return id & ~kLargeStringFlagBitMask;
59     }
60 
raw_idId61     uint32_t raw_id() const { return id; }
62 
LargeStringId63     static Id LargeString(size_t index) {
64       PERFETTO_DCHECK(index <= static_cast<uint32_t>(index));
65       PERFETTO_DCHECK(!(index & kLargeStringFlagBitMask));
66       return Id(kLargeStringFlagBitMask | static_cast<uint32_t>(index));
67     }
68 
BlockStringId69     static Id BlockString(size_t index, uint32_t offset) {
70       PERFETTO_DCHECK(index < (1u << (kNumBlockIndexBits + 1)));
71       PERFETTO_DCHECK(offset < (1u << (kNumBlockOffsetBits + 1)));
72       return Id(~kLargeStringFlagBitMask &
73                 (static_cast<uint32_t>(index << kNumBlockOffsetBits) |
74                  (offset & kBlockOffsetBitMask)));
75     }
76 
RawId77     static constexpr Id Raw(uint32_t raw) { return Id(raw); }
78 
NullId79     static constexpr Id Null() { return Id(0u); }
80 
81    private:
IdId82     constexpr Id(uint32_t i) : id(i) {}
83 
84     uint32_t id;
85   };
86 
87   // Iterator over the strings in the pool.
88   class Iterator {
89    public:
90     Iterator(const StringPool*);
91 
92     explicit operator bool() const;
93     Iterator& operator++();
94 
95     NullTermStringView StringView();
96     Id StringId();
97 
98    private:
99     const StringPool* pool_ = nullptr;
100     uint32_t block_index_ = 0;
101     uint32_t block_offset_ = 0;
102     uint32_t large_strings_index_ = 0;
103   };
104 
105   StringPool();
106   ~StringPool();
107 
108   // Allow std::move().
109   StringPool(StringPool&&);
110   StringPool& operator=(StringPool&&);
111 
112   // Disable implicit copy.
113   StringPool(const StringPool&) = delete;
114   StringPool& operator=(const StringPool&) = delete;
115 
InternString(base::StringView str)116   Id InternString(base::StringView str) {
117     if (str.data() == nullptr)
118       return Id::Null();
119 
120     auto hash = str.Hash();
121     auto id_it = string_index_.find(hash);
122     if (id_it != string_index_.end()) {
123       PERFETTO_DCHECK(Get(id_it->second) == str);
124       return id_it->second;
125     }
126     return InsertString(str, hash);
127   }
128 
GetId(base::StringView str)129   base::Optional<Id> GetId(base::StringView str) const {
130     if (str.data() == nullptr)
131       return Id::Null();
132 
133     auto hash = str.Hash();
134     auto id_it = string_index_.find(hash);
135     if (id_it != string_index_.end()) {
136       PERFETTO_DCHECK(Get(id_it->second) == str);
137       return id_it->second;
138     }
139     return base::nullopt;
140   }
141 
Get(Id id)142   NullTermStringView Get(Id id) const {
143     if (id.is_null())
144       return NullTermStringView();
145     if (id.is_large_string())
146       return GetLargeString(id);
147     return GetFromBlockPtr(IdToPtr(id));
148   }
149 
CreateIterator()150   Iterator CreateIterator() const { return Iterator(this); }
151 
size()152   size_t size() const { return string_index_.size(); }
153 
154  private:
155   using StringHash = uint64_t;
156 
157   struct Block {
BlockBlock158     explicit Block(size_t size)
159         : mem_(base::PagedMemory::Allocate(size,
160                                            base::PagedMemory::kDontCommit)),
161           size_(size) {}
162     ~Block() = default;
163 
164     // Allow std::move().
165     Block(Block&&) noexcept = default;
166     Block& operator=(Block&&) = default;
167 
168     // Disable implicit copy.
169     Block(const Block&) = delete;
170     Block& operator=(const Block&) = delete;
171 
GetBlock172     uint8_t* Get(uint32_t offset) const {
173       return static_cast<uint8_t*>(mem_.Get()) + offset;
174     }
175 
176     std::pair<bool /*success*/, uint32_t /*offset*/> TryInsert(
177         base::StringView str);
178 
OffsetOfBlock179     uint32_t OffsetOf(const uint8_t* ptr) const {
180       PERFETTO_DCHECK(Get(0) < ptr &&
181                       ptr <= Get(static_cast<uint32_t>(size_ - 1)));
182       return static_cast<uint32_t>(ptr - Get(0));
183     }
184 
posBlock185     uint32_t pos() const { return pos_; }
186 
187    private:
188     base::PagedMemory mem_;
189     uint32_t pos_ = 0;
190     size_t size_ = 0;
191   };
192 
193   friend class Iterator;
194   friend class StringPoolTest;
195 
196   // StringPool IDs are 32-bit. If the MSB is 1, the remaining bits of the ID
197   // are an index into the |large_strings_| vector. Otherwise, the next 6 bits
198   // are the index of the Block in the pool, and the remaining 25 bits the
199   // offset of the encoded string inside the pool.
200   //
201   // [31] [30:25] [24:0]
202   //  |      |       |
203   //  |      |       +---- offset in block (or LSB of large string index).
204   //  |      +------------ block index (or MSB of large string index).
205   //  +------------------- 1: large string, 0: string in a Block.
206   static constexpr size_t kNumBlockIndexBits = 6;
207   static constexpr size_t kNumBlockOffsetBits = 25;
208 
209   static constexpr size_t kLargeStringFlagBitMask = 1u << 31;
210   static constexpr size_t kBlockOffsetBitMask = (1u << kNumBlockOffsetBits) - 1;
211   static constexpr size_t kBlockIndexBitMask =
212       0xffffffff & ~kLargeStringFlagBitMask & ~kBlockOffsetBitMask;
213 
214   static constexpr size_t kBlockSizeBytes = kBlockOffsetBitMask + 1;  // 32 MB
215 
216   // If a string doesn't fit into the current block, we can either start a new
217   // block or insert the string into the |large_strings_| vector. To maximize
218   // the used proportion of each block's memory, we only start a new block if
219   // the string isn't very large.
220   static constexpr size_t kMinLargeStringSizeBytes = kBlockSizeBytes / 8;
221 
222   // Number of bytes to reserve for size and null terminator.
223   // This is the upper limit on metadata size: 5 bytes for max uint32,
224   // plus 1 byte for null terminator. The actual size may be lower.
225   static constexpr uint8_t kMaxMetadataSize = 6;
226 
227   // Inserts the string with the given hash into the pool and return its Id.
228   Id InsertString(base::StringView, uint64_t hash);
229 
230   // Insert a large string into the pool and return its Id.
231   Id InsertLargeString(base::StringView, uint64_t hash);
232 
233   // The returned pointer points to the start of the string metadata (i.e. the
234   // first byte of the size).
IdToPtr(Id id)235   const uint8_t* IdToPtr(Id id) const {
236     // If the MSB is set, the ID represents an index into |large_strings_|, so
237     // shouldn't be converted into a block pointer.
238     PERFETTO_DCHECK(!id.is_large_string());
239 
240     size_t block_index = id.block_index();
241     uint32_t block_offset = id.block_offset();
242 
243     PERFETTO_DCHECK(block_index < blocks_.size());
244     PERFETTO_DCHECK(block_offset < blocks_[block_index].pos());
245 
246     return blocks_[block_index].Get(block_offset);
247   }
248 
249   // |ptr| should point to the start of the string metadata (i.e. the first byte
250   // of the size).
251   // Returns pointer to the start of the string.
ReadSize(const uint8_t * ptr,uint32_t * size)252   static const uint8_t* ReadSize(const uint8_t* ptr, uint32_t* size) {
253     uint64_t value = 0;
254     const uint8_t* str_ptr = protozero::proto_utils::ParseVarInt(
255         ptr, ptr + kMaxMetadataSize, &value);
256     PERFETTO_DCHECK(str_ptr != ptr);
257     PERFETTO_DCHECK(value < std::numeric_limits<uint32_t>::max());
258     *size = static_cast<uint32_t>(value);
259     return str_ptr;
260   }
261 
262   // |ptr| should point to the start of the string metadata (i.e. the first byte
263   // of the size).
GetFromBlockPtr(const uint8_t * ptr)264   static NullTermStringView GetFromBlockPtr(const uint8_t* ptr) {
265     uint32_t size = 0;
266     const uint8_t* str_ptr = ReadSize(ptr, &size);
267     return NullTermStringView(reinterpret_cast<const char*>(str_ptr), size);
268   }
269 
270   // Lookup a string in the |large_strings_| vector. |id| should have the MSB
271   // set.
GetLargeString(Id id)272   NullTermStringView GetLargeString(Id id) const {
273     PERFETTO_DCHECK(id.is_large_string());
274     size_t index = id.large_string_index();
275     PERFETTO_DCHECK(index < large_strings_.size());
276     const std::string* str = large_strings_[index].get();
277     return NullTermStringView(str->c_str(), str->size());
278   }
279 
280   // The actual memory storing the strings.
281   std::vector<Block> blocks_;
282 
283   // Any string that is too large to fit into a Block is stored separately
284   // (inside a unique_ptr to ensure any references to it remain valid even if
285   // |large_strings_| is resized).
286   std::vector<std::unique_ptr<std::string>> large_strings_;
287 
288   // Maps hashes of strings to the Id in the string pool.
289   // TODO(lalitm): At some point we should benchmark just using a static
290   // hashtable of 1M elements, we can afford paying a fixed 8MB here
291   std::unordered_map<StringHash, Id> string_index_;
292 };
293 
294 }  // namespace trace_processor
295 }  // namespace perfetto
296 
297 template <>
298 struct std::hash<::perfetto::trace_processor::StringPool::Id> {
299   using argument_type = ::perfetto::trace_processor::StringPool::Id;
300   using result_type = size_t;
301 
302   result_type operator()(const argument_type& r) const {
303     return std::hash<uint32_t>{}(r.raw_id());
304   }
305 };
306 
307 #endif  // SRC_TRACE_PROCESSOR_CONTAINERS_STRING_POOL_H_
308