1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Copyright 2011 Google Inc. All Rights Reserved.
16 // Author: ulas@google.com (Ulas Kirazci)
17 //
18 // Trie for word prefix lookups. Features:
19 //
20 // - Dynamic additions (but not deletions)
21 // - Low memory usage
22 // - Reasonable latency but not QPS
23 // - Revive from persistence is a disk read
24 // - Stores a 4-byte value associated with every key
25 //
26 // Associated with each value in the trie is a set of property ids.  For
27 // efficiency, property ids should start at 0 and be densely packed.  A value
28 // may have more than one id set.  There is an additional deleted property
29 // for each value, which is set only when all the property ids associated with a
30 // value have been cleared.  In the flash_index, property ids are used to track
31 // corpus ids.
32 //
33 // Not thread-safe.
34 
35 #ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
36 #define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
37 
38 #include <stdint.h>
39 
40 #include <memory>
41 #include <string>
42 #include <unordered_map>
43 #include <vector>
44 
45 #include "icing/legacy/core/icing-compat.h"
46 #include "icing/legacy/core/icing-packed-pod.h"
47 #include "icing/legacy/index/icing-filesystem.h"
48 #include "icing/legacy/index/icing-mmapper.h"
49 #include "icing/legacy/index/icing-storage.h"
50 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
51 #include "icing/util/i18n-utils.h"
52 #include "unicode/utf8.h"
53 
54 namespace icing {
55 namespace lib {
56 
57 class IcingFlashBitmap;
58 
59 class IcingDynamicTrie : public IIcingStorage {
60   class Dumper;
61   class IcingDynamicTrieStorage;
62 
63  public:
64   // Adjacent bit fields are usually packed automatically. However, that is
65   // implementation specific:
66   // http://en.cppreference.com/w/cpp/language/bit_field
67   // So we'll set packed to be explicit.
68   class Node {
69    public:
70     // This object is only ever used by an ArrayStorage, which allocates
71     // sizeof(Node) bytes, zeroes them out and then casts to a Node.
72     Node() = delete;
73 
next_index()74     uint32_t next_index() const { return next_index_; }
set_next_index(uint32_t next_index)75     void set_next_index(uint32_t next_index) { next_index_ = next_index; }
76 
is_leaf()77     bool is_leaf() const { return is_leaf_; }
set_is_leaf(bool is_leaf)78     void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; }
79 
log2_num_children()80     uint8_t log2_num_children() const { return log2_num_children_; }
set_log2_num_children(uint8_t log2_num_children)81     void set_log2_num_children(uint8_t log2_num_children) {
82       log2_num_children_ = log2_num_children;
83     }
84 
85    private:
86     uint32_t next_index_ : 27;
87     uint32_t is_leaf_ : 1;
88     uint32_t log2_num_children_ : 4;
89   } __attribute__((packed));
90   static_assert(sizeof(Node) == 4, "");
91   static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan");
92 
93   // Adjacent bit fields are usually packed automatically. However, that is
94   // implementation specific:
95   // http://en.cppreference.com/w/cpp/language/bit_field.
96   // So we'll set packed to be explicit.
97   union Next {
Next(uint8_t val,uint32_t node_index)98     Next(uint8_t val, uint32_t node_index) {
99       used.val = val;
100       used.node_index = node_index;
101     }
102 
val()103     uint8_t val() const { return used.val; }
set_val(uint8_t val)104     void set_val(uint8_t val) { used.val = val; }
105 
node_index()106     uint32_t node_index() const { return used.node_index; }
set_node_index(uint32_t node_index)107     void set_node_index(uint32_t node_index) { used.node_index = node_index; }
108 
next_index()109     uint32_t next_index() const { return freelink.next_index; }
set_next_index(uint32_t next_index)110     void set_next_index(uint32_t next_index) {
111       freelink.next_index = next_index;
112     }
113 
114     bool operator<(const Next &next2) const {
115       if (val() == next2.val()) {
116         return node_index() < next2.node_index();
117       }
118       return val() < next2.val();
119     }
120 
121    private:
122     // This object is only ever used by an ArrayStorage, which allocates
123     // sizeof(Node) bytes, zeroes them out and then casts to a Node.
124     Next() = default;
125 
126     struct {
127       uint32_t val : 8;
128       uint32_t node_index : 24;
129     } used;
130     struct {
131       uint32_t next_index : 32;
132     } freelink;
133   } __attribute__((packed));
134   static_assert(sizeof(Next) == 4, "");
135   static_assert(sizeof(Next) % alignof(Next) == 0, "");
136   static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan");
137 
138   static const int kMaxNextArraySize = 256;
139   static const int kNumNextAllocationBuckets = 9;  // [log2(1), log2(256)]
140 
141   static const uint32_t kMaxPropertyId = (1 << 16) - 1;
142 
143   static const uint32_t kInvalidValueIndex = 0;
144 
145   static const uint32_t kNoCrc = 0;
146 
147   struct Stats {
148     uint32_t num_keys;
149 
150     // Node stats
151 
152     uint32_t num_nodes;
153     uint32_t max_nodes;
154     // Count of intermediate nodes.
155     uint32_t num_intermediates;
156     // Count of leaf nodes.
157     uint32_t num_leaves;
158 
159     // Next stats
160 
161     uint32_t num_nexts;
162     uint32_t max_nexts;
163     // Count of next arrays by size.
164     uint32_t child_counts[kMaxNextArraySize];
165     // Wasted next array space per allocation bucket (in Nexts, not
166     // bytes).
167     uint32_t wasted[kNumNextAllocationBuckets];
168     // Sum of wasted array.
169     uint32_t total_wasted;
170 
171     // Suffix stats
172 
173     uint32_t suffixes_size;
174     uint32_t max_suffixes_size;
175     // Bytes actually used by suffixes.
176     uint32_t suffixes_used;
177     // Number of suffixes that are just empty strings.
178     uint32_t null_suffixes;
179 
180     // Next free-list stats
181     uint32_t num_free[kNumNextAllocationBuckets];
182     // Total Next nodes free (weighted sum of the above).
183     uint32_t total_free;
184 
185     // Dirty pages.
186     uint32_t dirty_pages_nodes;
187     uint32_t dirty_pages_nexts;
188     uint32_t dirty_pages_suffixes;
189 
190     std::string DumpStats(int verbosity) const;
191   };
192 
193   // Options when creating the trie. Maximums for the node/next/suffix
194   // arrays must be specified in advance.
195   struct Options {
196     // Absolute maximums.
197     static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize;
198 
199     // The default takes 13MB of memory and can take about 1M English
200     // words.
OptionsOptions201     Options()
202         : max_nodes(1U << 20),
203           max_nexts(1U << 20),
204           max_suffixes_size(5U << 20),
205           value_size(sizeof(uint32_t)) {}
OptionsOptions206     Options(uint32_t max_nodes_in, uint32_t max_nexts_in,
207             uint32_t max_suffixes_size_in, uint32_t value_size_in)
208         : max_nodes(max_nodes_in),
209           max_nexts(max_nexts_in),
210           max_suffixes_size(max_suffixes_size_in),
211           value_size(value_size_in) {}
212 
213     uint32_t max_nodes;
214     uint32_t max_nexts;
215     uint32_t max_suffixes_size;
216     uint32_t value_size;
217 
218     // True if options do not exceed absolute maximums.
219     bool is_valid() const;
220   };
221 
222   // These can be supplied during runtime, as opposed to the persisted
223   // Options above.
224   struct RuntimeOptions {
225     enum StoragePolicy {
226       // Changes are reflected in the underlying file immediately but
227       // more vulnerable to corruption.
228       kMapSharedWithCrc,
229 
230       // Changes only applied during Flush. Smaller window of
231       // vulnerability to corruption.
232       kExplicitFlush,
233     };
234 
set_storage_policyRuntimeOptions235     RuntimeOptions &set_storage_policy(StoragePolicy sp) {
236       storage_policy = sp;
237       return *this;
238     }
239 
240     StoragePolicy storage_policy = kExplicitFlush;
241   };
242 
max_value_index(const Options & options)243   static uint32_t max_value_index(const Options &options) {
244     return options.max_suffixes_size;
245   }
246 
247   // Light-weight constructor. Real work happens in Create or Init.
248   IcingDynamicTrie(const std::string &filename_base,
249                    const RuntimeOptions &runtime_options,
250                    const IcingFilesystem *filesystem);
251   ~IcingDynamicTrie() override;
252 
is_initialized()253   bool is_initialized() const { return is_initialized_; }
254 
255   // Create, but do not Init, a new trie with options if the file does
256   // not already exist.
257   //
258   // Returns true if successfully created all files or files already
259   // exist. Does not do a complete sanity check for when files seem to
260   // exist. Cleans up files if creation fails midstream.
261   bool CreateIfNotExist(const Options &options);
262 
UpgradeTo(int new_version)263   bool UpgradeTo(int new_version) override { return true; }
264   bool Init() override;
265   void Close() override;
266   bool Remove() override;
267   uint64_t GetDiskUsage() const override;
268 
269   // Returns the size of the elements held in the trie. This excludes the size
270   // of any internal metadata of the trie, e.g. the trie's header.
271   uint64_t GetElementsSize() const;
272 
273   // REQUIRED: For all functions below is_initialized() == true.
274 
275   // Number of keys in trie.
276   uint32_t size() const;
277 
278   // Collecting stats.
279   void CollectStats(Stats *stats) const;
280 
281   // Gets all of the contents of the trie for debugging purposes. Note: this
282   // stores the entire set of terms in memory.
283   //   pretty_print - The tree structure of the trie will be written to this.
284   //   keys - All keys in the trie are appended to this vector.
285   void DumpTrie(std::ostream *pretty_print,
286                 std::vector<std::string> *keys) const;
287 
288   // Empty out the trie without closing or removing.
289   void Clear();
290 
291   // Clears the suffix and value at the given index. Returns true on success.
292   bool ClearSuffixAndValue(uint32_t suffix_value_index);
293 
294   // Resets the next at the given index so that it points to no node.
295   // Returns true on success.
296   bool ResetNext(uint32_t next_index);
297 
298   // Sorts the next array of the node. Returns true on success.
299   bool SortNextArray(const Node *node);
300 
301   // Sync to disk.
302   bool Sync() override;
303 
304   // Tell kernel we will access the memory shortly.
305   void Warm() const;
306 
307   // Potentially about to get nuked.
308   void OnSleep() override;
309 
310   // Compact trie into out for value indices present in old_tvi_to_new_value.
311   class NewValueMap {
312    public:
313     virtual ~NewValueMap();
314 
315     // Returns the new value we want to assign to the entry at old
316     // value index. We don't take ownership of the pointer.
317     virtual const void *GetNewValue(uint32_t old_value_index) const = 0;
318   };
319   // Compacts this trie. This drops all deleted keys, drops all keys for which
320   // old_tvi_to_new_value returns nullptr, updates values to be the values
321   // returned by old_tvi_to_new_value, rewrites tvis, and saves the results into
322   // the trie given in 'out'. 'old_to_new_tvi' is be populated with a mapping of
323   // old value_index to new value_index.
324   bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
325                std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const;
326 
327   // Insert value at key. If key already exists and replace == true,
328   // replaces old value with value. We take a copy of value.
329   //
330   // If value_index is not NULL, returns a pointer to value in
331   // value_index. This can then be used with SetValueAtIndex
332   // below. value_index is not valid past a Clear/Read/Write.
333   //
334   // Returns false if there is no space left in the trie.
335   //
336   // REQUIRES: value a buffer of size value_size()
Insert(const char * key,const void * value)337   bool Insert(const char *key, const void *value) {
338     return Insert(key, value, nullptr, true, nullptr);
339   }
Insert(const char * key,const void * value,uint32_t * value_index,bool replace)340   bool Insert(const char *key, const void *value, uint32_t *value_index,
341               bool replace) {
342     return Insert(key, value, value_index, replace, nullptr);
343   }
344   bool Insert(const char *key, const void *value, uint32_t *value_index,
345               bool replace, bool *pnew_key);
346 
347   // Get a value returned by Insert value_index. This points to the
348   // value in the trie. The pointer is immutable and always valid
349   // while the trie is alive.
350   const void *GetValueAtIndex(uint32_t value_index) const;
351 
352   // Set a value returned by Insert value_index. We take a copy of
353   // value.
354   //
355   // REQUIRES: value a buffer of size value_size()
356   void SetValueAtIndex(uint32_t value_index, const void *value);
357 
358   // Returns true if key is found and sets value. If value_index is
359   // not NULL, returns value_index (see Insert discussion above).
360   // If the key is not found, returns false and neither value nor
361   // value_index is modified.
362   //
363   // REQUIRES: value a buffer of size value_size()
Find(const char * key,void * value)364   bool Find(const char *key, void *value) const {
365     return Find(key, value, nullptr);
366   }
367   bool Find(const char *key, void *value, uint32_t *value_index) const;
368 
369   // Find the input key and all keys that are a variant of the input
370   // key according to a variant map. Currently supports
371   // transliteration. For example "a" is a variant for "à" or "á" so
372   // an "a" in the input key can match those characters in the trie in
373   // addition to itself.
374   //
375   // If prefix is set, also returns any prefix matches (so value_index
376   // will be invalid).
377   //
378   // REQUIRES: all terms in the lexicon to be valid utf8.
379   struct OriginalMatch {
380     uint32_t value_index;
381     std::string orig;
382 
OriginalMatchOriginalMatch383     OriginalMatch() : value_index(kInvalidValueIndex) {}
384 
is_full_matchOriginalMatch385     bool is_full_match() const { return value_index != kInvalidValueIndex; }
386   };
387 
388   static constexpr int kNoBranchFound = -1;
389   // Return prefix of any new branches created if key were inserted. If utf8 is
390   // true, does not cut key mid-utf8. Returns kNoBranchFound if no branches
391   // would be created.
392   int FindNewBranchingPrefixLength(const char *key, bool utf8) const;
393 
394   // Find all prefixes of key where the trie branches. Excludes the key
395   // itself. If utf8 is true, does not cut key mid-utf8.
396   std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const;
397 
398   void GetDebugInfo(int verbosity, std::string *out) const override;
399 
400   double min_free_fraction() const;
401 
402   uint32_t value_size() const;
403 
404   uint32_t max_value_index() const;
405 
406   // If in kMapSharedWithCrc mode, update crcs and return the master
407   // crc, else return kNoCrc. This crc includes both the trie files
408   // and property bitmaps.
409   uint32_t UpdateCrc();
410 
411   // Store dynamic properties for each value.  When a property is added to
412   // a value, the deleted flag is cleared for it (if it was previously set).
413   bool SetProperty(uint32_t value_index, uint32_t property_id);
414   bool ClearProperty(uint32_t value_index, uint32_t property_id);
415 
416   // Store deleted property for each value.
417   // This method is not the only way the deleted property can be set; the trie
418   // may set this property itself during other operations if it can determine a
419   // value becomes superfluous.
420   bool SetDeleted(uint32_t value_index);
421 
422   // Clears the deleted property for each value.
423   bool ClearDeleted(uint32_t value_index);
424 
425   // Deletes the entry associated with the key. Data can not be recovered after
426   // the deletion. Returns true on success.
427   bool Delete(std::string_view key);
428 
429   // Clear a specific property id from all values.  For each value that has this
430   // property cleared, also check to see if it was the only property set;  if
431   // so, set the deleted property for the value to indicate it no longer has any
432   // properties associated with it.
433   bool ClearPropertyForAllValues(uint32_t property_id);
434 
435   // Access properties. Usage:
436   //
437   // IcingDynamicTrie::PropertyReader reader(trie, 10);
438   // char value[SIZE];
439   // uint32_t value_index;
440   // if (trie.Find("abc", value, &value_index) &&
441   //     reader.HasProperty(value_index)) {
442   //     ...
443   // }
444   //
445   // Readers are valid as long as the underlying trie is open.
446   class PropertyReaderBase {
447    public:
448     // Whether underlying file exists.
449     bool Exists() const;
450 
451     // Returns false for all values if underlying file is missing.
452     bool HasProperty(uint32_t value_index) const;
453 
454    protected:
455     PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted,
456                        uint32_t property_id);
457 
458     // Does not own.
459     const IcingFlashBitmap *bitmap_;
460     const IcingDynamicTrie &trie_;
461   };
462 
463   // Reader for a given property. It is invalidated when the underlying property
464   // is deleted, or the trie is closed.
465   class PropertyReader : public PropertyReaderBase {
466    public:
PropertyReader(const IcingDynamicTrie & trie,uint32_t property_id)467     PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id)
468         : PropertyReaderBase(trie, false, property_id) {}
469   };
470 
471   // Reader for the deleted property. It is invalidated when the trie is closed.
472   class PropertyDeletedReader : public PropertyReaderBase {
473    public:
PropertyDeletedReader(const IcingDynamicTrie & trie)474     explicit PropertyDeletedReader(const IcingDynamicTrie &trie)
475         : PropertyReaderBase(trie, true, 0) {}
476   };
477 
478   // Reader for all properties (but not the deleted one). It is invalidated when
479   // the trie is closed.
480   class PropertyReadersAll {
481    public:
482     explicit PropertyReadersAll(const IcingDynamicTrie &trie);
483 
484     // Whether underlying file for property_id exists.
485     bool Exists(uint32_t property_id) const;
486 
487     // Returns false if underlying file or property doesn't exist.
488     bool HasProperty(uint32_t property_id, uint32_t value_index) const;
489 
490     // Returns true if the value at value_index is set for the only the supplied
491     // property_id, and none of the other properties.
492     bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const;
493 
494     // For iterating.
495     size_t size() const;
496 
497    private:
498     const IcingDynamicTrie &trie_;
499   };
500 
501   // Iterate through trie in lexicographic order.
502   //
503   // Not thread-safe.
504   //
505   // Change in underlying trie invalidates iterator.
506   class Iterator {
507    public:
508     Iterator(const IcingDynamicTrie &trie, const char *prefix);
509     void Reset();
510     bool Advance();
511 
512     // If !IsValid(), GetKey() will return NULL and GetValue() will
513     // return 0.
514     bool IsValid() const;
515     const char *GetKey() const;
516     // This points directly to the underlying data and is valid while
517     // the trie is alive. We keep ownership of the pointer.
518     const void *GetValue() const;
519     uint32_t GetValueIndex() const;
520 
521    private:
522     Iterator();
523     // Copy is ok.
524 
525     // Helper function that takes the left-most branch down
526     // intermediate nodes to a leaf.
527     void LeftBranchToLeaf(uint32_t node_index);
528 
529     std::string cur_key_;
530     const char *cur_suffix_;
531     int cur_suffix_len_;
532     struct Branch {
533       uint32_t node_idx;
534       int child_idx;
535 
BranchBranch536       explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {}
537     };
538     std::vector<Branch> branch_stack_;
539     bool single_leaf_match_;
540 
541     const IcingDynamicTrie &trie_;
542   };
543 
544   // Represents a non-leaf node or a "virtual" trie node in the suffix
545   // region.
546   struct LogicalNode {
547     const Node *node;
548     int suffix_offset;
549 
LogicalNodeLogicalNode550     LogicalNode() : node(nullptr), suffix_offset(0) {}
LogicalNodeLogicalNode551     LogicalNode(const Node *node_in, int suffix_offset_in)
552         : node(node_in), suffix_offset(suffix_offset_in) {}
553   };
554 
555   // Iterate over all utf8 chars in the trie anchored at prefix (or
556   // node). If trie has invalid utf8 chars, behavior is undefined (but
557   // won't crash).
558   class Utf8Iterator {
559    public:
560     void Reset();
561     bool Advance();
562 
563     bool IsValid() const;
564 
565    private:
566     struct Branch {
567       const Node *node;
568       const Next *child;
569       const Next *child_end;
570 
571       bool IsFinished();
572     };
573 
574     Utf8Iterator();
575     // Copy is ok.
576 
577     void LeftBranchToUtf8End();
578     void InitBranch(Branch *branch, const Node *start, char key_char);
579     void GoIntoSuffix(const Node *node);
580 
581     char cur_[U8_MAX_LENGTH + 1];  // NULL-terminated
582     int cur_len_;
583     LogicalNode cur_logical_node_;
584 
585     Branch branch_stack_[U8_MAX_LENGTH];
586     Branch *branch_end_;
587 
588     const IcingDynamicTrie &trie_;
589     const Node *start_node_;
590   };
591 
592  private:
593   class CandidateSet;
594 
595   // For testing only.
596   friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test;
597   friend class IcingDynamicTrieTest_SyncErrorRecovery_Test;
598   friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test;
599   void GetHeader(IcingDynamicTrieHeader *hdr) const;
600   void SetHeader(const IcingDynamicTrieHeader &new_hdr);
601 
602   static const uint32_t kInvalidSuffixIndex;
603 
604   // Stats helpers.
605   void CollectStatsRecursive(const Node &node, Stats *stats) const;
606 
607   // Helpers for Find and Insert.
608   const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
609   const Next *LowerBound(const Next *start, const Next *end,
610                          uint8_t key_char) const;
611   void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
612                     bool prefix, bool utf8 = false) const;
613 
614   // For value properties.  This truncates the data by clearing it, but leaving
615   // the storage intact.
616   bool InitPropertyBitmaps();
617 
618   // Returns a pointer to a bitmap that is successfully opened.
619   static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap(
620       const std::string &filename, bool verify,
621       const IcingFilesystem *filesystem);
622 
623   // Returns a pointer to a writable bitmap, creating it if necessary.  Returned
624   // pointer should not be freed, it will be maintained by property_bitmaps_.
625   // Returns null if bitmap failed to load.
626   IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id);
627 
628   uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const;
629 
630   const std::string filename_base_;
631   bool is_initialized_;
632   const RuntimeOptions runtime_options_;
633   std::unique_ptr<IcingDynamicTrieStorage> storage_;
634   const std::string property_bitmaps_prefix_;
635   std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_;
636   const std::string deleted_bitmap_filename_;
637   std::unique_ptr<IcingFlashBitmap> deleted_bitmap_;
638   const IcingFilesystem *const filesystem_;
639 };
640 
641 }  // namespace lib
642 }  // namespace icing
643 
644 #endif  // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
645