1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Copyright 2011 Google Inc. All Rights Reserved. 16 // Author: ulas@google.com (Ulas Kirazci) 17 // 18 // Trie for word prefix lookups. Features: 19 // 20 // - Dynamic additions (but not deletions) 21 // - Low memory usage 22 // - Reasonable latency but not QPS 23 // - Revive from persistence is a disk read 24 // - Stores a 4-byte value associated with every key 25 // 26 // Associated with each value in the trie is a set of property ids. For 27 // efficiency, property ids should start at 0 and be densely packed. A value 28 // may have more than one id set. There is an additional deleted property 29 // for each value, which is set only when all the property ids associated with a 30 // value have been cleared. In the flash_index, property ids are used to track 31 // corpus ids. 32 // 33 // Not thread-safe. 34 35 #ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 36 #define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 37 38 #include <stdint.h> 39 40 #include <memory> 41 #include <string> 42 #include <unordered_map> 43 #include <vector> 44 45 #include "icing/legacy/core/icing-compat.h" 46 #include "icing/legacy/core/icing-packed-pod.h" 47 #include "icing/legacy/index/icing-filesystem.h" 48 #include "icing/legacy/index/icing-mmapper.h" 49 #include "icing/legacy/index/icing-storage.h" 50 #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h" 51 #include "icing/util/i18n-utils.h" 52 #include "unicode/utf8.h" 53 54 namespace icing { 55 namespace lib { 56 57 class IcingFlashBitmap; 58 59 class IcingDynamicTrie : public IIcingStorage { 60 class Dumper; 61 class IcingDynamicTrieStorage; 62 63 public: 64 // Adjacent bit fields are usually packed automatically. However, that is 65 // implementation specific: 66 // http://en.cppreference.com/w/cpp/language/bit_field 67 // So we'll set packed to be explicit. 68 class Node { 69 public: 70 // This object is only ever used by an ArrayStorage, which allocates 71 // sizeof(Node) bytes, zeroes them out and then casts to a Node. 72 Node() = delete; 73 next_index()74 uint32_t next_index() const { return next_index_; } set_next_index(uint32_t next_index)75 void set_next_index(uint32_t next_index) { next_index_ = next_index; } 76 is_leaf()77 bool is_leaf() const { return is_leaf_; } set_is_leaf(bool is_leaf)78 void set_is_leaf(bool is_leaf) { is_leaf_ = is_leaf; } 79 log2_num_children()80 uint8_t log2_num_children() const { return log2_num_children_; } set_log2_num_children(uint8_t log2_num_children)81 void set_log2_num_children(uint8_t log2_num_children) { 82 log2_num_children_ = log2_num_children; 83 } 84 85 private: 86 uint32_t next_index_ : 27; 87 uint32_t is_leaf_ : 1; 88 uint32_t log2_num_children_ : 4; 89 } __attribute__((packed)); 90 static_assert(sizeof(Node) == 4, ""); 91 static_assert(icing_is_packed_pod<Node>::value, "go/icing-ubsan"); 92 93 // Adjacent bit fields are usually packed automatically. However, that is 94 // implementation specific: 95 // http://en.cppreference.com/w/cpp/language/bit_field. 96 // So we'll set packed to be explicit. 97 union Next { Next(uint8_t val,uint32_t node_index)98 Next(uint8_t val, uint32_t node_index) { 99 used.val = val; 100 used.node_index = node_index; 101 } 102 val()103 uint8_t val() const { return used.val; } set_val(uint8_t val)104 void set_val(uint8_t val) { used.val = val; } 105 node_index()106 uint32_t node_index() const { return used.node_index; } set_node_index(uint32_t node_index)107 void set_node_index(uint32_t node_index) { used.node_index = node_index; } 108 next_index()109 uint32_t next_index() const { return freelink.next_index; } set_next_index(uint32_t next_index)110 void set_next_index(uint32_t next_index) { 111 freelink.next_index = next_index; 112 } 113 114 bool operator<(const Next &next2) const { 115 if (val() == next2.val()) { 116 return node_index() < next2.node_index(); 117 } 118 return val() < next2.val(); 119 } 120 121 private: 122 // This object is only ever used by an ArrayStorage, which allocates 123 // sizeof(Node) bytes, zeroes them out and then casts to a Node. 124 Next() = default; 125 126 struct { 127 uint32_t val : 8; 128 uint32_t node_index : 24; 129 } used; 130 struct { 131 uint32_t next_index : 32; 132 } freelink; 133 } __attribute__((packed)); 134 static_assert(sizeof(Next) == 4, ""); 135 static_assert(sizeof(Next) % alignof(Next) == 0, ""); 136 static_assert(icing_is_packed_pod<Next>::value, "go/icing-ubsan"); 137 138 static const int kMaxNextArraySize = 256; 139 static const int kNumNextAllocationBuckets = 9; // [log2(1), log2(256)] 140 141 static const uint32_t kMaxPropertyId = (1 << 16) - 1; 142 143 static const uint32_t kInvalidValueIndex = 0; 144 145 static const uint32_t kNoCrc = 0; 146 147 struct Stats { 148 uint32_t num_keys; 149 150 // Node stats 151 152 uint32_t num_nodes; 153 uint32_t max_nodes; 154 // Count of intermediate nodes. 155 uint32_t num_intermediates; 156 // Count of leaf nodes. 157 uint32_t num_leaves; 158 159 // Next stats 160 161 uint32_t num_nexts; 162 uint32_t max_nexts; 163 // Count of next arrays by size. 164 uint32_t child_counts[kMaxNextArraySize]; 165 // Wasted next array space per allocation bucket (in Nexts, not 166 // bytes). 167 uint32_t wasted[kNumNextAllocationBuckets]; 168 // Sum of wasted array. 169 uint32_t total_wasted; 170 171 // Suffix stats 172 173 uint32_t suffixes_size; 174 uint32_t max_suffixes_size; 175 // Bytes actually used by suffixes. 176 uint32_t suffixes_used; 177 // Number of suffixes that are just empty strings. 178 uint32_t null_suffixes; 179 180 // Next free-list stats 181 uint32_t num_free[kNumNextAllocationBuckets]; 182 // Total Next nodes free (weighted sum of the above). 183 uint32_t total_free; 184 185 // Dirty pages. 186 uint32_t dirty_pages_nodes; 187 uint32_t dirty_pages_nexts; 188 uint32_t dirty_pages_suffixes; 189 190 std::string DumpStats(int verbosity) const; 191 }; 192 193 // Options when creating the trie. Maximums for the node/next/suffix 194 // arrays must be specified in advance. 195 struct Options { 196 // Absolute maximums. 197 static const uint32_t kMaxNodes, kMaxNexts, kMaxSuffixesSize, kMaxValueSize; 198 199 // The default takes 13MB of memory and can take about 1M English 200 // words. OptionsOptions201 Options() 202 : max_nodes(1U << 20), 203 max_nexts(1U << 20), 204 max_suffixes_size(5U << 20), 205 value_size(sizeof(uint32_t)) {} OptionsOptions206 Options(uint32_t max_nodes_in, uint32_t max_nexts_in, 207 uint32_t max_suffixes_size_in, uint32_t value_size_in) 208 : max_nodes(max_nodes_in), 209 max_nexts(max_nexts_in), 210 max_suffixes_size(max_suffixes_size_in), 211 value_size(value_size_in) {} 212 213 uint32_t max_nodes; 214 uint32_t max_nexts; 215 uint32_t max_suffixes_size; 216 uint32_t value_size; 217 218 // True if options do not exceed absolute maximums. 219 bool is_valid() const; 220 }; 221 222 // These can be supplied during runtime, as opposed to the persisted 223 // Options above. 224 struct RuntimeOptions { 225 enum StoragePolicy { 226 // Changes are reflected in the underlying file immediately but 227 // more vulnerable to corruption. 228 kMapSharedWithCrc, 229 230 // Changes only applied during Flush. Smaller window of 231 // vulnerability to corruption. 232 kExplicitFlush, 233 }; 234 set_storage_policyRuntimeOptions235 RuntimeOptions &set_storage_policy(StoragePolicy sp) { 236 storage_policy = sp; 237 return *this; 238 } 239 240 StoragePolicy storage_policy = kExplicitFlush; 241 }; 242 max_value_index(const Options & options)243 static uint32_t max_value_index(const Options &options) { 244 return options.max_suffixes_size; 245 } 246 247 // Light-weight constructor. Real work happens in Create or Init. 248 IcingDynamicTrie(const std::string &filename_base, 249 const RuntimeOptions &runtime_options, 250 const IcingFilesystem *filesystem); 251 ~IcingDynamicTrie() override; 252 is_initialized()253 bool is_initialized() const { return is_initialized_; } 254 255 // Create, but do not Init, a new trie with options if the file does 256 // not already exist. 257 // 258 // Returns true if successfully created all files or files already 259 // exist. Does not do a complete sanity check for when files seem to 260 // exist. Cleans up files if creation fails midstream. 261 bool CreateIfNotExist(const Options &options); 262 UpgradeTo(int new_version)263 bool UpgradeTo(int new_version) override { return true; } 264 bool Init() override; 265 void Close() override; 266 bool Remove() override; 267 uint64_t GetDiskUsage() const override; 268 269 // Returns the size of the elements held in the trie. This excludes the size 270 // of any internal metadata of the trie, e.g. the trie's header. 271 uint64_t GetElementsSize() const; 272 273 // REQUIRED: For all functions below is_initialized() == true. 274 275 // Number of keys in trie. 276 uint32_t size() const; 277 278 // Collecting stats. 279 void CollectStats(Stats *stats) const; 280 281 // Gets all of the contents of the trie for debugging purposes. Note: this 282 // stores the entire set of terms in memory. 283 // pretty_print - The tree structure of the trie will be written to this. 284 // keys - All keys in the trie are appended to this vector. 285 void DumpTrie(std::ostream *pretty_print, 286 std::vector<std::string> *keys) const; 287 288 // Empty out the trie without closing or removing. 289 void Clear(); 290 291 // Clears the suffix and value at the given index. Returns true on success. 292 bool ClearSuffixAndValue(uint32_t suffix_value_index); 293 294 // Resets the next at the given index so that it points to no node. 295 // Returns true on success. 296 bool ResetNext(uint32_t next_index); 297 298 // Sorts the next array of the node. Returns true on success. 299 bool SortNextArray(const Node *node); 300 301 // Sync to disk. 302 bool Sync() override; 303 304 // Tell kernel we will access the memory shortly. 305 void Warm() const; 306 307 // Potentially about to get nuked. 308 void OnSleep() override; 309 310 // Compact trie into out for value indices present in old_tvi_to_new_value. 311 class NewValueMap { 312 public: 313 virtual ~NewValueMap(); 314 315 // Returns the new value we want to assign to the entry at old 316 // value index. We don't take ownership of the pointer. 317 virtual const void *GetNewValue(uint32_t old_value_index) const = 0; 318 }; 319 // Compacts this trie. This drops all deleted keys, drops all keys for which 320 // old_tvi_to_new_value returns nullptr, updates values to be the values 321 // returned by old_tvi_to_new_value, rewrites tvis, and saves the results into 322 // the trie given in 'out'. 'old_to_new_tvi' is be populated with a mapping of 323 // old value_index to new value_index. 324 bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out, 325 std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const; 326 327 // Insert value at key. If key already exists and replace == true, 328 // replaces old value with value. We take a copy of value. 329 // 330 // If value_index is not NULL, returns a pointer to value in 331 // value_index. This can then be used with SetValueAtIndex 332 // below. value_index is not valid past a Clear/Read/Write. 333 // 334 // Returns false if there is no space left in the trie. 335 // 336 // REQUIRES: value a buffer of size value_size() Insert(const char * key,const void * value)337 bool Insert(const char *key, const void *value) { 338 return Insert(key, value, nullptr, true, nullptr); 339 } Insert(const char * key,const void * value,uint32_t * value_index,bool replace)340 bool Insert(const char *key, const void *value, uint32_t *value_index, 341 bool replace) { 342 return Insert(key, value, value_index, replace, nullptr); 343 } 344 bool Insert(const char *key, const void *value, uint32_t *value_index, 345 bool replace, bool *pnew_key); 346 347 // Get a value returned by Insert value_index. This points to the 348 // value in the trie. The pointer is immutable and always valid 349 // while the trie is alive. 350 const void *GetValueAtIndex(uint32_t value_index) const; 351 352 // Set a value returned by Insert value_index. We take a copy of 353 // value. 354 // 355 // REQUIRES: value a buffer of size value_size() 356 void SetValueAtIndex(uint32_t value_index, const void *value); 357 358 // Returns true if key is found and sets value. If value_index is 359 // not NULL, returns value_index (see Insert discussion above). 360 // If the key is not found, returns false and neither value nor 361 // value_index is modified. 362 // 363 // REQUIRES: value a buffer of size value_size() Find(const char * key,void * value)364 bool Find(const char *key, void *value) const { 365 return Find(key, value, nullptr); 366 } 367 bool Find(const char *key, void *value, uint32_t *value_index) const; 368 369 // Find the input key and all keys that are a variant of the input 370 // key according to a variant map. Currently supports 371 // transliteration. For example "a" is a variant for "à" or "á" so 372 // an "a" in the input key can match those characters in the trie in 373 // addition to itself. 374 // 375 // If prefix is set, also returns any prefix matches (so value_index 376 // will be invalid). 377 // 378 // REQUIRES: all terms in the lexicon to be valid utf8. 379 struct OriginalMatch { 380 uint32_t value_index; 381 std::string orig; 382 OriginalMatchOriginalMatch383 OriginalMatch() : value_index(kInvalidValueIndex) {} 384 is_full_matchOriginalMatch385 bool is_full_match() const { return value_index != kInvalidValueIndex; } 386 }; 387 388 static constexpr int kNoBranchFound = -1; 389 // Return prefix of any new branches created if key were inserted. If utf8 is 390 // true, does not cut key mid-utf8. Returns kNoBranchFound if no branches 391 // would be created. 392 int FindNewBranchingPrefixLength(const char *key, bool utf8) const; 393 394 // Find all prefixes of key where the trie branches. Excludes the key 395 // itself. If utf8 is true, does not cut key mid-utf8. 396 std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const; 397 398 void GetDebugInfo(int verbosity, std::string *out) const override; 399 400 double min_free_fraction() const; 401 402 uint32_t value_size() const; 403 404 uint32_t max_value_index() const; 405 406 // If in kMapSharedWithCrc mode, update crcs and return the master 407 // crc, else return kNoCrc. This crc includes both the trie files 408 // and property bitmaps. 409 uint32_t UpdateCrc(); 410 411 // Store dynamic properties for each value. When a property is added to 412 // a value, the deleted flag is cleared for it (if it was previously set). 413 bool SetProperty(uint32_t value_index, uint32_t property_id); 414 bool ClearProperty(uint32_t value_index, uint32_t property_id); 415 416 // Store deleted property for each value. 417 // This method is not the only way the deleted property can be set; the trie 418 // may set this property itself during other operations if it can determine a 419 // value becomes superfluous. 420 bool SetDeleted(uint32_t value_index); 421 422 // Clears the deleted property for each value. 423 bool ClearDeleted(uint32_t value_index); 424 425 // Deletes the entry associated with the key. Data can not be recovered after 426 // the deletion. Returns true on success. 427 bool Delete(std::string_view key); 428 429 // Clear a specific property id from all values. For each value that has this 430 // property cleared, also check to see if it was the only property set; if 431 // so, set the deleted property for the value to indicate it no longer has any 432 // properties associated with it. 433 bool ClearPropertyForAllValues(uint32_t property_id); 434 435 // Access properties. Usage: 436 // 437 // IcingDynamicTrie::PropertyReader reader(trie, 10); 438 // char value[SIZE]; 439 // uint32_t value_index; 440 // if (trie.Find("abc", value, &value_index) && 441 // reader.HasProperty(value_index)) { 442 // ... 443 // } 444 // 445 // Readers are valid as long as the underlying trie is open. 446 class PropertyReaderBase { 447 public: 448 // Whether underlying file exists. 449 bool Exists() const; 450 451 // Returns false for all values if underlying file is missing. 452 bool HasProperty(uint32_t value_index) const; 453 454 protected: 455 PropertyReaderBase(const IcingDynamicTrie &trie, bool deleted, 456 uint32_t property_id); 457 458 // Does not own. 459 const IcingFlashBitmap *bitmap_; 460 const IcingDynamicTrie &trie_; 461 }; 462 463 // Reader for a given property. It is invalidated when the underlying property 464 // is deleted, or the trie is closed. 465 class PropertyReader : public PropertyReaderBase { 466 public: PropertyReader(const IcingDynamicTrie & trie,uint32_t property_id)467 PropertyReader(const IcingDynamicTrie &trie, uint32_t property_id) 468 : PropertyReaderBase(trie, false, property_id) {} 469 }; 470 471 // Reader for the deleted property. It is invalidated when the trie is closed. 472 class PropertyDeletedReader : public PropertyReaderBase { 473 public: PropertyDeletedReader(const IcingDynamicTrie & trie)474 explicit PropertyDeletedReader(const IcingDynamicTrie &trie) 475 : PropertyReaderBase(trie, true, 0) {} 476 }; 477 478 // Reader for all properties (but not the deleted one). It is invalidated when 479 // the trie is closed. 480 class PropertyReadersAll { 481 public: 482 explicit PropertyReadersAll(const IcingDynamicTrie &trie); 483 484 // Whether underlying file for property_id exists. 485 bool Exists(uint32_t property_id) const; 486 487 // Returns false if underlying file or property doesn't exist. 488 bool HasProperty(uint32_t property_id, uint32_t value_index) const; 489 490 // Returns true if the value at value_index is set for the only the supplied 491 // property_id, and none of the other properties. 492 bool IsPropertyUnique(uint32_t property_id, uint32_t value_index) const; 493 494 // For iterating. 495 size_t size() const; 496 497 private: 498 const IcingDynamicTrie &trie_; 499 }; 500 501 // Iterate through trie in lexicographic order. 502 // 503 // Not thread-safe. 504 // 505 // Change in underlying trie invalidates iterator. 506 class Iterator { 507 public: 508 Iterator(const IcingDynamicTrie &trie, const char *prefix); 509 void Reset(); 510 bool Advance(); 511 512 // If !IsValid(), GetKey() will return NULL and GetValue() will 513 // return 0. 514 bool IsValid() const; 515 const char *GetKey() const; 516 // This points directly to the underlying data and is valid while 517 // the trie is alive. We keep ownership of the pointer. 518 const void *GetValue() const; 519 uint32_t GetValueIndex() const; 520 521 private: 522 Iterator(); 523 // Copy is ok. 524 525 // Helper function that takes the left-most branch down 526 // intermediate nodes to a leaf. 527 void LeftBranchToLeaf(uint32_t node_index); 528 529 std::string cur_key_; 530 const char *cur_suffix_; 531 int cur_suffix_len_; 532 struct Branch { 533 uint32_t node_idx; 534 int child_idx; 535 BranchBranch536 explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {} 537 }; 538 std::vector<Branch> branch_stack_; 539 bool single_leaf_match_; 540 541 const IcingDynamicTrie &trie_; 542 }; 543 544 // Represents a non-leaf node or a "virtual" trie node in the suffix 545 // region. 546 struct LogicalNode { 547 const Node *node; 548 int suffix_offset; 549 LogicalNodeLogicalNode550 LogicalNode() : node(nullptr), suffix_offset(0) {} LogicalNodeLogicalNode551 LogicalNode(const Node *node_in, int suffix_offset_in) 552 : node(node_in), suffix_offset(suffix_offset_in) {} 553 }; 554 555 // Iterate over all utf8 chars in the trie anchored at prefix (or 556 // node). If trie has invalid utf8 chars, behavior is undefined (but 557 // won't crash). 558 class Utf8Iterator { 559 public: 560 void Reset(); 561 bool Advance(); 562 563 bool IsValid() const; 564 565 private: 566 struct Branch { 567 const Node *node; 568 const Next *child; 569 const Next *child_end; 570 571 bool IsFinished(); 572 }; 573 574 Utf8Iterator(); 575 // Copy is ok. 576 577 void LeftBranchToUtf8End(); 578 void InitBranch(Branch *branch, const Node *start, char key_char); 579 void GoIntoSuffix(const Node *node); 580 581 char cur_[U8_MAX_LENGTH + 1]; // NULL-terminated 582 int cur_len_; 583 LogicalNode cur_logical_node_; 584 585 Branch branch_stack_[U8_MAX_LENGTH]; 586 Branch *branch_end_; 587 588 const IcingDynamicTrie &trie_; 589 const Node *start_node_; 590 }; 591 592 private: 593 class CandidateSet; 594 595 // For testing only. 596 friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test; 597 friend class IcingDynamicTrieTest_SyncErrorRecovery_Test; 598 friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test; 599 void GetHeader(IcingDynamicTrieHeader *hdr) const; 600 void SetHeader(const IcingDynamicTrieHeader &new_hdr); 601 602 static const uint32_t kInvalidSuffixIndex; 603 604 // Stats helpers. 605 void CollectStatsRecursive(const Node &node, Stats *stats) const; 606 607 // Helpers for Find and Insert. 608 const Next *GetNextByChar(const Node *node, uint8_t key_char) const; 609 const Next *LowerBound(const Next *start, const Next *end, 610 uint8_t key_char) const; 611 void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset, 612 bool prefix, bool utf8 = false) const; 613 614 // For value properties. This truncates the data by clearing it, but leaving 615 // the storage intact. 616 bool InitPropertyBitmaps(); 617 618 // Returns a pointer to a bitmap that is successfully opened. 619 static std::unique_ptr<IcingFlashBitmap> OpenAndInitBitmap( 620 const std::string &filename, bool verify, 621 const IcingFilesystem *filesystem); 622 623 // Returns a pointer to a writable bitmap, creating it if necessary. Returned 624 // pointer should not be freed, it will be maintained by property_bitmaps_. 625 // Returns null if bitmap failed to load. 626 IcingFlashBitmap *OpenOrCreatePropertyBitmap(uint32_t property_id); 627 628 uint64_t ValueIndexToPropertyBitmapIndex(uint32_t value_index) const; 629 630 const std::string filename_base_; 631 bool is_initialized_; 632 const RuntimeOptions runtime_options_; 633 std::unique_ptr<IcingDynamicTrieStorage> storage_; 634 const std::string property_bitmaps_prefix_; 635 std::vector<std::unique_ptr<IcingFlashBitmap>> property_bitmaps_; 636 const std::string deleted_bitmap_filename_; 637 std::unique_ptr<IcingFlashBitmap> deleted_bitmap_; 638 const IcingFilesystem *const filesystem_; 639 }; 640 641 } // namespace lib 642 } // namespace icing 643 644 #endif // ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_ 645