1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_ 16 #define ICING_INDEX_FLASH_INDEX_STORAGE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/absl_ports/canonical_errors.h" 24 #include "icing/file/filesystem.h" 25 #include "icing/index/main/flash-index-storage-header.h" 26 #include "icing/index/main/index-block.h" 27 #include "icing/index/main/posting-list-free.h" 28 #include "icing/index/main/posting-list-identifier.h" 29 #include "icing/index/main/posting-list-used.h" 30 #include "icing/legacy/core/icing-packed-pod.h" 31 #include "icing/store/document-id.h" 32 33 namespace icing { 34 namespace lib { 35 36 // The PostingListHolder struct exists to group together related PostingListUsed 37 // IndexBlock pairs and their ids. 38 struct PostingListHolder { 39 // PostingListUseds interpret data that they themselves do NOT own. The data 40 // being interpreted is stored on a flash block and its memory mapping is 41 // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must 42 // NOT exceed the lifecycle of the IndexBlock. 43 PostingListUsed posting_list; 44 IndexBlock block; 45 // The PostingListIdentifier, which identifies both the IndexBlock and the 46 // PostingListUsed, is also returned for convenience. 47 PostingListIdentifier id; 48 }; 49 50 // The FlashIndexStorage class manages the actual file that makes up the index. 51 // It allocates IndexBlocks as needed and maintains freelists to prevent 52 // excessive block fragmentation. 53 // 54 // It maintains two types of free lists: 55 // 1. On-disk, Header free list - This free list is stored in the Header 56 // block. There is a free list for every possible posting list size. Each 57 // entry for a posting list size contains the block_index of the 58 // IndexBlock that starts the free list chain. Each IndexBlock in the free 59 // list chain stores the index of the next IndexBlock in the chain. 60 // 2. In-memory free list - Like the Header free list, there is a free list of 61 // every possible posting list size. This free list contains not just the 62 // block_index of the available IndexBlock, but also the posting_list_index 63 // of the available PostingListUsed within the IndexBlock. This is because, 64 // unlike the Header free list, PostingListUseds are not actually freed 65 // when added to this free list. 66 // 67 // Whether or not the in-memory free list is used can be chosen via the 68 // in_memory param to the Create factory function. 69 // 70 // The advantage of using the in-memory free list is that it reduces the amount 71 // of flash writes made while editing the index (because actually freeing the 72 // PostingLists would require writing to that flash block). The disadvantage is 73 // that it introduces code complexity and potentially leaks blocks if power is 74 // lost or if FlashIndexStorage is destroyed before emptying the free list. 75 class FlashIndexStorage { 76 public: 77 // Creates a FlashIndexStorage at index_filename. in_memory determines whether 78 // or not the FlashIndexStorage maintains an in-memory freelist in order to 79 // avoid writes to the on-disk freelist. 80 // 81 // RETURNS: 82 // - On success, a valid instance of FlashIndexStorage 83 // - INTERNAL error if unable to create a new header or read the existing 84 // one from disk. 85 static libtextclassifier3::StatusOr<FlashIndexStorage> Create( 86 const std::string& index_filename, const Filesystem* filesystem, 87 bool in_memory = true); 88 89 // Retrieve the PostingList referred to by PostingListIdentifier. This posting 90 // list must have been previously allocated by a prior call to 91 // AllocatePostingList. 92 // 93 // RETURNS: 94 // - On success, a valid instance of PostingListHolder containing the 95 // requested PostingListUsed. 96 // - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the 97 // IndexBlock referred to by id.block_index() 98 // - INTERNAL_ERROR if unable to access the region in file. 99 libtextclassifier3::StatusOr<PostingListHolder> GetPostingList( 100 PostingListIdentifier id) const; 101 102 // Allocates and returns a PostingListHolder containing a PostingListUsed that 103 // can fit min_posting_list_bytes. 104 // 105 // RETURNS: 106 // - On success, a valid instance of PostingListHolder containing the 107 // requested PostingListUsed. 108 // - RESOURCE_EXHAUSTED error if unable to grow the index to create a 109 // PostingListUsed of the requested size. 110 libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList( 111 uint32_t min_posting_list_bytes); 112 113 ~FlashIndexStorage(); 114 FlashIndexStorage(FlashIndexStorage&&) = default; 115 FlashIndexStorage(const FlashIndexStorage&) = delete; 116 FlashIndexStorage& operator=(FlashIndexStorage&&) = default; 117 FlashIndexStorage& operator=(const FlashIndexStorage&) = delete; 118 119 // Free the PostingListUsed that this holder holds. 120 void FreePostingList(PostingListHolder holder); 121 122 // Used to track the largest docid indexed in the index. get_last_indexed_docid()123 DocumentId get_last_indexed_docid() const { 124 return header_block_->header()->last_indexed_docid; 125 } set_last_indexed_docid(DocumentId docid)126 void set_last_indexed_docid(DocumentId docid) { 127 header_block_->header()->last_indexed_docid = docid; 128 } 129 130 // Updates the header and persists all changes to the index to disk. Returns 131 // true on success. 132 bool PersistToDisk(); 133 134 // Returns the size of the index file in bytes. GetDiskUsage()135 int64_t GetDiskUsage() const { 136 return filesystem_->GetDiskUsage(block_fd_.get()); 137 } 138 139 // Returns the size of the index file used to contains hits. GetElementsSize()140 uint64_t GetElementsSize() const { 141 // Element size is the same as disk size excluding the header block. 142 return GetDiskUsage() - block_size(); 143 } 144 num_blocks()145 int num_blocks() const { return num_blocks_; } 146 147 // Info about the index based on the block size. block_size()148 int block_size() const { return header_block_->header()->block_size; } 149 150 // Num blocks starts at 1 since the first block is the header. empty()151 bool empty() const { return num_blocks_ <= 1; } 152 153 // The percentage of the maximum index size that is free. Allocated blocks are 154 // treated as fully used, even if they are only partially used. In this way, 155 // min_free_fraction is a lower bound of available space. min_free_fraction()156 double min_free_fraction() const { 157 return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex; 158 } 159 160 libtextclassifier3::Status Reset(); 161 162 void GetDebugInfo(int verbosity, std::string* out) const; 163 164 private: 165 FlashIndexStorage(const std::string& index_filename, 166 const Filesystem* filesystem, bool has_in_memory_freelists); 167 168 // Init the index from persistence. Create if file does not exist. We do not 169 // erase corrupt files. 170 // 171 // Returns false if unable to create a new header or if the existing one is 172 // corrupt. 173 bool Init(); 174 175 // Create or open the header block. Returns true on success. 176 bool InitHeader(); 177 178 // Create a new header block for an empty index file. 179 bool CreateHeader(); 180 181 // Loads the header stored at the beginning of the index file and validates 182 // the values stored in it. 183 bool OpenHeader(int64_t file_size); 184 185 // Add the IndexBlock referred to by block_index in the on-disk free list with 186 // index block_info_index. 187 void AddToOnDiskFreeList(uint32_t block_index, int block_info_index, 188 IndexBlock* index_block); 189 190 // Remove the IndexBlock referred to by block_index from the Header free list 191 // with index block_info_index. 192 void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index, 193 IndexBlock* index_block); 194 195 // Returns: 196 // - On success, a valid PostingListHolder created from the first entry of 197 // the in-memory freelist at block_info_index 198 // - NOT_FOUND if there was no entry in the freelist 199 // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be 200 // allocated for some reason. 201 libtextclassifier3::StatusOr<PostingListHolder> 202 GetPostingListFromInMemoryFreeList(int block_info_index); 203 204 // Returns: 205 // - On success, a valid PostingListHolder created from the first entry of 206 // the on-disk freelist at block_info_index 207 // - NOT_FOUND if there was no entry in the freelist 208 // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be 209 // allocated for some reason. 210 libtextclassifier3::StatusOr<PostingListHolder> 211 GetPostingListFromOnDiskFreeList(int block_info_index); 212 213 // Returns: 214 // - On success, a valid PostingListHolder created from a newly allocated 215 // IndexBlock. 216 // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new 217 // IndexBlock. 218 libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList( 219 int block_info_index); 220 221 // Returns: 222 // - On success, a newly created IndexBlock at block_index with posting 223 // lists of size posting_list_size 224 // - INTERNAL_ERROR if unable to access the region in file representing the 225 // IndexBlock 226 libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock( 227 int block_index, uint32_t posting_list_size) const; 228 229 // Returns: 230 // - On success, the IndexBlock that exists at block_index 231 // - INTERNAL_ERROR if unable to access the region in file representing the 232 // IndexBlock 233 libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const; 234 235 // Add a new block to the end of the file and return its block 236 // index. Returns kInvalidBlockIndex if unable to grow the index file. 237 int GrowIndex(); 238 239 // Return the index into index_block_infos of the smallest posting_list free 240 // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds 241 // the max-sized posting list. 242 int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const; 243 244 // Flushes the in-memory free list to disk. 245 void FlushInMemoryFreeList(); 246 247 // Underlying filename. 248 std::string index_filename_; 249 250 // We open the index file into this fd. 251 ScopedFd block_fd_; 252 int num_blocks_; // can be inferred from index file size 253 254 std::unique_ptr<HeaderBlock> header_block_; 255 256 // In-memory cache of free posting lists. 257 struct FreeList { 258 // Experimentally determined that high watermark for largest 259 // freelist was ~3500. 260 static constexpr size_t kMaxSize = 4096; 261 262 // Push a new PostingListIdentifier if there is space. 263 void Push(PostingListIdentifier id); 264 265 // Attempt to pop a PostingListIdentifier. 266 // 267 // RETURNS: 268 // - identifier of a free posting list, on success 269 // - NOT_FOUND if there are no free posting lists on this free list. 270 libtextclassifier3::StatusOr<PostingListIdentifier> TryPop(); 271 272 std::string DebugString() const; 273 274 private: 275 std::vector<PostingListIdentifier> free_list_; 276 int free_list_size_high_watermark_ = 0; 277 int num_dropped_free_list_entries_ = 0; 278 }; 279 std::vector<FreeList> in_memory_freelists_; 280 281 const Filesystem* filesystem_; // not owned; can't be null 282 283 bool has_in_memory_freelists_; 284 }; 285 286 } // namespace lib 287 } // namespace icing 288 289 #endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_ 290