1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_
16 #define ICING_INDEX_FLASH_INDEX_STORAGE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/absl_ports/canonical_errors.h"
24 #include "icing/file/filesystem.h"
25 #include "icing/index/main/flash-index-storage-header.h"
26 #include "icing/index/main/index-block.h"
27 #include "icing/index/main/posting-list-free.h"
28 #include "icing/index/main/posting-list-identifier.h"
29 #include "icing/index/main/posting-list-used.h"
30 #include "icing/legacy/core/icing-packed-pod.h"
31 #include "icing/store/document-id.h"
32 
33 namespace icing {
34 namespace lib {
35 
36 // The PostingListHolder struct exists to group together related PostingListUsed
37 // IndexBlock pairs and their ids.
38 struct PostingListHolder {
39   // PostingListUseds interpret data that they themselves do NOT own. The data
40   // being interpreted is stored on a flash block and its memory mapping is
41   // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must
42   // NOT exceed the lifecycle of the IndexBlock.
43   PostingListUsed posting_list;
44   IndexBlock block;
45   // The PostingListIdentifier, which identifies both the IndexBlock and the
46   // PostingListUsed, is also returned for convenience.
47   PostingListIdentifier id;
48 };
49 
50 // The FlashIndexStorage class manages the actual file that makes up the index.
51 // It allocates IndexBlocks as needed and maintains freelists to prevent
52 // excessive block fragmentation.
53 //
54 // It maintains two types of free lists:
55 //   1. On-disk, Header free list - This free list is stored in the Header
56 //      block. There is a free list for every possible posting list size. Each
57 //      entry for a posting list size contains the block_index of the
58 //      IndexBlock that starts the free list chain. Each IndexBlock in the free
59 //      list chain stores the index of the next IndexBlock in the chain.
60 //   2. In-memory free list - Like the Header free list, there is a free list of
61 //      every possible posting list size. This free list contains not just the
62 //      block_index of the available IndexBlock, but also the posting_list_index
63 //      of the available PostingListUsed within the IndexBlock. This is because,
64 //      unlike the Header free list, PostingListUseds are not actually freed
65 //      when added to this free list.
66 //
67 // Whether or not the in-memory free list is used can be chosen via the
68 // in_memory param to the Create factory function.
69 //
70 // The advantage of using the in-memory free list is that it reduces the amount
71 // of flash writes made while editing the index (because actually freeing the
72 // PostingLists would require writing to that flash block). The disadvantage is
73 // that it introduces code complexity and potentially leaks blocks if power is
74 // lost or if FlashIndexStorage is destroyed before emptying the free list.
75 class FlashIndexStorage {
76  public:
77   // Creates a FlashIndexStorage at index_filename. in_memory determines whether
78   // or not the FlashIndexStorage maintains an in-memory freelist in order to
79   // avoid writes to the on-disk freelist.
80   //
81   // RETURNS:
82   //   - On success, a valid instance of FlashIndexStorage
83   //   - INTERNAL error if unable to create a new header or read the existing
84   //     one from disk.
85   static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
86       const std::string& index_filename, const Filesystem* filesystem,
87       bool in_memory = true);
88 
89   // Retrieve the PostingList referred to by PostingListIdentifier. This posting
90   // list must have been previously allocated by a prior call to
91   // AllocatePostingList.
92   //
93   // RETURNS:
94   //   - On success, a valid instance of PostingListHolder containing the
95   //     requested PostingListUsed.
96   //   - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the
97   //     IndexBlock referred to by id.block_index()
98   //   - INTERNAL_ERROR if unable to access the region in file.
99   libtextclassifier3::StatusOr<PostingListHolder> GetPostingList(
100       PostingListIdentifier id) const;
101 
102   // Allocates and returns a PostingListHolder containing a PostingListUsed that
103   // can fit min_posting_list_bytes.
104   //
105   // RETURNS:
106   //   - On success, a valid instance of PostingListHolder containing the
107   //     requested PostingListUsed.
108   //   - RESOURCE_EXHAUSTED error if unable to grow the index to create a
109   //     PostingListUsed of the requested size.
110   libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList(
111       uint32_t min_posting_list_bytes);
112 
113   ~FlashIndexStorage();
114   FlashIndexStorage(FlashIndexStorage&&) = default;
115   FlashIndexStorage(const FlashIndexStorage&) = delete;
116   FlashIndexStorage& operator=(FlashIndexStorage&&) = default;
117   FlashIndexStorage& operator=(const FlashIndexStorage&) = delete;
118 
119   // Free the PostingListUsed that this holder holds.
120   void FreePostingList(PostingListHolder holder);
121 
122   // Used to track the largest docid indexed in the index.
get_last_indexed_docid()123   DocumentId get_last_indexed_docid() const {
124     return header_block_->header()->last_indexed_docid;
125   }
set_last_indexed_docid(DocumentId docid)126   void set_last_indexed_docid(DocumentId docid) {
127     header_block_->header()->last_indexed_docid = docid;
128   }
129 
130   // Updates the header and persists all changes to the index to disk. Returns
131   // true on success.
132   bool PersistToDisk();
133 
134   // Returns the size of the index file in bytes.
GetDiskUsage()135   int64_t GetDiskUsage() const {
136     return filesystem_->GetDiskUsage(block_fd_.get());
137   }
138 
139   // Returns the size of the index file used to contains hits.
GetElementsSize()140   uint64_t GetElementsSize() const {
141     // Element size is the same as disk size excluding the header block.
142     return GetDiskUsage() - block_size();
143   }
144 
num_blocks()145   int num_blocks() const { return num_blocks_; }
146 
147   // Info about the index based on the block size.
block_size()148   int block_size() const { return header_block_->header()->block_size; }
149 
150   // Num blocks starts at 1 since the first block is the header.
empty()151   bool empty() const { return num_blocks_ <= 1; }
152 
153   // The percentage of the maximum index size that is free. Allocated blocks are
154   // treated as fully used, even if they are only partially used. In this way,
155   // min_free_fraction is a lower bound of available space.
min_free_fraction()156   double min_free_fraction() const {
157     return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
158   }
159 
160   libtextclassifier3::Status Reset();
161 
162   void GetDebugInfo(int verbosity, std::string* out) const;
163 
164  private:
165   FlashIndexStorage(const std::string& index_filename,
166                     const Filesystem* filesystem, bool has_in_memory_freelists);
167 
168   // Init the index from persistence. Create if file does not exist. We do not
169   // erase corrupt files.
170   //
171   // Returns false if unable to create a new header or if the existing one is
172   // corrupt.
173   bool Init();
174 
175   // Create or open the header block. Returns true on success.
176   bool InitHeader();
177 
178   // Create a new header block for an empty index file.
179   bool CreateHeader();
180 
181   // Loads the header stored at the beginning of the index file and validates
182   // the values stored in it.
183   bool OpenHeader(int64_t file_size);
184 
185   // Add the IndexBlock referred to by block_index in the on-disk free list with
186   // index block_info_index.
187   void AddToOnDiskFreeList(uint32_t block_index, int block_info_index,
188                            IndexBlock* index_block);
189 
190   // Remove the IndexBlock referred to by block_index from the Header free list
191   // with index block_info_index.
192   void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index,
193                                 IndexBlock* index_block);
194 
195   // Returns:
196   //   - On success, a valid PostingListHolder created from the first entry of
197   //     the in-memory freelist at block_info_index
198   //   - NOT_FOUND if there was no entry in the freelist
199   //   - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
200   //     allocated for some reason.
201   libtextclassifier3::StatusOr<PostingListHolder>
202   GetPostingListFromInMemoryFreeList(int block_info_index);
203 
204   // Returns:
205   //   - On success, a valid PostingListHolder created from the first entry of
206   //     the on-disk freelist at block_info_index
207   //   - NOT_FOUND if there was no entry in the freelist
208   //   - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
209   //     allocated for some reason.
210   libtextclassifier3::StatusOr<PostingListHolder>
211   GetPostingListFromOnDiskFreeList(int block_info_index);
212 
213   // Returns:
214   //   - On success, a valid PostingListHolder created from a newly allocated
215   //     IndexBlock.
216   //   - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new
217   //     IndexBlock.
218   libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList(
219       int block_info_index);
220 
221   // Returns:
222   //   - On success, a newly created IndexBlock at block_index with posting
223   //     lists of size posting_list_size
224   //   - INTERNAL_ERROR if unable to access the region in file representing the
225   //   IndexBlock
226   libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock(
227       int block_index, uint32_t posting_list_size) const;
228 
229   // Returns:
230   //   - On success, the IndexBlock that exists at block_index
231   //   - INTERNAL_ERROR if unable to access the region in file representing the
232   //   IndexBlock
233   libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const;
234 
235   // Add a new block to the end of the file and return its block
236   // index. Returns kInvalidBlockIndex if unable to grow the index file.
237   int GrowIndex();
238 
239   // Return the index into index_block_infos of the smallest posting_list free
240   // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds
241   // the max-sized posting list.
242   int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const;
243 
244   // Flushes the in-memory free list to disk.
245   void FlushInMemoryFreeList();
246 
247   // Underlying filename.
248   std::string index_filename_;
249 
250   // We open the index file into this fd.
251   ScopedFd block_fd_;
252   int num_blocks_;  // can be inferred from index file size
253 
254   std::unique_ptr<HeaderBlock> header_block_;
255 
256   // In-memory cache of free posting lists.
257   struct FreeList {
258     // Experimentally determined that high watermark for largest
259     // freelist was ~3500.
260     static constexpr size_t kMaxSize = 4096;
261 
262     // Push a new PostingListIdentifier if there is space.
263     void Push(PostingListIdentifier id);
264 
265     // Attempt to pop a PostingListIdentifier.
266     //
267     // RETURNS:
268     //  - identifier of a free posting list, on success
269     //  - NOT_FOUND if there are no free posting lists on this free list.
270     libtextclassifier3::StatusOr<PostingListIdentifier> TryPop();
271 
272     std::string DebugString() const;
273 
274    private:
275     std::vector<PostingListIdentifier> free_list_;
276     int free_list_size_high_watermark_ = 0;
277     int num_dropped_free_list_entries_ = 0;
278   };
279   std::vector<FreeList> in_memory_freelists_;
280 
281   const Filesystem* filesystem_;  // not owned; can't be null
282 
283   bool has_in_memory_freelists_;
284 };
285 
286 }  // namespace lib
287 }  // namespace icing
288 
289 #endif  // ICING_INDEX_FLASH_INDEX_STORAGE_H_
290