1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // A file-backed bitmap with fast & efficient reads/writes of bits. 16 // The bitmap will automatically grow in size as more bits are added, with a 17 // max-capacity of 2M bits. 18 // 19 // Note on Performance: 20 // This class internally uses mmap() without a readahead buffer. This keeps the 21 // memory-usage low while also having low (amortized) read/write latency. 22 // However, some reads/writes will pay the cost of page-faults. 23 // In order to keep memory-mapping efficient, the bitmap always grows in 24 // 4KiB sized blocks so that it is aligned with system page-size. 25 // 26 // This class doesn't aggressively flush/sync changes to disk and relies on the 27 // system to buffer and flush changes in the background. This greatly reduces 28 // disk-churn and performance of writes. However, an unexpected crash or an 29 // abrupt reboot of the system could lead to data-loss. This can be mitigated 30 // by manually calling PersistToDisk() when needed. 31 // 32 // Usage: 33 // auto bitmap = RETURN_OR_ASSIGN(FileBackedBitmap::Create(...)); 34 // 35 // bitmap.Set(100, false); 36 // bitmap.Set(10, true); 37 // 38 // bitmap.Get(0); // Default default of 'false'. 39 // bitmap.Get(10); 40 // 41 // bitmap.PersistToDisk(); // Optional. Immediately syncs all changes to disk. 42 // bitmap.reset(); 43 44 #ifndef ICING_FILE_FILE_BACKED_BITMAP_H_ 45 #define ICING_FILE_FILE_BACKED_BITMAP_H_ 46 #include <cstdint> 47 #include <memory> 48 #include <string> 49 #include <string_view> 50 51 #include "icing/text_classifier/lib3/utils/base/status.h" 52 #include "icing/text_classifier/lib3/utils/base/statusor.h" 53 #include "icing/file/filesystem.h" 54 #include "icing/file/memory-mapped-file.h" 55 56 namespace icing { 57 namespace lib { 58 59 class FileBackedBitmap { 60 public: 61 // Growth of FileBackedBitmap is in blocks of a fixed size. This helper 62 // returns the number of bits that can be fitted in the specified number of 63 // blocks. 64 // 65 // NOTE: This is meant for tests and clients shouldn't care about this. 66 static int GetBlockCapacity(int num_blocks); 67 68 // Returns an initialized instance of the bitmap that can immediately handle 69 // read/write operations. 70 // 71 // file_path : Specifies the file to persist the bitmap to; must be a path 72 // within a directory that already exists. If the file itself 73 // doesn't exist, a new bitmap will be created. 74 // 75 // mmap_strategy : Mmap strategy for the underlying file, see 76 // MemoryMappedFile::Strategy for more details. 77 // 78 // Returns an error if the file was corrupted or if any IO error was 79 // encountered. An error here implies that the old data has been lost and 80 // the file has to be deleted and re-initialized again. 81 static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedBitmap>> Create( 82 const Filesystem* filesystem, std::string_view file_path, 83 MemoryMappedFile::Strategy mmap_strategy); 84 85 // If the bitmap was created with 86 // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be 87 // synced by the system and the checksum will be updated. 88 ~FileBackedBitmap(); 89 90 // Set the bit at the specified position. The bitmap is automatically resized 91 // to at least fit 'index' number of bits. bit_index should not be larger than 92 // 2M, which is the max-capacity of FileBackedBitmap. 93 // 94 // Returns any encountered IO error. 95 // 96 // NOTE: While changes take place immediately, they may not be fully persisted 97 // to disk till PersistToDisk() is called. 98 // 99 // NOTE: The Bitmap grows in blocks of 4KiB. So, setting a specific bit 100 // beyond current capacity can lead to pre-allocating up to ~32K extra bits. 101 libtextclassifier3::Status Set(int bit_index, bool bit_value); 102 103 // Get the bit at the specified index. Unset bits default to 'false'. 104 // 105 // Returns OUT_OF_RANGE error if bit_index > NumBits(). 106 libtextclassifier3::StatusOr<bool> Get(int bit_index) const; 107 108 // Count of bits currently being stored in this bitmap. 109 // 110 // NOTE: BitMap growth happens in blocks of 4KiB. So, the smallest bitmap will 111 // automatically have ~32K bits pre-allocated. Subsequently, future 112 // growths/truncations of the bitmap will change NumBits() in multiples of 113 // 32K. 114 int NumBits() const; 115 116 // Truncates the size of the bitmap to 'new_num_bits'. Any data beyond this 117 // will be lost. 118 libtextclassifier3::Status TruncateTo(int new_num_bits); 119 120 // Syncs all the changes made to the bitmap to disk and updates the checksum. 121 // 122 // Returns any encountered IO error. 123 // 124 // NOTE: Neither Set() nor the ~FileBackedBitmap() guarantee syncing all 125 // changes to disk. This method should be explicitly called to protect the 126 // data from an abrupt system reboot. 127 libtextclassifier3::Status PersistToDisk(); 128 129 private: 130 // Limit the max-size of the bitmap. Someone wanting to store more bits will 131 // likely benefit from a custom solution. 132 static constexpr int kMaxNumBits = 2 * 1024 * 1024; 133 134 // Growth of FileBackedBitmap will be in blocks of this size. This size 135 // should align with the page-size of the system so that mmapping can be 136 // most efficient. 137 static constexpr int kBlockByteSize = 4 * 1024; 138 139 // Version of the file-format used by the class. Every time the format is 140 // modified in a backwards-incompatible way, this needs to be incremented 141 static constexpr int32_t kCurrentVersion = 1; 142 143 struct Header { 144 // Version of the file-format used by this class. This allows us to change 145 // the format and upgrade old data to the new format without losing it. 146 int32_t version; 147 148 // Checksum of the entire file when it was last persisted to disk. 149 // This is used on init to make sure that the file has not been corrupted. 150 // 151 // NOTE: The checksum is not expected to match when ChecksumState=kStale. 152 uint32_t checksum; 153 154 // As an optimization, FileBackedBitmap delays recomputation of the checksum 155 // even when some bits in the Bitmap are modified. While this improves 156 // performance, it increases the risk of losing data due to a crash. 157 // ChecksumState tracks if the changes to the bitmap have been fully 158 // reflected in the checksum stored above. 159 // 160 // NOTE: We use int32_t to store a bool info here to keep the Header 161 // aligned. 162 enum ChecksumState : int32_t { kFresh, kStale }; 163 ChecksumState state; 164 }; 165 166 // The size of the backing file to store the specified number of bits. This 167 // size is aligned to the page-size of the system so that it can be 168 // efficiently memory mapped. 169 static size_t FileSizeForBits(int num_bits); 170 171 static constexpr int kHeaderByteSize = sizeof(Header); 172 173 // Helpers to read/modify the header of the bitmap file. 174 const Header& header() const; 175 Header* mutable_header(); 176 177 // Use FileBackedBitmap::Create() to instantiate. 178 FileBackedBitmap(const Filesystem* filesystem, std::string_view file_path, 179 MemoryMappedFile::Strategy mmap_strategy); 180 181 // Verify the contents of the bitmap and get ready for read/write operations. 182 // 183 // Returns an error if the file was corrupted or if any IO error was 184 // encountered. An error here implies that the old data has been lost and 185 // the file has to be deleted and re-initialized again. 186 libtextclassifier3::Status Initialize(); 187 188 // Makes sure that the data on disk is upgraded to match the file-format 189 // represented by kCurrentVersion. 190 libtextclassifier3::Status UpgradeToCurrentVersion(); 191 192 // Grows the size of the bitmap to match 'new_num_bits'. Any newly added bit 193 // will default to 'false'. 194 // 195 // The upper-bound for new_num_bits is kMaxNumBits. Requests to further 196 // increase the size will fail with an INVALID_ARGUMENT error. 197 libtextclassifier3::Status GrowTo(int new_num_bits); 198 199 using Word = uint32_t; 200 static constexpr int kNumWordBits = sizeof(Word) * 8; 201 202 // Helpers to perform 32bit read/write operations on the raw bitmap data. 203 // This makes it easy to use 32bit bitwise operations to modify the bitmap. 204 libtextclassifier3::StatusOr<Word> GetWord(int word_index) const; 205 libtextclassifier3::Status SetWord(int word_index, Word word); 206 207 // CRC32 based checksum of all the bits stored in the bitmap. This checksum 208 // only uses the data and not the contents of the header. 209 uint32_t ComputeChecksum() const; 210 211 const Filesystem* const filesystem_; 212 const std::string file_path_; 213 std::unique_ptr<MemoryMappedFile> mmapper_; 214 }; 215 216 } // namespace lib 217 } // namespace icing 218 219 #endif // ICING_FILE_FILE_BACKED_BITMAP_H_ 220