1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A file-backed bitmap with fast & efficient reads/writes of bits.
16 // The bitmap will automatically grow in size as more bits are added, with a
17 // max-capacity of 2M bits.
18 //
19 // Note on Performance:
20 // This class internally uses mmap() without a readahead buffer. This keeps the
21 // memory-usage low while also having low (amortized) read/write latency.
22 // However, some reads/writes will pay the cost of page-faults.
23 // In order to keep memory-mapping efficient, the bitmap always grows in
24 // 4KiB sized blocks so that it is aligned with system page-size.
25 //
26 // This class doesn't aggressively flush/sync changes to disk and relies on the
27 // system to buffer and flush changes in the background. This greatly reduces
28 // disk-churn and performance of writes. However, an unexpected crash or an
29 // abrupt reboot of the system could lead to data-loss. This can be mitigated
30 // by manually calling PersistToDisk() when needed.
31 //
32 // Usage:
33 // auto bitmap = RETURN_OR_ASSIGN(FileBackedBitmap::Create(...));
34 //
35 // bitmap.Set(100, false);
36 // bitmap.Set(10, true);
37 //
38 // bitmap.Get(0);  // Default default of 'false'.
39 // bitmap.Get(10);
40 //
41 // bitmap.PersistToDisk();  // Optional. Immediately syncs all changes to disk.
42 // bitmap.reset();
43 
44 #ifndef ICING_FILE_FILE_BACKED_BITMAP_H_
45 #define ICING_FILE_FILE_BACKED_BITMAP_H_
46 #include <cstdint>
47 #include <memory>
48 #include <string>
49 #include <string_view>
50 
51 #include "icing/text_classifier/lib3/utils/base/status.h"
52 #include "icing/text_classifier/lib3/utils/base/statusor.h"
53 #include "icing/file/filesystem.h"
54 #include "icing/file/memory-mapped-file.h"
55 
56 namespace icing {
57 namespace lib {
58 
59 class FileBackedBitmap {
60  public:
61   // Growth of FileBackedBitmap is in blocks of a fixed size. This helper
62   // returns the number of bits that can be fitted in the specified number of
63   // blocks.
64   //
65   // NOTE: This is meant for tests and clients shouldn't care about this.
66   static int GetBlockCapacity(int num_blocks);
67 
68   // Returns an initialized instance of the bitmap that can immediately handle
69   // read/write operations.
70   //
71   // file_path : Specifies the file to persist the bitmap to; must be a path
72   //             within a directory that already exists. If the file itself
73   //             doesn't exist, a new bitmap will be created.
74   //
75   // mmap_strategy : Mmap strategy for the underlying file, see
76   //                 MemoryMappedFile::Strategy for more details.
77   //
78   // Returns an error if the file was corrupted or if any IO error was
79   // encountered. An error here implies that the old data has been lost and
80   // the file has to be deleted and re-initialized again.
81   static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedBitmap>> Create(
82       const Filesystem* filesystem, std::string_view file_path,
83       MemoryMappedFile::Strategy mmap_strategy);
84 
85   // If the bitmap was created with
86   // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, then changes will be
87   // synced by the system and the checksum will be updated.
88   ~FileBackedBitmap();
89 
90   // Set the bit at the specified position. The bitmap is automatically resized
91   // to at least fit 'index' number of bits. bit_index should not be larger than
92   // 2M, which is the max-capacity of FileBackedBitmap.
93   //
94   // Returns any encountered IO error.
95   //
96   // NOTE: While changes take place immediately, they may not be fully persisted
97   // to disk till PersistToDisk() is called.
98   //
99   // NOTE: The Bitmap grows in blocks of 4KiB. So, setting a specific bit
100   // beyond current capacity can lead to pre-allocating up to ~32K extra bits.
101   libtextclassifier3::Status Set(int bit_index, bool bit_value);
102 
103   // Get the bit at the specified index. Unset bits default to 'false'.
104   //
105   // Returns OUT_OF_RANGE error if bit_index > NumBits().
106   libtextclassifier3::StatusOr<bool> Get(int bit_index) const;
107 
108   // Count of bits currently being stored in this bitmap.
109   //
110   // NOTE: BitMap growth happens in blocks of 4KiB. So, the smallest bitmap will
111   // automatically have ~32K bits pre-allocated. Subsequently, future
112   // growths/truncations of the bitmap will change NumBits() in multiples of
113   // 32K.
114   int NumBits() const;
115 
116   // Truncates the size of the bitmap to 'new_num_bits'. Any data beyond this
117   // will be lost.
118   libtextclassifier3::Status TruncateTo(int new_num_bits);
119 
120   // Syncs all the changes made to the bitmap to disk and updates the checksum.
121   //
122   // Returns any encountered IO error.
123   //
124   // NOTE: Neither Set() nor the ~FileBackedBitmap() guarantee syncing all
125   // changes to disk. This method should be explicitly called to protect the
126   // data from an abrupt system reboot.
127   libtextclassifier3::Status PersistToDisk();
128 
129  private:
130   // Limit the max-size of the bitmap. Someone wanting to store more bits will
131   // likely benefit from a custom solution.
132   static constexpr int kMaxNumBits = 2 * 1024 * 1024;
133 
134   // Growth of FileBackedBitmap will be in blocks of this size. This size
135   // should align with the page-size of the system so that mmapping can be
136   // most efficient.
137   static constexpr int kBlockByteSize = 4 * 1024;
138 
139   // Version of the file-format used by the class. Every time the format is
140   // modified in a backwards-incompatible way, this needs to be incremented
141   static constexpr int32_t kCurrentVersion = 1;
142 
143   struct Header {
144     // Version of the file-format used by this class. This allows us to change
145     // the format and upgrade old data to the new format without losing it.
146     int32_t version;
147 
148     // Checksum of the entire file when it was last persisted to disk.
149     // This is used on init to make sure that the file has not been corrupted.
150     //
151     // NOTE: The checksum is not expected to match when ChecksumState=kStale.
152     uint32_t checksum;
153 
154     // As an optimization, FileBackedBitmap delays recomputation of the checksum
155     // even when some bits in the Bitmap are modified. While this improves
156     // performance, it increases the risk of losing data due to a crash.
157     // ChecksumState tracks if the changes to the bitmap have been fully
158     // reflected in the checksum stored above.
159     //
160     // NOTE: We use int32_t to store a bool info here to keep the Header
161     // aligned.
162     enum ChecksumState : int32_t { kFresh, kStale };
163     ChecksumState state;
164   };
165 
166   // The size of the backing file to store the specified number of bits. This
167   // size is aligned to the page-size of the system so that it can be
168   // efficiently memory mapped.
169   static size_t FileSizeForBits(int num_bits);
170 
171   static constexpr int kHeaderByteSize = sizeof(Header);
172 
173   // Helpers to read/modify the header of the bitmap file.
174   const Header& header() const;
175   Header* mutable_header();
176 
177   // Use FileBackedBitmap::Create() to instantiate.
178   FileBackedBitmap(const Filesystem* filesystem, std::string_view file_path,
179                    MemoryMappedFile::Strategy mmap_strategy);
180 
181   // Verify the contents of the bitmap and get ready for read/write operations.
182   //
183   // Returns an error if the file was corrupted or if any IO error was
184   // encountered. An error here implies that the old data has been lost and
185   // the file has to be deleted and re-initialized again.
186   libtextclassifier3::Status Initialize();
187 
188   // Makes sure that the data on disk is upgraded to match the file-format
189   // represented by kCurrentVersion.
190   libtextclassifier3::Status UpgradeToCurrentVersion();
191 
192   // Grows the size of the bitmap to match 'new_num_bits'. Any newly added bit
193   // will default to 'false'.
194   //
195   // The upper-bound for new_num_bits is kMaxNumBits. Requests to further
196   // increase the size will fail with an INVALID_ARGUMENT error.
197   libtextclassifier3::Status GrowTo(int new_num_bits);
198 
199   using Word = uint32_t;
200   static constexpr int kNumWordBits = sizeof(Word) * 8;
201 
202   // Helpers to perform 32bit read/write operations on the raw bitmap data.
203   // This makes it easy to use 32bit bitwise operations to modify the bitmap.
204   libtextclassifier3::StatusOr<Word> GetWord(int word_index) const;
205   libtextclassifier3::Status SetWord(int word_index, Word word);
206 
207   // CRC32 based checksum of all the bits stored in the bitmap. This checksum
208   // only uses the data and not the contents of the header.
209   uint32_t ComputeChecksum() const;
210 
211   const Filesystem* const filesystem_;
212   const std::string file_path_;
213   std::unique_ptr<MemoryMappedFile> mmapper_;
214 };
215 
216 }  // namespace lib
217 }  // namespace icing
218 
219 #endif  // ICING_FILE_FILE_BACKED_BITMAP_H_
220