1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_
16 #define ICING_INDEX_MAIN_POSTING_LIST_USED_H_
17
18 #include <string.h>
19 #include <sys/mman.h>
20
21 #include <algorithm>
22 #include <vector>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/index/main/posting-list-utils.h"
28 #include "icing/util/logging.h"
29
30 namespace icing {
31 namespace lib {
32
33 // A posting list with hits in it. Layout described in comments in
34 // posting-list-used.cc.
35 class PostingListUsed {
36 public:
37 // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes.
38 // 'Preexisting' means that posting_list_buffer was previously modified by
39 // another instance of PostingListUsed.
40 //
41 // Caller owns the hits buffer and must not free it while using a
42 // PostingListUsed.
43 //
44 // RETURNS:
45 // - A valid PostingListUsed if successful
46 // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
47 // || size_in_bytes % sizeof(Hit) != 0.
48 // - FAILED_PRECONDITION if posting_list_buffer is null
49 static libtextclassifier3::StatusOr<PostingListUsed>
50 CreateFromPreexistingPostingListUsedRegion(void *posting_list_buffer,
51 uint32_t size_in_bytes);
52
53 // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes
54 // and initializes the content of the buffer so that the returned
55 // PostingListUsed is empty.
56 //
57 // Caller owns the posting_list_buffer buffer and must not free it while using
58 // a PostingListUsed.
59 //
60 // RETURNS:
61 // - A valid PostingListUsed if successful
62 // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
63 // || size_in_bytes % sizeof(Hit) != 0.
64 // - FAILED_PRECONDITION if posting_list_buffer is null
65 static libtextclassifier3::StatusOr<PostingListUsed>
66 CreateFromUnitializedRegion(void *posting_list_buffer,
67 uint32_t size_in_bytes);
68
69 // Move contents from another posting list. Clears other.
70 //
71 // RETURNS:
72 // - OK, if successful
73 // - INVALID_ARGUMENT if 'other' is not valid or 'other' is too large to fit
74 // in 'this'.
75 // - FAILED_PRECONDITION if 'this' posting list is in a corrupted state.
76 libtextclassifier3::Status MoveFrom(PostingListUsed *other);
77
78 // Min size of posting list that can fit these used bytes. (See
79 // MoveFrom.)
80 uint32_t MinPostingListSizeToFit() const;
81
82 // Prepend a hit to the posting list.
83 // RETURNS:
84 // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
85 // previously added hit.
86 // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
87 // list.
88 libtextclassifier3::Status PrependHit(const Hit &hit);
89
90 // Prepend hits to the posting list. Hits should be sorted in
91 // descending order (as defined by the less than operator for Hit)
92 //
93 // Returns the number of hits that could be prepended to the posting list. If
94 // keep_prepended is true, whatever could be prepended is kept, otherwise the
95 // posting list is left in its original state.
96 template <class T, Hit (*GetHit)(const T &)>
97 uint32_t PrependHitArray(const T *array, uint32_t num_hits,
98 bool keep_prepended);
99
100 // Retrieves the hits stored in the posting list.
101 //
102 // RETURNS:
103 // - On success, a vector of hits sorted by the reverse order of prepending.
104 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
105 libtextclassifier3::StatusOr<std::vector<Hit>> GetHits() const;
106
107 // Same as GetHits but appends hits to hits_out.
108 //
109 // RETURNS:
110 // - On success, a vector of hits sorted by the reverse order of prepending.
111 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
112 libtextclassifier3::Status GetHits(std::vector<Hit> *hits_out) const;
113
114 // Undo the last num_hits hits prepended. If num_hits > number of
115 // hits we clear all hits.
116 //
117 // RETURNS:
118 // - OK on success
119 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
120 libtextclassifier3::Status PopFrontHits(uint32_t num_hits);
121
122 // Returns bytes used by actual hits.
123 uint32_t BytesUsed() const;
124
125 private:
126 // Posting list layout formats:
127 //
128 // not_full
129 //
130 // +-----------------+----------------+-------+-----------------+
131 // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
132 // +-----------------+----------------+-------+-----------------+
133 //
134 // almost_full
135 //
136 // +-----------------+----------------+-------+-----------------+
137 // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
138 // +-----------------+----------------+-------+-----------------+
139 //
140 // full()
141 //
142 // +-----------------+----------------+-------+-----------------+
143 // |1st hit |2nd hit |(pad) |(compressed) hits|
144 // +-----------------+----------------+-------+-----------------+
145 //
146 // The first two uncompressed hits also implicitly encode information about
147 // the size of the compressed hits region.
148 //
149 // 1. If the posting list is NOT_FULL, then
150 // posting_list_buffer_[0] contains the byte offset of the start of the
151 // compressed hits - and, thus, the size of the compressed hits region is
152 // size_in_bytes - posting_list_buffer_[0].
153 //
154 // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
155 // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
156 // - 1] and ends at size_in_bytes - 1.
157 //
158 // Hit term frequencies are stored after the hit value, compressed or
159 // uncompressed. For the first two special hits, we always have a
160 // space for the term frequency. For hits in the compressed area, we only have
161 // the term frequency following the hit value of hit.has_term_frequency() is
162 // true. This allows good compression in the common case where hits don't have
163 // a valid term frequency.
164 //
165 // EXAMPLE
166 // Posting list storage. Posting list size: 20 bytes
167 // EMPTY!
168 // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
169 // | 20 |Hit::kInvalidVal| 0x000 |
170 // +-------------+----------------+----------------+-----------------------+
171 //
172 // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
173 // NOT FULL!
174 // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
175 // | 16 |Hit::kInvalidVal| 0x000 | 0x07FFF998 |
176 // +-------------+----------------+-----------------+----------------------+
177 //
178 // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
179 // TermFrequency=125)
180 // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
181 // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
182 // | 13 |Hit::kInvalidVal| 0x000 | 0x07FFF684| 125 | 788 |
183 // +-------------+----------------+-----------+-----------+------+-----------+
184 //
185 // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
186 // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
187 // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
188 // | 9 |Hit::kInvVal| 0x00 |0x07FFF4D2| 434 | 125 | 788 |
189 // +-------------+------------+--------+----------+---------+------+---------+
190 //
191 // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
192 // TermFrequency = 87)
193 // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL!
194 // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
195 // |Hit::kInvVal|0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 |
196 // +-------------+------------+---------+---------+---------+------+---------+
197 //
198 // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
199 // FULL!
200 // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20
201 // -+ | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788
202 // |
203 // +-------------+-------------+---------+----------+---------+------+---------+
PostingListUsed(void * posting_list_buffer,uint32_t size_in_bytes)204 PostingListUsed(void *posting_list_buffer, uint32_t size_in_bytes)
205 : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
206 size_in_bytes_(size_in_bytes) {}
207
208 // Helpers to determine what state the posting list is in.
full()209 bool full() const {
210 return get_special_hit(0).ValueOrDie().is_valid() &&
211 get_special_hit(1).ValueOrDie().is_valid();
212 }
almost_full()213 bool almost_full() const {
214 return !get_special_hit(0).ValueOrDie().is_valid();
215 }
empty()216 bool empty() const {
217 return get_special_hit(0).ValueOrDie().value() == size_in_bytes_ &&
218 !get_special_hit(1).ValueOrDie().is_valid();
219 }
220
221 // Returns false if both special hits are invalid or if the offset value
222 // stored in the special hit is less than kSpecialHitsSize or greater than
223 // size_in_bytes_. Returns true, otherwise.
224 bool IsPostingListValid() const;
225
226 // Prepend hit to a posting list that is in the ALMOST_FULL state.
227 // RETURNS:
228 // - OK, if successful
229 // - INVALID_ARGUMENT if hit is not less than the previously added hit.
230 libtextclassifier3::Status PrependHitToAlmostFull(const Hit &hit);
231
232 // Prepend hit to a posting list that is in the EMPTY state. This will always
233 // succeed because there are no pre-existing hits and no validly constructed
234 // posting list could fail to fit one hit.
235 void PrependHitToEmpty(const Hit &hit);
236
237 // Prepend hit to a posting list that is in the NOT_FULL state.
238 // RETURNS:
239 // - OK, if successful
240 // - INVALID_ARGUMENT if hit is not less than the previously added hit.
241 libtextclassifier3::Status PrependHitToNotFull(const Hit &hit,
242 uint32_t offset);
243
244 // Reset contents to an empty posting list. This *must* be called if the
245 // posting_list_buffer_ region is uninitialized.
246 void Clear();
247
248 // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
249 // a byte offset between kSpecialHitsSize and size_in_bytes_ (inclusive)
250 // (not_full state).
251 uint32_t get_start_byte_offset() const;
252
253 // Sets the special hits to properly reflect what offset is (see layout
254 // comment for further details).
255 //
256 // Returns false if offset > size_in_bytes_ or offset is (kSpecialHitsSize,
257 // sizeof(Hit)) or offset is (sizeof(Hit), 0). True, otherwise.
258 bool set_start_byte_offset(uint32_t offset);
259
260 // Manipulate padded areas. We never store the same hit value twice
261 // so a delta of 0 is a pad byte.
262
263 // Returns offset of first non-pad byte.
264 uint32_t GetPadEnd(uint32_t offset) const;
265
266 // Fill padding between offset start and offset end with 0s.
267 // Returns false if end > size_in_bytes_. True, otherwise.
268 bool PadToEnd(uint32_t start, uint32_t end);
269
270 // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
271 // hits in the posting list if the posting list contains less than limit
272 // number of hits. out can be NULL.
273 //
274 // NOTE: If called with limit=1, pop=true on a posting list that transitioned
275 // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
276 // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
277 // ALMOST_FULL.
278 //
279 // RETURNS:
280 // - OK on success
281 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
282 libtextclassifier3::Status GetHitsInternal(uint32_t limit, bool pop,
283 std::vector<Hit> *out) const;
284
285 // Retrieves the value stored in the index-th special hit.
286 //
287 // RETURNS:
288 // - A valid Hit, on success
289 // - INVALID_ARGUMENT if index is not less than kNumSpecialHits
290 libtextclassifier3::StatusOr<Hit> get_special_hit(uint32_t index) const;
291
292 // Sets the value stored in the index-th special hit to val. If index is not
293 // less than kSpecialHitSize / sizeof(Hit), this has no effect.
294 bool set_special_hit(uint32_t index, const Hit &val);
295
296 // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
297 // returns the new beginning of the padded region.
298 //
299 // RETURNS:
300 // - The new beginning of the padded region, if successful.
301 // - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
302 // kSpecialHitsSize
303 libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
304 const Hit &hit, uint32_t offset);
305
306 // If hit has a term frequency, consumes the term frequency at offset, updates
307 // hit to include the term frequency and updates offset to reflect that the
308 // term frequency has been consumed.
309 //
310 // RETURNS:
311 // - OK, if successful
312 // - INVALID_ARGUMENT if hit has a term frequency and offset +
313 // sizeof(Hit::TermFrequency) >=
314 // size_in_bytes_
315 libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
316 Hit *hit, uint32_t *offset) const;
317
318 // A byte array of size size_in_bytes_ containing encoded hits for this
319 // posting list.
320 uint8_t *posting_list_buffer_; // does not own!
321 uint32_t size_in_bytes_;
322 };
323
324 // Inlined functions. Implementation details below. Avert eyes!
325 template <class T, Hit (*GetHit)(const T &)>
PrependHitArray(const T * array,uint32_t num_hits,bool keep_prepended)326 uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
327 bool keep_prepended) {
328 if (!IsPostingListValid()) {
329 return 0;
330 }
331
332 // Prepend hits working backwards from array[num_hits - 1].
333 uint32_t i;
334 for (i = 0; i < num_hits; ++i) {
335 if (!PrependHit(GetHit(array[num_hits - i - 1])).ok()) {
336 break;
337 }
338 }
339 if (i != num_hits && !keep_prepended) {
340 // Didn't fit. Undo everything and check that we have the same offset as
341 // before. PopFrontHits guarantees that it will remove all 'i' hits so long
342 // as there are at least 'i' hits in the posting list, which we know there
343 // are.
344 PopFrontHits(i);
345 }
346 return i;
347 }
348
349 } // namespace lib
350 } // namespace icing
351
352 #endif // ICING_INDEX_MAIN_POSTING_LIST_USED_H_
353