1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_MAIN_POSTING_LIST_USED_H_
16 #define ICING_INDEX_MAIN_POSTING_LIST_USED_H_
17 
18 #include <string.h>
19 #include <sys/mman.h>
20 
21 #include <algorithm>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/index/main/posting-list-utils.h"
28 #include "icing/util/logging.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 // A posting list with hits in it. Layout described in comments in
34 // posting-list-used.cc.
35 class PostingListUsed {
36  public:
37   // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes.
38   // 'Preexisting' means that posting_list_buffer was previously modified by
39   // another instance of PostingListUsed.
40   //
41   // Caller owns the hits buffer and must not free it while using a
42   // PostingListUsed.
43   //
44   // RETURNS:
45   //   - A valid PostingListUsed if successful
46   //   - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
47   //       || size_in_bytes % sizeof(Hit) != 0.
48   //   - FAILED_PRECONDITION if posting_list_buffer is null
49   static libtextclassifier3::StatusOr<PostingListUsed>
50   CreateFromPreexistingPostingListUsedRegion(void *posting_list_buffer,
51                                              uint32_t size_in_bytes);
52 
53   // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes
54   // and initializes the content of the buffer so that the returned
55   // PostingListUsed is empty.
56   //
57   // Caller owns the posting_list_buffer buffer and must not free it while using
58   // a PostingListUsed.
59   //
60   // RETURNS:
61   //   - A valid PostingListUsed if successful
62   //   - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
63   //       || size_in_bytes % sizeof(Hit) != 0.
64   //   - FAILED_PRECONDITION if posting_list_buffer is null
65   static libtextclassifier3::StatusOr<PostingListUsed>
66   CreateFromUnitializedRegion(void *posting_list_buffer,
67                               uint32_t size_in_bytes);
68 
69   // Move contents from another posting list. Clears other.
70   //
71   // RETURNS:
72   //   - OK, if successful
73   //   - INVALID_ARGUMENT if 'other' is not valid or 'other' is too large to fit
74   //       in 'this'.
75   //   - FAILED_PRECONDITION if 'this' posting list is in a corrupted state.
76   libtextclassifier3::Status MoveFrom(PostingListUsed *other);
77 
78   // Min size of posting list that can fit these used bytes. (See
79   // MoveFrom.)
80   uint32_t MinPostingListSizeToFit() const;
81 
82   // Prepend a hit to the posting list.
83   // RETURNS:
84   //   - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
85   //   previously added hit.
86   //   - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
87   //   list.
88   libtextclassifier3::Status PrependHit(const Hit &hit);
89 
90   // Prepend hits to the posting list. Hits should be sorted in
91   // descending order (as defined by the less than operator for Hit)
92   //
93   // Returns the number of hits that could be prepended to the posting list. If
94   // keep_prepended is true, whatever could be prepended is kept, otherwise the
95   // posting list is left in its original state.
96   template <class T, Hit (*GetHit)(const T &)>
97   uint32_t PrependHitArray(const T *array, uint32_t num_hits,
98                            bool keep_prepended);
99 
100   // Retrieves the hits stored in the posting list.
101   //
102   // RETURNS:
103   //   - On success, a vector of hits sorted by the reverse order of prepending.
104   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
105   libtextclassifier3::StatusOr<std::vector<Hit>> GetHits() const;
106 
107   // Same as GetHits but appends hits to hits_out.
108   //
109   // RETURNS:
110   //   - On success, a vector of hits sorted by the reverse order of prepending.
111   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
112   libtextclassifier3::Status GetHits(std::vector<Hit> *hits_out) const;
113 
114   // Undo the last num_hits hits prepended. If num_hits > number of
115   // hits we clear all hits.
116   //
117   // RETURNS:
118   //   - OK on success
119   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
120   libtextclassifier3::Status PopFrontHits(uint32_t num_hits);
121 
122   // Returns bytes used by actual hits.
123   uint32_t BytesUsed() const;
124 
125  private:
126   // Posting list layout formats:
127   //
128   // not_full
129   //
130   // +-----------------+----------------+-------+-----------------+
131   // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
132   // +-----------------+----------------+-------+-----------------+
133   //
134   // almost_full
135   //
136   // +-----------------+----------------+-------+-----------------+
137   // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
138   // +-----------------+----------------+-------+-----------------+
139   //
140   // full()
141   //
142   // +-----------------+----------------+-------+-----------------+
143   // |1st hit          |2nd hit         |(pad)  |(compressed) hits|
144   // +-----------------+----------------+-------+-----------------+
145   //
146   // The first two uncompressed hits also implicitly encode information about
147   // the size of the compressed hits region.
148   //
149   // 1. If the posting list is NOT_FULL, then
150   // posting_list_buffer_[0] contains the byte offset of the start of the
151   // compressed hits - and, thus, the size of the compressed hits region is
152   // size_in_bytes - posting_list_buffer_[0].
153   //
154   // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
155   // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
156   // - 1] and ends at size_in_bytes - 1.
157   //
158   // Hit term frequencies are stored after the hit value, compressed or
159   // uncompressed. For the first two special hits, we always have a
160   // space for the term frequency. For hits in the compressed area, we only have
161   // the term frequency following the hit value of hit.has_term_frequency() is
162   // true. This allows good compression in the common case where hits don't have
163   // a valid term frequency.
164   //
165   // EXAMPLE
166   // Posting list storage. Posting list size: 20 bytes
167   // EMPTY!
168   // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
169   // |     20      |Hit::kInvalidVal|                 0x000                  |
170   // +-------------+----------------+----------------+-----------------------+
171   //
172   // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
173   // NOT FULL!
174   // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
175   // |     16      |Hit::kInvalidVal|      0x000      |       0x07FFF998     |
176   // +-------------+----------------+-----------------+----------------------+
177   //
178   // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
179   // TermFrequency=125)
180   // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
181   // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
182   // |      13     |Hit::kInvalidVal|   0x000   | 0x07FFF684| 125  |    788    |
183   // +-------------+----------------+-----------+-----------+------+-----------+
184   //
185   // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
186   // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
187   // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
188   // |      9      |Hit::kInvVal|  0x00  |0x07FFF4D2|   434   | 125  |   788   |
189   // +-------------+------------+--------+----------+---------+------+---------+
190   //
191   // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
192   // TermFrequency = 87)
193   // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL!
194   // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
195   // |Hit::kInvVal|0x07FFF40E,87|  0x000  |    196  |   434   |  125 |   788   |
196   // +-------------+------------+---------+---------+---------+------+---------+
197   //
198   // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
199   // FULL!
200   // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20
201   // -+ | 0x07FFF320  |0x07FFF40E,87|  0x000  |    196   |   434   |  125 | 788
202   // |
203   // +-------------+-------------+---------+----------+---------+------+---------+
PostingListUsed(void * posting_list_buffer,uint32_t size_in_bytes)204   PostingListUsed(void *posting_list_buffer, uint32_t size_in_bytes)
205       : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
206         size_in_bytes_(size_in_bytes) {}
207 
208   // Helpers to determine what state the posting list is in.
full()209   bool full() const {
210     return get_special_hit(0).ValueOrDie().is_valid() &&
211            get_special_hit(1).ValueOrDie().is_valid();
212   }
almost_full()213   bool almost_full() const {
214     return !get_special_hit(0).ValueOrDie().is_valid();
215   }
empty()216   bool empty() const {
217     return get_special_hit(0).ValueOrDie().value() == size_in_bytes_ &&
218            !get_special_hit(1).ValueOrDie().is_valid();
219   }
220 
221   // Returns false if both special hits are invalid or if the offset value
222   // stored in the special hit is less than kSpecialHitsSize or greater than
223   // size_in_bytes_. Returns true, otherwise.
224   bool IsPostingListValid() const;
225 
226   // Prepend hit to a posting list that is in the ALMOST_FULL state.
227   // RETURNS:
228   //  - OK, if successful
229   //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
230   libtextclassifier3::Status PrependHitToAlmostFull(const Hit &hit);
231 
232   // Prepend hit to a posting list that is in the EMPTY state. This will always
233   // succeed because there are no pre-existing hits and no validly constructed
234   // posting list could fail to fit one hit.
235   void PrependHitToEmpty(const Hit &hit);
236 
237   // Prepend hit to a posting list that is in the NOT_FULL state.
238   // RETURNS:
239   //  - OK, if successful
240   //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
241   libtextclassifier3::Status PrependHitToNotFull(const Hit &hit,
242                                                  uint32_t offset);
243 
244   // Reset contents to an empty posting list. This *must* be called if the
245   // posting_list_buffer_ region is uninitialized.
246   void Clear();
247 
248   // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
249   // a byte offset between kSpecialHitsSize and size_in_bytes_ (inclusive)
250   // (not_full state).
251   uint32_t get_start_byte_offset() const;
252 
253   // Sets the special hits to properly reflect what offset is (see layout
254   // comment for further details).
255   //
256   // Returns false if offset > size_in_bytes_ or offset is (kSpecialHitsSize,
257   // sizeof(Hit)) or offset is (sizeof(Hit), 0). True, otherwise.
258   bool set_start_byte_offset(uint32_t offset);
259 
260   // Manipulate padded areas. We never store the same hit value twice
261   // so a delta of 0 is a pad byte.
262 
263   // Returns offset of first non-pad byte.
264   uint32_t GetPadEnd(uint32_t offset) const;
265 
266   // Fill padding between offset start and offset end with 0s.
267   // Returns false if end > size_in_bytes_. True, otherwise.
268   bool PadToEnd(uint32_t start, uint32_t end);
269 
270   // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
271   // hits in the posting list if the posting list contains less than limit
272   // number of hits. out can be NULL.
273   //
274   // NOTE: If called with limit=1, pop=true on a posting list that transitioned
275   // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
276   // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
277   // ALMOST_FULL.
278   //
279   // RETURNS:
280   //   - OK on success
281   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
282   libtextclassifier3::Status GetHitsInternal(uint32_t limit, bool pop,
283                                              std::vector<Hit> *out) const;
284 
285   // Retrieves the value stored in the index-th special hit.
286   //
287   // RETURNS:
288   //   - A valid Hit, on success
289   //   - INVALID_ARGUMENT if index is not less than kNumSpecialHits
290   libtextclassifier3::StatusOr<Hit> get_special_hit(uint32_t index) const;
291 
292   // Sets the value stored in the index-th special hit to val. If index is not
293   // less than kSpecialHitSize / sizeof(Hit), this has no effect.
294   bool set_special_hit(uint32_t index, const Hit &val);
295 
296   // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
297   // returns the new beginning of the padded region.
298   //
299   // RETURNS:
300   //   - The new beginning of the padded region, if successful.
301   //   - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
302   // kSpecialHitsSize
303   libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
304       const Hit &hit, uint32_t offset);
305 
306   // If hit has a term frequency, consumes the term frequency at offset, updates
307   // hit to include the term frequency and updates offset to reflect that the
308   // term frequency has been consumed.
309   //
310   // RETURNS:
311   //   - OK, if successful
312   //   - INVALID_ARGUMENT if hit has a term frequency and offset +
313   //   sizeof(Hit::TermFrequency) >=
314   //     size_in_bytes_
315   libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
316       Hit *hit, uint32_t *offset) const;
317 
318   // A byte array of size size_in_bytes_ containing encoded hits for this
319   // posting list.
320   uint8_t *posting_list_buffer_;  // does not own!
321   uint32_t size_in_bytes_;
322 };
323 
324 // Inlined functions. Implementation details below. Avert eyes!
325 template <class T, Hit (*GetHit)(const T &)>
PrependHitArray(const T * array,uint32_t num_hits,bool keep_prepended)326 uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
327                                           bool keep_prepended) {
328   if (!IsPostingListValid()) {
329     return 0;
330   }
331 
332   // Prepend hits working backwards from array[num_hits - 1].
333   uint32_t i;
334   for (i = 0; i < num_hits; ++i) {
335     if (!PrependHit(GetHit(array[num_hits - i - 1])).ok()) {
336       break;
337     }
338   }
339   if (i != num_hits && !keep_prepended) {
340     // Didn't fit. Undo everything and check that we have the same offset as
341     // before. PopFrontHits guarantees that it will remove all 'i' hits so long
342     // as there are at least 'i' hits in the posting list, which we know there
343     // are.
344     PopFrontHits(i);
345   }
346   return i;
347 }
348 
349 }  // namespace lib
350 }  // namespace icing
351 
352 #endif  // ICING_INDEX_MAIN_POSTING_LIST_USED_H_
353