// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/main/posting-list-used.h"

#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <limits>
#include <vector>

#include "icing/absl_ports/canonical_errors.h"
#include "icing/index/main/posting-list-utils.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-bit-util.h"
#include "icing/util/status-macros.h"

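// Layout overview, as implemented by the code in this file (see also
// posting-list-used.h):
//
// A posting list is a fixed-size byte region whose size is a multiple of
// sizeof(Hit). The first two sizeof(Hit) slots are the "special hits"
// (posting_list_utils::kSpecialHitsSize bytes in total); the remainder is the
// compressed area, which fills from the end of the region toward the front.
// Hits are prepended in strictly decreasing Hit::Value order; every hit except
// the most recently prepended one is stored as a VarInt-encoded delta from the
// next more recent hit, optionally followed by its term frequency byte.
//
// The special hits encode the posting list's state:
//   NOT_FULL:    special hit 0 holds the start byte offset of the first hit
//                (equal to the region size when the list is empty), special
//                hit 1 holds an invalid hit.
//   ALMOST_FULL: special hit 0 is invalid, special hit 1 holds the first hit
//                uncompressed.
//   FULL:        both special hits hold uncompressed hits.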
namespace icing {
namespace lib {

namespace {

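// Returns the number of bytes needed to store hit's term frequency:
// sizeof(Hit::TermFrequency) if the hit has one, 0 otherwise.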
uint32_t GetTermFrequencyByteSize(const Hit &hit) {
  return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
}

}  // namespace

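// Wraps an existing, already-initialized posting list region of size
// size_in_bytes at posting_list_buffer. Returns an error if the buffer is
// null or the size is not a valid posting list size; the region's contents
// are left untouched.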
libtextclassifier3::StatusOr<PostingListUsed>
PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
    void *posting_list_buffer, uint32_t size_in_bytes) {
  ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
  if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Requested posting list size %d is invalid!", size_in_bytes));
  }
  return PostingListUsed(posting_list_buffer, size_in_bytes);
}

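// Same as CreateFromPreexistingPostingListUsedRegion, but also clears the
// region so the returned posting list starts out empty.
//
// Usage sketch (assumes the caller owns a buffer whose size passes
// posting_list_utils::IsValidPostingListSize; 'buffer', 'buffer_size' and
// 'some_hit' are placeholders):
//
//   ICING_ASSIGN_OR_RETURN(
//       PostingListUsed posting_list,
//       PostingListUsed::CreateFromUnitializedRegion(buffer, buffer_size));
//   ICING_RETURN_IF_ERROR(posting_list.PrependHit(some_hit));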
libtextclassifier3::StatusOr<PostingListUsed>
PostingListUsed::CreateFromUnitializedRegion(void *posting_list_buffer,
                                             uint32_t size_in_bytes) {
  ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used,
                         CreateFromPreexistingPostingListUsedRegion(
                             posting_list_buffer, size_in_bytes));
  posting_list_used.Clear();
  return posting_list_used;
}

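// Resets the posting list to the empty state by pointing the start byte
// offset at the very end of the region (no hits stored).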
void PostingListUsed::Clear() {
  // Safe to ignore the return value because size_in_bytes_ is a valid
  // argument.
  set_start_byte_offset(size_in_bytes_);
}

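// Moves the contents of *other into this posting list, leaving *other empty.
// Fails if *other's hits cannot fit in this posting list, or if either
// posting list is in an invalid state.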
libtextclassifier3::Status PostingListUsed::MoveFrom(PostingListUsed *other) {
  ICING_RETURN_ERROR_IF_NULL(other);
  if (other->MinPostingListSizeToFit() > size_in_bytes_) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "other->MinPostingListSizeToFit %d must not be greater than size %d.",
        other->MinPostingListSizeToFit(), size_in_bytes_));
  }

  if (!IsPostingListValid()) {
    return absl_ports::FailedPreconditionError(
        "This posting list is in an invalid state and can't be used!");
  }
  if (!other->IsPostingListValid()) {
    return absl_ports::InvalidArgumentError(
        "Cannot MoveFrom an invalid posting list!");
  }

  // Pop just enough hits that all of other's compressed hits fit in this
  // posting list's compressed area. Then we can memcpy that area.
  std::vector<Hit> hits;
  while (other->full() || other->almost_full() ||
         (size_in_bytes_ - posting_list_utils::kSpecialHitsSize <
          other->BytesUsed())) {
    if (!other->GetHitsInternal(/*limit=*/1, /*pop=*/true, &hits).ok()) {
      return absl_ports::AbortedError(
          "Unable to retrieve hits from other posting list.");
    }
  }

  // memcpy the area and set up the start byte offset.
  Clear();
  memcpy(posting_list_buffer_ + size_in_bytes_ - other->BytesUsed(),
         other->posting_list_buffer_ + other->get_start_byte_offset(),
         other->BytesUsed());
  // We've popped all of other's hits that wouldn't fit in our compressed area,
  // so other->BytesUsed() is now at most size_in_bytes_ - kSpecialHitsSize.
  // That makes this a valid byte offset for the NOT_FULL state, so ignoring
  // the return value is safe.
  set_start_byte_offset(size_in_bytes_ - other->BytesUsed());

  // Put back remaining hits.
  for (size_t i = 0; i < hits.size(); i++) {
    const Hit &hit = hits[hits.size() - i - 1];
    // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
    // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
    // should be impossible because we've already ensured that there is enough
    // room above.
    ICING_RETURN_IF_ERROR(PrependHit(hit));
  }

  other->Clear();
  return libtextclassifier3::Status::OK;
}

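// Returns the byte offset of the first non-pad byte at or after 'offset'.
// Pad bytes are single zero bytes (VarInt encodes the value 0 as one 0 byte).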
uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const {
  Hit::Value pad;
  uint32_t pad_end = offset;
  while (pad_end < size_in_bytes_) {
    size_t pad_len = VarInt::Decode(posting_list_buffer_ + pad_end, &pad);
    if (pad != 0) {
      // No longer a pad.
      break;
    }
    pad_end += pad_len;
  }
  return pad_end;
}

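// Fills [start, end) with zero pad bytes. Returns false (and logs an error)
// if 'end' is past the end of the posting list.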
bool PostingListUsed::PadToEnd(uint32_t start, uint32_t end) {
  if (end > size_in_bytes_) {
    ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
    return false;
  }
  // In VarInt the value 0 encodes to a single 0 byte.
  memset(posting_list_buffer_ + start, 0, end - start);
  return true;
}

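// Prepends 'hit' to a posting list in the ALMOST_FULL state. If the current
// first hit (stored uncompressed in special position 1) can be re-encoded as
// a delta in the pad area, the new hit takes its place and the list stays
// ALMOST_FULL; otherwise the new hit goes into special position 0 and the
// list becomes FULL.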
libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull(
    const Hit &hit) {
  // Get the delta between the first hit and the new hit. Try to fit the delta
  // in the padded area and put the new hit at special position 1.
  // Calling ValueOrDie is safe here because 1 < kNumSpecialHits.
  Hit cur = get_special_hit(1).ValueOrDie();
  if (cur.value() <= hit.value()) {
    return absl_ports::InvalidArgumentError(
        "Hit being prepended must be strictly less than the most recent Hit");
  }
  uint64_t delta = cur.value() - hit.value();
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len = VarInt::Encode(delta, delta_buf);
  uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);

  uint32_t pad_end = GetPadEnd(posting_list_utils::kSpecialHitsSize);

  if (pad_end >= posting_list_utils::kSpecialHitsSize + delta_len +
                     cur_term_frequency_bytes) {
    // The pad area has enough space for the delta and the term_frequency of
    // the existing hit (cur). Write the delta at
    // pad_end - delta_len - cur_term_frequency_bytes.
    uint8_t *delta_offset =
        posting_list_buffer_ + pad_end - delta_len - cur_term_frequency_bytes;
    memcpy(delta_offset, delta_buf, delta_len);
    // Now copy the term_frequency.
    Hit::TermFrequency term_frequency = cur.term_frequency();
    uint8_t *term_frequency_offset = delta_offset + delta_len;
    memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);

    // Now the first hit is the new hit, at special position 1. Safe to ignore
    // the return value because 1 < kNumSpecialHits.
    set_special_hit(1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    set_start_byte_offset(sizeof(Hit));
  } else {
    // No space for the delta. We put the new hit at special position 0 and go
    // to the FULL state. Safe to ignore the return value because
    // 0 < kNumSpecialHits.
    set_special_hit(0, hit);
  }
  return libtextclassifier3::Status::OK;
}

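// Prepends 'hit' to an empty posting list. For the minimum-sized posting list
// (kSpecialHitsSize bytes), the hit goes directly into special position 1;
// otherwise it is written uncompressed at the end of the compressed area.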
void PostingListUsed::PrependHitToEmpty(const Hit &hit) {
  // First hit to be added. Just add it verbatim, no compression.
  if (size_in_bytes_ == posting_list_utils::kSpecialHitsSize) {
    // Safe to ignore the return value because 1 < kNumSpecialHits.
    set_special_hit(1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    set_start_byte_offset(sizeof(Hit));
  } else {
    // Since this is the first hit, size != kSpecialHitsSize and
    // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
    // the compressed region, so ValueOrDie is safe.
    uint32_t offset = PrependHitUncompressed(hit, size_in_bytes_).ValueOrDie();
    // Safe to ignore the return value because PrependHitUncompressed is
    // guaranteed to return a valid offset.
    set_start_byte_offset(offset);
  }
}

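// Prepends 'hit' to a posting list in the NOT_FULL state, where the first hit
// is stored uncompressed at 'offset'. Depending on how much room remains, the
// existing first hit is re-encoded as a delta and the list stays NOT_FULL,
// moves to ALMOST_FULL, or (rarely) becomes FULL.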
libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
    const Hit &hit, uint32_t offset) {
  // The first hit in the compressed area is uncompressed. See if the delta
  // between the first hit and the new hit will still fit in the compressed
  // area.
  if (offset + sizeof(Hit::Value) > size_in_bytes_) {
    // The first hit in the compressed region *should* be uncompressed, but
    // somehow there isn't enough room between offset and the end of the
    // compressed area to fit an uncompressed hit. This should NEVER happen.
    return absl_ports::FailedPreconditionError(
        "Posting list is in an invalid state.");
  }
  Hit::Value cur_value;
  memcpy(&cur_value, posting_list_buffer_ + offset, sizeof(Hit::Value));
  if (cur_value <= hit.value()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Hit %d being prepended must be strictly less than the most recent "
        "Hit %d",
        hit.value(), cur_value));
  }
  uint64_t delta = cur_value - hit.value();
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len = VarInt::Encode(delta, delta_buf);
  uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);

  // offset now points to one past the end of the first hit.
  offset += sizeof(Hit::Value);
  if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
          hit_term_frequency_bytes <=
      offset) {
    // Enough space for the delta in the compressed area.

    // Prepend the delta.
    offset -= delta_len;
    memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);

    // Prepend the new hit with (possibly) its term_frequency. We know that
    // there is room for 'hit' because of the if statement above, so calling
    // ValueOrDie is safe.
    offset = PrependHitUncompressed(hit, offset).ValueOrDie();
    // offset is guaranteed to be valid here, so it's safe to ignore the return
    // value. The if above guarantees that offset >= kSpecialHitsSize and
    // < size_in_bytes_ because it ensures that there is enough room between
    // offset and kSpecialHitsSize to fit the delta of the previous hit, any
    // term_frequency and the uncompressed hit.
    set_start_byte_offset(offset);
  } else if (posting_list_utils::kSpecialHitsSize + delta_len <= offset) {
    // Only have space for the delta. The new hit must be put in special
    // position 1.

    // Prepend the delta.
    offset -= delta_len;
    memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);

    // Prepend pad. Safe to ignore the return value of PadToEnd because offset
    // must be less than size_in_bytes_. Otherwise, this function already would
    // have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_utils::kSpecialHitsSize, offset);

    // Put the new hit in special position 1. Safe to ignore the return value
    // because 1 < kNumSpecialHits.
    set_special_hit(1, hit);

    // State ALMOST_FULL. Safe to ignore the return value because sizeof(Hit)
    // is a valid argument.
    set_start_byte_offset(sizeof(Hit));
  } else {
    // Very rare case where the delta is larger than sizeof(Hit::Value)
    // (i.e. varint delta encoding expanded the required storage). We move the
    // first hit to special position 1 and put the new hit in special
    // position 0.
    Hit cur(cur_value);
    // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
    // Therefore, offset must be less than kSpecialHitsSize + 5. Since the
    // posting list size must be divisible by sizeof(Hit) (5), it is guaranteed
    // that offset < size_in_bytes_, so it is safe to ignore the return value
    // here.
    ConsumeTermFrequencyIfPresent(&cur, &offset);
    // Safe to ignore the return value of PadToEnd because offset must be less
    // than size_in_bytes_. Otherwise, this function already would have
    // returned FAILED_PRECONDITION.
    PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
    // Safe to ignore the return values here because 0 and 1 are both
    // < kNumSpecialHits.
    set_special_hit(1, cur);
    set_special_hit(0, hit);
  }
  return libtextclassifier3::Status::OK;
}

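// Prepends 'hit' to the posting list. 'hit' must be valid and strictly less
// than the most recently prepended hit. Returns RESOURCE_EXHAUSTED if the
// posting list is already full.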
libtextclassifier3::Status PostingListUsed::PrependHit(const Hit &hit) {
  static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
                "Hit::Value cannot be larger than 8 bytes because the delta "
                "must be able to fit in 8 bytes.");
  if (!hit.is_valid()) {
    return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
  }
  if (!IsPostingListValid()) {
    return absl_ports::FailedPreconditionError(
        "This PostingListUsed is in an invalid state and can't add any hits!");
  }

  if (full()) {
    // State full: no space left.
    return absl_ports::ResourceExhaustedError("No more room for hits");
  } else if (almost_full()) {
    return PrependHitToAlmostFull(hit);
  } else if (empty()) {
    PrependHitToEmpty(hit);
    return libtextclassifier3::Status::OK;
  } else {
    uint32_t offset = get_start_byte_offset();
    return PrependHitToNotFull(hit, offset);
  }
}

libtextclassifier3::StatusOr<std::vector<Hit>> PostingListUsed::GetHits()
    const {
  std::vector<Hit> hits_out;
  ICING_RETURN_IF_ERROR(GetHits(&hits_out));
  return hits_out;
}

libtextclassifier3::Status PostingListUsed::GetHits(
    std::vector<Hit> *hits_out) const {
  return GetHitsInternal(/*limit=*/std::numeric_limits<uint32_t>::max(),
                         /*pop=*/false, hits_out);
}

libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) {
  if (num_hits == 1 && full()) {
    // The PL is in the FULL state, which means that we store 2 uncompressed
    // hits in the 2 special positions. But the FULL state may be reached from
    // 2 different states.
    // (1) From the ALMOST_FULL state
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // When we prepend another hit, we can only put it at special position 0,
    // and we get a FULL PL:
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2) From the NOT_FULL state
    // +-----------------+----------------+------+-------+------------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
    // +-----------------+----------------+------+-------+------------------+
    // When we prepend another hit, we can reach any of the 3 following
    // scenarios:
    // (2.1) NOT_FULL
    // If the space of the pad and the original 1st hit can accommodate the
    // new 1st hit and the encoded delta value:
    // +-----------------+----------------+------+-----------+-----------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
    // +-----------------+----------------+------+-----------+-----------------+
    // (2.2) ALMOST_FULL
    // If the space of the pad and the original 1st hit cannot accommodate
    // both the new 1st hit and the encoded delta value, but can accommodate
    // the encoded delta value alone, we put the new 1st hit at special
    // position 1:
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |new 1st hit     |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2.3) FULL
    // In the very rare case that it cannot accommodate even the encoded delta
    // value alone, we move the original 1st hit into special position 1 and
    // the new 1st hit into special position 0. This can happen because VarInt
    // encoding may make the encoded value longer (about 4/3 times the
    // original):
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // Suppose the PL is now FULL, but we don't know whether it arrived at
    // this state from NOT_FULL as in (2.3) or from ALMOST_FULL as in (1). If
    // we simply popped the new 1st hit we would always end up in the
    // ALMOST_FULL state as in (1), but we want the prepend operation to be
    // reversible, so there must be some way to return to NOT_FULL when that
    // was the prior state. A simple way to do this is to pop 2 hits off the
    // PL, leaving it ALMOST_FULL or NOT_FULL, and then prepend the original
    // 1st hit back. That returns us to the correct original state, (2.1) or
    // (1), which makes the prepend operation reversible.
    std::vector<Hit> out;

    // Popping 2 hits should never fail because we've just ensured that the
    // posting list is in the FULL state.
    ICING_RETURN_IF_ERROR(GetHitsInternal(/*limit=*/2, /*pop=*/true, &out));

    // PrependHit should never fail because out[1] is a valid hit less than
    // previous hits in the posting list, and because the posting list cannot
    // run out of room given that it previously stored this hit AND another
    // hit.
    PrependHit(out[1]);
  } else if (num_hits > 0) {
    return GetHitsInternal(/*limit=*/num_hits, /*pop=*/true, nullptr);
  }
  return libtextclassifier3::Status::OK;
}

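// Traverses hits from the most recently prepended to the least recently
// prepended, stopping after 'limit' hits or at the end of the posting list.
// Traversed hits are appended to 'out' if it is non-null; if 'pop' is true,
// they are also removed from the posting list.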
libtextclassifier3::Status PostingListUsed::GetHitsInternal(
    uint32_t limit, bool pop, std::vector<Hit> *out) const {
  // Put the current uncompressed value here.
  Hit::Value val = Hit::kInvalidValue;
  uint32_t offset = get_start_byte_offset();
  uint32_t count = 0;

  // First traverse the first two special positions.
  while (count < limit && offset < posting_list_utils::kSpecialHitsSize) {
    // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
    // kNumSpecialHits because of the check above.
    Hit hit = get_special_hit(offset / sizeof(Hit)).ValueOrDie();
    val = hit.value();
    if (out != nullptr) {
      out->push_back(hit);
    }
    offset += sizeof(Hit);
    count++;
  }

  // If special position 1 was set then we need to skip padding.
  if (val != Hit::kInvalidValue &&
      offset == posting_list_utils::kSpecialHitsSize) {
    offset = GetPadEnd(offset);
  }

  while (count < limit && offset < size_in_bytes_) {
    if (val == Hit::kInvalidValue) {
      // First hit is in the compressed area. Put that in val.
      memcpy(&val, posting_list_buffer_ + offset, sizeof(Hit::Value));
      offset += sizeof(Hit::Value);
    } else {
      // Now we have delta-encoded subsequent hits. Decode and push.
      uint64_t delta;
      offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
      val += delta;
    }
    Hit hit(val);
    libtextclassifier3::Status status =
        ConsumeTermFrequencyIfPresent(&hit, &offset);
    if (!status.ok()) {
      // This posting list has been corrupted somehow. A hit claims to have a
      // term frequency, but there's no more room in the posting list for that
      // term frequency to exist. Clear the output and return an error; no
      // hits are retrieved.
      if (out != nullptr) {
        out->clear();
      }
      return absl_ports::InternalError("Posting list has been corrupted!");
    }
    if (out != nullptr) {
      out->push_back(hit);
    }
    count++;
  }

  if (pop) {
    PostingListUsed *mutable_this = const_cast<PostingListUsed *>(this);
    // Modify the posting list so that we pop all hits actually traversed.
    if (offset >= posting_list_utils::kSpecialHitsSize &&
        offset < size_in_bytes_) {
      // In the compressed area. Pop and reconstruct. offset/val is the last
      // traversed hit, which we must discard. So move one more forward.
      uint64_t delta;
      offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
      val += delta;

      // Now val is the first hit of the new posting list.
      if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
        // val fits in the compressed area. Simply copy.
        offset -= sizeof(Hit::Value);
        memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
      } else {
        // val won't fit in the compressed area. Also see if there is a
        // term_frequency.
        Hit hit(val);
        libtextclassifier3::Status status =
            ConsumeTermFrequencyIfPresent(&hit, &offset);
        if (!status.ok()) {
          // This posting list has been corrupted somehow. The first hit of
          // the posting list claims to have a term frequency, but there's no
          // more room in the posting list for that term frequency to exist.
          // Clear the output and return an error; do not pop anything.
          if (out != nullptr) {
            out->clear();
          }
          return absl_ports::InternalError("Posting list has been corrupted!");
        }
        // Okay to ignore the return value here because 1 < kNumSpecialHits.
        mutable_this->set_special_hit(1, hit);

        // Prepend pad. Safe to ignore the return value of PadToEnd because
        // offset must be less than size_in_bytes_ thanks to the if above.
        mutable_this->PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
        offset = sizeof(Hit);
      }
    }
    // offset is guaranteed to be valid, so ignoring the return value of
    // set_start_byte_offset is safe. It falls into one of four scenarios:
    // Scenario 1: the above if was false because offset is not < size_in_bytes_
    //   In this case, offset must be == size_in_bytes_ because we reached
    //   offset by unwinding hits on the posting list.
    // Scenario 2: offset is < kSpecialHitsSize
    //   In this case, offset is guaranteed to be either 0 or sizeof(Hit)
    //   because offset is incremented by sizeof(Hit) within the first while
    //   loop.
    // Scenario 3: offset is within the compressed region and the new first hit
    //   in the posting list (the value that 'val' holds) will fit as an
    //   uncompressed hit in the compressed region. The resulting offset from
    //   decompressing val must be >= kSpecialHitsSize because otherwise we'd
    //   be in Scenario 4.
    // Scenario 4: offset is within the compressed region, but the new first
    //   hit in the posting list is too large to fit as an uncompressed hit in
    //   the compressed region. Therefore, it must be stored in a special hit
    //   and offset will be sizeof(Hit).
    mutable_this->set_start_byte_offset(offset);
  }

  return libtextclassifier3::Status::OK;
}

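// Returns the hit stored at special position 'index' (0 or 1). Returns
// INVALID_ARGUMENT for any other index.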
libtextclassifier3::StatusOr<Hit> PostingListUsed::get_special_hit(
    uint32_t index) const {
  static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
  if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
    return absl_ports::InvalidArgumentError(
        "Special hits only exist at indices 0 and 1");
  }
  Hit val;
  memcpy(&val, posting_list_buffer_ + index * sizeof(val), sizeof(val));
  return val;
}

bool PostingListUsed::set_special_hit(uint32_t index, const Hit &val) {
  if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
    ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
    return false;
  }
  memcpy(posting_list_buffer_ + index * sizeof(val), &val, sizeof(val));
  return true;
}

uint32_t PostingListUsed::BytesUsed() const {
  // The special hits will be included if they represent actual hits. If they
  // represent the hit offset or the invalid hit sentinel, they are not
  // included.
  return size_in_bytes_ - get_start_byte_offset();
}

uint32_t PostingListUsed::MinPostingListSizeToFit() const {
  if (full() || almost_full()) {
    // If in either the FULL state or ALMOST_FULL state, this posting list *is*
    // the minimum size posting list that can fit these hits. So just return the
    // size of the posting list.
    return size_in_bytes_;
  }

  // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
  // posting list that would be guaranteed to fit these hits would be
  // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
  // special_hit(1) and the n compressed hits in the compressed region.
  // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
  // fitting these hits into a posting list would require BytesUsed plus one
  // extra hit.
  return BytesUsed() + sizeof(Hit);
}

bool PostingListUsed::IsPostingListValid() const {
  if (almost_full()) {
    // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
    // know that 1 < kNumSpecialHits.
    if (!get_special_hit(1).ValueOrDie().is_valid()) {
      ICING_LOG(ERROR)
          << "Both special hits cannot be invalid at the same time.";
      return false;
    }
  } else if (!full()) {
    // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
    // safe because we know that 0 < kNumSpecialHits.
    if (get_special_hit(0).ValueOrDie().value() > size_in_bytes_ ||
        get_special_hit(0).ValueOrDie().value() <
            posting_list_utils::kSpecialHitsSize) {
      ICING_LOG(ERROR) << "Hit: " << get_special_hit(0).ValueOrDie().value()
                       << " size: " << size_in_bytes_
                       << " sp size: " << posting_list_utils::kSpecialHitsSize;
      return false;
    }
  }
  return true;
}

uint32_t PostingListUsed::get_start_byte_offset() const {
  if (full()) {
    return 0;
  } else if (almost_full()) {
    return sizeof(Hit);
  } else {
    // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
    // kNumSpecialHits.
    return get_special_hit(0).ValueOrDie().value();
  }
}

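// Records the offset of the first hit, encoding the posting list's state in
// the special hits: an offset >= kSpecialHitsSize means NOT_FULL, sizeof(Hit)
// means ALMOST_FULL, and 0 means FULL. Returns false for offsets that don't
// correspond to any valid state.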
bool PostingListUsed::set_start_byte_offset(uint32_t offset) {
  if (offset > size_in_bytes_) {
    ICING_LOG(ERROR) << "offset cannot be a value greater than size "
                     << size_in_bytes_ << ". offset is " << offset << ".";
    return false;
  }
  if (offset < posting_list_utils::kSpecialHitsSize && offset > sizeof(Hit)) {
    ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
                     << ", " << posting_list_utils::kSpecialHitsSize
                     << "). offset is " << offset << ".";
    return false;
  }
  if (offset < sizeof(Hit) && offset != 0) {
    ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
                     << "). offset is " << offset << ".";
    return false;
  }
  if (offset >= posting_list_utils::kSpecialHitsSize) {
    // NOT_FULL state. Safe to ignore the return values because 0 and 1 are
    // both < kNumSpecialHits.
    set_special_hit(0, Hit(offset));
    set_special_hit(1, Hit());
  } else if (offset == sizeof(Hit)) {
    // ALMOST_FULL state. Safe to ignore the return value because
    // 0 < kNumSpecialHits.
    set_special_hit(0, Hit());
  }
  // Nothing to do for the FULL state - the offset isn't actually stored
  // anywhere and both special hits hold valid hits.
  return true;
}

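// Writes 'hit' uncompressed so that it ends at 'offset': the whole Hit (value
// plus term frequency) if the hit carries a term frequency, otherwise just
// the Hit::Value. Returns the new, smaller offset at which the hit begins, or
// INVALID_ARGUMENT if there isn't enough room above the special hits.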
libtextclassifier3::StatusOr<uint32_t> PostingListUsed::PrependHitUncompressed(
    const Hit &hit, uint32_t offset) {
  if (hit.has_term_frequency()) {
    if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit)) {
      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
          "Not enough room to prepend Hit at offset %d.", offset));
    }
    offset -= sizeof(Hit);
    memcpy(posting_list_buffer_ + offset, &hit, sizeof(Hit));
  } else {
    if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value)) {
      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
          "Not enough room to prepend Hit::Value at offset %d.", offset));
    }
    offset -= sizeof(Hit::Value);
    Hit::Value val = hit.value();
    memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
  }
  return offset;
}

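// If 'hit' has a term frequency, reads it from *offset, stores it in 'hit',
// and advances *offset past it. Returns INVALID_ARGUMENT if the term
// frequency would extend past the end of the posting list.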
libtextclassifier3::Status PostingListUsed::ConsumeTermFrequencyIfPresent(
    Hit *hit, uint32_t *offset) const {
  if (!hit->has_term_frequency()) {
    // No term frequency to consume. Everything is fine.
    return libtextclassifier3::Status::OK;
  }
  if (*offset + sizeof(Hit::TermFrequency) > size_in_bytes_) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "offset %d must not point past the end of the posting list of size %d.",
        *offset, size_in_bytes_));
  }
  Hit::TermFrequency term_frequency;
  memcpy(&term_frequency, posting_list_buffer_ + *offset,
         sizeof(Hit::TermFrequency));
  *hit = Hit(hit->value(), term_frequency);
  *offset += sizeof(Hit::TermFrequency);
  return libtextclassifier3::Status::OK;
}

}  // namespace lib
}  // namespace icing