1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/main/posting-list-used.h"
16
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstdint>
20 #include <limits>
21
22 #include "icing/absl_ports/canonical_errors.h"
23 #include "icing/index/main/posting-list-utils.h"
24 #include "icing/legacy/core/icing-string-util.h"
25 #include "icing/legacy/index/icing-bit-util.h"
26 #include "icing/util/status-macros.h"
27
28 namespace icing {
29 namespace lib {
30
31 namespace {
32
GetTermFrequencyByteSize(const Hit & hit)33 uint32_t GetTermFrequencyByteSize(const Hit &hit) {
34 return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
35 }
36
37 } // namespace
38
39 libtextclassifier3::StatusOr<PostingListUsed>
CreateFromPreexistingPostingListUsedRegion(void * posting_list_buffer,uint32_t size_in_bytes)40 PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
41 void *posting_list_buffer, uint32_t size_in_bytes) {
42 ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
43 if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
44 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
45 "Requested posting list size %d is invalid!", size_in_bytes));
46 }
47 return PostingListUsed(posting_list_buffer, size_in_bytes);
48 }
49
50 libtextclassifier3::StatusOr<PostingListUsed>
CreateFromUnitializedRegion(void * posting_list_buffer,uint32_t size_in_bytes)51 PostingListUsed::CreateFromUnitializedRegion(void *posting_list_buffer,
52 uint32_t size_in_bytes) {
53 ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used,
54 CreateFromPreexistingPostingListUsedRegion(
55 posting_list_buffer, size_in_bytes));
56 posting_list_used.Clear();
57 return posting_list_used;
58 }
59
Clear()60 void PostingListUsed::Clear() {
61 // Safe to ignore return value because size_in_bytes_ a valid argument.
62 set_start_byte_offset(size_in_bytes_);
63 }
64
MoveFrom(PostingListUsed * other)65 libtextclassifier3::Status PostingListUsed::MoveFrom(PostingListUsed *other) {
66 ICING_RETURN_ERROR_IF_NULL(other);
67 if (other->MinPostingListSizeToFit() > size_in_bytes_) {
68 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
69 "other->MinPostingListSizeToFit %d must be larger than size %d.",
70 other->MinPostingListSizeToFit(), size_in_bytes_));
71 }
72
73 if (!IsPostingListValid()) {
74 return absl_ports::FailedPreconditionError(
75 "This posting list is in an invalid state and can't be used!");
76 }
77 if (!other->IsPostingListValid()) {
78 return absl_ports::InvalidArgumentError(
79 "Cannot MoveFrom an invalid posting list!");
80 }
81
82 // Pop just enough hits that all of other's compressed hits fit in
83 // this posting_list's compressed area. Then we can memcpy that area.
84 std::vector<Hit> hits;
85 while (other->full() || other->almost_full() ||
86 (size_in_bytes_ - posting_list_utils::kSpecialHitsSize <
87 other->BytesUsed())) {
88 if (!other->GetHitsInternal(/*limit=*/1, /*pop=*/true, &hits).ok()) {
89 return absl_ports::AbortedError(
90 "Unable to retrieve hits from other posting list.");
91 }
92 }
93
94 // memcpy the area and set up start byte offset.
95 Clear();
96 memcpy(posting_list_buffer_ + size_in_bytes_ - other->BytesUsed(),
97 other->posting_list_buffer_ + other->get_start_byte_offset(),
98 other->BytesUsed());
99 // Because we popped all hits from other outside of the compressed area and we
100 // guaranteed that other->BytesUsed is less than size_in_bytes_ -
101 // kSpecialHitSize. This is guaranteed to be a valid byte offset for the
102 // NOT_FULL state, so ignoring the value is safe.
103 set_start_byte_offset(size_in_bytes_ - other->BytesUsed());
104
105 // Put back remaining hits.
106 for (size_t i = 0; i < hits.size(); i++) {
107 const Hit &hit = hits[hits.size() - i - 1];
108 // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
109 // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
110 // should be impossible because we've already assured that there is enough
111 // room above.
112 ICING_RETURN_IF_ERROR(PrependHit(hit));
113 }
114
115 other->Clear();
116 return libtextclassifier3::Status::OK;
117 }
118
GetPadEnd(uint32_t offset) const119 uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const {
120 Hit::Value pad;
121 uint32_t pad_end = offset;
122 while (pad_end < size_in_bytes_) {
123 size_t pad_len = VarInt::Decode(posting_list_buffer_ + pad_end, &pad);
124 if (pad != 0) {
125 // No longer a pad.
126 break;
127 }
128 pad_end += pad_len;
129 }
130 return pad_end;
131 }
132
PadToEnd(uint32_t start,uint32_t end)133 bool PostingListUsed::PadToEnd(uint32_t start, uint32_t end) {
134 if (end > size_in_bytes_) {
135 ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
136 return false;
137 }
138 // In VarInt a value of 0 encodes to 0.
139 memset(posting_list_buffer_ + start, 0, end - start);
140 return true;
141 }
142
PrependHitToAlmostFull(const Hit & hit)143 libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull(
144 const Hit &hit) {
145 // Get delta between first hit and the new hit. Try to fit delta
146 // in the padded area and put new hit at the special position 1.
147 // Calling ValueOrDie is safe here because 1 < kNumSpecialHits.
148 Hit cur = get_special_hit(1).ValueOrDie();
149 if (cur.value() <= hit.value()) {
150 return absl_ports::InvalidArgumentError(
151 "Hit being prepended must be strictly less than the most recent Hit");
152 }
153 uint64_t delta = cur.value() - hit.value();
154 uint8_t delta_buf[VarInt::kMaxEncodedLen64];
155 size_t delta_len = VarInt::Encode(delta, delta_buf);
156 uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);
157
158 uint32_t pad_end = GetPadEnd(posting_list_utils::kSpecialHitsSize);
159
160 if (pad_end >= posting_list_utils::kSpecialHitsSize + delta_len +
161 cur_term_frequency_bytes) {
162 // Pad area has enough space for delta and term_frequency of existing hit
163 // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes.
164 uint8_t *delta_offset =
165 posting_list_buffer_ + pad_end - delta_len - cur_term_frequency_bytes;
166 memcpy(delta_offset, delta_buf, delta_len);
167 // Now copy term_frequency.
168 Hit::TermFrequency term_frequency = cur.term_frequency();
169 uint8_t *term_frequency_offset = delta_offset + delta_len;
170 memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);
171
172 // Now first hit is the new hit, at special position 1. Safe to ignore the
173 // return value because 1 < kNumSpecialHits.
174 set_special_hit(1, hit);
175 // Safe to ignore the return value because sizeof(Hit) is a valid argument.
176 set_start_byte_offset(sizeof(Hit));
177 } else {
178 // No space for delta. We put the new hit at special position 0
179 // and go to the full state. Safe to ignore the return value because 1 <
180 // kNumSpecialHits.
181 set_special_hit(0, hit);
182 }
183 return libtextclassifier3::Status::OK;
184 }
185
PrependHitToEmpty(const Hit & hit)186 void PostingListUsed::PrependHitToEmpty(const Hit &hit) {
187 // First hit to be added. Just add verbatim, no compression.
188 if (size_in_bytes_ == posting_list_utils::kSpecialHitsSize) {
189 // Safe to ignore the return value because 1 < kNumSpecialHits
190 set_special_hit(1, hit);
191 // Safe to ignore the return value because sizeof(Hit) is a valid argument.
192 set_start_byte_offset(sizeof(Hit));
193 } else {
194 // Since this is the first hit, size != kSpecialHitsSize and
195 // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
196 // the compressed region, so ValueOrDie is safe.
197 uint32_t offset = PrependHitUncompressed(hit, size_in_bytes_).ValueOrDie();
198 // Safe to ignore the return value because PrependHitUncompressed is
199 // guaranteed to return a valid offset.
200 set_start_byte_offset(offset);
201 }
202 }
203
PrependHitToNotFull(const Hit & hit,uint32_t offset)204 libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
205 const Hit &hit, uint32_t offset) {
206 // First hit in compressed area. It is uncompressed. See if delta
207 // between the first hit and new hit will still fit in the
208 // compressed area.
209 if (offset + sizeof(Hit::Value) > size_in_bytes_) {
210 // The first hit in the compressed region *should* be uncompressed, but
211 // somehow there isn't enough room between offset and the end of the
212 // compressed area to fit an uncompressed hit. This should NEVER happen.
213 return absl_ports::FailedPreconditionError(
214 "Posting list is in an invalid state.");
215 }
216 Hit::Value cur_value;
217 memcpy(&cur_value, posting_list_buffer_ + offset, sizeof(Hit::Value));
218 if (cur_value <= hit.value()) {
219 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
220 "Hit %d being prepended must be strictly less than the most recent "
221 "Hit %d",
222 hit.value(), cur_value));
223 }
224 uint64_t delta = cur_value - hit.value();
225 uint8_t delta_buf[VarInt::kMaxEncodedLen64];
226 size_t delta_len = VarInt::Encode(delta, delta_buf);
227 uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);
228
229 // offset now points to one past the end of the first hit.
230 offset += sizeof(Hit::Value);
231 if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
232 hit_term_frequency_bytes <=
233 offset) {
234 // Enough space for delta in compressed area.
235
236 // Prepend delta.
237 offset -= delta_len;
238 memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
239
240 // Prepend new hit with (possibly) its term_frequency. We know that there is
241 // room for 'hit' because of the if statement above, so calling ValueOrDie
242 // is safe.
243 offset = PrependHitUncompressed(hit, offset).ValueOrDie();
244 // offset is guaranteed to be valid here. So it's safe to ignore the return
245 // value. The if above will guarantee that offset >= kSpecialHitSize and <
246 // size_in_bytes_ because the if ensures that there is enough room between
247 // offset and kSpecialHitSize to fit the delta of the previous hit, any
248 // term_frequency and the uncompressed hit.
249 set_start_byte_offset(offset);
250 } else if (posting_list_utils::kSpecialHitsSize + delta_len <= offset) {
251 // Only have space for delta. The new hit must be put in special
252 // position 1.
253
254 // Prepend delta.
255 offset -= delta_len;
256 memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
257
258 // Prepend pad. Safe to ignore the return value of PadToEnd because offset
259 // must be less than size_in_bytes_. Otherwise, this function already would
260 // have returned FAILED_PRECONDITION.
261 PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
262
263 // Put new hit in special position 1. Safe to ignore return value because 1
264 // < kNumSpecialHits.
265 set_special_hit(1, hit);
266
267 // State almost_full. Safe to ignore the return value because sizeof(Hit) is
268 // a valid argument.
269 set_start_byte_offset(sizeof(Hit));
270 } else {
271 // Very rare case where delta is larger than sizeof(Hit::Value)
272 // (i.e. varint delta encoding expanded required storage). We
273 // move first hit to special position 1 and put new hit in
274 // special position 0.
275 Hit cur(cur_value);
276 // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
277 // Therefore, offset must be less than kSpecialHitSize + 5. Since posting
278 // list size must be divisible by sizeof(Hit) (5), it is guaranteed that
279 // offset < size_in_bytes, so it is safe to ignore the return value here.
280 ConsumeTermFrequencyIfPresent(&cur, &offset);
281 // Safe to ignore the return value of PadToEnd because offset must be less
282 // than size_in_bytes_. Otherwise, this function already would have returned
283 // FAILED_PRECONDITION.
284 PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
285 // Safe to ignore the return value here because 0 and 1 < kNumSpecialHits.
286 set_special_hit(1, cur);
287 set_special_hit(0, hit);
288 }
289 return libtextclassifier3::Status::OK;
290 }
291
PrependHit(const Hit & hit)292 libtextclassifier3::Status PostingListUsed::PrependHit(const Hit &hit) {
293 static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
294 "Hit::Value cannot be larger than 8 bytes because the delta "
295 "must be able to fit in 8 bytes.");
296 if (!hit.is_valid()) {
297 return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
298 }
299 if (!IsPostingListValid()) {
300 return absl_ports::FailedPreconditionError(
301 "This PostingListUsed is in an invalid state and can't add any hits!");
302 }
303
304 if (full()) {
305 // State full: no space left.
306 return absl_ports::ResourceExhaustedError("No more room for hits");
307 } else if (almost_full()) {
308 return PrependHitToAlmostFull(hit);
309 } else if (empty()) {
310 PrependHitToEmpty(hit);
311 return libtextclassifier3::Status::OK;
312 } else {
313 uint32_t offset = get_start_byte_offset();
314 return PrependHitToNotFull(hit, offset);
315 }
316 }
317
GetHits() const318 libtextclassifier3::StatusOr<std::vector<Hit>> PostingListUsed::GetHits()
319 const {
320 std::vector<Hit> hits_out;
321 ICING_RETURN_IF_ERROR(GetHits(&hits_out));
322 return hits_out;
323 }
324
GetHits(std::vector<Hit> * hits_out) const325 libtextclassifier3::Status PostingListUsed::GetHits(
326 std::vector<Hit> *hits_out) const {
327 return GetHitsInternal(/*limit=*/std::numeric_limits<uint32_t>::max(),
328 /*pop=*/false, hits_out);
329 }
330
PopFrontHits(uint32_t num_hits)331 libtextclassifier3::Status PostingListUsed::PopFrontHits(uint32_t num_hits) {
332 if (num_hits == 1 && full()) {
333 // The PL is in full status which means that we save 2 uncompressed hits in
334 // the 2 special postions. But full status may be reached by 2 different
335 // statuses.
336 // (1) In "almost full" status
337 // +-----------------+----------------+-------+-----------------+
338 // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
339 // +-----------------+----------------+-------+-----------------+
340 // When we prepend another hit, we can only put it at the special
341 // position 0. And we get a full PL
342 // +-----------------+----------------+-------+-----------------+
343 // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
344 // +-----------------+----------------+-------+-----------------+
345 // (2) In "not full" status
346 // +-----------------+----------------+------+-------+------------------+
347 // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
348 // +-----------------+----------------+------+-------+------------------+
349 // When we prepend another hit, we can reach any of the 3 following
350 // scenarios:
351 // (2.1) not full
352 // if the space of pad and original 1st hit can accommodate the new 1st hit
353 // and the encoded delta value.
354 // +-----------------+----------------+------+-----------+-----------------+
355 // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
356 // +-----------------+----------------+------+-----------+-----------------+
357 // (2.2) almost full
358 // If the space of pad and original 1st hit cannot accommodate the new 1st
359 // hit and the encoded delta value but can accommodate the encoded delta
360 // value only. We can put the new 1st hit at special position 1.
361 // +-----------------+----------------+-------+-----------------+
362 // |Hit::kInvalidVal |new 1st hit |(pad) |(compressed) hits|
363 // +-----------------+----------------+-------+-----------------+
364 // (2.3) full
365 // In very rare case, it cannot even accommodate only the encoded delta
366 // value. we can move the original 1st hit into special position 1 and the
367 // new 1st hit into special position 0. This may happen because we use
368 // VarInt encoding method which may make the encoded value longer (about
369 // 4/3 times of original)
370 // +-----------------+----------------+-------+-----------------+
371 // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
372 // +-----------------+----------------+-------+-----------------+
373 // Suppose now the PL is full. But we don't know whether it arrived to
374 // this status from "not full" like (2.3) or from "almost full" like (1).
375 // We'll return to "almost full" status like (1) if we simply pop the new
376 // 1st hit but we want to make the prepending operation "reversible". So
377 // there should be some way to return to "not full" if possible. A simple
378 // way to do it is to pop 2 hits out of the PL to status "almost full" or
379 // "not full". And add the original 1st hit back. We can return to the
380 // correct original statuses of (2.1) or (1). This makes our prepending
381 // operation reversible.
382 std::vector<Hit> out;
383
384 // Popping 2 hits should never fail because we've just ensured that the
385 // posting list is in the FULL state.
386 ICING_RETURN_IF_ERROR(GetHitsInternal(/*limit=*/2, /*pop=*/true, &out));
387
388 // PrependHit should never fail because out[1] is a valid hit less than
389 // previous hits in the posting list and because there's no way that the
390 // posting list could run out of room because it previously stored this hit
391 // AND another hit.
392 PrependHit(out[1]);
393 } else if (num_hits > 0) {
394 return GetHitsInternal(/*limit=*/num_hits, /*pop=*/true, nullptr);
395 }
396 return libtextclassifier3::Status::OK;
397 }
398
GetHitsInternal(uint32_t limit,bool pop,std::vector<Hit> * out) const399 libtextclassifier3::Status PostingListUsed::GetHitsInternal(
400 uint32_t limit, bool pop, std::vector<Hit> *out) const {
401 // Put current uncompressed val here.
402 Hit::Value val = Hit::kInvalidValue;
403 uint32_t offset = get_start_byte_offset();
404 uint32_t count = 0;
405
406 // First traverse the first two special positions.
407 while (count < limit && offset < posting_list_utils::kSpecialHitsSize) {
408 // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
409 // kNumSpecialHits because of the check above.
410 Hit hit = get_special_hit(offset / sizeof(Hit)).ValueOrDie();
411 val = hit.value();
412 if (out != nullptr) {
413 out->push_back(hit);
414 }
415 offset += sizeof(Hit);
416 count++;
417 }
418
419 // If special position 1 was set then we need to skip padding.
420 if (val != Hit::kInvalidValue &&
421 offset == posting_list_utils::kSpecialHitsSize) {
422 offset = GetPadEnd(offset);
423 }
424
425 while (count < limit && offset < size_in_bytes_) {
426 if (val == Hit::kInvalidValue) {
427 // First hit is in compressed area. Put that in val.
428 memcpy(&val, posting_list_buffer_ + offset, sizeof(Hit::Value));
429 offset += sizeof(Hit::Value);
430 } else {
431 // Now we have delta encoded subsequent hits. Decode and push.
432 uint64_t delta;
433 offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
434 val += delta;
435 }
436 Hit hit(val);
437 libtextclassifier3::Status status =
438 ConsumeTermFrequencyIfPresent(&hit, &offset);
439 if (!status.ok()) {
440 // This posting list has been corrupted somehow. The first hit of the
441 // posting list claims to have a term frequency, but there's no more room
442 // in the posting list for that term frequency to exist. Return an empty
443 // vector and zero to indicate no hits retrieved.
444 if (out != nullptr) {
445 out->clear();
446 }
447 return absl_ports::InternalError("Posting list has been corrupted!");
448 }
449 if (out != nullptr) {
450 out->push_back(hit);
451 }
452 count++;
453 }
454
455 if (pop) {
456 PostingListUsed *mutable_this = const_cast<PostingListUsed *>(this);
457 // Modify the posting list so that we pop all hits actually
458 // traversed.
459 if (offset >= posting_list_utils::kSpecialHitsSize &&
460 offset < size_in_bytes_) {
461 // In the compressed area. Pop and reconstruct. offset/val is
462 // the last traversed hit, which we must discard. So move one
463 // more forward.
464 uint64_t delta;
465 offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
466 val += delta;
467
468 // Now val is the first hit of the new posting list.
469 if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
470 // val fits in compressed area. Simply copy.
471 offset -= sizeof(Hit::Value);
472 memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
473 } else {
474 // val won't fit in compressed area. Also see if there is a
475 // term_frequency.
476 Hit hit(val);
477 libtextclassifier3::Status status =
478 ConsumeTermFrequencyIfPresent(&hit, &offset);
479 if (!status.ok()) {
480 // This posting list has been corrupted somehow. The first hit of
481 // the posting list claims to have a term frequency, but there's no
482 // more room in the posting list for that term frequency to exist.
483 // Return an empty vector and zero to indicate no hits retrieved. Do
484 // not pop anything.
485 if (out != nullptr) {
486 out->clear();
487 }
488 return absl_ports::InternalError("Posting list has been corrupted!");
489 }
490 // Okay to ignore the return value here because 1 < kNumSpecialHits.
491 mutable_this->set_special_hit(1, hit);
492
493 // Prepend pad. Safe to ignore the return value of PadToEnd because
494 // offset must be less than size_in_bytes_ thanks to the if above.
495 mutable_this->PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
496 offset = sizeof(Hit);
497 }
498 }
499 // offset is guaranteed to be valid so ignoring the return value of
500 // set_start_byte_offset is safe. It falls into one of four scenarios:
501 // Scenario 1: the above if was false because offset is not < size_in_bytes_
502 // In this case, offset must be == size_in_bytes_ because we reached
503 // offset by unwinding hits on the posting list.
504 // Scenario 2: offset is < kSpecialHitSize
505 // In this case, offset is guaranteed to be either 0 or sizeof(Hit)
506 // because offset is incremented by sizeof(Hit) within the first while
507 // loop.
508 // Scenario 3: offset is within the compressed region and the new first hit
509 // in the posting list (the value that 'val' holds) will fit as an
510 // uncompressed hit in the compressed region. The resulting offset from
511 // decompressing val must be >= kSpecialHitSize because otherwise we'd be
512 // in Scenario 4
513 // Scenario 4: offset is within the compressed region, but the new first hit
514 // in the posting list is too large to fit as an uncompressed hit in the
515 // in the compressed region. Therefore, it must be stored in a special hit
516 // and offset will be sizeof(Hit).
517 mutable_this->set_start_byte_offset(offset);
518 }
519
520 return libtextclassifier3::Status::OK;
521 }
522
get_special_hit(uint32_t index) const523 libtextclassifier3::StatusOr<Hit> PostingListUsed::get_special_hit(
524 uint32_t index) const {
525 static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
526 if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
527 return absl_ports::InvalidArgumentError(
528 "Special hits only exist at indices 0 and 1");
529 }
530 Hit val;
531 memcpy(&val, posting_list_buffer_ + index * sizeof(val), sizeof(val));
532 return val;
533 }
534
set_special_hit(uint32_t index,const Hit & val)535 bool PostingListUsed::set_special_hit(uint32_t index, const Hit &val) {
536 if (index >= posting_list_utils::kNumSpecialHits || index < 0) {
537 ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
538 return false;
539 }
540 memcpy(posting_list_buffer_ + index * sizeof(val), &val, sizeof(val));
541 return true;
542 }
543
BytesUsed() const544 uint32_t PostingListUsed::BytesUsed() const {
545 // The special hits will be included if they represent actual hits. If they
546 // represent the hit offset or the invalid hit sentinel, they are not
547 // included.
548 return size_in_bytes_ - get_start_byte_offset();
549 }
550
MinPostingListSizeToFit() const551 uint32_t PostingListUsed::MinPostingListSizeToFit() const {
552 if (full() || almost_full()) {
553 // If in either the FULL state or ALMOST_FULL state, this posting list *is*
554 // the minimum size posting list that can fit these hits. So just return the
555 // size of the posting list.
556 return size_in_bytes_;
557 }
558
559 // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
560 // posting list that would be guaranteed to fit these hits would be
561 // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
562 // special_hit(1) and the n compressed hits in the compressed region.
563 // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
564 // fitting these hits into a posting list would require BytesUsed plus one
565 // extra hit.
566 return BytesUsed() + sizeof(Hit);
567 }
568
IsPostingListValid() const569 bool PostingListUsed::IsPostingListValid() const {
570 if (almost_full()) {
571 // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
572 // know that 1 < kNumSpecialHits.
573 if (!get_special_hit(1).ValueOrDie().is_valid()) {
574 ICING_LOG(ERROR)
575 << "Both special hits cannot be invalid at the same time.";
576 return false;
577 }
578 } else if (!full()) {
579 // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
580 // safe because we know that 0 < kNumSpecialHits.
581 if (get_special_hit(0).ValueOrDie().value() > size_in_bytes_ ||
582 get_special_hit(0).ValueOrDie().value() <
583 posting_list_utils::kSpecialHitsSize) {
584 ICING_LOG(ERROR) << "Hit: " << get_special_hit(0).ValueOrDie().value()
585 << " size: " << size_in_bytes_
586 << " sp size: " << posting_list_utils::kSpecialHitsSize;
587 return false;
588 }
589 }
590 return true;
591 }
592
get_start_byte_offset() const593 uint32_t PostingListUsed::get_start_byte_offset() const {
594 if (full()) {
595 return 0;
596 } else if (almost_full()) {
597 return sizeof(Hit);
598 } else {
599 // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
600 // kNumSpecialHits.
601 return get_special_hit(0).ValueOrDie().value();
602 }
603 }
604
set_start_byte_offset(uint32_t offset)605 bool PostingListUsed::set_start_byte_offset(uint32_t offset) {
606 if (offset > size_in_bytes_) {
607 ICING_LOG(ERROR) << "offset cannot be a value greater than size "
608 << size_in_bytes_ << ". offset is " << offset << ".";
609 return false;
610 }
611 if (offset < posting_list_utils::kSpecialHitsSize && offset > sizeof(Hit)) {
612 ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
613 << ", " << posting_list_utils::kSpecialHitsSize
614 << "). offset is " << offset << ".";
615 return false;
616 }
617 if (offset < sizeof(Hit) && offset != 0) {
618 ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
619 << "). offset is " << offset << ".";
620 return false;
621 }
622 if (offset >= posting_list_utils::kSpecialHitsSize) {
623 // not_full state. Safe to ignore the return value because 0 and 1 are both
624 // < kNumSpecialHits.
625 set_special_hit(0, Hit(offset));
626 set_special_hit(1, Hit());
627 } else if (offset == sizeof(Hit)) {
628 // almost_full state. Safe to ignore the return value because 1 is both <
629 // kNumSpecialHits.
630 set_special_hit(0, Hit());
631 }
632 // Nothing to do for the FULL state - the offset isn't actually stored
633 // anywhere and both special hits hold valid hits.
634 return true;
635 }
636
PrependHitUncompressed(const Hit & hit,uint32_t offset)637 libtextclassifier3::StatusOr<uint32_t> PostingListUsed::PrependHitUncompressed(
638 const Hit &hit, uint32_t offset) {
639 if (hit.has_term_frequency()) {
640 if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit)) {
641 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
642 "Not enough room to prepend Hit at offset %d.", offset));
643 }
644 offset -= sizeof(Hit);
645 memcpy(posting_list_buffer_ + offset, &hit, sizeof(Hit));
646 } else {
647 if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value)) {
648 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
649 "Not enough room to prepend Hit::Value at offset %d.", offset));
650 }
651 offset -= sizeof(Hit::Value);
652 Hit::Value val = hit.value();
653 memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
654 }
655 return offset;
656 }
657
ConsumeTermFrequencyIfPresent(Hit * hit,uint32_t * offset) const658 libtextclassifier3::Status PostingListUsed::ConsumeTermFrequencyIfPresent(
659 Hit *hit, uint32_t *offset) const {
660 if (!hit->has_term_frequency()) {
661 // No term frequency to consume. Everything is fine.
662 return libtextclassifier3::Status::OK;
663 }
664 if (*offset + sizeof(Hit::TermFrequency) > size_in_bytes_) {
665 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
666 "offset %d must not point past the end of the posting list of size %d.",
667 *offset, size_in_bytes_));
668 }
669 Hit::TermFrequency term_frequency;
670 memcpy(&term_frequency, posting_list_buffer_ + *offset,
671 sizeof(Hit::TermFrequency));
672 *hit = Hit(hit->value(), term_frequency);
673 *offset += sizeof(Hit::TermFrequency);
674 return libtextclassifier3::Status::OK;
675 }
676
677 } // namespace lib
678 } // namespace icing
679