1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "icing/index/main/main-index.h"
15
16 #include <cstdint>
17 #include <cstring>
18 #include <memory>
19
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/absl_ports/str_cat.h"
22 #include "icing/index/main/index-block.h"
23 #include "icing/index/term-id-codec.h"
24 #include "icing/index/term-property-id.h"
25 #include "icing/legacy/index/icing-dynamic-trie.h"
26 #include "icing/util/status-macros.h"
27
28 namespace icing {
29 namespace lib {
30
31 namespace {
32
33 // Finds the shortest,valid prefix term with prefix hits in lexicon for which
34 // "prefix" is a prefix.
35 // Returns a valid FindTermResult with found=true if either:
36 // 1. prefix exists as a term in lexicon.
37 // 2. the shortest, valid prefix in the lexicon exists and contains prefix
38 // hits.
39 // Returns a FindTermResult with found=false and undefined values of tvi and
40 // exact if no term was found.
41 struct FindTermResult {
42 // TVI of the term that was found. Undefined if found=false.
43 uint32_t tvi;
44 // Whether or not a valid term with prefix hits was found.
45 bool found;
46 // Whether or not that term is equal to 'prefix'
47 bool exact;
48 };
FindShortestValidTermWithPrefixHits(const IcingDynamicTrie * lexicon,const std::string & prefix)49 FindTermResult FindShortestValidTermWithPrefixHits(
50 const IcingDynamicTrie* lexicon, const std::string& prefix) {
51 // For prefix indexing: when we are doing a prefix match for "prefix", find
52 // the tvi to the equivalent posting list. prefix's own posting list might not
53 // exist but one of its children acts as a proxy.
54 IcingDynamicTrie::PropertyReader hits_in_prefix_section(
55 *lexicon, GetHasHitsInPrefixSectionPropertyId());
56 uint32_t tvi = 0;
57 bool found = false;
58 bool exact = false;
59 for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid();
60 it.Advance()) {
61 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
62 memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));
63
64 // Posting list id might be invalid if this is also a backfill term.
65 // Suppose that the main index has two pre-existing prefix hits "foot" and
66 // "fool" - it will have a branch point posting list for "foo". Then, let's
67 // suppose that the other index adds hits for "foul", "four" and "far". This
68 // will result in branch points for "fo" and "f".
69 // If "fo" was added before "f", then the iterator would first give us "fo".
70 // "fo" will have an invalid posting_list_id because it hasn't been
71 // backfilled yet, so we need to continue iterating to "foo".
72 if (posting_list_id.is_valid()) {
73 exact = (prefix.size() == strlen(it.GetKey()));
74 tvi = it.GetValueIndex();
75 // Found it. Does it have prefix hits?
76 found = exact || hits_in_prefix_section.HasProperty(tvi);
77 break;
78 }
79 }
80 FindTermResult result = {tvi, found, exact};
81 return result;
82 }
83
84 } // namespace
85
Create(const std::string & index_directory,const Filesystem * filesystem,const IcingFilesystem * icing_filesystem)86 libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create(
87 const std::string& index_directory, const Filesystem* filesystem,
88 const IcingFilesystem* icing_filesystem) {
89 ICING_RETURN_ERROR_IF_NULL(filesystem);
90 ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
91 auto main_index = std::make_unique<MainIndex>();
92 ICING_RETURN_IF_ERROR(
93 main_index->Init(index_directory, filesystem, icing_filesystem));
94 return main_index;
95 }
96
97 // TODO(b/139087650) : Migrate off of IcingFilesystem.
Init(const std::string & index_directory,const Filesystem * filesystem,const IcingFilesystem * icing_filesystem)98 libtextclassifier3::Status MainIndex::Init(
99 const std::string& index_directory, const Filesystem* filesystem,
100 const IcingFilesystem* icing_filesystem) {
101 if (!filesystem->CreateDirectoryRecursively(index_directory.c_str())) {
102 return absl_ports::InternalError("Unable to create main index directory.");
103 }
104 std::string flash_index_file = index_directory + "/main_index";
105 ICING_ASSIGN_OR_RETURN(
106 FlashIndexStorage flash_index,
107 FlashIndexStorage::Create(flash_index_file, filesystem));
108 flash_index_storage_ =
109 std::make_unique<FlashIndexStorage>(std::move(flash_index));
110
111 std::string lexicon_file = index_directory + "/main-lexicon";
112 IcingDynamicTrie::RuntimeOptions runtime_options;
113 main_lexicon_ = std::make_unique<IcingDynamicTrie>(
114 lexicon_file, runtime_options, icing_filesystem);
115 IcingDynamicTrie::Options lexicon_options;
116 if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
117 !main_lexicon_->Init()) {
118 return absl_ports::InternalError("Failed to initialize lexicon trie");
119 }
120 return libtextclassifier3::Status::OK;
121 }
122
GetElementsSize() const123 libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
124 IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
125 if (storage_info.main_index_storage_size() == -1 ||
126 storage_info.main_index_lexicon_size() == -1) {
127 return absl_ports::AbortedError(
128 "Failed to get size of MainIndex's members.");
129 }
130 return storage_info.main_index_storage_size() +
131 storage_info.main_index_lexicon_size();
132 }
133
GetStorageInfo(IndexStorageInfoProto storage_info) const134 IndexStorageInfoProto MainIndex::GetStorageInfo(
135 IndexStorageInfoProto storage_info) const {
136 int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
137 if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
138 storage_info.set_main_index_lexicon_size(lexicon_elt_size);
139 } else {
140 storage_info.set_main_index_lexicon_size(-1);
141 }
142 int64_t index_elt_size = flash_index_storage_->GetElementsSize();
143 if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
144 storage_info.set_main_index_storage_size(index_elt_size);
145 } else {
146 storage_info.set_main_index_storage_size(-1);
147 }
148 storage_info.set_main_index_block_size(flash_index_storage_->block_size());
149 storage_info.set_num_blocks(flash_index_storage_->num_blocks());
150 storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
151 return storage_info;
152 }
153
154 libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
GetAccessorForExactTerm(const std::string & term)155 MainIndex::GetAccessorForExactTerm(const std::string& term) {
156 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
157 if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) {
158 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
159 "Term %s is not present in main lexicon.", term.c_str()));
160 }
161 ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor,
162 PostingListAccessor::CreateFromExisting(
163 flash_index_storage_.get(), posting_list_id));
164 return std::make_unique<PostingListAccessor>(std::move(accessor));
165 }
166
167 libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
GetAccessorForPrefixTerm(const std::string & prefix)168 MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
169 bool exact = false;
170 // For prefix indexing: when we are doing a prefix match for
171 // "prefix", find the tvi to the equivalent posting list. prefix's
172 // own posting list might not exist but its shortest child acts as a proxy.
173 //
174 // For example, if there are only two hits in the index are prefix hits for
175 // "bar" and "bat", then both will appear on a posting list for "ba". "b"
176 // won't have a posting list, but "ba" will suffice.
177 IcingDynamicTrie::PropertyReader hits_in_prefix_section(
178 *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
179 IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str());
180 if (!main_itr.IsValid()) {
181 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
182 "Term: %s is not present in the main lexicon.", prefix.c_str()));
183 }
184 exact = (prefix.length() == strlen(main_itr.GetKey()));
185
186 if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
187 // Found it, but it doesn't have prefix hits. Exit early. No need to
188 // retrieve the posting list because there's nothing there for us.
189 return libtextclassifier3::Status::OK;
190 }
191 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
192 memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
193 ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
194 PostingListAccessor::CreateFromExisting(
195 flash_index_storage_.get(), posting_list_id));
196 GetPrefixAccessorResult result = {
197 std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact};
198 return result;
199 }
200
201 // TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
IsTermInNamespaces(const IcingDynamicTrie::PropertyReadersAll & property_reader,uint32_t value_index,const std::vector<NamespaceId> & namespace_ids)202 bool IsTermInNamespaces(
203 const IcingDynamicTrie::PropertyReadersAll& property_reader,
204 uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
205 if (namespace_ids.empty()) {
206 return true;
207 }
208 for (NamespaceId namespace_id : namespace_ids) {
209 if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
210 value_index)) {
211 return true;
212 }
213 }
214
215 return false;
216 }
217
218 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
FindTermsByPrefix(const std::string & prefix,const std::vector<NamespaceId> & namespace_ids,int num_to_return)219 MainIndex::FindTermsByPrefix(const std::string& prefix,
220 const std::vector<NamespaceId>& namespace_ids,
221 int num_to_return) {
222 // Finds all the terms that start with the given prefix in the lexicon.
223 IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str());
224
225 // A property reader to help check if a term has some property.
226 IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_);
227
228 std::vector<TermMetadata> term_metadata_list;
229 while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) {
230 uint32_t term_value_index = term_iterator.GetValueIndex();
231
232 // Skips the terms that don't exist in the given namespaces. We won't skip
233 // any terms if namespace_ids is empty.
234 if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
235 term_iterator.Advance();
236 continue;
237 }
238 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
239 memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
240 // Getting the actual hit count would require reading the entire posting
241 // list chain. We take an approximation to avoid all of those IO ops.
242 // Because we are not reading the posting lists, it is impossible to
243 // differentiate between single max-size posting lists and chains of
244 // max-size posting lists. We assume that the impact on scoring is not
245 // significant.
246 int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock(
247 flash_index_storage_->block_size(),
248 posting_list_id.posting_list_index_bits());
249 term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count);
250
251 term_iterator.Advance();
252 }
253 if (term_iterator.IsValid()) {
254 // We exited the loop above because we hit the num_to_return limit.
255 ICING_LOG(WARNING) << "Ran into limit of " << num_to_return
256 << " retrieving suggestions for " << prefix
257 << ". Some suggestions may not be returned and others "
258 "may be misranked.";
259 }
260 return term_metadata_list;
261 }
262
263 libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
AddBackfillBranchPoints(const IcingDynamicTrie & other_lexicon)264 MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
265 // Maps new branching points in main lexicon to the term such that
266 // branching_point_term is a prefix of term and there are no terms smaller
267 // than term and greater than branching_point_term.
268 std::string prefix;
269 LexiconMergeOutputs outputs;
270 for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
271 other_term_itr.IsValid(); other_term_itr.Advance()) {
272 // If term were inserted in the main lexicon, what new branching would it
273 // create? (It always creates at most one.)
274 int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
275 other_term_itr.GetKey(), /*utf8=*/true);
276 if (prefix_len <= 0) {
277 continue;
278 }
279 prefix.assign(other_term_itr.GetKey(), prefix_len);
280
281 // Figure out backfill tvi. Might not exist since all children terms could
282 // only contain hits from non-prefix sections.
283 //
284 // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
285 // we're adding "foul". The new branching prefix will be "fo". The backfill
286 // prefix will be "foo" - all hits in prefix section on "foo" will need to
287 // be added to the new "fo" posting list later.
288 FindTermResult result =
289 FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
290 if (!result.found || result.exact) {
291 continue;
292 }
293
294 // This is a new prefix that will need backfilling from its next-in-line
295 // posting list. This new prefix will have to have a posting list eventually
296 // so insert a default PostingListIdentifier as a placeholder.
297 uint32_t branching_prefix_tvi;
298 bool new_key;
299 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
300 if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_id,
301 &branching_prefix_tvi, /*replace=*/false,
302 &new_key)) {
303 return absl_ports::InternalError("Could not insert branching prefix");
304 }
305
306 // Backfills only contain prefix hits by default. So set these here but
307 // could be overridden when adding hits from the other index later.
308 if (!main_lexicon_->SetProperty(branching_prefix_tvi,
309 GetHasNoExactHitsPropertyId()) ||
310 !main_lexicon_->SetProperty(branching_prefix_tvi,
311 GetHasHitsInPrefixSectionPropertyId())) {
312 return absl_ports::InternalError("Setting prefix prop failed");
313 }
314
315 outputs.backfill_map[branching_prefix_tvi] = result.tvi;
316 }
317 return outputs;
318 }
319
320 libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
AddTerms(const IcingDynamicTrie & other_lexicon,LexiconMergeOutputs && outputs)321 MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
322 LexiconMergeOutputs&& outputs) {
323 IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
324 for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
325 other_term_itr.IsValid(); other_term_itr.Advance()) {
326 uint32_t new_main_tvi;
327 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
328 if (!main_lexicon_->Insert(other_term_itr.GetKey(), &posting_list_id,
329 &new_main_tvi,
330 /*replace=*/false)) {
331 return absl_ports::InternalError(absl_ports::StrCat(
332 "Could not insert term: ", other_term_itr.GetKey()));
333 }
334
335 // Copy the properties from the other lexicon over to the main lexicon.
336 uint32_t other_tvi = other_term_itr.GetValueIndex();
337 if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
338 new_main_tvi)) {
339 return absl_ports::InternalError(absl_ports::StrCat(
340 "Could not insert term: ", other_term_itr.GetKey()));
341 }
342
343 // Add other to main mapping.
344 outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);
345
346 memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(new_main_tvi),
347 sizeof(posting_list_id));
348 if (posting_list_id.block_index() != kInvalidBlockIndex) {
349 outputs.main_tvi_to_block_index[new_main_tvi] =
350 posting_list_id.block_index();
351 }
352 }
353 return std::move(outputs);
354 }
355
356 libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
AddBranchPoints(const IcingDynamicTrie & other_lexicon,LexiconMergeOutputs && outputs)357 MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
358 LexiconMergeOutputs&& outputs) {
359 IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
360 other_lexicon, GetHasHitsInPrefixSectionPropertyId());
361 if (!has_prefix_prop_reader.Exists()) {
362 return std::move(outputs);
363 }
364 std::string prefix;
365 for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
366 other_term_itr.IsValid(); other_term_itr.Advance()) {
367 // Only expand terms that have hits in prefix sections.
368 if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
369 continue;
370 }
371
372 // Get prefixes where there is already a branching point in the main
373 // lexicon. We skip prefixes which don't already have a branching point.
374 std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
375 other_term_itr.GetKey(), /*utf8=*/true);
376
377 int buf_start = outputs.prefix_tvis_buf.size();
378 // Add prefixes.
379 for (int prefix_length : prefix_lengths) {
380 if (prefix_length <= 0) {
381 continue;
382 }
383
384 prefix.assign(other_term_itr.GetKey(), prefix_length);
385 uint32_t prefix_tvi;
386 bool new_key;
387 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
388 if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_id, &prefix_tvi,
389 /*replace=*/false, &new_key)) {
390 return absl_ports::InternalError(
391 absl_ports::StrCat("Could not insert prefix: ", prefix));
392 }
393
394 // Prefix tvi will have hits in prefix section.
395 if (!main_lexicon_->SetProperty(prefix_tvi,
396 GetHasHitsInPrefixSectionPropertyId())) {
397 return absl_ports::InternalError(
398 "Setting has hits in prefix section prop failed");
399 }
400
401 // If it hasn't been added by non-prefix term insertions in
402 // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
403 if (new_key && !main_lexicon_->SetProperty(
404 prefix_tvi, GetHasNoExactHitsPropertyId())) {
405 return absl_ports::InternalError("Setting no exact hits prop failed");
406 }
407
408 outputs.prefix_tvis_buf.push_back(prefix_tvi);
409
410 memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(prefix_tvi),
411 sizeof(posting_list_id));
412 if (posting_list_id.block_index() != kInvalidBlockIndex) {
413 outputs.main_tvi_to_block_index[prefix_tvi] =
414 posting_list_id.block_index();
415 }
416 }
417
418 // Any prefixes added? Then add to map.
419 if (buf_start < outputs.prefix_tvis_buf.size()) {
420 outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
421 buf_start, outputs.prefix_tvis_buf.size() - buf_start};
422 }
423 }
424 return std::move(outputs);
425 }
426
CopyProperties(const IcingDynamicTrie::PropertyReadersAll & prop_reader,const IcingDynamicTrie & other_lexicon,uint32_t other_tvi,uint32_t new_main_tvi)427 bool MainIndex::CopyProperties(
428 const IcingDynamicTrie::PropertyReadersAll& prop_reader,
429 const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
430 uint32_t new_main_tvi) {
431 for (uint32_t property_id = 0; property_id < prop_reader.size();
432 ++property_id) {
433 if (property_id == GetHasNoExactHitsPropertyId()) {
434 // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
435 // for this term, then HasNoExactHits needs to be set to false in
436 // main_lexicon. If other_lexicon has no exact hits for this term, then
437 // HasNoExactHits in the main_lexicon should not be modified.
438 if (!prop_reader.HasProperty(property_id, other_tvi) &&
439 !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
440 ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed";
441 return false;
442 }
443 } else {
444 // If other_lexicon has this property set for this term, then that
445 // property needs to be set for the main_lexicon. If other_lexicon
446 // doesn't have this property set, then the property in the main lexicon
447 // should not be modified.
448 if (prop_reader.HasProperty(property_id, other_tvi) &&
449 !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
450 return false;
451 }
452 }
453 }
454 return true;
455 }
456
AddHits(const TermIdCodec & term_id_codec,std::unordered_map<uint32_t,uint32_t> && backfill_map,std::vector<TermIdHitPair> && hits,DocumentId last_added_document_id)457 libtextclassifier3::Status MainIndex::AddHits(
458 const TermIdCodec& term_id_codec,
459 std::unordered_map<uint32_t, uint32_t>&& backfill_map,
460 std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id) {
461 if (hits.empty()) {
462 flash_index_storage_->set_last_indexed_docid(last_added_document_id);
463 return libtextclassifier3::Status::OK;
464 }
465 uint32_t cur_term_id = hits[0].term_id();
466 ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
467 term_id_codec.DecodeTermInfo(cur_term_id));
468 // Iterate through all hits. If these hits are for a term that also needs
469 // backfill, then backfill first and then add the new hits.
470 size_t k_start = 0;
471 size_t k_end = 0;
472 while (k_start < hits.size()) {
473 uint32_t term_id = hits[k_end].term_id();
474 while (term_id == cur_term_id && ++k_end < hits.size()) {
475 term_id = hits[k_end].term_id();
476 }
477
478 // Look for backfill.
479 PostingListIdentifier backfill_posting_list_id =
480 PostingListIdentifier::kInvalid;
481 auto itr = backfill_map.find(cur_decoded_term.tvi);
482 if (itr != backfill_map.end()) {
483 const void* value = main_lexicon_->GetValueAtIndex(itr->second);
484 memcpy(&backfill_posting_list_id, value,
485 sizeof(backfill_posting_list_id));
486 backfill_map.erase(itr);
487 }
488 ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi,
489 backfill_posting_list_id,
490 &hits[k_start], k_end - k_start));
491 cur_term_id = term_id;
492 ICING_ASSIGN_OR_RETURN(cur_decoded_term,
493 term_id_codec.DecodeTermInfo(cur_term_id));
494 k_start = k_end;
495 }
496
497 // Now copy remaining backfills.
498 ICING_VLOG(1) << IcingStringUtil::StringPrintf("Remaining backfills %zu",
499 backfill_map.size());
500 for (auto other_tvi_main_tvi_pair : backfill_map) {
501 PostingListIdentifier backfill_posting_list_id =
502 PostingListIdentifier::kInvalid;
503 memcpy(&backfill_posting_list_id,
504 main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second),
505 sizeof(backfill_posting_list_id));
506 ICING_ASSIGN_OR_RETURN(
507 PostingListAccessor hit_accum,
508 PostingListAccessor::Create(flash_index_storage_.get()));
509 ICING_RETURN_IF_ERROR(
510 AddPrefixBackfillHits(backfill_posting_list_id, &hit_accum));
511 PostingListAccessor::FinalizeResult result =
512 PostingListAccessor::Finalize(std::move(hit_accum));
513 if (result.id.is_valid()) {
514 main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id);
515 }
516 }
517 flash_index_storage_->set_last_indexed_docid(last_added_document_id);
518 return libtextclassifier3::Status::OK;
519 }
520
AddHitsForTerm(uint32_t tvi,PostingListIdentifier backfill_posting_list_id,const TermIdHitPair * hit_elements,size_t len)521 libtextclassifier3::Status MainIndex::AddHitsForTerm(
522 uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
523 const TermIdHitPair* hit_elements, size_t len) {
524 // 1. Create a PostingListAccessor - either from the pre-existing block, if
525 // one exists, or from scratch.
526 PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
527 memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi),
528 sizeof(posting_list_id));
529 std::unique_ptr<PostingListAccessor> pl_accessor;
530 if (posting_list_id.is_valid()) {
531 if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) {
532 ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
533 "Index dropped hits. Invalid block index %u >= %u",
534 posting_list_id.block_index(), flash_index_storage_->num_blocks());
535 // TODO(b/159918304) : Consider revising the checksumming strategy in the
536 // main index. Providing some mechanism to check for corruption - either
537 // during initialization or some later time would allow us to avoid
538 // whack-a-mole with odd corruption issues like this one (b/62820689).
539 return absl_ports::InternalError(
540 "Valid posting list has an invalid block index!");
541 }
542 ICING_ASSIGN_OR_RETURN(PostingListAccessor tmp,
543 PostingListAccessor::CreateFromExisting(
544 flash_index_storage_.get(), posting_list_id));
545 pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp));
546 } else {
547 // New posting list.
548 ICING_ASSIGN_OR_RETURN(
549 PostingListAccessor tmp,
550 PostingListAccessor::Create(flash_index_storage_.get()));
551 pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp));
552 }
553
554 // 2. Backfill any hits if necessary.
555 if (backfill_posting_list_id.is_valid()) {
556 ICING_RETURN_IF_ERROR(
557 AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get()));
558 }
559
560 // 3. Add all the new hits.
561 for (int i = len - 1; i >= 0; --i) {
562 Hit hit = hit_elements[i].hit();
563 ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit));
564 }
565
566 // 4. Finalize this posting list and put its identifier in the lexicon.
567 PostingListAccessor::FinalizeResult result =
568 PostingListAccessor::Finalize(std::move(*pl_accessor));
569 if (result.id.is_valid()) {
570 main_lexicon_->SetValueAtIndex(tvi, &result.id);
571 }
572 return libtextclassifier3::Status::OK;
573 }
574
AddPrefixBackfillHits(PostingListIdentifier backfill_posting_list_id,PostingListAccessor * hit_accum)575 libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
576 PostingListIdentifier backfill_posting_list_id,
577 PostingListAccessor* hit_accum) {
578 ICING_ASSIGN_OR_RETURN(
579 PostingListAccessor backfill_accessor,
580 PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
581 backfill_posting_list_id));
582 std::vector<Hit> backfill_hits;
583 ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
584 backfill_accessor.GetNextHitsBatch());
585 while (!tmp.empty()) {
586 std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits));
587 ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor.GetNextHitsBatch());
588 }
589
590 Hit last_added_hit;
591 // The hits in backfill_hits are in the reverse order of how they were added.
592 // Iterate in reverse to add them to this new posting list in the correct
593 // order.
594 for (auto itr = backfill_hits.rbegin(); itr != backfill_hits.rend(); ++itr) {
595 const Hit& hit = *itr;
596 // Skip hits from non-prefix-enabled sections.
597 if (!hit.is_in_prefix_section()) {
598 continue;
599 }
600
601 // A backfill hit is a prefix hit in a prefix section.
602 const Hit backfill_hit(hit.section_id(), hit.document_id(),
603 hit.term_frequency(),
604 /*is_in_prefix_section=*/true,
605 /*is_prefix_hit=*/true);
606 if (backfill_hit == last_added_hit) {
607 // Skip duplicate values due to overriding of the is_prefix flag.
608 continue;
609 }
610 last_added_hit = backfill_hit;
611 ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit));
612 }
613 return libtextclassifier3::Status::OK;
614 }
615
GetDebugInfo(int verbosity,std::string * out) const616 void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
617 // Lexicon.
618 out->append("Main Lexicon stats:\n");
619 main_lexicon_->GetDebugInfo(verbosity, out);
620
621 if (verbosity <= 0) {
622 return;
623 }
624
625 flash_index_storage_->GetDebugInfo(verbosity, out);
626 }
627
628 } // namespace lib
629 } // namespace icing
630