1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_ 16 #define ICING_INDEX_MAIN_MAIN_INDEX_H_ 17 18 #include <memory> 19 20 #include "icing/text_classifier/lib3/utils/base/status.h" 21 #include "icing/text_classifier/lib3/utils/base/statusor.h" 22 #include "icing/file/filesystem.h" 23 #include "icing/index/lite/term-id-hit-pair.h" 24 #include "icing/index/main/flash-index-storage.h" 25 #include "icing/index/main/posting-list-accessor.h" 26 #include "icing/index/term-id-codec.h" 27 #include "icing/index/term-metadata.h" 28 #include "icing/legacy/index/icing-dynamic-trie.h" 29 #include "icing/legacy/index/icing-filesystem.h" 30 #include "icing/proto/storage.pb.h" 31 #include "icing/store/namespace-id.h" 32 #include "icing/util/status-macros.h" 33 34 namespace icing { 35 namespace lib { 36 37 class MainIndex { 38 public: 39 // RETURNS: 40 // - valid instance of MainIndex, on success. 41 // - INTERNAL error if unable to create the lexicon or flash storage. 42 static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create( 43 const std::string& index_directory, const Filesystem* filesystem, 44 const IcingFilesystem* icing_filesystem); 45 46 // Get a PostingListAccessor that holds the posting list chain for 'term'. 47 // 48 // RETURNS: 49 // - On success, a valid PostingListAccessor 50 // - NOT_FOUND if term is not present in the main index. 51 libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>> 52 GetAccessorForExactTerm(const std::string& term); 53 54 // Get a PostingListAccessor for 'prefix'. 55 // 56 // RETURNS: 57 // - On success, a result containing a valid PostingListAccessor. 58 // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a 59 // prefix are present in the main index. 60 struct GetPrefixAccessorResult { 61 // A PostingListAccessor that holds the posting list chain for the term 62 // that best represents 'prefix' in the main index. 63 std::unique_ptr<PostingListAccessor> accessor; 64 // True if the returned posting list chain is for 'prefix' or false if the 65 // returned posting list chain is for a term for which 'prefix' is a prefix. 66 bool exact; 67 }; 68 libtextclassifier3::StatusOr<GetPrefixAccessorResult> 69 GetAccessorForPrefixTerm(const std::string& prefix); 70 71 // Finds terms with the given prefix in the given namespaces. If 72 // 'namespace_ids' is empty, returns results from all the namespaces. The 73 // input prefix must be normalized, otherwise inaccurate results may be 74 // returned. Results are not sorted specifically and are in lexigraphical 75 // order. Number of results are no more than 'num_to_return'. 76 // 77 // The hit count returned with each TermMetadata is an approximation based of 78 // posting list size. 79 // 80 // Returns: 81 // A list of TermMetadata on success 82 // INTERNAL_ERROR if failed to access term data. 83 libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( 84 const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, 85 int num_to_return); 86 87 struct LexiconMergeOutputs { 88 // Maps from main_lexicon tvi for new branching point to the main_lexicon 89 // tvi for posting list whose hits must be backfilled. 90 std::unordered_map<uint32_t, uint32_t> backfill_map; 91 92 // Maps from lexicon tvis to main_lexicon tvis. 93 std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi; 94 95 // Maps from main lexicon tvi to the block index. Tvis with no entry do not 96 // have an allocated posting list. 97 std::unordered_map<uint32_t, int> main_tvi_to_block_index; 98 99 // Maps from the lexicon tvi to the beginning position in 100 // prefix_tvis_buf and the length. 101 std::unordered_map<uint32_t, std::pair<int, int>> 102 other_tvi_to_prefix_main_tvis; 103 104 // Stores tvis that are mapped to by other_tvi_to_prefix_tvis. 105 std::vector<uint32_t> prefix_tvis_buf; 106 }; 107 108 // Merge the lexicon into the main lexicon and populate the data 109 // structures necessary to translate lite tvis to main tvis, track backfilling 110 // and expanding lite terms to prefix terms. 111 // 112 // RETURNS: 113 // - OK on success 114 // - INTERNAL on IO error while writing to the main lexicon. MergeLexicon(const IcingDynamicTrie & other_lexicon)115 libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon( 116 const IcingDynamicTrie& other_lexicon) { 117 // Backfill branch points need to be added first so that the backfill_map 118 // can be correctly populated. 119 ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs, 120 AddBackfillBranchPoints(other_lexicon)); 121 ICING_ASSIGN_OR_RETURN(outputs, 122 AddTerms(other_lexicon, std::move(outputs))); 123 // Non-backfill branch points need to be added last so that the mapping of 124 // newly added terms to prefix terms can be correctly populated (prefix 125 // terms might be branch points between two new terms or between a 126 // pre-existing term and a new term). 127 ICING_ASSIGN_OR_RETURN(outputs, 128 AddBranchPoints(other_lexicon, std::move(outputs))); 129 return outputs; 130 } 131 132 // Add hits to the main index and backfill from existing posting lists to new 133 // backfill branch points. 134 // 135 // The backfill_map maps from main_lexicon tvi for a newly added branching 136 // point to the main_lexicon tvi for the posting list whose hits must be 137 // backfilled. backfill_map should be populated as part of LexiconMergeOutputs 138 // in MergeLexicon and be blindly passed to this function. 139 // 140 // RETURNS: 141 // - OK on success 142 // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id 143 // exceeds the max TermId, is not valid or is not less than pre-existing hits 144 // in the main index. 145 // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks 146 // - RESOURCE_EXHAUSTED error if unable to grow the index 147 libtextclassifier3::Status AddHits( 148 const TermIdCodec& term_id_codec, 149 std::unordered_map<uint32_t, uint32_t>&& backfill_map, 150 std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id); 151 PersistToDisk()152 libtextclassifier3::Status PersistToDisk() { 153 if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) { 154 return libtextclassifier3::Status::OK; 155 } 156 return absl_ports::InternalError("Unable to sync lite index components."); 157 } 158 last_added_document_id()159 DocumentId last_added_document_id() const { 160 return flash_index_storage_->get_last_indexed_docid(); 161 } 162 Reset()163 libtextclassifier3::Status Reset() { 164 ICING_RETURN_IF_ERROR(flash_index_storage_->Reset()); 165 main_lexicon_->Clear(); 166 return libtextclassifier3::Status::OK; 167 } 168 Warm()169 void Warm() { main_lexicon_->Warm(); } 170 171 // Returns: 172 // - elements size of lexicon and index, on success 173 // - INTERNAL on IO error 174 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; 175 176 // Takes the provided storage_info, populates the fields related to the main 177 // index and returns that storage_info. 178 // 179 // If an IO error occurs while trying to calculate the value for a field, then 180 // that field will be set to -1. 181 IndexStorageInfoProto GetStorageInfo( 182 IndexStorageInfoProto storage_info) const; 183 184 // Returns debug information for the main index in out. 185 // verbosity <= 0, simplest debug information - just the lexicon 186 // verbosity > 0, more detailed debug information including raw postings 187 // lists. 188 void GetDebugInfo(int verbosity, std::string* out) const; 189 190 private: 191 libtextclassifier3::Status Init(const std::string& index_directory, 192 const Filesystem* filesystem, 193 const IcingFilesystem* icing_filesystem); 194 195 // Helpers for merging the lexicon 196 // Add all 'backfill' branch points. Backfill branch points are prefix 197 // branch points that are a prefix of terms that existed in the lexicon 198 // to the merge. 199 // 200 // For example, if the main lexicon only contains "foot" and is then merged 201 // with a lite lexicon containing only "fool", then a backfill branch point 202 // for "foo" will be added to contain prefix hits from both the pre-existing 203 // posting list for "foot" and the new posting list for "fool". 204 // 205 // Populates LexiconMergeOutputs.backfill_map 206 // 207 // RETURNS: 208 // - OK on success 209 // - INTERNAL on IO error while writing to the main lexicon. 210 libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints( 211 const IcingDynamicTrie& other_lexicon); 212 213 // Add all terms from the lexicon. 214 // 215 // Populates LexiconMergeOutputs.other_tvi_to_main_tvi 216 // 217 // RETURNS: 218 // - OK on success 219 // - INTERNAL on IO error while writing to the main lexicon. 220 libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms( 221 const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs); 222 223 // Add all branch points for terms added from the lexicon. 224 // For example, if the main lexicon is empty and is then merged with a 225 // lexicon containing only "foot" and "fool", then a branch point for "foo" 226 // will be added to contain prefix hits from both "foot" and "fool". 227 // 228 // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and 229 // LexiconMergeOutputs.prefix_tvis_buf; 230 // 231 // RETURNS: 232 // - OK on success 233 // - INTERNAL on IO error while writing to the main lexicon. 234 libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints( 235 const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs); 236 237 // Copies all properties from old_tvi in the other lexicon to the new_tvi in 238 // the main lexicon. 239 // Returns true on success, false if an IO error is encountered. 240 bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader, 241 const IcingDynamicTrie& other_lexicon, uint32_t other_tvi, 242 uint32_t new_main_tvi); 243 244 // Add all hits between [hit_elements, hit_elements + len) to main_index, 245 // updating the entry in the main lexicon at trie_value_index to point to the 246 // resulting posting list. Hits are sorted in descending document id order, so 247 // they should be to posting lists in reverse (starting at hit_elements 248 // + len - 1) and working backwards. Therefore, hit_elements must be in sorted 249 // order. 250 // 251 // trie_value_index may point to a valid posting list id if there is a 252 // pre-existing posting list to append to. 253 // 254 // If backfill_posting_list_id is valid, then the hits from the posting list 255 // identified by backfill_posting_list_id should be added to the new posting 256 // list before the hits in hit_elements. 257 // 258 // RETURNS: 259 // - OK on success 260 // - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid 261 // but points out of bounds in the IndexBlock referred to by 262 // id.block_index(), if one of the hits from [hit_elements,hit_elements+len) 263 // is not valid, or if one of the hits from [hit_elements,hit_elements+len) 264 // is not less than the previously added hits. 265 // - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid 266 // but points to an invalid block index or if unable to mmap the IndexBlock. 267 // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new 268 // posting list. 269 libtextclassifier3::Status AddHitsForTerm( 270 uint32_t tvi, PostingListIdentifier backfill_posting_list_id, 271 const TermIdHitPair* hit_elements, size_t len); 272 273 // Adds all prefix hits or hits from prefix sections present on the posting 274 // list identified by backfill_posting_list_id to hit_accum. 275 // 276 // RETURNS: 277 // - OK, on success 278 // - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the 279 // IndexBlock referred to by id.block_index() 280 // - INTERNAL_ERROR if unable to mmap the block identified by 281 // backfill_posting_list_id or if the posting list identified by 282 // backfill_posting_list_id has been corrupted. 283 // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new 284 // posting list. 285 libtextclassifier3::Status AddPrefixBackfillHits( 286 PostingListIdentifier backfill_posting_list_id, 287 PostingListAccessor* hit_accum); 288 289 std::unique_ptr<FlashIndexStorage> flash_index_storage_; 290 std::unique_ptr<IcingDynamicTrie> main_lexicon_; 291 }; 292 293 } // namespace lib 294 } // namespace icing 295 296 #endif // ICING_INDEX_MAIN_MAIN_INDEX_H_ 297