// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
#define ICING_INDEX_MAIN_MAIN_INDEX_H_

#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/flash-index-storage.h"
#include "icing/index/main/posting-list-accessor.h"
#include "icing/index/main/posting-list-identifier.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/storage.pb.h"
#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

class MainIndex {
 public:
  // RETURNS:
  //  - valid instance of MainIndex, on success.
  //  - INTERNAL error if unable to create the lexicon or flash storage.
  static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create(
      const std::string& index_directory, const Filesystem* filesystem,
      const IcingFilesystem* icing_filesystem);
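
  // Example usage (an illustrative sketch, not part of the API; the index
  // directory path and the filesystem instances are assumptions supplied by
  // the caller):
  //
  //   Filesystem filesystem;
  //   IcingFilesystem icing_filesystem;
  //   ICING_ASSIGN_OR_RETURN(
  //       std::unique_ptr<MainIndex> main_index,
  //       MainIndex::Create("/path/to/index/main", &filesystem,
  //                         &icing_filesystem));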

  // Get a PostingListAccessor that holds the posting list chain for 'term'.
  //
  // RETURNS:
  //  - On success, a valid PostingListAccessor
  //  - NOT_FOUND if term is not present in the main index.
  libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
  GetAccessorForExactTerm(const std::string& term);
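
  // Example (illustrative sketch; assumes 'main_index' was created via
  // MainIndex::Create and that "foo" has already been normalized):
  //
  //   auto accessor_or = main_index->GetAccessorForExactTerm("foo");
  //   if (!accessor_or.ok()) {
  //     // NOT_FOUND: "foo" has no posting list in the main index.
  //   }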

  // Get a PostingListAccessor for 'prefix'.
  //
  // RETURNS:
  //  - On success, a result containing a valid PostingListAccessor.
  //  - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
  //    prefix are present in the main index.
  struct GetPrefixAccessorResult {
    // A PostingListAccessor that holds the posting list chain for the term
    // that best represents 'prefix' in the main index.
    std::unique_ptr<PostingListAccessor> accessor;
    // True if the returned posting list chain is for 'prefix' or false if the
    // returned posting list chain is for a term for which 'prefix' is a
    // prefix.
    bool exact;
  };
  libtextclassifier3::StatusOr<GetPrefixAccessorResult>
  GetAccessorForPrefixTerm(const std::string& prefix);
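
  // Example (illustrative sketch; assumes 'main_index' is a valid instance and
  // "fo" is an already-normalized prefix):
  //
  //   ICING_ASSIGN_OR_RETURN(MainIndex::GetPrefixAccessorResult result,
  //                          main_index->GetAccessorForPrefixTerm("fo"));
  //   if (result.exact) {
  //     // The chain holds hits for the term "fo" itself.
  //   } else {
  //     // The chain holds hits for some term that "fo" is a prefix of.
  //   }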

  // Finds terms with the given prefix in the given namespaces. If
  // 'namespace_ids' is empty, returns results from all the namespaces. The
  // input prefix must be normalized, otherwise inaccurate results may be
  // returned. Results are not ranked in any particular way; they are returned
  // in lexicographical order. No more than 'num_to_return' results are
  // returned.
  //
  // The hit count returned with each TermMetadata is an approximation based on
  // posting list size.
  //
  // Returns:
  //   A list of TermMetadata on success
  //   INTERNAL_ERROR if it fails to access term data.
  libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
      const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
      int num_to_return);
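
  // Example (illustrative sketch; the prefix, namespace filter and limit are
  // arbitrary values chosen for illustration):
  //
  //   std::vector<NamespaceId> namespace_ids;  // empty = all namespaces
  //   ICING_ASSIGN_OR_RETURN(
  //       std::vector<TermMetadata> terms,
  //       main_index->FindTermsByPrefix("foo", namespace_ids,
  //                                     /*num_to_return=*/10));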

  struct LexiconMergeOutputs {
    // Maps from the main_lexicon tvi for a new branching point to the
    // main_lexicon tvi for the posting list whose hits must be backfilled.
    std::unordered_map<uint32_t, uint32_t> backfill_map;

    // Maps from the other lexicon's tvis to main_lexicon tvis.
    std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;

    // Maps from main lexicon tvi to the block index. Tvis with no entry do not
    // have an allocated posting list.
    std::unordered_map<uint32_t, int> main_tvi_to_block_index;

    // Maps from the other lexicon's tvi to the beginning position in
    // prefix_tvis_buf and the length.
    std::unordered_map<uint32_t, std::pair<int, int>>
        other_tvi_to_prefix_main_tvis;

    // Stores tvis that are mapped to by other_tvi_to_prefix_main_tvis.
    std::vector<uint32_t> prefix_tvis_buf;
  };

  // Merges 'other_lexicon' (e.g. the lite index's lexicon) into the main
  // lexicon and populates the data structures necessary to translate lite tvis
  // to main tvis, track backfilling, and expand lite terms to prefix terms.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
      const IcingDynamicTrie& other_lexicon) {
    // Backfill branch points need to be added first so that the backfill_map
    // can be correctly populated.
    ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
                           AddBackfillBranchPoints(other_lexicon));
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddTerms(other_lexicon, std::move(outputs)));
    // Non-backfill branch points need to be added last so that the mapping of
    // newly added terms to prefix terms can be correctly populated (prefix
    // terms might be branch points between two new terms or between a
    // pre-existing term and a new term).
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddBranchPoints(other_lexicon, std::move(outputs)));
    return outputs;
  }

  // Add hits to the main index and backfill from existing posting lists to new
  // backfill branch points.
  //
  // The backfill_map maps from the main_lexicon tvi for a newly added
  // branching point to the main_lexicon tvi for the posting list whose hits
  // must be backfilled. backfill_map should be populated as part of
  // LexiconMergeOutputs in MergeLexicon and be blindly passed to this
  // function. See the typical merge flow sketched below.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
  //  that exceeds the max TermId, is not valid, or is not less than
  //  pre-existing hits in the main index.
  //  - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
  //  - RESOURCE_EXHAUSTED error if unable to grow the index
  libtextclassifier3::Status AddHits(
      const TermIdCodec& term_id_codec,
      std::unordered_map<uint32_t, uint32_t>&& backfill_map,
      std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id);
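
  // Typical merge flow (an illustrative sketch of how MergeLexicon and AddHits
  // fit together when flushing a lite index into the main index;
  // 'lite_lexicon', 'term_id_codec', 'hits' and 'last_docid' are assumed to be
  // state owned by the caller):
  //
  //   ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
  //                          main_index->MergeLexicon(lite_lexicon));
  //   ICING_RETURN_IF_ERROR(main_index->AddHits(
  //       term_id_codec, std::move(outputs.backfill_map), std::move(hits),
  //       last_docid));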

  // Flushes the main lexicon and the flash index storage to disk.
  //
  // RETURNS:
  //  - OK on success
  //  - INTERNAL on IO error
  libtextclassifier3::Status PersistToDisk() {
    if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) {
      return libtextclassifier3::Status::OK;
    }
    return absl_ports::InternalError("Unable to sync main index components.");
  }

  // Returns the id of the last document whose hits were added to the index.
  DocumentId last_added_document_id() const {
    return flash_index_storage_->get_last_indexed_docid();
  }

  // Resets the index to an empty state: clears the flash index storage and the
  // main lexicon.
  libtextclassifier3::Status Reset() {
    ICING_RETURN_IF_ERROR(flash_index_storage_->Reset());
    main_lexicon_->Clear();
    return libtextclassifier3::Status::OK;
  }

  void Warm() { main_lexicon_->Warm(); }

  // Returns:
  //  - elements size of lexicon and index, on success
  //  - INTERNAL on IO error
  libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;

  // Takes the provided storage_info, populates the fields related to the main
  // index and returns that storage_info.
  //
  // If an IO error occurs while trying to calculate the value for a field,
  // then that field will be set to -1.
  IndexStorageInfoProto GetStorageInfo(
      IndexStorageInfoProto storage_info) const;

  // Returns debug information for the main index in out.
  // verbosity <= 0, simplest debug information - just the lexicon
  // verbosity > 0, more detailed debug information including raw postings
  //                lists.
  void GetDebugInfo(int verbosity, std::string* out) const;

 private:
  libtextclassifier3::Status Init(const std::string& index_directory,
                                  const Filesystem* filesystem,
                                  const IcingFilesystem* icing_filesystem);

  // Helpers for merging the lexicon.

  // Add all 'backfill' branch points. Backfill branch points are prefix
  // branch points that are a prefix of terms that existed in the lexicon
  // prior to the merge.
  //
  // For example, if the main lexicon only contains "foot" and is then merged
  // with a lite lexicon containing only "fool", then a backfill branch point
  // for "foo" will be added to contain prefix hits from both the pre-existing
  // posting list for "foot" and the new posting list for "fool".
  //
  // Populates LexiconMergeOutputs.backfill_map.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
      const IcingDynamicTrie& other_lexicon);

  // Add all terms from the other lexicon to the main lexicon.
  //
  // Populates LexiconMergeOutputs.other_tvi_to_main_tvi.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Add all branch points for terms added from the other lexicon.
  // For example, if the main lexicon is empty and is then merged with a
  // lexicon containing only "foot" and "fool", then a branch point for "foo"
  // will be added to contain prefix hits from both "foot" and "fool".
  //
  // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
  // LexiconMergeOutputs.prefix_tvis_buf.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Copies all properties from 'other_tvi' in the other lexicon to
  // 'new_main_tvi' in the main lexicon.
  // Returns true on success, false if an IO error is encountered.
  bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
                      const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
                      uint32_t new_main_tvi);

  // Add all hits between [hit_elements, hit_elements + len) to main_index,
  // updating the entry in the main lexicon at trie_value_index to point to the
  // resulting posting list. Hits are sorted in descending document id order,
  // so they should be added to posting lists in reverse (starting at
  // hit_elements + len - 1) and working backwards. Therefore, hit_elements
  // must be in sorted order.
  //
  // trie_value_index may point to a valid posting list id if there is a
  // pre-existing posting list to append to.
  //
  // If backfill_posting_list_id is valid, then the hits from the posting list
  // identified by backfill_posting_list_id should be added to the new posting
  // list before the hits in hit_elements.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if the posting_list_id stored at trie_value_index is
  //  valid but points out of bounds in the IndexBlock referred to by
  //  id.block_index(), if one of the hits from [hit_elements, hit_elements+len)
  //  is not valid, or if one of the hits from [hit_elements, hit_elements+len)
  //  is not less than the previously added hits.
  //  - INTERNAL_ERROR if the posting_list_id stored at trie_value_index is
  //  valid but points to an invalid block index, or if unable to mmap the
  //  IndexBlock.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //  posting list.
  libtextclassifier3::Status AddHitsForTerm(
      uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
      const TermIdHitPair* hit_elements, size_t len);

  // Adds all prefix hits or hits from prefix sections present on the posting
  // list identified by backfill_posting_list_id to hit_accum.
  //
  // RETURNS:
  //  - OK, on success
  //  - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in
  //  the IndexBlock referred to by id.block_index()
  //  - INTERNAL_ERROR if unable to mmap the block identified by
  //  backfill_posting_list_id or if the posting list identified by
  //  backfill_posting_list_id has been corrupted.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //  posting list.
  libtextclassifier3::Status AddPrefixBackfillHits(
      PostingListIdentifier backfill_posting_list_id,
      PostingListAccessor* hit_accum);

  std::unique_ptr<FlashIndexStorage> flash_index_storage_;
  std::unique_ptr<IcingDynamicTrie> main_lexicon_;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_INDEX_MAIN_MAIN_INDEX_H_