1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "icing/index/main/main-index-merger.h"
15 
16 #include "gmock/gmock.h"
17 #include "gtest/gtest.h"
18 #include "icing/absl_ports/canonical_errors.h"
19 #include "icing/file/filesystem.h"
20 #include "icing/index/iterator/doc-hit-info-iterator.h"
21 #include "icing/index/main/doc-hit-info-iterator-term-main.h"
22 #include "icing/index/main/main-index-merger.h"
23 #include "icing/index/main/main-index.h"
24 #include "icing/index/term-id-codec.h"
25 #include "icing/index/term-property-id.h"
26 #include "icing/legacy/index/icing-dynamic-trie.h"
27 #include "icing/legacy/index/icing-filesystem.h"
28 #include "icing/schema/section.h"
29 #include "icing/store/namespace-id.h"
30 #include "icing/testing/common-matchers.h"
31 #include "icing/testing/tmp-directory.h"
32 
33 namespace icing {
34 namespace lib {
35 
36 namespace {
37 
38 using ::testing::UnorderedElementsAre;
39 
40 class MainIndexMergerTest : public testing::Test {
41  protected:
SetUp()42   void SetUp() override {
43     index_dir_ = GetTestTempDir() + "/test_dir";
44     ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
45 
46     std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
47     LiteIndex::Options options(lite_index_file_name,
48                                /*hit_buffer_want_merge_bytes=*/1024 * 1024);
49     ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
50                                LiteIndex::Create(options, &icing_filesystem_));
51 
52     ICING_ASSERT_OK_AND_ASSIGN(
53         term_id_codec_,
54         TermIdCodec::Create(
55             IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
56             IcingDynamicTrie::max_value_index(options.lexicon_options)));
57   }
58 
TearDown()59   void TearDown() override {
60     ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
61   }
62 
63   std::string index_dir_;
64   Filesystem filesystem_;
65   IcingFilesystem icing_filesystem_;
66   std::unique_ptr<LiteIndex> lite_index_;
67   std::unique_ptr<TermIdCodec> term_id_codec_;
68 };
69 
70 constexpr NamespaceId kNamespace0 = 0;
71 
TEST_F(MainIndexMergerTest,TranslateTermNotAdded)72 TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
73   // 1. Index two docs in the Lite Index:
74   // - Doc0 {"foot" is_in_prefix_section=FALSE}
75   // - Doc1 {"fool", is_in_prefix_section=FALSE}
76   ICING_ASSERT_OK_AND_ASSIGN(
77       uint32_t foot_tvi,
78       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
79   ICING_ASSERT_OK_AND_ASSIGN(
80       uint32_t foot_term_id,
81       term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
82   ICING_ASSERT_OK_AND_ASSIGN(
83       uint32_t fool_tvi,
84       lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
85   ICING_ASSERT_OK_AND_ASSIGN(
86       uint32_t fool_term_id,
87       term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
88 
89   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
90                /*is_in_prefix_section=*/false);
91   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
92   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
93                /*is_in_prefix_section=*/false);
94   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
95 
96   // 2. Build up a fake LexiconMergeOutputs
97   // This is some made up number that doesn't matter for this test.
98   uint32_t foot_main_tvi = 5;
99 
100   // Only create a mapping for 'foot'. Leave out the mapping for 'fool'
101   MainIndex::LexiconMergeOutputs lexicon_outputs;
102   lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
103 
104   // 3. TranslateAndExpand should fail because 'fool' doesn't have a main tvi
105   // mapping.
106   ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits(
107                   *lite_index_, *term_id_codec_, lexicon_outputs),
108               StatusIs(libtextclassifier3::StatusCode::INTERNAL));
109 }
110 
TEST_F(MainIndexMergerTest,PrefixExpansion)111 TEST_F(MainIndexMergerTest, PrefixExpansion) {
112   // 1. Index two docs in the Lite Index:
113   // - Doc0 {"foot" is_in_prefix_section=FALSE}
114   // - Doc1 {"fool", is_in_prefix_section=TRUE}
115   ICING_ASSERT_OK_AND_ASSIGN(
116       uint32_t foot_tvi,
117       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
118   ICING_ASSERT_OK_AND_ASSIGN(
119       uint32_t foot_term_id,
120       term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
121   ICING_ASSERT_OK_AND_ASSIGN(
122       uint32_t fool_tvi,
123       lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
124   ICING_ASSERT_OK_AND_ASSIGN(
125       uint32_t fool_term_id,
126       term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
127 
128   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
129                /*is_in_prefix_section=*/false);
130   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
131   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
132                /*is_in_prefix_section=*/true);
133   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
134 
135   // 2. Build up a fake LexiconMergeOutputs
136   // This is some made up number that doesn't matter for this test.
137   uint32_t foo_main_tvi = 12;
138   ICING_ASSERT_OK_AND_ASSIGN(
139       uint32_t foo_term_id,
140       term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
141   Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1,
142                       Hit::kDefaultTermFrequency,
143                       /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
144 
145   uint32_t foot_main_tvi = 5;
146   ICING_ASSERT_OK_AND_ASSIGN(
147       uint32_t foot_main_term_id,
148       term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
149   uint32_t fool_main_tvi = 10;
150   ICING_ASSERT_OK_AND_ASSIGN(
151       uint32_t fool_main_term_id,
152       term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
153 
154   MainIndex::LexiconMergeOutputs lexicon_outputs;
155   // Map "fool" to it's prefix hit for "foo".
156   lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
157                                                         std::make_pair(0, 1));
158   lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
159   lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
160   lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
161 
162   // 3. TranslateAndExpand should;
163   //   a. Translate lite term ids to main term ids based on the map
164   //   b. Expand 'fool' to have a hit for 'foo'
165   ICING_ASSERT_OK_AND_ASSIGN(
166       std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
167       MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
168                                                   lexicon_outputs));
169   EXPECT_THAT(
170       expanded_term_id_hit_pairs,
171       UnorderedElementsAre(TermIdHitPair(foot_main_term_id, doc0_hit),
172                            TermIdHitPair(fool_main_term_id, doc1_hit),
173                            TermIdHitPair(foo_term_id, doc1_prefix_hit)));
174 }
175 
TEST_F(MainIndexMergerTest,DedupePrefixAndExactWithDifferentTermFrequencies)176 TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) {
177   // 1. Index one doc in the Lite Index:
178   // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
179   ICING_ASSERT_OK_AND_ASSIGN(
180       uint32_t foot_tvi,
181       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
182   ICING_ASSERT_OK_AND_ASSIGN(
183       uint32_t foot_term_id,
184       term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
185   ICING_ASSERT_OK_AND_ASSIGN(
186       uint32_t foo_tvi,
187       lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
188   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
189                              term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
190 
191   Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
192                     /*is_in_prefix_section=*/true);
193   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
194   Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
195                    Hit::kDefaultTermFrequency,
196                    /*is_in_prefix_section=*/true);
197   ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
198 
199   // 2. Build up a fake LexiconMergeOutputs
200   // This is some made up number that doesn't matter for this test.
201   uint32_t foo_main_tvi = 12;
202   ICING_ASSERT_OK_AND_ASSIGN(
203       uint32_t foo_main_term_id,
204       term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
205   // The prefix hit for 'foot' should have the same term frequency as the exact
206   // hit for 'foot'. The final prefix hit has term frequency equal to 58.
207   Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
208                       /*term_frequency=*/58,
209                       /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
210 
211   uint32_t foot_main_tvi = 5;
212   ICING_ASSERT_OK_AND_ASSIGN(
213       uint32_t foot_main_term_id,
214       term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
215 
216   MainIndex::LexiconMergeOutputs lexicon_outputs;
217   // Map "foot" to it's prefix hit for "foo".
218   lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
219                                                         std::make_pair(0, 1));
220   lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
221   lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
222   lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
223 
224   // 3. TranslateAndExpand should;
225   //   a. Translate lite term ids to main term ids based on the map
226   //   b. Expand 'foot' to have a hit for 'foo'
227   //   c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
228   //   latter with term frequency as the sum of the term frequencies.
229   ICING_ASSERT_OK_AND_ASSIGN(
230       std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
231       MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
232                                                   lexicon_outputs));
233   EXPECT_THAT(
234       expanded_term_id_hit_pairs,
235       UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
236                            TermIdHitPair(foo_main_term_id, foo_doc0_hit),
237                            TermIdHitPair(foo_main_term_id, doc0_prefix_hit)));
238 }
239 
TEST_F(MainIndexMergerTest,DedupeWithExactSameTermFrequencies)240 TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) {
241   // 1. Index one doc in the Lite Index:
242   // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
243   ICING_ASSERT_OK_AND_ASSIGN(
244       uint32_t foot_tvi,
245       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
246   ICING_ASSERT_OK_AND_ASSIGN(
247       uint32_t foot_term_id,
248       term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
249   ICING_ASSERT_OK_AND_ASSIGN(
250       uint32_t foo_tvi,
251       lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
252   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
253                              term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
254 
255   Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
256                     /*is_in_prefix_section=*/true);
257   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
258   Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
259                    /*is_in_prefix_section=*/true);
260   ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
261   // The prefix hit should take the sum as term_frequency - 114.
262   Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
263                           /*term_frequency=*/114,
264                           /*is_in_prefix_section=*/true,
265                           /*is_prefix_hit=*/true);
266 
267   // 2. Build up a fake LexiconMergeOutputs
268   // This is some made up number that doesn't matter for this test.
269   uint32_t foo_main_tvi = 12;
270   ICING_ASSERT_OK_AND_ASSIGN(
271       uint32_t foo_main_term_id,
272       term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
273 
274   uint32_t foot_main_tvi = 5;
275   ICING_ASSERT_OK_AND_ASSIGN(
276       uint32_t foot_main_term_id,
277       term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
278 
279   MainIndex::LexiconMergeOutputs lexicon_outputs;
280   // Map "foot" to it's prefix hit for "foo".
281   lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
282                                                         std::make_pair(0, 1));
283   lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
284   lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
285   lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
286 
287   // 3. TranslateAndExpand should;
288   //   a. Translate lite term ids to main term ids based on the map
289   //   b. Expand 'foot' to have a hit for 'foo'
290   //   c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
291   //   latter with term frequency as the sum of the term frequencies.
292   ICING_ASSERT_OK_AND_ASSIGN(
293       std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
294       MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
295                                                   lexicon_outputs));
296   EXPECT_THAT(expanded_term_id_hit_pairs,
297               UnorderedElementsAre(
298                   TermIdHitPair(foot_main_term_id, foot_doc0_hit),
299                   TermIdHitPair(foo_main_term_id, foo_doc0_hit),
300                   TermIdHitPair(foo_main_term_id, prefix_foo_doc0_hit)));
301 }
302 
TEST_F(MainIndexMergerTest,DedupePrefixExpansion)303 TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
304   // 1. Index one doc in the Lite Index:
305   // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE}
306   ICING_ASSERT_OK_AND_ASSIGN(
307       uint32_t foot_tvi,
308       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
309   ICING_ASSERT_OK_AND_ASSIGN(
310       uint32_t foot_term_id,
311       term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
312   ICING_ASSERT_OK_AND_ASSIGN(
313       uint32_t fool_tvi,
314       lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
315   ICING_ASSERT_OK_AND_ASSIGN(
316       uint32_t fool_term_id,
317       term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
318 
319   Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0,
320                     /*term_frequency=*/Hit::kMaxTermFrequency,
321                     /*is_in_prefix_section=*/true);
322   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
323   Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0,
324                     Hit::kDefaultTermFrequency,
325                     /*is_in_prefix_section=*/true);
326   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
327 
328   // 2. Build up a fake LexiconMergeOutputs
329   // This is some made up number that doesn't matter for this test.
330   uint32_t foo_main_tvi = 12;
331   ICING_ASSERT_OK_AND_ASSIGN(
332       uint32_t foo_term_id,
333       term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
334   // The prefix hit should take the sum as term frequency - 256, capped at
335   // kMaxTermFrequency.
336   Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
337                       /*term_frequency=*/Hit::kMaxTermFrequency,
338                       /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
339 
340   uint32_t foot_main_tvi = 5;
341   ICING_ASSERT_OK_AND_ASSIGN(
342       uint32_t foot_main_term_id,
343       term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
344   uint32_t fool_main_tvi = 10;
345   ICING_ASSERT_OK_AND_ASSIGN(
346       uint32_t fool_main_term_id,
347       term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
348 
349   MainIndex::LexiconMergeOutputs lexicon_outputs;
350   // Map "fool" to it's prefix hit for "foo" and "foot" to it's prefix hit for
351   // "foo".
352   lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
353                                                         std::make_pair(0, 1));
354   lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
355   lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
356                                                         std::make_pair(1, 1));
357   lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
358   lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
359   lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
360 
361   // 3. TranslateAndExpand should;
362   //   a. Translate lite term ids to main term ids based on the map
363   //   b. Expand 'foot' and 'fool' to have hits for 'foo'
364   //   c. Merge the prefix hits from 'foot' and 'fool', taking the sum as
365   //      term frequency.
366   ICING_ASSERT_OK_AND_ASSIGN(
367       std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
368       MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
369                                                   lexicon_outputs));
370   EXPECT_THAT(
371       expanded_term_id_hit_pairs,
372       UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
373                            TermIdHitPair(fool_main_term_id, fool_doc0_hit),
374                            TermIdHitPair(foo_term_id, doc0_prefix_hit)));
375 }
376 
377 }  // namespace
378 
379 }  // namespace lib
380 }  // namespace icing
381