1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "icing/index/main/main-index-merger.h"
15
16 #include "gmock/gmock.h"
17 #include "gtest/gtest.h"
18 #include "icing/absl_ports/canonical_errors.h"
19 #include "icing/file/filesystem.h"
20 #include "icing/index/iterator/doc-hit-info-iterator.h"
21 #include "icing/index/main/doc-hit-info-iterator-term-main.h"
22 #include "icing/index/main/main-index-merger.h"
23 #include "icing/index/main/main-index.h"
24 #include "icing/index/term-id-codec.h"
25 #include "icing/index/term-property-id.h"
26 #include "icing/legacy/index/icing-dynamic-trie.h"
27 #include "icing/legacy/index/icing-filesystem.h"
28 #include "icing/schema/section.h"
29 #include "icing/store/namespace-id.h"
30 #include "icing/testing/common-matchers.h"
31 #include "icing/testing/tmp-directory.h"
32
33 namespace icing {
34 namespace lib {
35
36 namespace {
37
38 using ::testing::UnorderedElementsAre;
39
40 class MainIndexMergerTest : public testing::Test {
41 protected:
SetUp()42 void SetUp() override {
43 index_dir_ = GetTestTempDir() + "/test_dir";
44 ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
45
46 std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
47 LiteIndex::Options options(lite_index_file_name,
48 /*hit_buffer_want_merge_bytes=*/1024 * 1024);
49 ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
50 LiteIndex::Create(options, &icing_filesystem_));
51
52 ICING_ASSERT_OK_AND_ASSIGN(
53 term_id_codec_,
54 TermIdCodec::Create(
55 IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
56 IcingDynamicTrie::max_value_index(options.lexicon_options)));
57 }
58
TearDown()59 void TearDown() override {
60 ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
61 }
62
63 std::string index_dir_;
64 Filesystem filesystem_;
65 IcingFilesystem icing_filesystem_;
66 std::unique_ptr<LiteIndex> lite_index_;
67 std::unique_ptr<TermIdCodec> term_id_codec_;
68 };
69
70 constexpr NamespaceId kNamespace0 = 0;
71
TEST_F(MainIndexMergerTest,TranslateTermNotAdded)72 TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
73 // 1. Index two docs in the Lite Index:
74 // - Doc0 {"foot" is_in_prefix_section=FALSE}
75 // - Doc1 {"fool", is_in_prefix_section=FALSE}
76 ICING_ASSERT_OK_AND_ASSIGN(
77 uint32_t foot_tvi,
78 lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
79 ICING_ASSERT_OK_AND_ASSIGN(
80 uint32_t foot_term_id,
81 term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
82 ICING_ASSERT_OK_AND_ASSIGN(
83 uint32_t fool_tvi,
84 lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
85 ICING_ASSERT_OK_AND_ASSIGN(
86 uint32_t fool_term_id,
87 term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
88
89 Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
90 /*is_in_prefix_section=*/false);
91 ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
92 Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
93 /*is_in_prefix_section=*/false);
94 ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
95
96 // 2. Build up a fake LexiconMergeOutputs
97 // This is some made up number that doesn't matter for this test.
98 uint32_t foot_main_tvi = 5;
99
100 // Only create a mapping for 'foot'. Leave out the mapping for 'fool'
101 MainIndex::LexiconMergeOutputs lexicon_outputs;
102 lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
103
104 // 3. TranslateAndExpand should fail because 'fool' doesn't have a main tvi
105 // mapping.
106 ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits(
107 *lite_index_, *term_id_codec_, lexicon_outputs),
108 StatusIs(libtextclassifier3::StatusCode::INTERNAL));
109 }
110
TEST_F(MainIndexMergerTest,PrefixExpansion)111 TEST_F(MainIndexMergerTest, PrefixExpansion) {
112 // 1. Index two docs in the Lite Index:
113 // - Doc0 {"foot" is_in_prefix_section=FALSE}
114 // - Doc1 {"fool", is_in_prefix_section=TRUE}
115 ICING_ASSERT_OK_AND_ASSIGN(
116 uint32_t foot_tvi,
117 lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
118 ICING_ASSERT_OK_AND_ASSIGN(
119 uint32_t foot_term_id,
120 term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
121 ICING_ASSERT_OK_AND_ASSIGN(
122 uint32_t fool_tvi,
123 lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
124 ICING_ASSERT_OK_AND_ASSIGN(
125 uint32_t fool_term_id,
126 term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
127
128 Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
129 /*is_in_prefix_section=*/false);
130 ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
131 Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
132 /*is_in_prefix_section=*/true);
133 ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
134
135 // 2. Build up a fake LexiconMergeOutputs
136 // This is some made up number that doesn't matter for this test.
137 uint32_t foo_main_tvi = 12;
138 ICING_ASSERT_OK_AND_ASSIGN(
139 uint32_t foo_term_id,
140 term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
141 Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1,
142 Hit::kDefaultTermFrequency,
143 /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
144
145 uint32_t foot_main_tvi = 5;
146 ICING_ASSERT_OK_AND_ASSIGN(
147 uint32_t foot_main_term_id,
148 term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
149 uint32_t fool_main_tvi = 10;
150 ICING_ASSERT_OK_AND_ASSIGN(
151 uint32_t fool_main_term_id,
152 term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
153
154 MainIndex::LexiconMergeOutputs lexicon_outputs;
155 // Map "fool" to it's prefix hit for "foo".
156 lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
157 std::make_pair(0, 1));
158 lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
159 lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
160 lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
161
162 // 3. TranslateAndExpand should;
163 // a. Translate lite term ids to main term ids based on the map
164 // b. Expand 'fool' to have a hit for 'foo'
165 ICING_ASSERT_OK_AND_ASSIGN(
166 std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
167 MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
168 lexicon_outputs));
169 EXPECT_THAT(
170 expanded_term_id_hit_pairs,
171 UnorderedElementsAre(TermIdHitPair(foot_main_term_id, doc0_hit),
172 TermIdHitPair(fool_main_term_id, doc1_hit),
173 TermIdHitPair(foo_term_id, doc1_prefix_hit)));
174 }
175
TEST_F(MainIndexMergerTest,DedupePrefixAndExactWithDifferentTermFrequencies)176 TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) {
177 // 1. Index one doc in the Lite Index:
178 // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
179 ICING_ASSERT_OK_AND_ASSIGN(
180 uint32_t foot_tvi,
181 lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
182 ICING_ASSERT_OK_AND_ASSIGN(
183 uint32_t foot_term_id,
184 term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
185 ICING_ASSERT_OK_AND_ASSIGN(
186 uint32_t foo_tvi,
187 lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
188 ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
189 term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
190
191 Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
192 /*is_in_prefix_section=*/true);
193 ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
194 Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
195 Hit::kDefaultTermFrequency,
196 /*is_in_prefix_section=*/true);
197 ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
198
199 // 2. Build up a fake LexiconMergeOutputs
200 // This is some made up number that doesn't matter for this test.
201 uint32_t foo_main_tvi = 12;
202 ICING_ASSERT_OK_AND_ASSIGN(
203 uint32_t foo_main_term_id,
204 term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
205 // The prefix hit for 'foot' should have the same term frequency as the exact
206 // hit for 'foot'. The final prefix hit has term frequency equal to 58.
207 Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
208 /*term_frequency=*/58,
209 /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
210
211 uint32_t foot_main_tvi = 5;
212 ICING_ASSERT_OK_AND_ASSIGN(
213 uint32_t foot_main_term_id,
214 term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
215
216 MainIndex::LexiconMergeOutputs lexicon_outputs;
217 // Map "foot" to it's prefix hit for "foo".
218 lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
219 std::make_pair(0, 1));
220 lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
221 lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
222 lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
223
224 // 3. TranslateAndExpand should;
225 // a. Translate lite term ids to main term ids based on the map
226 // b. Expand 'foot' to have a hit for 'foo'
227 // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
228 // latter with term frequency as the sum of the term frequencies.
229 ICING_ASSERT_OK_AND_ASSIGN(
230 std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
231 MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
232 lexicon_outputs));
233 EXPECT_THAT(
234 expanded_term_id_hit_pairs,
235 UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
236 TermIdHitPair(foo_main_term_id, foo_doc0_hit),
237 TermIdHitPair(foo_main_term_id, doc0_prefix_hit)));
238 }
239
TEST_F(MainIndexMergerTest,DedupeWithExactSameTermFrequencies)240 TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) {
241 // 1. Index one doc in the Lite Index:
242 // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
243 ICING_ASSERT_OK_AND_ASSIGN(
244 uint32_t foot_tvi,
245 lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
246 ICING_ASSERT_OK_AND_ASSIGN(
247 uint32_t foot_term_id,
248 term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
249 ICING_ASSERT_OK_AND_ASSIGN(
250 uint32_t foo_tvi,
251 lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
252 ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
253 term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
254
255 Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
256 /*is_in_prefix_section=*/true);
257 ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
258 Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
259 /*is_in_prefix_section=*/true);
260 ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
261 // The prefix hit should take the sum as term_frequency - 114.
262 Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
263 /*term_frequency=*/114,
264 /*is_in_prefix_section=*/true,
265 /*is_prefix_hit=*/true);
266
267 // 2. Build up a fake LexiconMergeOutputs
268 // This is some made up number that doesn't matter for this test.
269 uint32_t foo_main_tvi = 12;
270 ICING_ASSERT_OK_AND_ASSIGN(
271 uint32_t foo_main_term_id,
272 term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
273
274 uint32_t foot_main_tvi = 5;
275 ICING_ASSERT_OK_AND_ASSIGN(
276 uint32_t foot_main_term_id,
277 term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
278
279 MainIndex::LexiconMergeOutputs lexicon_outputs;
280 // Map "foot" to it's prefix hit for "foo".
281 lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
282 std::make_pair(0, 1));
283 lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
284 lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
285 lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
286
287 // 3. TranslateAndExpand should;
288 // a. Translate lite term ids to main term ids based on the map
289 // b. Expand 'foot' to have a hit for 'foo'
290 // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
291 // latter with term frequency as the sum of the term frequencies.
292 ICING_ASSERT_OK_AND_ASSIGN(
293 std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
294 MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
295 lexicon_outputs));
296 EXPECT_THAT(expanded_term_id_hit_pairs,
297 UnorderedElementsAre(
298 TermIdHitPair(foot_main_term_id, foot_doc0_hit),
299 TermIdHitPair(foo_main_term_id, foo_doc0_hit),
300 TermIdHitPair(foo_main_term_id, prefix_foo_doc0_hit)));
301 }
302
TEST_F(MainIndexMergerTest,DedupePrefixExpansion)303 TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
304 // 1. Index one doc in the Lite Index:
305 // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE}
306 ICING_ASSERT_OK_AND_ASSIGN(
307 uint32_t foot_tvi,
308 lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
309 ICING_ASSERT_OK_AND_ASSIGN(
310 uint32_t foot_term_id,
311 term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
312 ICING_ASSERT_OK_AND_ASSIGN(
313 uint32_t fool_tvi,
314 lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
315 ICING_ASSERT_OK_AND_ASSIGN(
316 uint32_t fool_term_id,
317 term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
318
319 Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0,
320 /*term_frequency=*/Hit::kMaxTermFrequency,
321 /*is_in_prefix_section=*/true);
322 ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
323 Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0,
324 Hit::kDefaultTermFrequency,
325 /*is_in_prefix_section=*/true);
326 ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
327
328 // 2. Build up a fake LexiconMergeOutputs
329 // This is some made up number that doesn't matter for this test.
330 uint32_t foo_main_tvi = 12;
331 ICING_ASSERT_OK_AND_ASSIGN(
332 uint32_t foo_term_id,
333 term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
334 // The prefix hit should take the sum as term frequency - 256, capped at
335 // kMaxTermFrequency.
336 Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
337 /*term_frequency=*/Hit::kMaxTermFrequency,
338 /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
339
340 uint32_t foot_main_tvi = 5;
341 ICING_ASSERT_OK_AND_ASSIGN(
342 uint32_t foot_main_term_id,
343 term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
344 uint32_t fool_main_tvi = 10;
345 ICING_ASSERT_OK_AND_ASSIGN(
346 uint32_t fool_main_term_id,
347 term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
348
349 MainIndex::LexiconMergeOutputs lexicon_outputs;
350 // Map "fool" to it's prefix hit for "foo" and "foot" to it's prefix hit for
351 // "foo".
352 lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
353 std::make_pair(0, 1));
354 lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
355 lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
356 std::make_pair(1, 1));
357 lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
358 lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
359 lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
360
361 // 3. TranslateAndExpand should;
362 // a. Translate lite term ids to main term ids based on the map
363 // b. Expand 'foot' and 'fool' to have hits for 'foo'
364 // c. Merge the prefix hits from 'foot' and 'fool', taking the sum as
365 // term frequency.
366 ICING_ASSERT_OK_AND_ASSIGN(
367 std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
368 MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
369 lexicon_outputs));
370 EXPECT_THAT(
371 expanded_term_id_hit_pairs,
372 UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
373 TermIdHitPair(fool_main_term_id, fool_doc0_hit),
374 TermIdHitPair(foo_term_id, doc0_prefix_hit)));
375 }
376
377 } // namespace
378
379 } // namespace lib
380 } // namespace icing
381