1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_TERM_ID_CODEC_H_
16 #define ICING_INDEX_TERM_ID_CODEC_H_
17 
18 #include <cstdint>
19 #include <memory>
20 
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 
23 // Encodes/decodes TermIds into different TviTypes. A "tvi" is a
24 // term_value_index into some space, essentially a unique id within that space.
25 // Across TviTypes, tvis are not necessarily unique (i.e. we can have a tvi of 0
26 // in the LITE space and a tvi of 0 in the MAIN space). The codec maps tvis into
27 // one overall TermId space so that tvis can be represented by a unique TermId
28 // across all TviTypes (i.e. a MAIN tvi of 0 can be represented by 0, and a LITE
29 // tvi of 0 can be represented by 10). The max TermId will be the sum of the max
30 // MAIN tvi and the max LITE tvi.
31 //
32 // Example use:
33 //   ICING_ASSIGN_OR_RETURN(auto term_id_codec,
//       TermIdCodec::Create(/*max_main_tvi=*/5, /*max_lite_tvi=*/5));
35 //
//   term_id_codec->DecodeTviType(0); // TviType::MAIN
//   term_id_codec->DecodeTviType(4); // TviType::MAIN
//   term_id_codec->DecodeTviType(5); // TviType::LITE
//   term_id_codec->DecodeTviType(9); // TviType::LITE
//
//   term_id_codec->DecodeTviType(100); // INVALID_ARGUMENT, exceeds max TermId
42 //
43 // TODO(cassiewang): Use signed integers for the tvi values. Currently, the max
44 // values that are passed in are ~5 million for max_main_tvi, and ~1 million for
45 // max_lite_tvi. Since the sum of both of these is still well under the int32_t
// max, we should use signed integers (go/totw/159). But since
47 // we're getting these values from icing::DynamicTrie, we need to convert all
48 // the uints at once to avoid even worse undefined conversion behavior.
49 namespace icing {
50 namespace lib {
51 
// Identifies which tvi space a term_value_index (tvi) belongs to.
enum TviType {
  MAIN,  // tvi in the MAIN space
  LITE   // tvi in the LITE space
};
53 
54 class TermIdCodec {
55  public:
56   struct DecodedTermInfo {
57     TviType tvi_type;
58     uint32_t tvi;
59   };
60 
61   // Encodes/decodes TermIds based on a max main tvi and a max lite tvi. The max
62   // tvis are an exclusive upper bound on the values. For example, Create(5, 5)
63   // creates a MAIN encoding that holds [0, 1, 2, 3, 4] TermIds and a LITE
64   // encoding that holds [5, 6, 7, 8, 9] TermIds.
65   //
66   // Returns:
67   //   unique_ptr to a TermIdCodec on success
68   //   INVALID_ARGUMENT if the sum of max_main_tvi and max_lite_tvi is greater
69   //     than the max uint32_t value
70   static libtextclassifier3::StatusOr<std::unique_ptr<TermIdCodec>> Create(
71       uint32_t max_main_tvi, uint32_t max_lite_tvi);
72 
73   // Returns:
74   //   TermId that would represent the given tvi of tvi_type
75   //   INVALID_ARGUMENT if the tvi of tvi_type would exceed the max TermId
76   libtextclassifier3::StatusOr<uint32_t> EncodeTvi(uint32_t tvi,
77                                                    TviType tvi_type) const;
78 
79   // Returns:
80   //   TviType of the encoded TermId
81   //   INVALID_ARGUMENT if the term_id exceeds the max TermId
82   libtextclassifier3::StatusOr<TviType> DecodeTviType(uint32_t term_id) const;
83 
84   // Returns:
85   //   Decoded info of the given term_id
86   //   INVALID_ARGUMENT if the term_id exceeds the max TermId
87   libtextclassifier3::StatusOr<DecodedTermInfo> DecodeTermInfo(
88       uint32_t term_id) const;
89 
max_main_tvi()90   uint32_t max_main_tvi() const { return max_main_tvi_; }
91 
max_lite_tvi()92   uint32_t max_lite_tvi() const { return max_lite_tvi_; }
93 
max_term_id()94   uint32_t max_term_id() const { return max_main_tvi_ + max_lite_tvi_; }
95 
96  private:
TermIdCodec(uint32_t max_main_tvi,uint32_t max_lite_tvi)97   explicit TermIdCodec(uint32_t max_main_tvi, uint32_t max_lite_tvi)
98       : max_main_tvi_(max_main_tvi), max_lite_tvi_(max_lite_tvi) {}
99 
100   uint32_t max_main_tvi_;
101   uint32_t max_lite_tvi_;
102 };
103 
104 }  // namespace lib
105 }  // namespace icing
106 
107 #endif  // ICING_INDEX_TERM_ID_CODEC_H_
108