1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_TERM_ID_CODEC_H_ 16 #define ICING_INDEX_TERM_ID_CODEC_H_ 17 18 #include <cstdint> 19 #include <memory> 20 21 #include "icing/text_classifier/lib3/utils/base/statusor.h" 22 23 // Encodes/decodes TermIds into different TviTypes. A "tvi" is a 24 // term_value_index into some space, essentially a unique id within that space. 25 // Across TviTypes, tvis are not necessarily unique (i.e. we can have a tvi of 0 26 // in the LITE space and a tvi of 0 in the MAIN space). The codec maps tvis into 27 // one overall TermId space so that tvis can be represented by a unique TermId 28 // across all TviTypes (i.e. a MAIN tvi of 0 can be represented by 0, and a LITE 29 // tvi of 0 can be represented by 10). The max TermId will be the sum of the max 30 // MAIN tvi and the max LITE tvi. 31 // 32 // Example use: 33 // ICING_ASSIGN_OR_RETURN(auto term_id_codec, 34 // TermIdCodec::Create(/*max_main_tvi=*/5, /*max_lite_tvi=*/5); 35 // 36 // term_id_codec->tvi_type(0); // TviType::Main 37 // term_id_codec->tvi_type(4); // TviType::Main 38 // term_id_codec->tvi_type(5); // TviType::Lite 39 // term_id_codec->tvi_type(9); // TviType::Lite 40 // 41 // term_id_codec->tvi_type(100); // INVALID_ARGUMENT, exceeds max TermId 42 // 43 // TODO(cassiewang): Use signed integers for the tvi values. Currently, the max 44 // values that are passed in are ~5 million for max_main_tvi, and ~1 million for 45 // max_lite_tvi. Since the sum of both of these is still well under the int32_t 46 // max, we should use signed integers (go/totw/159) (go/totw/159). But since 47 // we're getting these values from icing::DynamicTrie, we need to convert all 48 // the uints at once to avoid even worse undefined conversion behavior. 49 namespace icing { 50 namespace lib { 51 52 enum TviType { MAIN, LITE }; 53 54 class TermIdCodec { 55 public: 56 struct DecodedTermInfo { 57 TviType tvi_type; 58 uint32_t tvi; 59 }; 60 61 // Encodes/decodes TermIds based on a max main tvi and a max lite tvi. The max 62 // tvis are an exclusive upper bound on the values. For example, Create(5, 5) 63 // creates a MAIN encoding that holds [0, 1, 2, 3, 4] TermIds and a LITE 64 // encoding that holds [5, 6, 7, 8, 9] TermIds. 65 // 66 // Returns: 67 // unique_ptr to a TermIdCodec on success 68 // INVALID_ARGUMENT if the sum of max_main_tvi and max_lite_tvi is greater 69 // than the max uint32_t value 70 static libtextclassifier3::StatusOr<std::unique_ptr<TermIdCodec>> Create( 71 uint32_t max_main_tvi, uint32_t max_lite_tvi); 72 73 // Returns: 74 // TermId that would represent the given tvi of tvi_type 75 // INVALID_ARGUMENT if the tvi of tvi_type would exceed the max TermId 76 libtextclassifier3::StatusOr<uint32_t> EncodeTvi(uint32_t tvi, 77 TviType tvi_type) const; 78 79 // Returns: 80 // TviType of the encoded TermId 81 // INVALID_ARGUMENT if the term_id exceeds the max TermId 82 libtextclassifier3::StatusOr<TviType> DecodeTviType(uint32_t term_id) const; 83 84 // Returns: 85 // Decoded info of the given term_id 86 // INVALID_ARGUMENT if the term_id exceeds the max TermId 87 libtextclassifier3::StatusOr<DecodedTermInfo> DecodeTermInfo( 88 uint32_t term_id) const; 89 max_main_tvi()90 uint32_t max_main_tvi() const { return max_main_tvi_; } 91 max_lite_tvi()92 uint32_t max_lite_tvi() const { return max_lite_tvi_; } 93 max_term_id()94 uint32_t max_term_id() const { return max_main_tvi_ + max_lite_tvi_; } 95 96 private: TermIdCodec(uint32_t max_main_tvi,uint32_t max_lite_tvi)97 explicit TermIdCodec(uint32_t max_main_tvi, uint32_t max_lite_tvi) 98 : max_main_tvi_(max_main_tvi), max_lite_tvi_(max_lite_tvi) {} 99 100 uint32_t max_main_tvi_; 101 uint32_t max_lite_tvi_; 102 }; 103 104 } // namespace lib 105 } // namespace icing 106 107 #endif // ICING_INDEX_TERM_ID_CODEC_H_ 108