1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 16 #define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 17 18 #include <memory> 19 #include <string> 20 #include <string_view> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/transform/normalizer.h" 24 #include "unicode/unorm2.h" 25 #include "unicode/utrans.h" 26 27 namespace icing { 28 namespace lib { 29 30 // Used to normalize UTF8 strings for text matching. It enforces a set of rules: 31 // 1. Transforms text to be lowercase UTF8. 32 // 2. Transforms full-width Latin characters to ASCII characters if possible. 33 // 3. Transforms hiragana to katakana. 34 // 4. Removes accent / diacritic marks on Latin characters 35 // 5. Normalized text must be less than or equal to max_term_byte_size, 36 // otherwise it will be truncated. 37 // 38 // There're some other rules from ICU not listed here, please see .cc file for 39 // details. 40 class IcuNormalizer : public Normalizer { 41 public: 42 // Creates a normalizer with the subcomponents it needs. max_term_byte_size 43 // enforces the max size of text after normalization, text will be truncated 44 // if exceeds the max size. 45 // 46 // Returns: 47 // A normalizer on success 48 // INVALID_ARGUMENT if max_term_byte_size <= 0 49 // INTERNAL_ERROR if failed to create any subcomponent 50 static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create( 51 int max_term_byte_size); 52 53 // Normalizes the input term based on rules. See .cc file for rule details. 54 // 55 // NOTE: Term should not mix Latin and non-Latin characters. Doing so may 56 // result in the non-Latin characters not properly being normalized 57 std::string NormalizeTerm(std::string_view term) const override; 58 59 private: 60 // A handler class that helps manage the lifecycle of UTransliterator. It's 61 // used in IcuNormalizer to transform terms into the formats we need. 62 class TermTransformer { 63 public: 64 // Creates TermTransformer with a valid UTransliterator instance 65 // 66 // Returns: 67 // A term transformer on success 68 // INTERNAL_ERROR if failed to create any subcomponent 69 static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>> 70 Create(); 71 72 // Closes the UTransliterator instance 73 ~TermTransformer(); 74 75 // Transforms the text based on our rules described at top of this file 76 std::string Transform(std::string_view term) const; 77 78 private: 79 explicit TermTransformer(UTransliterator* u_transliterator); 80 81 // An ICU class to execute custom term transformation / normalization rules. 82 // utrans_close() must by called after using. 83 UTransliterator* u_transliterator_; 84 }; 85 86 explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer, 87 int max_term_byte_size); 88 89 // Helper method to normalize Latin terms only. Rules applied: 90 // 1. Uppercase to lowercase 91 // 2. Remove diacritic (accent) marks 92 std::string NormalizeLatin(const UNormalizer2* normalizer2, 93 std::string_view term) const; 94 95 // Used to transform terms into their normalized forms. 96 std::unique_ptr<TermTransformer> term_transformer_; 97 98 // The maximum term length allowed after normalization. 99 int max_term_byte_size_; 100 }; 101 102 } // namespace lib 103 } // namespace icing 104 105 #endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 106