1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
16 #define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
17 
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/transform/normalizer.h"
24 #include "unicode/unorm2.h"
25 #include "unicode/utrans.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // Used to normalize UTF8 strings for text matching. It enforces a set of rules:
31 //  1. Transforms text to be lowercase UTF8.
32 //  2. Transforms full-width Latin characters to ASCII characters if possible.
33 //  3. Transforms hiragana to katakana.
34 //  4. Removes accent / diacritic marks on Latin characters
35 //  5. Normalized text must be less than or equal to max_term_byte_size,
36 //     otherwise it will be truncated.
37 //
38 // There're some other rules from ICU not listed here, please see .cc file for
39 // details.
40 class IcuNormalizer : public Normalizer {
41  public:
42   // Creates a normalizer with the subcomponents it needs. max_term_byte_size
43   // enforces the max size of text after normalization, text will be truncated
44   // if exceeds the max size.
45   //
46   // Returns:
47   //   A normalizer on success
48   //   INVALID_ARGUMENT if max_term_byte_size <= 0
49   //   INTERNAL_ERROR if failed to create any subcomponent
50   static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create(
51       int max_term_byte_size);
52 
53   // Normalizes the input term based on rules. See .cc file for rule details.
54   //
55   // NOTE: Term should not mix Latin and non-Latin characters. Doing so may
56   // result in the non-Latin characters not properly being normalized
57   std::string NormalizeTerm(std::string_view term) const override;
58 
59  private:
60   // A handler class that helps manage the lifecycle of UTransliterator. It's
61   // used in IcuNormalizer to transform terms into the formats we need.
62   class TermTransformer {
63    public:
64     // Creates TermTransformer with a valid UTransliterator instance
65     //
66     // Returns:
67     //   A term transformer on success
68     //   INTERNAL_ERROR if failed to create any subcomponent
69     static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>>
70     Create();
71 
72     // Closes the UTransliterator instance
73     ~TermTransformer();
74 
75     // Transforms the text based on our rules described at top of this file
76     std::string Transform(std::string_view term) const;
77 
78    private:
79     explicit TermTransformer(UTransliterator* u_transliterator);
80 
81     // An ICU class to execute custom term transformation / normalization rules.
82     // utrans_close() must by called after using.
83     UTransliterator* u_transliterator_;
84   };
85 
86   explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer,
87                          int max_term_byte_size);
88 
89   // Helper method to normalize Latin terms only. Rules applied:
90   // 1. Uppercase to lowercase
91   // 2. Remove diacritic (accent) marks
92   std::string NormalizeLatin(const UNormalizer2* normalizer2,
93                              std::string_view term) const;
94 
95   // Used to transform terms into their normalized forms.
96   std::unique_ptr<TermTransformer> term_transformer_;
97 
98   // The maximum term length allowed after normalization.
99   int max_term_byte_size_;
100 };
101 
102 }  // namespace lib
103 }  // namespace icing
104 
105 #endif  // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
106