1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/transform/icu/icu-normalizer.h"
16
17 #include <cctype>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/absl_ports/str_cat.h"
26 #include "icing/transform/normalizer.h"
27 #include "icing/util/i18n-utils.h"
28 #include "icing/util/logging.h"
29 #include "icing/util/status-macros.h"
30 #include "unicode/umachine.h"
31 #include "unicode/unorm2.h"
32 #include "unicode/utrans.h"
33
34 namespace icing {
35 namespace lib {
36
37 namespace {
38
// Compound transliterator id, passed as the `id` argument of utrans_openU()
// below, that tells UTransliterator how to transform terms. The individual
// rules are separated by "; " and are listed in the order they are meant to be
// applied. Besides our own rules, the basic normalization forms NFD (canonical
// decomposition) and NFKC (compatibility decomposition followed by canonical
// composition) are applied. More information at
// http://www.unicode.org/reports/tr15/
//
// Please note that the following rules don't support small hiragana to katakana
// transformation.
constexpr UChar kTransformRulesUtf16[] =
    u"Lower; "                      // Lowercase
    "Latin-ASCII; "                 // Map Latin characters to ASCII characters
    "Hiragana-Katakana; "           // Map hiragana to katakana
    "[:Latin:] NFD; "               // Canonically decompose Latin letters
    "[:Nonspacing Mark:] Remove; "  // Remove accent / diacritic marks
    "NFKC";  // Compatibility-decompose, then canonically compose, everything

// Number of UTF-16 code units in the transform rules, excluding the
// terminating NUL that the string literal adds to the array.
constexpr int kTransformRulesLength =
    sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
58
59 // Transforms a Unicode character with diacritics to its counterpart in ASCII
60 // range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
61 // the transformation is successful.
62 //
63 // NOTE: According to our convention this function should have returned
64 // StatusOr<char>. However, this function is performance-sensitive because is
65 // could be called on every Latin character in normalization, so we make it
66 // return a bool here to save a bit more time and memory.
DiacriticCharToAscii(const UNormalizer2 * normalizer2,UChar32 uchar32_in,char * char_out)67 bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
68 char* char_out) {
69 if (i18n_utils::IsAscii(uchar32_in)) {
70 // The Unicode character is within ASCII range
71 if (char_out != nullptr) {
72 *char_out = uchar32_in;
73 }
74 return true;
75 }
76
77 // Maximum number of pieces a Unicode character can be decomposed into.
78 // TODO(tjbarron) figure out if this number is proper.
79 constexpr int kDecompositionBufferCapacity = 5;
80
81 // A buffer used to store Unicode decomposition mappings of only one
82 // character.
83 UChar decomposition_buffer[kDecompositionBufferCapacity];
84
85 // Decomposes the Unicode character, trying to get an ASCII char and some
86 // diacritic chars.
87 UErrorCode status = U_ZERO_ERROR;
88 if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
89 kDecompositionBufferCapacity, &status) > 0 &&
90 !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
91 if (char_out != nullptr) {
92 *char_out = decomposition_buffer[0];
93 }
94 return true;
95 }
96 return false;
97 }
98
99 } // namespace
100
101 // Creates a IcuNormalizer with a valid TermTransformer instance.
102 //
103 // Note: UTokenizer2 is also an option to normalize Unicode strings, but since
104 // we need some custom transform rules other than NFC/NFKC we have to use
105 // TermTransformer as a custom transform rule executor.
106 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>>
Create(int max_term_byte_size)107 IcuNormalizer::Create(int max_term_byte_size) {
108 ICING_ASSIGN_OR_RETURN(
109 std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
110 IcuNormalizer::TermTransformer::Create());
111
112 return std::unique_ptr<IcuNormalizer>(
113 new IcuNormalizer(std::move(term_transformer), max_term_byte_size));
114 }
115
// Constructs a normalizer that takes ownership of `term_transformer` and
// remembers `max_term_byte_size`, the byte limit NormalizeTerm() applies to
// its output. Instances in this file are created through
// IcuNormalizer::Create().
IcuNormalizer::IcuNormalizer(
    std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
    int max_term_byte_size)
    : term_transformer_(std::move(term_transformer)),
      max_term_byte_size_(max_term_byte_size) {}
121
NormalizeTerm(const std::string_view term) const122 std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
123 std::string normalized_text;
124
125 if (term.empty()) {
126 return normalized_text;
127 }
128
129 UErrorCode status = U_ZERO_ERROR;
130 // ICU manages the singleton instance
131 const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
132 if (U_FAILURE(status)) {
133 ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
134 }
135
136 // Checks if the first character is within ASCII range or can be transformed
137 // into an ASCII char. Since the term is tokenized, we know that the whole
138 // term can be transformed into ASCII if the first character can.
139 UChar32 first_uchar32 =
140 i18n_utils::GetUChar32At(term.data(), term.length(), 0);
141 if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
142 DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
143 // This is a faster method to normalize Latin terms.
144 normalized_text = NormalizeLatin(normalizer2, term);
145 } else {
146 normalized_text = term_transformer_->Transform(term);
147 }
148
149 if (normalized_text.length() > max_term_byte_size_) {
150 i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
151 }
152
153 return normalized_text;
154 }
155
NormalizeLatin(const UNormalizer2 * normalizer2,const std::string_view term) const156 std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
157 const std::string_view term) const {
158 std::string result;
159 result.reserve(term.length());
160 for (int i = 0; i < term.length(); i++) {
161 if (i18n_utils::IsAscii(term[i])) {
162 result.push_back(std::tolower(term[i]));
163 } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
164 UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
165 if (uchar32 == i18n_utils::kInvalidUChar32) {
166 ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
167 << " at position" << i;
168 continue;
169 }
170 char ascii_char;
171 if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
172 result.push_back(std::tolower(ascii_char));
173 } else {
174 // We don't know how to transform / decompose this Unicode character, it
175 // probably means that some other Unicode characters are mixed with
176 // Latin characters. This shouldn't happen if input term is properly
177 // tokenized. We handle it here in case there're something wrong with
178 // the tokenizers.
179 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
180 absl_ports::StrAppend(&result, term.substr(i, utf8_length));
181 }
182 }
183 }
184
185 return result;
186 }
187
188 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer::TermTransformer>>
Create()189 IcuNormalizer::TermTransformer::Create() {
190 UErrorCode status = U_ZERO_ERROR;
191 UTransliterator* term_transformer = utrans_openU(
192 kTransformRulesUtf16, kTransformRulesLength, UTRANS_FORWARD,
193 /*rules=*/nullptr, /*rulesLength=*/0, /*parseError=*/nullptr, &status);
194
195 if (U_FAILURE(status)) {
196 return absl_ports::InternalError("Failed to create UTransliterator.");
197 }
198
199 return std::unique_ptr<IcuNormalizer::TermTransformer>(
200 new IcuNormalizer::TermTransformer(term_transformer));
201 }
202
// Wraps an ICU transliterator handle. The handle is stored as-is (it may be
// nullptr) and is closed with utrans_close() by the destructor, so the new
// object becomes responsible for releasing it.
IcuNormalizer::TermTransformer::TermTransformer(
    UTransliterator* u_transliterator)
    : u_transliterator_(u_transliterator) {}
206
~TermTransformer()207 IcuNormalizer::TermTransformer::~TermTransformer() {
208 if (u_transliterator_ != nullptr) {
209 utrans_close(u_transliterator_);
210 }
211 }
212
Transform(const std::string_view term) const213 std::string IcuNormalizer::TermTransformer::Transform(
214 const std::string_view term) const {
215 auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
216 if (!utf16_term_or.ok()) {
217 ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16";
218 return std::string(term);
219 }
220 std::u16string utf16_term = std::move(utf16_term_or).ValueOrDie();
221 UErrorCode status = U_ZERO_ERROR;
222 int utf16_term_desired_length = utf16_term.length();
223 int limit = utf16_term.length();
224 utrans_transUChars(u_transliterator_, &utf16_term[0],
225 &utf16_term_desired_length, utf16_term.length(),
226 /*start=*/0, &limit, &status);
227
228 // For most cases, one Unicode character is normalized to exact one Unicode
229 // character according to our transformation rules. However, there could be
230 // some rare cases where the normalized text is longer than the original
231 // one. E.g. "¼" (1 character) -> "1/4" (3 characters). That causes a buffer
232 // overflow error and we need to increase our buffer size and try again.
233 if (status == U_BUFFER_OVERFLOW_ERROR) {
234 // 'utf16_term_desired_length' has already been set to the desired value
235 // by utrans_transUChars(), here we increase the buffer size to that
236 // value.
237 //
238 // NOTE: we need to call resize() but not reserve() because values can't
239 // be set at positions after length().
240 int original_content_length = utf16_term.length();
241 utf16_term.resize(utf16_term_desired_length);
242 utf16_term_desired_length = original_content_length;
243 limit = original_content_length;
244 status = U_ZERO_ERROR;
245 utrans_transUChars(u_transliterator_, &utf16_term[0],
246 &utf16_term_desired_length, utf16_term.length(),
247 /*start=*/0, &limit, &status);
248 }
249
250 if (U_FAILURE(status)) {
251 // Failed to transform, return its original form.
252 ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
253 return std::string(term);
254 }
255
256 auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term);
257 if (!utf8_term_or.ok()) {
258 ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8";
259 return std::string(term);
260 }
261 return std::move(utf8_term_or).ValueOrDie();
262 }
263
264 } // namespace lib
265 } // namespace icing
266