1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/transform/icu/icu-normalizer.h"
16 
17 #include <cctype>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/absl_ports/str_cat.h"
26 #include "icing/transform/normalizer.h"
27 #include "icing/util/i18n-utils.h"
28 #include "icing/util/logging.h"
29 #include "icing/util/status-macros.h"
30 #include "unicode/umachine.h"
31 #include "unicode/unorm2.h"
32 #include "unicode/utrans.h"
33 
34 namespace icing {
35 namespace lib {
36 
37 namespace {
38 
39 // The following is the compound id used to tell UTransliterator how to
40 // transform terms. The basic normalization forms NFD (canonical normalization
41 // form decomposition) and NFKC (compatible normalization form composition)
42 // are applied as well as some other rules we need. More information at
43 // http://www.unicode.org/reports/tr15/
44 //
45 // Please note that the following rules don't support small hiragana to katakana
46 // transformation.
47 constexpr UChar kTransformRulesUtf16[] =
48     u"Lower; "                      // Lowercase
49     "Latin-ASCII; "                 // Map Latin characters to ASCII characters
50     "Hiragana-Katakana; "           // Map hiragana to katakana
51     "[:Latin:] NFD; "               // Decompose Latin letters
52     "[:Nonspacing Mark:] Remove; "  // Remove accent / diacritic marks
53     "NFKC";                         // Decompose and compose everything
54 
55 // Length of the transform rules excluding the terminating NULL.
56 constexpr int kTransformRulesLength =
57     sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
58 
59 // Transforms a Unicode character with diacritics to its counterpart in ASCII
60 // range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
61 // the transformation is successful.
62 //
63 // NOTE: According to our convention this function should have returned
64 // StatusOr<char>. However, this function is performance-sensitive because is
65 // could be called on every Latin character in normalization, so we make it
66 // return a bool here to save a bit more time and memory.
DiacriticCharToAscii(const UNormalizer2 * normalizer2,UChar32 uchar32_in,char * char_out)67 bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
68                           char* char_out) {
69   if (i18n_utils::IsAscii(uchar32_in)) {
70     // The Unicode character is within ASCII range
71     if (char_out != nullptr) {
72       *char_out = uchar32_in;
73     }
74     return true;
75   }
76 
77   // Maximum number of pieces a Unicode character can be decomposed into.
78   // TODO(tjbarron) figure out if this number is proper.
79   constexpr int kDecompositionBufferCapacity = 5;
80 
81   // A buffer used to store Unicode decomposition mappings of only one
82   // character.
83   UChar decomposition_buffer[kDecompositionBufferCapacity];
84 
85   // Decomposes the Unicode character, trying to get an ASCII char and some
86   // diacritic chars.
87   UErrorCode status = U_ZERO_ERROR;
88   if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
89                               kDecompositionBufferCapacity, &status) > 0 &&
90       !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
91     if (char_out != nullptr) {
92       *char_out = decomposition_buffer[0];
93     }
94     return true;
95   }
96   return false;
97 }
98 
99 }  // namespace
100 
101 // Creates a IcuNormalizer with a valid TermTransformer instance.
102 //
103 // Note: UTokenizer2 is also an option to normalize Unicode strings, but since
104 // we need some custom transform rules other than NFC/NFKC we have to use
105 // TermTransformer as a custom transform rule executor.
106 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>>
Create(int max_term_byte_size)107 IcuNormalizer::Create(int max_term_byte_size) {
108   ICING_ASSIGN_OR_RETURN(
109       std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
110       IcuNormalizer::TermTransformer::Create());
111 
112   return std::unique_ptr<IcuNormalizer>(
113       new IcuNormalizer(std::move(term_transformer), max_term_byte_size));
114 }
115 
IcuNormalizer(std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,int max_term_byte_size)116 IcuNormalizer::IcuNormalizer(
117     std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
118     int max_term_byte_size)
119     : term_transformer_(std::move(term_transformer)),
120       max_term_byte_size_(max_term_byte_size) {}
121 
NormalizeTerm(const std::string_view term) const122 std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
123   std::string normalized_text;
124 
125   if (term.empty()) {
126     return normalized_text;
127   }
128 
129   UErrorCode status = U_ZERO_ERROR;
130   // ICU manages the singleton instance
131   const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
132   if (U_FAILURE(status)) {
133     ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
134   }
135 
136   // Checks if the first character is within ASCII range or can be transformed
137   // into an ASCII char. Since the term is tokenized, we know that the whole
138   // term can be transformed into ASCII if the first character can.
139   UChar32 first_uchar32 =
140       i18n_utils::GetUChar32At(term.data(), term.length(), 0);
141   if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
142       DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
143     // This is a faster method to normalize Latin terms.
144     normalized_text = NormalizeLatin(normalizer2, term);
145   } else {
146     normalized_text = term_transformer_->Transform(term);
147   }
148 
149   if (normalized_text.length() > max_term_byte_size_) {
150     i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
151   }
152 
153   return normalized_text;
154 }
155 
NormalizeLatin(const UNormalizer2 * normalizer2,const std::string_view term) const156 std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
157                                           const std::string_view term) const {
158   std::string result;
159   result.reserve(term.length());
160   for (int i = 0; i < term.length(); i++) {
161     if (i18n_utils::IsAscii(term[i])) {
162       result.push_back(std::tolower(term[i]));
163     } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
164       UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
165       if (uchar32 == i18n_utils::kInvalidUChar32) {
166         ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
167                            << " at position" << i;
168         continue;
169       }
170       char ascii_char;
171       if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
172         result.push_back(std::tolower(ascii_char));
173       } else {
174         // We don't know how to transform / decompose this Unicode character, it
175         // probably means that some other Unicode characters are mixed with
176         // Latin characters. This shouldn't happen if input term is properly
177         // tokenized. We handle it here in case there're something wrong with
178         // the tokenizers.
179         int utf8_length = i18n_utils::GetUtf8Length(uchar32);
180         absl_ports::StrAppend(&result, term.substr(i, utf8_length));
181       }
182     }
183   }
184 
185   return result;
186 }
187 
188 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer::TermTransformer>>
Create()189 IcuNormalizer::TermTransformer::Create() {
190   UErrorCode status = U_ZERO_ERROR;
191   UTransliterator* term_transformer = utrans_openU(
192       kTransformRulesUtf16, kTransformRulesLength, UTRANS_FORWARD,
193       /*rules=*/nullptr, /*rulesLength=*/0, /*parseError=*/nullptr, &status);
194 
195   if (U_FAILURE(status)) {
196     return absl_ports::InternalError("Failed to create UTransliterator.");
197   }
198 
199   return std::unique_ptr<IcuNormalizer::TermTransformer>(
200       new IcuNormalizer::TermTransformer(term_transformer));
201 }
202 
TermTransformer(UTransliterator * u_transliterator)203 IcuNormalizer::TermTransformer::TermTransformer(
204     UTransliterator* u_transliterator)
205     : u_transliterator_(u_transliterator) {}
206 
~TermTransformer()207 IcuNormalizer::TermTransformer::~TermTransformer() {
208   if (u_transliterator_ != nullptr) {
209     utrans_close(u_transliterator_);
210   }
211 }
212 
Transform(const std::string_view term) const213 std::string IcuNormalizer::TermTransformer::Transform(
214     const std::string_view term) const {
215   auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
216   if (!utf16_term_or.ok()) {
217     ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16";
218     return std::string(term);
219   }
220   std::u16string utf16_term = std::move(utf16_term_or).ValueOrDie();
221   UErrorCode status = U_ZERO_ERROR;
222   int utf16_term_desired_length = utf16_term.length();
223   int limit = utf16_term.length();
224   utrans_transUChars(u_transliterator_, &utf16_term[0],
225                      &utf16_term_desired_length, utf16_term.length(),
226                      /*start=*/0, &limit, &status);
227 
228   // For most cases, one Unicode character is normalized to exact one Unicode
229   // character according to our transformation rules. However, there could be
230   // some rare cases where the normalized text is longer than the original
231   // one. E.g. "¼" (1 character) -> "1/4" (3 characters). That causes a buffer
232   // overflow error and we need to increase our buffer size and try again.
233   if (status == U_BUFFER_OVERFLOW_ERROR) {
234     // 'utf16_term_desired_length' has already been set to the desired value
235     // by utrans_transUChars(), here we increase the buffer size to that
236     // value.
237     //
238     // NOTE: we need to call resize() but not reserve() because values can't
239     // be set at positions after length().
240     int original_content_length = utf16_term.length();
241     utf16_term.resize(utf16_term_desired_length);
242     utf16_term_desired_length = original_content_length;
243     limit = original_content_length;
244     status = U_ZERO_ERROR;
245     utrans_transUChars(u_transliterator_, &utf16_term[0],
246                        &utf16_term_desired_length, utf16_term.length(),
247                        /*start=*/0, &limit, &status);
248   }
249 
250   if (U_FAILURE(status)) {
251     // Failed to transform, return its original form.
252     ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
253     return std::string(term);
254   }
255 
256   auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term);
257   if (!utf8_term_or.ok()) {
258     ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8";
259     return std::string(term);
260   }
261   return std::move(utf8_term_or).ValueOrDie();
262 }
263 
264 }  // namespace lib
265 }  // namespace icing
266