1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/i18n-utils.h"
16 
17 #include <cctype>
18 #include <string_view>
19 
20 #include "icing/text_classifier/lib3/utils/base/statusor.h"
21 #include "icing/absl_ports/canonical_errors.h"
22 #include "icing/absl_ports/str_cat.h"
23 #include "icing/util/logging.h"
24 #include "unicode/uchar.h"
25 #include "unicode/umachine.h"
26 #include "unicode/ustring.h"
27 #include "unicode/utf16.h"
28 #include "unicode/utf8.h"
29 #include "unicode/utypes.h"
30 
31 namespace icing {
32 namespace lib {
33 namespace i18n_utils {
34 
namespace {

// All ASCII punctuation that's also in a Unicode Punctuation category
// (https://www.fileformat.info/info/unicode/category/index.htm). The set of
// characters that are regarded as punctuation is not the same for std::ispunct
// and u_ispunct.
//
// Declared as a constexpr std::string_view so that the table is a
// compile-time constant: no dynamic initialization at startup and no
// non-trivial destructor at shutdown. Lookups via find() compare against
// npos exactly as they would on a std::string.
constexpr std::string_view ascii_icu_punctuation =
    "!\"#%&'*,./:;?@\\_-([{}])";

}  // namespace
44 
Utf16ToUtf8(const std::u16string & utf16_string)45 libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
46     const std::u16string& utf16_string) {
47   std::string utf8_string;
48   // Allocates the maximum possible UTF8 string length:
49   // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL.
50   //
51   // NOTE: we need to call resize() but not reserve() because values can't be
52   // set at positions after length().
53   utf8_string.resize(utf16_string.length() * 3 + 1);
54 
55   int result_length = 0;
56   UErrorCode status = U_ZERO_ERROR;
57   u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length,
58               utf16_string.data(), utf16_string.length(), &status);
59   // Corrects the length
60   utf8_string.resize(result_length);
61 
62   if (U_FAILURE(status)) {
63     return absl_ports::InternalError("Failed to convert UTF16 string to UTF8");
64   }
65   return utf8_string;
66 }
67 
Utf8ToUtf16(std::string_view utf8_string)68 libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
69     std::string_view utf8_string) {
70   std::u16string utf16_result;
71   // The UTF16 string won't be longer than its UTF8 format
72   //
73   // NOTE: we need to call resize() but not reserve() because values can't be
74   // set at positions after length().
75   utf16_result.resize(utf8_string.length());
76 
77   int result_length = 0;
78   UErrorCode status = U_ZERO_ERROR;
79   u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length,
80                 utf8_string.data(), utf8_string.length(), &status);
81   // Corrects the length
82   utf16_result.resize(result_length);
83 
84   if (U_FAILURE(status)) {
85     return absl_ports::InternalError(absl_ports::StrCat(
86         "Failed to convert UTF8 string '", utf8_string, "' to UTF16"));
87   }
88   return utf16_result;
89 }
90 
GetUChar32At(const char * data,int length,int position)91 UChar32 GetUChar32At(const char* data, int length, int position) {
92   UChar32 uchar32;
93   U8_NEXT_OR_FFFD(data, position, length, uchar32);
94   return uchar32;
95 }
96 
SafeTruncateUtf8(std::string * str,int truncate_to_length)97 void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
98   if (str == nullptr || truncate_to_length >= str->length()) {
99     return;
100   }
101 
102   str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length));
103 }
104 
SafeTruncateUtf8Length(const char * str,int desired_length)105 int SafeTruncateUtf8Length(const char* str, int desired_length) {
106   while (desired_length > 0) {
107     if (IsLeadUtf8Byte(str[desired_length])) {
108       break;
109     }
110     --desired_length;
111   }
112   return desired_length;
113 }
114 
IsAscii(char c)115 bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
116 
IsAscii(UChar32 c)117 bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
118 
GetUtf8Length(UChar32 c)119 int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
120 
GetUtf16Length(UChar32 c)121 int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
122 
IsLeadUtf8Byte(char c)123 bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); }
124 
IsPunctuationAt(std::string_view input,int position,int * char_len_out)125 bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
126   if (IsAscii(input[position])) {
127     if (char_len_out != nullptr) {
128       *char_len_out = 1;
129     }
130     return ascii_icu_punctuation.find(input[position]) != std::string::npos;
131   }
132   UChar32 c = GetUChar32At(input.data(), input.length(), position);
133   if (char_len_out != nullptr) {
134     *char_len_out = U8_LENGTH(c);
135   }
136   return u_ispunct(c);
137 }
138 
IsWhitespaceAt(std::string_view input,int position)139 bool IsWhitespaceAt(std::string_view input, int position) {
140   if (IsAscii(input[position])) {
141     return std::isspace(input[position]);
142   }
143   UChar32 c = GetUChar32At(input.data(), input.length(), position);
144   return u_isUWhiteSpace(c);
145 }
146 
IsAlphabeticAt(std::string_view input,int position)147 bool IsAlphabeticAt(std::string_view input, int position) {
148   if (IsAscii(input[position])) {
149     return std::isalpha(input[position]);
150   }
151   UChar32 c = GetUChar32At(input.data(), input.length(), position);
152   return u_isUAlphabetic(c);
153 }
154 
AppendUchar32ToUtf8(std::string * utf8_string,UChar32 uchar)155 void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) {
156   uint8_t utf8_buffer[4];  // U8_APPEND writes 0 to 4 bytes
157 
158   int utf8_index = 0;
159   UBool has_error = false;
160 
161   // utf8_index is advanced to the end of the contents if successful
162   U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);
163 
164   if (has_error) {
165     ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string.";
166     return;
167   }
168   utf8_string->append(reinterpret_cast<char*>(utf8_buffer), utf8_index);
169 }
170 
171 }  // namespace i18n_utils
172 }  // namespace lib
173 }  // namespace icing
174