1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_UTIL_I18N_UTILS_H_
16 #define ICING_UTIL_I18N_UTILS_H_
17 
18 #include <string>
19 #include <string_view>
20 
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 #include "unicode/umachine.h"
23 
24 namespace icing {
25 namespace lib {
26 
27 // Internationalization utils that use standard utilities or custom code. Does
28 // not require any special dependencies, such as data files for ICU.
29 namespace i18n_utils {
30 
31 // U+FFFD (Unicode REPLACEMENT CHARACTER), used as the invalid UChar32 value.
32 static constexpr UChar32 kInvalidUChar32 = 0xFFFD;
33 
34 // Converts the UTF16-encoded utf16_string to a UTF8-encoded string.
35 //
36 // Returns:
37 //   A UTF8 string on success
38 //   INTERNAL_ERROR on any failure
39 libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
40     const std::u16string& utf16_string);
41 
42 // Converts the UTF8-encoded utf8_string to a UTF16-encoded string.
43 //
44 // Returns:
45 //   A UTF16 string on success
46 //   INTERNAL_ERROR on any failure
47 libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
48     std::string_view utf8_string);
49 
50 // Returns the character at the given position of the buffer `data`.
// NOTE(review): presumably `data` is UTF-8 text of `length` bytes and
// `position` is a byte offset into it; the result for an invalid or
// out-of-range position is not stated in this header — confirm against the
// implementation.
51 UChar32 GetUChar32At(const char* data, int length, int position);
52 
53 // Returns the safe position at which to truncate a UTF8 string so that
54 // multi-byte UTF8 characters are not cut in the middle. The returned value
55 // always satisfies 0 <= val <= desired_length.
56 //
57 // REQUIRES: 0 <= desired_length < strlen(str)
58 int SafeTruncateUtf8Length(const char* str, int desired_length);
59 
60 // Safely truncates a UTF8 string in place, so that multi-byte UTF8 characters
61 // are not cut in the middle. *str is modified directly; nothing is returned.
62 void SafeTruncateUtf8(std::string* str, int truncate_to_length);
63 
64 // Checks if the single char is within the ASCII range.
65 bool IsAscii(char c);
66 
67 // Checks if the Unicode char is within the ASCII range.
68 bool IsAscii(UChar32 c);
69 
70 // Returns how many code units (char) are used for the UTF-8 encoding of this
71 // Unicode character. Returns 0 if the character is not valid.
72 int GetUtf8Length(UChar32 c);
73 
74 // Returns how many code units (char16_t) are used for the UTF-16 encoding of
75 // this Unicode character. Returns 0 if the character is not valid.
76 int GetUtf16Length(UChar32 c);
77 
78 // Checks if the single char is the first byte of a UTF8 character. Note
79 // that a single ASCII char is also considered a lead byte.
80 bool IsLeadUtf8Byte(char c);
81 
82 // Checks if the character at position is punctuation. If it is valid
83 // punctuation and char_len_out is not null, the length of that character is
84 // assigned to *char_len_out. char_len_out defaults to nullptr.
85 bool IsPunctuationAt(std::string_view input, int position,
86                      int* char_len_out = nullptr);
87 
88 // Checks if the character at position is whitespace.
89 bool IsWhitespaceAt(std::string_view input, int position);
90 
91 // Checks if the character at position is alphabetic.
92 bool IsAlphabeticAt(std::string_view input, int position);
93 
// Appends uchar, encoded as UTF-8, to *utf8_string.
// NOTE(review): inferred from the name and signature; handling of an invalid
// uchar or a null utf8_string is not visible from this header — confirm
// against the implementation.
94 void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar);
95 
96 }  // namespace i18n_utils
97 }  // namespace lib
98 }  // namespace icing
99 
100 #endif  // ICING_UTIL_I18N_UTILS_H_
101