1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
18 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
19 
20 #include "utils/base/integral_types.h"
21 #include "utils/utf8/unicodetext.h"
22 #include "utils/utf8/unilib-common.h"
23 
// Backend selection: the first of TC3_UNILIB_ICU, TC3_UNILIB_JAVAICU and
// TC3_UNILIB_APPLE that is defined decides which implementation header is
// included (presumably the one providing UniLibBase -- confirm in those
// headers). If none is defined, compilation stops at the #error below.
//
// INIT_UNILIB_FOR_TESTING(VAR) expands to VAR followed by the constructor
// argument list used with the selected backend: empty for ICU and Apple, a
// single nullptr for JavaICU.
#if defined TC3_UNILIB_ICU
#include "utils/utf8/unilib-icu.h"
#define INIT_UNILIB_FOR_TESTING(VAR) VAR()
#elif defined TC3_UNILIB_JAVAICU
#include "utils/utf8/unilib-javaicu.h"
#define INIT_UNILIB_FOR_TESTING(VAR) VAR(nullptr)
#elif defined TC3_UNILIB_APPLE
#include "utils/utf8/unilib-apple.h"
#define INIT_UNILIB_FOR_TESTING(VAR) VAR()
#else
#error No TC3_UNILIB implementation specified.
#endif
36 
37 namespace libtextclassifier3 {
38 
39 class UniLib : public UniLibBase {
40  public:
41   using UniLibBase::UniLibBase;
42 
43   // Lowercase a unicode string.
ToLowerText(const UnicodeText & text)44   UnicodeText ToLowerText(const UnicodeText& text) const {
45     UnicodeText result;
46     for (const char32 codepoint : text) {
47       result.push_back(ToLower(codepoint));
48     }
49     return result;
50   }
51 
52   // Uppercase a unicode string.
ToUpperText(const UnicodeText & text)53   UnicodeText ToUpperText(const UnicodeText& text) const {
54     UnicodeText result;
55     for (const char32 codepoint : text) {
56       result.push_back(UniLibBase::ToUpper(codepoint));
57     }
58     return result;
59   }
60 
IsLowerText(const UnicodeText & text)61   bool IsLowerText(const UnicodeText& text) const {
62     for (const char32 codepoint : text) {
63       if (!IsLower(codepoint)) {
64         return false;
65       }
66     }
67     return true;
68   }
69 
IsUpperText(const UnicodeText & text)70   bool IsUpperText(const UnicodeText& text) const {
71     for (const char32 codepoint : text) {
72       if (!IsUpper(codepoint)) {
73         return false;
74       }
75     }
76     return true;
77   }
78 
IsDigits(const UnicodeText & text)79   bool IsDigits(const UnicodeText& text) const {
80     for (const char32 codepoint : text) {
81       if (!IsDigit(codepoint)) {
82         return false;
83       }
84     }
85     return true;
86   }
87 
IsPercentage(char32 codepoint)88   bool IsPercentage(char32 codepoint) const {
89     return libtextclassifier3::IsPercentage(codepoint);
90   }
91 
IsSlash(char32 codepoint)92   bool IsSlash(char32 codepoint) const {
93     return libtextclassifier3::IsSlash(codepoint);
94   }
95 
IsMinus(char32 codepoint)96   bool IsMinus(char32 codepoint) const {
97     return libtextclassifier3::IsMinus(codepoint);
98   }
99 
IsNumberSign(char32 codepoint)100   bool IsNumberSign(char32 codepoint) const {
101     return libtextclassifier3::IsNumberSign(codepoint);
102   }
103 
IsDot(char32 codepoint)104   bool IsDot(char32 codepoint) const {
105     return libtextclassifier3::IsDot(codepoint);
106   }
107 
IsApostrophe(char32 codepoint)108   bool IsApostrophe(char32 codepoint) const {
109     return libtextclassifier3::IsApostrophe(codepoint);
110   }
111 
IsQuotation(char32 codepoint)112   bool IsQuotation(char32 codepoint) const {
113     return libtextclassifier3::IsQuotation(codepoint);
114   }
115 
IsAmpersand(char32 codepoint)116   bool IsAmpersand(char32 codepoint) const {
117     return libtextclassifier3::IsAmpersand(codepoint);
118   }
119 
IsLatinLetter(char32 codepoint)120   bool IsLatinLetter(char32 codepoint) const {
121     return libtextclassifier3::IsLatinLetter(codepoint);
122   }
123 
IsArabicLetter(char32 codepoint)124   bool IsArabicLetter(char32 codepoint) const {
125     return libtextclassifier3::IsArabicLetter(codepoint);
126   }
127 
IsCyrillicLetter(char32 codepoint)128   bool IsCyrillicLetter(char32 codepoint) const {
129     return libtextclassifier3::IsCyrillicLetter(codepoint);
130   }
131 
IsChineseLetter(char32 codepoint)132   bool IsChineseLetter(char32 codepoint) const {
133     return libtextclassifier3::IsChineseLetter(codepoint);
134   }
135 
IsJapaneseLetter(char32 codepoint)136   bool IsJapaneseLetter(char32 codepoint) const {
137     return libtextclassifier3::IsJapaneseLetter(codepoint);
138   }
139 
IsKoreanLetter(char32 codepoint)140   bool IsKoreanLetter(char32 codepoint) const {
141     return libtextclassifier3::IsKoreanLetter(codepoint);
142   }
143 
IsThaiLetter(char32 codepoint)144   bool IsThaiLetter(char32 codepoint) const {
145     return libtextclassifier3::IsThaiLetter(codepoint);
146   }
147 
IsCJTletter(char32 codepoint)148   bool IsCJTletter(char32 codepoint) const {
149     return libtextclassifier3::IsCJTletter(codepoint);
150   }
151 
IsLetter(char32 codepoint)152   bool IsLetter(char32 codepoint) const {
153     return libtextclassifier3::IsLetter(codepoint);
154   }
155 
IsValidUtf8(const UnicodeText & text)156   bool IsValidUtf8(const UnicodeText& text) const {
157     // Basic check of structural validity of UTF8.
158     if (!text.is_valid()) {
159       return false;
160     }
161     // In addition to that, we declare that a valid UTF8 is when the number of
162     // codepoints in the string as measured by ICU is the same as the number of
163     // codepoints as measured by UnicodeText. Because if we don't do this check,
164     // the indices might differ, and cause trouble, because the assumption
165     // throughout the code is that ICU indices and UnicodeText indices are the
166     // same.
167     // NOTE: This is not perfect, as this doesn't check the alignment of the
168     // codepoints, but for the practical purposes should be enough.
169     const StatusOr<int32> icu_length = Length(text);
170     if (!icu_length.ok()) {
171       return false;
172     }
173 
174     if (icu_length.ValueOrDie() != text.size_codepoints()) {
175       return false;
176     }
177 
178     return true;
179   }
180 };
181 
182 }  // namespace libtextclassifier3
183 #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
184