1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unilib_test-include.h"
18
19 #include "utils/base/logging.h"
20 #include "gmock/gmock.h"
21
22 namespace libtextclassifier3 {
23 namespace test_internal {
24
25 using ::testing::ElementsAre;
26
TEST_F(UniLibTest, CharacterClassesAscii) {
  // Character-class predicates, script predicates and case mapping exercised
  // through character literals (the sibling Unicode test uses raw code points).
  //
  // Full-width / half-width forms and the compatibility ideograph are written
  // as \u escapes. Spelled as glyphs they are easily folded to their plain
  // look-alikes, which silently turns the check into a duplicate of the ASCII
  // line next to it.
  EXPECT_TRUE(unilib_->IsOpeningBracket('('));
  EXPECT_TRUE(unilib_->IsClosingBracket(')'));
  EXPECT_FALSE(unilib_->IsWhitespace(')'));
  EXPECT_TRUE(unilib_->IsWhitespace(' '));
  EXPECT_FALSE(unilib_->IsDigit(')'));
  EXPECT_TRUE(unilib_->IsDigit('0'));
  EXPECT_TRUE(unilib_->IsDigit('9'));
  EXPECT_FALSE(unilib_->IsUpper(')'));
  EXPECT_TRUE(unilib_->IsUpper('A'));
  EXPECT_TRUE(unilib_->IsUpper('Z'));
  EXPECT_FALSE(unilib_->IsLower(')'));
  EXPECT_TRUE(unilib_->IsLower('a'));
  EXPECT_TRUE(unilib_->IsLower('z'));
  EXPECT_TRUE(unilib_->IsPunctuation('!'));
  EXPECT_TRUE(unilib_->IsPunctuation('?'));
  EXPECT_TRUE(unilib_->IsPunctuation('#'));
  EXPECT_TRUE(unilib_->IsPunctuation('('));
  EXPECT_FALSE(unilib_->IsPunctuation('0'));
  EXPECT_FALSE(unilib_->IsPunctuation('$'));
  EXPECT_TRUE(unilib_->IsPercentage('%'));
  EXPECT_TRUE(unilib_->IsPercentage(u'\uFF05'));  // FULLWIDTH PERCENT SIGN
  EXPECT_TRUE(unilib_->IsSlash('/'));
  EXPECT_TRUE(unilib_->IsSlash(u'\uFF0F'));  // FULLWIDTH SOLIDUS
  EXPECT_TRUE(unilib_->IsMinus('-'));
  EXPECT_TRUE(unilib_->IsMinus(u'\uFF0D'));  // FULLWIDTH HYPHEN-MINUS
  EXPECT_TRUE(unilib_->IsNumberSign('#'));
  EXPECT_TRUE(unilib_->IsNumberSign(u'\uFF03'));  // FULLWIDTH NUMBER SIGN
  EXPECT_TRUE(unilib_->IsDot('.'));
  EXPECT_TRUE(unilib_->IsDot(u'\uFF0E'));  // FULLWIDTH FULL STOP
  EXPECT_TRUE(unilib_->IsApostrophe('\''));
  EXPECT_TRUE(unilib_->IsApostrophe(u'ߴ'));
  EXPECT_TRUE(unilib_->IsQuotation(u'"'));
  EXPECT_TRUE(unilib_->IsQuotation(u'”'));
  EXPECT_TRUE(unilib_->IsAmpersand(u'&'));
  EXPECT_TRUE(unilib_->IsAmpersand(u'﹠'));
  EXPECT_TRUE(unilib_->IsAmpersand(u'\uFF06'));  // FULLWIDTH AMPERSAND

  EXPECT_TRUE(unilib_->IsLatinLetter('A'));
  EXPECT_TRUE(unilib_->IsArabicLetter(u'ب'));  // ARABIC LETTER BEH
  EXPECT_TRUE(
      unilib_->IsCyrillicLetter(u'ᲀ'));  // CYRILLIC SMALL LETTER ROUNDED VE
  EXPECT_TRUE(
      unilib_->IsChineseLetter(u'\uF900'));  // CJK COMPATIBILITY IDEOGRAPH
  EXPECT_TRUE(unilib_->IsJapaneseLetter(u'ぁ'));  // HIRAGANA LETTER SMALL A
  EXPECT_TRUE(unilib_->IsKoreanLetter(u'ㄱ'));    // HANGUL LETTER KIYEOK
  EXPECT_TRUE(unilib_->IsThaiLetter(u'ก'));       // THAI CHARACTER KO KAI
  EXPECT_TRUE(unilib_->IsCJTletter(u'ก'));        // THAI CHARACTER KO KAI
  EXPECT_FALSE(unilib_->IsCJTletter('A'));

  EXPECT_TRUE(unilib_->IsLetter('A'));
  EXPECT_TRUE(
      unilib_->IsLetter(u'\uFF21'));  // FULLWIDTH LATIN CAPITAL LETTER A
  EXPECT_TRUE(unilib_->IsLetter(u'ト'));       // KATAKANA LETTER TO
  EXPECT_TRUE(unilib_->IsLetter(u'\uFF84'));  // HALFWIDTH KATAKANA LETTER TO
  EXPECT_TRUE(unilib_->IsLetter(u'\uF900'));  // CJK COMPATIBILITY IDEOGRAPH

  // Case mapping; characters without case must come back unchanged.
  EXPECT_EQ(unilib_->ToLower('A'), 'a');
  EXPECT_EQ(unilib_->ToLower('Z'), 'z');
  EXPECT_EQ(unilib_->ToLower(')'), ')');
  EXPECT_EQ(unilib_->ToLowerText(UTF8ToUnicodeText("Never gonna give you up."))
                .ToUTF8String(),
            "never gonna give you up.");
  EXPECT_EQ(unilib_->ToUpper('a'), 'A');
  EXPECT_EQ(unilib_->ToUpper('z'), 'Z');
  EXPECT_EQ(unilib_->ToUpper(')'), ')');
  EXPECT_EQ(unilib_->ToUpperText(UTF8ToUnicodeText("Never gonna let you down."))
                .ToUTF8String(),
            "NEVER GONNA LET YOU DOWN.");

  // A closing bracket maps to its opening counterpart.
  EXPECT_EQ(unilib_->GetPairedBracket(')'), '(');
  EXPECT_EQ(unilib_->GetPairedBracket('}'), '{');
}
97
TEST_F(UniLibTest, CharacterClassesUnicode) {
  // Character-class predicates, script predicates and case mapping, driven by
  // raw code point values rather than character literals.

  // Brackets and whitespace.
  EXPECT_TRUE(unilib_->IsOpeningBracket(0x0F3C));  // TIBET ANG KHANG GYON
  EXPECT_TRUE(unilib_->IsClosingBracket(0x0F3D));  // TIBET ANG KHANG GYAS
  EXPECT_FALSE(unilib_->IsWhitespace(0x23F0));     // ALARM CLOCK
  EXPECT_TRUE(unilib_->IsWhitespace(0x2003));      // EM SPACE

  // Digits: the code points on either side of the VAI digit range are not
  // digits.
  EXPECT_FALSE(unilib_->IsDigit(0xA619));  // VAI SYMBOL JONG
  EXPECT_TRUE(unilib_->IsDigit(0xA620));   // VAI DIGIT ZERO
  EXPECT_TRUE(unilib_->IsDigit(0xA629));   // VAI DIGIT NINE
  EXPECT_FALSE(unilib_->IsDigit(0xA62A));  // VAI SYLLABLE NDOLE MA

  // Upper case.
  EXPECT_FALSE(unilib_->IsUpper(0x0211));  // SMALL R WITH DOUBLE GRAVE
  EXPECT_TRUE(unilib_->IsUpper(0x0212));   // CAPITAL R WITH DOUBLE GRAVE
  EXPECT_TRUE(unilib_->IsUpper(0x0391));   // GREEK CAPITAL ALPHA
  EXPECT_TRUE(unilib_->IsUpper(0x03AB));   // GREEK CAPITAL UPSILON W DIAL
  EXPECT_FALSE(unilib_->IsUpper(0x03AC));  // GREEK SMALL ALPHA WITH TONOS

  // Lower case.
  EXPECT_TRUE(unilib_->IsLower(0x03AC));   // GREEK SMALL ALPHA WITH TONOS
  EXPECT_TRUE(unilib_->IsLower(0x03B1));   // GREEK SMALL ALPHA
  EXPECT_TRUE(unilib_->IsLower(0x03CB));   // GREEK SMALL UPSILON
  EXPECT_TRUE(unilib_->IsLower(0x0211));   // SMALL R WITH DOUBLE GRAVE
  EXPECT_TRUE(unilib_->IsLower(0x03C0));   // GREEK SMALL PI
  EXPECT_TRUE(unilib_->IsLower(0x007A));   // SMALL Z
  EXPECT_FALSE(unilib_->IsLower(0x005A));  // CAPITAL Z
  EXPECT_FALSE(unilib_->IsLower(0x0212));  // CAPITAL R WITH DOUBLE GRAVE
  EXPECT_FALSE(unilib_->IsLower(0x0391));  // GREEK CAPITAL ALPHA

  // Punctuation; currency signs are expected not to count.
  EXPECT_TRUE(unilib_->IsPunctuation(0x055E));   // ARMENIAN QUESTION MARK
  EXPECT_TRUE(unilib_->IsPunctuation(0x066C));   // ARABIC THOUSANDS SEPARATOR
  EXPECT_TRUE(unilib_->IsPunctuation(0x07F7));   // NKO SYMBOL GBAKURUNEN
  EXPECT_TRUE(unilib_->IsPunctuation(0x10AF2));  // DOUBLE DOT WITHIN DOT
  EXPECT_FALSE(unilib_->IsPunctuation(0x00A3));  // POUND SIGN
  EXPECT_FALSE(unilib_->IsPunctuation(0xA838));  // NORTH INDIC RUPEE MARK

  // Individual symbols, each in its ASCII and its full-width form.
  EXPECT_TRUE(unilib_->IsPercentage(0x0025));  // PERCENT SIGN
  EXPECT_TRUE(unilib_->IsPercentage(0xFF05));  // FULLWIDTH PERCENT SIGN
  EXPECT_TRUE(unilib_->IsSlash(0x002F));       // SOLIDUS
  EXPECT_TRUE(unilib_->IsSlash(0xFF0F));       // FULLWIDTH SOLIDUS
  EXPECT_TRUE(unilib_->IsMinus(0x002D));       // HYPHEN-MINUS
  EXPECT_TRUE(unilib_->IsMinus(0xFF0D));       // FULLWIDTH HYPHEN-MINUS
  EXPECT_TRUE(unilib_->IsNumberSign(0x0023));  // NUMBER SIGN
  EXPECT_TRUE(unilib_->IsNumberSign(0xFF03));  // FULLWIDTH NUMBER SIGN
  EXPECT_TRUE(unilib_->IsDot(0x002E));         // FULL STOP
  EXPECT_TRUE(unilib_->IsDot(0xFF0E));         // FULLWIDTH FULL STOP

  // Script membership.
  EXPECT_TRUE(unilib_->IsLatinLetter(0x0041));   // LATIN CAPITAL LETTER A
  EXPECT_TRUE(unilib_->IsArabicLetter(0x0628));  // ARABIC LETTER BEH
  EXPECT_TRUE(
      unilib_->IsCyrillicLetter(0x1C80));  // CYRILLIC SMALL LETTER ROUNDED VE
  EXPECT_TRUE(unilib_->IsChineseLetter(0xF900));  // CJK COMPATIBILITY IDEOGRAPH
  EXPECT_TRUE(unilib_->IsJapaneseLetter(0x3041));  // HIRAGANA LETTER SMALL A
  EXPECT_TRUE(unilib_->IsKoreanLetter(0x3131));    // HANGUL LETTER KIYEOK
  EXPECT_TRUE(unilib_->IsThaiLetter(0x0E01));      // THAI CHARACTER KO KAI
  EXPECT_TRUE(unilib_->IsCJTletter(0x0E01));       // THAI CHARACTER KO KAI
  EXPECT_FALSE(unilib_->IsCJTletter(0x0041));      // LATIN CAPITAL LETTER A

  // Generic letter predicate.
  EXPECT_TRUE(unilib_->IsLetter(0x0041));  // LATIN CAPITAL LETTER A
  EXPECT_TRUE(unilib_->IsLetter(0xFF21));  // FULLWIDTH LATIN CAPITAL LETTER A
  EXPECT_TRUE(unilib_->IsLetter(0x30C8));  // KATAKANA LETTER TO
  EXPECT_TRUE(unilib_->IsLetter(0xFF84));  // HALFWIDTH KATAKANA LETTER TO
  EXPECT_TRUE(unilib_->IsLetter(0xF900));  // CJK COMPATIBILITY IDEOGRAPH

  // The same Greek sentence feeds both text-level case conversions below.
  const UnicodeText greek_sentence =
      UTF8ToUnicodeText("Κανένας άνθρωπος δεν ξέρει");

  // Lower-casing.
  EXPECT_EQ(unilib_->ToLower(0x0391), 0x03B1);  // GREEK ALPHA
  EXPECT_EQ(unilib_->ToLower(0x03AB), 0x03CB);  // GREEK UPSILON WITH DIALYTIKA
  EXPECT_EQ(unilib_->ToLower(0x03C0), 0x03C0);  // GREEK SMALL PI
  EXPECT_EQ(unilib_->ToLower(0x03A3), 0x03C3);  // GREEK CAPITAL LETTER SIGMA
  EXPECT_EQ(unilib_->ToLowerText(greek_sentence).ToUTF8String(),
            "κανένας άνθρωπος δεν ξέρει");
  EXPECT_TRUE(unilib_->IsLowerText(UTF8ToUnicodeText("ξέρει")));

  // Upper-casing; both 0x03C3 and 0x03C2 are expected to map to 0x03A3.
  EXPECT_EQ(unilib_->ToUpper(0x03B1), 0x0391);  // GREEK ALPHA
  EXPECT_EQ(unilib_->ToUpper(0x03CB), 0x03AB);  // GREEK UPSILON WITH DIALYTIKA
  EXPECT_EQ(unilib_->ToUpper(0x0391), 0x0391);  // GREEK CAPITAL ALPHA
  EXPECT_EQ(unilib_->ToUpper(0x03C3), 0x03A3);  // GREEK CAPITAL LETTER SIGMA
  EXPECT_EQ(unilib_->ToUpper(0x03C2), 0x03A3);  // GREEK CAPITAL LETTER SIGMA
  EXPECT_EQ(unilib_->ToUpperText(greek_sentence).ToUTF8String(),
            "ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ");
  EXPECT_TRUE(unilib_->IsUpperText(UTF8ToUnicodeText("ΚΑΝΈΝΑΣ")));

  // Bracket pairing works in both directions.
  EXPECT_EQ(unilib_->GetPairedBracket(0x0F3C), 0x0F3D);
  EXPECT_EQ(unilib_->GetPairedBracket(0x0F3D), 0x0F3C);
}
177
TEST_F(UniLibTest, RegexInterface) {
  // Calls each RegexMatcher accessor once. The results are only logged, never
  // asserted.
  const UnicodeText digits_regex =
      UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true);
  const UnicodeText haystack =
      UTF8ToUnicodeText("hello 0123", /*do_copy=*/false);

  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib_->CreateRegexPattern(digits_regex);
  std::unique_ptr<UniLib::RegexMatcher> matcher = compiled->Matcher(haystack);

  int status;
  TC3_LOG(INFO) << matcher->Matches(&status);
  TC3_LOG(INFO) << matcher->Find(&status);
  TC3_LOG(INFO) << matcher->Start(0, &status);
  TC3_LOG(INFO) << matcher->End(0, &status);
  TC3_LOG(INFO) << matcher->Group(0, &status).size_codepoints();
}
192
TEST_F(UniLibTest, Regex) {
  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly.
  //
  // It is spelled as the escaped bytes F0 9F 98 8B so the test does not depend
  // on the encoding of this source file. The expected match span of codepoints
  // [8, 13) below only holds with two smileys after "hello" and one after the
  // digits.
  const UnicodeText regex_pattern =
      UTF8ToUnicodeText("[0-9]+\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib_->CreateRegexPattern(regex_pattern);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  // Whole-input match.
  matcher = pattern->Matcher(
      UTF8ToUnicodeText("0123\xF0\x9F\x98\x8B", /*do_copy=*/false));
  EXPECT_TRUE(matcher->Matches(&status));
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // The pattern occurs inside the input but does not cover all of it.
  matcher = pattern->Matcher(
      UTF8ToUnicodeText("hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B"
                        " 0123\xF0\x9F\x98\x8B world",
                        /*do_copy=*/false));
  EXPECT_FALSE(matcher->Matches(&status));
  EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Searching finds it; offsets are counted in codepoints, not bytes.
  matcher = pattern->Matcher(
      UTF8ToUnicodeText("hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B"
                        " 0123\xF0\x9F\x98\x8B world",
                        /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
229
TEST_F(UniLibTest, RegexLazy) {
  // A pattern obtained through CreateLazyRegexPattern must match exactly like
  // one obtained through CreateRegexPattern.
  std::unique_ptr<UniLib::RegexPattern> lazy_pattern =
      unilib_->CreateLazyRegexPattern(
          UTF8ToUnicodeText("[a-z][0-9]", /*do_copy=*/false));
  std::unique_ptr<UniLib::RegexMatcher> matcher;
  int status;

  // Letter followed by digit: matches, also when asked a second time.
  const UnicodeText letter_then_digit =
      UTF8ToUnicodeText("a3", /*do_copy=*/false);
  matcher = lazy_pattern->Matcher(letter_then_digit);
  EXPECT_TRUE(matcher->Matches(&status));
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Digit followed by letter: no match, and no error either.
  const UnicodeText digit_then_letter =
      UTF8ToUnicodeText("3a", /*do_copy=*/false);
  matcher = lazy_pattern->Matcher(digit_then_letter);
  EXPECT_FALSE(matcher->Matches(&status));
  EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
250
TEST_F(UniLibTest, RegexGroups) {
  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly.
  //
  // It is spelled as the escaped bytes F0 9F 98 8B so the test does not depend
  // on the encoding of this source file. The expected spans below (whole match
  // [8, 13), second group ending at 12) only hold when the smiley is part of
  // both the pattern and the input.
  const UnicodeText regex_pattern = UTF8ToUnicodeText(
      "([0-9])([0-9]+)\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib_->CreateRegexPattern(regex_pattern);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  matcher = pattern->Matcher(
      UTF8ToUnicodeText("hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B"
                        " 0123\xF0\x9F\x98\x8B world",
                        /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Start offsets (in codepoints) of the whole match and of each group.
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(1, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(2, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // End offsets; the trailing smiley belongs to group 0 only.
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(1, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(2, &status), 12);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Captured text.
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
285
TEST_F(UniLibTest, RegexGroupsNotAllGroupsInvolved) {
  // The second group is optional and does not take part in the match. It must
  // come back as empty text without an error.
  const UnicodeText digit_optional_letter =
      UTF8ToUnicodeText("([0-9])([a-z])?", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib_->CreateRegexPattern(digit_optional_letter);
  const UnicodeText lone_digit = UTF8ToUnicodeText("7", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexMatcher> matcher = compiled->Matcher(lone_digit);
  int status;

  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "7");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "7");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
304
TEST_F(UniLibTest, RegexGroupsEmptyResult) {
  // "(.*)" against empty input: the search succeeds, and both the whole match
  // and the capture group are empty.
  const UnicodeText match_anything =
      UTF8ToUnicodeText("(.*)", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib_->CreateRegexPattern(match_anything);
  const UnicodeText empty_input = UTF8ToUnicodeText("", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexMatcher> matcher =
      compiled->Matcher(empty_input);
  int status;

  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
321
TEST_F(UniLibTest, BreakIterator) {
  // Collects every position the iterator reports until it signals kDone.
  const UnicodeText two_words =
      UTF8ToUnicodeText("some text", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> breaker =
      unilib_->CreateBreakIterator(two_words);

  std::vector<int> reported;
  for (int position = breaker->Next();
       position != UniLib::BreakIterator::kDone; position = breaker->Next()) {
    reported.push_back(position);
  }

  EXPECT_THAT(reported, ElementsAre(4, 5, 9));
}
333
TEST_F(UniLibTest, BreakIterator4ByteUTF8) {
  // Three codepoints that each take four bytes in UTF-8 (U+1F600, U+1F602,
  // U+1F60B), written as escaped bytes so the input does not depend on the
  // encoding of this source file. With an empty input the iterator has nothing
  // to report and the expectation below could never be met.
  //
  // The expected breaks 1, 2, 3 are positions counted in codepoints; counted in
  // bytes they would be 4, 8, 12.
  const UnicodeText text = UTF8ToUnicodeText(
      "\xF0\x9F\x98\x80\xF0\x9F\x98\x82\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> iterator =
      unilib_->CreateBreakIterator(text);
  std::vector<int> break_indices;
  int break_index = 0;
  while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    break_indices.push_back(break_index);
  }
  EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
}
345
TEST_F(UniLibTest, Integer32Parse) {
  // A plain decimal string parses to the matching 32-bit value.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  int result = 0;
  ASSERT_TRUE(unilib_->ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false),
                                  &result));
  EXPECT_EQ(result, 123);
}
352
TEST_F(UniLibTest, Integer32ParseFloatNumber) {
  // Text with a fractional part must be rejected by the 32-bit integer parser.
  const UnicodeText decimal_text =
      UTF8ToUnicodeText("12.3", /*do_copy=*/false);
  int parsed;
  EXPECT_FALSE(unilib_->ParseInt32(decimal_text, &parsed));
}
358
TEST_F(UniLibTest, Integer32ParseLongNumber) {
  // A ten-digit value that still fits into 32 bits.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  int32 result = 0;
  ASSERT_TRUE(unilib_->ParseInt32(
      UTF8ToUnicodeText("1000000000", /*do_copy=*/false), &result));
  EXPECT_EQ(result, 1000000000);
}
365
TEST_F(UniLibTest, Integer32ParseOverflowNumber) {
  // A ten-digit value that is too large for 32 bits must be rejected.
  const UnicodeText too_large =
      UTF8ToUnicodeText("9123456789", /*do_copy=*/false);
  int32 parsed;
  EXPECT_FALSE(unilib_->ParseInt32(too_large, &parsed));
}
371
TEST_F(UniLibTest, Integer32ParseEmptyString) {
  // Empty input is not a number.
  const UnicodeText empty_text = UTF8ToUnicodeText("", /*do_copy=*/false);
  int parsed;
  EXPECT_FALSE(unilib_->ParseInt32(empty_text, &parsed));
}
377
TEST_F(UniLibTest, Integer32ParseFullWidth) {
  // The input string here is full width: FULLWIDTH DIGIT ONE, TWO, THREE
  // (U+FF11..U+FF13), written as escaped UTF-8 bytes. Spelled as glyphs the
  // digits are easily folded to ASCII "123", after which this test no longer
  // covers full-width input at all.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  int result = 0;
  ASSERT_TRUE(unilib_->ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  EXPECT_EQ(result, 123);
}
385
TEST_F(UniLibTest, Integer32ParseNotNumber) {
  int result;
  // The input string here is full width: FULLWIDTH DIGIT ONE (U+FF11), an
  // ASCII 'a', FULLWIDTH DIGIT THREE (U+FF13). The literal is split around the
  // 'a' because a hex escape would otherwise absorb it as another hex digit.
  EXPECT_FALSE(unilib_->ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91" "a" "\xEF\xBC\x93", /*do_copy=*/false),
      &result));
  // Strings starting with "nan" are not numbers.
  EXPECT_FALSE(unilib_->ParseInt32(UTF8ToUnicodeText("Nancy",
                                                     /*do_copy=*/false),
                                   &result));
  // Strings starting with "inf" are not numbers
  EXPECT_FALSE(unilib_->ParseInt32(
      UTF8ToUnicodeText("Information", /*do_copy=*/false), &result));
}
399
TEST_F(UniLibTest, Integer64Parse) {
  // A plain decimal string parses to the matching 64-bit value.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  int64 result = 0;
  ASSERT_TRUE(unilib_->ParseInt64(UTF8ToUnicodeText("123", /*do_copy=*/false),
                                  &result));
  EXPECT_EQ(result, 123);
}
406
TEST_F(UniLibTest, Integer64ParseFloatNumber) {
  // Text with a fractional part must be rejected by the 64-bit integer parser.
  const UnicodeText decimal_text =
      UTF8ToUnicodeText("12.3", /*do_copy=*/false);
  int64 parsed;
  EXPECT_FALSE(unilib_->ParseInt64(decimal_text, &parsed));
}
412
TEST_F(UniLibTest, Integer64ParseLongNumber) {
  // The limitation comes from the javaicu implementation: parseDouble does not
  // have ICU support and parseInt limit the size of the number.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  int64 result = 0;
  ASSERT_TRUE(unilib_->ParseInt64(
      UTF8ToUnicodeText("1000000000", /*do_copy=*/false), &result));
  EXPECT_EQ(result, 1000000000);
}
421
TEST_F(UniLibTest, Integer64ParseOverflowNumber) {
  // A twenty-digit positive value must be rejected by the 64-bit parser.
  const UnicodeText too_large =
      UTF8ToUnicodeText("92233720368547758099", /*do_copy=*/false);
  int64 parsed;
  EXPECT_FALSE(unilib_->ParseInt64(too_large, &parsed));
}
427
TEST_F(UniLibTest, Integer64ParseOverflowNegativeNumber) {
  // The same twenty-digit magnitude with a minus sign must be rejected as well.
  const UnicodeText too_small =
      UTF8ToUnicodeText("-92233720368547758099", /*do_copy=*/false);
  int64 parsed;
  EXPECT_FALSE(unilib_->ParseInt64(too_small, &parsed));
}
433
TEST_F(UniLibTest, Integer64ParseEmptyString) {
  // Empty input is not a number.
  const UnicodeText empty_text = UTF8ToUnicodeText("", /*do_copy=*/false);
  int64 parsed;
  EXPECT_FALSE(unilib_->ParseInt64(empty_text, &parsed));
}
439
TEST_F(UniLibTest, Integer64ParseFullWidth) {
  // The input string here is full width: FULLWIDTH DIGIT ONE, TWO, THREE
  // (U+FF11..U+FF13), written as escaped UTF-8 bytes. Spelled as glyphs the
  // digits are easily folded to ASCII "123", after which this test no longer
  // covers full-width input at all.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  int64 result = 0;
  ASSERT_TRUE(unilib_->ParseInt64(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  EXPECT_EQ(result, 123);
}
447
TEST_F(UniLibTest, Integer64ParseNotNumber) {
  int64 result;
  // The input string here is full width: FULLWIDTH DIGIT ONE (U+FF11), an
  // ASCII 'a', FULLWIDTH DIGIT FOUR (U+FF14). The literal is split around the
  // 'a' because a hex escape would otherwise absorb it as another hex digit.
  EXPECT_FALSE(unilib_->ParseInt64(
      UTF8ToUnicodeText("\xEF\xBC\x91" "a" "\xEF\xBC\x94", /*do_copy=*/false),
      &result));
  // Strings starting with "nan" are not numbers.
  EXPECT_FALSE(unilib_->ParseInt64(UTF8ToUnicodeText("Nancy",
                                                     /*do_copy=*/false),
                                   &result));
  // Strings starting with "inf" are not numbers
  EXPECT_FALSE(unilib_->ParseInt64(
      UTF8ToUnicodeText("Information", /*do_copy=*/false), &result));
}
461
TEST_F(UniLibTest, DoubleParse) {
  // A simple decimal string parses to the matching double.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  double result = 0.0;
  ASSERT_TRUE(unilib_->ParseDouble(UTF8ToUnicodeText("1.23", /*do_copy=*/false),
                                   &result));
  // Floating-point comparison with a tolerance of a few ULPs instead of exact
  // equality, so a last-bit rounding difference in a parser does not fail it.
  EXPECT_DOUBLE_EQ(result, 1.23);
}
468
TEST_F(UniLibTest, DoubleParseLongNumber) {
  // The limitation comes from the javaicu implementation: parseDouble does not
  // have ICU support and parseInt limit the size of the number.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  double result = 0.0;
  ASSERT_TRUE(unilib_->ParseDouble(
      UTF8ToUnicodeText("999999999.999999999", /*do_copy=*/false), &result));
  // The literal has more significant digits than a double can hold exactly, so
  // compare with a tolerance of a few ULPs rather than with exact equality.
  EXPECT_DOUBLE_EQ(result, 999999999.999999999);
}
477
TEST_F(UniLibTest, DoubleParseWithoutFractionalPart) {
  // An integer-looking string is accepted by the double parser.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  double result = 0.0;
  ASSERT_TRUE(unilib_->ParseDouble(UTF8ToUnicodeText("123", /*do_copy=*/false),
                                   &result));
  // Floating-point comparison against a double literal, not double == int.
  EXPECT_DOUBLE_EQ(result, 123.0);
}
484
TEST_F(UniLibTest, DoubleParseEmptyString) {
  // Empty input is not a number.
  const UnicodeText empty_text = UTF8ToUnicodeText("", /*do_copy=*/false);
  double parsed;
  EXPECT_FALSE(unilib_->ParseDouble(empty_text, &parsed));
}
490
TEST_F(UniLibTest, DoubleParsePrecedingDot) {
  // A number that begins with the dot (no integer part) must be rejected.
  const UnicodeText no_integer_part =
      UTF8ToUnicodeText(".123", /*do_copy=*/false);
  double parsed;
  EXPECT_FALSE(unilib_->ParseDouble(no_integer_part, &parsed));
}
496
TEST_F(UniLibTest, DoubleParseLeadingDot) {
  // Despite the test name, the dot here comes after the digits: a number that
  // ends with the dot (no fractional digits) must be rejected.
  const UnicodeText no_fraction_digits =
      UTF8ToUnicodeText("123.", /*do_copy=*/false);
  double parsed;
  EXPECT_FALSE(unilib_->ParseDouble(no_fraction_digits, &parsed));
}
502
TEST_F(UniLibTest, DoubleParseMultipleDots) {
  // More than one dot must be rejected.
  const UnicodeText two_dots = UTF8ToUnicodeText("1.2.3", /*do_copy=*/false);
  double parsed;
  EXPECT_FALSE(unilib_->ParseDouble(two_dots, &parsed));
}
508
TEST_F(UniLibTest, DoubleParseFullWidth) {
  // The input string here is full width: FULLWIDTH DIGIT ONE, an ASCII dot,
  // FULLWIDTH DIGIT TWO, FULLWIDTH DIGIT THREE (U+FF11, U+FF12, U+FF13),
  // written as escaped UTF-8 bytes. Spelled as glyphs the digits are easily
  // folded to ASCII "1.23", after which this test no longer covers full-width
  // input at all.
  // `result` is initialised and the parse is guarded by ASSERT, so a failed
  // parse stops the test instead of comparing an indeterminate value.
  double result = 0.0;
  ASSERT_TRUE(unilib_->ParseDouble(
      UTF8ToUnicodeText("\xEF\xBC\x91.\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  // Floating-point comparison with a tolerance of a few ULPs instead of exact
  // equality.
  EXPECT_DOUBLE_EQ(result, 1.23);
}
516
TEST_F(UniLibTest, DoubleParseNotNumber) {
  double result;
  // The input string here is full width: FULLWIDTH DIGIT ONE (U+FF11), an
  // ASCII 'a', FULLWIDTH DIGIT FIVE (U+FF15). The literal is split around the
  // 'a' because a hex escape would otherwise absorb it as another hex digit.
  EXPECT_FALSE(unilib_->ParseDouble(
      UTF8ToUnicodeText("\xEF\xBC\x91" "a" "\xEF\xBC\x95", /*do_copy=*/false),
      &result));
  // Strings starting with "nan" are not numbers.
  EXPECT_FALSE(unilib_->ParseDouble(
      UTF8ToUnicodeText("Nancy", /*do_copy=*/false), &result));
  // Strings starting with "inf" are not numbers
  EXPECT_FALSE(unilib_->ParseDouble(
      UTF8ToUnicodeText("Information", /*do_copy=*/false), &result));
}
529
TEST_F(UniLibTest, Length) {
  // Five ASCII letters and five non-ASCII letters both have length 5.
  const UnicodeText ascii_word = UTF8ToUnicodeText("hello", /*do_copy=*/false);
  EXPECT_EQ(unilib_->Length(ascii_word).ValueOrDie(), 5);

  const UnicodeText accented_word =
      UTF8ToUnicodeText("ěščřž", /*do_copy=*/false);
  EXPECT_EQ(unilib_->Length(accented_word).ValueOrDie(), 5);

  // Invalid UTF8: the only thing checked is that the three bytes are not
  // counted as one character. The expectation has to be "!= 1" because the
  // backends disagree on the actual value: Apple character counting seems to
  // return 0 for invalid UTF8, while ICU treats the invalid codepoint as 3
  // separate bytes.
  const UnicodeText invalid_utf8 =
      UTF8ToUnicodeText("\xed\xa0\x80", /*do_copy=*/false);
  EXPECT_NE(unilib_->Length(invalid_utf8).ValueOrDie(), 1);
}
546
547 } // namespace test_internal
548 } // namespace libtextclassifier3
549