1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8/unilib_test-include.h"
18 
19 #include "utils/base/logging.h"
20 #include "gmock/gmock.h"
21 
22 namespace libtextclassifier3 {
23 namespace test_internal {
24 
25 using ::testing::ElementsAre;
26 
TEST_F(UniLibTest,CharacterClassesAscii)27 TEST_F(UniLibTest, CharacterClassesAscii) {
28   EXPECT_TRUE(unilib_->IsOpeningBracket('('));
29   EXPECT_TRUE(unilib_->IsClosingBracket(')'));
30   EXPECT_FALSE(unilib_->IsWhitespace(')'));
31   EXPECT_TRUE(unilib_->IsWhitespace(' '));
32   EXPECT_FALSE(unilib_->IsDigit(')'));
33   EXPECT_TRUE(unilib_->IsDigit('0'));
34   EXPECT_TRUE(unilib_->IsDigit('9'));
35   EXPECT_FALSE(unilib_->IsUpper(')'));
36   EXPECT_TRUE(unilib_->IsUpper('A'));
37   EXPECT_TRUE(unilib_->IsUpper('Z'));
38   EXPECT_FALSE(unilib_->IsLower(')'));
39   EXPECT_TRUE(unilib_->IsLower('a'));
40   EXPECT_TRUE(unilib_->IsLower('z'));
41   EXPECT_TRUE(unilib_->IsPunctuation('!'));
42   EXPECT_TRUE(unilib_->IsPunctuation('?'));
43   EXPECT_TRUE(unilib_->IsPunctuation('#'));
44   EXPECT_TRUE(unilib_->IsPunctuation('('));
45   EXPECT_FALSE(unilib_->IsPunctuation('0'));
46   EXPECT_FALSE(unilib_->IsPunctuation('$'));
47   EXPECT_TRUE(unilib_->IsPercentage('%'));
48   EXPECT_TRUE(unilib_->IsPercentage(u'%'));
49   EXPECT_TRUE(unilib_->IsSlash('/'));
50   EXPECT_TRUE(unilib_->IsSlash(u'/'));
51   EXPECT_TRUE(unilib_->IsMinus('-'));
52   EXPECT_TRUE(unilib_->IsMinus(u'-'));
53   EXPECT_TRUE(unilib_->IsNumberSign('#'));
54   EXPECT_TRUE(unilib_->IsNumberSign(u'#'));
55   EXPECT_TRUE(unilib_->IsDot('.'));
56   EXPECT_TRUE(unilib_->IsDot(u'.'));
57   EXPECT_TRUE(unilib_->IsApostrophe('\''));
58   EXPECT_TRUE(unilib_->IsApostrophe(u'ߴ'));
59   EXPECT_TRUE(unilib_->IsQuotation(u'"'));
60   EXPECT_TRUE(unilib_->IsQuotation(u'”'));
61   EXPECT_TRUE(unilib_->IsAmpersand(u'&'));
62   EXPECT_TRUE(unilib_->IsAmpersand(u'﹠'));
63   EXPECT_TRUE(unilib_->IsAmpersand(u'&'));
64 
65   EXPECT_TRUE(unilib_->IsLatinLetter('A'));
66   EXPECT_TRUE(unilib_->IsArabicLetter(u'ب'));  // ARABIC LETTER BEH
67   EXPECT_TRUE(
68       unilib_->IsCyrillicLetter(u'ᲀ'));  // CYRILLIC SMALL LETTER ROUNDED VE
69   EXPECT_TRUE(unilib_->IsChineseLetter(u'豈'));   // CJK COMPATIBILITY IDEOGRAPH
70   EXPECT_TRUE(unilib_->IsJapaneseLetter(u'ぁ'));  // HIRAGANA LETTER SMALL A
71   EXPECT_TRUE(unilib_->IsKoreanLetter(u'ㄱ'));    // HANGUL LETTER KIYEOK
72   EXPECT_TRUE(unilib_->IsThaiLetter(u'ก'));       // THAI CHARACTER KO KAI
73   EXPECT_TRUE(unilib_->IsCJTletter(u'ก'));        // THAI CHARACTER KO KAI
74   EXPECT_FALSE(unilib_->IsCJTletter('A'));
75 
76   EXPECT_TRUE(unilib_->IsLetter('A'));
77   EXPECT_TRUE(unilib_->IsLetter(u'A'));
78   EXPECT_TRUE(unilib_->IsLetter(u'ト'));  // KATAKANA LETTER TO
79   EXPECT_TRUE(unilib_->IsLetter(u'ト'));   // HALFWIDTH KATAKANA LETTER TO
80   EXPECT_TRUE(unilib_->IsLetter(u'豈'));  // CJK COMPATIBILITY IDEOGRAPH
81 
82   EXPECT_EQ(unilib_->ToLower('A'), 'a');
83   EXPECT_EQ(unilib_->ToLower('Z'), 'z');
84   EXPECT_EQ(unilib_->ToLower(')'), ')');
85   EXPECT_EQ(unilib_->ToLowerText(UTF8ToUnicodeText("Never gonna give you up."))
86                 .ToUTF8String(),
87             "never gonna give you up.");
88   EXPECT_EQ(unilib_->ToUpper('a'), 'A');
89   EXPECT_EQ(unilib_->ToUpper('z'), 'Z');
90   EXPECT_EQ(unilib_->ToUpper(')'), ')');
91   EXPECT_EQ(unilib_->ToUpperText(UTF8ToUnicodeText("Never gonna let you down."))
92                 .ToUTF8String(),
93             "NEVER GONNA LET YOU DOWN.");
94   EXPECT_EQ(unilib_->GetPairedBracket(')'), '(');
95   EXPECT_EQ(unilib_->GetPairedBracket('}'), '{');
96 }
97 
TEST_F(UniLibTest,CharacterClassesUnicode)98 TEST_F(UniLibTest, CharacterClassesUnicode) {
99   EXPECT_TRUE(unilib_->IsOpeningBracket(0x0F3C));  // TIBET ANG KHANG GYON
100   EXPECT_TRUE(unilib_->IsClosingBracket(0x0F3D));  // TIBET ANG KHANG GYAS
101   EXPECT_FALSE(unilib_->IsWhitespace(0x23F0));     // ALARM CLOCK
102   EXPECT_TRUE(unilib_->IsWhitespace(0x2003));      // EM SPACE
103   EXPECT_FALSE(unilib_->IsDigit(0xA619));          // VAI SYMBOL JONG
104   EXPECT_TRUE(unilib_->IsDigit(0xA620));           // VAI DIGIT ZERO
105   EXPECT_TRUE(unilib_->IsDigit(0xA629));           // VAI DIGIT NINE
106   EXPECT_FALSE(unilib_->IsDigit(0xA62A));          // VAI SYLLABLE NDOLE MA
107   EXPECT_FALSE(unilib_->IsUpper(0x0211));          // SMALL R WITH DOUBLE GRAVE
108   EXPECT_TRUE(unilib_->IsUpper(0x0212));         // CAPITAL R WITH DOUBLE GRAVE
109   EXPECT_TRUE(unilib_->IsUpper(0x0391));         // GREEK CAPITAL ALPHA
110   EXPECT_TRUE(unilib_->IsUpper(0x03AB));         // GREEK CAPITAL UPSILON W DIAL
111   EXPECT_FALSE(unilib_->IsUpper(0x03AC));        // GREEK SMALL ALPHA WITH TONOS
112   EXPECT_TRUE(unilib_->IsLower(0x03AC));         // GREEK SMALL ALPHA WITH TONOS
113   EXPECT_TRUE(unilib_->IsLower(0x03B1));         // GREEK SMALL ALPHA
114   EXPECT_TRUE(unilib_->IsLower(0x03CB));         // GREEK SMALL UPSILON
115   EXPECT_TRUE(unilib_->IsLower(0x0211));         // SMALL R WITH DOUBLE GRAVE
116   EXPECT_TRUE(unilib_->IsLower(0x03C0));         // GREEK SMALL PI
117   EXPECT_TRUE(unilib_->IsLower(0x007A));         // SMALL Z
118   EXPECT_FALSE(unilib_->IsLower(0x005A));        // CAPITAL Z
119   EXPECT_FALSE(unilib_->IsLower(0x0212));        // CAPITAL R WITH DOUBLE GRAVE
120   EXPECT_FALSE(unilib_->IsLower(0x0391));        // GREEK CAPITAL ALPHA
121   EXPECT_TRUE(unilib_->IsPunctuation(0x055E));   // ARMENIAN QUESTION MARK
122   EXPECT_TRUE(unilib_->IsPunctuation(0x066C));   // ARABIC THOUSANDS SEPARATOR
123   EXPECT_TRUE(unilib_->IsPunctuation(0x07F7));   // NKO SYMBOL GBAKURUNEN
124   EXPECT_TRUE(unilib_->IsPunctuation(0x10AF2));  // DOUBLE DOT WITHIN DOT
125   EXPECT_FALSE(unilib_->IsPunctuation(0x00A3));  // POUND SIGN
126   EXPECT_FALSE(unilib_->IsPunctuation(0xA838));  // NORTH INDIC RUPEE MARK
127   EXPECT_TRUE(unilib_->IsPercentage(0x0025));    // PERCENT SIGN
128   EXPECT_TRUE(unilib_->IsPercentage(0xFF05));    // FULLWIDTH PERCENT SIGN
129   EXPECT_TRUE(unilib_->IsSlash(0x002F));         // SOLIDUS
130   EXPECT_TRUE(unilib_->IsSlash(0xFF0F));         // FULLWIDTH SOLIDUS
131   EXPECT_TRUE(unilib_->IsMinus(0x002D));         // HYPHEN-MINUS
132   EXPECT_TRUE(unilib_->IsMinus(0xFF0D));         // FULLWIDTH HYPHEN-MINUS
133   EXPECT_TRUE(unilib_->IsNumberSign(0x0023));    // NUMBER SIGN
134   EXPECT_TRUE(unilib_->IsNumberSign(0xFF03));    // FULLWIDTH NUMBER SIGN
135   EXPECT_TRUE(unilib_->IsDot(0x002E));           // FULL STOP
136   EXPECT_TRUE(unilib_->IsDot(0xFF0E));           // FULLWIDTH FULL STOP
137 
138   EXPECT_TRUE(unilib_->IsLatinLetter(0x0041));   // LATIN CAPITAL LETTER A
139   EXPECT_TRUE(unilib_->IsArabicLetter(0x0628));  // ARABIC LETTER BEH
140   EXPECT_TRUE(
141       unilib_->IsCyrillicLetter(0x1C80));  // CYRILLIC SMALL LETTER ROUNDED VE
142   EXPECT_TRUE(unilib_->IsChineseLetter(0xF900));  // CJK COMPATIBILITY IDEOGRAPH
143   EXPECT_TRUE(unilib_->IsJapaneseLetter(0x3041));  // HIRAGANA LETTER SMALL A
144   EXPECT_TRUE(unilib_->IsKoreanLetter(0x3131));    // HANGUL LETTER KIYEOK
145   EXPECT_TRUE(unilib_->IsThaiLetter(0x0E01));      // THAI CHARACTER KO KAI
146   EXPECT_TRUE(unilib_->IsCJTletter(0x0E01));       // THAI CHARACTER KO KAI
147   EXPECT_FALSE(unilib_->IsCJTletter(0x0041));      // LATIN CAPITAL LETTER A
148 
149   EXPECT_TRUE(unilib_->IsLetter(0x0041));  // LATIN CAPITAL LETTER A
150   EXPECT_TRUE(unilib_->IsLetter(0xFF21));  // FULLWIDTH LATIN CAPITAL LETTER A
151   EXPECT_TRUE(unilib_->IsLetter(0x30C8));  // KATAKANA LETTER TO
152   EXPECT_TRUE(unilib_->IsLetter(0xFF84));  // HALFWIDTH KATAKANA LETTER TO
153   EXPECT_TRUE(unilib_->IsLetter(0xF900));  // CJK COMPATIBILITY IDEOGRAPH
154 
155   EXPECT_EQ(unilib_->ToLower(0x0391), 0x03B1);  // GREEK ALPHA
156   EXPECT_EQ(unilib_->ToLower(0x03AB), 0x03CB);  // GREEK UPSILON WITH DIALYTIKA
157   EXPECT_EQ(unilib_->ToLower(0x03C0), 0x03C0);  // GREEK SMALL PI
158   EXPECT_EQ(unilib_->ToLower(0x03A3), 0x03C3);  // GREEK CAPITAL LETTER SIGMA
159   EXPECT_EQ(
160       unilib_->ToLowerText(UTF8ToUnicodeText("Κανένας άνθρωπος δεν ξέρει"))
161           .ToUTF8String(),
162       "κανένας άνθρωπος δεν ξέρει");
163   EXPECT_TRUE(unilib_->IsLowerText(UTF8ToUnicodeText("ξέρει")));
164   EXPECT_EQ(unilib_->ToUpper(0x03B1), 0x0391);  // GREEK ALPHA
165   EXPECT_EQ(unilib_->ToUpper(0x03CB), 0x03AB);  // GREEK UPSILON WITH DIALYTIKA
166   EXPECT_EQ(unilib_->ToUpper(0x0391), 0x0391);  // GREEK CAPITAL ALPHA
167   EXPECT_EQ(unilib_->ToUpper(0x03C3), 0x03A3);  // GREEK CAPITAL LETTER SIGMA
168   EXPECT_EQ(unilib_->ToUpper(0x03C2), 0x03A3);  // GREEK CAPITAL LETTER SIGMA
169   EXPECT_EQ(
170       unilib_->ToUpperText(UTF8ToUnicodeText("Κανένας άνθρωπος δεν ξέρει"))
171           .ToUTF8String(),
172       "ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ");
173   EXPECT_TRUE(unilib_->IsUpperText(UTF8ToUnicodeText("ΚΑΝΈΝΑΣ")));
174   EXPECT_EQ(unilib_->GetPairedBracket(0x0F3C), 0x0F3D);
175   EXPECT_EQ(unilib_->GetPairedBracket(0x0F3D), 0x0F3C);
176 }
177 
// Smoke test of the regex API surface: builds a pattern and a matcher, then
// calls every matcher accessor once and logs what it returns. Nothing is
// asserted about the returned values; the test only exercises the calls.
TEST_F(UniLibTest, RegexInterface) {
  const UnicodeText digits_regex =
      UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true);
  const UnicodeText haystack =
      UTF8ToUnicodeText("hello 0123", /*do_copy=*/false);

  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib_->CreateRegexPattern(digits_regex);
  std::unique_ptr<UniLib::RegexMatcher> regex_matcher =
      compiled->Matcher(haystack);

  // Out-parameter required by every matcher call; its value is not inspected.
  int status;
  TC3_LOG(INFO) << regex_matcher->Matches(&status);
  TC3_LOG(INFO) << regex_matcher->Find(&status);
  TC3_LOG(INFO) << regex_matcher->Start(0, &status);
  TC3_LOG(INFO) << regex_matcher->End(0, &status);
  TC3_LOG(INFO) << regex_matcher->Group(0, &status).size_codepoints();
}
192 
// Full-match and search behaviour of a pattern that ends in a 4-byte UTF-8
// character.
TEST_F(UniLibTest, Regex) {
  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly. It is written as its UTF-8 bytes (F0 9F 98 8B) so the literal
  // survives tools that cannot represent characters outside the BMP.
  const UnicodeText regex_pattern =
      UTF8ToUnicodeText("[0-9]+\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib_->CreateRegexPattern(regex_pattern);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  // Input that is exactly digits followed by the smiley: a full match.
  matcher = pattern->Matcher(
      UTF8ToUnicodeText("0123\xF0\x9F\x98\x8B", /*do_copy=*/false));
  EXPECT_TRUE(matcher->Matches(&status));
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Input with surrounding text: no full match.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_FALSE(matcher->Matches(&status));
  EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Same input, searched instead of matched. The expected offsets count each
  // smiley as one position: "hello" (5) + two smileys + a space puts the first
  // digit at 8, and four digits plus one smiley end the match at 13.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
229 
// Same full-match checks as for a regular pattern, but on a pattern obtained
// from CreateLazyRegexPattern.
TEST_F(UniLibTest, RegexLazy) {
  const UnicodeText letter_then_digit =
      UTF8ToUnicodeText("[a-z][0-9]", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> lazy_pattern =
      unilib_->CreateLazyRegexPattern(letter_then_digit);
  std::unique_ptr<UniLib::RegexMatcher> regex_matcher;
  int status;

  // A letter followed by a digit matches, and keeps matching on repeat calls.
  regex_matcher =
      lazy_pattern->Matcher(UTF8ToUnicodeText("a3", /*do_copy=*/false));
  EXPECT_TRUE(regex_matcher->Matches(&status));
  EXPECT_TRUE(regex_matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  // Check that the state is reset.
  EXPECT_TRUE(regex_matcher->Matches(&status));
  EXPECT_TRUE(regex_matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // The reversed order does not match, and that is not reported as an error.
  regex_matcher =
      lazy_pattern->Matcher(UTF8ToUnicodeText("3a", /*do_copy=*/false));
  EXPECT_FALSE(regex_matcher->Matches(&status));
  EXPECT_FALSE(regex_matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
250 
// Start/End/Group accessors for the whole match and for two capturing groups,
// on input containing 4-byte UTF-8 characters.
TEST_F(UniLibTest, RegexGroups) {
  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly. It is written as its UTF-8 bytes (F0 9F 98 8B) so the literal
  // survives tools that cannot represent characters outside the BMP.
  const UnicodeText regex_pattern =
      UTF8ToUnicodeText("([0-9])([0-9]+)\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib_->CreateRegexPattern(regex_pattern);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Expected offsets count each smiley as one position: the first digit is at
  // 8, group 1 is that single digit, group 2 the remaining three digits, and
  // the whole match additionally covers the trailing smiley.
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(1, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(2, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(1, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(2, &status), 12);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
285 
// An optional capturing group that takes no part in the match is expected to
// yield an empty string, with no error status.
TEST_F(UniLibTest, RegexGroupsNotAllGroupsInvolved) {
  const UnicodeText digit_optional_letter =
      UTF8ToUnicodeText("([0-9])([a-z])?", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib_->CreateRegexPattern(digit_optional_letter);
  std::unique_ptr<UniLib::RegexMatcher> regex_matcher =
      compiled->Matcher(UTF8ToUnicodeText("7", /*do_copy=*/false));
  int status;

  EXPECT_TRUE(regex_matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Whole match and the digit group both cover the single input character.
  EXPECT_EQ(regex_matcher->Group(0, &status).ToUTF8String(), "7");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(regex_matcher->Group(1, &status).ToUTF8String(), "7");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // The letter group did not participate.
  EXPECT_EQ(regex_matcher->Group(2, &status).ToUTF8String(), "");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
304 
// A pattern that can match the empty string is expected to be found in empty
// input, with both the whole match and its capturing group empty.
TEST_F(UniLibTest, RegexGroupsEmptyResult) {
  const UnicodeText match_anything =
      UTF8ToUnicodeText("(.*)", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib_->CreateRegexPattern(match_anything);
  std::unique_ptr<UniLib::RegexMatcher> regex_matcher =
      compiled->Matcher(UTF8ToUnicodeText("", /*do_copy=*/false));
  int status;

  EXPECT_TRUE(regex_matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(regex_matcher->Group(0, &status).ToUTF8String(), "");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(regex_matcher->Group(1, &status).ToUTF8String(), "");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
321 
// Collects every boundary reported by the break iterator for "some text" and
// checks that they fall after "some", after the space, and at the end.
TEST_F(UniLibTest, BreakIterator) {
  const UnicodeText two_words =
      UTF8ToUnicodeText("some text", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> breaker =
      unilib_->CreateBreakIterator(two_words);

  // Drain the iterator until it reports kDone.
  std::vector<int> boundaries;
  for (int position = breaker->Next();
       position != UniLib::BreakIterator::kDone; position = breaker->Next()) {
    boundaries.push_back(position);
  }
  EXPECT_THAT(boundaries, ElementsAre(4, 5, 9));
}
333 
// Break positions for text made only of 4-byte UTF-8 characters. The expected
// boundaries (1, 2, 3) count each character as one position, not four bytes.
TEST_F(UniLibTest, BreakIterator4ByteUTF8) {
  // Three emoji, each a 4-byte UTF-8 sequence: U+1F600, U+1F602, U+1F60B.
  // Written as escaped bytes so the literal survives tools that cannot
  // represent characters outside the BMP.
  const UnicodeText text = UTF8ToUnicodeText(
      "\xF0\x9F\x98\x80\xF0\x9F\x98\x82\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> iterator =
      unilib_->CreateBreakIterator(text);
  std::vector<int> break_indices;
  int break_index = 0;
  while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    break_indices.push_back(break_index);
  }
  EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
}
345 
// A plain decimal string parses to the corresponding 32-bit integer.
TEST_F(UniLibTest, Integer32Parse) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  int result = 0;
  ASSERT_TRUE(unilib_->ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false),
                                  &result));
  EXPECT_EQ(result, 123);
}
352 
// A string with a fractional part is rejected by the 32-bit integer parser.
TEST_F(UniLibTest, Integer32ParseFloatNumber) {
  const UnicodeText decimal_text =
      UTF8ToUnicodeText("12.3", /*do_copy=*/false);
  int parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt32(decimal_text, &parsed));
}
358 
// A ten-digit value that still fits in 32 bits parses successfully.
TEST_F(UniLibTest, Integer32ParseLongNumber) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  int32 result = 0;
  ASSERT_TRUE(unilib_->ParseInt32(
      UTF8ToUnicodeText("1000000000", /*do_copy=*/false), &result));
  EXPECT_EQ(result, 1000000000);
}
365 
// A value larger than the 32-bit signed maximum is rejected.
TEST_F(UniLibTest, Integer32ParseOverflowNumber) {
  const UnicodeText too_large =
      UTF8ToUnicodeText("9123456789", /*do_copy=*/false);
  int32 parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt32(too_large, &parsed));
}
371 
// Empty input is rejected by the 32-bit integer parser.
TEST_F(UniLibTest, Integer32ParseEmptyString) {
  const UnicodeText empty_text = UTF8ToUnicodeText("", /*do_copy=*/false);
  int parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt32(empty_text, &parsed));
}
377 
// Full-width digits are accepted and parse to the same value as ASCII digits.
TEST_F(UniLibTest, Integer32ParseFullWidth) {
  int result = 0;
  // The input string here is full width: U+FF11 U+FF12 U+FF13 ("123" in
  // full-width digits), written as UTF-8 bytes so it cannot be folded to ASCII.
  ASSERT_TRUE(unilib_->ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  EXPECT_EQ(result, 123);
}
385 
// Inputs that contain or start with letters are rejected by the 32-bit
// integer parser.
TEST_F(UniLibTest, Integer32ParseNotNumber) {
  int result;  // Out-parameter only; never read because parsing must fail.
  // The input string here is full width: U+FF11 U+FF41 U+FF13 (digit, letter
  // "a", digit), written as UTF-8 bytes so it cannot be folded to ASCII.
  EXPECT_FALSE(unilib_->ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBD\x81\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  // Strings starting with "nan" are not numbers.
  EXPECT_FALSE(unilib_->ParseInt32(UTF8ToUnicodeText("Nancy",
                                                     /*do_copy=*/false),
                                   &result));
  // Strings starting with "inf" are not numbers.
  EXPECT_FALSE(unilib_->ParseInt32(
      UTF8ToUnicodeText("Information", /*do_copy=*/false), &result));
}
399 
// A plain decimal string parses to the corresponding 64-bit integer.
TEST_F(UniLibTest, Integer64Parse) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  int64 result = 0;
  ASSERT_TRUE(unilib_->ParseInt64(UTF8ToUnicodeText("123", /*do_copy=*/false),
                                  &result));
  EXPECT_EQ(result, 123);
}
406 
// A string with a fractional part is rejected by the 64-bit integer parser.
TEST_F(UniLibTest, Integer64ParseFloatNumber) {
  const UnicodeText decimal_text =
      UTF8ToUnicodeText("12.3", /*do_copy=*/false);
  int64 parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt64(decimal_text, &parsed));
}
412 
// A ten-digit value parses successfully with the 64-bit integer parser.
TEST_F(UniLibTest, Integer64ParseLongNumber) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  int64 result = 0;
  // The limitation comes from the javaicu implementation: parseDouble does not
  // have ICU support and parseInt limits the size of the number.
  ASSERT_TRUE(unilib_->ParseInt64(
      UTF8ToUnicodeText("1000000000", /*do_copy=*/false), &result));
  EXPECT_EQ(result, 1000000000);
}
421 
// A twenty-digit positive value, too large for 64 bits, is rejected.
TEST_F(UniLibTest, Integer64ParseOverflowNumber) {
  const UnicodeText too_large =
      UTF8ToUnicodeText("92233720368547758099", /*do_copy=*/false);
  int64 parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt64(too_large, &parsed));
}
427 
// A twenty-digit negative value, too small for 64 bits, is rejected.
TEST_F(UniLibTest, Integer64ParseOverflowNegativeNumber) {
  const UnicodeText too_small =
      UTF8ToUnicodeText("-92233720368547758099", /*do_copy=*/false);
  int64 parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt64(too_small, &parsed));
}
433 
// Empty input is rejected by the 64-bit integer parser.
TEST_F(UniLibTest, Integer64ParseEmptyString) {
  const UnicodeText empty_text = UTF8ToUnicodeText("", /*do_copy=*/false);
  int64 parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseInt64(empty_text, &parsed));
}
439 
// Full-width digits are accepted and parse to the same value as ASCII digits.
TEST_F(UniLibTest, Integer64ParseFullWidth) {
  int64 result = 0;
  // The input string here is full width: U+FF11 U+FF12 U+FF13 ("123" in
  // full-width digits), written as UTF-8 bytes so it cannot be folded to ASCII.
  ASSERT_TRUE(unilib_->ParseInt64(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  EXPECT_EQ(result, 123);
}
447 
// Inputs that contain or start with letters are rejected by the 64-bit
// integer parser.
TEST_F(UniLibTest, Integer64ParseNotNumber) {
  int64 result;  // Out-parameter only; never read because parsing must fail.
  // The input string here is full width: U+FF11 U+FF41 U+FF14 (digit, letter
  // "a", digit), written as UTF-8 bytes so it cannot be folded to ASCII.
  EXPECT_FALSE(unilib_->ParseInt64(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBD\x81\xEF\xBC\x94",
                        /*do_copy=*/false),
      &result));
  // Strings starting with "nan" are not numbers.
  EXPECT_FALSE(unilib_->ParseInt64(UTF8ToUnicodeText("Nancy",
                                                     /*do_copy=*/false),
                                   &result));
  // Strings starting with "inf" are not numbers.
  EXPECT_FALSE(unilib_->ParseInt64(
      UTF8ToUnicodeText("Information", /*do_copy=*/false), &result));
}
461 
// A decimal string with a fractional part parses to the matching double.
TEST_F(UniLibTest, DoubleParse) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  double result = 0.0;
  ASSERT_TRUE(unilib_->ParseDouble(UTF8ToUnicodeText("1.23", /*do_copy=*/false),
                                   &result));
  // Compare with a few-ULP tolerance rather than bit-exact equality, so the
  // test does not depend on how the parser assembles the value.
  EXPECT_DOUBLE_EQ(result, 1.23);
}
468 
// Nine digits on each side of the decimal point still parse successfully.
TEST_F(UniLibTest, DoubleParseLongNumber) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  double result = 0.0;
  // The limitation comes from the javaicu implementation: parseDouble does not
  // have ICU support and parseInt limits the size of the number.
  ASSERT_TRUE(unilib_->ParseDouble(
      UTF8ToUnicodeText("999999999.999999999", /*do_copy=*/false), &result));
  // Compare with a few-ULP tolerance rather than bit-exact equality, so the
  // test does not depend on how the parser assembles the value.
  EXPECT_DOUBLE_EQ(result, 999999999.999999999);
}
477 
// An integer-looking string is accepted by the double parser.
TEST_F(UniLibTest, DoubleParseWithoutFractionalPart) {
  // Initialized, and guarded by ASSERT_TRUE, so a failed parse never leads to
  // comparing an indeterminate value.
  double result = 0.0;
  ASSERT_TRUE(unilib_->ParseDouble(UTF8ToUnicodeText("123", /*do_copy=*/false),
                                   &result));
  EXPECT_DOUBLE_EQ(result, 123.0);
}
484 
// Empty input is rejected by the double parser.
TEST_F(UniLibTest, DoubleParseEmptyString) {
  const UnicodeText empty_text = UTF8ToUnicodeText("", /*do_copy=*/false);
  double parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseDouble(empty_text, &parsed));
}
490 
// A number with nothing before the decimal point (".123") is rejected.
TEST_F(UniLibTest, DoubleParsePrecedingDot) {
  const UnicodeText no_integer_part =
      UTF8ToUnicodeText(".123", /*do_copy=*/false);
  double parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseDouble(no_integer_part, &parsed));
}
496 
// A number with nothing after the decimal point ("123.") is rejected.
// Note: despite the test name, the dot in this input is trailing.
TEST_F(UniLibTest, DoubleParseLeadingDot) {
  const UnicodeText no_fraction_digits =
      UTF8ToUnicodeText("123.", /*do_copy=*/false);
  double parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseDouble(no_fraction_digits, &parsed));
}
502 
// More than one decimal point makes the input invalid.
TEST_F(UniLibTest, DoubleParseMultipleDots) {
  const UnicodeText two_dots = UTF8ToUnicodeText("1.2.3", /*do_copy=*/false);
  double parsed;  // Out-parameter only; never read because parsing must fail.
  EXPECT_FALSE(unilib_->ParseDouble(two_dots, &parsed));
}
508 
// Full-width digits and a full-width full stop are accepted by the double
// parser and yield the same value as their ASCII counterparts.
TEST_F(UniLibTest, DoubleParseFullWidth) {
  double result = 0.0;
  // The input string here is full width: U+FF11 U+FF0E U+FF12 U+FF13 ("1.23"
  // in full-width forms), written as UTF-8 bytes so it cannot be folded to
  // ASCII.
  ASSERT_TRUE(unilib_->ParseDouble(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x8E\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  // Compare with a few-ULP tolerance rather than bit-exact equality, so the
  // test does not depend on how the parser assembles the value.
  EXPECT_DOUBLE_EQ(result, 1.23);
}
516 
// Inputs that contain or start with letters are rejected by the double parser,
// including words whose prefix spells "nan" or "inf".
TEST_F(UniLibTest, DoubleParseNotNumber) {
  double result;  // Out-parameter only; never read because parsing must fail.
  // The input string here is full width: U+FF11 U+FF41 U+FF15 (digit, letter
  // "a", digit), written as UTF-8 bytes so it cannot be folded to ASCII.
  EXPECT_FALSE(unilib_->ParseDouble(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBD\x81\xEF\xBC\x95",
                        /*do_copy=*/false),
      &result));
  // Strings starting with "nan" are not numbers.
  EXPECT_FALSE(unilib_->ParseDouble(
      UTF8ToUnicodeText("Nancy", /*do_copy=*/false), &result));
  // Strings starting with "inf" are not numbers.
  EXPECT_FALSE(unilib_->ParseDouble(
      UTF8ToUnicodeText("Information", /*do_copy=*/false), &result));
}
529 
// Length() counts characters, not bytes: five for ASCII "hello" and five for
// a string of five two-byte accented letters.
TEST_F(UniLibTest, Length) {
  const UnicodeText ascii_word = UTF8ToUnicodeText("hello", /*do_copy=*/false);
  EXPECT_EQ(unilib_->Length(ascii_word).ValueOrDie(), 5);

  const UnicodeText accented_word =
      UTF8ToUnicodeText("ěščřž", /*do_copy=*/false);
  EXPECT_EQ(unilib_->Length(accented_word).ValueOrDie(), 5);

  // Test Invalid UTF8.
  // The check has to be "!= 1": Apple character counting seems to return 0
  // when the input is invalid UTF8, while ICU will treat the invalid codepoint
  // as 3 separate bytes.
  const UnicodeText invalid_utf8 =
      UTF8ToUnicodeText("\xed\xa0\x80", /*do_copy=*/false);
  EXPECT_NE(unilib_->Length(invalid_utf8).ValueOrDie(), 1);
}
546 
547 }  // namespace test_internal
548 }  // namespace libtextclassifier3
549