1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/tokenizer.h"
18 
19 #include <vector>
20 
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23 
24 namespace libtextclassifier3 {
25 namespace {
26 
27 using testing::ElementsAreArray;
28 
29 class TestingTokenizer : public Tokenizer {
30  public:
TestingTokenizer(const TokenizationType type,const UniLib * unilib,const std::vector<const TokenizationCodepointRange * > & codepoint_ranges,const std::vector<const CodepointRange * > & internal_tokenizer_codepoint_ranges,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)31   TestingTokenizer(
32       const TokenizationType type, const UniLib* unilib,
33       const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
34       const std::vector<const CodepointRange*>&
35           internal_tokenizer_codepoint_ranges,
36       const bool split_on_script_change,
37       const bool icu_preserve_whitespace_tokens)
38       : Tokenizer(type, unilib, codepoint_ranges,
39                   internal_tokenizer_codepoint_ranges, split_on_script_change,
40                   icu_preserve_whitespace_tokens) {}
41 
42   using Tokenizer::FindTokenizationRange;
43 };
44 
45 class TestingTokenizerProxy {
46  public:
TestingTokenizerProxy(TokenizationType type,const std::vector<TokenizationCodepointRangeT> & codepoint_range_configs,const std::vector<CodepointRangeT> & internal_codepoint_range_configs,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)47   TestingTokenizerProxy(
48       TokenizationType type,
49       const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
50       const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
51       const bool split_on_script_change,
52       const bool icu_preserve_whitespace_tokens)
53       : INIT_UNILIB_FOR_TESTING(unilib_) {
54     const int num_configs = codepoint_range_configs.size();
55     std::vector<const TokenizationCodepointRange*> configs_fb;
56     configs_fb.reserve(num_configs);
57     const int num_internal_configs = internal_codepoint_range_configs.size();
58     std::vector<const CodepointRange*> internal_configs_fb;
59     internal_configs_fb.reserve(num_internal_configs);
60     buffers_.reserve(num_configs + num_internal_configs);
61     for (int i = 0; i < num_configs; i++) {
62       flatbuffers::FlatBufferBuilder builder;
63       builder.Finish(CreateTokenizationCodepointRange(
64           builder, &codepoint_range_configs[i]));
65       buffers_.push_back(builder.Release());
66       configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
67           buffers_.back().data()));
68     }
69     for (int i = 0; i < num_internal_configs; i++) {
70       flatbuffers::FlatBufferBuilder builder;
71       builder.Finish(
72           CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
73       buffers_.push_back(builder.Release());
74       internal_configs_fb.push_back(
75           flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
76     }
77     tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
78         type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
79         icu_preserve_whitespace_tokens));
80   }
81 
TestFindTokenizationRole(int c) const82   TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
83     const TokenizationCodepointRangeT* range =
84         tokenizer_->FindTokenizationRange(c);
85     if (range != nullptr) {
86       return range->role;
87     } else {
88       return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
89     }
90   }
91 
Tokenize(const std::string & utf8_text) const92   std::vector<Token> Tokenize(const std::string& utf8_text) const {
93     return tokenizer_->Tokenize(utf8_text);
94   }
95 
96  private:
97   UniLib unilib_;
98   std::vector<flatbuffers::DetachedBuffer> buffers_;
99   std::unique_ptr<TestingTokenizer> tokenizer_;
100 };
101 
// Verifies that codepoint-to-role lookup honors half-open [start, end)
// range semantics for each configured group.
TEST(TokenizerTest, FindTokenizationRange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 10;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 1234;
  config->end = 12345;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {}, /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);

  // Test hits to the first group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit to the second group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test hits to the third group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit outside.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
}
158 
// Verifies basic whitespace-separated tokenization with byte-offset spans.
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  // Space character.
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");

  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}
179 
// Verifies that tokens are additionally split wherever the script id of
// consecutive codepoints changes, when split_on_script_change is enabled.
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Latin.
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false);
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}
214 
TEST(TokenizerTest,TokenizeComplex)215 TEST(TokenizerTest, TokenizeComplex) {
216   std::vector<TokenizationCodepointRangeT> configs;
217   TokenizationCodepointRangeT* config;
218 
219   // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
220   // Latin - cyrilic.
221   //   0000..007F; Basic Latin
222   //   0080..00FF; Latin-1 Supplement
223   //   0100..017F; Latin Extended-A
224   //   0180..024F; Latin Extended-B
225   //   0250..02AF; IPA Extensions
226   //   02B0..02FF; Spacing Modifier Letters
227   //   0300..036F; Combining Diacritical Marks
228   //   0370..03FF; Greek and Coptic
229   //   0400..04FF; Cyrillic
230   //   0500..052F; Cyrillic Supplement
231   //   0530..058F; Armenian
232   //   0590..05FF; Hebrew
233   //   0600..06FF; Arabic
234   //   0700..074F; Syriac
235   //   0750..077F; Arabic Supplement
236   configs.emplace_back();
237   config = &configs.back();
238   config->start = 0;
239   config->end = 32;
240   config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
241   configs.emplace_back();
242   config = &configs.back();
243   config->start = 32;
244   config->end = 33;
245   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
246   configs.emplace_back();
247   config = &configs.back();
248   config->start = 33;
249   config->end = 0x77F + 1;
250   config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
251 
252   // CJK
253   // 2E80..2EFF; CJK Radicals Supplement
254   // 3000..303F; CJK Symbols and Punctuation
255   // 3040..309F; Hiragana
256   // 30A0..30FF; Katakana
257   // 3100..312F; Bopomofo
258   // 3130..318F; Hangul Compatibility Jamo
259   // 3190..319F; Kanbun
260   // 31A0..31BF; Bopomofo Extended
261   // 31C0..31EF; CJK Strokes
262   // 31F0..31FF; Katakana Phonetic Extensions
263   // 3200..32FF; Enclosed CJK Letters and Months
264   // 3300..33FF; CJK Compatibility
265   // 3400..4DBF; CJK Unified Ideographs Extension A
266   // 4DC0..4DFF; Yijing Hexagram Symbols
267   // 4E00..9FFF; CJK Unified Ideographs
268   // A000..A48F; Yi Syllables
269   // A490..A4CF; Yi Radicals
270   // A4D0..A4FF; Lisu
271   // A500..A63F; Vai
272   // F900..FAFF; CJK Compatibility Ideographs
273   // FE30..FE4F; CJK Compatibility Forms
274   // 20000..2A6DF; CJK Unified Ideographs Extension B
275   // 2A700..2B73F; CJK Unified Ideographs Extension C
276   // 2B740..2B81F; CJK Unified Ideographs Extension D
277   // 2B820..2CEAF; CJK Unified Ideographs Extension E
278   // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
279   // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
280   configs.emplace_back();
281   config = &configs.back();
282   config->start = 0x2E80;
283   config->end = 0x2EFF + 1;
284   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
285   configs.emplace_back();
286   config = &configs.back();
287   config->start = 0x3000;
288   config->end = 0xA63F + 1;
289   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
290   configs.emplace_back();
291   config = &configs.back();
292   config->start = 0xF900;
293   config->end = 0xFAFF + 1;
294   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
295   configs.emplace_back();
296   config = &configs.back();
297   config->start = 0xFE30;
298   config->end = 0xFE4F + 1;
299   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
300   configs.emplace_back();
301   config = &configs.back();
302   config->start = 0x20000;
303   config->end = 0x2A6DF + 1;
304   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
305   configs.emplace_back();
306   config = &configs.back();
307   config->start = 0x2A700;
308   config->end = 0x2B73F + 1;
309   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
310   configs.emplace_back();
311   config = &configs.back();
312   config->start = 0x2B740;
313   config->end = 0x2B81F + 1;
314   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
315   configs.emplace_back();
316   config = &configs.back();
317   config->start = 0x2B820;
318   config->end = 0x2CEAF + 1;
319   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
320   configs.emplace_back();
321   config = &configs.back();
322   config->start = 0x2CEB0;
323   config->end = 0x2EBEF + 1;
324   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
325   configs.emplace_back();
326   config = &configs.back();
327   config->start = 0x2F800;
328   config->end = 0x2FA1F + 1;
329   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
330 
331   // Thai.
332   // 0E00..0E7F; Thai
333   configs.emplace_back();
334   config = &configs.back();
335   config->start = 0x0E00;
336   config->end = 0x0E7F + 1;
337   config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
338 
339   TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
340                                   {},
341                                   /*split_on_script_change=*/false,
342                                   /*icu_preserve_whitespace_tokens=*/false);
343   std::vector<Token> tokens;
344 
345   tokens = tokenizer.Tokenize(
346       "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
347   EXPECT_EQ(tokens.size(), 30);
348 
349   tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
350   // clang-format off
351   EXPECT_THAT(
352       tokens,
353       ElementsAreArray({Token("問", 0, 1),
354                         Token("少", 1, 2),
355                         Token("目", 2, 3),
356                         Token("hello", 4, 9),
357                         Token("木", 10, 11),
358                         Token("輸", 11, 12),
359                         Token("ย", 12, 13),
360                         Token("า", 13, 14),
361                         Token("ม", 14, 15),
362                         Token("き", 15, 16),
363                         Token("ゃ", 16, 17)}));
364   // clang-format on
365 }
366 
367 #ifdef TC3_TEST_ICU
// Verifies ICU-based tokenization of Thai text (no codepoint-range configs;
// segmentation is delegated to ICU).
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token("สมเด็จ", 6, 12),
                                Token("พระ", 12, 15),
                                Token("ปร", 15, 17),
                                Token("มิ", 17, 19)}));
  // clang-format on
}
382 
// Verifies that ICU tokenization emits whitespace tokens when
// icu_preserve_whitespace_tokens is enabled.
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token(" ", 6, 7),
                                Token("สมเด็จ", 7, 13),
                                Token(" ", 13, 14),
                                Token("พระ", 14, 17),
                                Token(" ", 17, 18),
                                Token("ปร", 18, 20),
                                Token(" ", 20, 21),
                                Token("มิ", 21, 23)}));
  // clang-format on
}
401 
// Verifies MIXED tokenization: codepoints covered by the internal ranges
// (here, roughly Latin) use the internal tokenizer; everything else falls
// back to ICU.
TEST(TokenizerTest, MixedTokenize) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  std::vector<CodepointRangeT> internal_configs;
  CodepointRangeT* internal_config;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 0;
  internal_config->end = 128;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 128;
  internal_config->end = 256;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 256;
  internal_config->end = 384;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 384;
  internal_config->end = 592;

  TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
                                  internal_configs,
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);

  std::vector<Token> tokens = tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("こんにちは", 0, 5),
                                Token("Japanese-ląnguagę", 5, 22),
                                Token("text", 23, 27),
                                Token("世界", 28, 30),
                                Token("http://www.google.com/", 31, 53)}));
  // clang-format on
}
451 
// Verifies that the internal tokenizer splits on script boundaries only when
// split_on_script_change is set; otherwise the mixed-script run stays one
// token.
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 256;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/false,
                                    /*icu_preserve_whitespace_tokens=*/false);

    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
  }

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/true,
                                    /*icu_preserve_whitespace_tokens=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
                                  Token("웹사이트", 7, 11)}));
  }
}
482 #endif
483 
484 }  // namespace
485 }  // namespace libtextclassifier3
486