1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/tokenizer.h"
18
19 #include <vector>
20
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23
24 namespace libtextclassifier3 {
25 namespace {
26
27 using testing::ElementsAreArray;
28
// Tokenizer subclass used only by tests: it forwards construction unchanged
// to Tokenizer and widens access to the inherited FindTokenizationRange so
// the codepoint-range lookup can be probed directly.
class TestingTokenizer : public Tokenizer {
 public:
  // All arguments are passed through to the Tokenizer base-class constructor
  // verbatim.
  TestingTokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens)
      : Tokenizer(type, unilib, codepoint_ranges,
                  internal_tokenizer_codepoint_ranges, split_on_script_change,
                  icu_preserve_whitespace_tokens) {}

  // Re-exposes the inherited (presumably non-public) lookup for tests.
  using Tokenizer::FindTokenizationRange;
};
44
45 class TestingTokenizerProxy {
46 public:
TestingTokenizerProxy(TokenizationType type,const std::vector<TokenizationCodepointRangeT> & codepoint_range_configs,const std::vector<CodepointRangeT> & internal_codepoint_range_configs,const bool split_on_script_change,const bool icu_preserve_whitespace_tokens)47 TestingTokenizerProxy(
48 TokenizationType type,
49 const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
50 const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
51 const bool split_on_script_change,
52 const bool icu_preserve_whitespace_tokens)
53 : INIT_UNILIB_FOR_TESTING(unilib_) {
54 const int num_configs = codepoint_range_configs.size();
55 std::vector<const TokenizationCodepointRange*> configs_fb;
56 configs_fb.reserve(num_configs);
57 const int num_internal_configs = internal_codepoint_range_configs.size();
58 std::vector<const CodepointRange*> internal_configs_fb;
59 internal_configs_fb.reserve(num_internal_configs);
60 buffers_.reserve(num_configs + num_internal_configs);
61 for (int i = 0; i < num_configs; i++) {
62 flatbuffers::FlatBufferBuilder builder;
63 builder.Finish(CreateTokenizationCodepointRange(
64 builder, &codepoint_range_configs[i]));
65 buffers_.push_back(builder.Release());
66 configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
67 buffers_.back().data()));
68 }
69 for (int i = 0; i < num_internal_configs; i++) {
70 flatbuffers::FlatBufferBuilder builder;
71 builder.Finish(
72 CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
73 buffers_.push_back(builder.Release());
74 internal_configs_fb.push_back(
75 flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
76 }
77 tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
78 type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
79 icu_preserve_whitespace_tokens));
80 }
81
TestFindTokenizationRole(int c) const82 TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
83 const TokenizationCodepointRangeT* range =
84 tokenizer_->FindTokenizationRange(c);
85 if (range != nullptr) {
86 return range->role;
87 } else {
88 return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
89 }
90 }
91
Tokenize(const std::string & utf8_text) const92 std::vector<Token> Tokenize(const std::string& utf8_text) const {
93 return tokenizer_->Tokenize(utf8_text);
94 }
95
96 private:
97 UniLib unilib_;
98 std::vector<flatbuffers::DetachedBuffer> buffers_;
99 std::unique_ptr<TestingTokenizer> tokenizer_;
100 };
101
TEST(TokenizerTest, FindTokenizationRange) {
  std::vector<TokenizationCodepointRangeT> configs;
  // Appends a [start, end) codepoint range with the given role.
  auto add_range = [&configs](int start, int end,
                              TokenizationCodepointRange_::Role role) {
    configs.emplace_back();
    TokenizationCodepointRangeT& config = configs.back();
    config.start = start;
    config.end = end;
    config.role = role;
  };

  add_range(0, 10, TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(32, 33, TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  add_range(1234, 12345, TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {}, /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);

  // First range [0, 10): inside hits, the exclusive end does not.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Second range [32, 33): single-codepoint range, both neighbors miss.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Third range [1234, 12345): boundary behavior on both sides.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // A codepoint between the configured ranges falls back to the default.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
}
158
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  TokenizationCodepointRangeT& space_config = configs.back();
  // Only the ASCII space (U+0020) acts as a whitespace separator.
  space_config.start = 32;
  space_config.end = 33;
  space_config.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);

  const std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");
  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}
179
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  // Appends a [start, end) codepoint range with the given role, tagged with
  // the given script id.
  auto add_range = [&configs](int start, int end,
                              TokenizationCodepointRange_::Role role,
                              int script_id) {
    configs.emplace_back();
    TokenizationCodepointRangeT& config = configs.back();
    config.start = start;
    config.end = end;
    config.role = role;
    config.script_id = script_id;
  };

  // Latin.
  add_range(0, 32, TokenizationCodepointRange_::Role_DEFAULT_ROLE, 1);
  add_range(32, 33, TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR, 1);
  add_range(33, 0x77F + 1, TokenizationCodepointRange_::Role_DEFAULT_ROLE, 1);

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false);
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}
214
TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRangeT> configs;
  // Appends a [start, end) codepoint range with the given role.
  auto add_range = [&configs](int start, int end,
                              TokenizationCodepointRange_::Role role) {
    configs.emplace_back();
    TokenizationCodepointRangeT& config = configs.back();
    config.start = start;
    config.end = end;
    config.role = role;
  };

  // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  // Latin - cyrilic.
  //   0000..007F; Basic Latin
  //   0080..00FF; Latin-1 Supplement
  //   0100..017F; Latin Extended-A
  //   0180..024F; Latin Extended-B
  //   0250..02AF; IPA Extensions
  //   02B0..02FF; Spacing Modifier Letters
  //   0300..036F; Combining Diacritical Marks
  //   0370..03FF; Greek and Coptic
  //   0400..04FF; Cyrillic
  //   0500..052F; Cyrillic Supplement
  //   0530..058F; Armenian
  //   0590..05FF; Hebrew
  //   0600..06FF; Arabic
  //   0700..074F; Syriac
  //   0750..077F; Arabic Supplement
  add_range(0, 32, TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  add_range(32, 33, TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  add_range(33, 0x77F + 1, TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // CJK blocks: every codepoint is its own token.
  //   2E80..2EFF; CJK Radicals Supplement
  //   3000..303F; CJK Symbols and Punctuation
  //   3040..309F; Hiragana
  //   30A0..30FF; Katakana
  //   3100..312F; Bopomofo
  //   3130..318F; Hangul Compatibility Jamo
  //   3190..319F; Kanbun
  //   31A0..31BF; Bopomofo Extended
  //   31C0..31EF; CJK Strokes
  //   31F0..31FF; Katakana Phonetic Extensions
  //   3200..32FF; Enclosed CJK Letters and Months
  //   3300..33FF; CJK Compatibility
  //   3400..4DBF; CJK Unified Ideographs Extension A
  //   4DC0..4DFF; Yijing Hexagram Symbols
  //   4E00..9FFF; CJK Unified Ideographs
  //   A000..A48F; Yi Syllables
  //   A490..A4CF; Yi Radicals
  //   A4D0..A4FF; Lisu
  //   A500..A63F; Vai
  //   F900..FAFF; CJK Compatibility Ideographs
  //   FE30..FE4F; CJK Compatibility Forms
  //   20000..2A6DF; CJK Unified Ideographs Extension B
  //   2A700..2B73F; CJK Unified Ideographs Extension C
  //   2B740..2B81F; CJK Unified Ideographs Extension D
  //   2B820..2CEAF; CJK Unified Ideographs Extension E
  //   2CEB0..2EBEF; CJK Unified Ideographs Extension F
  //   2F800..2FA1F; CJK Compatibility Ideographs Supplement
  add_range(0x2E80, 0x2EFF + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x3000, 0xA63F + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0xF900, 0xFAFF + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0xFE30, 0xFE4F + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x20000, 0x2A6DF + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x2A700, 0x2B73F + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x2B740, 0x2B81F + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x2B820, 0x2CEAF + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x2CEB0, 0x2EBEF + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  add_range(0x2F800, 0x2FA1F + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);

  // Thai.
  //   0E00..0E7F; Thai
  add_range(0x0E00, 0x0E7F + 1,
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  std::vector<Token> tokens;

  // Every CJK codepoint becomes its own token.
  tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30);

  // Mixed CJK / Latin / Thai / Kana input.
  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}
366
367 #ifdef TC3_TEST_ICU
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);
  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token("สมเด็จ", 6, 12),
                                       Token("พระ", 12, 15),
                                       Token("ปร", 15, 17),
                                       Token("มิ", 17, 19)};
  // clang-format on
  ASSERT_EQ(tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ"), expected);
}
382
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true);
  // With icu_preserve_whitespace_tokens the separators show up as tokens.
  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token(" ", 6, 7),
                                       Token("สมเด็จ", 7, 13),
                                       Token(" ", 13, 14),
                                       Token("พระ", 14, 17),
                                       Token(" ", 17, 18),
                                       Token("ปร", 18, 20),
                                       Token(" ", 20, 21),
                                       Token("มิ", 21, 23)};
  // clang-format on
  ASSERT_EQ(tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ"), expected);
}
401
TEST(TokenizerTest, MixedTokenize) {
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  TokenizationCodepointRangeT& space_config = configs.back();
  // ASCII space is the only whitespace separator.
  space_config.start = 32;
  space_config.end = 33;
  space_config.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  std::vector<CodepointRangeT> internal_configs;
  // Appends a [start, end) range handled by the internal tokenizer; anything
  // outside these ranges falls through to ICU.
  auto add_internal_range = [&internal_configs](int start, int end) {
    internal_configs.emplace_back();
    CodepointRangeT& internal_config = internal_configs.back();
    internal_config.start = start;
    internal_config.end = end;
  };
  add_internal_range(0, 128);
  add_internal_range(128, 256);
  add_internal_range(256, 384);
  add_internal_range(384, 592);

  TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
                                  internal_configs,
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false);

  const std::vector<Token> tokens = tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("こんにちは", 0, 5),
                                Token("Japanese-ląnguagę", 5, 22),
                                Token("text", 23, 27),
                                Token("世界", 28, 30),
                                Token("http://www.google.com/", 31, 53)}));
  // clang-format on
}
451
TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  configs.emplace_back();
  TokenizationCodepointRangeT& config = configs.back();
  config.start = 0;
  config.end = 256;
  config.role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  {
    // Without script-change splitting the whole input stays one token.
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/false,
                                    /*icu_preserve_whitespace_tokens=*/false);

    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
  }

  {
    // With splitting enabled, each script boundary starts a new token.
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/true,
                                    /*icu_preserve_whitespace_tokens=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
                                  Token("웹사이트", 7, 11)}));
  }
}
482 #endif
483
484 } // namespace
485 } // namespace libtextclassifier3
486