/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #include "smartselect/tokenizer.h"
18 
19 #include "gmock/gmock.h"
20 #include "gtest/gtest.h"
21 
22 namespace libtextclassifier {
23 namespace {
24 
25 using testing::ElementsAreArray;
26 
27 class TestingTokenizer : public Tokenizer {
28  public:
TestingTokenizer(const std::vector<TokenizationCodepointRange> & codepoint_range_configs)29   explicit TestingTokenizer(
30       const std::vector<TokenizationCodepointRange>& codepoint_range_configs)
31       : Tokenizer(codepoint_range_configs) {}
32 
TestFindTokenizationRole(int c) const33   TokenizationCodepointRange::Role TestFindTokenizationRole(int c) const {
34     return FindTokenizationRole(c);
35   }
36 };
37 
// Checks role lookup for single codepoints against three disjoint ranges.
// The expectations treat `start` as inclusive and `end` as exclusive, and
// expect DEFAULT_ROLE for any codepoint that no configured range covers.
TEST(TokenizerTest, FindTokenizationRole) {
  std::vector<TokenizationCodepointRange> configs;

  // Appends one range with the given bounds and role to `configs`.
  const auto add_range = [&configs](int start, int end,
                                    TokenizationCodepointRange::Role role) {
    configs.emplace_back();
    TokenizationCodepointRange& range = configs.back();
    range.set_start(start);
    range.set_end(end);
    range.set_role(role);
  };

  add_range(0, 10, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(32, 33, TokenizationCodepointRange::WHITESPACE_SEPARATOR);
  add_range(1234, 12345, TokenizationCodepointRange::TOKEN_SEPARATOR);

  TestingTokenizer tokenizer(configs);

  // First group: start and interior hit, the end value itself does not.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange::DEFAULT_ROLE);

  // Second group: a one-codepoint range, probed on both sides.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange::DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange::WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange::DEFAULT_ROLE);

  // Third group: just below, at start, last covered value, and at end.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange::DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange::DEFAULT_ROLE);

  // A codepoint that falls in the gap between the configured ranges.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange::DEFAULT_ROLE);
}
92 
// With only the space character configured as WHITESPACE_SEPARATOR, the input
// is expected to split into the two words around the space; the space itself
// is not part of any expected token.
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRange> configs;

  // Single range covering just the space character (codepoint 32).
  configs.emplace_back();
  TokenizationCodepointRange& space_range = configs.back();
  space_range.set_start(32);
  space_range.set_end(33);
  space_range.set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  TestingTokenizer tokenizer(configs);
  std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");

  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}
110 
TEST(TokenizerTest,TokenizeComplex)111 TEST(TokenizerTest, TokenizeComplex) {
112   std::vector<TokenizationCodepointRange> configs;
113   TokenizationCodepointRange* config;
114 
115   // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
116   // Latin - cyrilic.
117   //   0000..007F; Basic Latin
118   //   0080..00FF; Latin-1 Supplement
119   //   0100..017F; Latin Extended-A
120   //   0180..024F; Latin Extended-B
121   //   0250..02AF; IPA Extensions
122   //   02B0..02FF; Spacing Modifier Letters
123   //   0300..036F; Combining Diacritical Marks
124   //   0370..03FF; Greek and Coptic
125   //   0400..04FF; Cyrillic
126   //   0500..052F; Cyrillic Supplement
127   //   0530..058F; Armenian
128   //   0590..05FF; Hebrew
129   //   0600..06FF; Arabic
130   //   0700..074F; Syriac
131   //   0750..077F; Arabic Supplement
132   configs.emplace_back();
133   config = &configs.back();
134   config->set_start(0);
135   config->set_end(32);
136   config->set_role(TokenizationCodepointRange::DEFAULT_ROLE);
137   configs.emplace_back();
138   config = &configs.back();
139   config->set_start(32);
140   config->set_end(33);
141   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
142   configs.emplace_back();
143   config = &configs.back();
144   config->set_start(33);
145   config->set_end(0x77F + 1);
146   config->set_role(TokenizationCodepointRange::DEFAULT_ROLE);
147 
148   // CJK
149   // 2E80..2EFF; CJK Radicals Supplement
150   // 3000..303F; CJK Symbols and Punctuation
151   // 3040..309F; Hiragana
152   // 30A0..30FF; Katakana
153   // 3100..312F; Bopomofo
154   // 3130..318F; Hangul Compatibility Jamo
155   // 3190..319F; Kanbun
156   // 31A0..31BF; Bopomofo Extended
157   // 31C0..31EF; CJK Strokes
158   // 31F0..31FF; Katakana Phonetic Extensions
159   // 3200..32FF; Enclosed CJK Letters and Months
160   // 3300..33FF; CJK Compatibility
161   // 3400..4DBF; CJK Unified Ideographs Extension A
162   // 4DC0..4DFF; Yijing Hexagram Symbols
163   // 4E00..9FFF; CJK Unified Ideographs
164   // A000..A48F; Yi Syllables
165   // A490..A4CF; Yi Radicals
166   // A4D0..A4FF; Lisu
167   // A500..A63F; Vai
168   // F900..FAFF; CJK Compatibility Ideographs
169   // FE30..FE4F; CJK Compatibility Forms
170   // 20000..2A6DF; CJK Unified Ideographs Extension B
171   // 2A700..2B73F; CJK Unified Ideographs Extension C
172   // 2B740..2B81F; CJK Unified Ideographs Extension D
173   // 2B820..2CEAF; CJK Unified Ideographs Extension E
174   // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
175   // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
176   configs.emplace_back();
177   config = &configs.back();
178   config->set_start(0x2E80);
179   config->set_end(0x2EFF + 1);
180   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
181   configs.emplace_back();
182   config = &configs.back();
183   config->set_start(0x3000);
184   config->set_end(0xA63F + 1);
185   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
186   configs.emplace_back();
187   config = &configs.back();
188   config->set_start(0xF900);
189   config->set_end(0xFAFF + 1);
190   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
191   configs.emplace_back();
192   config = &configs.back();
193   config->set_start(0xFE30);
194   config->set_end(0xFE4F + 1);
195   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
196   configs.emplace_back();
197   config = &configs.back();
198   config->set_start(0x20000);
199   config->set_end(0x2A6DF + 1);
200   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
201   configs.emplace_back();
202   config = &configs.back();
203   config->set_start(0x2A700);
204   config->set_end(0x2B73F + 1);
205   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
206   configs.emplace_back();
207   config = &configs.back();
208   config->set_start(0x2B740);
209   config->set_end(0x2B81F + 1);
210   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
211   configs.emplace_back();
212   config = &configs.back();
213   config->set_start(0x2B820);
214   config->set_end(0x2CEAF + 1);
215   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
216   configs.emplace_back();
217   config = &configs.back();
218   config->set_start(0x2CEB0);
219   config->set_end(0x2EBEF + 1);
220   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
221   configs.emplace_back();
222   config = &configs.back();
223   config->set_start(0x2F800);
224   config->set_end(0x2FA1F + 1);
225   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
226 
227   // Thai.
228   // 0E00..0E7F; Thai
229   configs.emplace_back();
230   config = &configs.back();
231   config->set_start(0x0E00);
232   config->set_end(0x0E7F + 1);
233   config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
234 
235   Tokenizer tokenizer(configs);
236   std::vector<Token> tokens;
237 
238   tokens = tokenizer.Tokenize(
239       "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
240   EXPECT_EQ(tokens.size(), 30);
241 
242   tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
243   // clang-format off
244   EXPECT_THAT(
245       tokens,
246       ElementsAreArray({Token("問", 0, 1),
247                         Token("少", 1, 2),
248                         Token("目", 2, 3),
249                         Token("hello", 4, 9),
250                         Token("木", 10, 11),
251                         Token("輸", 11, 12),
252                         Token("ย", 12, 13),
253                         Token("า", 13, 14),
254                         Token("ม", 14, 15),
255                         Token("き", 15, 16),
256                         Token("ゃ", 16, 17)}));
257   // clang-format on
258 }
259 
260 }  // namespace
261 }  // namespace libtextclassifier
262