1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "smartselect/tokenizer.h"
18
19 #include "gmock/gmock.h"
20 #include "gtest/gtest.h"
21
22 namespace libtextclassifier {
23 namespace {
24
25 using testing::ElementsAreArray;
26
27 class TestingTokenizer : public Tokenizer {
28 public:
TestingTokenizer(const std::vector<TokenizationCodepointRange> & codepoint_range_configs)29 explicit TestingTokenizer(
30 const std::vector<TokenizationCodepointRange>& codepoint_range_configs)
31 : Tokenizer(codepoint_range_configs) {}
32
TestFindTokenizationRole(int c) const33 TokenizationCodepointRange::Role TestFindTokenizationRole(int c) const {
34 return FindTokenizationRole(c);
35 }
36 };
37
// Configures three disjoint codepoint ranges and checks the role reported for
// codepoints at the start of, inside, at the end value of, and outside each
// range.
TEST(TokenizerTest, FindTokenizationRole) {
  const TokenizationCodepointRange::Role kDefault =
      TokenizationCodepointRange::DEFAULT_ROLE;
  const TokenizationCodepointRange::Role kTokenSeparator =
      TokenizationCodepointRange::TOKEN_SEPARATOR;
  const TokenizationCodepointRange::Role kWhitespaceSeparator =
      TokenizationCodepointRange::WHITESPACE_SEPARATOR;

  std::vector<TokenizationCodepointRange> ranges;
  // Appends one range with the given start, end and role to `ranges`.
  const auto add_range = [&ranges](int start, int end,
                                   TokenizationCodepointRange::Role role) {
    ranges.emplace_back();
    TokenizationCodepointRange& range = ranges.back();
    range.set_start(start);
    range.set_end(end);
    range.set_role(role);
  };
  add_range(0, 10, kTokenSeparator);
  add_range(32, 33, kWhitespaceSeparator);
  add_range(1234, 12345, kTokenSeparator);

  TestingTokenizer tokenizer(ranges);

  // First range: 0 and 5 match; the end value 10 is expected not to.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10), kDefault);

  // Second range: only 32 matches; its neighbours 31 and 33 do not.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31), kDefault);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32), kWhitespaceSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33), kDefault);

  // Third range: 1234 and 12344 match; 1233 and the end value 12345 do not.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233), kDefault);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344), kTokenSeparator);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345), kDefault);

  // A codepoint that lies in none of the configured ranges.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99), kDefault);
}
92
// With the space character as the only configured range, "Hello world!" is
// expected to come back as the two tokens on either side of the space.
TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRange> ranges;
  ranges.emplace_back();
  // Codepoint 32 is the space character.
  TokenizationCodepointRange& space_range = ranges.back();
  space_range.set_start(32);
  space_range.set_end(33);
  space_range.set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  TestingTokenizer tokenizer(ranges);

  const std::vector<Token> expected = {Token("Hello", 0, 5),
                                       Token("world!", 6, 12)};
  EXPECT_THAT(tokenizer.Tokenize("Hello world!"), ElementsAreArray(expected));
}
110
// Tokenizes mixed-script text with a configuration built from Unicode block
// ranges: space is a WHITESPACE_SEPARATOR, the CJK and Thai blocks are
// TOKEN_SEPARATORs, and the remaining codepoints up to 0x77F use DEFAULT_ROLE.
TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRange> configs;

  // Appends one range to `configs`. Callers pass `end` as the last codepoint
  // of the Unicode block plus one. The order of the calls below is the order
  // in which the ranges are handed to the Tokenizer.
  const auto add_range = [&configs](int start, int end,
                                    TokenizationCodepointRange::Role role) {
    configs.emplace_back();
    TokenizationCodepointRange& range = configs.back();
    range.set_start(start);
    range.set_end(end);
    range.set_role(role);
  };

  // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  // Latin, Cyrillic and the neighbouring blocks up to Arabic Supplement.
  // 0000..007F; Basic Latin
  // 0080..00FF; Latin-1 Supplement
  // 0100..017F; Latin Extended-A
  // 0180..024F; Latin Extended-B
  // 0250..02AF; IPA Extensions
  // 02B0..02FF; Spacing Modifier Letters
  // 0300..036F; Combining Diacritical Marks
  // 0370..03FF; Greek and Coptic
  // 0400..04FF; Cyrillic
  // 0500..052F; Cyrillic Supplement
  // 0530..058F; Armenian
  // 0590..05FF; Hebrew
  // 0600..06FF; Arabic
  // 0700..074F; Syriac
  // 0750..077F; Arabic Supplement
  // The span is split around the space character (32), which gets its own
  // WHITESPACE_SEPARATOR range.
  add_range(0, 32, TokenizationCodepointRange::DEFAULT_ROLE);
  add_range(32, 33, TokenizationCodepointRange::WHITESPACE_SEPARATOR);
  add_range(33, 0x77F + 1, TokenizationCodepointRange::DEFAULT_ROLE);

  // CJK
  // 2E80..2EFF; CJK Radicals Supplement
  // 3000..303F; CJK Symbols and Punctuation
  // 3040..309F; Hiragana
  // 30A0..30FF; Katakana
  // 3100..312F; Bopomofo
  // 3130..318F; Hangul Compatibility Jamo
  // 3190..319F; Kanbun
  // 31A0..31BF; Bopomofo Extended
  // 31C0..31EF; CJK Strokes
  // 31F0..31FF; Katakana Phonetic Extensions
  // 3200..32FF; Enclosed CJK Letters and Months
  // 3300..33FF; CJK Compatibility
  // 3400..4DBF; CJK Unified Ideographs Extension A
  // 4DC0..4DFF; Yijing Hexagram Symbols
  // 4E00..9FFF; CJK Unified Ideographs
  // A000..A48F; Yi Syllables
  // A490..A4CF; Yi Radicals
  // A4D0..A4FF; Lisu
  // A500..A63F; Vai
  // F900..FAFF; CJK Compatibility Ideographs
  // FE30..FE4F; CJK Compatibility Forms
  // 20000..2A6DF; CJK Unified Ideographs Extension B
  // 2A700..2B73F; CJK Unified Ideographs Extension C
  // 2B740..2B81F; CJK Unified Ideographs Extension D
  // 2B820..2CEAF; CJK Unified Ideographs Extension E
  // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
  // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
  add_range(0x2E80, 0x2EFF + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  // Blocks 3000..A63F are contiguous in the list above, so one range covers
  // all of them.
  add_range(0x3000, 0xA63F + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0xF900, 0xFAFF + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0xFE30, 0xFE4F + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0x20000, 0x2A6DF + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0x2A700, 0x2B73F + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0x2B740, 0x2B81F + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0x2B820, 0x2CEAF + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0x2CEB0, 0x2EBEF + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);
  add_range(0x2F800, 0x2FA1F + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);

  // Thai.
  // 0E00..0E7F; Thai
  add_range(0x0E00, 0x0E7F + 1, TokenizationCodepointRange::TOKEN_SEPARATOR);

  Tokenizer tokenizer(configs);
  std::vector<Token> tokens;

  // The input holds 30 codepoints, all inside the 3000..A63F range, and one
  // token per codepoint is expected. The unsigned literal keeps the
  // comparison with size() free of signed/unsigned mismatch warnings.
  tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30u);

  // Mixed input: CJK, Thai and kana codepoints are expected as one token each
  // and "hello" as a single token between the spaces. The expected offsets
  // count codepoints, not bytes ("hello" starts at 4, after three CJK
  // characters and a space).
  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}
259
260 } // namespace
261 } // namespace libtextclassifier
262