1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minikin/Hyphenator.h"
18 
19 #include <gtest/gtest.h>
20 
21 #include "FileUtils.h"
22 
23 #ifndef NELEM
24 #define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
25 #endif
26 
27 namespace minikin {
28 
// Hyphenation pattern files handed to readWholeFile() by the automatic-hyphenation tests.
const char* usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb";
const char* malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb";

// UTF-16 code units used to build the test words, listed in code point order.

// Hyphen-like and punctuation characters.
constexpr uint16_t HYPHEN_MINUS = 0x002D;
constexpr uint16_t SOFT_HYPHEN = 0x00AD;
constexpr uint16_t MIDDLE_DOT = 0x00B7;

// Representative letters (and one Arabic mark) from the scripts under test.
constexpr uint16_t GREEK_LOWER_ALPHA = 0x03B1;
constexpr uint16_t ARMENIAN_AYB = 0x0531;
constexpr uint16_t HEBREW_ALEF = 0x05D0;
constexpr uint16_t ARABIC_ALEF = 0x0627;
constexpr uint16_t ARABIC_BEH = 0x0628;
constexpr uint16_t ARABIC_ZWARAKAY = 0x0659;
constexpr uint16_t MALAYALAM_KA = 0x0D15;
constexpr uint16_t UCAS_E = 0x1401;

// General Punctuation block.
constexpr uint16_t HYPHEN = 0x2010;
constexpr uint16_t EN_DASH = 0x2013;
45 
46 typedef std::function<Hyphenator*(const uint8_t*, size_t, size_t, size_t, const std::string&)>
47         Generator;
48 
49 class HyphenatorTest : public testing::TestWithParam<Generator> {};
50 
51 INSTANTIATE_TEST_SUITE_P(HyphenatorInstantiation, HyphenatorTest,
52                          testing::Values(Hyphenator::loadBinary, Hyphenator::loadBinaryForRust),
__anonde0d95910102(const testing::TestParamInfo<HyphenatorTest::ParamType>& info) 53                          [](const testing::TestParamInfo<HyphenatorTest::ParamType>& info) {
54                              switch (info.index) {
55                                  case 0:
56                                      return "CXX";
57                                  case 1:
58                                      return "Rust";
59                                  default:
60                                      return "Unknown";
61                              }
62                          });
63 
64 // Simple test for US English. This tests "table", which happens to be the in the exceptions list.
TEST_P(HyphenatorTest,usEnglishAutomaticHyphenation)65 TEST_P(HyphenatorTest, usEnglishAutomaticHyphenation) {
66     std::vector<uint8_t> patternData = readWholeFile(usHyph);
67     Hyphenator* hyphenator = GetParam()(patternData.data(), patternData.size(), 2, 3, "en");
68     const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'};
69     std::vector<HyphenationType> result;
70     hyphenator->hyphenate(word, &result);
71     EXPECT_EQ((size_t)5, result.size());
72     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
73     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
74     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
75     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
76     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
77 }
78 
79 // Catalan l·l should break as l-/l
TEST_P(HyphenatorTest,catalanMiddleDot)80 TEST_P(HyphenatorTest, catalanMiddleDot) {
81     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "ca");
82     const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l'};
83     std::vector<HyphenationType> result;
84     hyphenator->hyphenate(word, &result);
85     EXPECT_EQ((size_t)5, result.size());
86     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
87     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
88     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
89     EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]);
90     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
91 }
92 
93 // Catalan l·l should not break if the word is too short.
TEST_P(HyphenatorTest,catalanMiddleDotShortWord)94 TEST_P(HyphenatorTest, catalanMiddleDotShortWord) {
95     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "ca");
96     const uint16_t word[] = {'l', MIDDLE_DOT, 'l'};
97     std::vector<HyphenationType> result;
98     hyphenator->hyphenate(word, &result);
99     EXPECT_EQ((size_t)3, result.size());
100     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
101     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
102     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
103 }
104 
105 // If we break on a hyphen in Polish, the hyphen should be repeated on the next line.
TEST_P(HyphenatorTest,polishHyphen)106 TEST_P(HyphenatorTest, polishHyphen) {
107     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "pl");
108     const uint16_t word[] = {'x', HYPHEN, 'y'};
109     std::vector<HyphenationType> result;
110     hyphenator->hyphenate(word, &result);
111     EXPECT_EQ((size_t)3, result.size());
112     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
113     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
114     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
115 }
116 
117 // If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation.
TEST_P(HyphenatorTest,polishHyphenButNonLatinWord)118 TEST_P(HyphenatorTest, polishHyphenButNonLatinWord) {
119     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "pl");
120     const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA};
121     std::vector<HyphenationType> result;
122     hyphenator->hyphenate(word, &result);
123     EXPECT_EQ((size_t)3, result.size());
124     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
125     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
126     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
127 }
128 
129 // Polish en dash doesn't repeat on next line (as far as we know), but just provides a break
130 // opportunity.
TEST_P(HyphenatorTest,polishEnDash)131 TEST_P(HyphenatorTest, polishEnDash) {
132     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "pl");
133     const uint16_t word[] = {'x', EN_DASH, 'y'};
134     std::vector<HyphenationType> result;
135     hyphenator->hyphenate(word, &result);
136     EXPECT_EQ((size_t)3, result.size());
137     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
138     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
139     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
140 }
141 
142 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
143 // Polish.)
TEST_P(HyphenatorTest,slovenianHyphen)144 TEST_P(HyphenatorTest, slovenianHyphen) {
145     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "sl");
146     const uint16_t word[] = {'x', HYPHEN, 'y'};
147     std::vector<HyphenationType> result;
148     hyphenator->hyphenate(word, &result);
149     EXPECT_EQ((size_t)3, result.size());
150     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
151     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
152     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
153 }
154 
155 // In Latin script text, soft hyphens should insert a visible hyphen if broken at.
TEST_P(HyphenatorTest,latinSoftHyphen)156 TEST_P(HyphenatorTest, latinSoftHyphen) {
157     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
158     const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
159     std::vector<HyphenationType> result;
160     hyphenator->hyphenate(word, &result);
161     EXPECT_EQ((size_t)3, result.size());
162     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
163     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
164     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
165 }
166 
167 // Soft hyphens at the beginning of a word are not useful in linebreaking.
TEST_P(HyphenatorTest,latinSoftHyphenStartingTheWord)168 TEST_P(HyphenatorTest, latinSoftHyphenStartingTheWord) {
169     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
170     const uint16_t word[] = {SOFT_HYPHEN, 'y'};
171     std::vector<HyphenationType> result;
172     hyphenator->hyphenate(word, &result);
173     EXPECT_EQ((size_t)2, result.size());
174     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
175     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
176 }
177 
178 // In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at.
TEST_P(HyphenatorTest,malayalamSoftHyphen)179 TEST_P(HyphenatorTest, malayalamSoftHyphen) {
180     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
181     const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA};
182     std::vector<HyphenationType> result;
183     hyphenator->hyphenate(word, &result);
184     EXPECT_EQ((size_t)3, result.size());
185     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
186     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
187     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
188 }
189 
190 // In automatically hyphenated Malayalam script text, we should not insert a visible hyphen.
TEST_P(HyphenatorTest,malayalamAutomaticHyphenation)191 TEST_P(HyphenatorTest, malayalamAutomaticHyphenation) {
192     std::vector<uint8_t> patternData = readWholeFile(malayalamHyph);
193     Hyphenator* hyphenator = GetParam()(patternData.data(), patternData.size(), 2, 2, "en");
194     const uint16_t word[] = {MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA};
195     std::vector<HyphenationType> result;
196     hyphenator->hyphenate(word, &result);
197     EXPECT_EQ((size_t)5, result.size());
198     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
199     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
200     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
201     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[3]);
202     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
203 }
204 
205 // In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at.
TEST_P(HyphenatorTest,aremenianSoftHyphen)206 TEST_P(HyphenatorTest, aremenianSoftHyphen) {
207     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
208     const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB};
209     std::vector<HyphenationType> result;
210     hyphenator->hyphenate(word, &result);
211     EXPECT_EQ((size_t)3, result.size());
212     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
213     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
214     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]);
215 }
216 
217 // In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now.
218 // We may need to change this to maqaf later.
TEST_P(HyphenatorTest,hebrewSoftHyphen)219 TEST_P(HyphenatorTest, hebrewSoftHyphen) {
220     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
221     const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF};
222     std::vector<HyphenationType> result;
223     hyphenator->hyphenate(word, &result);
224     EXPECT_EQ((size_t)3, result.size());
225     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
226     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
227     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
228 }
229 
230 // Soft hyphen between two Arabic letters that join should keep the joining
231 // behavior when broken across lines.
TEST_P(HyphenatorTest,arabicSoftHyphenConnecting)232 TEST_P(HyphenatorTest, arabicSoftHyphenConnecting) {
233     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
234     const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH};
235     std::vector<HyphenationType> result;
236     hyphenator->hyphenate(word, &result);
237     EXPECT_EQ((size_t)3, result.size());
238     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
239     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
240     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]);
241 }
242 
243 // Arabic letters may be joining on one side, but if it's the wrong side, we
244 // should use the normal hyphen.
TEST_P(HyphenatorTest,arabicSoftHyphenNonConnecting)245 TEST_P(HyphenatorTest, arabicSoftHyphenNonConnecting) {
246     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
247     const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH};
248     std::vector<HyphenationType> result;
249     hyphenator->hyphenate(word, &result);
250     EXPECT_EQ((size_t)3, result.size());
251     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
252     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
253     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
254 }
255 
256 // Skip transparent characters until you find a non-transparent one.
TEST_P(HyphenatorTest,arabicSoftHyphenSkipTransparents)257 TEST_P(HyphenatorTest, arabicSoftHyphenSkipTransparents) {
258     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
259     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
260     std::vector<HyphenationType> result;
261     hyphenator->hyphenate(word, &result);
262     EXPECT_EQ((size_t)5, result.size());
263     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
264     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
265     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
266     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]);
267     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
268 }
269 
270 // Skip transparent characters until you find a non-transparent one. If we get to one end without
271 // finding anything, we are still non-joining.
TEST_P(HyphenatorTest,arabicSoftHyphenTransparentsAtEnd)272 TEST_P(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) {
273     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
274     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY};
275     std::vector<HyphenationType> result;
276     hyphenator->hyphenate(word, &result);
277     EXPECT_EQ((size_t)4, result.size());
278     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
279     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
280     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
281     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]);
282 }
283 
284 // Skip transparent characters until you find a non-transparent one. If we get to one end without
285 // finding anything, we are still non-joining.
TEST_P(HyphenatorTest,arabicSoftHyphenTransparentsAtStart)286 TEST_P(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) {
287     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
288     const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
289     std::vector<HyphenationType> result;
290     hyphenator->hyphenate(word, &result);
291     EXPECT_EQ((size_t)4, result.size());
292     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
293     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
294     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
295     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
296 }
297 
298 // In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen.
TEST_P(HyphenatorTest,ucasSoftHyphen)299 TEST_P(HyphenatorTest, ucasSoftHyphen) {
300     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
301     const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E};
302     std::vector<HyphenationType> result;
303     hyphenator->hyphenate(word, &result);
304     EXPECT_EQ((size_t)3, result.size());
305     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
306     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
307     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
308 }
309 
310 // Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a
311 // little arbitrary, but let's test it anyway.
TEST_P(HyphenatorTest,mixedScriptSoftHyphen)312 TEST_P(HyphenatorTest, mixedScriptSoftHyphen) {
313     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
314     const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E};
315     std::vector<HyphenationType> result;
316     hyphenator->hyphenate(word, &result);
317     EXPECT_EQ((size_t)3, result.size());
318     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
319     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
320     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
321 }
322 
323 // Hard hyphens provide a breaking opportunity with nothing extra inserted.
TEST_P(HyphenatorTest,hardHyphen)324 TEST_P(HyphenatorTest, hardHyphen) {
325     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
326     const uint16_t word[] = {'x', HYPHEN, 'y'};
327     std::vector<HyphenationType> result;
328     hyphenator->hyphenate(word, &result);
329     EXPECT_EQ((size_t)3, result.size());
330     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
331     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
332     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
333 }
334 
335 // Hyphen-minuses also provide a breaking opportunity with nothing extra inserted.
TEST_P(HyphenatorTest,hyphenMinus)336 TEST_P(HyphenatorTest, hyphenMinus) {
337     Hyphenator* hyphenator = GetParam()(nullptr, 0, 2, 2, "en");
338     const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'};
339     std::vector<HyphenationType> result;
340     hyphenator->hyphenate(word, &result);
341     EXPECT_EQ((size_t)3, result.size());
342     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
343     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
344     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
345 }
346 
347 // If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break
348 // it at that point.
TEST_P(HyphenatorTest,startingHyphenMinus)349 TEST_P(HyphenatorTest, startingHyphenMinus) {
350     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 0, 2, 2, "en");
351     const uint16_t word[] = {HYPHEN_MINUS, 'y'};
352     std::vector<HyphenationType> result;
353     hyphenator->hyphenate(word, &result);
354     EXPECT_EQ((size_t)2, result.size());
355     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
356     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
357 }
358 
359 }  // namespace minikin
360