1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minikin/Hyphenator.h"
18 
19 #include <gtest/gtest.h>
20 
21 #include "FileUtils.h"
22 
// Element count of a statically sized array. The guard keeps any NELEM that was
// already defined before this point.
#if !defined(NELEM)
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
26 
27 namespace minikin {
28 
// Paths of the binary hyphenation pattern files loaded by the tests below.
// The pointers themselves are const so they cannot be reassigned by accident.
const char* const usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb";
const char* const malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb";

// UTF-16 code units used to build the test words, in code point order.
constexpr uint16_t HYPHEN_MINUS = 0x002D;
constexpr uint16_t SOFT_HYPHEN = 0x00AD;
constexpr uint16_t MIDDLE_DOT = 0x00B7;
constexpr uint16_t GREEK_LOWER_ALPHA = 0x03B1;
constexpr uint16_t ARMENIAN_AYB = 0x0531;
constexpr uint16_t HEBREW_ALEF = 0x05D0;
constexpr uint16_t ARABIC_ALEF = 0x0627;
constexpr uint16_t ARABIC_BEH = 0x0628;
constexpr uint16_t ARABIC_ZWARAKAY = 0x0659;
constexpr uint16_t MALAYALAM_KA = 0x0D15;
constexpr uint16_t UCAS_E = 0x1401;
constexpr uint16_t HYPHEN = 0x2010;
constexpr uint16_t EN_DASH = 0x2013;
45 
46 // Simple test for US English. This tests "table", which happens to be the in the exceptions list.
TEST(HyphenatorTest,usEnglishAutomaticHyphenation)47 TEST(HyphenatorTest, usEnglishAutomaticHyphenation) {
48     std::vector<uint8_t> patternData = readWholeFile(usHyph);
49     Hyphenator* hyphenator = Hyphenator::loadBinary(patternData.data(), 2, 3, "en");
50     const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'};
51     std::vector<HyphenationType> result;
52     hyphenator->hyphenate(word, &result);
53     EXPECT_EQ((size_t)5, result.size());
54     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
55     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
56     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
57     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
58     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
59 }
60 
61 // Catalan l·l should break as l-/l
TEST(HyphenatorTest,catalanMiddleDot)62 TEST(HyphenatorTest, catalanMiddleDot) {
63     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "ca");
64     const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l'};
65     std::vector<HyphenationType> result;
66     hyphenator->hyphenate(word, &result);
67     EXPECT_EQ((size_t)5, result.size());
68     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
69     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
70     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
71     EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]);
72     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
73 }
74 
75 // Catalan l·l should not break if the word is too short.
TEST(HyphenatorTest,catalanMiddleDotShortWord)76 TEST(HyphenatorTest, catalanMiddleDotShortWord) {
77     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "ca");
78     const uint16_t word[] = {'l', MIDDLE_DOT, 'l'};
79     std::vector<HyphenationType> result;
80     hyphenator->hyphenate(word, &result);
81     EXPECT_EQ((size_t)3, result.size());
82     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
83     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
84     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
85 }
86 
87 // If we break on a hyphen in Polish, the hyphen should be repeated on the next line.
TEST(HyphenatorTest,polishHyphen)88 TEST(HyphenatorTest, polishHyphen) {
89     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
90     const uint16_t word[] = {'x', HYPHEN, 'y'};
91     std::vector<HyphenationType> result;
92     hyphenator->hyphenate(word, &result);
93     EXPECT_EQ((size_t)3, result.size());
94     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
95     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
96     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
97 }
98 
99 // If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation.
TEST(HyphenatorTest,polishHyphenButNonLatinWord)100 TEST(HyphenatorTest, polishHyphenButNonLatinWord) {
101     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
102     const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA};
103     std::vector<HyphenationType> result;
104     hyphenator->hyphenate(word, &result);
105     EXPECT_EQ((size_t)3, result.size());
106     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
107     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
108     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
109 }
110 
111 // Polish en dash doesn't repeat on next line (as far as we know), but just provides a break
112 // opportunity.
TEST(HyphenatorTest,polishEnDash)113 TEST(HyphenatorTest, polishEnDash) {
114     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
115     const uint16_t word[] = {'x', EN_DASH, 'y'};
116     std::vector<HyphenationType> result;
117     hyphenator->hyphenate(word, &result);
118     EXPECT_EQ((size_t)3, result.size());
119     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
120     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
121     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
122 }
123 
124 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
125 // Polish.)
TEST(HyphenatorTest,slovenianHyphen)126 TEST(HyphenatorTest, slovenianHyphen) {
127     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "sl");
128     const uint16_t word[] = {'x', HYPHEN, 'y'};
129     std::vector<HyphenationType> result;
130     hyphenator->hyphenate(word, &result);
131     EXPECT_EQ((size_t)3, result.size());
132     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
133     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
134     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
135 }
136 
137 // In Latin script text, soft hyphens should insert a visible hyphen if broken at.
TEST(HyphenatorTest,latinSoftHyphen)138 TEST(HyphenatorTest, latinSoftHyphen) {
139     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
140     const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
141     std::vector<HyphenationType> result;
142     hyphenator->hyphenate(word, &result);
143     EXPECT_EQ((size_t)3, result.size());
144     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
145     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
146     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
147 }
148 
149 // Soft hyphens at the beginning of a word are not useful in linebreaking.
TEST(HyphenatorTest,latinSoftHyphenStartingTheWord)150 TEST(HyphenatorTest, latinSoftHyphenStartingTheWord) {
151     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
152     const uint16_t word[] = {SOFT_HYPHEN, 'y'};
153     std::vector<HyphenationType> result;
154     hyphenator->hyphenate(word, &result);
155     EXPECT_EQ((size_t)2, result.size());
156     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
157     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
158 }
159 
160 // In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at.
TEST(HyphenatorTest,malayalamSoftHyphen)161 TEST(HyphenatorTest, malayalamSoftHyphen) {
162     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
163     const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA};
164     std::vector<HyphenationType> result;
165     hyphenator->hyphenate(word, &result);
166     EXPECT_EQ((size_t)3, result.size());
167     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
168     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
169     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
170 }
171 
172 // In automatically hyphenated Malayalam script text, we should not insert a visible hyphen.
TEST(HyphenatorTest,malayalamAutomaticHyphenation)173 TEST(HyphenatorTest, malayalamAutomaticHyphenation) {
174     std::vector<uint8_t> patternData = readWholeFile(malayalamHyph);
175     Hyphenator* hyphenator = Hyphenator::loadBinary(patternData.data(), 2, 2, "en");
176     const uint16_t word[] = {MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA};
177     std::vector<HyphenationType> result;
178     hyphenator->hyphenate(word, &result);
179     EXPECT_EQ((size_t)5, result.size());
180     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
181     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
182     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
183     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[3]);
184     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
185 }
186 
187 // In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at.
TEST(HyphenatorTest,aremenianSoftHyphen)188 TEST(HyphenatorTest, aremenianSoftHyphen) {
189     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
190     const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB};
191     std::vector<HyphenationType> result;
192     hyphenator->hyphenate(word, &result);
193     EXPECT_EQ((size_t)3, result.size());
194     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
195     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
196     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]);
197 }
198 
199 // In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now.
200 // We may need to change this to maqaf later.
TEST(HyphenatorTest,hebrewSoftHyphen)201 TEST(HyphenatorTest, hebrewSoftHyphen) {
202     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
203     const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF};
204     std::vector<HyphenationType> result;
205     hyphenator->hyphenate(word, &result);
206     EXPECT_EQ((size_t)3, result.size());
207     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
208     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
209     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
210 }
211 
212 // Soft hyphen between two Arabic letters that join should keep the joining
213 // behavior when broken across lines.
TEST(HyphenatorTest,arabicSoftHyphenConnecting)214 TEST(HyphenatorTest, arabicSoftHyphenConnecting) {
215     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
216     const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH};
217     std::vector<HyphenationType> result;
218     hyphenator->hyphenate(word, &result);
219     EXPECT_EQ((size_t)3, result.size());
220     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
221     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
222     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]);
223 }
224 
225 // Arabic letters may be joining on one side, but if it's the wrong side, we
226 // should use the normal hyphen.
TEST(HyphenatorTest,arabicSoftHyphenNonConnecting)227 TEST(HyphenatorTest, arabicSoftHyphenNonConnecting) {
228     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
229     const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH};
230     std::vector<HyphenationType> result;
231     hyphenator->hyphenate(word, &result);
232     EXPECT_EQ((size_t)3, result.size());
233     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
234     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
235     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
236 }
237 
238 // Skip transparent characters until you find a non-transparent one.
TEST(HyphenatorTest,arabicSoftHyphenSkipTransparents)239 TEST(HyphenatorTest, arabicSoftHyphenSkipTransparents) {
240     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
241     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
242     std::vector<HyphenationType> result;
243     hyphenator->hyphenate(word, &result);
244     EXPECT_EQ((size_t)5, result.size());
245     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
246     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
247     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
248     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]);
249     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
250 }
251 
252 // Skip transparent characters until you find a non-transparent one. If we get to one end without
253 // finding anything, we are still non-joining.
TEST(HyphenatorTest,arabicSoftHyphenTransparentsAtEnd)254 TEST(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) {
255     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
256     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY};
257     std::vector<HyphenationType> result;
258     hyphenator->hyphenate(word, &result);
259     EXPECT_EQ((size_t)4, result.size());
260     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
261     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
262     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
263     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]);
264 }
265 
266 // Skip transparent characters until you find a non-transparent one. If we get to one end without
267 // finding anything, we are still non-joining.
TEST(HyphenatorTest,arabicSoftHyphenTransparentsAtStart)268 TEST(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) {
269     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
270     const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
271     std::vector<HyphenationType> result;
272     hyphenator->hyphenate(word, &result);
273     EXPECT_EQ((size_t)4, result.size());
274     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
275     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
276     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
277     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
278 }
279 
280 // In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen.
TEST(HyphenatorTest,ucasSoftHyphen)281 TEST(HyphenatorTest, ucasSoftHyphen) {
282     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
283     const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E};
284     std::vector<HyphenationType> result;
285     hyphenator->hyphenate(word, &result);
286     EXPECT_EQ((size_t)3, result.size());
287     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
288     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
289     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
290 }
291 
292 // Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a
293 // little arbitrary, but let's test it anyway.
TEST(HyphenatorTest,mixedScriptSoftHyphen)294 TEST(HyphenatorTest, mixedScriptSoftHyphen) {
295     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
296     const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E};
297     std::vector<HyphenationType> result;
298     hyphenator->hyphenate(word, &result);
299     EXPECT_EQ((size_t)3, result.size());
300     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
301     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
302     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
303 }
304 
305 // Hard hyphens provide a breaking opportunity with nothing extra inserted.
TEST(HyphenatorTest,hardHyphen)306 TEST(HyphenatorTest, hardHyphen) {
307     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
308     const uint16_t word[] = {'x', HYPHEN, 'y'};
309     std::vector<HyphenationType> result;
310     hyphenator->hyphenate(word, &result);
311     EXPECT_EQ((size_t)3, result.size());
312     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
313     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
314     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
315 }
316 
317 // Hyphen-minuses also provide a breaking opportunity with nothing extra inserted.
TEST(HyphenatorTest,hyphenMinus)318 TEST(HyphenatorTest, hyphenMinus) {
319     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
320     const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'};
321     std::vector<HyphenationType> result;
322     hyphenator->hyphenate(word, &result);
323     EXPECT_EQ((size_t)3, result.size());
324     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
325     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
326     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
327 }
328 
329 // If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break
330 // it at that point.
TEST(HyphenatorTest,startingHyphenMinus)331 TEST(HyphenatorTest, startingHyphenMinus) {
332     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
333     const uint16_t word[] = {HYPHEN_MINUS, 'y'};
334     std::vector<HyphenationType> result;
335     hyphenator->hyphenate(word, &result);
336     EXPECT_EQ((size_t)2, result.size());
337     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
338     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
339 }
340 
341 }  // namespace minikin
342