1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <gtest/gtest.h>
18 
19 #include "ICUTestBase.h"
20 #include <minikin/Hyphenator.h>
21 #include <FileUtils.h>
22 
// Element count of a statically sized array. Only valid for real arrays: applied to a pointer it
// would silently divide the pointer size by the element size.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
26 
27 namespace minikin {
28 
// Absolute paths of the hyphenation pattern files read by the automatic-hyphenation tests.
// The pointers themselves are const: that gives them internal linkage (no clash with a
// same-named global in another translation unit) and prevents accidental reseating.
const char* const usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb";
const char* const malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb";
31 
32 typedef ICUTestBase HyphenatorTest;
33 
34 const icu::Locale catalanLocale("ca", "ES", nullptr, nullptr);
35 const icu::Locale polishLocale("pl", "PL", nullptr, nullptr);
36 const icu::Locale& usLocale = icu::Locale::getUS();
37 
// Code points used to build the test words. All of them fit in a single uint16_t.
constexpr uint16_t HYPHEN_MINUS = 0x002D;       // U+002D HYPHEN-MINUS
constexpr uint16_t SOFT_HYPHEN = 0x00AD;        // U+00AD SOFT HYPHEN
constexpr uint16_t MIDDLE_DOT = 0x00B7;         // U+00B7 MIDDLE DOT
constexpr uint16_t GREEK_LOWER_ALPHA = 0x03B1;  // U+03B1 GREEK SMALL LETTER ALPHA
constexpr uint16_t ARMENIAN_AYB = 0x0531;       // U+0531 ARMENIAN CAPITAL LETTER AYB
constexpr uint16_t HEBREW_ALEF = 0x05D0;        // U+05D0 HEBREW LETTER ALEF
constexpr uint16_t ARABIC_ALEF = 0x0627;        // U+0627 ARABIC LETTER ALEF
constexpr uint16_t ARABIC_BEH = 0x0628;         // U+0628 ARABIC LETTER BEH
constexpr uint16_t ARABIC_ZWARAKAY = 0x0659;    // U+0659 ARABIC ZWARAKAY
constexpr uint16_t MALAYALAM_KA = 0x0D15;       // U+0D15 MALAYALAM LETTER KA
constexpr uint16_t UCAS_E = 0x1401;             // U+1401 CANADIAN SYLLABICS E
constexpr uint16_t HYPHEN = 0x2010;             // U+2010 HYPHEN
constexpr uint16_t EN_DASH = 0x2013;            // U+2013 EN DASH
51 
52 // Simple test for US English. This tests "table", which happens to be the in the exceptions list.
TEST_F(HyphenatorTest,usEnglishAutomaticHyphenation)53 TEST_F(HyphenatorTest, usEnglishAutomaticHyphenation) {
54     Hyphenator* hyphenator = Hyphenator::loadBinary(readWholeFile(usHyph).data(), 2, 3);
55     const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'};
56     std::vector<HyphenationType> result;
57     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
58     EXPECT_EQ((size_t) 5, result.size());
59     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
60     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
61     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
62     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
63     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
64 }
65 
66 // Catalan l·l should break as l-/l
TEST_F(HyphenatorTest,catalanMiddleDot)67 TEST_F(HyphenatorTest, catalanMiddleDot) {
68     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
69     const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l'};
70     std::vector<HyphenationType> result;
71     hyphenator->hyphenate(&result, word, NELEM(word), catalanLocale);
72     EXPECT_EQ((size_t) 5, result.size());
73     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
74     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
75     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
76     EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]);
77     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
78 }
79 
80 // Catalan l·l should not break if the word is too short.
TEST_F(HyphenatorTest,catalanMiddleDotShortWord)81 TEST_F(HyphenatorTest, catalanMiddleDotShortWord) {
82     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
83     const uint16_t word[] = {'l', MIDDLE_DOT, 'l'};
84     std::vector<HyphenationType> result;
85     hyphenator->hyphenate(&result, word, NELEM(word), catalanLocale);
86     EXPECT_EQ((size_t) 3, result.size());
87     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
88     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
89     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
90 }
91 
92 // If we break on a hyphen in Polish, the hyphen should be repeated on the next line.
TEST_F(HyphenatorTest,polishHyphen)93 TEST_F(HyphenatorTest, polishHyphen) {
94     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
95     const uint16_t word[] = {'x', HYPHEN, 'y'};
96     std::vector<HyphenationType> result;
97     hyphenator->hyphenate(&result, word, NELEM(word), polishLocale);
98     EXPECT_EQ((size_t) 3, result.size());
99     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
100     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
101     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
102 }
103 
104 // If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation.
TEST_F(HyphenatorTest,polishHyphenButNonLatinWord)105 TEST_F(HyphenatorTest, polishHyphenButNonLatinWord) {
106     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
107     const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA};
108     std::vector<HyphenationType> result;
109     hyphenator->hyphenate(&result, word, NELEM(word), polishLocale);
110     EXPECT_EQ((size_t) 3, result.size());
111     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
112     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
113     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
114 }
115 
116 // Polish en dash doesn't repeat on next line (as far as we know), but just provides a break
117 // opportunity.
TEST_F(HyphenatorTest,polishEnDash)118 TEST_F(HyphenatorTest, polishEnDash) {
119     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
120     const uint16_t word[] = {'x', EN_DASH, 'y'};
121     std::vector<HyphenationType> result;
122     hyphenator->hyphenate(&result, word, NELEM(word), polishLocale);
123     EXPECT_EQ((size_t) 3, result.size());
124     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
125     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
126     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
127 }
128 
129 // In Latin script text, soft hyphens should insert a visible hyphen if broken at.
TEST_F(HyphenatorTest,latinSoftHyphen)130 TEST_F(HyphenatorTest, latinSoftHyphen) {
131     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
132     const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
133     std::vector<HyphenationType> result;
134     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
135     EXPECT_EQ((size_t) 3, result.size());
136     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
137     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
138     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
139 }
140 
141 // Soft hyphens at the beginning of a word are not useful in linebreaking.
TEST_F(HyphenatorTest,latinSoftHyphenStartingTheWord)142 TEST_F(HyphenatorTest, latinSoftHyphenStartingTheWord) {
143     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
144     const uint16_t word[] = {SOFT_HYPHEN, 'y'};
145     std::vector<HyphenationType> result;
146     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
147     EXPECT_EQ((size_t) 2, result.size());
148     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
149     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
150 }
151 
152 // In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at.
TEST_F(HyphenatorTest,malayalamSoftHyphen)153 TEST_F(HyphenatorTest, malayalamSoftHyphen) {
154     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
155     const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA};
156     std::vector<HyphenationType> result;
157     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
158     EXPECT_EQ((size_t) 3, result.size());
159     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
160     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
161     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
162 }
163 
164 // In automatically hyphenated Malayalam script text, we should not insert a visible hyphen.
TEST_F(HyphenatorTest,malayalamAutomaticHyphenation)165 TEST_F(HyphenatorTest, malayalamAutomaticHyphenation) {
166     Hyphenator* hyphenator = Hyphenator::loadBinary(readWholeFile(malayalamHyph).data(), 2, 2);
167     const uint16_t word[] = {
168             MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA};
169     std::vector<HyphenationType> result;
170     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
171     EXPECT_EQ((size_t) 5, result.size());
172     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
173     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
174     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
175     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[3]);
176     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
177 }
178 
179 // In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at.
TEST_F(HyphenatorTest,aremenianSoftHyphen)180 TEST_F(HyphenatorTest, aremenianSoftHyphen) {
181     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
182     const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB};
183     std::vector<HyphenationType> result;
184     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
185     EXPECT_EQ((size_t) 3, result.size());
186     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
187     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
188     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]);
189 }
190 
191 // In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now.
192 // We may need to change this to maqaf later.
TEST_F(HyphenatorTest,hebrewSoftHyphen)193 TEST_F(HyphenatorTest, hebrewSoftHyphen) {
194     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
195     const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF};
196     std::vector<HyphenationType> result;
197     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
198     EXPECT_EQ((size_t) 3, result.size());
199     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
200     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
201     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
202 }
203 
204 // Soft hyphen between two Arabic letters that join should keep the joining
205 // behavior when broken across lines.
TEST_F(HyphenatorTest,arabicSoftHyphenConnecting)206 TEST_F(HyphenatorTest, arabicSoftHyphenConnecting) {
207     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
208     const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH};
209     std::vector<HyphenationType> result;
210     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
211     EXPECT_EQ((size_t) 3, result.size());
212     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
213     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
214     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]);
215 }
216 
217 // Arabic letters may be joining on one side, but if it's the wrong side, we
218 // should use the normal hyphen.
TEST_F(HyphenatorTest,arabicSoftHyphenNonConnecting)219 TEST_F(HyphenatorTest, arabicSoftHyphenNonConnecting) {
220     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
221     const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH};
222     std::vector<HyphenationType> result;
223     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
224     EXPECT_EQ((size_t) 3, result.size());
225     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
226     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
227     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
228 }
229 
230 // Skip transparent characters until you find a non-transparent one.
TEST_F(HyphenatorTest,arabicSoftHyphenSkipTransparents)231 TEST_F(HyphenatorTest, arabicSoftHyphenSkipTransparents) {
232     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
233     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
234     std::vector<HyphenationType> result;
235     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
236     EXPECT_EQ((size_t) 5, result.size());
237     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
238     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
239     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
240     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]);
241     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
242 }
243 
244 // Skip transparent characters until you find a non-transparent one. If we get to one end without
245 // finding anything, we are still non-joining.
TEST_F(HyphenatorTest,arabicSoftHyphenTransparentsAtEnd)246 TEST_F(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) {
247     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
248     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY};
249     std::vector<HyphenationType> result;
250     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
251     EXPECT_EQ((size_t) 4, result.size());
252     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
253     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
254     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
255     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]);
256 }
257 
258 // Skip transparent characters until you find a non-transparent one. If we get to one end without
259 // finding anything, we are still non-joining.
TEST_F(HyphenatorTest,arabicSoftHyphenTransparentsAtStart)260 TEST_F(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) {
261     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
262     const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
263     std::vector<HyphenationType> result;
264     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
265     EXPECT_EQ((size_t) 4, result.size());
266     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
267     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
268     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
269     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
270 }
271 
272 // In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen.
TEST_F(HyphenatorTest,ucasSoftHyphen)273 TEST_F(HyphenatorTest, ucasSoftHyphen) {
274     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
275     const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E};
276     std::vector<HyphenationType> result;
277     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
278     EXPECT_EQ((size_t) 3, result.size());
279     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
280     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
281     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
282 }
283 
284 // Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a
285 // little arbitrary, but let's test it anyway.
TEST_F(HyphenatorTest,mixedScriptSoftHyphen)286 TEST_F(HyphenatorTest, mixedScriptSoftHyphen) {
287     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
288     const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E};
289     std::vector<HyphenationType> result;
290     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
291     EXPECT_EQ((size_t) 3, result.size());
292     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
293     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
294     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
295 }
296 
297 // Hard hyphens provide a breaking opportunity with nothing extra inserted.
TEST_F(HyphenatorTest,hardHyphen)298 TEST_F(HyphenatorTest, hardHyphen) {
299     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
300     const uint16_t word[] = {'x', HYPHEN, 'y'};
301     std::vector<HyphenationType> result;
302     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
303     EXPECT_EQ((size_t) 3, result.size());
304     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
305     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
306     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
307 }
308 
309 // Hyphen-minuses also provide a breaking opportunity with nothing extra inserted.
TEST_F(HyphenatorTest,hyphenMinus)310 TEST_F(HyphenatorTest, hyphenMinus) {
311     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
312     const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'};
313     std::vector<HyphenationType> result;
314     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
315     EXPECT_EQ((size_t) 3, result.size());
316     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
317     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
318     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
319 }
320 
321 // If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break
322 // it at that point.
TEST_F(HyphenatorTest,startingHyphenMinus)323 TEST_F(HyphenatorTest, startingHyphenMinus) {
324     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2);
325     const uint16_t word[] = {HYPHEN_MINUS, 'y'};
326     std::vector<HyphenationType> result;
327     hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
328     EXPECT_EQ((size_t) 2, result.size());
329     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
330     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
331 }
332 
333 }  // namespace minikin
334 
335