1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <gtest/gtest.h>
18 #include <UnicodeUtils.h>
19 
20 #include "LayoutUtils.h"
21 
22 namespace {
23 
ExpectNextWordBreakForCache(size_t offset_in,const char * query_str)24 void ExpectNextWordBreakForCache(size_t offset_in, const char* query_str) {
25     const size_t BUF_SIZE = 256U;
26     uint16_t buf[BUF_SIZE];
27     size_t expected_breakpoint = 0U;
28     size_t size = 0U;
29 
30     ParseUnicode(buf, BUF_SIZE, query_str, &size, &expected_breakpoint);
31     EXPECT_EQ(expected_breakpoint,
32               getNextWordBreakForCache(buf, offset_in, size))
33         << "Expected position is [" << query_str << "] from offset " << offset_in;
34 }
35 
ExpectPrevWordBreakForCache(size_t offset_in,const char * query_str)36 void ExpectPrevWordBreakForCache(size_t offset_in, const char* query_str) {
37     const size_t BUF_SIZE = 256U;
38     uint16_t buf[BUF_SIZE];
39     size_t expected_breakpoint = 0U;
40     size_t size = 0U;
41 
42     ParseUnicode(buf, BUF_SIZE, query_str, &size, &expected_breakpoint);
43     EXPECT_EQ(expected_breakpoint,
44               getPrevWordBreakForCache(buf, offset_in, size))
45         << "Expected position is [" << query_str << "] from offset " << offset_in;
46 }
47 
TEST(WordBreakTest,goNextWordBreakTest)48 TEST(WordBreakTest, goNextWordBreakTest) {
49     ExpectNextWordBreakForCache(0, "|");
50 
51     // Continue for spaces.
52     ExpectNextWordBreakForCache(0, "'a' 'b' 'c' 'd' |");
53     ExpectNextWordBreakForCache(1, "'a' 'b' 'c' 'd' |");
54     ExpectNextWordBreakForCache(2, "'a' 'b' 'c' 'd' |");
55     ExpectNextWordBreakForCache(3, "'a' 'b' 'c' 'd' |");
56     ExpectNextWordBreakForCache(4, "'a' 'b' 'c' 'd' |");
57     ExpectNextWordBreakForCache(1000, "'a' 'b' 'c' 'd' |");
58 
59     // Space makes word break.
60     ExpectNextWordBreakForCache(0, "'a' 'b' | U+0020 'c' 'd'");
61     ExpectNextWordBreakForCache(1, "'a' 'b' | U+0020 'c' 'd'");
62     ExpectNextWordBreakForCache(2, "'a' 'b' U+0020 | 'c' 'd'");
63     ExpectNextWordBreakForCache(3, "'a' 'b' U+0020 'c' 'd' |");
64     ExpectNextWordBreakForCache(4, "'a' 'b' U+0020 'c' 'd' |");
65     ExpectNextWordBreakForCache(5, "'a' 'b' U+0020 'c' 'd' |");
66     ExpectNextWordBreakForCache(1000, "'a' 'b' U+0020 'c' 'd' |");
67 
68     ExpectNextWordBreakForCache(0, "'a' 'b' | U+2000 'c' 'd'");
69     ExpectNextWordBreakForCache(1, "'a' 'b' | U+2000 'c' 'd'");
70     ExpectNextWordBreakForCache(2, "'a' 'b' U+2000 | 'c' 'd'");
71     ExpectNextWordBreakForCache(3, "'a' 'b' U+2000 'c' 'd' |");
72     ExpectNextWordBreakForCache(4, "'a' 'b' U+2000 'c' 'd' |");
73     ExpectNextWordBreakForCache(5, "'a' 'b' U+2000 'c' 'd' |");
74     ExpectNextWordBreakForCache(1000, "'a' 'b' U+2000 'c' 'd' |");
75 
76     ExpectNextWordBreakForCache(0, "'a' 'b' | U+2000 U+2000 'c' 'd'");
77     ExpectNextWordBreakForCache(1, "'a' 'b' | U+2000 U+2000 'c' 'd'");
78     ExpectNextWordBreakForCache(2, "'a' 'b' U+2000 | U+2000 'c' 'd'");
79     ExpectNextWordBreakForCache(3, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
80     ExpectNextWordBreakForCache(4, "'a' 'b' U+2000 U+2000 'c' 'd' |");
81     ExpectNextWordBreakForCache(5, "'a' 'b' U+2000 U+2000 'c' 'd' |");
82     ExpectNextWordBreakForCache(6, "'a' 'b' U+2000 U+2000 'c' 'd' |");
83     ExpectNextWordBreakForCache(1000, "'a' 'b' U+2000 U+2000 'c' 'd' |");
84 
85     // CJK ideographs makes word break.
86     ExpectNextWordBreakForCache(0, "U+4E00 | U+4E00   U+4E00   U+4E00   U+4E00");
87     ExpectNextWordBreakForCache(1, "U+4E00   U+4E00 | U+4E00   U+4E00   U+4E00");
88     ExpectNextWordBreakForCache(2, "U+4E00   U+4E00   U+4E00 | U+4E00   U+4E00");
89     ExpectNextWordBreakForCache(3, "U+4E00   U+4E00   U+4E00   U+4E00 | U+4E00");
90     ExpectNextWordBreakForCache(4, "U+4E00   U+4E00   U+4E00   U+4E00   U+4E00 |");
91     ExpectNextWordBreakForCache(5, "U+4E00   U+4E00   U+4E00   U+4E00   U+4E00 |");
92     ExpectNextWordBreakForCache(1000,
93                              "U+4E00   U+4E00   U+4E00   U+4E00   U+4E00 |");
94 
95     ExpectNextWordBreakForCache(0, "U+4E00 | U+4E8C   U+4E09   U+56DB   U+4E94");
96     ExpectNextWordBreakForCache(1, "U+4E00   U+4E8C | U+4E09   U+56DB   U+4E94");
97     ExpectNextWordBreakForCache(2, "U+4E00   U+4E8C   U+4E09 | U+56DB   U+4E94");
98     ExpectNextWordBreakForCache(3, "U+4E00   U+4E8C   U+4E09   U+56DB | U+4E94");
99     ExpectNextWordBreakForCache(4, "U+4E00   U+4E8C   U+4E09   U+56DB   U+4E94 |");
100     ExpectNextWordBreakForCache(5, "U+4E00   U+4E8C   U+4E09   U+56DB   U+4E94 |");
101     ExpectNextWordBreakForCache(1000,
102                              "U+4E00   U+4E8C   U+4E09   U+56DB   U+4E94 |");
103 
104     ExpectNextWordBreakForCache(0, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
105     ExpectNextWordBreakForCache(1, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
106     ExpectNextWordBreakForCache(2, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
107     ExpectNextWordBreakForCache(3, "U+4E00 'a' 'b' U+2000 | 'c' U+4E00");
108     ExpectNextWordBreakForCache(4, "U+4E00 'a' 'b' U+2000 'c' | U+4E00");
109     ExpectNextWordBreakForCache(5, "U+4E00 'a' 'b' U+2000 'c' U+4E00 |");
110     ExpectNextWordBreakForCache(1000, "U+4E00 'a' 'b' U+2000 'c' U+4E00 |");
111 
112     // Continue if trailing characters is Unicode combining characters.
113     ExpectNextWordBreakForCache(0, "U+4E00 U+0332 | U+4E00");
114     ExpectNextWordBreakForCache(1, "U+4E00 U+0332 | U+4E00");
115     ExpectNextWordBreakForCache(2, "U+4E00 U+0332 U+4E00 |");
116     ExpectNextWordBreakForCache(3, "U+4E00 U+0332 U+4E00 |");
117     ExpectNextWordBreakForCache(1000, "U+4E00 U+0332 U+4E00 |");
118 
119     // Surrogate pairs.
120     ExpectNextWordBreakForCache(0, "U+1F60D U+1F618 |");
121     ExpectNextWordBreakForCache(1, "U+1F60D U+1F618 |");
122     ExpectNextWordBreakForCache(2, "U+1F60D U+1F618 |");
123     ExpectNextWordBreakForCache(3, "U+1F60D U+1F618 |");
124     ExpectNextWordBreakForCache(4, "U+1F60D U+1F618 |");
125     ExpectNextWordBreakForCache(1000, "U+1F60D U+1F618 |");
126 
127     // Broken surrogate pairs.
128     // U+D84D is leading surrogate but there is no trailing surrogate for it.
129     ExpectNextWordBreakForCache(0, "U+D84D U+1F618 |");
130     ExpectNextWordBreakForCache(1, "U+D84D U+1F618 |");
131     ExpectNextWordBreakForCache(2, "U+D84D U+1F618 |");
132     ExpectNextWordBreakForCache(3, "U+D84D U+1F618 |");
133     ExpectNextWordBreakForCache(1000, "U+D84D U+1F618 |");
134 
135     ExpectNextWordBreakForCache(0, "U+1F618 U+D84D |");
136     ExpectNextWordBreakForCache(1, "U+1F618 U+D84D |");
137     ExpectNextWordBreakForCache(2, "U+1F618 U+D84D |");
138     ExpectNextWordBreakForCache(3, "U+1F618 U+D84D |");
139     ExpectNextWordBreakForCache(1000, "U+1F618 U+D84D |");
140 
141     // U+DE0D is trailing surrogate but there is no leading surrogate for it.
142     ExpectNextWordBreakForCache(0, "U+DE0D U+1F618 |");
143     ExpectNextWordBreakForCache(1, "U+DE0D U+1F618 |");
144     ExpectNextWordBreakForCache(2, "U+DE0D U+1F618 |");
145     ExpectNextWordBreakForCache(3, "U+DE0D U+1F618 |");
146     ExpectNextWordBreakForCache(1000, "U+DE0D U+1F618 |");
147 
148     ExpectNextWordBreakForCache(0, "U+1F618 U+DE0D |");
149     ExpectNextWordBreakForCache(1, "U+1F618 U+DE0D |");
150     ExpectNextWordBreakForCache(2, "U+1F618 U+DE0D |");
151     ExpectNextWordBreakForCache(3, "U+1F618 U+DE0D |");
152     ExpectNextWordBreakForCache(1000, "U+1F618 U+DE0D |");
153 
154     // Regional indicator pair. U+1F1FA U+1F1F8 is US national flag.
155     ExpectNextWordBreakForCache(0, "U+1F1FA U+1F1F8 |");
156     ExpectNextWordBreakForCache(1, "U+1F1FA U+1F1F8 |");
157     ExpectNextWordBreakForCache(2, "U+1F1FA U+1F1F8 |");
158     ExpectNextWordBreakForCache(1000, "U+1F1FA U+1F1F8 |");
159 
160     // Tone marks.
161     // CJK ideographic char + Tone mark + CJK ideographic char
162     ExpectNextWordBreakForCache(0, "U+4444 U+302D | U+4444");
163     ExpectNextWordBreakForCache(1, "U+4444 U+302D | U+4444");
164     ExpectNextWordBreakForCache(2, "U+4444 U+302D U+4444 |");
165     ExpectNextWordBreakForCache(3, "U+4444 U+302D U+4444 |");
166     ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+4444 |");
167 
168     // Variation Selectors.
169     // CJK Ideographic char + Variation Selector(VS1) + CJK Ideographic char
170     ExpectNextWordBreakForCache(0, "U+845B U+FE00 | U+845B");
171     ExpectNextWordBreakForCache(1, "U+845B U+FE00 | U+845B");
172     ExpectNextWordBreakForCache(2, "U+845B U+FE00 U+845B |");
173     ExpectNextWordBreakForCache(3, "U+845B U+FE00 U+845B |");
174     ExpectNextWordBreakForCache(1000, "U+845B U+FE00 U+845B |");
175 
176     // CJK Ideographic char + Variation Selector(VS17) + CJK Ideographic char
177     ExpectNextWordBreakForCache(0, "U+845B U+E0100 | U+845B");
178     ExpectNextWordBreakForCache(1, "U+845B U+E0100 | U+845B");
179     ExpectNextWordBreakForCache(2, "U+845B U+E0100 | U+845B");
180     ExpectNextWordBreakForCache(3, "U+845B U+E0100 U+845B |");
181     ExpectNextWordBreakForCache(4, "U+845B U+E0100 U+845B |");
182     ExpectNextWordBreakForCache(5, "U+845B U+E0100 U+845B |");
183     ExpectNextWordBreakForCache(1000, "U+845B U+E0100 U+845B |");
184 
185     // CJK ideographic char + Tone mark + Variation Character(VS1)
186     ExpectNextWordBreakForCache(0, "U+4444 U+302D U+FE00 | U+4444");
187     ExpectNextWordBreakForCache(1, "U+4444 U+302D U+FE00 | U+4444");
188     ExpectNextWordBreakForCache(2, "U+4444 U+302D U+FE00 | U+4444");
189     ExpectNextWordBreakForCache(3, "U+4444 U+302D U+FE00 U+4444 |");
190     ExpectNextWordBreakForCache(4, "U+4444 U+302D U+FE00 U+4444 |");
191     ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+FE00 U+4444 |");
192 
193     // CJK ideographic char + Tone mark + Variation Character(VS17)
194     ExpectNextWordBreakForCache(0, "U+4444 U+302D U+E0100 | U+4444");
195     ExpectNextWordBreakForCache(1, "U+4444 U+302D U+E0100 | U+4444");
196     ExpectNextWordBreakForCache(2, "U+4444 U+302D U+E0100 | U+4444");
197     ExpectNextWordBreakForCache(3, "U+4444 U+302D U+E0100 | U+4444");
198     ExpectNextWordBreakForCache(4, "U+4444 U+302D U+E0100 U+4444 |");
199     ExpectNextWordBreakForCache(5, "U+4444 U+302D U+E0100 U+4444 |");
200     ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+E0100 U+4444 |");
201 
202     // CJK ideographic char + Variation Character(VS1) + Tone mark
203     ExpectNextWordBreakForCache(0, "U+4444 U+FE00 U+302D | U+4444");
204     ExpectNextWordBreakForCache(1, "U+4444 U+FE00 U+302D | U+4444");
205     ExpectNextWordBreakForCache(2, "U+4444 U+FE00 U+302D | U+4444");
206     ExpectNextWordBreakForCache(3, "U+4444 U+FE00 U+302D U+4444 |");
207     ExpectNextWordBreakForCache(4, "U+4444 U+FE00 U+302D U+4444 |");
208     ExpectNextWordBreakForCache(1000, "U+4444 U+FE00 U+302D U+4444 |");
209 
210     // CJK ideographic char + Variation Character(VS17) + Tone mark
211     ExpectNextWordBreakForCache(0, "U+4444 U+E0100 U+302D | U+4444");
212     ExpectNextWordBreakForCache(1, "U+4444 U+E0100 U+302D | U+4444");
213     ExpectNextWordBreakForCache(2, "U+4444 U+E0100 U+302D | U+4444");
214     ExpectNextWordBreakForCache(3, "U+4444 U+E0100 U+302D | U+4444");
215     ExpectNextWordBreakForCache(4, "U+4444 U+E0100 U+302D U+4444 |");
216     ExpectNextWordBreakForCache(5, "U+4444 U+E0100 U+302D U+4444 |");
217     ExpectNextWordBreakForCache(1000, "U+4444 U+E0100 U+302D U+4444 |");
218 
219     // Following test cases are unusual usage of variation selectors and tone
220     // marks for caching up the further behavior changes, e.g. index of bounds
221     // or crashes. Please feel free to update the test expectations if the
222     // behavior change makes sense to you.
223 
224     // Isolated Tone marks and Variation Selectors
225     ExpectNextWordBreakForCache(0, "U+FE00 |");
226     ExpectNextWordBreakForCache(1, "U+FE00 |");
227     ExpectNextWordBreakForCache(1000, "U+FE00 |");
228     ExpectNextWordBreakForCache(0, "U+E0100 |");
229     ExpectNextWordBreakForCache(1000, "U+E0100 |");
230     ExpectNextWordBreakForCache(0, "U+302D |");
231     ExpectNextWordBreakForCache(1000, "U+302D |");
232 
233     // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS1)
234     ExpectNextWordBreakForCache(0, "U+845B U+FE00 U+FE00 | U+845B");
235     ExpectNextWordBreakForCache(1, "U+845B U+FE00 U+FE00 | U+845B");
236     ExpectNextWordBreakForCache(2, "U+845B U+FE00 U+FE00 | U+845B");
237     ExpectNextWordBreakForCache(3, "U+845B U+FE00 U+FE00 U+845B |");
238     ExpectNextWordBreakForCache(4, "U+845B U+FE00 U+FE00 U+845B |");
239     ExpectNextWordBreakForCache(1000, "U+845B U+FE00 U+FE00 U+845B |");
240 
241     // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS17)
242     ExpectNextWordBreakForCache(0, "U+845B U+E0100 U+E0100 | U+845B");
243     ExpectNextWordBreakForCache(1, "U+845B U+E0100 U+E0100 | U+845B");
244     ExpectNextWordBreakForCache(2, "U+845B U+E0100 U+E0100 | U+845B");
245     ExpectNextWordBreakForCache(3, "U+845B U+E0100 U+E0100 | U+845B");
246     ExpectNextWordBreakForCache(4, "U+845B U+E0100 U+E0100 | U+845B");
247     ExpectNextWordBreakForCache(5, "U+845B U+E0100 U+E0100 U+845B |");
248     ExpectNextWordBreakForCache(6, "U+845B U+E0100 U+E0100 U+845B |");
249     ExpectNextWordBreakForCache(1000,
250                              "U+845B U+E0100 U+E0100 U+845B |");
251 
252     // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS17)
253     ExpectNextWordBreakForCache(0, "U+845B U+FE00 U+E0100 | U+845B");
254     ExpectNextWordBreakForCache(1, "U+845B U+FE00 U+E0100 | U+845B");
255     ExpectNextWordBreakForCache(2, "U+845B U+FE00 U+E0100 | U+845B");
256     ExpectNextWordBreakForCache(3, "U+845B U+FE00 U+E0100 | U+845B");
257     ExpectNextWordBreakForCache(4, "U+845B U+FE00 U+E0100 U+845B |");
258     ExpectNextWordBreakForCache(5, "U+845B U+FE00 U+E0100 U+845B |");
259     ExpectNextWordBreakForCache(1000, "U+845B U+FE00 U+E0100 U+845B |");
260 
261     // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS1)
262     ExpectNextWordBreakForCache(0, "U+845B U+E0100 U+FE00 | U+845B");
263     ExpectNextWordBreakForCache(1, "U+845B U+E0100 U+FE00 | U+845B");
264     ExpectNextWordBreakForCache(2, "U+845B U+E0100 U+FE00 | U+845B");
265     ExpectNextWordBreakForCache(3, "U+845B U+E0100 U+FE00 | U+845B");
266     ExpectNextWordBreakForCache(4, "U+845B U+E0100 U+FE00 U+845B |");
267     ExpectNextWordBreakForCache(5, "U+845B U+E0100 U+FE00 U+845B |");
268     ExpectNextWordBreakForCache(1000, "U+845B U+E0100 U+FE00 U+845B |");
269 
270     // Tone mark. + Tone mark
271     ExpectNextWordBreakForCache(0, "U+4444 U+302D U+302D | U+4444");
272     ExpectNextWordBreakForCache(1, "U+4444 U+302D U+302D | U+4444");
273     ExpectNextWordBreakForCache(2, "U+4444 U+302D U+302D | U+4444");
274     ExpectNextWordBreakForCache(3, "U+4444 U+302D U+302D U+4444 |");
275     ExpectNextWordBreakForCache(4, "U+4444 U+302D U+302D U+4444 |");
276     ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+302D U+4444 |");
277 }
278 
TEST(WordBreakTest,goPrevWordBreakTest)279 TEST(WordBreakTest, goPrevWordBreakTest) {
280     ExpectPrevWordBreakForCache(0, "|");
281 
282     // Continue for spaces.
283     ExpectPrevWordBreakForCache(0, "| 'a' 'b' 'c' 'd'");
284     ExpectPrevWordBreakForCache(1, "| 'a' 'b' 'c' 'd'");
285     ExpectPrevWordBreakForCache(2, "| 'a' 'b' 'c' 'd'");
286     ExpectPrevWordBreakForCache(3, "| 'a' 'b' 'c' 'd'");
287     ExpectPrevWordBreakForCache(4, "| 'a' 'b' 'c' 'd'");
288     ExpectPrevWordBreakForCache(1000, "| 'a' 'b' 'c' 'd'");
289 
290     // Space makes word break.
291     ExpectPrevWordBreakForCache(0, "| 'a' 'b' U+0020 'c' 'd'");
292     ExpectPrevWordBreakForCache(1, "| 'a' 'b' U+0020 'c' 'd'");
293     ExpectPrevWordBreakForCache(2, "| 'a' 'b' U+0020 'c' 'd'");
294     ExpectPrevWordBreakForCache(3, "'a' 'b' | U+0020 'c' 'd'");
295     ExpectPrevWordBreakForCache(4, "'a' 'b' U+0020 | 'c' 'd'");
296     ExpectPrevWordBreakForCache(5, "'a' 'b' U+0020 | 'c' 'd'");
297     ExpectPrevWordBreakForCache(1000, "'a' 'b' U+0020 | 'c' 'd'");
298 
299     ExpectPrevWordBreakForCache(0, "| 'a' 'b' U+2000 'c' 'd'");
300     ExpectPrevWordBreakForCache(1, "| 'a' 'b' U+2000 'c' 'd'");
301     ExpectPrevWordBreakForCache(2, "| 'a' 'b' U+2000 'c' 'd'");
302     ExpectPrevWordBreakForCache(3, "'a' 'b' | U+2000 'c' 'd'");
303     ExpectPrevWordBreakForCache(4, "'a' 'b' U+2000 | 'c' 'd'");
304     ExpectPrevWordBreakForCache(5, "'a' 'b' U+2000 | 'c' 'd'");
305     ExpectPrevWordBreakForCache(1000, "'a' 'b' U+2000 | 'c' 'd'");
306 
307     ExpectPrevWordBreakForCache(0, "| 'a' 'b' U+2000 U+2000 'c' 'd'");
308     ExpectPrevWordBreakForCache(1, "| 'a' 'b' U+2000 U+2000 'c' 'd'");
309     ExpectPrevWordBreakForCache(2, "| 'a' 'b' U+2000 U+2000 'c' 'd'");
310     ExpectPrevWordBreakForCache(3, "'a' 'b' | U+2000 U+2000 'c' 'd'");
311     ExpectPrevWordBreakForCache(4, "'a' 'b' U+2000 | U+2000 'c' 'd'");
312     ExpectPrevWordBreakForCache(5, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
313     ExpectPrevWordBreakForCache(6, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
314     ExpectPrevWordBreakForCache(1000, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
315 
316     // CJK ideographs makes word break.
317     ExpectPrevWordBreakForCache(0, "| U+4E00 U+4E00 U+4E00 U+4E00 U+4E00");
318     ExpectPrevWordBreakForCache(1, "| U+4E00 U+4E00 U+4E00 U+4E00 U+4E00");
319     ExpectPrevWordBreakForCache(2, "U+4E00 | U+4E00 U+4E00 U+4E00 U+4E00");
320     ExpectPrevWordBreakForCache(3, "U+4E00 U+4E00 | U+4E00 U+4E00 U+4E00");
321     ExpectPrevWordBreakForCache(4, "U+4E00 U+4E00 U+4E00 | U+4E00 U+4E00");
322     ExpectPrevWordBreakForCache(5, "U+4E00 U+4E00 U+4E00 U+4E00 | U+4E00");
323     ExpectPrevWordBreakForCache(1000, "U+4E00 U+4E00 U+4E00 U+4E00 | U+4E00");
324 
325     ExpectPrevWordBreakForCache(0, "| U+4E00 U+4E8C U+4E09 U+56DB U+4E94");
326     ExpectPrevWordBreakForCache(1, "| U+4E00 U+4E8C U+4E09 U+56DB U+4E94");
327     ExpectPrevWordBreakForCache(2, "U+4E00 | U+4E8C U+4E09 U+56DB U+4E94");
328     ExpectPrevWordBreakForCache(3, "U+4E00 U+4E8C | U+4E09 U+56DB U+4E94");
329     ExpectPrevWordBreakForCache(4, "U+4E00 U+4E8C U+4E09 | U+56DB U+4E94");
330     ExpectPrevWordBreakForCache(5, "U+4E00 U+4E8C U+4E09 U+56DB | U+4E94");
331     ExpectPrevWordBreakForCache(1000, "U+4E00 U+4E8C U+4E09 U+56DB | U+4E94");
332 
333     // Mixed case.
334     ExpectPrevWordBreakForCache(0, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
335     ExpectPrevWordBreakForCache(1, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
336     ExpectPrevWordBreakForCache(2, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
337     ExpectPrevWordBreakForCache(3, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
338     ExpectPrevWordBreakForCache(4, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
339     ExpectPrevWordBreakForCache(5, "U+4E00 'a' 'b' U+2000 | 'c' U+4E00");
340     ExpectPrevWordBreakForCache(6, "U+4E00 'a' 'b' U+2000 'c' | U+4E00");
341     ExpectPrevWordBreakForCache(1000, "U+4E00 'a' 'b' U+2000 'c' | U+4E00");
342 
343     // Continue if trailing characters is Unicode combining characters.
344     ExpectPrevWordBreakForCache(0, "| U+4E00 U+0332 U+4E00");
345     ExpectPrevWordBreakForCache(1, "| U+4E00 U+0332 U+4E00");
346     ExpectPrevWordBreakForCache(2, "| U+4E00 U+0332 U+4E00");
347     ExpectPrevWordBreakForCache(3, "U+4E00 U+0332 | U+4E00");
348     ExpectPrevWordBreakForCache(1000, "U+4E00 U+0332 | U+4E00");
349 
350     // Surrogate pairs.
351     ExpectPrevWordBreakForCache(0, "| U+1F60D U+1F618");
352     ExpectPrevWordBreakForCache(1, "| U+1F60D U+1F618");
353     ExpectPrevWordBreakForCache(2, "| U+1F60D U+1F618");
354     ExpectPrevWordBreakForCache(3, "| U+1F60D U+1F618");
355     ExpectPrevWordBreakForCache(4, "| U+1F60D U+1F618");
356     ExpectPrevWordBreakForCache(1000, "| U+1F60D U+1F618");
357 
358     // Broken surrogate pairs.
359     // U+D84D is leading surrogate but there is no trailing surrogate for it.
360     ExpectPrevWordBreakForCache(0, "| U+D84D U+1F618");
361     ExpectPrevWordBreakForCache(1, "| U+D84D U+1F618");
362     ExpectPrevWordBreakForCache(2, "| U+D84D U+1F618");
363     ExpectPrevWordBreakForCache(3, "| U+D84D U+1F618");
364     ExpectPrevWordBreakForCache(1000, "| U+D84D U+1F618");
365 
366     ExpectPrevWordBreakForCache(0, "| U+1F618 U+D84D");
367     ExpectPrevWordBreakForCache(1, "| U+1F618 U+D84D");
368     ExpectPrevWordBreakForCache(2, "| U+1F618 U+D84D");
369     ExpectPrevWordBreakForCache(3, "| U+1F618 U+D84D");
370     ExpectPrevWordBreakForCache(1000, "| U+1F618 U+D84D");
371 
372     // U+DE0D is trailing surrogate but there is no leading surrogate for it.
373     ExpectPrevWordBreakForCache(0, "| U+DE0D U+1F618");
374     ExpectPrevWordBreakForCache(1, "| U+DE0D U+1F618");
375     ExpectPrevWordBreakForCache(2, "| U+DE0D U+1F618");
376     ExpectPrevWordBreakForCache(3, "| U+DE0D U+1F618");
377     ExpectPrevWordBreakForCache(1000, "| U+DE0D U+1F618");
378 
379     ExpectPrevWordBreakForCache(0, "| U+1F618 U+DE0D");
380     ExpectPrevWordBreakForCache(1, "| U+1F618 U+DE0D");
381     ExpectPrevWordBreakForCache(2, "| U+1F618 U+DE0D");
382     ExpectPrevWordBreakForCache(3, "| U+1F618 U+DE0D");
383     ExpectPrevWordBreakForCache(1000, "| U+1F618 U+DE0D");
384 
385     // Regional indicator pair. U+1F1FA U+1F1F8 is US national flag.
386     ExpectPrevWordBreakForCache(0, "| U+1F1FA U+1F1F8");
387     ExpectPrevWordBreakForCache(1, "| U+1F1FA U+1F1F8");
388     ExpectPrevWordBreakForCache(2, "| U+1F1FA U+1F1F8");
389     ExpectPrevWordBreakForCache(1000, "| U+1F1FA U+1F1F8");
390 
391     // Tone marks.
392     // CJK ideographic char + Tone mark + CJK ideographic char
393     ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+4444");
394     ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+4444");
395     ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+4444");
396     ExpectPrevWordBreakForCache(3, "U+4444 U+302D | U+4444");
397     ExpectPrevWordBreakForCache(1000, "U+4444 U+302D | U+4444");
398 
399     // Variation Selectors.
400     // CJK Ideographic char + Variation Selector(VS1) + CJK Ideographic char
401     ExpectPrevWordBreakForCache(0, "| U+845B U+FE00 U+845B");
402     ExpectPrevWordBreakForCache(1, "| U+845B U+FE00 U+845B");
403     ExpectPrevWordBreakForCache(2, "| U+845B U+FE00 U+845B");
404     ExpectPrevWordBreakForCache(3, "U+845B U+FE00 | U+845B");
405     ExpectPrevWordBreakForCache(1000, "U+845B U+FE00 | U+845B");
406 
407     // CJK Ideographic char + Variation Selector(VS17) + CJK Ideographic char
408     ExpectPrevWordBreakForCache(0, "| U+845B U+E0100 U+845B");
409     ExpectPrevWordBreakForCache(1, "| U+845B U+E0100 U+845B");
410     ExpectPrevWordBreakForCache(2, "| U+845B U+E0100 U+845B");
411     ExpectPrevWordBreakForCache(3, "| U+845B U+E0100 U+845B");
412     ExpectPrevWordBreakForCache(4, "U+845B U+E0100 | U+845B");
413     ExpectPrevWordBreakForCache(5, "U+845B U+E0100 | U+845B");
414     ExpectPrevWordBreakForCache(1000, "U+845B U+E0100 | U+845B");
415 
416     // CJK ideographic char + Tone mark + Variation Character(VS1)
417     ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+FE00 U+4444");
418     ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+FE00 U+4444");
419     ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+FE00 U+4444");
420     ExpectPrevWordBreakForCache(3, "| U+4444 U+302D U+FE00 U+4444");
421     ExpectPrevWordBreakForCache(4, "U+4444 U+302D U+FE00 | U+4444");
422     ExpectPrevWordBreakForCache(1000, "U+4444 U+302D U+FE00 | U+4444");
423 
424     // CJK ideographic char + Tone mark + Variation Character(VS17)
425     ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+E0100 U+4444");
426     ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+E0100 U+4444");
427     ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+E0100 U+4444");
428     ExpectPrevWordBreakForCache(3, "| U+4444 U+302D U+E0100 U+4444");
429     ExpectPrevWordBreakForCache(4, "| U+4444 U+302D U+E0100 U+4444");
430     ExpectPrevWordBreakForCache(5, "U+4444 U+302D U+E0100 | U+4444");
431     ExpectPrevWordBreakForCache(1000, "U+4444 U+302D U+E0100 | U+4444");
432 
433     // CJK ideographic char + Variation Character(VS1) + Tone mark
434     ExpectPrevWordBreakForCache(0, "| U+4444 U+FE00 U+302D U+4444");
435     ExpectPrevWordBreakForCache(1, "| U+4444 U+FE00 U+302D U+4444");
436     ExpectPrevWordBreakForCache(2, "| U+4444 U+FE00 U+302D U+4444");
437     ExpectPrevWordBreakForCache(3, "| U+4444 U+FE00 U+302D U+4444");
438     ExpectPrevWordBreakForCache(4, "U+4444 U+FE00 U+302D | U+4444");
439     ExpectPrevWordBreakForCache(1000, "U+4444 U+FE00 U+302D | U+4444");
440 
441     // CJK ideographic char + Variation Character(VS17) + Tone mark
442     ExpectPrevWordBreakForCache(0, "| U+4444 U+E0100 U+302D U+4444");
443     ExpectPrevWordBreakForCache(1, "| U+4444 U+E0100 U+302D U+4444");
444     ExpectPrevWordBreakForCache(2, "| U+4444 U+E0100 U+302D U+4444");
445     ExpectPrevWordBreakForCache(3, "| U+4444 U+E0100 U+302D U+4444");
446     ExpectPrevWordBreakForCache(4, "| U+4444 U+E0100 U+302D U+4444");
447     ExpectPrevWordBreakForCache(5, "U+4444 U+E0100 U+302D | U+4444");
448     ExpectPrevWordBreakForCache(1000, "U+4444 U+E0100 U+302D | U+4444");
449 
450     // Following test cases are unusual usage of variation selectors and tone
451     // marks for caching up the further behavior changes, e.g. index of bounds
452     // or crashes. Please feel free to update the test expectations if the
453     // behavior change makes sense to you.
454 
455     // Isolated Tone marks and Variation Selectors
456     ExpectPrevWordBreakForCache(0, "| U+FE00");
457     ExpectPrevWordBreakForCache(1, "| U+FE00");
458     ExpectPrevWordBreakForCache(1000, "| U+FE00");
459     ExpectPrevWordBreakForCache(0, "| U+E0100");
460     ExpectPrevWordBreakForCache(1000, "| U+E0100");
461     ExpectPrevWordBreakForCache(0, "| U+302D");
462     ExpectPrevWordBreakForCache(1000, "| U+302D");
463 
464     // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS1)
465     ExpectPrevWordBreakForCache(0, "| U+845B U+FE00 U+FE00 U+845B");
466     ExpectPrevWordBreakForCache(1, "| U+845B U+FE00 U+FE00 U+845B");
467     ExpectPrevWordBreakForCache(2, "| U+845B U+FE00 U+FE00 U+845B");
468     ExpectPrevWordBreakForCache(3, "| U+845B U+FE00 U+FE00 U+845B");
469     ExpectPrevWordBreakForCache(4, "U+845B U+FE00 U+FE00 | U+845B");
470     ExpectPrevWordBreakForCache(1000, "U+845B U+FE00 U+FE00 | U+845B");
471 
472     // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS17)
473     ExpectPrevWordBreakForCache(0, "| U+845B U+E0100 U+E0100 U+845B");
474     ExpectPrevWordBreakForCache(1, "| U+845B U+E0100 U+E0100 U+845B");
475     ExpectPrevWordBreakForCache(2, "| U+845B U+E0100 U+E0100 U+845B");
476     ExpectPrevWordBreakForCache(3, "| U+845B U+E0100 U+E0100 U+845B");
477     ExpectPrevWordBreakForCache(4, "| U+845B U+E0100 U+E0100 U+845B");
478     ExpectPrevWordBreakForCache(5, "| U+845B U+E0100 U+E0100 U+845B");
479     ExpectPrevWordBreakForCache(6, "U+845B U+E0100 U+E0100 | U+845B");
480     ExpectPrevWordBreakForCache(1000,
481                              "U+845B U+E0100 U+E0100 | U+845B");
482 
483     // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS17)
484     ExpectPrevWordBreakForCache(0, "| U+845B U+FE00 U+E0100 U+845B");
485     ExpectPrevWordBreakForCache(1, "| U+845B U+FE00 U+E0100 U+845B");
486     ExpectPrevWordBreakForCache(2, "| U+845B U+FE00 U+E0100 U+845B");
487     ExpectPrevWordBreakForCache(3, "| U+845B U+FE00 U+E0100 U+845B");
488     ExpectPrevWordBreakForCache(4, "| U+845B U+FE00 U+E0100 U+845B");
489     ExpectPrevWordBreakForCache(5, "U+845B U+FE00 U+E0100 | U+845B");
490     ExpectPrevWordBreakForCache(1000, "U+845B U+FE00 U+E0100 | U+845B");
491 
492     // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS1)
493     ExpectPrevWordBreakForCache(0, "| U+845B U+E0100 U+FE00 U+845B");
494     ExpectPrevWordBreakForCache(1, "| U+845B U+E0100 U+FE00 U+845B");
495     ExpectPrevWordBreakForCache(2, "| U+845B U+E0100 U+FE00 U+845B");
496     ExpectPrevWordBreakForCache(3, "| U+845B U+E0100 U+FE00 U+845B");
497     ExpectPrevWordBreakForCache(4, "| U+845B U+E0100 U+FE00 U+845B");
498     ExpectPrevWordBreakForCache(5, "U+845B U+E0100 U+FE00 | U+845B");
499     ExpectPrevWordBreakForCache(1000, "U+845B U+E0100 U+FE00 | U+845B");
500 
501     // Tone mark. + Tone mark
502     ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+302D U+4444");
503     ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+302D U+4444");
504     ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+302D U+4444");
505     ExpectPrevWordBreakForCache(3, "| U+4444 U+302D U+302D U+4444");
506     ExpectPrevWordBreakForCache(4, "U+4444 U+302D U+302D | U+4444");
507     ExpectPrevWordBreakForCache(1000, "U+4444 U+302D U+302D | U+4444");
508 }
509 
510 }  // namespace
511