/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include <gtest/gtest.h>
18#include <UnicodeUtils.h>
19
20#include "LayoutUtils.h"
21
22namespace {
23
24void ExpectNextWordBreakForCache(size_t offset_in, const char* query_str) {
25    const size_t BUF_SIZE = 256U;
26    uint16_t buf[BUF_SIZE];
27    size_t expected_breakpoint = 0U;
28    size_t size = 0U;
29
30    ParseUnicode(buf, BUF_SIZE, query_str, &size, &expected_breakpoint);
31    EXPECT_EQ(expected_breakpoint,
32              getNextWordBreakForCache(buf, offset_in, size))
33        << "Expected position is [" << query_str << "] from offset " << offset_in;
34}
35
36void ExpectPrevWordBreakForCache(size_t offset_in, const char* query_str) {
37    const size_t BUF_SIZE = 256U;
38    uint16_t buf[BUF_SIZE];
39    size_t expected_breakpoint = 0U;
40    size_t size = 0U;
41
42    ParseUnicode(buf, BUF_SIZE, query_str, &size, &expected_breakpoint);
43    EXPECT_EQ(expected_breakpoint,
44              getPrevWordBreakForCache(buf, offset_in, size))
45        << "Expected position is [" << query_str << "] from offset " << offset_in;
46}
47
48TEST(WordBreakTest, goNextWordBreakTest) {
49    ExpectNextWordBreakForCache(0, "|");
50
51    // Continue for spaces.
52    ExpectNextWordBreakForCache(0, "'a' 'b' 'c' 'd' |");
53    ExpectNextWordBreakForCache(1, "'a' 'b' 'c' 'd' |");
54    ExpectNextWordBreakForCache(2, "'a' 'b' 'c' 'd' |");
55    ExpectNextWordBreakForCache(3, "'a' 'b' 'c' 'd' |");
56    ExpectNextWordBreakForCache(4, "'a' 'b' 'c' 'd' |");
57    ExpectNextWordBreakForCache(1000, "'a' 'b' 'c' 'd' |");
58
59    // Space makes word break.
60    ExpectNextWordBreakForCache(0, "'a' 'b' | U+0020 'c' 'd'");
61    ExpectNextWordBreakForCache(1, "'a' 'b' | U+0020 'c' 'd'");
62    ExpectNextWordBreakForCache(2, "'a' 'b' U+0020 | 'c' 'd'");
63    ExpectNextWordBreakForCache(3, "'a' 'b' U+0020 'c' 'd' |");
64    ExpectNextWordBreakForCache(4, "'a' 'b' U+0020 'c' 'd' |");
65    ExpectNextWordBreakForCache(5, "'a' 'b' U+0020 'c' 'd' |");
66    ExpectNextWordBreakForCache(1000, "'a' 'b' U+0020 'c' 'd' |");
67
68    ExpectNextWordBreakForCache(0, "'a' 'b' | U+2000 'c' 'd'");
69    ExpectNextWordBreakForCache(1, "'a' 'b' | U+2000 'c' 'd'");
70    ExpectNextWordBreakForCache(2, "'a' 'b' U+2000 | 'c' 'd'");
71    ExpectNextWordBreakForCache(3, "'a' 'b' U+2000 'c' 'd' |");
72    ExpectNextWordBreakForCache(4, "'a' 'b' U+2000 'c' 'd' |");
73    ExpectNextWordBreakForCache(5, "'a' 'b' U+2000 'c' 'd' |");
74    ExpectNextWordBreakForCache(1000, "'a' 'b' U+2000 'c' 'd' |");
75
76    ExpectNextWordBreakForCache(0, "'a' 'b' | U+2000 U+2000 'c' 'd'");
77    ExpectNextWordBreakForCache(1, "'a' 'b' | U+2000 U+2000 'c' 'd'");
78    ExpectNextWordBreakForCache(2, "'a' 'b' U+2000 | U+2000 'c' 'd'");
79    ExpectNextWordBreakForCache(3, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
80    ExpectNextWordBreakForCache(4, "'a' 'b' U+2000 U+2000 'c' 'd' |");
81    ExpectNextWordBreakForCache(5, "'a' 'b' U+2000 U+2000 'c' 'd' |");
82    ExpectNextWordBreakForCache(6, "'a' 'b' U+2000 U+2000 'c' 'd' |");
83    ExpectNextWordBreakForCache(1000, "'a' 'b' U+2000 U+2000 'c' 'd' |");
84
85    // CJK ideographs makes word break.
86    ExpectNextWordBreakForCache(0, "U+4E00 | U+4E00   U+4E00   U+4E00   U+4E00");
87    ExpectNextWordBreakForCache(1, "U+4E00   U+4E00 | U+4E00   U+4E00   U+4E00");
88    ExpectNextWordBreakForCache(2, "U+4E00   U+4E00   U+4E00 | U+4E00   U+4E00");
89    ExpectNextWordBreakForCache(3, "U+4E00   U+4E00   U+4E00   U+4E00 | U+4E00");
90    ExpectNextWordBreakForCache(4, "U+4E00   U+4E00   U+4E00   U+4E00   U+4E00 |");
91    ExpectNextWordBreakForCache(5, "U+4E00   U+4E00   U+4E00   U+4E00   U+4E00 |");
92    ExpectNextWordBreakForCache(1000,
93                             "U+4E00   U+4E00   U+4E00   U+4E00   U+4E00 |");
94
95    ExpectNextWordBreakForCache(0, "U+4E00 | U+4E8C   U+4E09   U+56DB   U+4E94");
96    ExpectNextWordBreakForCache(1, "U+4E00   U+4E8C | U+4E09   U+56DB   U+4E94");
97    ExpectNextWordBreakForCache(2, "U+4E00   U+4E8C   U+4E09 | U+56DB   U+4E94");
98    ExpectNextWordBreakForCache(3, "U+4E00   U+4E8C   U+4E09   U+56DB | U+4E94");
99    ExpectNextWordBreakForCache(4, "U+4E00   U+4E8C   U+4E09   U+56DB   U+4E94 |");
100    ExpectNextWordBreakForCache(5, "U+4E00   U+4E8C   U+4E09   U+56DB   U+4E94 |");
101    ExpectNextWordBreakForCache(1000,
102                             "U+4E00   U+4E8C   U+4E09   U+56DB   U+4E94 |");
103
104    ExpectNextWordBreakForCache(0, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
105    ExpectNextWordBreakForCache(1, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
106    ExpectNextWordBreakForCache(2, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
107    ExpectNextWordBreakForCache(3, "U+4E00 'a' 'b' U+2000 | 'c' U+4E00");
108    ExpectNextWordBreakForCache(4, "U+4E00 'a' 'b' U+2000 'c' | U+4E00");
109    ExpectNextWordBreakForCache(5, "U+4E00 'a' 'b' U+2000 'c' U+4E00 |");
110    ExpectNextWordBreakForCache(1000, "U+4E00 'a' 'b' U+2000 'c' U+4E00 |");
111
112    // Continue if trailing characters is Unicode combining characters.
113    ExpectNextWordBreakForCache(0, "U+4E00 U+0332 | U+4E00");
114    ExpectNextWordBreakForCache(1, "U+4E00 U+0332 | U+4E00");
115    ExpectNextWordBreakForCache(2, "U+4E00 U+0332 U+4E00 |");
116    ExpectNextWordBreakForCache(3, "U+4E00 U+0332 U+4E00 |");
117    ExpectNextWordBreakForCache(1000, "U+4E00 U+0332 U+4E00 |");
118
119    // Surrogate pairs.
120    ExpectNextWordBreakForCache(0, "U+1F60D U+1F618 |");
121    ExpectNextWordBreakForCache(1, "U+1F60D U+1F618 |");
122    ExpectNextWordBreakForCache(2, "U+1F60D U+1F618 |");
123    ExpectNextWordBreakForCache(3, "U+1F60D U+1F618 |");
124    ExpectNextWordBreakForCache(4, "U+1F60D U+1F618 |");
125    ExpectNextWordBreakForCache(1000, "U+1F60D U+1F618 |");
126
127    // Broken surrogate pairs.
128    // U+D84D is leading surrogate but there is no trailing surrogate for it.
129    ExpectNextWordBreakForCache(0, "U+D84D U+1F618 |");
130    ExpectNextWordBreakForCache(1, "U+D84D U+1F618 |");
131    ExpectNextWordBreakForCache(2, "U+D84D U+1F618 |");
132    ExpectNextWordBreakForCache(3, "U+D84D U+1F618 |");
133    ExpectNextWordBreakForCache(1000, "U+D84D U+1F618 |");
134
135    ExpectNextWordBreakForCache(0, "U+1F618 U+D84D |");
136    ExpectNextWordBreakForCache(1, "U+1F618 U+D84D |");
137    ExpectNextWordBreakForCache(2, "U+1F618 U+D84D |");
138    ExpectNextWordBreakForCache(3, "U+1F618 U+D84D |");
139    ExpectNextWordBreakForCache(1000, "U+1F618 U+D84D |");
140
141    // U+DE0D is trailing surrogate but there is no leading surrogate for it.
142    ExpectNextWordBreakForCache(0, "U+DE0D U+1F618 |");
143    ExpectNextWordBreakForCache(1, "U+DE0D U+1F618 |");
144    ExpectNextWordBreakForCache(2, "U+DE0D U+1F618 |");
145    ExpectNextWordBreakForCache(3, "U+DE0D U+1F618 |");
146    ExpectNextWordBreakForCache(1000, "U+DE0D U+1F618 |");
147
148    ExpectNextWordBreakForCache(0, "U+1F618 U+DE0D |");
149    ExpectNextWordBreakForCache(1, "U+1F618 U+DE0D |");
150    ExpectNextWordBreakForCache(2, "U+1F618 U+DE0D |");
151    ExpectNextWordBreakForCache(3, "U+1F618 U+DE0D |");
152    ExpectNextWordBreakForCache(1000, "U+1F618 U+DE0D |");
153
154    // Regional indicator pair. U+1F1FA U+1F1F8 is US national flag.
155    ExpectNextWordBreakForCache(0, "U+1F1FA U+1F1F8 |");
156    ExpectNextWordBreakForCache(1, "U+1F1FA U+1F1F8 |");
157    ExpectNextWordBreakForCache(2, "U+1F1FA U+1F1F8 |");
158    ExpectNextWordBreakForCache(1000, "U+1F1FA U+1F1F8 |");
159
160    // Tone marks.
161    // CJK ideographic char + Tone mark + CJK ideographic char
162    ExpectNextWordBreakForCache(0, "U+4444 U+302D | U+4444");
163    ExpectNextWordBreakForCache(1, "U+4444 U+302D | U+4444");
164    ExpectNextWordBreakForCache(2, "U+4444 U+302D U+4444 |");
165    ExpectNextWordBreakForCache(3, "U+4444 U+302D U+4444 |");
166    ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+4444 |");
167
168    // Variation Selectors.
169    // CJK Ideographic char + Variation Selector(VS1) + CJK Ideographic char
170    ExpectNextWordBreakForCache(0, "U+845B U+FE00 | U+845B");
171    ExpectNextWordBreakForCache(1, "U+845B U+FE00 | U+845B");
172    ExpectNextWordBreakForCache(2, "U+845B U+FE00 U+845B |");
173    ExpectNextWordBreakForCache(3, "U+845B U+FE00 U+845B |");
174    ExpectNextWordBreakForCache(1000, "U+845B U+FE00 U+845B |");
175
176    // CJK Ideographic char + Variation Selector(VS17) + CJK Ideographic char
177    ExpectNextWordBreakForCache(0, "U+845B U+E0100 | U+845B");
178    ExpectNextWordBreakForCache(1, "U+845B U+E0100 | U+845B");
179    ExpectNextWordBreakForCache(2, "U+845B U+E0100 | U+845B");
180    ExpectNextWordBreakForCache(3, "U+845B U+E0100 U+845B |");
181    ExpectNextWordBreakForCache(4, "U+845B U+E0100 U+845B |");
182    ExpectNextWordBreakForCache(5, "U+845B U+E0100 U+845B |");
183    ExpectNextWordBreakForCache(1000, "U+845B U+E0100 U+845B |");
184
185    // CJK ideographic char + Tone mark + Variation Character(VS1)
186    ExpectNextWordBreakForCache(0, "U+4444 U+302D U+FE00 | U+4444");
187    ExpectNextWordBreakForCache(1, "U+4444 U+302D U+FE00 | U+4444");
188    ExpectNextWordBreakForCache(2, "U+4444 U+302D U+FE00 | U+4444");
189    ExpectNextWordBreakForCache(3, "U+4444 U+302D U+FE00 U+4444 |");
190    ExpectNextWordBreakForCache(4, "U+4444 U+302D U+FE00 U+4444 |");
191    ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+FE00 U+4444 |");
192
193    // CJK ideographic char + Tone mark + Variation Character(VS17)
194    ExpectNextWordBreakForCache(0, "U+4444 U+302D U+E0100 | U+4444");
195    ExpectNextWordBreakForCache(1, "U+4444 U+302D U+E0100 | U+4444");
196    ExpectNextWordBreakForCache(2, "U+4444 U+302D U+E0100 | U+4444");
197    ExpectNextWordBreakForCache(3, "U+4444 U+302D U+E0100 | U+4444");
198    ExpectNextWordBreakForCache(4, "U+4444 U+302D U+E0100 U+4444 |");
199    ExpectNextWordBreakForCache(5, "U+4444 U+302D U+E0100 U+4444 |");
200    ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+E0100 U+4444 |");
201
202    // CJK ideographic char + Variation Character(VS1) + Tone mark
203    ExpectNextWordBreakForCache(0, "U+4444 U+FE00 U+302D | U+4444");
204    ExpectNextWordBreakForCache(1, "U+4444 U+FE00 U+302D | U+4444");
205    ExpectNextWordBreakForCache(2, "U+4444 U+FE00 U+302D | U+4444");
206    ExpectNextWordBreakForCache(3, "U+4444 U+FE00 U+302D U+4444 |");
207    ExpectNextWordBreakForCache(4, "U+4444 U+FE00 U+302D U+4444 |");
208    ExpectNextWordBreakForCache(1000, "U+4444 U+FE00 U+302D U+4444 |");
209
210    // CJK ideographic char + Variation Character(VS17) + Tone mark
211    ExpectNextWordBreakForCache(0, "U+4444 U+E0100 U+302D | U+4444");
212    ExpectNextWordBreakForCache(1, "U+4444 U+E0100 U+302D | U+4444");
213    ExpectNextWordBreakForCache(2, "U+4444 U+E0100 U+302D | U+4444");
214    ExpectNextWordBreakForCache(3, "U+4444 U+E0100 U+302D | U+4444");
215    ExpectNextWordBreakForCache(4, "U+4444 U+E0100 U+302D U+4444 |");
216    ExpectNextWordBreakForCache(5, "U+4444 U+E0100 U+302D U+4444 |");
217    ExpectNextWordBreakForCache(1000, "U+4444 U+E0100 U+302D U+4444 |");
218
219    // Following test cases are unusual usage of variation selectors and tone
220    // marks for caching up the further behavior changes, e.g. index of bounds
221    // or crashes. Please feel free to update the test expectations if the
222    // behavior change makes sense to you.
223
224    // Isolated Tone marks and Variation Selectors
225    ExpectNextWordBreakForCache(0, "U+FE00 |");
226    ExpectNextWordBreakForCache(1, "U+FE00 |");
227    ExpectNextWordBreakForCache(1000, "U+FE00 |");
228    ExpectNextWordBreakForCache(0, "U+E0100 |");
229    ExpectNextWordBreakForCache(1000, "U+E0100 |");
230    ExpectNextWordBreakForCache(0, "U+302D |");
231    ExpectNextWordBreakForCache(1000, "U+302D |");
232
233    // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS1)
234    ExpectNextWordBreakForCache(0, "U+845B U+FE00 U+FE00 | U+845B");
235    ExpectNextWordBreakForCache(1, "U+845B U+FE00 U+FE00 | U+845B");
236    ExpectNextWordBreakForCache(2, "U+845B U+FE00 U+FE00 | U+845B");
237    ExpectNextWordBreakForCache(3, "U+845B U+FE00 U+FE00 U+845B |");
238    ExpectNextWordBreakForCache(4, "U+845B U+FE00 U+FE00 U+845B |");
239    ExpectNextWordBreakForCache(1000, "U+845B U+FE00 U+FE00 U+845B |");
240
241    // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS17)
242    ExpectNextWordBreakForCache(0, "U+845B U+E0100 U+E0100 | U+845B");
243    ExpectNextWordBreakForCache(1, "U+845B U+E0100 U+E0100 | U+845B");
244    ExpectNextWordBreakForCache(2, "U+845B U+E0100 U+E0100 | U+845B");
245    ExpectNextWordBreakForCache(3, "U+845B U+E0100 U+E0100 | U+845B");
246    ExpectNextWordBreakForCache(4, "U+845B U+E0100 U+E0100 | U+845B");
247    ExpectNextWordBreakForCache(5, "U+845B U+E0100 U+E0100 U+845B |");
248    ExpectNextWordBreakForCache(6, "U+845B U+E0100 U+E0100 U+845B |");
249    ExpectNextWordBreakForCache(1000,
250                             "U+845B U+E0100 U+E0100 U+845B |");
251
252    // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS17)
253    ExpectNextWordBreakForCache(0, "U+845B U+FE00 U+E0100 | U+845B");
254    ExpectNextWordBreakForCache(1, "U+845B U+FE00 U+E0100 | U+845B");
255    ExpectNextWordBreakForCache(2, "U+845B U+FE00 U+E0100 | U+845B");
256    ExpectNextWordBreakForCache(3, "U+845B U+FE00 U+E0100 | U+845B");
257    ExpectNextWordBreakForCache(4, "U+845B U+FE00 U+E0100 U+845B |");
258    ExpectNextWordBreakForCache(5, "U+845B U+FE00 U+E0100 U+845B |");
259    ExpectNextWordBreakForCache(1000, "U+845B U+FE00 U+E0100 U+845B |");
260
261    // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS1)
262    ExpectNextWordBreakForCache(0, "U+845B U+E0100 U+FE00 | U+845B");
263    ExpectNextWordBreakForCache(1, "U+845B U+E0100 U+FE00 | U+845B");
264    ExpectNextWordBreakForCache(2, "U+845B U+E0100 U+FE00 | U+845B");
265    ExpectNextWordBreakForCache(3, "U+845B U+E0100 U+FE00 | U+845B");
266    ExpectNextWordBreakForCache(4, "U+845B U+E0100 U+FE00 U+845B |");
267    ExpectNextWordBreakForCache(5, "U+845B U+E0100 U+FE00 U+845B |");
268    ExpectNextWordBreakForCache(1000, "U+845B U+E0100 U+FE00 U+845B |");
269
270    // Tone mark. + Tone mark
271    ExpectNextWordBreakForCache(0, "U+4444 U+302D U+302D | U+4444");
272    ExpectNextWordBreakForCache(1, "U+4444 U+302D U+302D | U+4444");
273    ExpectNextWordBreakForCache(2, "U+4444 U+302D U+302D | U+4444");
274    ExpectNextWordBreakForCache(3, "U+4444 U+302D U+302D U+4444 |");
275    ExpectNextWordBreakForCache(4, "U+4444 U+302D U+302D U+4444 |");
276    ExpectNextWordBreakForCache(1000, "U+4444 U+302D U+302D U+4444 |");
277}
278
279TEST(WordBreakTest, goPrevWordBreakTest) {
280    ExpectPrevWordBreakForCache(0, "|");
281
282    // Continue for spaces.
283    ExpectPrevWordBreakForCache(0, "| 'a' 'b' 'c' 'd'");
284    ExpectPrevWordBreakForCache(1, "| 'a' 'b' 'c' 'd'");
285    ExpectPrevWordBreakForCache(2, "| 'a' 'b' 'c' 'd'");
286    ExpectPrevWordBreakForCache(3, "| 'a' 'b' 'c' 'd'");
287    ExpectPrevWordBreakForCache(4, "| 'a' 'b' 'c' 'd'");
288    ExpectPrevWordBreakForCache(1000, "| 'a' 'b' 'c' 'd'");
289
290    // Space makes word break.
291    ExpectPrevWordBreakForCache(0, "| 'a' 'b' U+0020 'c' 'd'");
292    ExpectPrevWordBreakForCache(1, "| 'a' 'b' U+0020 'c' 'd'");
293    ExpectPrevWordBreakForCache(2, "| 'a' 'b' U+0020 'c' 'd'");
294    ExpectPrevWordBreakForCache(3, "'a' 'b' | U+0020 'c' 'd'");
295    ExpectPrevWordBreakForCache(4, "'a' 'b' U+0020 | 'c' 'd'");
296    ExpectPrevWordBreakForCache(5, "'a' 'b' U+0020 | 'c' 'd'");
297    ExpectPrevWordBreakForCache(1000, "'a' 'b' U+0020 | 'c' 'd'");
298
299    ExpectPrevWordBreakForCache(0, "| 'a' 'b' U+2000 'c' 'd'");
300    ExpectPrevWordBreakForCache(1, "| 'a' 'b' U+2000 'c' 'd'");
301    ExpectPrevWordBreakForCache(2, "| 'a' 'b' U+2000 'c' 'd'");
302    ExpectPrevWordBreakForCache(3, "'a' 'b' | U+2000 'c' 'd'");
303    ExpectPrevWordBreakForCache(4, "'a' 'b' U+2000 | 'c' 'd'");
304    ExpectPrevWordBreakForCache(5, "'a' 'b' U+2000 | 'c' 'd'");
305    ExpectPrevWordBreakForCache(1000, "'a' 'b' U+2000 | 'c' 'd'");
306
307    ExpectPrevWordBreakForCache(0, "| 'a' 'b' U+2000 U+2000 'c' 'd'");
308    ExpectPrevWordBreakForCache(1, "| 'a' 'b' U+2000 U+2000 'c' 'd'");
309    ExpectPrevWordBreakForCache(2, "| 'a' 'b' U+2000 U+2000 'c' 'd'");
310    ExpectPrevWordBreakForCache(3, "'a' 'b' | U+2000 U+2000 'c' 'd'");
311    ExpectPrevWordBreakForCache(4, "'a' 'b' U+2000 | U+2000 'c' 'd'");
312    ExpectPrevWordBreakForCache(5, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
313    ExpectPrevWordBreakForCache(6, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
314    ExpectPrevWordBreakForCache(1000, "'a' 'b' U+2000 U+2000 | 'c' 'd'");
315
316    // CJK ideographs makes word break.
317    ExpectPrevWordBreakForCache(0, "| U+4E00 U+4E00 U+4E00 U+4E00 U+4E00");
318    ExpectPrevWordBreakForCache(1, "| U+4E00 U+4E00 U+4E00 U+4E00 U+4E00");
319    ExpectPrevWordBreakForCache(2, "U+4E00 | U+4E00 U+4E00 U+4E00 U+4E00");
320    ExpectPrevWordBreakForCache(3, "U+4E00 U+4E00 | U+4E00 U+4E00 U+4E00");
321    ExpectPrevWordBreakForCache(4, "U+4E00 U+4E00 U+4E00 | U+4E00 U+4E00");
322    ExpectPrevWordBreakForCache(5, "U+4E00 U+4E00 U+4E00 U+4E00 | U+4E00");
323    ExpectPrevWordBreakForCache(1000, "U+4E00 U+4E00 U+4E00 U+4E00 | U+4E00");
324
325    ExpectPrevWordBreakForCache(0, "| U+4E00 U+4E8C U+4E09 U+56DB U+4E94");
326    ExpectPrevWordBreakForCache(1, "| U+4E00 U+4E8C U+4E09 U+56DB U+4E94");
327    ExpectPrevWordBreakForCache(2, "U+4E00 | U+4E8C U+4E09 U+56DB U+4E94");
328    ExpectPrevWordBreakForCache(3, "U+4E00 U+4E8C | U+4E09 U+56DB U+4E94");
329    ExpectPrevWordBreakForCache(4, "U+4E00 U+4E8C U+4E09 | U+56DB U+4E94");
330    ExpectPrevWordBreakForCache(5, "U+4E00 U+4E8C U+4E09 U+56DB | U+4E94");
331    ExpectPrevWordBreakForCache(1000, "U+4E00 U+4E8C U+4E09 U+56DB | U+4E94");
332
333    // Mixed case.
334    ExpectPrevWordBreakForCache(0, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
335    ExpectPrevWordBreakForCache(1, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
336    ExpectPrevWordBreakForCache(2, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
337    ExpectPrevWordBreakForCache(3, "| U+4E00 'a' 'b' U+2000 'c' U+4E00");
338    ExpectPrevWordBreakForCache(4, "U+4E00 'a' 'b' | U+2000 'c' U+4E00");
339    ExpectPrevWordBreakForCache(5, "U+4E00 'a' 'b' U+2000 | 'c' U+4E00");
340    ExpectPrevWordBreakForCache(6, "U+4E00 'a' 'b' U+2000 'c' | U+4E00");
341    ExpectPrevWordBreakForCache(1000, "U+4E00 'a' 'b' U+2000 'c' | U+4E00");
342
343    // Continue if trailing characters is Unicode combining characters.
344    ExpectPrevWordBreakForCache(0, "| U+4E00 U+0332 U+4E00");
345    ExpectPrevWordBreakForCache(1, "| U+4E00 U+0332 U+4E00");
346    ExpectPrevWordBreakForCache(2, "| U+4E00 U+0332 U+4E00");
347    ExpectPrevWordBreakForCache(3, "U+4E00 U+0332 | U+4E00");
348    ExpectPrevWordBreakForCache(1000, "U+4E00 U+0332 | U+4E00");
349
350    // Surrogate pairs.
351    ExpectPrevWordBreakForCache(0, "| U+1F60D U+1F618");
352    ExpectPrevWordBreakForCache(1, "| U+1F60D U+1F618");
353    ExpectPrevWordBreakForCache(2, "| U+1F60D U+1F618");
354    ExpectPrevWordBreakForCache(3, "| U+1F60D U+1F618");
355    ExpectPrevWordBreakForCache(4, "| U+1F60D U+1F618");
356    ExpectPrevWordBreakForCache(1000, "| U+1F60D U+1F618");
357
358    // Broken surrogate pairs.
359    // U+D84D is leading surrogate but there is no trailing surrogate for it.
360    ExpectPrevWordBreakForCache(0, "| U+D84D U+1F618");
361    ExpectPrevWordBreakForCache(1, "| U+D84D U+1F618");
362    ExpectPrevWordBreakForCache(2, "| U+D84D U+1F618");
363    ExpectPrevWordBreakForCache(3, "| U+D84D U+1F618");
364    ExpectPrevWordBreakForCache(1000, "| U+D84D U+1F618");
365
366    ExpectPrevWordBreakForCache(0, "| U+1F618 U+D84D");
367    ExpectPrevWordBreakForCache(1, "| U+1F618 U+D84D");
368    ExpectPrevWordBreakForCache(2, "| U+1F618 U+D84D");
369    ExpectPrevWordBreakForCache(3, "| U+1F618 U+D84D");
370    ExpectPrevWordBreakForCache(1000, "| U+1F618 U+D84D");
371
372    // U+DE0D is trailing surrogate but there is no leading surrogate for it.
373    ExpectPrevWordBreakForCache(0, "| U+DE0D U+1F618");
374    ExpectPrevWordBreakForCache(1, "| U+DE0D U+1F618");
375    ExpectPrevWordBreakForCache(2, "| U+DE0D U+1F618");
376    ExpectPrevWordBreakForCache(3, "| U+DE0D U+1F618");
377    ExpectPrevWordBreakForCache(1000, "| U+DE0D U+1F618");
378
379    ExpectPrevWordBreakForCache(0, "| U+1F618 U+DE0D");
380    ExpectPrevWordBreakForCache(1, "| U+1F618 U+DE0D");
381    ExpectPrevWordBreakForCache(2, "| U+1F618 U+DE0D");
382    ExpectPrevWordBreakForCache(3, "| U+1F618 U+DE0D");
383    ExpectPrevWordBreakForCache(1000, "| U+1F618 U+DE0D");
384
385    // Regional indicator pair. U+1F1FA U+1F1F8 is US national flag.
386    ExpectPrevWordBreakForCache(0, "| U+1F1FA U+1F1F8");
387    ExpectPrevWordBreakForCache(1, "| U+1F1FA U+1F1F8");
388    ExpectPrevWordBreakForCache(2, "| U+1F1FA U+1F1F8");
389    ExpectPrevWordBreakForCache(1000, "| U+1F1FA U+1F1F8");
390
391    // Tone marks.
392    // CJK ideographic char + Tone mark + CJK ideographic char
393    ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+4444");
394    ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+4444");
395    ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+4444");
396    ExpectPrevWordBreakForCache(3, "U+4444 U+302D | U+4444");
397    ExpectPrevWordBreakForCache(1000, "U+4444 U+302D | U+4444");
398
399    // Variation Selectors.
400    // CJK Ideographic char + Variation Selector(VS1) + CJK Ideographic char
401    ExpectPrevWordBreakForCache(0, "| U+845B U+FE00 U+845B");
402    ExpectPrevWordBreakForCache(1, "| U+845B U+FE00 U+845B");
403    ExpectPrevWordBreakForCache(2, "| U+845B U+FE00 U+845B");
404    ExpectPrevWordBreakForCache(3, "U+845B U+FE00 | U+845B");
405    ExpectPrevWordBreakForCache(1000, "U+845B U+FE00 | U+845B");
406
407    // CJK Ideographic char + Variation Selector(VS17) + CJK Ideographic char
408    ExpectPrevWordBreakForCache(0, "| U+845B U+E0100 U+845B");
409    ExpectPrevWordBreakForCache(1, "| U+845B U+E0100 U+845B");
410    ExpectPrevWordBreakForCache(2, "| U+845B U+E0100 U+845B");
411    ExpectPrevWordBreakForCache(3, "| U+845B U+E0100 U+845B");
412    ExpectPrevWordBreakForCache(4, "U+845B U+E0100 | U+845B");
413    ExpectPrevWordBreakForCache(5, "U+845B U+E0100 | U+845B");
414    ExpectPrevWordBreakForCache(1000, "U+845B U+E0100 | U+845B");
415
416    // CJK ideographic char + Tone mark + Variation Character(VS1)
417    ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+FE00 U+4444");
418    ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+FE00 U+4444");
419    ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+FE00 U+4444");
420    ExpectPrevWordBreakForCache(3, "| U+4444 U+302D U+FE00 U+4444");
421    ExpectPrevWordBreakForCache(4, "U+4444 U+302D U+FE00 | U+4444");
422    ExpectPrevWordBreakForCache(1000, "U+4444 U+302D U+FE00 | U+4444");
423
424    // CJK ideographic char + Tone mark + Variation Character(VS17)
425    ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+E0100 U+4444");
426    ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+E0100 U+4444");
427    ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+E0100 U+4444");
428    ExpectPrevWordBreakForCache(3, "| U+4444 U+302D U+E0100 U+4444");
429    ExpectPrevWordBreakForCache(4, "| U+4444 U+302D U+E0100 U+4444");
430    ExpectPrevWordBreakForCache(5, "U+4444 U+302D U+E0100 | U+4444");
431    ExpectPrevWordBreakForCache(1000, "U+4444 U+302D U+E0100 | U+4444");
432
433    // CJK ideographic char + Variation Character(VS1) + Tone mark
434    ExpectPrevWordBreakForCache(0, "| U+4444 U+FE00 U+302D U+4444");
435    ExpectPrevWordBreakForCache(1, "| U+4444 U+FE00 U+302D U+4444");
436    ExpectPrevWordBreakForCache(2, "| U+4444 U+FE00 U+302D U+4444");
437    ExpectPrevWordBreakForCache(3, "| U+4444 U+FE00 U+302D U+4444");
438    ExpectPrevWordBreakForCache(4, "U+4444 U+FE00 U+302D | U+4444");
439    ExpectPrevWordBreakForCache(1000, "U+4444 U+FE00 U+302D | U+4444");
440
441    // CJK ideographic char + Variation Character(VS17) + Tone mark
442    ExpectPrevWordBreakForCache(0, "| U+4444 U+E0100 U+302D U+4444");
443    ExpectPrevWordBreakForCache(1, "| U+4444 U+E0100 U+302D U+4444");
444    ExpectPrevWordBreakForCache(2, "| U+4444 U+E0100 U+302D U+4444");
445    ExpectPrevWordBreakForCache(3, "| U+4444 U+E0100 U+302D U+4444");
446    ExpectPrevWordBreakForCache(4, "| U+4444 U+E0100 U+302D U+4444");
447    ExpectPrevWordBreakForCache(5, "U+4444 U+E0100 U+302D | U+4444");
448    ExpectPrevWordBreakForCache(1000, "U+4444 U+E0100 U+302D | U+4444");
449
450    // Following test cases are unusual usage of variation selectors and tone
451    // marks for caching up the further behavior changes, e.g. index of bounds
452    // or crashes. Please feel free to update the test expectations if the
453    // behavior change makes sense to you.
454
455    // Isolated Tone marks and Variation Selectors
456    ExpectPrevWordBreakForCache(0, "| U+FE00");
457    ExpectPrevWordBreakForCache(1, "| U+FE00");
458    ExpectPrevWordBreakForCache(1000, "| U+FE00");
459    ExpectPrevWordBreakForCache(0, "| U+E0100");
460    ExpectPrevWordBreakForCache(1000, "| U+E0100");
461    ExpectPrevWordBreakForCache(0, "| U+302D");
462    ExpectPrevWordBreakForCache(1000, "| U+302D");
463
464    // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS1)
465    ExpectPrevWordBreakForCache(0, "| U+845B U+FE00 U+FE00 U+845B");
466    ExpectPrevWordBreakForCache(1, "| U+845B U+FE00 U+FE00 U+845B");
467    ExpectPrevWordBreakForCache(2, "| U+845B U+FE00 U+FE00 U+845B");
468    ExpectPrevWordBreakForCache(3, "| U+845B U+FE00 U+FE00 U+845B");
469    ExpectPrevWordBreakForCache(4, "U+845B U+FE00 U+FE00 | U+845B");
470    ExpectPrevWordBreakForCache(1000, "U+845B U+FE00 U+FE00 | U+845B");
471
472    // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS17)
473    ExpectPrevWordBreakForCache(0, "| U+845B U+E0100 U+E0100 U+845B");
474    ExpectPrevWordBreakForCache(1, "| U+845B U+E0100 U+E0100 U+845B");
475    ExpectPrevWordBreakForCache(2, "| U+845B U+E0100 U+E0100 U+845B");
476    ExpectPrevWordBreakForCache(3, "| U+845B U+E0100 U+E0100 U+845B");
477    ExpectPrevWordBreakForCache(4, "| U+845B U+E0100 U+E0100 U+845B");
478    ExpectPrevWordBreakForCache(5, "| U+845B U+E0100 U+E0100 U+845B");
479    ExpectPrevWordBreakForCache(6, "U+845B U+E0100 U+E0100 | U+845B");
480    ExpectPrevWordBreakForCache(1000,
481                             "U+845B U+E0100 U+E0100 | U+845B");
482
483    // CJK Ideographic char + Variation Selector(VS1) + Variation Selector(VS17)
484    ExpectPrevWordBreakForCache(0, "| U+845B U+FE00 U+E0100 U+845B");
485    ExpectPrevWordBreakForCache(1, "| U+845B U+FE00 U+E0100 U+845B");
486    ExpectPrevWordBreakForCache(2, "| U+845B U+FE00 U+E0100 U+845B");
487    ExpectPrevWordBreakForCache(3, "| U+845B U+FE00 U+E0100 U+845B");
488    ExpectPrevWordBreakForCache(4, "| U+845B U+FE00 U+E0100 U+845B");
489    ExpectPrevWordBreakForCache(5, "U+845B U+FE00 U+E0100 | U+845B");
490    ExpectPrevWordBreakForCache(1000, "U+845B U+FE00 U+E0100 | U+845B");
491
492    // CJK Ideographic char + Variation Selector(VS17) + Variation Selector(VS1)
493    ExpectPrevWordBreakForCache(0, "| U+845B U+E0100 U+FE00 U+845B");
494    ExpectPrevWordBreakForCache(1, "| U+845B U+E0100 U+FE00 U+845B");
495    ExpectPrevWordBreakForCache(2, "| U+845B U+E0100 U+FE00 U+845B");
496    ExpectPrevWordBreakForCache(3, "| U+845B U+E0100 U+FE00 U+845B");
497    ExpectPrevWordBreakForCache(4, "| U+845B U+E0100 U+FE00 U+845B");
498    ExpectPrevWordBreakForCache(5, "U+845B U+E0100 U+FE00 | U+845B");
499    ExpectPrevWordBreakForCache(1000, "U+845B U+E0100 U+FE00 | U+845B");
500
501    // Tone mark. + Tone mark
502    ExpectPrevWordBreakForCache(0, "| U+4444 U+302D U+302D U+4444");
503    ExpectPrevWordBreakForCache(1, "| U+4444 U+302D U+302D U+4444");
504    ExpectPrevWordBreakForCache(2, "| U+4444 U+302D U+302D U+4444");
505    ExpectPrevWordBreakForCache(3, "| U+4444 U+302D U+302D U+4444");
506    ExpectPrevWordBreakForCache(4, "U+4444 U+302D U+302D | U+4444");
507    ExpectPrevWordBreakForCache(1000, "U+4444 U+302D U+302D | U+4444");
508}
509
510}  // namespace
511