break_iterator_unittest.cc revision 868fa2fe829687343ffae624259930155e16dbd8
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/i18n/break_iterator.h"
6
7#include "base/strings/string_piece.h"
8#include "base/strings/stringprintf.h"
9#include "base/strings/utf_string_conversions.h"
10#include "testing/gtest/include/gtest/gtest.h"
11
12namespace base {
13namespace i18n {
14
15TEST(BreakIteratorTest, BreakWordEmpty) {
16  string16 empty;
17  BreakIterator iter(empty, BreakIterator::BREAK_WORD);
18  ASSERT_TRUE(iter.Init());
19  EXPECT_FALSE(iter.Advance());
20  EXPECT_FALSE(iter.IsWord());
21  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
22  EXPECT_FALSE(iter.IsWord());
23}
24
25TEST(BreakIteratorTest, BreakWord) {
26  string16 space(UTF8ToUTF16(" "));
27  string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
28  BreakIterator iter(str, BreakIterator::BREAK_WORD);
29  ASSERT_TRUE(iter.Init());
30  EXPECT_TRUE(iter.Advance());
31  EXPECT_FALSE(iter.IsWord());
32  EXPECT_EQ(space, iter.GetString());
33  EXPECT_TRUE(iter.Advance());
34  EXPECT_TRUE(iter.IsWord());
35  EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetString());
36  EXPECT_TRUE(iter.Advance());
37  EXPECT_FALSE(iter.IsWord());
38  EXPECT_EQ(space, iter.GetString());
39  EXPECT_TRUE(iter.Advance());
40  EXPECT_TRUE(iter.IsWord());
41  EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetString());
42  EXPECT_TRUE(iter.Advance());
43  EXPECT_FALSE(iter.IsWord());
44  EXPECT_EQ(UTF8ToUTF16("!"), iter.GetString());
45  EXPECT_TRUE(iter.Advance());
46  EXPECT_FALSE(iter.IsWord());
47  EXPECT_EQ(space, iter.GetString());
48  EXPECT_TRUE(iter.Advance());
49  EXPECT_FALSE(iter.IsWord());
50  EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetString());
51  EXPECT_TRUE(iter.Advance());
52  EXPECT_TRUE(iter.IsWord());
53  EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetString());
54  EXPECT_TRUE(iter.Advance());
55  EXPECT_FALSE(iter.IsWord());
56  EXPECT_EQ(space, iter.GetString());
57  EXPECT_TRUE(iter.Advance());
58  EXPECT_TRUE(iter.IsWord());
59  EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
60  EXPECT_FALSE(iter.Advance());
61  EXPECT_FALSE(iter.IsWord());
62  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
63  EXPECT_FALSE(iter.IsWord());
64}
65
66TEST(BreakIteratorTest, BreakWide16) {
67  // Two greek words separated by space.
68  const string16 str(WideToUTF16(
69      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
70      L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
71  const string16 word1(str.substr(0, 10));
72  const string16 word2(str.substr(11, 5));
73  BreakIterator iter(str, BreakIterator::BREAK_WORD);
74  ASSERT_TRUE(iter.Init());
75  EXPECT_TRUE(iter.Advance());
76  EXPECT_TRUE(iter.IsWord());
77  EXPECT_EQ(word1, iter.GetString());
78  EXPECT_TRUE(iter.Advance());
79  EXPECT_FALSE(iter.IsWord());
80  EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
81  EXPECT_TRUE(iter.Advance());
82  EXPECT_TRUE(iter.IsWord());
83  EXPECT_EQ(word2, iter.GetString());
84  EXPECT_FALSE(iter.Advance());
85  EXPECT_FALSE(iter.IsWord());
86  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
87  EXPECT_FALSE(iter.IsWord());
88}
89
90TEST(BreakIteratorTest, BreakWide32) {
91  // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
92  const char* very_wide_char = "\xF0\x9D\x92\x9C";
93  const string16 str(
94      UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char)));
95  const string16 very_wide_word(str.substr(0, 2));
96
97  BreakIterator iter(str, BreakIterator::BREAK_WORD);
98  ASSERT_TRUE(iter.Init());
99  EXPECT_TRUE(iter.Advance());
100  EXPECT_TRUE(iter.IsWord());
101  EXPECT_EQ(very_wide_word, iter.GetString());
102  EXPECT_TRUE(iter.Advance());
103  EXPECT_FALSE(iter.IsWord());
104  EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
105  EXPECT_TRUE(iter.Advance());
106  EXPECT_TRUE(iter.IsWord());
107  EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
108  EXPECT_FALSE(iter.Advance());
109  EXPECT_FALSE(iter.IsWord());
110  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
111  EXPECT_FALSE(iter.IsWord());
112}
113
114TEST(BreakIteratorTest, BreakSpaceEmpty) {
115  string16 empty;
116  BreakIterator iter(empty, BreakIterator::BREAK_SPACE);
117  ASSERT_TRUE(iter.Init());
118  EXPECT_FALSE(iter.Advance());
119  EXPECT_FALSE(iter.IsWord());
120  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
121  EXPECT_FALSE(iter.IsWord());
122}
123
124TEST(BreakIteratorTest, BreakSpace) {
125  string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
126  BreakIterator iter(str, BreakIterator::BREAK_SPACE);
127  ASSERT_TRUE(iter.Init());
128  EXPECT_TRUE(iter.Advance());
129  EXPECT_FALSE(iter.IsWord());
130  EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
131  EXPECT_TRUE(iter.Advance());
132  EXPECT_FALSE(iter.IsWord());
133  EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
134  EXPECT_TRUE(iter.Advance());
135  EXPECT_FALSE(iter.IsWord());
136  EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
137  EXPECT_TRUE(iter.Advance());
138  EXPECT_FALSE(iter.IsWord());
139  EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
140  EXPECT_TRUE(iter.Advance());
141  EXPECT_FALSE(iter.IsWord());
142  EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
143  EXPECT_FALSE(iter.Advance());
144  EXPECT_FALSE(iter.IsWord());
145  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
146  EXPECT_FALSE(iter.IsWord());
147}
148
149TEST(BreakIteratorTest, BreakSpaceSP) {
150  string16 str(UTF8ToUTF16(" foo bar! \npouet boom "));
151  BreakIterator iter(str, BreakIterator::BREAK_SPACE);
152  ASSERT_TRUE(iter.Init());
153  EXPECT_TRUE(iter.Advance());
154  EXPECT_FALSE(iter.IsWord());
155  EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
156  EXPECT_TRUE(iter.Advance());
157  EXPECT_FALSE(iter.IsWord());
158  EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
159  EXPECT_TRUE(iter.Advance());
160  EXPECT_FALSE(iter.IsWord());
161  EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
162  EXPECT_TRUE(iter.Advance());
163  EXPECT_FALSE(iter.IsWord());
164  EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
165  EXPECT_TRUE(iter.Advance());
166  EXPECT_FALSE(iter.IsWord());
167  EXPECT_EQ(UTF8ToUTF16("boom "), iter.GetString());
168  EXPECT_FALSE(iter.Advance());
169  EXPECT_FALSE(iter.IsWord());
170  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
171  EXPECT_FALSE(iter.IsWord());
172}
173
174TEST(BreakIteratorTest, BreakSpacekWide16) {
175  // Two Greek words.
176  const string16 str(WideToUTF16(
177      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
178      L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
179  const string16 word1(str.substr(0, 11));
180  const string16 word2(str.substr(11, 5));
181  BreakIterator iter(str, BreakIterator::BREAK_SPACE);
182  ASSERT_TRUE(iter.Init());
183  EXPECT_TRUE(iter.Advance());
184  EXPECT_FALSE(iter.IsWord());
185  EXPECT_EQ(word1, iter.GetString());
186  EXPECT_TRUE(iter.Advance());
187  EXPECT_FALSE(iter.IsWord());
188  EXPECT_EQ(word2, iter.GetString());
189  EXPECT_FALSE(iter.Advance());
190  EXPECT_FALSE(iter.IsWord());
191  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
192  EXPECT_FALSE(iter.IsWord());
193}
194
195TEST(BreakIteratorTest, BreakSpaceWide32) {
196  // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
197  const char* very_wide_char = "\xF0\x9D\x92\x9C";
198  const string16 str(
199      UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char)));
200  const string16 very_wide_word(str.substr(0, 3));
201
202  BreakIterator iter(str, BreakIterator::BREAK_SPACE);
203  ASSERT_TRUE(iter.Init());
204  EXPECT_TRUE(iter.Advance());
205  EXPECT_FALSE(iter.IsWord());
206  EXPECT_EQ(very_wide_word, iter.GetString());
207  EXPECT_TRUE(iter.Advance());
208  EXPECT_FALSE(iter.IsWord());
209  EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
210  EXPECT_FALSE(iter.Advance());
211  EXPECT_FALSE(iter.IsWord());
212  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
213  EXPECT_FALSE(iter.IsWord());
214}
215
216TEST(BreakIteratorTest, BreakLineEmpty) {
217  string16 empty;
218  BreakIterator iter(empty, BreakIterator::BREAK_NEWLINE);
219  ASSERT_TRUE(iter.Init());
220  EXPECT_FALSE(iter.Advance());
221  EXPECT_FALSE(iter.IsWord());
222  EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
223  EXPECT_FALSE(iter.IsWord());
224}
225
226TEST(BreakIteratorTest, BreakLine) {
227  string16 nl(UTF8ToUTF16("\n"));
228  string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
229  BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
230  ASSERT_TRUE(iter.Init());
231  EXPECT_TRUE(iter.Advance());
232  EXPECT_FALSE(iter.IsWord());
233  EXPECT_EQ(nl, iter.GetString());
234  EXPECT_TRUE(iter.Advance());
235  EXPECT_FALSE(iter.IsWord());
236  EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
237  EXPECT_TRUE(iter.Advance());
238  EXPECT_FALSE(iter.IsWord());
239  EXPECT_EQ(nl, iter.GetString());
240  EXPECT_TRUE(iter.Advance());
241  EXPECT_FALSE(iter.IsWord());
242  EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter.GetString());
243  EXPECT_FALSE(iter.Advance());
244  EXPECT_FALSE(iter.IsWord());
245  EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
246  EXPECT_FALSE(iter.IsWord());
247}
248
249TEST(BreakIteratorTest, BreakLineNL) {
250  string16 nl(UTF8ToUTF16("\n"));
251  string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
252  BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
253  ASSERT_TRUE(iter.Init());
254  EXPECT_TRUE(iter.Advance());
255  EXPECT_FALSE(iter.IsWord());
256  EXPECT_EQ(nl, iter.GetString());
257  EXPECT_TRUE(iter.Advance());
258  EXPECT_FALSE(iter.IsWord());
259  EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
260  EXPECT_TRUE(iter.Advance());
261  EXPECT_FALSE(iter.IsWord());
262  EXPECT_EQ(nl, iter.GetString());
263  EXPECT_TRUE(iter.Advance());
264  EXPECT_FALSE(iter.IsWord());
265  EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter.GetString());
266  EXPECT_FALSE(iter.Advance());
267  EXPECT_FALSE(iter.IsWord());
268  EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
269  EXPECT_FALSE(iter.IsWord());
270}
271
272TEST(BreakIteratorTest, BreakLineWide16) {
273  // Two Greek words separated by newline.
274  const string16 str(WideToUTF16(
275      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
276      L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));
277  const string16 line1(str.substr(0, 11));
278  const string16 line2(str.substr(11, 5));
279  BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
280  ASSERT_TRUE(iter.Init());
281  EXPECT_TRUE(iter.Advance());
282  EXPECT_FALSE(iter.IsWord());
283  EXPECT_EQ(line1, iter.GetString());
284  EXPECT_TRUE(iter.Advance());
285  EXPECT_FALSE(iter.IsWord());
286  EXPECT_EQ(line2, iter.GetString());
287  EXPECT_FALSE(iter.Advance());
288  EXPECT_FALSE(iter.IsWord());
289  EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
290  EXPECT_FALSE(iter.IsWord());
291}
292
293TEST(BreakIteratorTest, BreakLineWide32) {
294  // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
295  const char* very_wide_char = "\xF0\x9D\x92\x9C";
296  const string16 str(
297      UTF8ToUTF16(base::StringPrintf("%s\na", very_wide_char)));
298  const string16 very_wide_line(str.substr(0, 3));
299  BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
300  ASSERT_TRUE(iter.Init());
301  EXPECT_TRUE(iter.Advance());
302  EXPECT_FALSE(iter.IsWord());
303  EXPECT_EQ(very_wide_line, iter.GetString());
304  EXPECT_TRUE(iter.Advance());
305  EXPECT_FALSE(iter.IsWord());
306  EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
307  EXPECT_FALSE(iter.Advance());
308  EXPECT_FALSE(iter.IsWord());
309  EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
310  EXPECT_FALSE(iter.IsWord());
311}
312
313TEST(BreakIteratorTest, BreakCharacter) {
314  static const wchar_t* kCharacters[] = {
315    // An English word consisting of four ASCII characters.
316    L"w", L"o", L"r", L"d", L" ",
317    // A Hindi word (which means "Hindi") consisting of three Devanagari
318    // characters.
319    L"\x0939\x093F", L"\x0928\x094D", L"\x0926\x0940", L" ",
320    // A Thai word (which means "feel") consisting of three Thai characters.
321    L"\x0E23\x0E39\x0E49", L"\x0E2A\x0E36", L"\x0E01", L" ",
322  };
323  std::vector<string16> characters;
324  string16 text;
325  for (size_t i = 0; i < arraysize(kCharacters); ++i) {
326    characters.push_back(WideToUTF16(kCharacters[i]));
327    text.append(characters.back());
328  }
329  BreakIterator iter(text, BreakIterator::BREAK_CHARACTER);
330  ASSERT_TRUE(iter.Init());
331  for (size_t i = 0; i < arraysize(kCharacters); ++i) {
332    EXPECT_TRUE(iter.Advance());
333    EXPECT_EQ(characters[i], iter.GetString());
334  }
335}
336
337}  // namespace i18n
338}  // namespace base
339