1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <string>
6#include <vector>
7
8#include "base/format_macros.h"
9#include "base/strings/string_split.h"
10#include "base/strings/stringprintf.h"
11#include "base/strings/utf_string_conversions.h"
12#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
13#include "testing/gtest/include/gtest/gtest.h"
14
15namespace {
16
// One row of the SplitWord test table.
struct TestCase {
  // Language code handed to SpellcheckCharAttribute::SetDefaultLanguage().
  const char* language;

  // Passed as the second argument of SpellcheckWordIterator::Initialize().
  bool allow_contraction;

  // Space-separated list of the words the iterator is expected to return, in
  // order, for the shared multi-language input text.
  const wchar_t* expected_words;
};
22
23}  // namespace
24
25// Tests whether or not our SpellcheckWordIterator can extract only words used
26// by the specified language from a multi-language text.
27TEST(SpellcheckWordIteratorTest, SplitWord) {
28  // An input text. This text includes words of several languages. (Some words
29  // are not separated with whitespace characters.) Our SpellcheckWordIterator
30  // should extract only the words used by the specified language from this text
31  // and normalize them so our spell-checker can check their spellings.
32  const wchar_t kTestText[] =
33      // Graphic characters
34      L"!@#$%^&*()"
35      // Latin (including a contraction character and a ligature).
36      L"hello:hello a\xFB03x"
37      // Greek
38      L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
39      // Cyrillic
40      L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
41      L"\x0443\x0439\x0442\x0435"
42      // Hebrew (including niqquds)
43      L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
44      // Hebrew words with U+0027 and U+05F3
45      L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
46      // Hebrew words with U+0022 and U+05F4
47      L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
48      // Hebrew words enclosed with ASCII quotes.
49      L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
50      // Arabic (including vowel marks)
51      L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
52      L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
53      L"\x0652\x0643\x064f\x0645\x0652"
54      // Hindi
55      L"\x0930\x093E\x091C\x0927\x093E\x0928"
56      // Thai
57      L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
58      L"\x0e23\x0e31\x0e1a"
59      // Hiraganas
60      L"\x3053\x3093\x306B\x3061\x306F"
61      // CJKV ideographs
62      L"\x4F60\x597D"
63      // Hangul Syllables
64      L"\xC548\xB155\xD558\xC138\xC694"
65      // Full-width latin : Hello
66      L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
67      L"e.g.,";
68
69  // The languages and expected results used in this test.
70  static const TestCase kTestCases[] = {
71    {
72      // English (keep contraction words)
73      "en-US", true, L"hello:hello affix Hello e.g"
74    }, {
75      // English (split contraction words)
76      "en-US", false, L"hello hello affix Hello e g"
77    }, {
78      // Greek
79      "el-GR", true,
80      L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
81    }, {
82      // Russian
83      "ru-RU", true,
84      L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
85      L"\x0443\x0439\x0442\x0435"
86    }, {
87      // Hebrew
88      "he-IL", true,
89      L"\x05e9\x05dc\x05d5\x05dd "
90      L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
91      L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
92      L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
93    }, {
94      // Arabic
95      "ar", true,
96      L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
97      L"\x0644\x064a\x0643\x0645"
98    }, {
99      // Hindi
100      "hi-IN", true,
101      L"\x0930\x093E\x091C\x0927\x093E\x0928"
102    }, {
103      // Thai
104      "th-TH", true,
105      L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
106      L"\x0e23\x0e31\x0e1a"
107    }, {
108      // Korean
109      "ko-KR", true,
110      L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
111      L"\x1109\x1166\x110b\x116d"
112    },
113  };
114
115  for (size_t i = 0; i < arraysize(kTestCases); ++i) {
116    SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
117                                    kTestCases[i].language));
118
119    SpellcheckCharAttribute attributes;
120    attributes.SetDefaultLanguage(kTestCases[i].language);
121
122    base::string16 input(base::WideToUTF16(kTestText));
123    SpellcheckWordIterator iterator;
124    EXPECT_TRUE(iterator.Initialize(&attributes,
125                                    kTestCases[i].allow_contraction));
126    EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
127
128    std::vector<base::string16> expected_words;
129    base::SplitString(
130        base::WideToUTF16(kTestCases[i].expected_words), ' ', &expected_words);
131
132    base::string16 actual_word;
133    int actual_start, actual_end;
134    size_t index = 0;
135    while (iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
136      EXPECT_TRUE(index < expected_words.size());
137      if (index < expected_words.size())
138        EXPECT_EQ(expected_words[index], actual_word);
139      ++index;
140    }
141  }
142}
143
144// Tests whether our SpellcheckWordIterator extracts an empty word without
145// getting stuck in an infinite loop when inputting a Khmer text. (This is a
146// regression test for Issue 46278.)
147TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
148  SpellcheckCharAttribute attributes;
149  attributes.SetDefaultLanguage("en-US");
150
151  const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
152  base::string16 input(base::WideToUTF16(kTestText));
153
154  SpellcheckWordIterator iterator;
155  EXPECT_TRUE(iterator.Initialize(&attributes, true));
156  EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
157
158  // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
159  // iterator.GetNextWord() call gets stuck in an infinite loop. Therefore, this
160  // test succeeds if this call returns without timeouts.
161  base::string16 actual_word;
162  int actual_start, actual_end;
163  EXPECT_FALSE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
164  EXPECT_EQ(0, actual_start);
165  EXPECT_EQ(0, actual_end);
166}
167
168// Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters
169// on LTR languages. On the other hand, it should not treat ASCII numbers as
170// word characters on RTL languages because they change the text direction from
171// RTL to LTR.
172TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
173  // A set of a language, a dummy word, and a text direction used in this test.
174  // For each language, this test splits a dummy word, which consists of ASCII
175  // numbers and an alphabet of the language, into words. When ASCII numbers are
176  // treated as word characters, the split word becomes equal to the dummy word.
177  // Otherwise, the split word does not include ASCII numbers.
178  static const struct {
179    const char* language;
180    const wchar_t* text;
181    bool left_to_right;
182  } kTestCases[] = {
183    {
184      // English
185      "en-US", L"0123456789" L"a", true,
186    }, {
187      // Greek
188      "el-GR", L"0123456789" L"\x03B1", true,
189    }, {
190      // Russian
191      "ru-RU", L"0123456789" L"\x0430", true,
192    }, {
193      // Hebrew
194      "he-IL", L"0123456789" L"\x05D0", false,
195    }, {
196      // Arabic
197      "ar",  L"0123456789" L"\x0627", false,
198    }, {
199      // Hindi
200      "hi-IN", L"0123456789" L"\x0905", true,
201    }, {
202      // Thai
203      "th-TH", L"0123456789" L"\x0e01", true,
204    }, {
205      // Korean
206      "ko-KR", L"0123456789" L"\x1100\x1161", true,
207    },
208  };
209
210  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) {
211    SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
212                                    kTestCases[i].language));
213
214    SpellcheckCharAttribute attributes;
215    attributes.SetDefaultLanguage(kTestCases[i].language);
216
217    base::string16 input_word(base::WideToUTF16(kTestCases[i].text));
218    SpellcheckWordIterator iterator;
219    EXPECT_TRUE(iterator.Initialize(&attributes, true));
220    EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
221
222    base::string16 actual_word;
223    int actual_start, actual_end;
224    EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
225    if (kTestCases[i].left_to_right)
226      EXPECT_EQ(input_word, actual_word);
227    else
228      EXPECT_NE(input_word, actual_word);
229  }
230}
231
232TEST(SpellcheckWordIteratorTest, Initialization) {
233  // Test initialization works when a default language is set.
234  {
235    SpellcheckCharAttribute attributes;
236    attributes.SetDefaultLanguage("en-US");
237
238    SpellcheckWordIterator iterator;
239    EXPECT_TRUE(iterator.Initialize(&attributes, true));
240  }
241
242  // Test initialization fails when no default language is set.
243  {
244    SpellcheckCharAttribute attributes;
245
246    SpellcheckWordIterator iterator;
247    EXPECT_FALSE(iterator.Initialize(&attributes, true));
248  }
249}
250