1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <map>
6#include <string>
7
8#include "base/files/file_util.h"
9#include "base/format_macros.h"
10#include "base/i18n/icu_string_conversions.h"
11#include "base/strings/stringprintf.h"
12#include "base/strings/utf_string_conversions.h"
13#include "chrome/tools/convert_dict/aff_reader.h"
14#include "chrome/tools/convert_dict/dic_reader.h"
15#include "testing/gtest/include/gtest/gtest.h"
16#include "third_party/hunspell/google/bdict_reader.h"
17#include "third_party/hunspell/google/bdict_writer.h"
18
19namespace {
20
21// Compares the given word list with the serialized trie to make sure they
22// are the same.
23// (This function is copied from "chrome/tools/convert_dict/convert_dict.cc").
24bool VerifyWords(const convert_dict::DicReader::WordList& org_words,
25                 const std::string& serialized) {
26  hunspell::BDictReader reader;
27  EXPECT_TRUE(
28      reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()),
29      serialized.size()));
30
31  hunspell::WordIterator iter = reader.GetAllWordIterator();
32
33  int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
34
35  static const int kBufSize = 128;
36  char buf[kBufSize];
37  for (size_t i = 0; i < org_words.size(); i++) {
38    SCOPED_TRACE(base::StringPrintf(
39        "org_words[%" PRIuS "]: %s", i, org_words[i].first.c_str()));
40
41    int affix_matches = iter.Advance(buf, kBufSize, affix_ids);
42    EXPECT_NE(0, affix_matches);
43    EXPECT_EQ(org_words[i].first, std::string(buf));
44    EXPECT_EQ(affix_matches, static_cast<int>(org_words[i].second.size()));
45
46    // Check the individual affix indices.
47    for (size_t affix_index = 0; affix_index < org_words[i].second.size();
48         affix_index++) {
49      EXPECT_EQ(affix_ids[affix_index], org_words[i].second[affix_index]);
50    }
51  }
52
53  return true;
54}
55
56// Implements the test process used by ConvertDictTest.
57// This function encapsulates all complicated operations used by
58// ConvertDictTest so we can conceal them from the tests themselves.
59// This function consists of the following parts:
60// * Creates a dummy affix file and a dictionary file.
61// * Reads the dummy files.
62// * Creates bdict data.
63// * Verify the bdict data.
64void RunDictionaryTest(const char* codepage,
65                       const std::map<base::string16, bool>& word_list) {
66  // Create an affix data and a dictionary data.
67  std::string aff_data(base::StringPrintf("SET %s\n", codepage));
68
69  std::string dic_data(base::StringPrintf("%" PRIuS "\n", word_list.size()));
70  for (std::map<base::string16, bool>::const_iterator it = word_list.begin();
71       it != word_list.end(); ++it) {
72    std::string encoded_word;
73    EXPECT_TRUE(UTF16ToCodepage(it->first,
74                                codepage,
75                                base::OnStringConversionError::FAIL,
76                                &encoded_word));
77    dic_data += encoded_word;
78    dic_data += "\n";
79  }
80
81  // Create a temporary affix file and a dictionary file from the test data.
82  base::FilePath aff_file;
83  base::CreateTemporaryFile(&aff_file);
84  base::WriteFile(aff_file, aff_data.c_str(), aff_data.length());
85
86  base::FilePath dic_file;
87  base::CreateTemporaryFile(&dic_file);
88  base::WriteFile(dic_file, dic_data.c_str(), dic_data.length());
89
90  {
91    // Read the above affix file with AffReader and read the dictionary file
92    // with DicReader, respectively.
93    convert_dict::AffReader aff_reader(aff_file);
94    EXPECT_TRUE(aff_reader.Read());
95
96    convert_dict::DicReader dic_reader(dic_file);
97    EXPECT_TRUE(dic_reader.Read(&aff_reader));
98
99    // Verify this DicReader includes all the input words.
100    EXPECT_EQ(word_list.size(), dic_reader.words().size());
101    for (size_t i = 0; i < dic_reader.words().size(); ++i) {
102      SCOPED_TRACE(base::StringPrintf("dic_reader.words()[%" PRIuS "]: %s",
103                                      i, dic_reader.words()[i].first.c_str()));
104      base::string16 word(base::UTF8ToUTF16(dic_reader.words()[i].first));
105      EXPECT_TRUE(word_list.find(word) != word_list.end());
106    }
107
108    // Create BDICT data and verify it.
109    hunspell::BDictWriter writer;
110    writer.SetComment(aff_reader.comments());
111    writer.SetAffixRules(aff_reader.affix_rules());
112    writer.SetAffixGroups(aff_reader.GetAffixGroups());
113    writer.SetReplacements(aff_reader.replacements());
114    writer.SetOtherCommands(aff_reader.other_commands());
115    writer.SetWords(dic_reader.words());
116
117    std::string bdict_data = writer.GetBDict();
118    VerifyWords(dic_reader.words(), bdict_data);
119    EXPECT_TRUE(hunspell::BDict::Verify(bdict_data.data(), bdict_data.size()));
120
121    // Trim the end of this BDICT and verify our verifier tells these trimmed
122    // BDICTs are corrupted.
123    for (size_t i = 1; i < bdict_data.size(); ++i) {
124      SCOPED_TRACE(base::StringPrintf("i = %" PRIuS, i));
125      EXPECT_FALSE(hunspell::BDict::Verify(bdict_data.data(),
126                                           bdict_data.size() - i));
127    }
128  }
129
130  // Deletes the temporary files.
131  // We need to delete them after the above AffReader and DicReader are deleted
132  // since they close the input files in their destructors.
133  base::DeleteFile(aff_file, false);
134  base::DeleteFile(dic_file, false);
135}
136
137}  // namespace
138
139// Tests whether or not our DicReader can read all the input English words
140TEST(ConvertDictTest, English) {
141  const char kCodepage[] = "UTF-8";
142  const wchar_t* kWords[] = {
143    L"I",
144    L"he",
145    L"she",
146    L"it",
147    L"we",
148    L"you",
149    L"they",
150  };
151
152  std::map<base::string16, bool> word_list;
153  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
154    word_list.insert(
155        std::make_pair<base::string16, bool>(base::WideToUTF16(kWords[i]),
156                                             true));
157
158  RunDictionaryTest(kCodepage, word_list);
159}
160
161// Tests whether or not our DicReader can read all the input Russian words.
162TEST(ConvertDictTest, Russian) {
163  const char kCodepage[] = "KOI8-R";
164  const wchar_t* kWords[] = {
165    L"\x044f",
166    L"\x0442\x044b",
167    L"\x043e\x043d",
168    L"\x043e\x043d\x0430",
169    L"\x043e\x043d\x043e",
170    L"\x043c\x044b",
171    L"\x0432\x044b",
172    L"\x043e\x043d\x0438",
173  };
174
175  std::map<base::string16, bool> word_list;
176  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
177    word_list.insert(
178        std::make_pair<base::string16, bool>(base::WideToUTF16(kWords[i]),
179                                             true));
180
181  RunDictionaryTest(kCodepage, word_list);
182}
183
184// Tests whether or not our DicReader can read all the input Hungarian words.
185TEST(ConvertDictTest, Hungarian) {
186  const char kCodepage[] = "ISO8859-2";
187  const wchar_t* kWords[] = {
188    L"\x00e9\x006e",
189    L"\x0074\x0065",
190    L"\x0151",
191    L"\x00f6\x006e",
192    L"\x006d\x0061\x0067\x0061",
193    L"\x006d\x0069",
194    L"\x0074\x0069",
195    L"\x0151\x006b",
196    L"\x00f6\x006e\x00f6\x006b",
197    L"\x006d\x0061\x0067\x0075\x006b",
198  };
199
200  std::map<base::string16, bool> word_list;
201  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
202    word_list.insert(
203        std::make_pair<base::string16, bool>(base::WideToUTF16(kWords[i]),
204                                             true));
205
206  RunDictionaryTest(kCodepage, word_list);
207}
208