// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/tools/convert_dict/dic_reader.h"

#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "base/files/file_util.h"
#include "base/strings/string_util.h"
#include "chrome/tools/convert_dict/aff_reader.h"
#include "chrome/tools/convert_dict/hunspell_reader.h"

namespace convert_dict {

namespace {

// Maps each unique word to the unique affix group IDs associated with it.
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef std::map<std::string, std::set<int> > WordSet; 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SplitDicLine(const std::string& line, std::vector<std::string>* output) { 232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // We split the line on a slash not preceded by a backslash. A slash at the 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // beginning of the line is not a separator either. 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t slash_index = line.size(); 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < line.size(); i++) { 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (line[i] == '/' && i > 0 && line[i - 1] != '\\') { 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) slash_index = i; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) output->clear(); 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Everything before the slash index is the first term. We also need to 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // convert all escaped slashes ("\/" sequences) to regular slashes. 
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string word = line.substr(0, slash_index); 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) output->push_back(word); 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Everything (if anything) after the slash is the second. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (slash_index < line.size() - 1) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) output->push_back(line.substr(slash_index + 1)); 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This function reads words from a .dic file, or a .dic_delta file. Note that 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we read 'all' the words in the file, irrespective of the word count given 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the first non empty line of a .dic file. Also note that, for a .dic_delta 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// file, the first line actually does _not_ have the number of words. In order 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to control this, we use the |file_has_word_count_in_the_first_line| 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// parameter to tell this method whether the first non empty line in the file 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// contains the number of words or not. If it does, skip the first line. If it 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// does not, then the first line contains a word. 
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* file_type, const char* encoding, 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool file_has_word_count_in_the_first_line) { 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int line_number = 0; 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (!feof(file)) { 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string line = ReadLine(file); 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) line_number++; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StripComment(&line); 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (line.empty()) 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (file_has_word_count_in_the_first_line) { 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Skip the first nonempty line, this is the line count. We don't bother 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // with it and just read all the lines. 
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) file_has_word_count_in_the_first_line = false; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<std::string> split; 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SplitDicLine(line, &split); 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (split.empty() || split.size() > 2) { 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) printf("Line %d has extra slashes in the %s file\n", line_number, 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) file_type); 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The first part is the word, the second (optional) part is the affix. We 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // always use UTF-8 as the encoding to simplify life. 
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string utf8word; 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string encoding_string(encoding); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (encoding_string == "UTF-8") { 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf8word = split[0]; 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) line_number, encoding, file_type); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We always convert the affix to an index. 0 means no affix. 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int affix_index = 0; 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (split.size() == 2) { 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Got a rule, which is the stuff after the slash. The line may also have 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // an optional term separated by a tab. This is the morphological 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // description. We don't care about this (it is used in the tests to 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // generate a nice dump), so we remove it. 
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t split1_tab_offset = split[1].find('\t'); 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (split1_tab_offset != std::string::npos) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) split[1] = split[1].substr(0, split1_tab_offset); 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (aff_reader->has_indexed_affixes()) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) affix_index = atoi(split[1].c_str()); 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) affix_index = aff_reader->GetAFIndexForAFString(split[1]); 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Discard the morphological description if it is attached to the first 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // token. (It is attached to the first token if a word doesn't have affix 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // rules.) 
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t word_tab_offset = utf8word.find('\t'); 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_tab_offset != std::string::npos) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utf8word = utf8word.substr(0, word_tab_offset); 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WordSet::iterator found = word_set->find(utf8word); 1172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) std::set<int> affix_vector; 1182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) affix_vector.insert(affix_index); 1192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (found == word_set->end()) 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_set->insert(std::make_pair(utf8word, affix_vector)); 1222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) else 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) found->second.insert(affix_index); 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)DicReader::DicReader(const base::FilePath& path) { 132a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) file_ = base::OpenFile(path, "r"); 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 
base::FilePath additional_path = 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta")); 136a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) additional_words_file_ = base::OpenFile(additional_path, "r"); 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (additional_words_file_) 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) printf("Reading %" PRFilePath " ...\n", additional_path.value().c_str()); 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) printf("%" PRFilePath " not found.\n", additional_path.value().c_str()); 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DicReader::~DicReader() { 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (file_) 146a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::CloseFile(file_); 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (additional_words_file_) 148a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::CloseFile(additional_words_file_); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DicReader::Read(AffReader* aff_reader) { 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!file_) 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WordSet word_set; 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add words from the dic file to the word set. 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Note that the first line is the word count in the file. 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) aff_reader->encoding(), true)) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add words from the .dic_delta file to the word set, if it exists. 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The first line is the first word to add. Word count line is not present. 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // NOTE: These additional words should be encoded as UTF-8. 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (additional_words_file_ != NULL) { 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "UTF-8", false); 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make sure the words are sorted, they may be unsorted in the input. 
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (WordSet::iterator word = word_set.begin(); word != word_set.end(); 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++word) { 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<int> affixes; 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (std::set<int>::iterator aff = word->second.begin(); 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) aff != word->second.end(); ++aff) 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) affixes.push_back(*aff); 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Double check that the affixes are sorted. This isn't strictly necessary 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // but it's nice for the file to have a fixed layout. 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::sort(affixes.begin(), affixes.end()); 1812a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) std::reverse(affixes.begin(), affixes.end()); 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) words_.push_back(std::make_pair(word->first, affixes)); 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Double-check that the words are sorted. 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::sort(words_.begin(), words_.end()); 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace convert_dict 191