15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/tools/convert_dict/dic_reader.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
101320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "base/files/file_util.h"
117d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/string_util.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/tools/convert_dict/aff_reader.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/tools/convert_dict/hunspell_reader.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace convert_dict {
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// The set of distinct affix group IDs collected for a single word.
typedef std::set<int> AffixIdSet;

// Associates every distinct word with the affix group IDs seen for it. Being a
// std::map, iteration visits the words in ascending key order.
typedef std::map<std::string, AffixIdSet> WordSet;
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Splits one .dic line into the word and, when present, the text that follows
// the separator slash.
//
// |output| is cleared first. Its first element is always the word: everything
// before the separator, with every escaped slash ("\/") turned into a plain
// "/". A second element is appended only when at least one character follows
// the separator, so |output| ends up with one or two elements.
//
// The separator is the first '/' that is neither the first character of the
// line nor immediately preceded by a backslash. An empty |line| yields a
// single empty word.
void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
  output->clear();

  // Locate the separator. Index 0 is skipped because a slash at the very
  // beginning of the line is part of the word, not a separator.
  size_t slash_index = line.size();
  for (size_t i = 1; i < line.size(); i++) {
    if (line[i] == '/' && line[i - 1] != '\\') {
      slash_index = i;
      break;
    }
  }

  // Everything before the separator is the word. Drop the backslash of each
  // "\/" pair; resuming the search one past the kept slash makes this a
  // single left-to-right, non-overlapping replacement.
  std::string word = line.substr(0, slash_index);
  for (size_t pos = word.find("\\/"); pos != std::string::npos;
       pos = word.find("\\/", pos + 1)) {
    word.erase(pos, 1);
  }
  output->push_back(word);

  // Everything (if anything) after the separator is the second term. The
  // comparison is written as |slash_index + 1 < size| on purpose: the form
  // |slash_index < size - 1| wraps around for an empty line and would then
  // request a substring starting past the end of the string.
  if (slash_index + 1 < line.size())
    output->push_back(line.substr(slash_index + 1));
}
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This function reads words from a .dic file, or a .dic_delta file. Note that
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we read 'all' the words in the file, irrespective of the word count given
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the first non empty line of a .dic file. Also note that, for a .dic_delta
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// file, the first line actually does _not_ have the number of words. In order
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to control this, we use the |file_has_word_count_in_the_first_line|
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// parameter to tell this method whether the first non empty line in the file
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// contains the number of words or not. If it does, skip the first line. If it
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// does not, then the first line contains a word.
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     const char* file_type, const char* encoding,
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     bool file_has_word_count_in_the_first_line) {
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int line_number = 0;
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (!feof(file)) {
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::string line = ReadLine(file);
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    line_number++;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    StripComment(&line);
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (line.empty())
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (file_has_word_count_in_the_first_line) {
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Skip the first nonempty line, this is the line count. We don't bother
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // with it and just read all the lines.
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      file_has_word_count_in_the_first_line = false;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::vector<std::string> split;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    SplitDicLine(line, &split);
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (split.empty() || split.size() > 2) {
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      printf("Line %d has extra slashes in the %s file\n", line_number,
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             file_type);
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return false;
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // The first part is the word, the second (optional) part is the affix. We
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // always use UTF-8 as the encoding to simplify life.
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::string utf8word;
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::string encoding_string(encoding);
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (encoding_string == "UTF-8") {
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      utf8word = split[0];
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             line_number, encoding, file_type);
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return false;
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We always convert the affix to an index. 0 means no affix.
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int affix_index = 0;
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (split.size() == 2) {
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Got a rule, which is the stuff after the slash. The line may also have
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // an optional term separated by a tab. This is the morphological
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // description. We don't care about this (it is used in the tests to
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // generate a nice dump), so we remove it.
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      size_t split1_tab_offset = split[1].find('\t');
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (split1_tab_offset != std::string::npos)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        split[1] = split[1].substr(0, split1_tab_offset);
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (aff_reader->has_indexed_affixes())
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        affix_index = atoi(split[1].c_str());
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      else
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        affix_index = aff_reader->GetAFIndexForAFString(split[1]);
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Discard the morphological description if it is attached to the first
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // token. (It is attached to the first token if a word doesn't have affix
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // rules.)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    size_t word_tab_offset = utf8word.find('\t');
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (word_tab_offset != std::string::npos)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      utf8word = utf8word.substr(0, word_tab_offset);
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    WordSet::iterator found = word_set->find(utf8word);
1172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    std::set<int> affix_vector;
1182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    affix_vector.insert(affix_index);
1192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    if (found == word_set->end())
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      word_set->insert(std::make_pair(utf8word, affix_vector));
1222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    else
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      found->second.insert(affix_index);
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)DicReader::DicReader(const base::FilePath& path) {
132a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  file_ = base::OpenFile(path, "r");
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  base::FilePath additional_path =
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta"));
136a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  additional_words_file_ = base::OpenFile(additional_path, "r");
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (additional_words_file_)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    printf("Reading %" PRFilePath " ...\n", additional_path.value().c_str());
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  else
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    printf("%" PRFilePath " not found.\n", additional_path.value().c_str());
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DicReader::~DicReader() {
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (file_)
146a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    base::CloseFile(file_);
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (additional_words_file_)
148a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    base::CloseFile(additional_words_file_);
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DicReader::Read(AffReader* aff_reader) {
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!file_)
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  WordSet word_set;
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Add words from the dic file to the word set.
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Note that the first line is the word count in the file.
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!PopulateWordSet(&word_set, file_, aff_reader, "dic",
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       aff_reader->encoding(), true))
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Add words from the .dic_delta file to the word set, if it exists.
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The first line is the first word to add. Word count line is not present.
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // NOTE: These additional words should be encoded as UTF-8.
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (additional_words_file_ != NULL) {
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    "UTF-8", false);
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Make sure the words are sorted, they may be unsorted in the input.
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (WordSet::iterator word = word_set.begin(); word != word_set.end();
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       ++word) {
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::vector<int> affixes;
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (std::set<int>::iterator aff = word->second.begin();
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         aff != word->second.end(); ++aff)
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      affixes.push_back(*aff);
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Double check that the affixes are sorted. This isn't strictly necessary
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // but it's nice for the file to have a fixed layout.
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    std::sort(affixes.begin(), affixes.end());
1812a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    std::reverse(affixes.begin(), affixes.end());
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    words_.push_back(std::make_pair(word->first, affixes));
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Double-check that the words are sorted.
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::sort(words_.begin(), words_.end());
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace convert_dict
191