// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_TOOLS_CONVERT_DICT_AFF_READER_H__
#define CHROME_TOOLS_CONVERT_DICT_AFF_READER_H__

#include <stdio.h>

#include <map>
#include <string>
#include <utility>
#include <vector>

// Only a reference to FilePath appears in this header, so a forward
// declaration is sufficient.
namespace base {
class FilePath;
}  // namespace base

namespace convert_dict {

// Reads a hunspell affix (.aff) file and exposes what was found in it: the
// leading comment block, the encoding, the affix groups ("AF" lines), the
// SFX/PFX affix rules, the replacement pairs, and any remaining commands.
//
// Read() must be called before the accessors return meaningful data.
class AffReader {
 public:
  // |path| is the location of the .aff file to read.
  explicit AffReader(const base::FilePath& path);
  ~AffReader();

  // Parses the file and populates the data returned by the accessors below.
  // NOTE(review): presumably returns false when the file cannot be opened or
  // parsed -- confirm against the implementation.
  bool Read();

  // Returns whether this file uses indexed affixes, or, on false, whether the
  // rule string will be specified literally in the .dic file. This must be
  // called after Read().
  bool has_indexed_affixes() const { return has_indexed_affixes_; }

  // Returns a string representing the encoding of the dictionary. This will
  // default to ISO-8859-1 if the .aff file does not specify it. The returned
  // pointer refers to storage owned by this object.
  const char* encoding() const { return encoding_.c_str(); }

  // Converts the given string from the file encoding to UTF-8, returning true
  // on success.
  bool EncodingToUTF8(const std::string& encoded, std::string* utf8) const;

  // Adds a new affix string, returning the index. If it already exists, returns
  // the index of the existing one. This is used to convert .dic files which
  // list the affix string literally after each word (see
  // has_indexed_affixes()) rather than referring to it by index.
  // NOTE(review): the original comment was truncated after "list the"; the
  // sentence above is a reconstruction -- confirm the intent.
  // You must not call this until after Read().
  int GetAFIndexForAFString(const std::string& af_string);

  // Getters for the computed data.
  const std::string& comments() const { return intro_comment_; }
  const std::vector<std::string>& affix_rules() const { return affix_rules_; }
  const std::vector< std::pair<std::string, std::string> >&
      replacements() const {
    return replacements_;
  }
  const std::vector<std::string>& other_commands() const {
    return other_commands_;
  }

  // Returns the affix groups ("AF" lines) for this file. The indices into this
  // are 1-based, but we don't use the 0th item, so lookups will have to
  // subtract one to get the index. This is how hunspell stores this data.
  std::vector<std::string> GetAffixGroups() const;

 private:
  // Command-specific handlers. These are given the string following the
  // command. The input rule may be modified arbitrarily by the function.
  int AddAffixGroup(std::string* rule);  // Returns the new affix group ID.
  void AddAffix(std::string* rule);  // SFX/PFX
  void AddReplacement(std::string* rule);
  // void HandleFlag(std::string* rule);

  // Used to handle "other" commands. The "raw" just saves the line as-is.
  // The "encoded" version converts the line to UTF-8 and saves it.
  void HandleRawCommand(const std::string& line);
  void HandleEncodedCommand(const std::string& line);

  // Handle for the .aff file being read.
  // NOTE(review): presumably opened by the constructor and closed by the
  // destructor -- confirm against the implementation.
  FILE* file_;

  // Comments from the beginning of the file. This is everything before the
  // first command. We want to store this since it often contains the copyright
  // information.
  std::string intro_comment_;

  // Encoding of the source words.
  std::string encoding_;

  // Affix rules. These are populated by "AF" commands. The .dic file can refer
  // to these by index. They are indexed by their string value (the list of
  // characters representing rules), and map to the numeric affix IDs.
  //
  // These can also be added using GetAFIndexForAFString.
  std::map<std::string, int> affix_groups_;

  // True when the affixes were specified in the .aff file using indices. The
  // dictionary reader uses this to see how it should treat the stuff after the
  // word on each line.
  bool has_indexed_affixes_;

  // SFX and PFX commands. This is a list of each of those lines in the order
  // they appear in the file. They have been re-encoded.
  std::vector<std::string> affix_rules_;

  // Replacement commands. The first string is a possible input, and the second
  // is the replacement.
  std::vector< std::pair<std::string, std::string> > replacements_;

  // All other commands.
  std::vector<std::string> other_commands_;

  // Copying is disallowed: this class holds a raw FILE* and declares a
  // destructor, so a memberwise copy would leave two objects sharing one
  // handle. These are declared private and intentionally left undefined
  // (the pre-C++11 equivalent of "= delete", matching the rest of this file).
  AffReader(const AffReader&);
  AffReader& operator=(const AffReader&);
};

}  // namespace convert_dict

#endif  // CHROME_TOOLS_CONVERT_DICT_AFF_READER_H__