1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Defines an iterator class that enumerates words supported by our spellchecker
6// from multi-language text. This class is used for filtering out characters
7// not supported by our spellchecker.
8
9#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
10#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
11
12#include <string>
13
14#include "base/basictypes.h"
15#include "base/memory/scoped_ptr.h"
16#include "base/strings/string16.h"
17#include "third_party/icu/source/common/unicode/uscript.h"
18
19namespace base {
20namespace i18n {
21class BreakIterator;  // Forward-declared; only held through a scoped_ptr here.
22} // namespace i18n
23} // namespace base
24
25// A class which encapsulates language-specific operations used by
26// SpellcheckWordIterator. When we set the spellchecker language, this class
27// creates rule sets that filter out the characters not supported by the
28// spellchecker. (Please read the comment in the SpellcheckWordIterator class
29// about how to use this class.)
30class SpellcheckCharAttribute {
31 public:
32  SpellcheckCharAttribute();
33  ~SpellcheckCharAttribute();
34
35  // Sets the language of the spellchecker. When this function is called with an
36  // ISO language code, this function creates the custom rule-sets used by
37  // the ICU break iterator so it can extract only words used by the language.
38  // GetRuleSet() returns the rule-sets created in this function.
39  void SetDefaultLanguage(const std::string& language);
40
41  // Returns a custom rule-set string used by the ICU break iterator. This class
42  // has two rule-sets: one splits a contraction and the other does not, so we
43  // can split a concatenated word (e.g. "seven-year-old") into words (e.g.
44  // "seven", "year", and "old") and check their spellings. The result string is
45  // encoded in UTF-16 since ICU needs UTF-16 strings.
46  base::string16 GetRuleSet(bool allow_contraction) const;
47
48  // Outputs a character only if it is a word character. (Please read the
49  // comments on CreateRuleSets() for why we need this function.)
50  bool OutputChar(UChar c, base::string16* output) const;
51
52 private:
53  // Creates the rule-sets that return words possibly used by the given
54  // language. Unfortunately, these rule-sets are not perfect and have some
55  // false-positives. For example, they return combined accent marks even though
56  // we need English words only. We call OutputChar() to filter out such
57  // false-positive characters.
58  void CreateRuleSets(const std::string& language);
59
60  // Outputs a character only if it is one used by the given language. These
61  // functions are called from OutputChar().
62  bool OutputArabic(UChar c, base::string16* output) const;
63  bool OutputHangul(UChar c, base::string16* output) const;
64  bool OutputHebrew(UChar c, base::string16* output) const;
65  bool OutputDefault(UChar c, base::string16* output) const;
66
67  // The custom rule-set strings used by the ICU break iterator. Since it is not
68  // easy to create custom rule-sets from an ISO language code, this class
69  // saves the rule-set strings created when we set the language.
70  base::string16 ruleset_allow_contraction_;
71  base::string16 ruleset_disallow_contraction_;
72
73  // The script code used by this language.
74  UScriptCode script_code_;
75
76  DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
77};
78
79// A class which extracts words that can be checked for spelling from a
80// multi-language string. The ICU word-break iterator does not discard some
81// punctuation characters attached to a word. For example, when we set a word
82// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
83// it discard characters not used by the language. For example, it returns
84// Russian words even though we need English words only. To extract only the
85// words whose spellings our spellchecker can check, this class uses custom
86// rule-sets created by the SpellcheckCharAttribute class. Also, this class
87// normalizes extracted words so our spellchecker can check the spellings of
88// words that include ligatures, combined characters, full-width characters,
89// etc. This class uses UTF-16 strings as its input and output strings since
90// UTF-16 is the native encoding of ICU, which avoids unnecessary conversions
91// when changing the encoding of this string for our spellchecker. (Chrome can
92// use two or more spellcheckers and we cannot assume their encodings.)
93// The following snippet is an example that extracts words with this class.
94//
95//   // Creates the language-specific attributes for US English.
96//   SpellcheckCharAttribute attribute;
97//   attribute.SetDefaultLanguage("en-US");
98//
99//   // Set up a SpellcheckWordIterator object which extracts English words,
100//   // and retrieve them.
101//   SpellcheckWordIterator iterator;
102//   base::string16 text(base::UTF8ToUTF16("this is a test."));
103//   iterator.Initialize(&attribute, true);
104//   iterator.SetText(text.c_str(), text.length());
105//
106//   base::string16 word;
107//   int offset;
108//   int length;
109//   while (iterator.GetNextWord(&word, &offset, &length)) {
110//     ...
111//   }
112//
113class SpellcheckWordIterator {
114 public:
115  SpellcheckWordIterator();
116  ~SpellcheckWordIterator();
117
118  // Initializes a word-iterator object with the language-specific attribute. If
119  // we need to split contractions and concatenated words, call this function
120  // with its 'allow_contraction' parameter false. (This function uses lots of
121  // temporary memory to compile a custom word-break rule into an automaton.)
122  bool Initialize(const SpellcheckCharAttribute* attribute,
123                  bool allow_contraction);
124
125  // Returns whether this word iterator is initialized.
126  bool IsInitialized() const;
127
128  // Sets text to be iterated. (This text does not have to be NULL-terminated.)
129  // This function also resets internal state so we can reuse this iterator
130  // without calling Initialize().
131  bool SetText(const base::char16* text, size_t length);
132
133  // Retrieves a word (or a contraction), stores its copy to 'word_string', and
134  // stores the position and the length of the input word to 'word_start' and
135  // 'word_length'. Since this function normalizes the output word, the length
136  // of 'word_string' may be different from 'word_length'. Therefore, when we
137  // call functions that change the input text, such as string16::replace(), we
138  // need to use 'word_start' and 'word_length' as in the following snippet.
139  //
140  //   while(iterator.GetNextWord(&word, &offset, &length))
141  //     text.replace(offset, length, word);
142  //
143  bool GetNextWord(base::string16* word_string,
144                   int* word_start,
145                   int* word_length);
146
147  // Releases all the resources attached to this object.
148  void Reset();
149
150 private:
151  // Normalizes a non-terminated string returned from an ICU word-break
152  // iterator. A word returned from an ICU break iterator may include characters
153  // not supported by our spellchecker, e.g. ligatures, combining characters,
154  // full-width letters, etc. This function replaces such characters with
155  // alternative characters supported by our spellchecker. This function also
156  // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
157  // characters.
158  bool Normalize(int input_start,
159                 int input_length,
160                 base::string16* output_string) const;
161
162  // The pointer to the input string from which we are extracting words.
163  const base::char16* text_;
164
165  // The language-specific attributes used for filtering out non-word
166  // characters.
167  const SpellcheckCharAttribute* attribute_;
168
169  // The break iterator.
170  scoped_ptr<base::i18n::BreakIterator> iterator_;
171
172  DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
173};
174
175#endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
176
177