spellcheck_worditerator.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Implements a custom word iterator used for our spellchecker.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map>
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/stringprintf.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/utf_string_conversions.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/spellchecker/spellcheck.h"
172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/icu/public/common/unicode/normlzr.h"
182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/icu/public/common/unicode/schriter.h"
192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/icu/public/common/unicode/uscript.h"
202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/icu/public/i18n/unicode/ulocdata.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SpellcheckCharAttribute implementation:
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckCharAttribute::SpellcheckCharAttribute()
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : script_code_(USCRIPT_LATIN) {
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckCharAttribute::~SpellcheckCharAttribute() {
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CreateRuleSets(language);
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)string16 SpellcheckCharAttribute::GetRuleSet(bool allow_contraction) const {
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return allow_contraction ?
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ruleset_allow_contraction_ : ruleset_disallow_contraction_;
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The template for our custom rule sets, which is based on the word-break
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // rules of ICU 4.0:
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The major differences from the original one are listed below:
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It discards comments in the original rules.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It discards characters not needed by our spellchecker (e.g. numbers,
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It allows customization of the $ALetter value (i.e. word characters).
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It allows customization of the $ALetterPlus value (i.e. whether or not to
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   use the dictionary data).
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It allows choosing whether or not to split a text at contraction
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   characters.
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This template only changes the forward-iteration rules. So, calling
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ubrk_prev() returns the same results as the original template.
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const char kRuleTemplate[] =
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!chain;"
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$CR           = [\\p{Word_Break = CR}];"
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$LF           = [\\p{Word_Break = LF}];"
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Newline      = [\\p{Word_Break = Newline}];"
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Extend       = [\\p{Word_Break = Extend}];"
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Format       = [\\p{Word_Break = Format}];"
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Katakana     = [\\p{Word_Break = Katakana}];"
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Not all the characters in a given script are ALetter.
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // For instance, U+05F4 is MidLetter. So, this may be
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // better, but it leads to an empty set error in Thai.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // "$ALetter   = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetter      = [\\p{script=%s}%s];"
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNumLet    = [\\p{Word_Break = MidNumLet}];"
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidLetter    = [\\p{Word_Break = MidLetter}%s];"
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNum       = [\\p{Word_Break = MidNum}];"
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Numeric      = [\\p{Word_Break = Numeric}];"
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Control        = [\\p{Grapheme_Cluster_Break = Control}]; "
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "%s"  // ALetterPlus
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$KatakanaEx     = $Katakana     ($Extend |  $Format)*;"
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;"
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;"
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;"
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNumEx       = $MidNum       ($Extend |  $Format)*;"
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$NumericEx      = $Numeric      ($Extend |  $Format)*;"
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;"
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Hiragana       = [\\p{script=Hiragana}];"
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Ideographic    = [\\p{Ideographic}];"
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;"
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;"
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!forward;"
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$CR $LF;"
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "[^$CR $LF $Newline]? ($Extend |  $Format)+;"
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx {200};"
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx $ALetterEx {200};"
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "%s"  // (Allow|Disallow) Contraction
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!reverse;"
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx     = ($Format | $Extend)* $ALetterPlus;"
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;"
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx     = ($Format | $Extend)* $Numeric;"
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackMidNumEx      = ($Format | $Extend)* $MidNum;"
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackMidLetterEx   = ($Format | $Extend)* $MidLetter;"
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackKatakanaEx    = ($Format | $Extend)* $Katakana;"
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;"
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$LF $CR;"
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($Format | $Extend)*  [^$CR $LF $Newline]?;"
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx $BackALetterEx;"
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;"
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx $BackNumericEx;"
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx $BackALetterEx;"
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx $BackNumericEx;"
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;"
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackKatakanaEx $BackKatakanaEx;"
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |"
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      " $BackKatakanaEx | $BackExtendNumLetEx);"
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)"
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      " $BackExtendNumLetEx;"
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!safe_reverse;"
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($Extend | $Format)+ .?;"
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidLetter | $MidNumLet) $BackALetterEx;"
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidNum | $MidNumLet) $BackNumericEx;"
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!safe_forward;"
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($Extend | $Format)+ .?;"
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidLetterEx | $MidNumLetEx) $ALetterEx;"
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidNumEx | $MidNumLetEx) $NumericEx;";
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Retrieve the script codes used by the given language from ICU. When the
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // given language consists of two or more scripts, we just use the first
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // script. The size of returned script codes is always < 8. Therefore, we use
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // an array of size 8 so we can include all script codes without insufficient
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // buffer errors.
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode error = U_ZERO_ERROR;
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UScriptCode script_code[8];
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int scripts = uscript_getCode(language.c_str(), script_code,
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                arraysize(script_code), &error);
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_SUCCESS(error) && scripts >= 1)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    script_code_ = script_code[0];
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // only for the languages which need it (i.e. Korean and Thai) to prevent ICU
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // from returning dictionary words (i.e. Korean or Thai words) for languages
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // which don't need them.
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* aletter = uscript_getName(script_code_);
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!aletter)
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    aletter = "Latin";
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kWithDictionary[] =
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$dictionary   = [:LineBreak = Complex_Context:];"
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];";
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kWithoutDictionary[] = "$ALetterPlus  = $ALetter;";
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* aletter_plus = kWithoutDictionary;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    aletter_plus = kWithDictionary;
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Treat numbers as word characters except for Arabic and Hebrew.
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* aletter_extra = " [0123456789]";
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC)
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    aletter_extra = "";
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kMidLetterExtra[] = "";
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // For Hebrew, treat single/double quoation marks as MidLetter.
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kMidLetterExtraHebrew[] = "\"'";
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* midletter_extra = kMidLetterExtra;
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code_ == USCRIPT_HEBREW)
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    midletter_extra = kMidLetterExtraHebrew;
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Create two custom rule-sets: one allows contraction and the other does not.
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We save these strings in UTF-16 so we can use it without conversions. (ICU
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // needs UTF-16 strings.)
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kAllowContraction[] =
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};";
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kDisallowContraction[] = "";
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ruleset_allow_contraction_ = ASCIIToUTF16(
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      base::StringPrintf(kRuleTemplate,
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter,
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_extra,
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         midletter_extra,
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_plus,
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         kAllowContraction));
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ruleset_disallow_contraction_ = ASCIIToUTF16(
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      base::StringPrintf(kRuleTemplate,
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter,
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_extra,
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         midletter_extra,
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_plus,
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         kDisallowContraction));
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Call the language-specific function if necessary.
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Otherwise, we call the default one.
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (script_code_) {
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case USCRIPT_ARABIC:
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputArabic(c, output);
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case USCRIPT_HANGUL:
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputHangul(c, output);
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case USCRIPT_HEBREW:
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputHebrew(c, output);
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputDefault(c, output);
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckCharAttribute::OutputArabic(UChar c, string16* output) const {
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Discard characters not from Arabic alphabets. We also discard vowel marks
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // marking an Arabic word including vowel marks as misspelled. (We need to
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // check these vowel marks manually and filter them out since their script
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // codes are USCRIPT_ARABIC.)
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (0x0621 <= c && c <= 0x064D)
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(c);
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckCharAttribute::OutputHangul(UChar c, string16* output) const {
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Decompose a Hangul character to a Hangul vowel and consonants used by our
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker. A Hangul character of Unicode is a ligature consisting of a
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a point of a cubic linear space consisting of (first consonant, vowel, last
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // consonant). Therefore, we can compose a Hangul character from a vowel and
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // two consonants with linear composition:
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   character =  0xAC00 +
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                (first consonant - 0x1100) * 28 * 21 +
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                (vowel           - 0x1161) * 28 +
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                (last consonant  - 0x11A7);
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We can also decompose a Hangul character with linear decomposition:
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   first consonant = (character - 0xAC00) / 28 / 21;
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   vowel           = (character - 0xAC00) / 28 % 21;
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   last consonant  = (character - 0xAC00) % 28;
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This code is copied from Unicode Standard Annex #15
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // <http://unicode.org/reports/tr15> and added some comments.
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kSBase = 0xAC00;  // U+AC00: the top of Hangul characters.
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kLBase = 0x1100;  // U+1100: the top of Hangul first consonants.
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kVBase = 0x1161;  // U+1161: the top of Hangul vowels.
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kTBase = 0x11A7;  // U+11A7: the top of Hangul last consonants.
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kLCount = 19;     // The number of Hangul first consonants.
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kVCount = 21;     // The number of Hangul vowels.
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kTCount = 28;     // The number of Hangul last consonants.
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kNCount = kVCount * kTCount;
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kSCount = kLCount * kNCount;
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int index = c - kSBase;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (index < 0 || index >= kSBase + kSCount) {
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // This is not a Hangul syllable. Call the default output function since we
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // should output this character when it is a Hangul syllable.
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return OutputDefault(c, output);
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This is a Hangul character. Decompose this characters into Hangul vowels
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and consonants.
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int l = kLBase + index / kNCount;
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int v = kVBase + (index % kNCount) / kTCount;
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int t = kTBase + index % kTCount;
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  output->push_back(l);
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  output->push_back(v);
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (t != kTBase)
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(t);
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to prevent our Hebrew dictionary from marking a Hebrew word including
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // niqquds manually and filter them out since their script codes are
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // USCRIPT_HEBREW.)
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Pass through ASCII single/double quotation marks and Hebrew Geresh and
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Gershayim.
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      c == 0x05F4 || c == 0x05F3)
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(c);
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckCharAttribute::OutputDefault(UChar c, string16* output) const {
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Check the script code of this character and output only if it is the one
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // used by the spellchecker language.
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UScriptCode script_code = uscript_getScript(c, &status);
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code == script_code_ || script_code == USCRIPT_COMMON)
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(c);
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SpellcheckWordIterator implementation:
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckWordIterator::SpellcheckWordIterator()
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : text_(NULL),
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      length_(0),
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      position_(UBRK_DONE),
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      attribute_(NULL),
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      iterator_(NULL) {
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckWordIterator::~SpellcheckWordIterator() {
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Reset();
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::Initialize(
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const SpellcheckCharAttribute* attribute,
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    bool allow_contraction) {
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Create a custom ICU break iterator with empty text used in this object. (We
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // allow setting text later so we can re-use this iterator.)
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(attribute);
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode open_status = U_ZERO_ERROR;
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UParseError parse_status;
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string16 rule(attribute->GetRuleSet(allow_contraction));
3152a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
3162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // If there is no rule set, the attributes were invalid.
3172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  if (rule.empty())
3182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    return false;
3192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  iterator_ = ubrk_openRules(rule.c_str(), rule.length(), NULL, 0,
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             &parse_status, &open_status);
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_FAILURE(open_status))
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set the character attributes so we can normalize the words extracted by
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this iterator.
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  attribute_ = attribute;
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::IsInitialized() const {
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Return true if we have an ICU custom iterator.
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return !!iterator_;
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::SetText(const char16* text, size_t length) {
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!!iterator_);
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set the text to be split by this iterator.
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ubrk_setText(iterator_, text, length, &status);
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_FAILURE(status))
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Retrieve the position to the first word in this text. We return false if
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this text does not have any words. (For example, The input text consists
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // only of Chinese characters while the spellchecker language is English.)
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  position_ = ubrk_first(iterator_);
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (position_ == UBRK_DONE)
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  text_ = text;
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  length_ = static_cast<int>(length);
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::GetNextWord(string16* word_string,
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         int* word_start,
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         int* word_length) {
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!!text_ && length_ > 0);
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word_string->clear();
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *word_start = 0;
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *word_length = 0;
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!text_ || position_ == UBRK_DONE)
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Find a word that can be checked for spelling. Our rule sets filter out
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // invalid words (e.g. numbers and characters not supported by the
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker language) so this ubrk_getRuleStatus() call returns
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // words until we can find a valid word or reach the end of the input string.
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int next = ubrk_next(iterator_);
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (next != UBRK_DONE) {
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (ubrk_getRuleStatus(iterator_) != UBRK_WORD_NONE) {
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (Normalize(position_, next - position_, word_string)) {
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        *word_start = position_;
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        *word_length = next - position_;
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        position_ = next;
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return true;
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    position_ = next;
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    next = ubrk_next(iterator_);
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // There aren't any more words in the given text. Set the position to
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // UBRK_DONE to prevent from calling ubrk_next() next time when this function
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is called.
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  position_ = UBRK_DONE;
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpellcheckWordIterator::Reset() {
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (iterator_) {
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ubrk_close(iterator_);
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    iterator_ = NULL;
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::Normalize(int input_start,
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                       int input_length,
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                       string16* output_string) const {
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We use NFKC (Normalization Form, Compatible decomposition, followed by
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // canonical Composition) defined in Unicode Standard Annex #15 to normalize
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this token because it it the most suitable normalization algorithm for our
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker. Nevertheless, it is not a perfect algorithm for our
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker and we need manual normalization as well. The normalized
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // text does not have to be NUL-terminated since its characters are copied to
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // string16, which adds a NUL character when we need.
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::UnicodeString input(FALSE, &text_[input_start], input_length);
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::UnicodeString output;
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Copy the normalized text to the output.
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::StringCharacterIterator it(output);
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    attribute_->OutputChar(c, output_string);
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return !output_string->empty();
4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
426