15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Implements a custom word iterator used for our spellchecker.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map>
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
13f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "base/i18n/break_iterator.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
157d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/stringprintf.h"
16868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/renderer/spellchecker/spellcheck.h"
18ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/normlzr.h"
19ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/schriter.h"
20ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/uscript.h"
21ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/i18n/unicode/ulocdata.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SpellcheckCharAttribute implementation:
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckCharAttribute::SpellcheckCharAttribute()
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : script_code_(USCRIPT_LATIN) {
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckCharAttribute::~SpellcheckCharAttribute() {
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CreateRuleSets(language);
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
36a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)base::string16 SpellcheckCharAttribute::GetRuleSet(
37a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    bool allow_contraction) const {
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return allow_contraction ?
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ruleset_allow_contraction_ : ruleset_disallow_contraction_;
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The template for our custom rule sets, which is based on the word-break
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // rules of ICU 4.0:
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The major differences from the original one are listed below:
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It discards comments in the original rules.
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It discards characters not needed by our spellchecker (e.g. numbers,
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It allows customization of the $ALetter value (i.e. word characters).
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It allows customization of the $ALetterPlus value (i.e. whether or not to
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   use the dictionary data).
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * It allows choosing whether or not to split a text at contraction
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   characters.
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This template only changes the forward-iteration rules. So, calling
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ubrk_prev() returns the same results as the original template.
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const char kRuleTemplate[] =
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!chain;"
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$CR           = [\\p{Word_Break = CR}];"
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$LF           = [\\p{Word_Break = LF}];"
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Newline      = [\\p{Word_Break = Newline}];"
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Extend       = [\\p{Word_Break = Extend}];"
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Format       = [\\p{Word_Break = Format}];"
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Katakana     = [\\p{Word_Break = Katakana}];"
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Not all the characters in a given script are ALetter.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // For instance, U+05F4 is MidLetter. So, this may be
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // better, but it leads to an empty set error in Thai.
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // "$ALetter   = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetter      = [\\p{script=%s}%s];"
705f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      // U+0027 (single quote/apostrophe) is not in MidNumLet any more
715f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      // in UAX 29 rev 21 or later. For our purpose, U+0027
725f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      // has to be treated as MidNumLet. ( http://crbug.com/364072 )
735f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      "$MidNumLet    = [\\p{Word_Break = MidNumLet} \\u0027];"
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidLetter    = [\\p{Word_Break = MidLetter}%s];"
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNum       = [\\p{Word_Break = MidNum}];"
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Numeric      = [\\p{Word_Break = Numeric}];"
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Control        = [\\p{Grapheme_Cluster_Break = Control}]; "
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "%s"  // ALetterPlus
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$KatakanaEx     = $Katakana     ($Extend |  $Format)*;"
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;"
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;"
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;"
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$MidNumEx       = $MidNum       ($Extend |  $Format)*;"
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$NumericEx      = $Numeric      ($Extend |  $Format)*;"
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;"
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Hiragana       = [\\p{script=Hiragana}];"
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$Ideographic    = [\\p{Ideographic}];"
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;"
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;"
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!forward;"
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$CR $LF;"
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "[^$CR $LF $Newline]? ($Extend |  $Format)+;"
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx {200};"
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx $ALetterEx {200};"
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "%s"  // (Allow|Disallow) Contraction
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!reverse;"
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx     = ($Format | $Extend)* $ALetterPlus;"
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;"
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx     = ($Format | $Extend)* $Numeric;"
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackMidNumEx      = ($Format | $Extend)* $MidNum;"
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackMidLetterEx   = ($Format | $Extend)* $MidLetter;"
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackKatakanaEx    = ($Format | $Extend)* $Katakana;"
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;"
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$LF $CR;"
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($Format | $Extend)*  [^$CR $LF $Newline]?;"
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx $BackALetterEx;"
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;"
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx $BackNumericEx;"
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx $BackALetterEx;"
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackALetterEx $BackNumericEx;"
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;"
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackKatakanaEx $BackKatakanaEx;"
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |"
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      " $BackKatakanaEx | $BackExtendNumLetEx);"
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)"
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      " $BackExtendNumLetEx;"
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!safe_reverse;"
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($Extend | $Format)+ .?;"
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidLetter | $MidNumLet) $BackALetterEx;"
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidNum | $MidNumLet) $BackNumericEx;"
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "!!safe_forward;"
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($Extend | $Format)+ .?;"
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidLetterEx | $MidNumLetEx) $ALetterEx;"
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "($MidNumEx | $MidNumLetEx) $NumericEx;";
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Retrieve the script codes used by the given language from ICU. When the
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // given language consists of two or more scripts, we just use the first
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // script. The size of returned script codes is always < 8. Therefore, we use
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // an array of size 8 so we can include all script codes without insufficient
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // buffer errors.
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode error = U_ZERO_ERROR;
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UScriptCode script_code[8];
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int scripts = uscript_getCode(language.c_str(), script_code,
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                arraysize(script_code), &error);
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (U_SUCCESS(error) && scripts >= 1)
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    script_code_ = script_code[0];
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // only for the languages which need it (i.e. Korean and Thai) to prevent ICU
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // from returning dictionary words (i.e. Korean or Thai words) for languages
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // which don't need them.
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* aletter = uscript_getName(script_code_);
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!aletter)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    aletter = "Latin";
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kWithDictionary[] =
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$dictionary   = [:LineBreak = Complex_Context:];"
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];";
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kWithoutDictionary[] = "$ALetterPlus  = $ALetter;";
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* aletter_plus = kWithoutDictionary;
1595f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI ||
1605f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)      script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER)
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    aletter_plus = kWithDictionary;
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Treat numbers as word characters except for Arabic and Hebrew.
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* aletter_extra = " [0123456789]";
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC)
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    aletter_extra = "";
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kMidLetterExtra[] = "";
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // For Hebrew, treat single/double quoation marks as MidLetter.
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kMidLetterExtraHebrew[] = "\"'";
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* midletter_extra = kMidLetterExtra;
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code_ == USCRIPT_HEBREW)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    midletter_extra = kMidLetterExtraHebrew;
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Create two custom rule-sets: one allows contraction and the other does not.
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We save these strings in UTF-16 so we can use it without conversions. (ICU
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // needs UTF-16 strings.)
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kAllowContraction[] =
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};";
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char kDisallowContraction[] = "";
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  ruleset_allow_contraction_ = base::ASCIIToUTF16(
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      base::StringPrintf(kRuleTemplate,
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter,
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_extra,
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         midletter_extra,
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_plus,
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         kAllowContraction));
1895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  ruleset_disallow_contraction_ = base::ASCIIToUTF16(
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      base::StringPrintf(kRuleTemplate,
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter,
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_extra,
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         midletter_extra,
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         aletter_plus,
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         kDisallowContraction));
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
198a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool SpellcheckCharAttribute::OutputChar(UChar c,
199a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                         base::string16* output) const {
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Call the language-specific function if necessary.
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Otherwise, we call the default one.
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (script_code_) {
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case USCRIPT_ARABIC:
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputArabic(c, output);
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case USCRIPT_HANGUL:
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputHangul(c, output);
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case USCRIPT_HEBREW:
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputHebrew(c, output);
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return OutputDefault(c, output);
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
217a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool SpellcheckCharAttribute::OutputArabic(UChar c,
218a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                           base::string16* output) const {
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Discard characters not from Arabic alphabets. We also discard vowel marks
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // marking an Arabic word including vowel marks as misspelled. (We need to
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // check these vowel marks manually and filter them out since their script
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // codes are USCRIPT_ARABIC.)
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (0x0621 <= c && c <= 0x064D)
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(c);
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
229a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool SpellcheckCharAttribute::OutputHangul(UChar c,
230a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                           base::string16* output) const {
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Decompose a Hangul character to a Hangul vowel and consonants used by our
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker. A Hangul character of Unicode is a ligature consisting of a
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a point of a cubic linear space consisting of (first consonant, vowel, last
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // consonant). Therefore, we can compose a Hangul character from a vowel and
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // two consonants with linear composition:
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   character =  0xAC00 +
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                (first consonant - 0x1100) * 28 * 21 +
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                (vowel           - 0x1161) * 28 +
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //                (last consonant  - 0x11A7);
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We can also decompose a Hangul character with linear decomposition:
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   first consonant = (character - 0xAC00) / 28 / 21;
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   vowel           = (character - 0xAC00) / 28 % 21;
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   last consonant  = (character - 0xAC00) % 28;
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This code is copied from Unicode Standard Annex #15
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // <http://unicode.org/reports/tr15> and added some comments.
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kSBase = 0xAC00;  // U+AC00: the top of Hangul characters.
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kLBase = 0x1100;  // U+1100: the top of Hangul first consonants.
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kVBase = 0x1161;  // U+1161: the top of Hangul vowels.
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kTBase = 0x11A7;  // U+11A7: the top of Hangul last consonants.
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kLCount = 19;     // The number of Hangul first consonants.
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kVCount = 21;     // The number of Hangul vowels.
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kTCount = 28;     // The number of Hangul last consonants.
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kNCount = kVCount * kTCount;
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int kSCount = kLCount * kNCount;
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int index = c - kSBase;
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (index < 0 || index >= kSBase + kSCount) {
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // This is not a Hangul syllable. Call the default output function since we
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // should output this character when it is a Hangul syllable.
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return OutputDefault(c, output);
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This is a Hangul character. Decompose this characters into Hangul vowels
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and consonants.
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int l = kLBase + index / kNCount;
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int v = kVBase + (index % kNCount) / kTCount;
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int t = kTBase + index % kTCount;
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  output->push_back(l);
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  output->push_back(v);
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (t != kTBase)
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(t);
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
277a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool SpellcheckCharAttribute::OutputHebrew(UChar c,
278a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                           base::string16* output) const {
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to prevent our Hebrew dictionary from marking a Hebrew word including
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // niqquds manually and filter them out since their script codes are
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // USCRIPT_HEBREW.)
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Pass through ASCII single/double quotation marks and Hebrew Geresh and
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Gershayim.
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      c == 0x05F4 || c == 0x05F3)
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(c);
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
292a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool SpellcheckCharAttribute::OutputDefault(UChar c,
293a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                            base::string16* output) const {
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Check the script code of this character and output only if it is the one
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // used by the spellchecker language.
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UScriptCode script_code = uscript_getScript(c, &status);
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (script_code == script_code_ || script_code == USCRIPT_COMMON)
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    output->push_back(c);
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SpellcheckWordIterator implementation:
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckWordIterator::SpellcheckWordIterator()
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : text_(NULL),
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      attribute_(NULL),
308f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      iterator_() {
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpellcheckWordIterator::~SpellcheckWordIterator() {
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Reset();
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::Initialize(
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const SpellcheckCharAttribute* attribute,
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    bool allow_contraction) {
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Create a custom ICU break iterator with empty text used in this object. (We
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // allow setting text later so we can re-use this iterator.)
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(attribute);
321f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  const base::string16 rule(attribute->GetRuleSet(allow_contraction));
3222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
3232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // If there is no rule set, the attributes were invalid.
3242a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  if (rule.empty())
3252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    return false;
3262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
327f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  scoped_ptr<base::i18n::BreakIterator> iterator(
328f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      new base::i18n::BreakIterator(base::string16(), rule));
329f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (!iterator->Init()) {
330f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    // Since we're not passing in any text, the only reason this could fail
331f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    // is if we fail to parse the rules. Since the rules are hardcoded,
332f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    // that would be a bug in this class.
333f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    NOTREACHED() << "failed to open iterator (broken rules)";
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
335f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
336f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  iterator_ = iterator.Pass();
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set the character attributes so we can normalize the words extracted by
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this iterator.
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  attribute_ = attribute;
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::IsInitialized() const {
345f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  // Return true iff we have an iterator.
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return !!iterator_;
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) {
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!!iterator_);
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set the text to be split by this iterator.
353f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (!iterator_->SetText(text, length)) {
354f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    LOG(ERROR) << "failed to set text";
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
356f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  text_ = text;
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
362a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool SpellcheckWordIterator::GetNextWord(base::string16* word_string,
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         int* word_start,
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         int* word_length) {
365f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  DCHECK(!!text_);
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  word_string->clear();
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *word_start = 0;
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *word_length = 0;
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
371f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (!text_) {
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
373f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Find a word that can be checked for spelling. Our rule sets filter out
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // invalid words (e.g. numbers and characters not supported by the
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker language) so this ubrk_getRuleStatus() call returns
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // words until we can find a valid word or reach the end of the input string.
380f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  while (iterator_->Advance()) {
381f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const size_t start = iterator_->prev();
382f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const size_t length = iterator_->pos() - start;
383f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    if (iterator_->IsWord()) {
384f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      if (Normalize(start, length, word_string)) {
385f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        *word_start = start;
386f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        *word_length = length;
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return true;
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
392f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  // There aren't any more words in the given text.
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpellcheckWordIterator::Reset() {
397f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  iterator_.reset();
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool SpellcheckWordIterator::Normalize(int input_start,
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                       int input_length,
402a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                                       base::string16* output_string) const {
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We use NFKC (Normalization Form, Compatible decomposition, followed by
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // canonical Composition) defined in Unicode Standard Annex #15 to normalize
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this token because it it the most suitable normalization algorithm for our
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker. Nevertheless, it is not a perfect algorithm for our
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // spellchecker and we need manual normalization as well. The normalized
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // text does not have to be NUL-terminated since its characters are copied to
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // string16, which adds a NUL character when we need.
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::UnicodeString input(FALSE, &text_[input_start], input_length);
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::UnicodeString output;
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING)
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Copy the normalized text to the output.
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  icu::StringCharacterIterator it(output);
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next())
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    attribute_->OutputChar(c, output_string);
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return !output_string->empty();
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
424