1// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/i18n/break_iterator.h" 6 7#include "base/logging.h" 8#include "third_party/icu/source/common/unicode/ubrk.h" 9#include "third_party/icu/source/common/unicode/uchar.h" 10#include "third_party/icu/source/common/unicode/ustring.h" 11 12namespace base { 13namespace i18n { 14 15const size_t npos = -1; 16 17BreakIterator::BreakIterator(const string16& str, BreakType break_type) 18 : iter_(NULL), 19 string_(str), 20 break_type_(break_type), 21 prev_(npos), 22 pos_(0) { 23} 24 25BreakIterator::BreakIterator(const string16& str, const string16& rules) 26 : iter_(NULL), 27 string_(str), 28 rules_(rules), 29 break_type_(RULE_BASED), 30 prev_(npos), 31 pos_(0) { 32} 33 34BreakIterator::~BreakIterator() { 35 if (iter_) 36 ubrk_close(static_cast<UBreakIterator*>(iter_)); 37} 38 39bool BreakIterator::Init() { 40 UErrorCode status = U_ZERO_ERROR; 41 UParseError parse_error; 42 UBreakIteratorType break_type; 43 switch (break_type_) { 44 case BREAK_CHARACTER: 45 break_type = UBRK_CHARACTER; 46 break; 47 case BREAK_WORD: 48 break_type = UBRK_WORD; 49 break; 50 case BREAK_LINE: 51 case BREAK_NEWLINE: 52 case RULE_BASED: // (Keep compiler happy, break_type not used in this case) 53 break_type = UBRK_LINE; 54 break; 55 default: 56 NOTREACHED() << "invalid break_type_"; 57 return false; 58 } 59 if (break_type_ == RULE_BASED) { 60 iter_ = ubrk_openRules(rules_.c_str(), 61 static_cast<int32_t>(rules_.length()), 62 string_.data(), 63 static_cast<int32_t>(string_.size()), 64 &parse_error, 65 &status); 66 if (U_FAILURE(status)) { 67 NOTREACHED() << "ubrk_openRules failed to parse rule string at line " 68 << parse_error.line << ", offset " << parse_error.offset; 69 } 70 } else { 71 iter_ = ubrk_open(break_type, 72 NULL, 73 string_.data(), 74 static_cast<int32_t>(string_.size()), 75 &status); 76 if (U_FAILURE(status)) { 77 NOTREACHED() << "ubrk_open failed"; 78 } 79 } 80 81 if (U_FAILURE(status)) { 82 return false; 83 } 84 85 // Move the iterator to the beginning of the string. 86 ubrk_first(static_cast<UBreakIterator*>(iter_)); 87 return true; 88} 89 90bool BreakIterator::Advance() { 91 int32_t pos; 92 int32_t status; 93 prev_ = pos_; 94 switch (break_type_) { 95 case BREAK_CHARACTER: 96 case BREAK_WORD: 97 case BREAK_LINE: 98 case RULE_BASED: 99 pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); 100 if (pos == UBRK_DONE) { 101 pos_ = npos; 102 return false; 103 } 104 pos_ = static_cast<size_t>(pos); 105 return true; 106 case BREAK_NEWLINE: 107 do { 108 pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); 109 if (pos == UBRK_DONE) 110 break; 111 pos_ = static_cast<size_t>(pos); 112 status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); 113 } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT); 114 if (pos == UBRK_DONE && prev_ == pos_) { 115 pos_ = npos; 116 return false; 117 } 118 return true; 119 default: 120 NOTREACHED() << "invalid break_type_"; 121 return false; 122 } 123} 124 125bool BreakIterator::SetText(const base::char16* text, const size_t length) { 126 UErrorCode status = U_ZERO_ERROR; 127 ubrk_setText(static_cast<UBreakIterator*>(iter_), 128 text, length, &status); 129 pos_ = 0; // implicit when ubrk_setText is done 130 prev_ = npos; 131 if (U_FAILURE(status)) { 132 NOTREACHED() << "ubrk_setText failed"; 133 return false; 134 } 135 return true; 136} 137 138bool BreakIterator::IsWord() const { 139 int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); 140 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) 141 return false; 142 return status != UBRK_WORD_NONE; 143} 144 145bool BreakIterator::IsEndOfWord(size_t position) const { 146 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) 147 return false; 148 149 UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); 150 UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); 151 int32_t status = ubrk_getRuleStatus(iter); 152 return (!!boundary && status != UBRK_WORD_NONE); 153} 154 155bool BreakIterator::IsStartOfWord(size_t position) const { 156 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) 157 return false; 158 159 UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); 160 UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); 161 ubrk_next(iter); 162 int32_t next_status = ubrk_getRuleStatus(iter); 163 return (!!boundary && next_status != UBRK_WORD_NONE); 164} 165 166string16 BreakIterator::GetString() const { 167 DCHECK(prev_ != npos && pos_ != npos); 168 return string_.substr(prev_, pos_ - prev_); 169} 170 171} // namespace i18n 172} // namespace base 173