1// Copyright (c) 2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/i18n/word_iterator.h" 6 7#include "base/logging.h" 8#include "unicode/ubrk.h" 9#include "unicode/ustring.h" 10 11const size_t npos = -1; 12 13WordIterator::WordIterator(const std::wstring& str, BreakType break_type) 14 : iter_(NULL), 15 string_(str), 16 break_type_(break_type), 17 prev_(npos), 18 pos_(0) { 19} 20 21WordIterator::~WordIterator() { 22 if (iter_) 23 ubrk_close(iter_); 24} 25 26bool WordIterator::Init() { 27 UErrorCode status = U_ZERO_ERROR; 28 UBreakIteratorType break_type; 29 switch (break_type_) { 30 case BREAK_WORD: 31 break_type = UBRK_WORD; 32 break; 33 case BREAK_LINE: 34 break_type = UBRK_LINE; 35 break; 36 default: 37 NOTREACHED(); 38 break_type = UBRK_LINE; 39 } 40#if defined(WCHAR_T_IS_UTF16) 41 iter_ = ubrk_open(break_type, NULL, 42 string_.data(), static_cast<int32_t>(string_.size()), 43 &status); 44#else // WCHAR_T_IS_UTF16 45 // When wchar_t is wider than UChar (16 bits), transform |string_| into a 46 // UChar* string. Size the UChar* buffer to be large enough to hold twice 47 // as many UTF-16 code points as there are UCS-4 characters, in case each 48 // character translates to a UTF-16 surrogate pair, and leave room for a NUL 49 // terminator. 50 // TODO(avi): avoid this alloc 51 chars_.resize(string_.length() * sizeof(UChar) + 1); 52 53 UErrorCode error = U_ZERO_ERROR; 54 int32_t destLength; 55 u_strFromWCS(&chars_[0], chars_.size(), &destLength, string_.data(), 56 string_.length(), &error); 57 58 iter_ = ubrk_open(break_type, NULL, &chars_[0], destLength, &status); 59#endif 60 if (U_FAILURE(status)) { 61 NOTREACHED() << "ubrk_open failed"; 62 return false; 63 } 64 ubrk_first(iter_); // Move the iterator to the beginning of the string. 65 return true; 66} 67 68bool WordIterator::Advance() { 69 prev_ = pos_; 70 const int32_t pos = ubrk_next(iter_); 71 if (pos == UBRK_DONE) { 72 pos_ = npos; 73 return false; 74 } else { 75 pos_ = static_cast<size_t>(pos); 76 return true; 77 } 78} 79 80bool WordIterator::IsWord() const { 81 return (ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE); 82} 83 84std::wstring WordIterator::GetWord() const { 85 DCHECK(prev_ != npos && pos_ != npos); 86#if defined(WCHAR_T_IS_UTF16) 87 return string_.substr(prev_, pos_ - prev_); 88#else // WCHAR_T_IS_UTF16 89 // See comment in Init(). If there are no surrogate pairs, 90 // |out_length| will be exactly |in_length|, if there are surrogate 91 // pairs it will be less than |in_length|. 92 int32_t out_length; 93 UErrorCode error = U_ZERO_ERROR; 94 const int32_t in_length = pos_ - prev_; 95 std::vector<std::wstring::value_type> out_buffer(in_length); 96 u_strToWCS(&out_buffer[0], in_length, &out_length, 97 &chars_[prev_], in_length, &error); 98 DCHECK_LE(out_length, in_length); 99 return std::wstring(&out_buffer[0], out_length); 100#endif 101} 102