172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved. 27f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// Use of this source code is governed by a BSD-style license that can be 37f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// found in the LICENSE file. 47f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 57f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#ifndef BASE_I18N_CHAR_ITERATOR_H_ 67f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#define BASE_I18N_CHAR_ITERATOR_H_ 77f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#pragma once 87f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 97f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#include <string> 107f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 117f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#include "base/basictypes.h" 127f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#include "base/string16.h" 137f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 147f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// The CharIterator classes iterate through the characters in UTF8 and 157f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// UTF16 strings. Example usage: 167f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// 177f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// UTF8CharIterator iter(&str); 187f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// while (!iter.End()) { 19731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick// VLOG(1) << iter.get(); 207f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// iter.Advance(); 217f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen// } 227f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 237f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#if defined(OS_WIN) 247f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsentypedef unsigned char uint8_t; 257f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#endif 267f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 277f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsennamespace base { 2872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsennamespace i18n { 297f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 307f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsenclass UTF8CharIterator { 317f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen public: 327f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Requires |str| to live as long as the UTF8CharIterator does. 337f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen UTF8CharIterator(const std::string* str); 3472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen ~UTF8CharIterator(); 357f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 367f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Return the starting array index of the current character within the 377f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // string. 387f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 array_pos() const { return array_pos_; } 397f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 407f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Return the logical index of the current character, independent of the 417f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // number of bytes each character takes. 427f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 char_pos() const { return char_pos_; } 437f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 447f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Return the current char. 457f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 get() const { return char_; } 467f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 477f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Returns true if we're at the end of the string. 487f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen bool end() const { return array_pos_ == len_; } 497f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 507f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Advance to the next actual character. Returns false if we're at the 517f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // end of the string. 527f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen bool Advance(); 537f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 547f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen private: 557f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The string we're iterating over. 567f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen const uint8_t* str_; 577f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 587f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The length of the encoded string. 597f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 len_; 607f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 617f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Array index. 627f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 array_pos_; 637f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 647f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The next array index. 657f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 next_pos_; 667f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 677f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Character index. 687f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 char_pos_; 697f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 707f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The current character. 717f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 char_; 727f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 737f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen DISALLOW_COPY_AND_ASSIGN(UTF8CharIterator); 747f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen}; 757f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 767f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsenclass UTF16CharIterator { 777f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen public: 787f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Requires |str| to live as long as the UTF16CharIterator does. 797f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen UTF16CharIterator(const string16* str); 807f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen UTF16CharIterator(const char16* str, size_t str_len); 8172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen ~UTF16CharIterator(); 827f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 837f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Return the starting array index of the current character within the 847f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // string. 857f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 array_pos() const { return array_pos_; } 867f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 877f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Return the logical index of the current character, independent of the 887f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // number of codewords each character takes. 897f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 char_pos() const { return char_pos_; } 907f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 917f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Return the current char. 927f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 get() const { return char_; } 937f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 947f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Returns true if we're at the end of the string. 957f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen bool end() const { return array_pos_ == len_; } 967f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 977f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Advance to the next actual character. Returns false if we're at the 987f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // end of the string. 997f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen bool Advance(); 1007f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1017f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen private: 1027f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Fills in the current character we found and advances to the next 1037f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // character, updating all flags as necessary. 1047f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen void ReadChar(); 1057f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1067f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The string we're iterating over. 1077f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen const char16* str_; 1087f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1097f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The length of the encoded string. 1107f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 len_; 1117f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1127f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Array index. 1137f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 array_pos_; 1147f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1157f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The next array index. 1167f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 next_pos_; 1177f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1187f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // Character index. 1197f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 char_pos_; 1207f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1217f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen // The current character. 1227f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen int32 char_; 1237f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1247f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen DISALLOW_COPY_AND_ASSIGN(UTF16CharIterator); 1257f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen}; 1267f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 12772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen} // namespace i18n 1287f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen} // namespace base 1297f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen 1307f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#endif // BASE_I18N_CHAR_ITERATOR_H_ 131