// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
47f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
57f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#ifndef BASE_I18N_CHAR_ITERATOR_H_
67f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#define BASE_I18N_CHAR_ITERATOR_H_
77f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#pragma once
87f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
97f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#include <string>
107f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
117f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#include "base/basictypes.h"
127f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#include "base/string16.h"
137f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
// The CharIterator classes iterate through the characters in UTF8 and
// UTF16 strings.  Example usage:
//
//   UTF8CharIterator iter(&str);
//   while (!iter.end()) {
//     VLOG(1) << iter.get();
//     iter.Advance();
//   }
227f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
// When OS_WIN is defined, declare uint8_t as an alias for unsigned char.
// The type is used further down for the byte pointer held by
// UTF8CharIterator.  Note that the typedef sits at global scope, outside
// namespace base, so it is visible to every file that includes this header.
// NOTE(review): presumably the Windows toolchain in use did not provide
// uint8_t through a standard header -- confirm before removing.
#if defined(OS_WIN)
typedef unsigned char uint8_t;
#endif
267f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
277f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsennamespace base {
2872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsennamespace i18n {
297f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
307f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsenclass UTF8CharIterator {
317f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen public:
327f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Requires |str| to live as long as the UTF8CharIterator does.
337f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  UTF8CharIterator(const std::string* str);
3472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  ~UTF8CharIterator();
357f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
367f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Return the starting array index of the current character within the
377f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // string.
387f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 array_pos() const { return array_pos_; }
397f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
407f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Return the logical index of the current character, independent of the
417f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // number of bytes each character takes.
427f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 char_pos() const { return char_pos_; }
437f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
447f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Return the current char.
457f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 get() const { return char_; }
467f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
477f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Returns true if we're at the end of the string.
487f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  bool end() const { return array_pos_ == len_; }
497f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
507f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Advance to the next actual character.  Returns false if we're at the
517f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // end of the string.
527f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  bool Advance();
537f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
547f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen private:
557f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The string we're iterating over.
567f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  const uint8_t* str_;
577f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
587f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The length of the encoded string.
597f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 len_;
607f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
617f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Array index.
627f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 array_pos_;
637f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
647f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The next array index.
657f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 next_pos_;
667f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
677f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Character index.
687f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 char_pos_;
697f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
707f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The current character.
717f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 char_;
727f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
737f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  DISALLOW_COPY_AND_ASSIGN(UTF8CharIterator);
747f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen};
757f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
767f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsenclass UTF16CharIterator {
777f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen public:
787f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Requires |str| to live as long as the UTF16CharIterator does.
797f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  UTF16CharIterator(const string16* str);
807f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  UTF16CharIterator(const char16* str, size_t str_len);
8172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  ~UTF16CharIterator();
827f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
837f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Return the starting array index of the current character within the
847f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // string.
857f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 array_pos() const { return array_pos_; }
867f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
877f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Return the logical index of the current character, independent of the
887f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // number of codewords each character takes.
897f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 char_pos() const { return char_pos_; }
907f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
917f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Return the current char.
927f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 get() const { return char_; }
937f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
947f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Returns true if we're at the end of the string.
957f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  bool end() const { return array_pos_ == len_; }
967f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
977f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Advance to the next actual character.  Returns false if we're at the
987f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // end of the string.
997f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  bool Advance();
1007f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1017f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen private:
1027f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Fills in the current character we found and advances to the next
1037f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // character, updating all flags as necessary.
1047f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  void ReadChar();
1057f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1067f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The string we're iterating over.
1077f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  const char16* str_;
1087f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1097f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The length of the encoded string.
1107f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 len_;
1117f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1127f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Array index.
1137f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 array_pos_;
1147f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1157f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The next array index.
1167f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 next_pos_;
1177f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1187f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // Character index.
1197f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 char_pos_;
1207f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1217f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  // The current character.
1227f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  int32 char_;
1237f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1247f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen  DISALLOW_COPY_AND_ASSIGN(UTF16CharIterator);
1257f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen};
1267f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
12772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}  // namespace i18n
1287f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen}  // namespace base
1297f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen
1307f92e4ca7bcb209f5d9b38cacaf3a1c6cbe493aaKristian Monsen#endif  // BASE_I18N_CHAR_ITERATOR_H_
131