11ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Copyright (C) 2006 Google Inc.
21ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
31ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Licensed under the Apache License, Version 2.0 (the "License");
41ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// you may not use this file except in compliance with the License.
51ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// You may obtain a copy of the License at
61ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
71ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// http://www.apache.org/licenses/LICENSE-2.0
81ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
91ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Unless required by applicable law or agreed to in writing, software
101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// distributed under the License is distributed on an "AS IS" BASIS,
111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// See the License for the specific language governing permissions and
131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// limitations under the License.
141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Author: Jim Meehan
161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#ifndef UTIL_UTF8_UNICODETEXT_H__
181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#define UTIL_UTF8_UNICODETEXT_H__
191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
#include <cstddef>
#include <iterator>
#include <string>
#include <utility>

#include "phonenumbers/base/basictypes.h"
24fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com
25fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace i18n {
26fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace phonenumbers {
271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::string;
291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::bidirectional_iterator_tag;
301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::pair;
311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ***************************** UnicodeText **************************
331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A UnicodeText object is a container for a sequence of Unicode
// codepoint values. It has a default constructor, a copy constructor,
// and an assignment operator.
361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Data can be appended to it from another UnicodeText, from
371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// iterators, or from a single codepoint.
381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The internal representation of the text is UTF-8. Since UTF-8 is a
401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// variable-width format, UnicodeText does not provide random access
411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// to the text, and changes to the text are permitted only at the end.
421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The UnicodeText class defines a const_iterator. The dereferencing
441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// operator (*) returns a codepoint (char32). The iterator is a
451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// bidirectional, read-only iterator. It becomes invalid if the text
461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// is changed.
471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// There are methods for appending and retrieving UTF-8 data directly.
491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The 'utf8_data' method returns a const char* that contains the
501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8-encoded version of the text; 'utf8_length' returns the number
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to
521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 4 bytes of UTF-8 data in a char array and returns the number of
531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// bytes that it stored.
541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 0x10FFFF], but UnicodeText has the additional restriction that it
571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// can contain only those characters that are valid for interchange on
581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// the Web. This excludes all of the control codes except for carriage
591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// return, line feed, and horizontal tab.  It also excludes
601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// non-characters, but codepoints that are in the Private Use regions
611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// are allowed, as are codepoints that are unassigned. (See the
621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Unicode reference for details.) The function UniLib::IsInterchangeValid
631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// can be used as a test for this property.
641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UnicodeTexts are safe. Every method that constructs or modifies a
661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UnicodeText tests for interchange-validity, and will substitute a
671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// space for the invalid data. Such cases are reported via
681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// LOG(WARNING).
691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// MEMORY MANAGEMENT: copy, take ownership, or point to
711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A UnicodeText is either an "owner", meaning that it owns the memory
731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// for the data buffer and will free it when the UnicodeText is
741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// destroyed, or it is an "alias", meaning that it does not.
751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// There are three methods for storing UTF-8 data in a UnicodeText:
771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// CopyUTF8(buffer, len) copies buffer.
791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// PointToUTF8(buffer, size) creates an alias pointing to buffer.
831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// All three methods perform a validity check on the buffer. There are
851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// private, "unsafe" versions of these functions that bypass the
861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// validity check. They are used internally and by friend-functions
871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// that are handling UTF-8 data that has already been validated.
881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The purpose of an alias is to avoid making an unnecessary copy of a
901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 buffer while still providing access to the Unicode values
911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// within that text through iterators or the fast scanners that are
921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// based on UTF-8 state tables. The lifetime of an alias must not
931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// exceed the lifetime of the buffer from which it was constructed.
941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The semantics of an alias might be described as "copy on write or
961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// repair." The source data is never modified. If push_back() or
971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// append() is called on an alias, a copy of the data will be created,
981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// and the UnicodeText will become an owner. If clear() is called on
991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// an alias, it becomes an (empty) owner.
1001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
1011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The copy constructor and the assignment operator produce an owner.
1021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// That is, after direct initialization ("UnicodeText x(y);") or copy
1031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// initialization ("UnicodeText x = y;") x will be an owner, even if y
1041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// was an alias. The assignment operator ("x = y;") also produces an
1051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// owner unless x and y are the same object and y is an alias.
1061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
1071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Aliases should be used with care. If the source from which an alias
1081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// was created is freed, or if the contents are changed, while the
1091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// alias is still in use, fatal errors could result. But it can be
1101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// quite useful to have a UnicodeText "window" through which to see a
1111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 buffer without having to pay the price of making a copy.
1121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
1131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTILITIES
1141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
1151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The interfaces in util/utf8/public/textutils.h provide higher-level
1161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// utilities for dealing with UnicodeTexts, including routines for
1171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
1181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// strings, creating strings from UnicodeTexts, normalizing text for
1191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// efficient matching or display, and others.
1201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comclass UnicodeText {
1221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com public:
1231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  class const_iterator;
1241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  typedef char32 value_type;
1261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Constructors. These always produce owners.
1281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText();  // Create an empty text.
1291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText(const UnicodeText& src);  // copy constructor
1301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Construct a substring (copies the data).
1311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText(const const_iterator& first, const const_iterator& last);
1321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Assignment operator. This copies the data and produces an owner
1341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // unless this == &src, e.g., "x = x;", which is a no-op.
1351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& operator=(const UnicodeText& src);
1361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // x.Copy(y) copies the data from y into x.
1381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& Copy(const UnicodeText& src);
1391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
1401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // x.PointTo(y) changes x so that it points to y's data.
1421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // It does not copy y or take ownership of y's data.
1431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& PointTo(const UnicodeText& src);
1441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& PointTo(const const_iterator& first,
1451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                       const const_iterator& last);
1461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ~UnicodeText();
1481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  void clear();  // Clear text.
1501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  bool empty() { return repr_.size_ == 0; }  // Test if text is empty.
1511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Add a codepoint to the end of the text.
1531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // If the codepoint is not interchange-valid, add a space instead
1541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // and log a warning.
1551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  void push_back(char32 codepoint);
1561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Generic appending operation.
1581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // iterator_traits<ForwardIterator>::value_type must be implicitly
1591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // convertible to char32. Typical uses of this method might include:
1601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //     char32 chars[] = {0x1, 0x2, ...};
1611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //     vector<char32> more_chars = ...;
1621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //     utext.append(chars, chars+arraysize(chars));
1631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //     utext.append(more_chars.begin(), more_chars.end());
1641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  template<typename ForwardIterator>
1651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
1661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    while (first != last) { push_back(*first++); }
1671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return *this;
1681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
1691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
// An overload of the generic append() method for const_iterator ranges.
1711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& append(const const_iterator& first, const const_iterator& last);
1721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // An optimization of append(source.begin(), source.end()).
1741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& append(const UnicodeText& source);
1751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  int size() const;  // the number of Unicode characters (codepoints)
1771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
1791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
1801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  class const_iterator {
1821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    typedef const_iterator CI;
1831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com   public:
1841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    typedef bidirectional_iterator_tag iterator_category;
1851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    typedef char32 value_type;
1861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    typedef ptrdiff_t difference_type;
1871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    typedef void pointer;  // (Not needed.)
1881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    typedef const char32 reference;  // (Needed for const_reverse_iterator)
1891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // Iterators are default-constructible.
1911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator();
1921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // It's safe to make multiple passes over a UnicodeText.
1941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator(const const_iterator& other);
1951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator& operator=(const const_iterator& other);
1961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    char32 operator*() const;  // Dereference
1981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator& operator++();  // Advance (++iter)
2001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator operator++(int) {  // (iter++)
2011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      const_iterator result(*this);
2021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      ++*this;
2031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return result;
2041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
2051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator& operator--();  // Retreat (--iter)
2071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_iterator operator--(int) {  // (iter--)
2081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      const_iterator result(*this);
2091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      --*this;
2101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return result;
2111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
2121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // We love relational operators.
2141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend bool operator==(const CI& lhs, const CI& rhs) {
2151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return lhs.it_ == rhs.it_; }
2161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend bool operator!=(const CI& lhs, const CI& rhs) {
2171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return !(lhs == rhs); }
2181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend bool operator<(const CI& lhs, const CI& rhs);
2191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend bool operator>(const CI& lhs, const CI& rhs) {
2201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return rhs < lhs; }
2211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend bool operator<=(const CI& lhs, const CI& rhs) {
2221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return !(rhs < lhs); }
2231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend bool operator>=(const CI& lhs, const CI& rhs) {
2241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return !(lhs < rhs); }
2251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend difference_type distance(const CI& first, const CI& last);
2271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // UTF-8-specific methods
2291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // Store the UTF-8 encoding of the current codepoint into buf,
2301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // which must be at least 4 bytes long. Return the number of
2311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // bytes written.
2321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int get_utf8(char* buf) const;
2331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // Return the iterator's pointer into the UTF-8 data.
2341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const char* utf8_data() const { return it_; }
2351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    string DebugString() const;
2371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com   private:
2391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend class UnicodeText;
2401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend class UnicodeTextUtils;
2411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    friend class UTF8StateTableProperty;
2421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    explicit const_iterator(const char* it) : it_(it) {}
2431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const char* it_;
2451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  };
2461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_iterator begin() const;
2481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_iterator end() const;
2491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
2511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com   public:
2521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const_reverse_iterator(const_iterator it) :
2531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com        std::reverse_iterator<const_iterator>(it) {}
2541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const char* utf8_data() const {
2551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      const_iterator tmp_it = base();
2561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return (--tmp_it).utf8_data();
2571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
2581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int get_utf8(char* buf) const {
2591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      const_iterator tmp_it = base();
2601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      return (--tmp_it).get_utf8(buf);
2611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
2621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  };
2631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_reverse_iterator rbegin() const {
2641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return const_reverse_iterator(end());
2651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
2661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_reverse_iterator rend() const {
2671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return const_reverse_iterator(begin());
2681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
2691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Substring searching.  Returns the beginning of the first
2711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // occurrence of "look", or end() if not found.
2721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
2731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Equivalent to find(look, begin())
2741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_iterator find(const UnicodeText& look) const;
2751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Returns whether this contains the character U+FFFD.  This can
2771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // occur, for example, if the input to Encodings::Decode() had byte
2781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // sequences that were invalid in the source encoding.
2791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  bool HasReplacementChar() const;
2801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // UTF-8-specific methods
2821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //
// Return the data, length, and capacity of the UTF-8-encoded version of
2841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // the text. Length and capacity are measured in bytes.
2851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const char* utf8_data() const { return repr_.data_; }
2861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  int utf8_length() const { return repr_.size_; }
2871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  int utf8_capacity() const { return repr_.capacity_; }
2881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Return the UTF-8 data as a string.
2901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  static string UTF8Substring(const const_iterator& first,
2911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                              const const_iterator& last);
2921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // There are three methods for initializing a UnicodeText from UTF-8
2941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // data. They vary in details of memory management. In all cases,
2951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // the data is tested for interchange-validity. If it is not
2961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // interchange-valid, a LOG(WARNING) is issued, and each
2971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // structurally invalid byte and each interchange-invalid codepoint
2981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // is replaced with a space.
2991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // x.CopyUTF8(buf, len) copies buf into x.
3011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
3021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
3041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // buf. buf is not copied.
3051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
3061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                   int byte_length,
3071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                   int byte_capacity);
3081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // x.PointToUTF8(buf,len) changes x so that it points to buf
3101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // ("becomes an alias"). It does not take ownership or copy buf.
3111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // If the buffer is not valid, this has the same effect as
3121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // CopyUTF8(utf8_buffer, byte_length).
3131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
3141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Occasionally it is necessary to use functions that operate on the
3161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // pointer returned by utf8_data(). MakeIterator(p) provides a way
3171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // to get back to the UnicodeText level. It uses CHECK to ensure
3181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // that p is a pointer within this object's UTF-8 data, and that it
3191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // points to the beginning of a character.
3201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_iterator MakeIterator(const char* p) const;
3211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  string DebugString() const;
3231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com private:
3251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  friend class const_iterator;
3261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  friend class UnicodeTextUtils;
3271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  class Repr {  // A byte-string.
3291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com   public:
3301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    char* data_;
3311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int size_;
3321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int capacity_;
3331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    bool ours_;  // Do we own data_?
3341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
3361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    ~Repr() { if (ours_) delete[] data_; }
3371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void clear();
3391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void reserve(int capacity);
3401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void resize(int size);
3411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void append(const char* bytes, int byte_length);
3431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void Copy(const char* data, int size);
3441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void TakeOwnershipOf(char* data, int size, int capacity);
3451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    void PointTo(const char* data, int size);
3461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    string DebugString() const;
3481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com   private:
3501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    Repr& operator=(const Repr&);
3511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    Repr(const Repr& other);
3521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  };
3531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  Repr repr_;
3551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // UTF-8-specific private methods.
3571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // These routines do not perform a validity check when compiled
3581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // in opt mode.
3591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // It is an error to call these methods with UTF-8 data that
3601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // is not interchange-valid.
3611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //
3621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
3631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& UnsafeTakeOwnershipOfUTF8(
3641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      char* utf8_buffer, int byte_length, int byte_capacity);
3651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
3661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
3671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const_iterator UnsafeFind(const UnicodeText& look,
3681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                            const_iterator start_pos) const;
3691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com};
3701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
3721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
3741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return !(lhs == rhs);
3751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UnicodeTextRange is a pair of iterators, useful for specifying text
3781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// segments. If the iterators are ==, the segment is empty.
3791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comtypedef pair<UnicodeText::const_iterator,
3801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com             UnicodeText::const_iterator> UnicodeTextRange;
3811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
3831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return r.first == r.second;
3841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
// *************************** Utilities *************************
3881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A factory function for creating a UnicodeText from a buffer of
3901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
3911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// is an "owner.")
3921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
3931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Each byte that is structurally invalid will be replaced with a
3941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// space. Each codepoint that is interchange-invalid will also be
3951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// replaced with a space, even if the codepoint was represented with a
3961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// multibyte sequence in the UTF-8 data.
3971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
3981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText MakeUnicodeTextAcceptingOwnership(
3991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    char* utf8_buffer, int byte_length, int byte_capacity) {
4001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UnicodeText().TakeOwnershipOfUTF8(
4011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      utf8_buffer, byte_length, byte_capacity);
4021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A factory function for creating a UnicodeText from a buffer of
4051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 data. The new UnicodeText does not take ownership of the
4061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// buffer. (It is an "alias.")
4071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
4081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
4091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const char* utf8_buffer, int byte_length) {
4101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
4111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Create a UnicodeText from a UTF-8 string or buffer.
4141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
4151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// If do_copy is true, then a copy of the string is made. The copy is
4161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// owned by the resulting UnicodeText object and will be freed when
4171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// the object is destroyed. This UnicodeText object is referred to
4181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// as an "owner."
4191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
4201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// If do_copy is false, then no copy is made. The resulting
4211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UnicodeText object does NOT take ownership of the string; in this
4221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// case, the lifetime of the UnicodeText object must not exceed the
4231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// lifetime of the string. This Unicodetext object is referred to as
4241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
4251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
4261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// If the input string does not contain valid UTF-8, then a copy is
4271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// made (as if do_copy were true) and coerced to valid UTF-8 by
4281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// replacing each invalid byte with a space.
4291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
4301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
4311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                     bool do_copy) {
4321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  UnicodeText t;
4331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (do_copy) {
4341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    t.CopyUTF8(utf8_buf, len);
4351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  } else {
4361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    t.PointToUTF8(utf8_buf, len);
4371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
4381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return t;
4391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
4421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
4431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
4461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UTF8ToUnicodeText(utf8_buf, len, true);
4471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
4491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UTF8ToUnicodeText(utf8_string, true);
4501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Return a string containing the UTF-8 encoded version of all the
4531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Unicode characters in t.
4541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline string UnicodeTextToUTF8(const UnicodeText& t) {
4551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return string(t.utf8_data(), t.utf8_length());
4561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
458fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com}  // namespace phonenumbers
459fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com}  // namespace i18n
460fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com
4611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#endif  // UTIL_UTF8_UNICODETEXT_H__
462