11ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Copyright (C) 2006 Google Inc. 21ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 31ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Licensed under the Apache License, Version 2.0 (the "License"); 41ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// you may not use this file except in compliance with the License. 51ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// You may obtain a copy of the License at 61ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 71ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// http://www.apache.org/licenses/LICENSE-2.0 81ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 91ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Unless required by applicable law or agreed to in writing, software 101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// distributed under the License is distributed on an "AS IS" BASIS, 111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// See the License for the specific language governing permissions and 131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// limitations under the License. 
141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Author: Jim Meehan 161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#ifndef UTIL_UTF8_UNICODETEXT_H__ 181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#define UTIL_UTF8_UNICODETEXT_H__ 191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <iterator> 211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <string> 221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <utility> 23af4a2ce290b619b39c2cb2a682ea4d7746d3fb21philip.liard@gmail.com#include "phonenumbers/base/basictypes.h" 24fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com 25fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace i18n { 26fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace phonenumbers { 271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::string; 291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::bidirectional_iterator_tag; 301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::pair; 311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ***************************** UnicodeText ************************** 331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A UnicodeText object is a container for a sequence of Unicode 351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// codepoint values. It has default, copy, and assignment constructors. 
361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Data can be appended to it from another UnicodeText, from 371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// iterators, or from a single codepoint. 381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The internal representation of the text is UTF-8. Since UTF-8 is a 401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// variable-width format, UnicodeText does not provide random access 411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// to the text, and changes to the text are permitted only at the end. 421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The UnicodeText class defines a const_iterator. The dereferencing 441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// operator (*) returns a codepoint (char32). The iterator is a 451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// bidirectional, read-only iterator. It becomes invalid if the text 461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// is changed. 471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// There are methods for appending and retrieving UTF-8 data directly. 491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The 'utf8_data' method returns a const char* that contains the 501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8-encoded version of the text; 'utf8_length' returns the number 511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// of bytes in the UTF-8 data. 
// An iterator's 'get_utf8' method stores up to
// 4 bytes of UTF-8 data in a char array and returns the number of
// bytes that it stored.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab. It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.) The function UniLib::IsInterchangeValid
// can be used as a test for this property.
//
// UnicodeTexts are safe. Every method that constructs or modifies a
// UnicodeText tests for interchange-validity, and will substitute a
// space for the invalid data.
Such cases are reported via 681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// LOG(WARNING). 691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// MEMORY MANAGEMENT: copy, take ownership, or point to 711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A UnicodeText is either an "owner", meaning that it owns the memory 731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// for the data buffer and will free it when the UnicodeText is 741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// destroyed, or it is an "alias", meaning that it does not. 751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// There are three methods for storing UTF-8 data in a UnicodeText: 771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// CopyUTF8(buffer, len) copies buffer. 791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. 811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// PointToUTF8(buffer, size) creates an alias pointing to buffer. 831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// All three methods perform a validity check on the buffer. There are 851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// private, "unsafe" versions of these functions that bypass the 861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// validity check. 
They are used internally and by friend-functions 871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// that are handling UTF-8 data that has already been validated. 881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The purpose of an alias is to avoid making an unnecessary copy of a 901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 buffer while still providing access to the Unicode values 911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// within that text through iterators or the fast scanners that are 921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// based on UTF-8 state tables. The lifetime of an alias must not 931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// exceed the lifetime of the buffer from which it was constructed. 941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The semantics of an alias might be described as "copy on write or 961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// repair." The source data is never modified. If push_back() or 971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// append() is called on an alias, a copy of the data will be created, 981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// and the UnicodeText will become an owner. If clear() is called on 991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// an alias, it becomes an (empty) owner. 1001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 1011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The copy constructor and the assignment operator produce an owner. 
1021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// That is, after direct initialization ("UnicodeText x(y);") or copy 1031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// initialization ("UnicodeText x = y;") x will be an owner, even if y 1041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// was an alias. The assignment operator ("x = y;") also produces an 1051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// owner unless x and y are the same object and y is an alias. 1061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 1071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Aliases should be used with care. If the source from which an alias 1081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// was created is freed, or if the contents are changed, while the 1091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// alias is still in use, fatal errors could result. But it can be 1101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// quite useful to have a UnicodeText "window" through which to see a 1111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 buffer without having to pay the price of making a copy. 
1121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 1131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTILITIES 1141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 1151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The interfaces in util/utf8/public/textutils.h provide higher-level 1161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// utilities for dealing with UnicodeTexts, including routines for 1171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or 1181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// strings, creating strings from UnicodeTexts, normalizing text for 1191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// efficient matching or display, and others. 1201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comclass UnicodeText { 1221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com public: 1231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com class const_iterator; 1241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef char32 value_type; 1261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Constructors. These always produce owners. 1281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText(); // Create an empty text. 1291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText(const UnicodeText& src); // copy constructor 1301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Construct a substring (copies the data). 
1311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText(const const_iterator& first, const const_iterator& last); 1321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Assignment operator. This copies the data and produces an owner 1341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // unless this == &src, e.g., "x = x;", which is a no-op. 1351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& operator=(const UnicodeText& src); 1361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // x.Copy(y) copies the data from y into x. 1381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& Copy(const UnicodeText& src); 1391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } 1401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // x.PointTo(y) changes x so that it points to y's data. 1421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // It does not copy y or take ownership of y's data. 1431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& PointTo(const UnicodeText& src); 1441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& PointTo(const const_iterator& first, 1451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const const_iterator& last); 1461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ~UnicodeText(); 1481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void clear(); // Clear text. 
1501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com bool empty() { return repr_.size_ == 0; } // Test if text is empty. 1511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Add a codepoint to the end of the text. 1531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // If the codepoint is not interchange-valid, add a space instead 1541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // and log a warning. 1551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void push_back(char32 codepoint); 1561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Generic appending operation. 1581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // iterator_traits<ForwardIterator>::value_type must be implicitly 1591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // convertible to char32. 
Typical uses of this method might include: 1601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // char32 chars[] = {0x1, 0x2, ...}; 1611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // vector<char32> more_chars = ...; 1621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // utext.append(chars, chars+arraysize(chars)); 1631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // utext.append(more_chars.begin(), more_chars.end()); 1641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com template<typename ForwardIterator> 1651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& append(ForwardIterator first, const ForwardIterator last) { 1661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com while (first != last) { push_back(*first++); } 1671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 1681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 1691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // A specialization of the generic append() method. 1711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& append(const const_iterator& first, const const_iterator& last); 1721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // An optimization of append(source.begin(), source.end()). 
1741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& append(const UnicodeText& source); 1751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int size() const; // the number of Unicode characters (codepoints) 1771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); 1791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); 1801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com class const_iterator { 1821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef const_iterator CI; 1831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com public: 1841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef bidirectional_iterator_tag iterator_category; 1851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef char32 value_type; 1861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef ptrdiff_t difference_type; 1871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef void pointer; // (Not needed.) 1881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com typedef const char32 reference; // (Needed for const_reverse_iterator) 1891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Iterators are default-constructible. 1911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator(); 1921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // It's safe to make multiple passes over a UnicodeText. 
1941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator(const const_iterator& other); 1951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator& operator=(const const_iterator& other); 1961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char32 operator*() const; // Dereference 1981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator& operator++(); // Advance (++iter) 2001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator operator++(int) { // (iter++) 2011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator result(*this); 2021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ++*this; 2031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return result; 2041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator& operator--(); // Retreat (--iter) 2071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator operator--(int) { // (iter--) 2081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator result(*this); 2091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com --*this; 2101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return result; 2111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // We love relational operators. 
2141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator==(const CI& lhs, const CI& rhs) { 2151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return lhs.it_ == rhs.it_; } 2161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator!=(const CI& lhs, const CI& rhs) { 2171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return !(lhs == rhs); } 2181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator<(const CI& lhs, const CI& rhs); 2191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator>(const CI& lhs, const CI& rhs) { 2201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return rhs < lhs; } 2211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator<=(const CI& lhs, const CI& rhs) { 2221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return !(rhs < lhs); } 2231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend bool operator>=(const CI& lhs, const CI& rhs) { 2241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return !(lhs < rhs); } 2251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend difference_type distance(const CI& first, const CI& last); 2271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // UTF-8-specific methods 2291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Store the UTF-8 encoding of the current codepoint into buf, 2301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // which must be at least 4 bytes long. Return the number of 2311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // bytes written. 
2321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int get_utf8(char* buf) const; 2331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Return the iterator's pointer into the UTF-8 data. 2341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const char* utf8_data() const { return it_; } 2351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com string DebugString() const; 2371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com private: 2391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend class UnicodeText; 2401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend class UnicodeTextUtils; 2411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend class UTF8StateTableProperty; 2421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com explicit const_iterator(const char* it) : it_(it) {} 2431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const char* it_; 2451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com }; 2461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator begin() const; 2481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator end() const; 2491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com class const_reverse_iterator : public std::reverse_iterator<const_iterator> { 2511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com public: 2521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_reverse_iterator(const_iterator it) : 2531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com std::reverse_iterator<const_iterator>(it) {} 
2541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const char* utf8_data() const { 2551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator tmp_it = base(); 2561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return (--tmp_it).utf8_data(); 2571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int get_utf8(char* buf) const { 2591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator tmp_it = base(); 2601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return (--tmp_it).get_utf8(buf); 2611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com }; 2631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_reverse_iterator rbegin() const { 2641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return const_reverse_iterator(end()); 2651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_reverse_iterator rend() const { 2671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return const_reverse_iterator(begin()); 2681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Substring searching. Returns the beginning of the first 2711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // occurrence of "look", or end() if not found. 
2721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator find(const UnicodeText& look, const_iterator start_pos) const; 2731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Equivalent to find(look, begin()) 2741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator find(const UnicodeText& look) const; 2751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Returns whether this contains the character U+FFFD. This can 2771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // occur, for example, if the input to Encodings::Decode() had byte 2781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // sequences that were invalid in the source encoding. 2791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com bool HasReplacementChar() const; 2801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // UTF-8-specific methods 2821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // 2831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Return the data, length, and capacity of UTF-8-encoded version of 2841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // the text. Length and capacity are measured in bytes. 2851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const char* utf8_data() const { return repr_.data_; } 2861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int utf8_length() const { return repr_.size_; } 2871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int utf8_capacity() const { return repr_.capacity_; } 2881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Return the UTF-8 data as a string. 
2901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com static string UTF8Substring(const const_iterator& first, 2911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const const_iterator& last); 2921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // There are three methods for initializing a UnicodeText from UTF-8 2941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // data. They vary in details of memory management. In all cases, 2951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // the data is tested for interchange-validity. If it is not 2961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // interchange-valid, a LOG(WARNING) is issued, and each 2971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // structurally invalid byte and each interchange-invalid codepoint 2981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // is replaced with a space. 2991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // x.CopyUTF8(buf, len) copies buf into x. 3011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); 3021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of 3041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // buf. buf is not copied. 
3051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, 3061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_length, 3071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_capacity); 3081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // x.PointToUTF8(buf,len) changes x so that it points to buf 3101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // ("becomes an alias"). It does not take ownership or copy buf. 3111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // If the buffer is not valid, this has the same effect as 3121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // CopyUTF8(utf8_buffer, byte_length). 3131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); 3141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Occasionally it is necessary to use functions that operate on the 3161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // pointer returned by utf8_data(). MakeIterator(p) provides a way 3171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // to get back to the UnicodeText level. It uses CHECK to ensure 3181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // that p is a pointer within this object's UTF-8 data, and that it 3191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // points to the beginning of a character. 
3201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator MakeIterator(const char* p) const; 3211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com string DebugString() const; 3231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com private: 3251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend class const_iterator; 3261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com friend class UnicodeTextUtils; 3271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com class Repr { // A byte-string. 3291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com public: 3301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* data_; 3311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int size_; 3321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int capacity_; 3331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com bool ours_; // Do we own data_? 
3341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} 3361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ~Repr() { if (ours_) delete[] data_; } 3371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void clear(); 3391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void reserve(int capacity); 3401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void resize(int size); 3411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void append(const char* bytes, int byte_length); 3431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void Copy(const char* data, int size); 3441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void TakeOwnershipOf(char* data, int size, int capacity); 3451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com void PointTo(const char* data, int size); 3461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com string DebugString() const; 3481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com private: 3501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com Repr& operator=(const Repr&); 3511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com Repr(const Repr& other); 3521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com }; 3531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com Repr repr_; 3551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // UTF-8-specific private methods. 
3571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // These routines do not perform a validity check when compiled 3581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // in opt mode. 3591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // It is an error to call these methods with UTF-8 data that 3601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // is not interchange-valid. 3611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // 3621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); 3631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& UnsafeTakeOwnershipOfUTF8( 3641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* utf8_buffer, int byte_length, int byte_capacity); 3651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); 3661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); 3671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator UnsafeFind(const UnicodeText& look, 3681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator start_pos) const; 3691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}; 3701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool operator==(const UnicodeText& lhs, const UnicodeText& rhs); 3721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { 3741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return !(lhs == rhs); 3751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 
3761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UnicodeTextRange is a pair of iterators, useful for specifying text 3781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// segments. If the iterators are ==, the segment is empty. 3791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comtypedef pair<UnicodeText::const_iterator, 3801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText::const_iterator> UnicodeTextRange; 3811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { 3831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return r.first == r.second; 3841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// *************************** Utilities ************************* 3881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A factory function for creating a UnicodeText from a buffer of 3901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It 3911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// is an "owner.") 3921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 3931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Each byte that is structurally invalid will be replaced with a 3941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// space. 
Each codepoint that is interchange-invalid will also be 3951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// replaced with a space, even if the codepoint was represented with a 3961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// multibyte sequence in the UTF-8 data. 3971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 3981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText MakeUnicodeTextAcceptingOwnership( 3991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* utf8_buffer, int byte_length, int byte_capacity) { 4001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UnicodeText().TakeOwnershipOfUTF8( 4011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com utf8_buffer, byte_length, byte_capacity); 4021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// A factory function for creating a UnicodeText from a buffer of 4051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UTF-8 data. The new UnicodeText does not take ownership of the 4061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// buffer. (It is an "alias.") 4071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 4081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( 4091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const char* utf8_buffer, int byte_length) { 4101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UnicodeText().PointToUTF8(utf8_buffer, byte_length); 4111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Create a UnicodeText from a UTF-8 string or buffer. 
4141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 4151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// If do_copy is true, then a copy of the string is made. The copy is 4161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// owned by the resulting UnicodeText object and will be freed when 4171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// the object is destroyed. This UnicodeText object is referred to 4181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// as an "owner." 4191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 4201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// If do_copy is false, then no copy is made. The resulting 4211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// UnicodeText object does NOT take ownership of the string; in this 4221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// case, the lifetime of the UnicodeText object must not exceed the 4231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// lifetime of the string. This Unicodetext object is referred to as 4241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. 4251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 4261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// If the input string does not contain valid UTF-8, then a copy is 4271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// made (as if do_copy were true) and coerced to valid UTF-8 by 4281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// replacing each invalid byte with a space. 
4291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// 4301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, 4311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com bool do_copy) { 4321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com UnicodeText t; 4331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (do_copy) { 4341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com t.CopyUTF8(utf8_buf, len); 4351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } else { 4361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com t.PointToUTF8(utf8_buf, len); 4371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 4381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return t; 4391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { 4421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); 4431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { 4461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UTF8ToUnicodeText(utf8_buf, len, true); 4471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { 4491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UTF8ToUnicodeText(utf8_string, true); 
4501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Return a string containing the UTF-8 encoded version of all the 4531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Unicode characters in t. 4541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.cominline string UnicodeTextToUTF8(const UnicodeText& t) { 4551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return string(t.utf8_data(), t.utf8_length()); 4561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 458fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com} // namespace phonenumbers 459fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com} // namespace i18n 460fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com 4611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#endif // UTIL_UTF8_UNICODETEXT_H__ 462