// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Author: Jim Meehan 161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 178d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com#include <algorithm> 181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <sstream> 191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <cassert> 208d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com#include <cstdio> 211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/unicodetext.h" 231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/stringpiece.h" 241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//#include "utf/stringprintf.h" 251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/utf.h" 261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/unilib.h" 271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 28fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace i18n { 29fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace phonenumbers { 30fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com 311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::stringstream; 321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::max; 331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::hex; 341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::dec; 351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstatic int CodepointDistance(const char* start, const char* end) { 371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int n = 
0; 381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Increment n on every non-trail-byte. 391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com for (const char* p = start; p < end; ++p) { 401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com n += (*reinterpret_cast<const signed char*>(p) >= -0x40); 411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return n; 431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstatic int CodepointCount(const char* utf8, int len) { 461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return CodepointDistance(utf8, utf8 + len); 471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::difference_type 501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comdistance(const UnicodeText::const_iterator& first, 511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const UnicodeText::const_iterator& last) { 521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return CodepointDistance(first.it_, last.it_); 531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ---------- Utility ---------- 561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstatic int ConvertToInterchangeValid(char* start, int len) { 581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // This routine is called only when we've discovered that a UTF-8 buffer 
591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // was not interchange valid. This indicates a bug in the caller, and 611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // a LOG(WARNING) is done in that case. 621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // This is similar to CoerceToInterchangeValid, but it replaces each 631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // structurally valid byte with a space, and each non-interchange 641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // character with a space, even when that character requires more 651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is 661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // structurally valid UTF8, but U+FDD0 is not an interchange-valid 671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // code point. The result should contain one space, not three. 681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // 691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Since the conversion never needs to write more data than it 701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // reads, it is safe to change the buffer in place. It returns the 711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // number of bytes written. 
721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* const in = start; 731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* out = start; 741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* const end = start + len; 751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com while (start < end) { 761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int good = UniLib::SpanInterchangeValid(start, end - start); 771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (good > 0) { 781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (out != start) { 791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com memmove(out, start, good); 801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com out += good; 821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com start += good; 831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (start == end) { 841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com break; 851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Is the current string invalid UTF8 or just non-interchange UTF8? 881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char32 rune; 891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int n; 901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (isvalidcharntorune(start, end - start, &rune, &n)) { 911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // structurally valid UTF8, but not interchange valid 921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com start += n; // Skip over the whole character. 
931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } else { // bad UTF8 941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com start += 1; // Skip over just one byte 951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com *out++ = ' '; 971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return out - in; 991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// *************** Data representation ********** 1031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Note: the copy constructor is undefined. 1051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// After reserve(), resize(), or clear(), we're an owner, not an alias. 1071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::reserve(int new_capacity) { 1091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // If there's already enough capacity, and we're an owner, do nothing. 1101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (capacity_ >= new_capacity && ours_) return; 1111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Otherwise, allocate a new buffer. 
1131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); 1141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char* new_data = new char[capacity_]; 1151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // If there is an old buffer, copy it into the new buffer. 1171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (data_) { 1181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com memcpy(new_data, data_, size_); 1191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (ours_) delete[] data_; // If we owned the old buffer, free it. 1201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 1211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com data_ = new_data; 1221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ours_ = true; // We own the new buffer. 1231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // size_ is unchanged. 1241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::resize(int new_size) { 1271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (new_size == 0) { 1281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com clear(); 1291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } else { 1301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (!ours_ || new_size > capacity_) reserve(new_size); 1311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Clear the memory in the expanded part. 
1321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); 1331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com size_ = new_size; 1341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ours_ = true; 1351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 1361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// This implementation of clear() deallocates the buffer if we're an owner. 1391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// That's not strictly necessary; we could just set size_ to 0. 1401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::clear() { 1411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (ours_) delete[] data_; 1421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com data_ = NULL; 1431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com size_ = capacity_ = 0; 1441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ours_ = true; 1451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::Copy(const char* data, int size) { 1481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com resize(size); 1491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com memcpy(data_, data, size); 1501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { 1531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (data == data_) return; // We already 
own this memory. (Weird case.) 1541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 1551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com data_ = data; 1561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com size_ = size; 1571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com capacity_ = capacity; 1581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ours_ = true; 1591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::PointTo(const char* data, int size) { 1621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. 1631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com data_ = const_cast<char*>(data); 1641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com size_ = size; 1651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com capacity_ = size; 1661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ours_ = false; 1671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::append(const char* bytes, int byte_length) { 1701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com reserve(size_ + byte_length); 1711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com memcpy(data_ + size_, bytes, byte_length); 1721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com size_ += byte_length; 1731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring 
UnicodeText::Repr::DebugString() const { 1761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com stringstream ss; 1771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec 1791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com << size_ << " capacity=" << capacity_ << " " 1801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com << (ours_ ? "Owned" : "Alias") << "}"; 1811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com string result; 1831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ss >> result; 1841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return result; 1861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// *************** UnicodeText ****************** 1911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- Constructors ----- 1931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Default constructor 1951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::UnicodeText() { 1961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 1971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 1981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Copy constructor 1991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::UnicodeText(const 
UnicodeText& src) { 2001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com Copy(src); 2011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Substring constructor 2041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::UnicodeText(const UnicodeText::const_iterator& first, 2051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const UnicodeText::const_iterator& last) { 2061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(first <= last && "Incompatible iterators"); 2071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(first.it_, last.it_ - first.it_); 2081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::UTF8Substring(const const_iterator& first, 2111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const const_iterator& last) { 2121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(first <= last && "Incompatible iterators"); 2131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return string(first.it_, last.it_ - first.it_); 2141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- Copy ----- 2181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::operator=(const UnicodeText& src) { 2201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (this != &src) { 2211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com Copy(src); 
2221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::Copy(const UnicodeText& src) { 2271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.Copy(src.repr_.data_, src.repr_.size_); 2281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { 2321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.Copy(buffer, byte_length); 2331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { 2348d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n"); 2351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 2361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, 2411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_length) { 2421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.Copy(buffer, byte_length); 2431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 
2441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- TakeOwnershipOf ----- 2471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, 2491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_length, 2501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_capacity) { 2511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); 2521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { 2538d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n"); 2541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 2551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, 2601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_length, 2611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_capacity) { 2621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); 2631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 
2661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- PointTo ----- 2671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { 2691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (UniLib:: IsInterchangeValid(buffer, byte_length)) { 2701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.PointTo(buffer, byte_length); 2711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } else { 2728d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com fprintf(stderr, "UTF-8 buffer is not interchange-valid."); 2731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.Copy(buffer, byte_length); 2741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); 2751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 2761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, 2801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int byte_length) { 2811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.PointTo(buffer, byte_length); 2821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::PointTo(const UnicodeText& src) { 2861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.PointTo(src.repr_.data_, src.repr_.size_); 
2871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::PointTo(const const_iterator &first, 2911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const const_iterator &last) { 2921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(first <= last && " Incompatible iterators"); 2931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); 2941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 2951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 2961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- Append ----- 2981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 2991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::append(const UnicodeText& u) { 3001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(u.repr_.data_, u.repr_.size_); 3011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 3021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::append(const const_iterator& first, 3051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const const_iterator& last) { 3061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(first <= last && "Incompatible iterators"); 3071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(first.it_, last.it_ - first.it_); 
3081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 3091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { 3121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(utf8, len); 3131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 3141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- substring searching ----- 3171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, 3191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const_iterator start_pos) const { 3201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(start_pos.utf8_data() >= utf8_data()); 3211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); 3221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UnsafeFind(look, start_pos); 3231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { 3261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return UnsafeFind(look, begin()); 3271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator 
UnicodeText::UnsafeFind( 3301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const UnicodeText& look, const_iterator start_pos) const { 3311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Due to the magic of the UTF8 encoding, searching for a sequence of 3321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // letters is equivalent to substring search. 3331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com StringPiece searching(utf8_data(), utf8_length()); 3341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com StringPiece look_piece(look.utf8_data(), look.utf8_length()); 3351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com StringPiece::size_type found = 3361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com searching.find(look_piece, start_pos.utf8_data() - utf8_data()); 3371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (found == StringPiece::npos) return end(); 3381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return const_iterator(utf8_data() + found); 3391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool UnicodeText::HasReplacementChar() const { 3421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Equivalent to: 3431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // UnicodeText replacement_char; 3441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // replacement_char.push_back(0xFFFD); 3451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // return find(replacement_char) != end(); 3461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com StringPiece searching(utf8_data(), utf8_length()); 3471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com StringPiece looking_for("\xEF\xBF\xBD", 3); 
3481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return searching.find(looking_for) != StringPiece::npos; 3491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- other methods ----- 3521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Clear operator 3541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::clear() { 3551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.clear(); 3561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Destructor 3591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::~UnicodeText() {} 3601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::push_back(char32 c) { 3631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (UniLib::IsValidCodepoint(c)) { 3641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com char buf[UTFmax]; 3651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int len = runetochar(buf, &c); 3661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (UniLib::IsInterchangeValid(buf, len)) { 3671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(buf, len); 3681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } else { 3698d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c); 3701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(" ", 1); 
3711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 3721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } else { 3738d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com fprintf(stderr, "Illegal Unicode value: 0x%x\n", c); 3741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.append(" ", 1); 3751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com } 3761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comint UnicodeText::size() const { 3791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return CodepointCount(repr_.data_, repr_.size_); 3801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { 3831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (&lhs == &rhs) return true; 3841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (lhs.repr_.size_ != rhs.repr_.size_) return false; 3851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; 3861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 3871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::DebugString() const { 3891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com stringstream ss; 3901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 3911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ss << "{UnicodeText " << hex << this << dec << " chars=" 3921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com << size() << " repr=" << repr_.DebugString() << "}"; 
3931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#if 0 3941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return StringPrintf("{UnicodeText %p chars=%d repr=%s}", 3951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com this, 3961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com size(), 3971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com repr_.DebugString().c_str()); 3981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#endif 3991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com string result; 4001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ss >> result; 4011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return result; 4031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ******************* UnicodeText::const_iterator ********************* 4071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// The implementation of const_iterator would be nicer if it 4091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// inherited from boost::iterator_facade 4101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// (http://boost.org/libs/iterator/doc/iterator_facade.html). 
4111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::const_iterator() : it_(0) {} 4131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::const_iterator(const const_iterator& other) 4151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com : it_(other.it_) { 4161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator& 4191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::operator=(const const_iterator& other) { 4201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (&other != this) 4211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com it_ = other.it_; 4221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 4231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::begin() const { 4261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return const_iterator(repr_.data_); 4271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::end() const { 4301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return const_iterator(repr_.data_ + repr_.size_); 4311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool 
operator<(const UnicodeText::const_iterator& lhs, 4341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const UnicodeText::const_iterator& rhs) { 4351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return lhs.it_ < rhs.it_; 4361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comchar32 UnicodeText::const_iterator::operator*() const { 4391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // (We could call chartorune here, but that does some 4401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // error-checking, and we're guaranteed that our data is valid 4411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // UTF-8. Also, we expect this routine to be called very often. So 4421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // for speed, we do the calculation ourselves.) 4431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com // Convert from UTF-8 4451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com uint8 byte1 = static_cast<uint8>(it_[0]); 4461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (byte1 < 0x80) 4471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return byte1; 4481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com uint8 byte2 = static_cast<uint8>(it_[1]); 4501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (byte1 < 0xE0) 4511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return ((byte1 & 0x1F) << 6) 4521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com | (byte2 & 0x3F); 4531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com uint8 byte3 = 
static_cast<uint8>(it_[2]); 4551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (byte1 < 0xF0) 4561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return ((byte1 & 0x0F) << 12) 4571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com | ((byte2 & 0x3F) << 6) 4581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com | (byte3 & 0x3F); 4591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com uint8 byte4 = static_cast<uint8>(it_[3]); 4611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return ((byte1 & 0x07) << 18) 4621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com | ((byte2 & 0x3F) << 12) 4631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com | ((byte3 & 0x3F) << 6) 4641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com | (byte4 & 0x3F); 4651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { 4681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com it_ += UniLib::OneCharLen(it_); 4691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 4701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { 4731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com while (UniLib::IsTrailByte(*--it_)) { } 4741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return *this; 4751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 
4771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comint UnicodeText::const_iterator::get_utf8(char* utf8_output) const { 4781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com utf8_output[0] = it_[0]; 4791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (static_cast<unsigned char>(it_[0]) < 0x80) 4801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return 1; 4811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com utf8_output[1] = it_[1]; 4831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (static_cast<unsigned char>(it_[0]) < 0xE0) 4841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return 2; 4851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com utf8_output[2] = it_[2]; 4871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com if (static_cast<unsigned char>(it_[0]) < 0xF0) 4881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return 3; 4891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com utf8_output[3] = it_[3]; 4911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return 4; 4921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 4931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 4951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { 4961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(p != NULL); 4971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com const char* start = utf8_data(); 4981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com int len = utf8_length(); 4991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 
const char* end = start + len; 5001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(p >= start); 5011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(p <= end); 5021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com assert(p == end || !UniLib::IsTrailByte(*p)); 5031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return const_iterator(p); 5041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 5051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 5061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::const_iterator::DebugString() const { 5071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com stringstream ss; 5081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 5091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ss << "{iter " << hex << it_ << "}"; 5101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com string result; 5111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com ss >> result; 5121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com 5131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com return result; 5141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com} 515fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com 516fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com} // namespace phonenumbers 517fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com} // namespace i18n 518