11ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Copyright (C) 2006 Google Inc.
21ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
31ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Licensed under the Apache License, Version 2.0 (the "License");
41ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// you may not use this file except in compliance with the License.
51ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// You may obtain a copy of the License at
61ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
71ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// http://www.apache.org/licenses/LICENSE-2.0
81ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//
91ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Unless required by applicable law or agreed to in writing, software
101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// distributed under the License is distributed on an "AS IS" BASIS,
111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// See the License for the specific language governing permissions and
131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// limitations under the License.
141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Author: Jim Meehan
161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
178d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com#include <algorithm>
181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <sstream>
191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include <cassert>
208d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com#include <cstdio>
211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/unicodetext.h"
231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/stringpiece.h"
241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com//#include "utf/stringprintf.h"
251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/utf.h"
261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#include "phonenumbers/utf/unilib.h"
271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
28fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace i18n {
29fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.comnamespace phonenumbers {
30fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com
311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::stringstream;
321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::max;
331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::hex;
341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comusing std::dec;
351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
// Counts the code points encoded in the UTF-8 byte range [start, end).
// Every byte that is not a continuation byte (bit pattern 10xxxxxx) begins a
// code point, so the count is the number of non-continuation bytes.
// Returns 0 when the range is empty.
static int CodepointDistance(const char* start, const char* end) {
  int code_points = 0;
  const char* cursor = start;
  while (cursor < end) {
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    if ((byte & 0xC0) != 0x80) {
      ++code_points;
    }
    ++cursor;
  }
  return code_points;
}
441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstatic int CodepointCount(const char* utf8, int len) {
461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return CodepointDistance(utf8, utf8 + len);
471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::difference_type
501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comdistance(const UnicodeText::const_iterator& first,
511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com         const UnicodeText::const_iterator& last) {
521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return CodepointDistance(first.it_, last.it_);
531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ---------- Utility ----------
561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstatic int ConvertToInterchangeValid(char* start, int len) {
581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // This routine is called only when we've discovered that a UTF-8 buffer
591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // was not interchange valid. This indicates a bug in the caller, and
611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // a LOG(WARNING) is done in that case.
621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // This is similar to CoerceToInterchangeValid, but it replaces each
631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // structurally valid byte with a space, and each non-interchange
641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // character with a space, even when that character requires more
651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // structurally valid UTF8, but U+FDD0 is not an interchange-valid
671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // code point. The result should contain one space, not three.
681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //
691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Since the conversion never needs to write more data than it
701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // reads, it is safe to change the buffer in place. It returns the
711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // number of bytes written.
721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  char* const in = start;
731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  char* out = start;
741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  char* const end = start + len;
751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  while (start < end) {
761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int good = UniLib::SpanInterchangeValid(start, end - start);
771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    if (good > 0) {
781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      if (out != start) {
791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com        memmove(out, start, good);
801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      }
811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      out += good;
821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      start += good;
831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      if (start == end) {
841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com        break;
851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      }
861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // Is the current string invalid UTF8 or just non-interchange UTF8?
881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    char32 rune;
891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int n;
901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    if (isvalidcharntorune(start, end - start, &rune, &n)) {
911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      // structurally valid UTF8, but not interchange valid
921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      start += n;  // Skip over the whole character.
931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    } else {  // bad UTF8
941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      start += 1;  // Skip over just one byte
951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    *out++ = ' ';
971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return out - in;
991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// *************** Data representation **********
1031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Note: the copy constructor is undefined.
1051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// After reserve(), resize(), or clear(), we're an owner, not an alias.
1071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::reserve(int new_capacity) {
1091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // If there's already enough capacity, and we're an owner, do nothing.
1101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (capacity_ >= new_capacity && ours_) return;
1111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Otherwise, allocate a new buffer.
1131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
1141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  char* new_data = new char[capacity_];
1151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // If there is an old buffer, copy it into the new buffer.
1171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (data_) {
1181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    memcpy(new_data, data_, size_);
1191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    if (ours_) delete[] data_;  // If we owned the old buffer, free it.
1201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
1211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  data_ = new_data;
1221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ours_ = true;  // We own the new buffer.
1231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // size_ is unchanged.
1241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::resize(int new_size) {
1271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (new_size == 0) {
1281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    clear();
1291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  } else {
1301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    if (!ours_ || new_size > capacity_) reserve(new_size);
1311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    // Clear the memory in the expanded part.
1321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
1331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    size_ = new_size;
1341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    ours_ = true;
1351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
1361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// This implementation of clear() deallocates the buffer if we're an owner.
1391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// That's not strictly necessary; we could just set size_ to 0.
1401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::clear() {
1411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (ours_) delete[] data_;
1421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  data_ = NULL;
1431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  size_ = capacity_ = 0;
1441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ours_ = true;
1451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::Copy(const char* data, int size) {
1481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  resize(size);
1491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  memcpy(data_, data, size);
1501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
1531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (data == data_) return;  // We already own this memory. (Weird case.)
1541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
1551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  data_ = data;
1561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  size_ = size;
1571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  capacity_ = capacity;
1581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ours_ = true;
1591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::PointTo(const char* data, int size) {
1621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
1631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  data_ = const_cast<char*>(data);
1641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  size_ = size;
1651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  capacity_ = size;
1661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ours_ = false;
1671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::Repr::append(const char* bytes, int byte_length) {
1701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  reserve(size_ + byte_length);
1711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  memcpy(data_ + size_, bytes, byte_length);
1721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  size_ += byte_length;
1731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::Repr::DebugString() const {
1761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  stringstream ss;
1771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
1791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com     << size_ << " capacity=" << capacity_ << " "
1801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com     << (ours_ ? "Owned" : "Alias") << "}";
1811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  string result;
1831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ss >> result;
1841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return result;
1861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// *************** UnicodeText ******************
1911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- Constructors -----
1931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Default constructor
1951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::UnicodeText() {
1961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
1971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
1981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Copy constructor
1991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::UnicodeText(const UnicodeText& src) {
2001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  Copy(src);
2011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Substring constructor
2041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
2051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                         const UnicodeText::const_iterator& last) {
2061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(first <= last && "Incompatible iterators");
2071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.append(first.it_, last.it_ - first.it_);
2081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::UTF8Substring(const const_iterator& first,
2111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                  const const_iterator& last) {
2121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(first <= last && "Incompatible iterators");
2131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return string(first.it_, last.it_ - first.it_);
2141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- Copy -----
2181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::operator=(const UnicodeText& src) {
2201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (this != &src) {
2211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    Copy(src);
2221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
2231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::Copy(const UnicodeText& src) {
2271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.Copy(src.repr_.data_, src.repr_.size_);
2281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
2321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.Copy(buffer, byte_length);
2331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
2348d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com    fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
2351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
2361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
2371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
2411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                           int byte_length) {
2421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.Copy(buffer, byte_length);
2431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- TakeOwnershipOf  -----
2471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
2491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                              int byte_length,
2501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                              int byte_capacity) {
2511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
2521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
2538d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com    fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
2541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
2551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
2561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
2601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                                    int byte_length,
2611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                                    int byte_capacity) {
2621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
2631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- PointTo -----
2671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
2691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
2701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    repr_.PointTo(buffer, byte_length);
2711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  } else {
2728d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com    fprintf(stderr, "UTF-8 buffer is not interchange-valid.");
2731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    repr_.Copy(buffer, byte_length);
2741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
2751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
2761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
2801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                          int byte_length) {
2811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.PointTo(buffer, byte_length);
2821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
2861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.PointTo(src.repr_.data_, src.repr_.size_);
2871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::PointTo(const const_iterator &first,
2911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                  const const_iterator &last) {
2921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(first <= last && " Incompatible iterators");
2931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
2941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
2951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
2961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- Append -----
2981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
2991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::append(const UnicodeText& u) {
3001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.append(u.repr_.data_, u.repr_.size_);
3011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
3021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::append(const const_iterator& first,
3051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                 const const_iterator& last) {
3061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(first <= last && "Incompatible iterators");
3071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.append(first.it_, last.it_ - first.it_);
3081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
3091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
3121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.append(utf8, len);
3131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
3141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// ----- substring searching -----
3171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
3191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                                              const_iterator start_pos) const {
3201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(start_pos.utf8_data() >= utf8_data());
3211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
3221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UnsafeFind(look, start_pos);
3231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
3261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return UnsafeFind(look, begin());
3271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::UnsafeFind(
3301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    const UnicodeText& look, const_iterator start_pos) const {
3311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Due to the magic of the UTF8 encoding, searching for a sequence of
3321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // letters is equivalent to substring search.
3331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  StringPiece searching(utf8_data(), utf8_length());
3341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  StringPiece look_piece(look.utf8_data(), look.utf8_length());
3351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  StringPiece::size_type found =
3361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      searching.find(look_piece, start_pos.utf8_data() - utf8_data());
3371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (found == StringPiece::npos) return end();
3381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return const_iterator(utf8_data() + found);
3391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool UnicodeText::HasReplacementChar() const {
3421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Equivalent to:
3431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //   UnicodeText replacement_char;
3441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //   replacement_char.push_back(0xFFFD);
3451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  //   return find(replacement_char) != end();
3461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  StringPiece searching(utf8_data(), utf8_length());
3471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  StringPiece looking_for("\xEF\xBF\xBD", 3);
3481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return searching.find(looking_for) != StringPiece::npos;
3491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
// ----- other methods -----
3521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Clear operator
3541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::clear() {
3551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  repr_.clear();
3561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com// Destructor
3591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::~UnicodeText() {}
3601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comvoid UnicodeText::push_back(char32 c) {
3631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (UniLib::IsValidCodepoint(c)) {
3641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    char buf[UTFmax];
3651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    int len = runetochar(buf, &c);
3661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    if (UniLib::IsInterchangeValid(buf, len)) {
3671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      repr_.append(buf, len);
3681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    } else {
3698d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com      fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c);
3701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com      repr_.append(" ", 1);
3711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    }
3721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  } else {
3738d8b5b3b2035197795d27573d4cf566b5d9ad689philip.liard@gmail.com    fprintf(stderr, "Illegal Unicode value: 0x%x\n", c);
3741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    repr_.append(" ", 1);
3751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  }
3761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comint UnicodeText::size() const {
3791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return CodepointCount(repr_.data_, repr_.size_);
3801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
3831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (&lhs == &rhs) return true;
3841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (lhs.repr_.size_ != rhs.repr_.size_) return false;
3851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
3861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
3871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::DebugString() const {
3891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  stringstream ss;
3901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
3911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ss << "{UnicodeText " << hex << this << dec << " chars="
3921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com     << size() << " repr=" << repr_.DebugString() << "}";
3931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#if 0
3941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
3951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                      this,
3961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                      size(),
3971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com                      repr_.DebugString().c_str());
3981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com#endif
3991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  string result;
4001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ss >> result;
4011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return result;
4031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
// ******************* UnicodeText::const_iterator *********************

// The implementation of const_iterator would be nicer if it
// inherited from boost::iterator_facade
// (http://boost.org/libs/iterator/doc/iterator_facade.html).
4111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::const_iterator() : it_(0) {}
4131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::const_iterator(const const_iterator& other)
4151ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    : it_(other.it_) {
4161ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4171ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4181ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator&
4191ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator::operator=(const const_iterator& other) {
4201ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (&other != this)
4211ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    it_ = other.it_;
4221ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
4231ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4241ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4251ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::begin() const {
4261ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return const_iterator(repr_.data_);
4271ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4281ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4291ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::end() const {
4301ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return const_iterator(repr_.data_ + repr_.size_);
4311ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4321ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4331ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.combool operator<(const UnicodeText::const_iterator& lhs,
4341ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com               const UnicodeText::const_iterator& rhs) {
4351ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return lhs.it_ < rhs.it_;
4361ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4371ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4381ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comchar32 UnicodeText::const_iterator::operator*() const {
4391ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // (We could call chartorune here, but that does some
4401ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // error-checking, and we're guaranteed that our data is valid
4411ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // UTF-8. Also, we expect this routine to be called very often. So
4421ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // for speed, we do the calculation ourselves.)
4431ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4441ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  // Convert from UTF-8
4451ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  uint8 byte1 = static_cast<uint8>(it_[0]);
4461ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (byte1 < 0x80)
4471ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return byte1;
4481ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4491ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  uint8 byte2 = static_cast<uint8>(it_[1]);
4501ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (byte1 < 0xE0)
4511ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return ((byte1 & 0x1F) << 6)
4521ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com          | (byte2 & 0x3F);
4531ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4541ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  uint8 byte3 = static_cast<uint8>(it_[2]);
4551ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (byte1 < 0xF0)
4561ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return ((byte1 & 0x0F) << 12)
4571ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com         | ((byte2 & 0x3F) << 6)
4581ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com         |  (byte3 & 0x3F);
4591ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4601ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  uint8 byte4 = static_cast<uint8>(it_[3]);
4611ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return ((byte1 & 0x07) << 18)
4621ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com       | ((byte2 & 0x3F) << 12)
4631ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com       | ((byte3 & 0x3F) << 6)
4641ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com       |  (byte4 & 0x3F);
4651ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4661ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4671ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
4681ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  it_ += UniLib::OneCharLen(it_);
4691ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
4701ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4711ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4721ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
4731ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  while (UniLib::IsTrailByte(*--it_)) { }
4741ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return *this;
4751ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4761ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4771ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comint UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
4781ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  utf8_output[0] = it_[0];
4791ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (static_cast<unsigned char>(it_[0]) < 0x80)
4801ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return 1;
4811ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4821ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  utf8_output[1] = it_[1];
4831ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (static_cast<unsigned char>(it_[0]) < 0xE0)
4841ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return 2;
4851ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4861ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  utf8_output[2] = it_[2];
4871ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  if (static_cast<unsigned char>(it_[0]) < 0xF0)
4881ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com    return 3;
4891ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4901ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  utf8_output[3] = it_[3];
4911ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return 4;
4921ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
4931ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4941ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
4951ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comUnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
4961ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(p != NULL);
4971ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const char* start = utf8_data();
4981ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  int len = utf8_length();
4991ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  const char* end = start + len;
5001ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(p >= start);
5011ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(p <= end);
5021ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  assert(p == end || !UniLib::IsTrailByte(*p));
5031ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return const_iterator(p);
5041ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
5051ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
5061ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.comstring UnicodeText::const_iterator::DebugString() const {
5071ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  stringstream ss;
5081ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
5091ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ss << "{iter " << hex << it_ << "}";
5101ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  string result;
5111ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  ss >> result;
5121ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com
5131ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com  return result;
5141ad5e5bc944bfb46689d87ace2773109cb54f5ephilip.liard@gmail.com}
515fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com
516fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com}  // namespace phonenumbers
517fa6ddeed736e42c266027a0d7b696909083d066bphilip.liard@gmail.com}  // namespace i18n
518