1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
#include "base/utf_offset_string_conversions.h"

#include <stddef.h>

#include <algorithm>

#include "base/scoped_ptr.h"
#include "base/string_piece.h"
#include "base/utf_string_conversion_utils.h"
12
13using base::PrepareForUTF16Or32Output;
14using base::ReadUnicodeCharacter;
15using base::WriteUnicodeCharacter;
16
17// Generalized Unicode converter -----------------------------------------------
18
19// Converts the given source Unicode character type to the given destination
20// Unicode character type as a STL string. The given input buffer and size
21// determine the source, and the given output STL string will be replaced by
22// the result.
23template<typename SRC_CHAR>
24bool ConvertUnicode(const SRC_CHAR* src,
25                    size_t src_len,
26                    std::wstring* output,
27                    std::vector<size_t>* offsets_for_adjustment) {
28  if (offsets_for_adjustment) {
29    std::for_each(offsets_for_adjustment->begin(),
30                  offsets_for_adjustment->end(),
31                  LimitOffset<std::wstring>(src_len));
32  }
33
34  // ICU requires 32-bit numbers.
35  bool success = true;
36  AdjustOffset::Adjustments adjustments;
37  int32 src_len32 = static_cast<int32>(src_len);
38  for (int32 i = 0; i < src_len32; i++) {
39    uint32 code_point;
40    size_t original_i = i;
41    size_t chars_written = 0;
42    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
43      chars_written = WriteUnicodeCharacter(code_point, output);
44    } else {
45      chars_written = WriteUnicodeCharacter(0xFFFD, output);
46      success = false;
47    }
48    if (offsets_for_adjustment) {
49      // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
50      // character read, not after it (so that incrementing it in the loop
51      // increment will place it at the right location), so we need to account
52      // for that in determining the amount that was read.
53      adjustments.push_back(AdjustOffset::Adjustment(
54          original_i, i - original_i + 1, chars_written));
55    }
56  }
57
58  // Make offset adjustment.
59  if (offsets_for_adjustment && !adjustments.empty()) {
60    std::for_each(offsets_for_adjustment->begin(),
61                  offsets_for_adjustment->end(),
62                  AdjustOffset(adjustments));
63  }
64
65  return success;
66}
67
68// UTF-8 <-> Wide --------------------------------------------------------------
69
70bool UTF8ToWideAndAdjustOffset(const char* src,
71                               size_t src_len,
72                               std::wstring* output,
73                               size_t* offset_for_adjustment) {
74  std::vector<size_t> offsets;
75  if (offset_for_adjustment)
76    offsets.push_back(*offset_for_adjustment);
77  PrepareForUTF16Or32Output(src, src_len, output);
78  bool ret = ConvertUnicode(src, src_len, output, &offsets);
79  if (offset_for_adjustment)
80    *offset_for_adjustment = offsets[0];
81  return ret;
82}
83
84bool UTF8ToWideAndAdjustOffsets(const char* src,
85                                size_t src_len,
86                                std::wstring* output,
87                                std::vector<size_t>* offsets_for_adjustment) {
88  PrepareForUTF16Or32Output(src, src_len, output);
89  return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
90}
91
92std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8,
93                                       size_t* offset_for_adjustment) {
94  std::vector<size_t> offsets;
95  if (offset_for_adjustment)
96    offsets.push_back(*offset_for_adjustment);
97  std::wstring result;
98  UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result,
99                             &offsets);
100  if (offset_for_adjustment)
101    *offset_for_adjustment = offsets[0];
102  return result;
103}
104
105std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8,
106                                        std::vector<size_t>*
107                                            offsets_for_adjustment) {
108  std::wstring result;
109  UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result,
110                             offsets_for_adjustment);
111  return result;
112}
113
114// UTF-16 <-> Wide -------------------------------------------------------------
115
116#if defined(WCHAR_T_IS_UTF16)
117
118// When wide == UTF-16, then conversions are a NOP.
119bool UTF16ToWideAndAdjustOffset(const char16* src,
120                                size_t src_len,
121                                std::wstring* output,
122                                size_t* offset_for_adjustment) {
123  output->assign(src, src_len);
124  if (offset_for_adjustment && (*offset_for_adjustment >= src_len))
125    *offset_for_adjustment = std::wstring::npos;
126  return true;
127}
128
129bool UTF16ToWideAndAdjustOffsets(const char16* src,
130                                 size_t src_len,
131                                 std::wstring* output,
132                                 std::vector<size_t>* offsets_for_adjustment) {
133  output->assign(src, src_len);
134  if (offsets_for_adjustment) {
135    std::for_each(offsets_for_adjustment->begin(),
136                  offsets_for_adjustment->end(),
137                  LimitOffset<std::wstring>(src_len));
138  }
139  return true;
140}
141
142std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
143                                        size_t* offset_for_adjustment) {
144  if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length()))
145    *offset_for_adjustment = std::wstring::npos;
146  return utf16;
147}
148
149std::wstring UTF16ToWideAndAdjustOffsets(
150    const string16& utf16,
151    std::vector<size_t>* offsets_for_adjustment) {
152  if (offsets_for_adjustment) {
153    std::for_each(offsets_for_adjustment->begin(),
154                  offsets_for_adjustment->end(),
155                  LimitOffset<std::wstring>(utf16.length()));
156  }
157  return utf16;
158}
159
160#elif defined(WCHAR_T_IS_UTF32)
161
162bool UTF16ToWideAndAdjustOffset(const char16* src,
163                                size_t src_len,
164                                std::wstring* output,
165                                size_t* offset_for_adjustment) {
166  std::vector<size_t> offsets;
167  if (offset_for_adjustment)
168    offsets.push_back(*offset_for_adjustment);
169  output->clear();
170  // Assume that normally we won't have any non-BMP characters so the counts
171  // will be the same.
172  output->reserve(src_len);
173  bool ret = ConvertUnicode(src, src_len, output, &offsets);
174  if (offset_for_adjustment)
175    *offset_for_adjustment = offsets[0];
176  return ret;
177}
178
179bool UTF16ToWideAndAdjustOffsets(const char16* src,
180                                 size_t src_len,
181                                 std::wstring* output,
182                                 std::vector<size_t>* offsets_for_adjustment) {
183  output->clear();
184  // Assume that normally we won't have any non-BMP characters so the counts
185  // will be the same.
186  output->reserve(src_len);
187  return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
188}
189
190std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16,
191                                        size_t* offset_for_adjustment) {
192  std::vector<size_t> offsets;
193  if (offset_for_adjustment)
194    offsets.push_back(*offset_for_adjustment);
195  std::wstring result;
196  UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result,
197                              &offsets);
198  if (offset_for_adjustment)
199    *offset_for_adjustment = offsets[0];
200  return result;
201}
202
203std::wstring UTF16ToWideAndAdjustOffsets(
204    const string16& utf16,
205    std::vector<size_t>* offsets_for_adjustment) {
206  std::wstring result;
207  UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result,
208                              offsets_for_adjustment);
209  return result;
210}
211
212#endif  // defined(WCHAR_T_IS_UTF32)
213
214AdjustOffset::Adjustment::Adjustment(size_t location,
215                                     size_t old_length,
216                                     size_t new_length)
217  : location(location),
218    old_length(old_length),
219    new_length(new_length) {}
220
221AdjustOffset::AdjustOffset(const Adjustments& adjustments)
222    : adjustments_(adjustments) {}
223
224void AdjustOffset::operator()(size_t& offset) {
225  if (offset == std::wstring::npos)
226    return;
227  size_t adjustment = 0;
228  for (Adjustments::const_iterator i = adjustments_.begin();
229       i != adjustments_.end(); ++i) {
230    size_t location = i->location;
231    if (offset == location && i->new_length == 0) {
232      offset = std::wstring::npos;
233      return;
234    }
235    if (offset <= location)
236      break;
237    if (offset < (location + i->old_length)) {
238      offset = std::wstring::npos;
239      return;
240    }
241    adjustment += (i->old_length - i->new_length);
242  }
243  offset -= adjustment;
244}
245