1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/strings/utf_offset_string_conversions.h"
6
7#include <algorithm>
8
9#include "base/logging.h"
10#include "base/memory/scoped_ptr.h"
11#include "base/strings/string_piece.h"
12#include "base/strings/utf_string_conversion_utils.h"
13
14namespace base {
15
16OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
17                                       size_t original_length,
18                                       size_t output_length)
19    : original_offset(original_offset),
20      original_length(original_length),
21      output_length(output_length) {
22}
23
24// static
25void OffsetAdjuster::AdjustOffsets(
26    const Adjustments& adjustments,
27    std::vector<size_t>* offsets_for_adjustment) {
28  if (!offsets_for_adjustment || adjustments.empty())
29    return;
30  for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
31       i != offsets_for_adjustment->end(); ++i)
32    AdjustOffset(adjustments, &(*i));
33}
34
35// static
36void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
37                                  size_t* offset) {
38  if (*offset == string16::npos)
39    return;
40  int adjustment = 0;
41  for (Adjustments::const_iterator i = adjustments.begin();
42       i != adjustments.end(); ++i) {
43    if (*offset <= i->original_offset)
44      break;
45    if (*offset < (i->original_offset + i->original_length)) {
46      *offset = string16::npos;
47      return;
48    }
49    adjustment += static_cast<int>(i->original_length - i->output_length);
50  }
51  *offset -= adjustment;
52}
53
54// static
55void OffsetAdjuster::UnadjustOffsets(
56    const Adjustments& adjustments,
57    std::vector<size_t>* offsets_for_unadjustment) {
58  if (!offsets_for_unadjustment || adjustments.empty())
59    return;
60  for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
61       i != offsets_for_unadjustment->end(); ++i)
62    UnadjustOffset(adjustments, &(*i));
63}
64
65// static
66void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
67                                    size_t* offset) {
68  if (*offset == string16::npos)
69    return;
70  int adjustment = 0;
71  for (Adjustments::const_iterator i = adjustments.begin();
72       i != adjustments.end(); ++i) {
73    if (*offset + adjustment <= i->original_offset)
74      break;
75    adjustment += static_cast<int>(i->original_length - i->output_length);
76    if ((*offset + adjustment) <
77        (i->original_offset + i->original_length)) {
78      *offset = string16::npos;
79      return;
80    }
81  }
82  *offset += adjustment;
83}
84
85// static
86void OffsetAdjuster::MergeSequentialAdjustments(
87    const Adjustments& first_adjustments,
88    Adjustments* adjustments_on_adjusted_string) {
89  Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
90  Adjustments::const_iterator first_iter = first_adjustments.begin();
91  // Simultaneously iterate over all |adjustments_on_adjusted_string| and
92  // |first_adjustments|, adding adjustments to or correcting the adjustments
93  // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
94  // current number of characters collapsed by |first_adjustments| up to this
95  // point.  |currently_collapsing| keeps track of the number of characters
96  // collapsed by |first_adjustments| into the current |adjusted_iter|'s
97  // length.  These are characters that will change |shift| as soon as we're
98  // done processing the current |adjusted_iter|; they are not yet reflected in
99  // |shift|.
100  size_t shift = 0;
101  size_t currently_collapsing = 0;
102  while (adjusted_iter != adjustments_on_adjusted_string->end()) {
103    if ((first_iter == first_adjustments.end()) ||
104        ((adjusted_iter->original_offset + shift +
105          adjusted_iter->original_length) <= first_iter->original_offset)) {
106      // Entire |adjusted_iter| (accounting for its shift and including its
107      // whole original length) comes before |first_iter|.
108      //
109      // Correct the offset at |adjusted_iter| and move onto the next
110      // adjustment that needs revising.
111      adjusted_iter->original_offset += shift;
112      shift += currently_collapsing;
113      currently_collapsing = 0;
114      ++adjusted_iter;
115    } else if ((adjusted_iter->original_offset + shift) >
116               first_iter->original_offset) {
117      // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
118
119      // It's not possible for the adjustments to overlap.  (It shouldn't
120      // be possible that we have an |adjusted_iter->original_offset| that,
121      // when adjusted by the computed |shift|, is in the middle of
122      // |first_iter|'s output's length.  After all, that would mean the
123      // current adjustment_on_adjusted_string somehow points to an offset
124      // that was supposed to have been eliminated by the first set of
125      // adjustments.)
126      DCHECK_LE(first_iter->original_offset + first_iter->output_length,
127                adjusted_iter->original_offset + shift);
128
129      // Add the |first_adjustment_iter| to the full set of adjustments while
130      // making sure |adjusted_iter| continues pointing to the same element.
131      // We do this by inserting the |first_adjustment_iter| right before
132      // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
133      // the following element.
134      shift += first_iter->original_length - first_iter->output_length;
135      adjusted_iter = adjustments_on_adjusted_string->insert(
136          adjusted_iter, *first_iter);
137      ++adjusted_iter;
138      ++first_iter;
139    } else {
140      // The first adjustment adjusted something that then got further adjusted
141      // by the second set of adjustments.  In other words, |first_iter| points
142      // to something in the range covered by |adjusted_iter|'s length (after
143      // accounting for |shift|).  Precisely,
144      //   adjusted_iter->original_offset + shift
145      //   <=
146      //   first_iter->original_offset
147      //   <=
148      //   adjusted_iter->original_offset + shift +
149      //       adjusted_iter->original_length
150
151      // Modify the current |adjusted_iter| to include whatever collapsing
152      // happened in |first_iter|, then advance to the next |first_adjustments|
153      // because we dealt with the current one.
154      const int collapse = static_cast<int>(first_iter->original_length) -
155          static_cast<int>(first_iter->output_length);
156      // This function does not know how to deal with a string that expands and
157      // then gets modified, only strings that collapse and then get modified.
158      DCHECK_GT(collapse, 0);
159      adjusted_iter->original_length += collapse;
160      currently_collapsing += collapse;
161      ++first_iter;
162    }
163  }
164  DCHECK_EQ(0u, currently_collapsing);
165  if (first_iter != first_adjustments.end()) {
166    // Only first adjustments are left.  These do not need to be modified.
167    // (Their offsets are already correct with respect to the original string.)
168    // Append them all.
169    DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
170    adjustments_on_adjusted_string->insert(
171        adjustments_on_adjusted_string->end(), first_iter,
172        first_adjustments.end());
173  }
174}
175
176// Converts the given source Unicode character type to the given destination
177// Unicode character type as a STL string. The given input buffer and size
178// determine the source, and the given output STL string will be replaced by
179// the result.  If non-NULL, |adjustments| is set to reflect the all the
180// alterations to the string that are not one-character-to-one-character.
181// It will always be sorted by increasing offset.
182template<typename SrcChar, typename DestStdString>
183bool ConvertUnicode(const SrcChar* src,
184                    size_t src_len,
185                    DestStdString* output,
186                    OffsetAdjuster::Adjustments* adjustments) {
187  if (adjustments)
188    adjustments->clear();
189  // ICU requires 32-bit numbers.
190  bool success = true;
191  int32 src_len32 = static_cast<int32>(src_len);
192  for (int32 i = 0; i < src_len32; i++) {
193    uint32 code_point;
194    size_t original_i = i;
195    size_t chars_written = 0;
196    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
197      chars_written = WriteUnicodeCharacter(code_point, output);
198    } else {
199      chars_written = WriteUnicodeCharacter(0xFFFD, output);
200      success = false;
201    }
202
203    // Only bother writing an adjustment if this modification changed the
204    // length of this character.
205    // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
206    // character read, not after it (so that incrementing it in the loop
207    // increment will place it at the right location), so we need to account
208    // for that in determining the amount that was read.
209    if (adjustments && ((i - original_i + 1) != chars_written)) {
210      adjustments->push_back(OffsetAdjuster::Adjustment(
211          original_i, i - original_i + 1, chars_written));
212    }
213  }
214  return success;
215}
216
217bool UTF8ToUTF16WithAdjustments(
218    const char* src,
219    size_t src_len,
220    string16* output,
221    base::OffsetAdjuster::Adjustments* adjustments) {
222  PrepareForUTF16Or32Output(src, src_len, output);
223  return ConvertUnicode(src, src_len, output, adjustments);
224}
225
226string16 UTF8ToUTF16WithAdjustments(
227    const base::StringPiece& utf8,
228    base::OffsetAdjuster::Adjustments* adjustments) {
229  string16 result;
230  UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
231  return result;
232}
233
234string16 UTF8ToUTF16AndAdjustOffsets(
235    const base::StringPiece& utf8,
236    std::vector<size_t>* offsets_for_adjustment) {
237  std::for_each(offsets_for_adjustment->begin(),
238                offsets_for_adjustment->end(),
239                LimitOffset<base::StringPiece>(utf8.length()));
240  OffsetAdjuster::Adjustments adjustments;
241  string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
242  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
243  return result;
244}
245
246std::string UTF16ToUTF8AndAdjustOffsets(
247    const base::StringPiece16& utf16,
248    std::vector<size_t>* offsets_for_adjustment) {
249  std::for_each(offsets_for_adjustment->begin(),
250                offsets_for_adjustment->end(),
251                LimitOffset<base::StringPiece16>(utf16.length()));
252  std::string result;
253  PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
254  OffsetAdjuster::Adjustments adjustments;
255  ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
256  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
257  return result;
258}
259
260}  // namespace base
261