utf_offset_string_conversions.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/strings/utf_offset_string_conversions.h"
6
7#include <algorithm>
8
9#include "base/memory/scoped_ptr.h"
10#include "base/string_piece.h"
11#include "base/strings/utf_string_conversion_utils.h"
12
13namespace base {
14
15// Converts the given source Unicode character type to the given destination
16// Unicode character type as a STL string. The given input buffer and size
17// determine the source, and the given output STL string will be replaced by
18// the result.
19template<typename SrcChar, typename DestStdString>
20bool ConvertUnicode(const SrcChar* src,
21                    size_t src_len,
22                    DestStdString* output,
23                    std::vector<size_t>* offsets_for_adjustment) {
24  if (offsets_for_adjustment) {
25    std::for_each(offsets_for_adjustment->begin(),
26                  offsets_for_adjustment->end(),
27                  LimitOffset<DestStdString>(src_len));
28  }
29
30  // ICU requires 32-bit numbers.
31  bool success = true;
32  OffsetAdjuster offset_adjuster(offsets_for_adjustment);
33  int32 src_len32 = static_cast<int32>(src_len);
34  for (int32 i = 0; i < src_len32; i++) {
35    uint32 code_point;
36    size_t original_i = i;
37    size_t chars_written = 0;
38    if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
39      chars_written = WriteUnicodeCharacter(code_point, output);
40    } else {
41      chars_written = WriteUnicodeCharacter(0xFFFD, output);
42      success = false;
43    }
44    if (offsets_for_adjustment) {
45      // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
46      // character read, not after it (so that incrementing it in the loop
47      // increment will place it at the right location), so we need to account
48      // for that in determining the amount that was read.
49      offset_adjuster.Add(OffsetAdjuster::Adjustment(original_i,
50          i - original_i + 1, chars_written));
51    }
52  }
53  return success;
54}
55
56bool UTF8ToUTF16AndAdjustOffset(const char* src,
57                                size_t src_len,
58                                string16* output,
59                                size_t* offset_for_adjustment) {
60  std::vector<size_t> offsets;
61  if (offset_for_adjustment)
62    offsets.push_back(*offset_for_adjustment);
63  PrepareForUTF16Or32Output(src, src_len, output);
64  bool ret = ConvertUnicode(src, src_len, output, &offsets);
65  if (offset_for_adjustment)
66    *offset_for_adjustment = offsets[0];
67  return ret;
68}
69
70bool UTF8ToUTF16AndAdjustOffsets(const char* src,
71                                 size_t src_len,
72                                 string16* output,
73                                 std::vector<size_t>* offsets_for_adjustment) {
74  PrepareForUTF16Or32Output(src, src_len, output);
75  return ConvertUnicode(src, src_len, output, offsets_for_adjustment);
76}
77
78string16 UTF8ToUTF16AndAdjustOffset(const base::StringPiece& utf8,
79                                        size_t* offset_for_adjustment) {
80  std::vector<size_t> offsets;
81  if (offset_for_adjustment)
82    offsets.push_back(*offset_for_adjustment);
83  string16 result;
84  UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result,
85                              &offsets);
86  if (offset_for_adjustment)
87    *offset_for_adjustment = offsets[0];
88  return result;
89}
90
91string16 UTF8ToUTF16AndAdjustOffsets(
92    const base::StringPiece& utf8,
93    std::vector<size_t>* offsets_for_adjustment) {
94  string16 result;
95  UTF8ToUTF16AndAdjustOffsets(utf8.data(), utf8.length(), &result,
96                              offsets_for_adjustment);
97  return result;
98}
99
100std::string UTF16ToUTF8AndAdjustOffset(
101    const base::StringPiece16& utf16,
102    size_t* offset_for_adjustment) {
103  std::vector<size_t> offsets;
104  if (offset_for_adjustment)
105    offsets.push_back(*offset_for_adjustment);
106  std::string result = UTF16ToUTF8AndAdjustOffsets(utf16, &offsets);
107  if (offset_for_adjustment)
108    *offset_for_adjustment = offsets[0];
109  return result;
110}
111
112std::string UTF16ToUTF8AndAdjustOffsets(
113    const base::StringPiece16& utf16,
114    std::vector<size_t>* offsets_for_adjustment) {
115  std::string result;
116  PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
117  ConvertUnicode(utf16.data(), utf16.length(), &result, offsets_for_adjustment);
118  return result;
119}
120
121OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
122                                       size_t original_length,
123                                       size_t output_length)
124    : original_offset(original_offset),
125      original_length(original_length),
126      output_length(output_length) {
127}
128
129OffsetAdjuster::OffsetAdjuster(std::vector<size_t>* offsets_for_adjustment)
130    : offsets_for_adjustment_(offsets_for_adjustment) {
131}
132
133OffsetAdjuster::~OffsetAdjuster() {
134  if (!offsets_for_adjustment_ || adjustments_.empty())
135    return;
136  for (std::vector<size_t>::iterator i(offsets_for_adjustment_->begin());
137       i != offsets_for_adjustment_->end(); ++i)
138    AdjustOffset(i);
139}
140
141void OffsetAdjuster::Add(const Adjustment& adjustment) {
142  adjustments_.push_back(adjustment);
143}
144
145void OffsetAdjuster::AdjustOffset(std::vector<size_t>::iterator offset) {
146  if (*offset == string16::npos)
147    return;
148  size_t adjustment = 0;
149  for (std::vector<Adjustment>::const_iterator i = adjustments_.begin();
150       i != adjustments_.end(); ++i) {
151    if (*offset == i->original_offset && i->output_length == 0) {
152      *offset = string16::npos;
153      return;
154    }
155    if (*offset <= i->original_offset)
156      break;
157    if (*offset < (i->original_offset + i->original_length)) {
158      *offset = string16::npos;
159      return;
160    }
161    adjustment += (i->original_length - i->output_length);
162  }
163  *offset -= adjustment;
164}
165
166}  // namespace base
167