1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/strings/string_util.h"
6
7#include <stdint.h>
8#include <limits>
9#include "base/macros.h"
10#include "base/strings/utf_string_conversion_utils.h"
11#include "base/third_party/icu/icu_utf.h"
12
13namespace base {
14
15namespace {
16
// Unsigned integer type wide enough to hold a pointer value. The helpers in
// this file use it both for pointer arithmetic and as the unit read by the
// word-at-a-time scan.
typedef uintptr_t MachineWord;

// Low-order bits that are zero in any address aligned to sizeof(MachineWord).
const MachineWord kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
19
20inline bool IsAlignedToMachineWord(const void* pointer) {
21  return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
22}
23
24template<typename T> inline T* AlignToMachineWord(T* pointer) {
25  return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
26                              ~kMachineWordAlignmentMask);
27}
28
// Selects, by machine-word size in bytes and character type, a word-sized
// constant that has the top bit (0x80) of every byte set. Only the
// combinations specialized below are defined; any other instantiation is an
// incomplete type and fails to compile.
template<size_t size, typename CharacterType> struct NonASCIIMask;

// 4-byte machine word, |char| elements.
template<>
struct NonASCIIMask<4, char> {
  static uint32_t value() { return 0x80808080U; }
};

// 8-byte machine word, |char| elements.
template<>
struct NonASCIIMask<8, char> {
  static uint64_t value() { return 0x8080808080808080ULL; }
};
36
37}  // namespace
38namespace {
39
40template<typename StringType>
41StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
42  StringType ret;
43  ret.reserve(str.size());
44  for (size_t i = 0; i < str.size(); i++)
45    ret.push_back(ToLowerASCII(str[i]));
46  return ret;
47}
48
49template<typename StringType>
50StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
51  StringType ret;
52  ret.reserve(str.size());
53  for (size_t i = 0; i < str.size(); i++)
54    ret.push_back(ToUpperASCII(str[i]));
55  return ret;
56}
57
58}  // namespace
59
60std::string ToLowerASCII(StringPiece str) {
61  return ToLowerASCIIImpl<std::string>(str);
62}
63
64std::string ToUpperASCII(StringPiece str) {
65  return ToUpperASCIIImpl<std::string>(str);
66}
67
68template<class StringType>
69int CompareCaseInsensitiveASCIIT(BasicStringPiece<StringType> a,
70                                 BasicStringPiece<StringType> b) {
71  // Find the first characters that aren't equal and compare them.  If the end
72  // of one of the strings is found before a nonequal character, the lengths
73  // of the strings are compared.
74  size_t i = 0;
75  while (i < a.length() && i < b.length()) {
76    typename StringType::value_type lower_a = ToLowerASCII(a[i]);
77    typename StringType::value_type lower_b = ToLowerASCII(b[i]);
78    if (lower_a < lower_b)
79      return -1;
80    if (lower_a > lower_b)
81      return 1;
82    i++;
83  }
84
85  // End of one string hit before finding a different character. Expect the
86  // common case to be "strings equal" at this point so check that first.
87  if (a.length() == b.length())
88    return 0;
89
90  if (a.length() < b.length())
91    return -1;
92  return 1;
93}
94
95int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b) {
96  return CompareCaseInsensitiveASCIIT<std::string>(a, b);
97}
98
99bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) {
100  if (a.length() != b.length())
101    return false;
102  return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0;
103}
104
// Copies |input| into |*output| and then replaces every occurrence of any
// character listed in |replace_chars| with the whole of |replace_with|.
// Returns true if at least one replacement was made. |output| may point at
// |input|.
template<typename STR>
bool ReplaceCharsT(const STR& input,
                   const STR& replace_chars,
                   const STR& replace_with,
                   STR* output) {
  // After each substitution the search resumes just past the inserted text,
  // so characters that came from |replace_with| are never re-examined.
  const size_t resume_offset = replace_with.length();

  *output = input;

  bool replaced_any = false;
  for (size_t pos = output->find_first_of(replace_chars);
       pos != STR::npos;
       pos = output->find_first_of(replace_chars, pos + resume_offset)) {
    output->replace(pos, 1, replace_with);
    replaced_any = true;
  }
  return replaced_any;
}
124
125bool ReplaceChars(const std::string& input,
126                  const StringPiece& replace_chars,
127                  const std::string& replace_with,
128                  std::string* output) {
129  return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output);
130}
131
132template<typename Str>
133TrimPositions TrimStringT(const Str& input,
134                          BasicStringPiece<Str> trim_chars,
135                          TrimPositions positions,
136                          Str* output) {
137  // Find the edges of leading/trailing whitespace as desired. Need to use
138  // a StringPiece version of input to be able to call find* on it with the
139  // StringPiece version of trim_chars (normally the trim_chars will be a
140  // constant so avoid making a copy).
141  BasicStringPiece<Str> input_piece(input);
142  const size_t last_char = input.length() - 1;
143  const size_t first_good_char = (positions & TRIM_LEADING) ?
144      input_piece.find_first_not_of(trim_chars) : 0;
145  const size_t last_good_char = (positions & TRIM_TRAILING) ?
146      input_piece.find_last_not_of(trim_chars) : last_char;
147
148  // When the string was all trimmed, report that we stripped off characters
149  // from whichever position the caller was interested in. For empty input, we
150  // stripped no characters, but we still need to clear |output|.
151  if (input.empty() ||
152      (first_good_char == Str::npos) || (last_good_char == Str::npos)) {
153    bool input_was_empty = input.empty();  // in case output == &input
154    output->clear();
155    return input_was_empty ? TRIM_NONE : positions;
156  }
157
158  // Trim.
159  *output =
160      input.substr(first_good_char, last_good_char - first_good_char + 1);
161
162  // Return where we trimmed from.
163  return static_cast<TrimPositions>(
164      ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
165      ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
166}
167
168bool TrimString(const std::string& input,
169                StringPiece trim_chars,
170                std::string* output) {
171  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
172}
173
174template<typename Str>
175BasicStringPiece<Str> TrimStringPieceT(BasicStringPiece<Str> input,
176                                       BasicStringPiece<Str> trim_chars,
177                                       TrimPositions positions) {
178  size_t begin = (positions & TRIM_LEADING) ?
179      input.find_first_not_of(trim_chars) : 0;
180  size_t end = (positions & TRIM_TRAILING) ?
181      input.find_last_not_of(trim_chars) + 1 : input.size();
182  return input.substr(begin, end - begin);
183}
184
185StringPiece TrimString(StringPiece input,
186                       const StringPiece& trim_chars,
187                       TrimPositions positions) {
188  return TrimStringPieceT(input, trim_chars, positions);
189}
190
191TrimPositions TrimWhitespaceASCII(const std::string& input,
192                                  TrimPositions positions,
193                                  std::string* output) {
194  return TrimStringT(input, StringPiece(kWhitespaceASCII), positions, output);
195}
196
197template <class Char>
198inline bool DoIsStringASCII(const Char* characters, size_t length) {
199  MachineWord all_char_bits = 0;
200  const Char* end = characters + length;
201
202  // Prologue: align the input.
203  while (!IsAlignedToMachineWord(characters) && characters != end) {
204    all_char_bits |= *characters;
205    ++characters;
206  }
207
208  // Compare the values of CPU word size.
209  const Char* word_end = AlignToMachineWord(end);
210  const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
211  while (characters < word_end) {
212    all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
213    characters += loop_increment;
214  }
215
216  // Process the remaining bytes.
217  while (characters != end) {
218    all_char_bits |= *characters;
219    ++characters;
220  }
221
222  MachineWord non_ascii_bit_mask =
223      NonASCIIMask<sizeof(MachineWord), Char>::value();
224  return !(all_char_bits & non_ascii_bit_mask);
225}
226
227bool IsStringASCII(const StringPiece& str) {
228  return DoIsStringASCII(str.data(), str.length());
229}
230
231bool IsStringUTF8(const StringPiece& str) {
232  const char *src = str.data();
233  int32_t src_len = static_cast<int32_t>(str.length());
234  int32_t char_index = 0;
235
236  while (char_index < src_len) {
237    int32_t code_point;
238    CBU8_NEXT(src, char_index, src_len, code_point);
239    if (!IsValidCharacter(code_point))
240      return false;
241  }
242  return true;
243}
244
245}  // namespace base
246