string_util.cc revision dc0f95d653279beabeb9817299e2902918ba123e
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/string_util.h"
6
7#include "build/build_config.h"
8
9#include <ctype.h>
10#include <errno.h>
11#include <math.h>
12#include <stdarg.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <time.h>
17#include <wchar.h>
18#include <wctype.h>
19
20#include <algorithm>
21#include <vector>
22
23#include "base/basictypes.h"
24#include "base/logging.h"
25#include "base/singleton.h"
26#include "base/third_party/dmg_fp/dmg_fp.h"
27#include "base/utf_string_conversion_utils.h"
28#include "base/utf_string_conversions.h"
29#include "base/third_party/icu/icu_utf.h"
30
31namespace {
32
33// Force the singleton used by Empty[W]String[16] to be a unique type. This
34// prevents other code that might accidentally use Singleton<string> from
35// getting our internal one.
36struct EmptyStrings {
37  EmptyStrings() {}
38  const std::string s;
39  const std::wstring ws;
40  const string16 s16;
41
42  static EmptyStrings* GetInstance() {
43    return Singleton<EmptyStrings>::get();
44  }
45};
46
47// Used by ReplaceStringPlaceholders to track the position in the string of
48// replaced parameters.
49struct ReplacementOffset {
50  ReplacementOffset(uintptr_t parameter, size_t offset)
51      : parameter(parameter),
52        offset(offset) {}
53
54  // Index of the parameter.
55  uintptr_t parameter;
56
57  // Starting position in the string.
58  size_t offset;
59};
60
61static bool CompareParameter(const ReplacementOffset& elem1,
62                             const ReplacementOffset& elem2) {
63  return elem1.parameter < elem2.parameter;
64}
65
66}  // namespace
67
68namespace base {
69
70bool IsWprintfFormatPortable(const wchar_t* format) {
71  for (const wchar_t* position = format; *position != '\0'; ++position) {
72    if (*position == '%') {
73      bool in_specification = true;
74      bool modifier_l = false;
75      while (in_specification) {
76        // Eat up characters until reaching a known specifier.
77        if (*++position == '\0') {
78          // The format string ended in the middle of a specification.  Call
79          // it portable because no unportable specifications were found.  The
80          // string is equally broken on all platforms.
81          return true;
82        }
83
84        if (*position == 'l') {
85          // 'l' is the only thing that can save the 's' and 'c' specifiers.
86          modifier_l = true;
87        } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
88                   *position == 'S' || *position == 'C' || *position == 'F' ||
89                   *position == 'D' || *position == 'O' || *position == 'U') {
90          // Not portable.
91          return false;
92        }
93
94        if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
95          // Portable, keep scanning the rest of the format string.
96          in_specification = false;
97        }
98      }
99    }
100  }
101
102  return true;
103}
104
105}  // namespace base
106
107
108const std::string& EmptyString() {
109  return EmptyStrings::GetInstance()->s;
110}
111
112const std::wstring& EmptyWString() {
113  return EmptyStrings::GetInstance()->ws;
114}
115
116const string16& EmptyString16() {
117  return EmptyStrings::GetInstance()->s16;
118}
119
120#define WHITESPACE_UNICODE \
121  0x0009, /* <control-0009> to <control-000D> */ \
122  0x000A,                                        \
123  0x000B,                                        \
124  0x000C,                                        \
125  0x000D,                                        \
126  0x0020, /* Space */                            \
127  0x0085, /* <control-0085> */                   \
128  0x00A0, /* No-Break Space */                   \
129  0x1680, /* Ogham Space Mark */                 \
130  0x180E, /* Mongolian Vowel Separator */        \
131  0x2000, /* En Quad to Hair Space */            \
132  0x2001,                                        \
133  0x2002,                                        \
134  0x2003,                                        \
135  0x2004,                                        \
136  0x2005,                                        \
137  0x2006,                                        \
138  0x2007,                                        \
139  0x2008,                                        \
140  0x2009,                                        \
141  0x200A,                                        \
142  0x200C, /* Zero Width Non-Joiner */            \
143  0x2028, /* Line Separator */                   \
144  0x2029, /* Paragraph Separator */              \
145  0x202F, /* Narrow No-Break Space */            \
146  0x205F, /* Medium Mathematical Space */        \
147  0x3000, /* Ideographic Space */                \
148  0
149
150const wchar_t kWhitespaceWide[] = {
151  WHITESPACE_UNICODE
152};
153const char16 kWhitespaceUTF16[] = {
154  WHITESPACE_UNICODE
155};
156const char kWhitespaceASCII[] = {
157  0x09,    // <control-0009> to <control-000D>
158  0x0A,
159  0x0B,
160  0x0C,
161  0x0D,
162  0x20,    // Space
163  0
164};
165
166const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
167
168template<typename STR>
169bool RemoveCharsT(const STR& input,
170                  const typename STR::value_type remove_chars[],
171                  STR* output) {
172  bool removed = false;
173  size_t found;
174
175  *output = input;
176
177  found = output->find_first_of(remove_chars);
178  while (found != STR::npos) {
179    removed = true;
180    output->replace(found, 1, STR());
181    found = output->find_first_of(remove_chars, found);
182  }
183
184  return removed;
185}
186
187bool RemoveChars(const std::wstring& input,
188                 const wchar_t remove_chars[],
189                 std::wstring* output) {
190  return RemoveCharsT(input, remove_chars, output);
191}
192
193#if !defined(WCHAR_T_IS_UTF16)
194bool RemoveChars(const string16& input,
195                 const char16 remove_chars[],
196                 string16* output) {
197  return RemoveCharsT(input, remove_chars, output);
198}
199#endif
200
201bool RemoveChars(const std::string& input,
202                 const char remove_chars[],
203                 std::string* output) {
204  return RemoveCharsT(input, remove_chars, output);
205}
206
207template<typename STR>
208TrimPositions TrimStringT(const STR& input,
209                          const typename STR::value_type trim_chars[],
210                          TrimPositions positions,
211                          STR* output) {
212  // Find the edges of leading/trailing whitespace as desired.
213  const typename STR::size_type last_char = input.length() - 1;
214  const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
215      input.find_first_not_of(trim_chars) : 0;
216  const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
217      input.find_last_not_of(trim_chars) : last_char;
218
219  // When the string was all whitespace, report that we stripped off whitespace
220  // from whichever position the caller was interested in.  For empty input, we
221  // stripped no whitespace, but we still need to clear |output|.
222  if (input.empty() ||
223      (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
224    bool input_was_empty = input.empty();  // in case output == &input
225    output->clear();
226    return input_was_empty ? TRIM_NONE : positions;
227  }
228
229  // Trim the whitespace.
230  *output =
231      input.substr(first_good_char, last_good_char - first_good_char + 1);
232
233  // Return where we trimmed from.
234  return static_cast<TrimPositions>(
235      ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
236      ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
237}
238
239bool TrimString(const std::wstring& input,
240                const wchar_t trim_chars[],
241                std::wstring* output) {
242  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
243}
244
245#if !defined(WCHAR_T_IS_UTF16)
246bool TrimString(const string16& input,
247                const char16 trim_chars[],
248                string16* output) {
249  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
250}
251#endif
252
253bool TrimString(const std::string& input,
254                const char trim_chars[],
255                std::string* output) {
256  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
257}
258
259void TruncateUTF8ToByteSize(const std::string& input,
260                            const size_t byte_size,
261                            std::string* output) {
262  DCHECK(output);
263  if (byte_size > input.length()) {
264    *output = input;
265    return;
266  }
267  DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
268  // Note: This cast is necessary because CBU8_NEXT uses int32s.
269  int32 truncation_length = static_cast<int32>(byte_size);
270  int32 char_index = truncation_length - 1;
271  const char* data = input.data();
272
273  // Using CBU8, we will move backwards from the truncation point
274  // to the beginning of the string looking for a valid UTF8
275  // character.  Once a full UTF8 character is found, we will
276  // truncate the string to the end of that character.
277  while (char_index >= 0) {
278    int32 prev = char_index;
279    uint32 code_point = 0;
280    CBU8_NEXT(data, char_index, truncation_length, code_point);
281    if (!base::IsValidCharacter(code_point) ||
282        !base::IsValidCodepoint(code_point)) {
283      char_index = prev - 1;
284    } else {
285      break;
286    }
287  }
288
289  if (char_index >= 0 )
290    *output = input.substr(0, char_index);
291  else
292    output->clear();
293}
294
295TrimPositions TrimWhitespace(const std::wstring& input,
296                             TrimPositions positions,
297                             std::wstring* output) {
298  return TrimStringT(input, kWhitespaceWide, positions, output);
299}
300
301#if !defined(WCHAR_T_IS_UTF16)
302TrimPositions TrimWhitespace(const string16& input,
303                             TrimPositions positions,
304                             string16* output) {
305  return TrimStringT(input, kWhitespaceUTF16, positions, output);
306}
307#endif
308
309TrimPositions TrimWhitespaceASCII(const std::string& input,
310                                  TrimPositions positions,
311                                  std::string* output) {
312  return TrimStringT(input, kWhitespaceASCII, positions, output);
313}
314
315// This function is only for backward-compatibility.
316// To be removed when all callers are updated.
317TrimPositions TrimWhitespace(const std::string& input,
318                             TrimPositions positions,
319                             std::string* output) {
320  return TrimWhitespaceASCII(input, positions, output);
321}
322
323template<typename STR>
324STR CollapseWhitespaceT(const STR& text,
325                        bool trim_sequences_with_line_breaks) {
326  STR result;
327  result.resize(text.size());
328
329  // Set flags to pretend we're already in a trimmed whitespace sequence, so we
330  // will trim any leading whitespace.
331  bool in_whitespace = true;
332  bool already_trimmed = true;
333
334  int chars_written = 0;
335  for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
336    if (IsWhitespace(*i)) {
337      if (!in_whitespace) {
338        // Reduce all whitespace sequences to a single space.
339        in_whitespace = true;
340        result[chars_written++] = L' ';
341      }
342      if (trim_sequences_with_line_breaks && !already_trimmed &&
343          ((*i == '\n') || (*i == '\r'))) {
344        // Whitespace sequences containing CR or LF are eliminated entirely.
345        already_trimmed = true;
346        --chars_written;
347      }
348    } else {
349      // Non-whitespace chracters are copied straight across.
350      in_whitespace = false;
351      already_trimmed = false;
352      result[chars_written++] = *i;
353    }
354  }
355
356  if (in_whitespace && !already_trimmed) {
357    // Any trailing whitespace is eliminated.
358    --chars_written;
359  }
360
361  result.resize(chars_written);
362  return result;
363}
364
365std::wstring CollapseWhitespace(const std::wstring& text,
366                                bool trim_sequences_with_line_breaks) {
367  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
368}
369
370#if !defined(WCHAR_T_IS_UTF16)
371string16 CollapseWhitespace(const string16& text,
372                            bool trim_sequences_with_line_breaks) {
373  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
374}
375#endif
376
377std::string CollapseWhitespaceASCII(const std::string& text,
378                                    bool trim_sequences_with_line_breaks) {
379  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
380}
381
382bool ContainsOnlyWhitespaceASCII(const std::string& str) {
383  for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
384    if (!IsAsciiWhitespace(*i))
385      return false;
386  }
387  return true;
388}
389
390bool ContainsOnlyWhitespace(const string16& str) {
391  for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
392    if (!IsWhitespace(*i))
393      return false;
394  }
395  return true;
396}
397
398template<typename STR>
399static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
400  for (typename STR::const_iterator iter = input.begin();
401       iter != input.end(); ++iter) {
402    if (characters.find(*iter) == STR::npos)
403      return false;
404  }
405  return true;
406}
407
408bool ContainsOnlyChars(const std::wstring& input,
409                       const std::wstring& characters) {
410  return ContainsOnlyCharsT(input, characters);
411}
412
413#if !defined(WCHAR_T_IS_UTF16)
414bool ContainsOnlyChars(const string16& input, const string16& characters) {
415  return ContainsOnlyCharsT(input, characters);
416}
417#endif
418
419bool ContainsOnlyChars(const std::string& input,
420                       const std::string& characters) {
421  return ContainsOnlyCharsT(input, characters);
422}
423
424std::string WideToASCII(const std::wstring& wide) {
425  DCHECK(IsStringASCII(wide)) << wide;
426  return std::string(wide.begin(), wide.end());
427}
428
429std::string UTF16ToASCII(const string16& utf16) {
430  DCHECK(IsStringASCII(utf16)) << utf16;
431  return std::string(utf16.begin(), utf16.end());
432}
433
434// Latin1 is just the low range of Unicode, so we can copy directly to convert.
435bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
436  std::string output;
437  output.resize(wide.size());
438  latin1->clear();
439  for (size_t i = 0; i < wide.size(); i++) {
440    if (wide[i] > 255)
441      return false;
442    output[i] = static_cast<char>(wide[i]);
443  }
444  latin1->swap(output);
445  return true;
446}
447
448template<class STR>
449static bool DoIsStringASCII(const STR& str) {
450  for (size_t i = 0; i < str.length(); i++) {
451    typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
452    if (c > 0x7F)
453      return false;
454  }
455  return true;
456}
457
458bool IsStringASCII(const std::wstring& str) {
459  return DoIsStringASCII(str);
460}
461
462#if !defined(WCHAR_T_IS_UTF16)
463bool IsStringASCII(const string16& str) {
464  return DoIsStringASCII(str);
465}
466#endif
467
468bool IsStringASCII(const base::StringPiece& str) {
469  return DoIsStringASCII(str);
470}
471
472bool IsStringUTF8(const std::string& str) {
473  const char *src = str.data();
474  int32 src_len = static_cast<int32>(str.length());
475  int32 char_index = 0;
476
477  while (char_index < src_len) {
478    int32 code_point;
479    CBU8_NEXT(src, char_index, src_len, code_point);
480    if (!base::IsValidCharacter(code_point))
481       return false;
482  }
483  return true;
484}
485
486template<typename Iter>
487static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
488                                          Iter a_end,
489                                          const char* b) {
490  for (Iter it = a_begin; it != a_end; ++it, ++b) {
491    if (!*b || base::ToLowerASCII(*it) != *b)
492      return false;
493  }
494  return *b == 0;
495}
496
497// Front-ends for LowerCaseEqualsASCII.
498bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
499  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
500}
501
502bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
503  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
504}
505
506#if !defined(WCHAR_T_IS_UTF16)
507bool LowerCaseEqualsASCII(const string16& a, const char* b) {
508  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
509}
510#endif
511
512bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
513                          std::string::const_iterator a_end,
514                          const char* b) {
515  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
516}
517
518bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
519                          std::wstring::const_iterator a_end,
520                          const char* b) {
521  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
522}
523
524#if !defined(WCHAR_T_IS_UTF16)
525bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
526                          string16::const_iterator a_end,
527                          const char* b) {
528  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
529}
530#endif
531
532#if !defined(ANDROID)
533bool LowerCaseEqualsASCII(const char* a_begin,
534                          const char* a_end,
535                          const char* b) {
536  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
537}
538#endif // !ANDROID
539
540#if !defined(ANDROID)
541bool LowerCaseEqualsASCII(const wchar_t* a_begin,
542                          const wchar_t* a_end,
543                          const char* b) {
544  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
545}
546#endif // !ANDROID
547
548#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID)
549bool LowerCaseEqualsASCII(const char16* a_begin,
550                          const char16* a_end,
551                          const char* b) {
552  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
553}
554#endif
555
556bool EqualsASCII(const string16& a, const base::StringPiece& b) {
557  if (a.length() != b.length())
558    return false;
559  return std::equal(b.begin(), b.end(), a.begin());
560}
561
562bool StartsWithASCII(const std::string& str,
563                     const std::string& search,
564                     bool case_sensitive) {
565  if (case_sensitive)
566    return str.compare(0, search.length(), search) == 0;
567  else
568    return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
569}
570
571template <typename STR>
572bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
573  if (case_sensitive) {
574    return str.compare(0, search.length(), search) == 0;
575  } else {
576    if (search.size() > str.size())
577      return false;
578    return std::equal(search.begin(), search.end(), str.begin(),
579                      base::CaseInsensitiveCompare<typename STR::value_type>());
580  }
581}
582
583bool StartsWith(const std::wstring& str, const std::wstring& search,
584                bool case_sensitive) {
585  return StartsWithT(str, search, case_sensitive);
586}
587
588#if !defined(WCHAR_T_IS_UTF16)
589bool StartsWith(const string16& str, const string16& search,
590                bool case_sensitive) {
591  return StartsWithT(str, search, case_sensitive);
592}
593#endif
594
595template <typename STR>
596bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
597  typename STR::size_type str_length = str.length();
598  typename STR::size_type search_length = search.length();
599  if (search_length > str_length)
600    return false;
601  if (case_sensitive) {
602    return str.compare(str_length - search_length, search_length, search) == 0;
603  } else {
604    return std::equal(search.begin(), search.end(),
605                      str.begin() + (str_length - search_length),
606                      base::CaseInsensitiveCompare<typename STR::value_type>());
607  }
608}
609
610bool EndsWith(const std::string& str, const std::string& search,
611              bool case_sensitive) {
612  return EndsWithT(str, search, case_sensitive);
613}
614
615bool EndsWith(const std::wstring& str, const std::wstring& search,
616              bool case_sensitive) {
617  return EndsWithT(str, search, case_sensitive);
618}
619
620#if !defined(WCHAR_T_IS_UTF16)
621bool EndsWith(const string16& str, const string16& search,
622              bool case_sensitive) {
623  return EndsWithT(str, search, case_sensitive);
624}
625#endif
626
627DataUnits GetByteDisplayUnits(int64 bytes) {
628  // The byte thresholds at which we display amounts.  A byte count is displayed
629  // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1].
630  // This must match the DataUnits enum.
631  static const int64 kUnitThresholds[] = {
632    0,              // DATA_UNITS_BYTE,
633    3*1024,         // DATA_UNITS_KIBIBYTE,
634    2*1024*1024,    // DATA_UNITS_MEBIBYTE,
635    1024*1024*1024  // DATA_UNITS_GIBIBYTE,
636  };
637
638  if (bytes < 0) {
639    NOTREACHED() << "Negative bytes value";
640    return DATA_UNITS_BYTE;
641  }
642
643  int unit_index = arraysize(kUnitThresholds);
644  while (--unit_index > 0) {
645    if (bytes >= kUnitThresholds[unit_index])
646      break;
647  }
648
649  DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE);
650  return DataUnits(unit_index);
651}
652
653// TODO(mpcomplete): deal with locale
654// Byte suffixes.  This must match the DataUnits enum.
655static const char* const kByteStrings[] = {
656  "B",
657  "kB",
658  "MB",
659  "GB"
660};
661
662static const char* const kSpeedStrings[] = {
663  "B/s",
664  "kB/s",
665  "MB/s",
666  "GB/s"
667};
668
669string16 FormatBytesInternal(int64 bytes,
670                             DataUnits units,
671                             bool show_units,
672                             const char* const* suffix) {
673  if (bytes < 0) {
674    NOTREACHED() << "Negative bytes value";
675    return string16();
676  }
677
678  DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE);
679
680  // Put the quantity in the right units.
681  double unit_amount = static_cast<double>(bytes);
682  for (int i = 0; i < units; ++i)
683    unit_amount /= 1024.0;
684
685  char buf[64];
686  if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100)
687    base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount);
688  else
689    base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount);
690
691  std::string ret(buf);
692  if (show_units) {
693    ret += " ";
694    ret += suffix[units];
695  }
696
697  return ASCIIToUTF16(ret);
698}
699
700string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) {
701  return FormatBytesInternal(bytes, units, show_units, kByteStrings);
702}
703
704string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) {
705  return FormatBytesInternal(bytes, units, show_units, kSpeedStrings);
706}
707
708template<class StringType>
709void DoReplaceSubstringsAfterOffset(StringType* str,
710                                    typename StringType::size_type start_offset,
711                                    const StringType& find_this,
712                                    const StringType& replace_with,
713                                    bool replace_all) {
714  if ((start_offset == StringType::npos) || (start_offset >= str->length()))
715    return;
716
717  DCHECK(!find_this.empty());
718  for (typename StringType::size_type offs(str->find(find_this, start_offset));
719      offs != StringType::npos; offs = str->find(find_this, offs)) {
720    str->replace(offs, find_this.length(), replace_with);
721    offs += replace_with.length();
722
723    if (!replace_all)
724      break;
725  }
726}
727
728void ReplaceFirstSubstringAfterOffset(string16* str,
729                                      string16::size_type start_offset,
730                                      const string16& find_this,
731                                      const string16& replace_with) {
732  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
733                                 false);  // replace first instance
734}
735
736void ReplaceFirstSubstringAfterOffset(std::string* str,
737                                      std::string::size_type start_offset,
738                                      const std::string& find_this,
739                                      const std::string& replace_with) {
740  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
741                                 false);  // replace first instance
742}
743
744void ReplaceSubstringsAfterOffset(string16* str,
745                                  string16::size_type start_offset,
746                                  const string16& find_this,
747                                  const string16& replace_with) {
748  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
749                                 true);  // replace all instances
750}
751
752void ReplaceSubstringsAfterOffset(std::string* str,
753                                  std::string::size_type start_offset,
754                                  const std::string& find_this,
755                                  const std::string& replace_with) {
756  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
757                                 true);  // replace all instances
758}
759
760
761template<typename STR>
762static size_t TokenizeT(const STR& str,
763                        const STR& delimiters,
764                        std::vector<STR>* tokens) {
765  tokens->clear();
766
767  typename STR::size_type start = str.find_first_not_of(delimiters);
768  while (start != STR::npos) {
769    typename STR::size_type end = str.find_first_of(delimiters, start + 1);
770    if (end == STR::npos) {
771      tokens->push_back(str.substr(start));
772      break;
773    } else {
774      tokens->push_back(str.substr(start, end - start));
775      start = str.find_first_not_of(delimiters, end + 1);
776    }
777  }
778
779  return tokens->size();
780}
781
782size_t Tokenize(const std::wstring& str,
783                const std::wstring& delimiters,
784                std::vector<std::wstring>* tokens) {
785  return TokenizeT(str, delimiters, tokens);
786}
787
788#if !defined(WCHAR_T_IS_UTF16)
789size_t Tokenize(const string16& str,
790                const string16& delimiters,
791                std::vector<string16>* tokens) {
792  return TokenizeT(str, delimiters, tokens);
793}
794#endif
795
796size_t Tokenize(const std::string& str,
797                const std::string& delimiters,
798                std::vector<std::string>* tokens) {
799  return TokenizeT(str, delimiters, tokens);
800}
801
802size_t Tokenize(const base::StringPiece& str,
803                const base::StringPiece& delimiters,
804                std::vector<base::StringPiece>* tokens) {
805  return TokenizeT(str, delimiters, tokens);
806}
807
808template<typename STR>
809static STR JoinStringT(const std::vector<STR>& parts,
810                       typename STR::value_type sep) {
811  if (parts.empty())
812    return STR();
813
814  STR result(parts[0]);
815  typename std::vector<STR>::const_iterator iter = parts.begin();
816  ++iter;
817
818  for (; iter != parts.end(); ++iter) {
819    result += sep;
820    result += *iter;
821  }
822
823  return result;
824}
825
826std::string JoinString(const std::vector<std::string>& parts, char sep) {
827  return JoinStringT(parts, sep);
828}
829
830string16 JoinString(const std::vector<string16>& parts, char16 sep) {
831  return JoinStringT(parts, sep);
832}
833
834template<class FormatStringType, class OutStringType>
835OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
836    const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
837  size_t substitutions = subst.size();
838  DCHECK(substitutions < 10);
839
840  size_t sub_length = 0;
841  for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
842       iter != subst.end(); ++iter) {
843    sub_length += iter->length();
844  }
845
846  OutStringType formatted;
847  formatted.reserve(format_string.length() + sub_length);
848
849  std::vector<ReplacementOffset> r_offsets;
850  for (typename FormatStringType::const_iterator i = format_string.begin();
851       i != format_string.end(); ++i) {
852    if ('$' == *i) {
853      if (i + 1 != format_string.end()) {
854        ++i;
855        DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
856        if ('$' == *i) {
857          while (i != format_string.end() && '$' == *i) {
858            formatted.push_back('$');
859            ++i;
860          }
861          --i;
862        } else {
863          uintptr_t index = *i - '1';
864          if (offsets) {
865            ReplacementOffset r_offset(index,
866                static_cast<int>(formatted.size()));
867            r_offsets.insert(std::lower_bound(r_offsets.begin(),
868                                              r_offsets.end(),
869                                              r_offset,
870                                              &CompareParameter),
871                             r_offset);
872          }
873          if (index < substitutions)
874            formatted.append(subst.at(index));
875        }
876      }
877    } else {
878      formatted.push_back(*i);
879    }
880  }
881  if (offsets) {
882    for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
883         i != r_offsets.end(); ++i) {
884      offsets->push_back(i->offset);
885    }
886  }
887  return formatted;
888}
889
890string16 ReplaceStringPlaceholders(const string16& format_string,
891                                   const std::vector<string16>& subst,
892                                   std::vector<size_t>* offsets) {
893  return DoReplaceStringPlaceholders(format_string, subst, offsets);
894}
895
896std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
897                                      const std::vector<std::string>& subst,
898                                      std::vector<size_t>* offsets) {
899  return DoReplaceStringPlaceholders(format_string, subst, offsets);
900}
901
902string16 ReplaceStringPlaceholders(const string16& format_string,
903                                   const string16& a,
904                                   size_t* offset) {
905  std::vector<size_t> offsets;
906  std::vector<string16> subst;
907  subst.push_back(a);
908  string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
909
910  DCHECK(offsets.size() == 1);
911  if (offset) {
912    *offset = offsets[0];
913  }
914  return result;
915}
916
917static bool IsWildcard(base_icu::UChar32 character) {
918  return character == '*' || character == '?';
919}
920
921// Move the strings pointers to the point where they start to differ.
922template <typename CHAR, typename NEXT>
923static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
924                         const CHAR** string, const CHAR* string_end,
925                         NEXT next) {
926  const CHAR* escape = NULL;
927  while (*pattern != pattern_end && *string != string_end) {
928    if (!escape && IsWildcard(**pattern)) {
929      // We don't want to match wildcard here, except if it's escaped.
930      return;
931    }
932
933    // Check if the escapement char is found. If so, skip it and move to the
934    // next character.
935    if (!escape && **pattern == '\\') {
936      escape = *pattern;
937      next(pattern, pattern_end);
938      continue;
939    }
940
941    // Check if the chars match, if so, increment the ptrs.
942    const CHAR* pattern_next = *pattern;
943    const CHAR* string_next = *string;
944    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
945    if (pattern_char == next(&string_next, string_end) &&
946        pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
947      *pattern = pattern_next;
948      *string = string_next;
949    } else {
950      // Uh ho, it did not match, we are done. If the last char was an
951      // escapement, that means that it was an error to advance the ptr here,
952      // let's put it back where it was. This also mean that the MatchPattern
953      // function will return false because if we can't match an escape char
954      // here, then no one will.
955      if (escape) {
956        *pattern = escape;
957      }
958      return;
959    }
960
961    escape = NULL;
962  }
963}
964
965template <typename CHAR, typename NEXT>
966static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
967  while (*pattern != end) {
968    if (!IsWildcard(**pattern))
969      return;
970    next(pattern, end);
971  }
972}
973
974template <typename CHAR, typename NEXT>
975static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
976                          const CHAR* pattern, const CHAR* pattern_end,
977                          int depth,
978                          NEXT next) {
979  const int kMaxDepth = 16;
980  if (depth > kMaxDepth)
981    return false;
982
983  // Eat all the matching chars.
984  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
985
986  // If the string is empty, then the pattern must be empty too, or contains
987  // only wildcards.
988  if (eval == eval_end) {
989    EatWildcard(&pattern, pattern_end, next);
990    return pattern == pattern_end;
991  }
992
993  // Pattern is empty but not string, this is not a match.
994  if (pattern == pattern_end)
995    return false;
996
997  // If this is a question mark, then we need to compare the rest with
998  // the current string or the string with one character eaten.
999  const CHAR* next_pattern = pattern;
1000  next(&next_pattern, pattern_end);
1001  if (pattern[0] == '?') {
1002    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1003                      depth + 1, next))
1004      return true;
1005    const CHAR* next_eval = eval;
1006    next(&next_eval, eval_end);
1007    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
1008                      depth + 1, next))
1009      return true;
1010  }
1011
1012  // This is a *, try to match all the possible substrings with the remainder
1013  // of the pattern.
1014  if (pattern[0] == '*') {
1015    // Collapse duplicate wild cards (********** into *) so that the
1016    // method does not recurse unnecessarily. http://crbug.com/52839
1017    EatWildcard(&next_pattern, pattern_end, next);
1018
1019    while (eval != eval_end) {
1020      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1021                        depth + 1, next))
1022        return true;
1023      eval++;
1024    }
1025
1026    // We reached the end of the string, let see if the pattern contains only
1027    // wildcards.
1028    if (eval == eval_end) {
1029      EatWildcard(&pattern, pattern_end, next);
1030      if (pattern != pattern_end)
1031        return false;
1032      return true;
1033    }
1034  }
1035
1036  return false;
1037}
1038
1039struct NextCharUTF8 {
1040  base_icu::UChar32 operator()(const char** p, const char* end) {
1041    base_icu::UChar32 c;
1042    int offset = 0;
1043    CBU8_NEXT(*p, offset, end - *p, c);
1044    *p += offset;
1045    return c;
1046  }
1047};
1048
1049struct NextCharUTF16 {
1050  base_icu::UChar32 operator()(const char16** p, const char16* end) {
1051    base_icu::UChar32 c;
1052    int offset = 0;
1053    CBU16_NEXT(*p, offset, end - *p, c);
1054    *p += offset;
1055    return c;
1056  }
1057};
1058
1059bool MatchPattern(const base::StringPiece& eval,
1060                  const base::StringPiece& pattern) {
1061  return MatchPatternT(eval.data(), eval.data() + eval.size(),
1062                       pattern.data(), pattern.data() + pattern.size(),
1063                       0, NextCharUTF8());
1064}
1065
1066bool MatchPattern(const string16& eval, const string16& pattern) {
1067  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1068                       pattern.c_str(), pattern.c_str() + pattern.size(),
1069                       0, NextCharUTF16());
1070}
1071
1072// The following code is compatible with the OpenBSD lcpy interface.  See:
1073//   http://www.gratisoft.us/todd/papers/strlcpy.html
1074//   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1075
1076namespace {
1077
1078template <typename CHAR>
1079size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
1080  for (size_t i = 0; i < dst_size; ++i) {
1081    if ((dst[i] = src[i]) == 0)  // We hit and copied the terminating NULL.
1082      return i;
1083  }
1084
1085  // We were left off at dst_size.  We over copied 1 byte.  Null terminate.
1086  if (dst_size != 0)
1087    dst[dst_size - 1] = 0;
1088
1089  // Count the rest of the |src|, and return it's length in characters.
1090  while (src[dst_size]) ++dst_size;
1091  return dst_size;
1092}
1093
1094}  // namespace
1095
1096size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1097  return lcpyT<char>(dst, src, dst_size);
1098}
1099size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1100  return lcpyT<wchar_t>(dst, src, dst_size);
1101}
1102