string_util.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/string_util.h"
6
7#include "build/build_config.h"
8
9#include <ctype.h>
10#include <errno.h>
11#include <math.h>
12#include <stdarg.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <time.h>
17#include <wchar.h>
18#include <wctype.h>
19
20#include <algorithm>
21#include <vector>
22
23#include "base/basictypes.h"
24#include "base/logging.h"
25#include "base/singleton.h"
26#include "base/third_party/dmg_fp/dmg_fp.h"
27#include "base/utf_string_conversion_utils.h"
28#include "base/utf_string_conversions.h"
29#include "base/third_party/icu/icu_utf.h"
30
31namespace {
32
33// Force the singleton used by Empty[W]String[16] to be a unique type. This
34// prevents other code that might accidentally use Singleton<string> from
35// getting our internal one.
36struct EmptyStrings {
37  EmptyStrings() {}
38  const std::string s;
39  const std::wstring ws;
40  const string16 s16;
41};
42
43// Used by ReplaceStringPlaceholders to track the position in the string of
44// replaced parameters.
45struct ReplacementOffset {
46  ReplacementOffset(uintptr_t parameter, size_t offset)
47      : parameter(parameter),
48        offset(offset) {}
49
50  // Index of the parameter.
51  uintptr_t parameter;
52
53  // Starting position in the string.
54  size_t offset;
55};
56
57static bool CompareParameter(const ReplacementOffset& elem1,
58                             const ReplacementOffset& elem2) {
59  return elem1.parameter < elem2.parameter;
60}
61
62}  // namespace
63
64namespace base {
65
66bool IsWprintfFormatPortable(const wchar_t* format) {
67  for (const wchar_t* position = format; *position != '\0'; ++position) {
68    if (*position == '%') {
69      bool in_specification = true;
70      bool modifier_l = false;
71      while (in_specification) {
72        // Eat up characters until reaching a known specifier.
73        if (*++position == '\0') {
74          // The format string ended in the middle of a specification.  Call
75          // it portable because no unportable specifications were found.  The
76          // string is equally broken on all platforms.
77          return true;
78        }
79
80        if (*position == 'l') {
81          // 'l' is the only thing that can save the 's' and 'c' specifiers.
82          modifier_l = true;
83        } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
84                   *position == 'S' || *position == 'C' || *position == 'F' ||
85                   *position == 'D' || *position == 'O' || *position == 'U') {
86          // Not portable.
87          return false;
88        }
89
90        if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
91          // Portable, keep scanning the rest of the format string.
92          in_specification = false;
93        }
94      }
95    }
96  }
97
98  return true;
99}
100
101}  // namespace base
102
103
104const std::string& EmptyString() {
105  return Singleton<EmptyStrings>::get()->s;
106}
107
108const std::wstring& EmptyWString() {
109  return Singleton<EmptyStrings>::get()->ws;
110}
111
112const string16& EmptyString16() {
113  return Singleton<EmptyStrings>::get()->s16;
114}
115
116#define WHITESPACE_UNICODE \
117  0x0009, /* <control-0009> to <control-000D> */ \
118  0x000A,                                        \
119  0x000B,                                        \
120  0x000C,                                        \
121  0x000D,                                        \
122  0x0020, /* Space */                            \
123  0x0085, /* <control-0085> */                   \
124  0x00A0, /* No-Break Space */                   \
125  0x1680, /* Ogham Space Mark */                 \
126  0x180E, /* Mongolian Vowel Separator */        \
127  0x2000, /* En Quad to Hair Space */            \
128  0x2001,                                        \
129  0x2002,                                        \
130  0x2003,                                        \
131  0x2004,                                        \
132  0x2005,                                        \
133  0x2006,                                        \
134  0x2007,                                        \
135  0x2008,                                        \
136  0x2009,                                        \
137  0x200A,                                        \
138  0x200C, /* Zero Width Non-Joiner */            \
139  0x2028, /* Line Separator */                   \
140  0x2029, /* Paragraph Separator */              \
141  0x202F, /* Narrow No-Break Space */            \
142  0x205F, /* Medium Mathematical Space */        \
143  0x3000, /* Ideographic Space */                \
144  0
145
146const wchar_t kWhitespaceWide[] = {
147  WHITESPACE_UNICODE
148};
149const char16 kWhitespaceUTF16[] = {
150  WHITESPACE_UNICODE
151};
152const char kWhitespaceASCII[] = {
153  0x09,    // <control-0009> to <control-000D>
154  0x0A,
155  0x0B,
156  0x0C,
157  0x0D,
158  0x20,    // Space
159  0
160};
161
162const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
163
164template<typename STR>
165bool RemoveCharsT(const STR& input,
166                  const typename STR::value_type remove_chars[],
167                  STR* output) {
168  bool removed = false;
169  size_t found;
170
171  *output = input;
172
173  found = output->find_first_of(remove_chars);
174  while (found != STR::npos) {
175    removed = true;
176    output->replace(found, 1, STR());
177    found = output->find_first_of(remove_chars, found);
178  }
179
180  return removed;
181}
182
183bool RemoveChars(const std::wstring& input,
184                 const wchar_t remove_chars[],
185                 std::wstring* output) {
186  return RemoveCharsT(input, remove_chars, output);
187}
188
189#if !defined(WCHAR_T_IS_UTF16)
190bool RemoveChars(const string16& input,
191                 const char16 remove_chars[],
192                 string16* output) {
193  return RemoveCharsT(input, remove_chars, output);
194}
195#endif
196
197bool RemoveChars(const std::string& input,
198                 const char remove_chars[],
199                 std::string* output) {
200  return RemoveCharsT(input, remove_chars, output);
201}
202
203template<typename STR>
204TrimPositions TrimStringT(const STR& input,
205                          const typename STR::value_type trim_chars[],
206                          TrimPositions positions,
207                          STR* output) {
208  // Find the edges of leading/trailing whitespace as desired.
209  const typename STR::size_type last_char = input.length() - 1;
210  const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
211      input.find_first_not_of(trim_chars) : 0;
212  const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
213      input.find_last_not_of(trim_chars) : last_char;
214
215  // When the string was all whitespace, report that we stripped off whitespace
216  // from whichever position the caller was interested in.  For empty input, we
217  // stripped no whitespace, but we still need to clear |output|.
218  if (input.empty() ||
219      (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
220    bool input_was_empty = input.empty();  // in case output == &input
221    output->clear();
222    return input_was_empty ? TRIM_NONE : positions;
223  }
224
225  // Trim the whitespace.
226  *output =
227      input.substr(first_good_char, last_good_char - first_good_char + 1);
228
229  // Return where we trimmed from.
230  return static_cast<TrimPositions>(
231      ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
232      ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
233}
234
235bool TrimString(const std::wstring& input,
236                const wchar_t trim_chars[],
237                std::wstring* output) {
238  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
239}
240
241#if !defined(WCHAR_T_IS_UTF16)
242bool TrimString(const string16& input,
243                const char16 trim_chars[],
244                string16* output) {
245  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
246}
247#endif
248
249bool TrimString(const std::string& input,
250                const char trim_chars[],
251                std::string* output) {
252  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
253}
254
255void TruncateUTF8ToByteSize(const std::string& input,
256                            const size_t byte_size,
257                            std::string* output) {
258  DCHECK(output);
259  if (byte_size > input.length()) {
260    *output = input;
261    return;
262  }
263  DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
264  // Note: This cast is necessary because CBU8_NEXT uses int32s.
265  int32 truncation_length = static_cast<int32>(byte_size);
266  int32 char_index = truncation_length - 1;
267  const char* data = input.data();
268
269  // Using CBU8, we will move backwards from the truncation point
270  // to the beginning of the string looking for a valid UTF8
271  // character.  Once a full UTF8 character is found, we will
272  // truncate the string to the end of that character.
273  while (char_index >= 0) {
274    int32 prev = char_index;
275    uint32 code_point = 0;
276    CBU8_NEXT(data, char_index, truncation_length, code_point);
277    if (!base::IsValidCharacter(code_point) ||
278        !base::IsValidCodepoint(code_point)) {
279      char_index = prev - 1;
280    } else {
281      break;
282    }
283  }
284
285  if (char_index >= 0 )
286    *output = input.substr(0, char_index);
287  else
288    output->clear();
289}
290
291TrimPositions TrimWhitespace(const std::wstring& input,
292                             TrimPositions positions,
293                             std::wstring* output) {
294  return TrimStringT(input, kWhitespaceWide, positions, output);
295}
296
297#if !defined(WCHAR_T_IS_UTF16)
298TrimPositions TrimWhitespace(const string16& input,
299                             TrimPositions positions,
300                             string16* output) {
301  return TrimStringT(input, kWhitespaceUTF16, positions, output);
302}
303#endif
304
305TrimPositions TrimWhitespaceASCII(const std::string& input,
306                                  TrimPositions positions,
307                                  std::string* output) {
308  return TrimStringT(input, kWhitespaceASCII, positions, output);
309}
310
311// This function is only for backward-compatibility.
312// To be removed when all callers are updated.
313TrimPositions TrimWhitespace(const std::string& input,
314                             TrimPositions positions,
315                             std::string* output) {
316  return TrimWhitespaceASCII(input, positions, output);
317}
318
319template<typename STR>
320STR CollapseWhitespaceT(const STR& text,
321                        bool trim_sequences_with_line_breaks) {
322  STR result;
323  result.resize(text.size());
324
325  // Set flags to pretend we're already in a trimmed whitespace sequence, so we
326  // will trim any leading whitespace.
327  bool in_whitespace = true;
328  bool already_trimmed = true;
329
330  int chars_written = 0;
331  for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
332    if (IsWhitespace(*i)) {
333      if (!in_whitespace) {
334        // Reduce all whitespace sequences to a single space.
335        in_whitespace = true;
336        result[chars_written++] = L' ';
337      }
338      if (trim_sequences_with_line_breaks && !already_trimmed &&
339          ((*i == '\n') || (*i == '\r'))) {
340        // Whitespace sequences containing CR or LF are eliminated entirely.
341        already_trimmed = true;
342        --chars_written;
343      }
344    } else {
345      // Non-whitespace chracters are copied straight across.
346      in_whitespace = false;
347      already_trimmed = false;
348      result[chars_written++] = *i;
349    }
350  }
351
352  if (in_whitespace && !already_trimmed) {
353    // Any trailing whitespace is eliminated.
354    --chars_written;
355  }
356
357  result.resize(chars_written);
358  return result;
359}
360
361std::wstring CollapseWhitespace(const std::wstring& text,
362                                bool trim_sequences_with_line_breaks) {
363  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
364}
365
366#if !defined(WCHAR_T_IS_UTF16)
367string16 CollapseWhitespace(const string16& text,
368                            bool trim_sequences_with_line_breaks) {
369  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
370}
371#endif
372
373std::string CollapseWhitespaceASCII(const std::string& text,
374                                    bool trim_sequences_with_line_breaks) {
375  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
376}
377
378bool ContainsOnlyWhitespaceASCII(const std::string& str) {
379  for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
380    if (!IsAsciiWhitespace(*i))
381      return false;
382  }
383  return true;
384}
385
386bool ContainsOnlyWhitespace(const string16& str) {
387  for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
388    if (!IsWhitespace(*i))
389      return false;
390  }
391  return true;
392}
393
394template<typename STR>
395static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
396  for (typename STR::const_iterator iter = input.begin();
397       iter != input.end(); ++iter) {
398    if (characters.find(*iter) == STR::npos)
399      return false;
400  }
401  return true;
402}
403
404bool ContainsOnlyChars(const std::wstring& input,
405                       const std::wstring& characters) {
406  return ContainsOnlyCharsT(input, characters);
407}
408
409#if !defined(WCHAR_T_IS_UTF16)
410bool ContainsOnlyChars(const string16& input, const string16& characters) {
411  return ContainsOnlyCharsT(input, characters);
412}
413#endif
414
415bool ContainsOnlyChars(const std::string& input,
416                       const std::string& characters) {
417  return ContainsOnlyCharsT(input, characters);
418}
419
420std::string WideToASCII(const std::wstring& wide) {
421  DCHECK(IsStringASCII(wide)) << wide;
422  return std::string(wide.begin(), wide.end());
423}
424
425std::string UTF16ToASCII(const string16& utf16) {
426  DCHECK(IsStringASCII(utf16)) << utf16;
427  return std::string(utf16.begin(), utf16.end());
428}
429
430// Latin1 is just the low range of Unicode, so we can copy directly to convert.
431bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
432  std::string output;
433  output.resize(wide.size());
434  latin1->clear();
435  for (size_t i = 0; i < wide.size(); i++) {
436    if (wide[i] > 255)
437      return false;
438    output[i] = static_cast<char>(wide[i]);
439  }
440  latin1->swap(output);
441  return true;
442}
443
444bool IsString8Bit(const std::wstring& str) {
445  for (size_t i = 0; i < str.length(); i++) {
446    if (str[i] > 255)
447      return false;
448  }
449  return true;
450}
451
452template<class STR>
453static bool DoIsStringASCII(const STR& str) {
454  for (size_t i = 0; i < str.length(); i++) {
455    typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
456    if (c > 0x7F)
457      return false;
458  }
459  return true;
460}
461
462bool IsStringASCII(const std::wstring& str) {
463  return DoIsStringASCII(str);
464}
465
466#if !defined(WCHAR_T_IS_UTF16)
467bool IsStringASCII(const string16& str) {
468  return DoIsStringASCII(str);
469}
470#endif
471
472bool IsStringASCII(const base::StringPiece& str) {
473  return DoIsStringASCII(str);
474}
475
476bool IsStringUTF8(const std::string& str) {
477  const char *src = str.data();
478  int32 src_len = static_cast<int32>(str.length());
479  int32 char_index = 0;
480
481  while (char_index < src_len) {
482    int32 code_point;
483    CBU8_NEXT(src, char_index, src_len, code_point);
484    if (!base::IsValidCharacter(code_point))
485       return false;
486  }
487  return true;
488}
489
490template<typename Iter>
491static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
492                                          Iter a_end,
493                                          const char* b) {
494  for (Iter it = a_begin; it != a_end; ++it, ++b) {
495    if (!*b || ToLowerASCII(*it) != *b)
496      return false;
497  }
498  return *b == 0;
499}
500
501// Front-ends for LowerCaseEqualsASCII.
502bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
503  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
504}
505
506bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
507  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
508}
509
510#if !defined(WCHAR_T_IS_UTF16)
511bool LowerCaseEqualsASCII(const string16& a, const char* b) {
512  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
513}
514#endif
515
516bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
517                          std::string::const_iterator a_end,
518                          const char* b) {
519  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
520}
521
522bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
523                          std::wstring::const_iterator a_end,
524                          const char* b) {
525  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
526}
527
528#if !defined(WCHAR_T_IS_UTF16)
529bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
530                          string16::const_iterator a_end,
531                          const char* b) {
532  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
533}
534#endif
535
536// weird string library ...
537#if !defined(ANDROID)
538bool LowerCaseEqualsASCII(const char* a_begin,
539                          const char* a_end,
540                          const char* b) {
541  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
542}
543#endif // !ANDROID
544
545// weird string library ...
546#if !defined(ANDROID)
547bool LowerCaseEqualsASCII(const wchar_t* a_begin,
548                          const wchar_t* a_end,
549                          const char* b) {
550  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
551}
552#endif // !ANDROID
553
554#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID)
555bool LowerCaseEqualsASCII(const char16* a_begin,
556                          const char16* a_end,
557                          const char* b) {
558  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
559}
560#endif
561
562bool EqualsASCII(const string16& a, const base::StringPiece& b) {
563  if (a.length() != b.length())
564    return false;
565  return std::equal(b.begin(), b.end(), a.begin());
566}
567
568bool StartsWithASCII(const std::string& str,
569                     const std::string& search,
570                     bool case_sensitive) {
571  if (case_sensitive)
572    return str.compare(0, search.length(), search) == 0;
573  else
574    return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
575}
576
577template <typename STR>
578bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
579  if (case_sensitive) {
580    return str.compare(0, search.length(), search) == 0;
581  } else {
582    if (search.size() > str.size())
583      return false;
584    return std::equal(search.begin(), search.end(), str.begin(),
585                      CaseInsensitiveCompare<typename STR::value_type>());
586  }
587}
588
589bool StartsWith(const std::wstring& str, const std::wstring& search,
590                bool case_sensitive) {
591  return StartsWithT(str, search, case_sensitive);
592}
593
594#if !defined(WCHAR_T_IS_UTF16)
595bool StartsWith(const string16& str, const string16& search,
596                bool case_sensitive) {
597  return StartsWithT(str, search, case_sensitive);
598}
599#endif
600
601template <typename STR>
602bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
603  typename STR::size_type str_length = str.length();
604  typename STR::size_type search_length = search.length();
605  if (search_length > str_length)
606    return false;
607  if (case_sensitive) {
608    return str.compare(str_length - search_length, search_length, search) == 0;
609  } else {
610    return std::equal(search.begin(), search.end(),
611                      str.begin() + (str_length - search_length),
612                      CaseInsensitiveCompare<typename STR::value_type>());
613  }
614}
615
616bool EndsWith(const std::string& str, const std::string& search,
617              bool case_sensitive) {
618  return EndsWithT(str, search, case_sensitive);
619}
620
621bool EndsWith(const std::wstring& str, const std::wstring& search,
622              bool case_sensitive) {
623  return EndsWithT(str, search, case_sensitive);
624}
625
626#if !defined(WCHAR_T_IS_UTF16)
627bool EndsWith(const string16& str, const string16& search,
628              bool case_sensitive) {
629  return EndsWithT(str, search, case_sensitive);
630}
631#endif
632
633DataUnits GetByteDisplayUnits(int64 bytes) {
634  // The byte thresholds at which we display amounts.  A byte count is displayed
635  // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1].
636  // This must match the DataUnits enum.
637  static const int64 kUnitThresholds[] = {
638    0,              // DATA_UNITS_BYTE,
639    3*1024,         // DATA_UNITS_KIBIBYTE,
640    2*1024*1024,    // DATA_UNITS_MEBIBYTE,
641    1024*1024*1024  // DATA_UNITS_GIBIBYTE,
642  };
643
644  if (bytes < 0) {
645    NOTREACHED() << "Negative bytes value";
646    return DATA_UNITS_BYTE;
647  }
648
649  int unit_index = arraysize(kUnitThresholds);
650  while (--unit_index > 0) {
651    if (bytes >= kUnitThresholds[unit_index])
652      break;
653  }
654
655  DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE);
656  return DataUnits(unit_index);
657}
658
659// TODO(mpcomplete): deal with locale
660// Byte suffixes.  This must match the DataUnits enum.
661static const char* const kByteStrings[] = {
662  "B",
663  "kB",
664  "MB",
665  "GB"
666};
667
668static const char* const kSpeedStrings[] = {
669  "B/s",
670  "kB/s",
671  "MB/s",
672  "GB/s"
673};
674
675string16 FormatBytesInternal(int64 bytes,
676                             DataUnits units,
677                             bool show_units,
678                             const char* const* suffix) {
679  if (bytes < 0) {
680    NOTREACHED() << "Negative bytes value";
681    return string16();
682  }
683
684  DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE);
685
686  // Put the quantity in the right units.
687  double unit_amount = static_cast<double>(bytes);
688  for (int i = 0; i < units; ++i)
689    unit_amount /= 1024.0;
690
691  char buf[64];
692  if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100)
693    base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount);
694  else
695    base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount);
696
697  std::string ret(buf);
698  if (show_units) {
699    ret += " ";
700    ret += suffix[units];
701  }
702
703  return ASCIIToUTF16(ret);
704}
705
706string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) {
707  return FormatBytesInternal(bytes, units, show_units, kByteStrings);
708}
709
710string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) {
711  return FormatBytesInternal(bytes, units, show_units, kSpeedStrings);
712}
713
714template<class StringType>
715void DoReplaceSubstringsAfterOffset(StringType* str,
716                                    typename StringType::size_type start_offset,
717                                    const StringType& find_this,
718                                    const StringType& replace_with,
719                                    bool replace_all) {
720  if ((start_offset == StringType::npos) || (start_offset >= str->length()))
721    return;
722
723  DCHECK(!find_this.empty());
724  for (typename StringType::size_type offs(str->find(find_this, start_offset));
725      offs != StringType::npos; offs = str->find(find_this, offs)) {
726    str->replace(offs, find_this.length(), replace_with);
727    offs += replace_with.length();
728
729    if (!replace_all)
730      break;
731  }
732}
733
734void ReplaceFirstSubstringAfterOffset(string16* str,
735                                      string16::size_type start_offset,
736                                      const string16& find_this,
737                                      const string16& replace_with) {
738  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
739                                 false);  // replace first instance
740}
741
742void ReplaceFirstSubstringAfterOffset(std::string* str,
743                                      std::string::size_type start_offset,
744                                      const std::string& find_this,
745                                      const std::string& replace_with) {
746  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
747                                 false);  // replace first instance
748}
749
750void ReplaceSubstringsAfterOffset(string16* str,
751                                  string16::size_type start_offset,
752                                  const string16& find_this,
753                                  const string16& replace_with) {
754  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
755                                 true);  // replace all instances
756}
757
758void ReplaceSubstringsAfterOffset(std::string* str,
759                                  std::string::size_type start_offset,
760                                  const std::string& find_this,
761                                  const std::string& replace_with) {
762  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
763                                 true);  // replace all instances
764}
765
766
767template<typename STR>
768static size_t TokenizeT(const STR& str,
769                        const STR& delimiters,
770                        std::vector<STR>* tokens) {
771  tokens->clear();
772
773  typename STR::size_type start = str.find_first_not_of(delimiters);
774  while (start != STR::npos) {
775    typename STR::size_type end = str.find_first_of(delimiters, start + 1);
776    if (end == STR::npos) {
777      tokens->push_back(str.substr(start));
778      break;
779    } else {
780      tokens->push_back(str.substr(start, end - start));
781      start = str.find_first_not_of(delimiters, end + 1);
782    }
783  }
784
785  return tokens->size();
786}
787
788size_t Tokenize(const std::wstring& str,
789                const std::wstring& delimiters,
790                std::vector<std::wstring>* tokens) {
791  return TokenizeT(str, delimiters, tokens);
792}
793
794#if !defined(WCHAR_T_IS_UTF16)
795size_t Tokenize(const string16& str,
796                const string16& delimiters,
797                std::vector<string16>* tokens) {
798  return TokenizeT(str, delimiters, tokens);
799}
800#endif
801
802size_t Tokenize(const std::string& str,
803                const std::string& delimiters,
804                std::vector<std::string>* tokens) {
805  return TokenizeT(str, delimiters, tokens);
806}
807
808size_t Tokenize(const base::StringPiece& str,
809                const base::StringPiece& delimiters,
810                std::vector<base::StringPiece>* tokens) {
811  return TokenizeT(str, delimiters, tokens);
812}
813
814template<typename STR>
815static STR JoinStringT(const std::vector<STR>& parts,
816                       typename STR::value_type sep) {
817  if (parts.size() == 0) return STR();
818
819  STR result(parts[0]);
820  typename std::vector<STR>::const_iterator iter = parts.begin();
821  ++iter;
822
823  for (; iter != parts.end(); ++iter) {
824    result += sep;
825    result += *iter;
826  }
827
828  return result;
829}
830
831std::string JoinString(const std::vector<std::string>& parts, char sep) {
832  return JoinStringT(parts, sep);
833}
834
835#if !defined(WCHAR_T_IS_UTF16)
836string16 JoinString(const std::vector<string16>& parts, char16 sep) {
837  return JoinStringT(parts, sep);
838}
839#endif
840
841std::wstring JoinString(const std::vector<std::wstring>& parts, wchar_t sep) {
842  return JoinStringT(parts, sep);
843}
844
845template<typename STR>
846void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) {
847  const size_t length = str.length();
848  if (!length)
849    return;
850
851  bool last_was_ws = false;
852  size_t last_non_ws_start = 0;
853  for (size_t i = 0; i < length; ++i) {
854    switch (str[i]) {
855      // HTML 5 defines whitespace as: space, tab, LF, line tab, FF, or CR.
856      case L' ':
857      case L'\t':
858      case L'\xA':
859      case L'\xB':
860      case L'\xC':
861      case L'\xD':
862        if (!last_was_ws) {
863          if (i > 0) {
864            result->push_back(
865                str.substr(last_non_ws_start, i - last_non_ws_start));
866          }
867          last_was_ws = true;
868        }
869        break;
870
871      default:  // Not a space character.
872        if (last_was_ws) {
873          last_was_ws = false;
874          last_non_ws_start = i;
875        }
876        break;
877    }
878  }
879  if (!last_was_ws) {
880    result->push_back(
881        str.substr(last_non_ws_start, length - last_non_ws_start));
882  }
883}
884
885void SplitStringAlongWhitespace(const std::wstring& str,
886                                std::vector<std::wstring>* result) {
887  SplitStringAlongWhitespaceT(str, result);
888}
889
890#if !defined(WCHAR_T_IS_UTF16)
891void SplitStringAlongWhitespace(const string16& str,
892                                std::vector<string16>* result) {
893  SplitStringAlongWhitespaceT(str, result);
894}
895#endif
896
897void SplitStringAlongWhitespace(const std::string& str,
898                                std::vector<std::string>* result) {
899  SplitStringAlongWhitespaceT(str, result);
900}
901
902template<class FormatStringType, class OutStringType>
903OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
904    const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
905  size_t substitutions = subst.size();
906  DCHECK(substitutions < 10);
907
908  size_t sub_length = 0;
909  for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
910       iter != subst.end(); ++iter) {
911    sub_length += (*iter).length();
912  }
913
914  OutStringType formatted;
915  formatted.reserve(format_string.length() + sub_length);
916
917  std::vector<ReplacementOffset> r_offsets;
918  for (typename FormatStringType::const_iterator i = format_string.begin();
919       i != format_string.end(); ++i) {
920    if ('$' == *i) {
921      if (i + 1 != format_string.end()) {
922        ++i;
923        DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
924        if ('$' == *i) {
925          while (i != format_string.end() && '$' == *i) {
926            formatted.push_back('$');
927            ++i;
928          }
929          --i;
930        } else {
931          uintptr_t index = *i - '1';
932          if (offsets) {
933            ReplacementOffset r_offset(index,
934                static_cast<int>(formatted.size()));
935            r_offsets.insert(std::lower_bound(r_offsets.begin(),
936                r_offsets.end(), r_offset,
937                &CompareParameter),
938                r_offset);
939          }
940          if (index < substitutions)
941            formatted.append(subst.at(index));
942        }
943      }
944    } else {
945      formatted.push_back(*i);
946    }
947  }
948  if (offsets) {
949    for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
950        i != r_offsets.end(); ++i) {
951      offsets->push_back(i->offset);
952    }
953  }
954  return formatted;
955}
956
957string16 ReplaceStringPlaceholders(const string16& format_string,
958                                   const std::vector<string16>& subst,
959                                   std::vector<size_t>* offsets) {
960  return DoReplaceStringPlaceholders(format_string, subst, offsets);
961}
962
963std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
964                                      const std::vector<std::string>& subst,
965                                      std::vector<size_t>* offsets) {
966  return DoReplaceStringPlaceholders(format_string, subst, offsets);
967}
968
969string16 ReplaceStringPlaceholders(const string16& format_string,
970                                   const string16& a,
971                                   size_t* offset) {
972  std::vector<size_t> offsets;
973  std::vector<string16> subst;
974  subst.push_back(a);
975  string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
976
977  DCHECK(offsets.size() == 1);
978  if (offset) {
979    *offset = offsets[0];
980  }
981  return result;
982}
983
984static bool IsWildcard(base_icu::UChar32 character) {
985  return character == '*' || character == '?';
986}
987
988// Move the strings pointers to the point where they start to differ.
989template <typename CHAR, typename NEXT>
990static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
991                         const CHAR** string, const CHAR* string_end,
992                         NEXT next) {
993  const CHAR* escape = NULL;
994  while (*pattern != pattern_end && *string != string_end) {
995    if (!escape && IsWildcard(**pattern)) {
996      // We don't want to match wildcard here, except if it's escaped.
997      return;
998    }
999
1000    // Check if the escapement char is found. If so, skip it and move to the
1001    // next character.
1002    if (!escape && **pattern == '\\') {
1003      escape = *pattern;
1004      next(pattern, pattern_end);
1005      continue;
1006    }
1007
1008    // Check if the chars match, if so, increment the ptrs.
1009    const CHAR* pattern_next = *pattern;
1010    const CHAR* string_next = *string;
1011    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
1012    if (pattern_char == next(&string_next, string_end) &&
1013        pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
1014      *pattern = pattern_next;
1015      *string = string_next;
1016    } else {
1017      // Uh ho, it did not match, we are done. If the last char was an
1018      // escapement, that means that it was an error to advance the ptr here,
1019      // let's put it back where it was. This also mean that the MatchPattern
1020      // function will return false because if we can't match an escape char
1021      // here, then no one will.
1022      if (escape) {
1023        *pattern = escape;
1024      }
1025      return;
1026    }
1027
1028    escape = NULL;
1029  }
1030}
1031
1032template <typename CHAR, typename NEXT>
1033static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
1034  while (*pattern != end) {
1035    if (!IsWildcard(**pattern))
1036      return;
1037    next(pattern, end);
1038  }
1039}
1040
1041template <typename CHAR, typename NEXT>
1042static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
1043                          const CHAR* pattern, const CHAR* pattern_end,
1044                          int depth,
1045                          NEXT next) {
1046  const int kMaxDepth = 16;
1047  if (depth > kMaxDepth)
1048    return false;
1049
1050  // Eat all the matching chars.
1051  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
1052
1053  // If the string is empty, then the pattern must be empty too, or contains
1054  // only wildcards.
1055  if (eval == eval_end) {
1056    EatWildcard(&pattern, pattern_end, next);
1057    return pattern == pattern_end;
1058  }
1059
1060  // Pattern is empty but not string, this is not a match.
1061  if (pattern == pattern_end)
1062    return false;
1063
1064  // If this is a question mark, then we need to compare the rest with
1065  // the current string or the string with one character eaten.
1066  const CHAR* next_pattern = pattern;
1067  next(&next_pattern, pattern_end);
1068  if (pattern[0] == '?') {
1069    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1070                      depth + 1, next))
1071      return true;
1072    const CHAR* next_eval = eval;
1073    next(&next_eval, eval_end);
1074    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
1075                      depth + 1, next))
1076      return true;
1077  }
1078
1079  // This is a *, try to match all the possible substrings with the remainder
1080  // of the pattern.
1081  if (pattern[0] == '*') {
1082    while (eval != eval_end) {
1083      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1084                        depth + 1, next))
1085        return true;
1086      eval++;
1087    }
1088
1089    // We reached the end of the string, let see if the pattern contains only
1090    // wildcards.
1091    if (eval == eval_end) {
1092      EatWildcard(&pattern, pattern_end, next);
1093      if (pattern != pattern_end)
1094        return false;
1095      return true;
1096    }
1097  }
1098
1099  return false;
1100}
1101
1102struct NextCharUTF8 {
1103  base_icu::UChar32 operator()(const char** p, const char* end) {
1104    base_icu::UChar32 c;
1105    int offset = 0;
1106    CBU8_NEXT(*p, offset, end - *p, c);
1107    *p += offset;
1108    return c;
1109  }
1110};
1111
1112struct NextCharUTF16 {
1113  base_icu::UChar32 operator()(const char16** p, const char16* end) {
1114    base_icu::UChar32 c;
1115    int offset = 0;
1116    CBU16_NEXT(*p, offset, end - *p, c);
1117    *p += offset;
1118    return c;
1119  }
1120};
1121
1122bool MatchPattern(const base::StringPiece& eval,
1123                  const base::StringPiece& pattern) {
1124  return MatchPatternT(eval.data(), eval.data() + eval.size(),
1125                       pattern.data(), pattern.data() + pattern.size(),
1126                       0, NextCharUTF8());
1127}
1128
1129bool MatchPattern(const string16& eval, const string16& pattern) {
1130  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1131                       pattern.c_str(), pattern.c_str() + pattern.size(),
1132                       0, NextCharUTF16());
1133}
1134
1135// The following code is compatible with the OpenBSD lcpy interface.  See:
1136//   http://www.gratisoft.us/todd/papers/strlcpy.html
1137//   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1138
1139namespace {
1140
1141template <typename CHAR>
1142size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
1143  for (size_t i = 0; i < dst_size; ++i) {
1144    if ((dst[i] = src[i]) == 0)  // We hit and copied the terminating NULL.
1145      return i;
1146  }
1147
1148  // We were left off at dst_size.  We over copied 1 byte.  Null terminate.
1149  if (dst_size != 0)
1150    dst[dst_size - 1] = 0;
1151
1152  // Count the rest of the |src|, and return it's length in characters.
1153  while (src[dst_size]) ++dst_size;
1154  return dst_size;
1155}
1156
1157}  // namespace
1158
1159size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1160  return lcpyT<char>(dst, src, dst_size);
1161}
1162size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1163  return lcpyT<wchar_t>(dst, src, dst_size);
1164}
1165
1166bool ElideString(const std::wstring& input, int max_len, std::wstring* output) {
1167  DCHECK(max_len >= 0);
1168  if (static_cast<int>(input.length()) <= max_len) {
1169    output->assign(input);
1170    return false;
1171  }
1172
1173  switch (max_len) {
1174    case 0:
1175      output->clear();
1176      break;
1177    case 1:
1178      output->assign(input.substr(0, 1));
1179      break;
1180    case 2:
1181      output->assign(input.substr(0, 2));
1182      break;
1183    case 3:
1184      output->assign(input.substr(0, 1) + L"." +
1185                     input.substr(input.length() - 1));
1186      break;
1187    case 4:
1188      output->assign(input.substr(0, 1) + L".." +
1189                     input.substr(input.length() - 1));
1190      break;
1191    default: {
1192      int rstr_len = (max_len - 3) / 2;
1193      int lstr_len = rstr_len + ((max_len - 3) % 2);
1194      output->assign(input.substr(0, lstr_len) + L"..." +
1195                     input.substr(input.length() - rstr_len));
1196      break;
1197    }
1198  }
1199
1200  return true;
1201}
1202