string_util.cc revision 1391b24619d56bae6ce14bb54ed0fb16a945e853
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/string_util.h"
6
7#include "build/build_config.h"
8
9#include <ctype.h>
10#include <errno.h>
11#include <math.h>
12#include <stdarg.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <time.h>
17#include <wchar.h>
18#include <wctype.h>
19
20#include <algorithm>
21#include <vector>
22
23#include "base/basictypes.h"
24#include "base/logging.h"
25#include "base/singleton.h"
26#include "base/third_party/dmg_fp/dmg_fp.h"
27#include "base/utf_string_conversion_utils.h"
28#include "base/utf_string_conversions.h"
29#include "base/third_party/icu/icu_utf.h"
30
31namespace {
32
33// Force the singleton used by Empty[W]String[16] to be a unique type. This
34// prevents other code that might accidentally use Singleton<string> from
35// getting our internal one.
36struct EmptyStrings {
37  EmptyStrings() {}
38  const std::string s;
39  const std::wstring ws;
40  const string16 s16;
41};
42
43// Used by ReplaceStringPlaceholders to track the position in the string of
44// replaced parameters.
45struct ReplacementOffset {
46  ReplacementOffset(uintptr_t parameter, size_t offset)
47      : parameter(parameter),
48        offset(offset) {}
49
50  // Index of the parameter.
51  uintptr_t parameter;
52
53  // Starting position in the string.
54  size_t offset;
55};
56
57static bool CompareParameter(const ReplacementOffset& elem1,
58                             const ReplacementOffset& elem2) {
59  return elem1.parameter < elem2.parameter;
60}
61
62}  // namespace
63
64namespace base {
65
66bool IsWprintfFormatPortable(const wchar_t* format) {
67  for (const wchar_t* position = format; *position != '\0'; ++position) {
68    if (*position == '%') {
69      bool in_specification = true;
70      bool modifier_l = false;
71      while (in_specification) {
72        // Eat up characters until reaching a known specifier.
73        if (*++position == '\0') {
74          // The format string ended in the middle of a specification.  Call
75          // it portable because no unportable specifications were found.  The
76          // string is equally broken on all platforms.
77          return true;
78        }
79
80        if (*position == 'l') {
81          // 'l' is the only thing that can save the 's' and 'c' specifiers.
82          modifier_l = true;
83        } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
84                   *position == 'S' || *position == 'C' || *position == 'F' ||
85                   *position == 'D' || *position == 'O' || *position == 'U') {
86          // Not portable.
87          return false;
88        }
89
90        if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
91          // Portable, keep scanning the rest of the format string.
92          in_specification = false;
93        }
94      }
95    }
96  }
97
98  return true;
99}
100
101}  // namespace base
102
103
104const std::string& EmptyString() {
105  return Singleton<EmptyStrings>::get()->s;
106}
107
108const std::wstring& EmptyWString() {
109  return Singleton<EmptyStrings>::get()->ws;
110}
111
112const string16& EmptyString16() {
113  return Singleton<EmptyStrings>::get()->s16;
114}
115
116#define WHITESPACE_UNICODE \
117  0x0009, /* <control-0009> to <control-000D> */ \
118  0x000A,                                        \
119  0x000B,                                        \
120  0x000C,                                        \
121  0x000D,                                        \
122  0x0020, /* Space */                            \
123  0x0085, /* <control-0085> */                   \
124  0x00A0, /* No-Break Space */                   \
125  0x1680, /* Ogham Space Mark */                 \
126  0x180E, /* Mongolian Vowel Separator */        \
127  0x2000, /* En Quad to Hair Space */            \
128  0x2001,                                        \
129  0x2002,                                        \
130  0x2003,                                        \
131  0x2004,                                        \
132  0x2005,                                        \
133  0x2006,                                        \
134  0x2007,                                        \
135  0x2008,                                        \
136  0x2009,                                        \
137  0x200A,                                        \
138  0x200C, /* Zero Width Non-Joiner */            \
139  0x2028, /* Line Separator */                   \
140  0x2029, /* Paragraph Separator */              \
141  0x202F, /* Narrow No-Break Space */            \
142  0x205F, /* Medium Mathematical Space */        \
143  0x3000, /* Ideographic Space */                \
144  0
145
146const wchar_t kWhitespaceWide[] = {
147  WHITESPACE_UNICODE
148};
149const char16 kWhitespaceUTF16[] = {
150  WHITESPACE_UNICODE
151};
152const char kWhitespaceASCII[] = {
153  0x09,    // <control-0009> to <control-000D>
154  0x0A,
155  0x0B,
156  0x0C,
157  0x0D,
158  0x20,    // Space
159  0
160};
161
162const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
163
164template<typename STR>
165bool RemoveCharsT(const STR& input,
166                  const typename STR::value_type remove_chars[],
167                  STR* output) {
168  bool removed = false;
169  size_t found;
170
171  *output = input;
172
173  found = output->find_first_of(remove_chars);
174  while (found != STR::npos) {
175    removed = true;
176    output->replace(found, 1, STR());
177    found = output->find_first_of(remove_chars, found);
178  }
179
180  return removed;
181}
182
183bool RemoveChars(const std::wstring& input,
184                 const wchar_t remove_chars[],
185                 std::wstring* output) {
186  return RemoveCharsT(input, remove_chars, output);
187}
188
189#if !defined(WCHAR_T_IS_UTF16)
190bool RemoveChars(const string16& input,
191                 const char16 remove_chars[],
192                 string16* output) {
193  return RemoveCharsT(input, remove_chars, output);
194}
195#endif
196
197bool RemoveChars(const std::string& input,
198                 const char remove_chars[],
199                 std::string* output) {
200  return RemoveCharsT(input, remove_chars, output);
201}
202
203template<typename STR>
204TrimPositions TrimStringT(const STR& input,
205                          const typename STR::value_type trim_chars[],
206                          TrimPositions positions,
207                          STR* output) {
208  // Find the edges of leading/trailing whitespace as desired.
209  const typename STR::size_type last_char = input.length() - 1;
210  const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
211      input.find_first_not_of(trim_chars) : 0;
212  const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
213      input.find_last_not_of(trim_chars) : last_char;
214
215  // When the string was all whitespace, report that we stripped off whitespace
216  // from whichever position the caller was interested in.  For empty input, we
217  // stripped no whitespace, but we still need to clear |output|.
218  if (input.empty() ||
219      (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
220    bool input_was_empty = input.empty();  // in case output == &input
221    output->clear();
222    return input_was_empty ? TRIM_NONE : positions;
223  }
224
225  // Trim the whitespace.
226  *output =
227      input.substr(first_good_char, last_good_char - first_good_char + 1);
228
229  // Return where we trimmed from.
230  return static_cast<TrimPositions>(
231      ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
232      ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
233}
234
235bool TrimString(const std::wstring& input,
236                const wchar_t trim_chars[],
237                std::wstring* output) {
238  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
239}
240
241#if !defined(WCHAR_T_IS_UTF16)
242bool TrimString(const string16& input,
243                const char16 trim_chars[],
244                string16* output) {
245  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
246}
247#endif
248
249bool TrimString(const std::string& input,
250                const char trim_chars[],
251                std::string* output) {
252  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
253}
254
255void TruncateUTF8ToByteSize(const std::string& input,
256                            const size_t byte_size,
257                            std::string* output) {
258  DCHECK(output);
259  if (byte_size > input.length()) {
260    *output = input;
261    return;
262  }
263  DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
264  // Note: This cast is necessary because CBU8_NEXT uses int32s.
265  int32 truncation_length = static_cast<int32>(byte_size);
266  int32 char_index = truncation_length - 1;
267  const char* data = input.data();
268
269  // Using CBU8, we will move backwards from the truncation point
270  // to the beginning of the string looking for a valid UTF8
271  // character.  Once a full UTF8 character is found, we will
272  // truncate the string to the end of that character.
273  while (char_index >= 0) {
274    int32 prev = char_index;
275    uint32 code_point = 0;
276    CBU8_NEXT(data, char_index, truncation_length, code_point);
277    if (!base::IsValidCharacter(code_point) ||
278        !base::IsValidCodepoint(code_point)) {
279      char_index = prev - 1;
280    } else {
281      break;
282    }
283  }
284
285  if (char_index >= 0 )
286    *output = input.substr(0, char_index);
287  else
288    output->clear();
289}
290
291TrimPositions TrimWhitespace(const std::wstring& input,
292                             TrimPositions positions,
293                             std::wstring* output) {
294  return TrimStringT(input, kWhitespaceWide, positions, output);
295}
296
297#if !defined(WCHAR_T_IS_UTF16)
298TrimPositions TrimWhitespace(const string16& input,
299                             TrimPositions positions,
300                             string16* output) {
301  return TrimStringT(input, kWhitespaceUTF16, positions, output);
302}
303#endif
304
305TrimPositions TrimWhitespaceASCII(const std::string& input,
306                                  TrimPositions positions,
307                                  std::string* output) {
308  return TrimStringT(input, kWhitespaceASCII, positions, output);
309}
310
311// This function is only for backward-compatibility.
312// To be removed when all callers are updated.
313TrimPositions TrimWhitespace(const std::string& input,
314                             TrimPositions positions,
315                             std::string* output) {
316  return TrimWhitespaceASCII(input, positions, output);
317}
318
319template<typename STR>
320STR CollapseWhitespaceT(const STR& text,
321                        bool trim_sequences_with_line_breaks) {
322  STR result;
323  result.resize(text.size());
324
325  // Set flags to pretend we're already in a trimmed whitespace sequence, so we
326  // will trim any leading whitespace.
327  bool in_whitespace = true;
328  bool already_trimmed = true;
329
330  int chars_written = 0;
331  for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
332    if (IsWhitespace(*i)) {
333      if (!in_whitespace) {
334        // Reduce all whitespace sequences to a single space.
335        in_whitespace = true;
336        result[chars_written++] = L' ';
337      }
338      if (trim_sequences_with_line_breaks && !already_trimmed &&
339          ((*i == '\n') || (*i == '\r'))) {
340        // Whitespace sequences containing CR or LF are eliminated entirely.
341        already_trimmed = true;
342        --chars_written;
343      }
344    } else {
345      // Non-whitespace chracters are copied straight across.
346      in_whitespace = false;
347      already_trimmed = false;
348      result[chars_written++] = *i;
349    }
350  }
351
352  if (in_whitespace && !already_trimmed) {
353    // Any trailing whitespace is eliminated.
354    --chars_written;
355  }
356
357  result.resize(chars_written);
358  return result;
359}
360
361std::wstring CollapseWhitespace(const std::wstring& text,
362                                bool trim_sequences_with_line_breaks) {
363  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
364}
365
366#if !defined(WCHAR_T_IS_UTF16)
367string16 CollapseWhitespace(const string16& text,
368                            bool trim_sequences_with_line_breaks) {
369  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
370}
371#endif
372
373std::string CollapseWhitespaceASCII(const std::string& text,
374                                    bool trim_sequences_with_line_breaks) {
375  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
376}
377
378bool ContainsOnlyWhitespaceASCII(const std::string& str) {
379  for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
380    if (!IsAsciiWhitespace(*i))
381      return false;
382  }
383  return true;
384}
385
386bool ContainsOnlyWhitespace(const string16& str) {
387  for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
388    if (!IsWhitespace(*i))
389      return false;
390  }
391  return true;
392}
393
394template<typename STR>
395static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
396  for (typename STR::const_iterator iter = input.begin();
397       iter != input.end(); ++iter) {
398    if (characters.find(*iter) == STR::npos)
399      return false;
400  }
401  return true;
402}
403
404bool ContainsOnlyChars(const std::wstring& input,
405                       const std::wstring& characters) {
406  return ContainsOnlyCharsT(input, characters);
407}
408
409#if !defined(WCHAR_T_IS_UTF16)
410bool ContainsOnlyChars(const string16& input, const string16& characters) {
411  return ContainsOnlyCharsT(input, characters);
412}
413#endif
414
415bool ContainsOnlyChars(const std::string& input,
416                       const std::string& characters) {
417  return ContainsOnlyCharsT(input, characters);
418}
419
420std::string WideToASCII(const std::wstring& wide) {
421  DCHECK(IsStringASCII(wide)) << wide;
422  return std::string(wide.begin(), wide.end());
423}
424
425std::string UTF16ToASCII(const string16& utf16) {
426  DCHECK(IsStringASCII(utf16)) << utf16;
427  return std::string(utf16.begin(), utf16.end());
428}
429
430// Latin1 is just the low range of Unicode, so we can copy directly to convert.
431bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
432  std::string output;
433  output.resize(wide.size());
434  latin1->clear();
435  for (size_t i = 0; i < wide.size(); i++) {
436    if (wide[i] > 255)
437      return false;
438    output[i] = static_cast<char>(wide[i]);
439  }
440  latin1->swap(output);
441  return true;
442}
443
444bool IsString8Bit(const std::wstring& str) {
445  for (size_t i = 0; i < str.length(); i++) {
446    if (str[i] > 255)
447      return false;
448  }
449  return true;
450}
451
452template<class STR>
453static bool DoIsStringASCII(const STR& str) {
454  for (size_t i = 0; i < str.length(); i++) {
455    typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
456    if (c > 0x7F)
457      return false;
458  }
459  return true;
460}
461
462bool IsStringASCII(const std::wstring& str) {
463  return DoIsStringASCII(str);
464}
465
466#if !defined(WCHAR_T_IS_UTF16)
467bool IsStringASCII(const string16& str) {
468  return DoIsStringASCII(str);
469}
470#endif
471
472bool IsStringASCII(const base::StringPiece& str) {
473  return DoIsStringASCII(str);
474}
475
476bool IsStringUTF8(const std::string& str) {
477  const char *src = str.data();
478  int32 src_len = static_cast<int32>(str.length());
479  int32 char_index = 0;
480
481  while (char_index < src_len) {
482    int32 code_point;
483    CBU8_NEXT(src, char_index, src_len, code_point);
484    if (!base::IsValidCharacter(code_point))
485       return false;
486  }
487  return true;
488}
489
490template<typename Iter>
491static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
492                                          Iter a_end,
493                                          const char* b) {
494  for (Iter it = a_begin; it != a_end; ++it, ++b) {
495    if (!*b || ToLowerASCII(*it) != *b)
496      return false;
497  }
498  return *b == 0;
499}
500
501// Front-ends for LowerCaseEqualsASCII.
502bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
503  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
504}
505
506bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
507  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
508}
509
510#if !defined(WCHAR_T_IS_UTF16)
511bool LowerCaseEqualsASCII(const string16& a, const char* b) {
512  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
513}
514#endif
515
516bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
517                          std::string::const_iterator a_end,
518                          const char* b) {
519  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
520}
521
522bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
523                          std::wstring::const_iterator a_end,
524                          const char* b) {
525  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
526}
527
528#if !defined(WCHAR_T_IS_UTF16)
529bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
530                          string16::const_iterator a_end,
531                          const char* b) {
532  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
533}
534#endif
535
536#if !defined(ANDROID)
537bool LowerCaseEqualsASCII(const char* a_begin,
538                          const char* a_end,
539                          const char* b) {
540  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
541}
542#endif // !ANDROID
543
544#if !defined(ANDROID)
545bool LowerCaseEqualsASCII(const wchar_t* a_begin,
546                          const wchar_t* a_end,
547                          const char* b) {
548  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
549}
550#endif // !ANDROID
551
552#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID)
553bool LowerCaseEqualsASCII(const char16* a_begin,
554                          const char16* a_end,
555                          const char* b) {
556  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
557}
558#endif
559
560bool EqualsASCII(const string16& a, const base::StringPiece& b) {
561  if (a.length() != b.length())
562    return false;
563  return std::equal(b.begin(), b.end(), a.begin());
564}
565
566bool StartsWithASCII(const std::string& str,
567                     const std::string& search,
568                     bool case_sensitive) {
569  if (case_sensitive)
570    return str.compare(0, search.length(), search) == 0;
571  else
572    return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
573}
574
575template <typename STR>
576bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
577  if (case_sensitive) {
578    return str.compare(0, search.length(), search) == 0;
579  } else {
580    if (search.size() > str.size())
581      return false;
582    return std::equal(search.begin(), search.end(), str.begin(),
583                      CaseInsensitiveCompare<typename STR::value_type>());
584  }
585}
586
587bool StartsWith(const std::wstring& str, const std::wstring& search,
588                bool case_sensitive) {
589  return StartsWithT(str, search, case_sensitive);
590}
591
592#if !defined(WCHAR_T_IS_UTF16)
593bool StartsWith(const string16& str, const string16& search,
594                bool case_sensitive) {
595  return StartsWithT(str, search, case_sensitive);
596}
597#endif
598
599template <typename STR>
600bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
601  typename STR::size_type str_length = str.length();
602  typename STR::size_type search_length = search.length();
603  if (search_length > str_length)
604    return false;
605  if (case_sensitive) {
606    return str.compare(str_length - search_length, search_length, search) == 0;
607  } else {
608    return std::equal(search.begin(), search.end(),
609                      str.begin() + (str_length - search_length),
610                      CaseInsensitiveCompare<typename STR::value_type>());
611  }
612}
613
614bool EndsWith(const std::string& str, const std::string& search,
615              bool case_sensitive) {
616  return EndsWithT(str, search, case_sensitive);
617}
618
619bool EndsWith(const std::wstring& str, const std::wstring& search,
620              bool case_sensitive) {
621  return EndsWithT(str, search, case_sensitive);
622}
623
624#if !defined(WCHAR_T_IS_UTF16)
625bool EndsWith(const string16& str, const string16& search,
626              bool case_sensitive) {
627  return EndsWithT(str, search, case_sensitive);
628}
629#endif
630
631DataUnits GetByteDisplayUnits(int64 bytes) {
632  // The byte thresholds at which we display amounts.  A byte count is displayed
633  // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1].
634  // This must match the DataUnits enum.
635  static const int64 kUnitThresholds[] = {
636    0,              // DATA_UNITS_BYTE,
637    3*1024,         // DATA_UNITS_KIBIBYTE,
638    2*1024*1024,    // DATA_UNITS_MEBIBYTE,
639    1024*1024*1024  // DATA_UNITS_GIBIBYTE,
640  };
641
642  if (bytes < 0) {
643    NOTREACHED() << "Negative bytes value";
644    return DATA_UNITS_BYTE;
645  }
646
647  int unit_index = arraysize(kUnitThresholds);
648  while (--unit_index > 0) {
649    if (bytes >= kUnitThresholds[unit_index])
650      break;
651  }
652
653  DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE);
654  return DataUnits(unit_index);
655}
656
657// TODO(mpcomplete): deal with locale
658// Byte suffixes.  This must match the DataUnits enum.
659static const char* const kByteStrings[] = {
660  "B",
661  "kB",
662  "MB",
663  "GB"
664};
665
666static const char* const kSpeedStrings[] = {
667  "B/s",
668  "kB/s",
669  "MB/s",
670  "GB/s"
671};
672
673string16 FormatBytesInternal(int64 bytes,
674                             DataUnits units,
675                             bool show_units,
676                             const char* const* suffix) {
677  if (bytes < 0) {
678    NOTREACHED() << "Negative bytes value";
679    return string16();
680  }
681
682  DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE);
683
684  // Put the quantity in the right units.
685  double unit_amount = static_cast<double>(bytes);
686  for (int i = 0; i < units; ++i)
687    unit_amount /= 1024.0;
688
689  char buf[64];
690  if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100)
691    base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount);
692  else
693    base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount);
694
695  std::string ret(buf);
696  if (show_units) {
697    ret += " ";
698    ret += suffix[units];
699  }
700
701  return ASCIIToUTF16(ret);
702}
703
704string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) {
705  return FormatBytesInternal(bytes, units, show_units, kByteStrings);
706}
707
708string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) {
709  return FormatBytesInternal(bytes, units, show_units, kSpeedStrings);
710}
711
712template<class StringType>
713void DoReplaceSubstringsAfterOffset(StringType* str,
714                                    typename StringType::size_type start_offset,
715                                    const StringType& find_this,
716                                    const StringType& replace_with,
717                                    bool replace_all) {
718  if ((start_offset == StringType::npos) || (start_offset >= str->length()))
719    return;
720
721  DCHECK(!find_this.empty());
722  for (typename StringType::size_type offs(str->find(find_this, start_offset));
723      offs != StringType::npos; offs = str->find(find_this, offs)) {
724    str->replace(offs, find_this.length(), replace_with);
725    offs += replace_with.length();
726
727    if (!replace_all)
728      break;
729  }
730}
731
732void ReplaceFirstSubstringAfterOffset(string16* str,
733                                      string16::size_type start_offset,
734                                      const string16& find_this,
735                                      const string16& replace_with) {
736  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
737                                 false);  // replace first instance
738}
739
740void ReplaceFirstSubstringAfterOffset(std::string* str,
741                                      std::string::size_type start_offset,
742                                      const std::string& find_this,
743                                      const std::string& replace_with) {
744  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
745                                 false);  // replace first instance
746}
747
748void ReplaceSubstringsAfterOffset(string16* str,
749                                  string16::size_type start_offset,
750                                  const string16& find_this,
751                                  const string16& replace_with) {
752  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
753                                 true);  // replace all instances
754}
755
756void ReplaceSubstringsAfterOffset(std::string* str,
757                                  std::string::size_type start_offset,
758                                  const std::string& find_this,
759                                  const std::string& replace_with) {
760  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
761                                 true);  // replace all instances
762}
763
764
765template<typename STR>
766static size_t TokenizeT(const STR& str,
767                        const STR& delimiters,
768                        std::vector<STR>* tokens) {
769  tokens->clear();
770
771  typename STR::size_type start = str.find_first_not_of(delimiters);
772  while (start != STR::npos) {
773    typename STR::size_type end = str.find_first_of(delimiters, start + 1);
774    if (end == STR::npos) {
775      tokens->push_back(str.substr(start));
776      break;
777    } else {
778      tokens->push_back(str.substr(start, end - start));
779      start = str.find_first_not_of(delimiters, end + 1);
780    }
781  }
782
783  return tokens->size();
784}
785
786size_t Tokenize(const std::wstring& str,
787                const std::wstring& delimiters,
788                std::vector<std::wstring>* tokens) {
789  return TokenizeT(str, delimiters, tokens);
790}
791
792#if !defined(WCHAR_T_IS_UTF16)
793size_t Tokenize(const string16& str,
794                const string16& delimiters,
795                std::vector<string16>* tokens) {
796  return TokenizeT(str, delimiters, tokens);
797}
798#endif
799
800size_t Tokenize(const std::string& str,
801                const std::string& delimiters,
802                std::vector<std::string>* tokens) {
803  return TokenizeT(str, delimiters, tokens);
804}
805
806size_t Tokenize(const base::StringPiece& str,
807                const base::StringPiece& delimiters,
808                std::vector<base::StringPiece>* tokens) {
809  return TokenizeT(str, delimiters, tokens);
810}
811
812template<typename STR>
813static STR JoinStringT(const std::vector<STR>& parts,
814                       typename STR::value_type sep) {
815  if (parts.size() == 0) return STR();
816
817  STR result(parts[0]);
818  typename std::vector<STR>::const_iterator iter = parts.begin();
819  ++iter;
820
821  for (; iter != parts.end(); ++iter) {
822    result += sep;
823    result += *iter;
824  }
825
826  return result;
827}
828
829std::string JoinString(const std::vector<std::string>& parts, char sep) {
830  return JoinStringT(parts, sep);
831}
832
833#if !defined(WCHAR_T_IS_UTF16)
834string16 JoinString(const std::vector<string16>& parts, char16 sep) {
835  return JoinStringT(parts, sep);
836}
837#endif
838
839std::wstring JoinString(const std::vector<std::wstring>& parts, wchar_t sep) {
840  return JoinStringT(parts, sep);
841}
842
843template<typename STR>
844void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) {
845  const size_t length = str.length();
846  if (!length)
847    return;
848
849  bool last_was_ws = false;
850  size_t last_non_ws_start = 0;
851  for (size_t i = 0; i < length; ++i) {
852    switch (str[i]) {
853      // HTML 5 defines whitespace as: space, tab, LF, line tab, FF, or CR.
854      case L' ':
855      case L'\t':
856      case L'\xA':
857      case L'\xB':
858      case L'\xC':
859      case L'\xD':
860        if (!last_was_ws) {
861          if (i > 0) {
862            result->push_back(
863                str.substr(last_non_ws_start, i - last_non_ws_start));
864          }
865          last_was_ws = true;
866        }
867        break;
868
869      default:  // Not a space character.
870        if (last_was_ws) {
871          last_was_ws = false;
872          last_non_ws_start = i;
873        }
874        break;
875    }
876  }
877  if (!last_was_ws) {
878    result->push_back(
879        str.substr(last_non_ws_start, length - last_non_ws_start));
880  }
881}
882
883void SplitStringAlongWhitespace(const std::wstring& str,
884                                std::vector<std::wstring>* result) {
885  SplitStringAlongWhitespaceT(str, result);
886}
887
888#if !defined(WCHAR_T_IS_UTF16)
889void SplitStringAlongWhitespace(const string16& str,
890                                std::vector<string16>* result) {
891  SplitStringAlongWhitespaceT(str, result);
892}
893#endif
894
895void SplitStringAlongWhitespace(const std::string& str,
896                                std::vector<std::string>* result) {
897  SplitStringAlongWhitespaceT(str, result);
898}
899
900template<class FormatStringType, class OutStringType>
901OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
902    const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
903  size_t substitutions = subst.size();
904  DCHECK(substitutions < 10);
905
906  size_t sub_length = 0;
907  for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
908       iter != subst.end(); ++iter) {
909    sub_length += (*iter).length();
910  }
911
912  OutStringType formatted;
913  formatted.reserve(format_string.length() + sub_length);
914
915  std::vector<ReplacementOffset> r_offsets;
916  for (typename FormatStringType::const_iterator i = format_string.begin();
917       i != format_string.end(); ++i) {
918    if ('$' == *i) {
919      if (i + 1 != format_string.end()) {
920        ++i;
921        DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
922        if ('$' == *i) {
923          while (i != format_string.end() && '$' == *i) {
924            formatted.push_back('$');
925            ++i;
926          }
927          --i;
928        } else {
929          uintptr_t index = *i - '1';
930          if (offsets) {
931            ReplacementOffset r_offset(index,
932                static_cast<int>(formatted.size()));
933            r_offsets.insert(std::lower_bound(r_offsets.begin(),
934                r_offsets.end(), r_offset,
935                &CompareParameter),
936                r_offset);
937          }
938          if (index < substitutions)
939            formatted.append(subst.at(index));
940        }
941      }
942    } else {
943      formatted.push_back(*i);
944    }
945  }
946  if (offsets) {
947    for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
948        i != r_offsets.end(); ++i) {
949      offsets->push_back(i->offset);
950    }
951  }
952  return formatted;
953}
954
955string16 ReplaceStringPlaceholders(const string16& format_string,
956                                   const std::vector<string16>& subst,
957                                   std::vector<size_t>* offsets) {
958  return DoReplaceStringPlaceholders(format_string, subst, offsets);
959}
960
961std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
962                                      const std::vector<std::string>& subst,
963                                      std::vector<size_t>* offsets) {
964  return DoReplaceStringPlaceholders(format_string, subst, offsets);
965}
966
967string16 ReplaceStringPlaceholders(const string16& format_string,
968                                   const string16& a,
969                                   size_t* offset) {
970  std::vector<size_t> offsets;
971  std::vector<string16> subst;
972  subst.push_back(a);
973  string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
974
975  DCHECK(offsets.size() == 1);
976  if (offset) {
977    *offset = offsets[0];
978  }
979  return result;
980}
981
982static bool IsWildcard(base_icu::UChar32 character) {
983  return character == '*' || character == '?';
984}
985
986// Move the strings pointers to the point where they start to differ.
987template <typename CHAR, typename NEXT>
988static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
989                         const CHAR** string, const CHAR* string_end,
990                         NEXT next) {
991  const CHAR* escape = NULL;
992  while (*pattern != pattern_end && *string != string_end) {
993    if (!escape && IsWildcard(**pattern)) {
994      // We don't want to match wildcard here, except if it's escaped.
995      return;
996    }
997
998    // Check if the escapement char is found. If so, skip it and move to the
999    // next character.
1000    if (!escape && **pattern == '\\') {
1001      escape = *pattern;
1002      next(pattern, pattern_end);
1003      continue;
1004    }
1005
1006    // Check if the chars match, if so, increment the ptrs.
1007    const CHAR* pattern_next = *pattern;
1008    const CHAR* string_next = *string;
1009    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
1010    if (pattern_char == next(&string_next, string_end) &&
1011        pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
1012      *pattern = pattern_next;
1013      *string = string_next;
1014    } else {
1015      // Uh ho, it did not match, we are done. If the last char was an
1016      // escapement, that means that it was an error to advance the ptr here,
1017      // let's put it back where it was. This also mean that the MatchPattern
1018      // function will return false because if we can't match an escape char
1019      // here, then no one will.
1020      if (escape) {
1021        *pattern = escape;
1022      }
1023      return;
1024    }
1025
1026    escape = NULL;
1027  }
1028}
1029
1030template <typename CHAR, typename NEXT>
1031static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
1032  while (*pattern != end) {
1033    if (!IsWildcard(**pattern))
1034      return;
1035    next(pattern, end);
1036  }
1037}
1038
1039template <typename CHAR, typename NEXT>
1040static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
1041                          const CHAR* pattern, const CHAR* pattern_end,
1042                          int depth,
1043                          NEXT next) {
1044  const int kMaxDepth = 16;
1045  if (depth > kMaxDepth)
1046    return false;
1047
1048  // Eat all the matching chars.
1049  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
1050
1051  // If the string is empty, then the pattern must be empty too, or contains
1052  // only wildcards.
1053  if (eval == eval_end) {
1054    EatWildcard(&pattern, pattern_end, next);
1055    return pattern == pattern_end;
1056  }
1057
1058  // Pattern is empty but not string, this is not a match.
1059  if (pattern == pattern_end)
1060    return false;
1061
1062  // If this is a question mark, then we need to compare the rest with
1063  // the current string or the string with one character eaten.
1064  const CHAR* next_pattern = pattern;
1065  next(&next_pattern, pattern_end);
1066  if (pattern[0] == '?') {
1067    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1068                      depth + 1, next))
1069      return true;
1070    const CHAR* next_eval = eval;
1071    next(&next_eval, eval_end);
1072    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
1073                      depth + 1, next))
1074      return true;
1075  }
1076
1077  // This is a *, try to match all the possible substrings with the remainder
1078  // of the pattern.
1079  if (pattern[0] == '*') {
1080    while (eval != eval_end) {
1081      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
1082                        depth + 1, next))
1083        return true;
1084      eval++;
1085    }
1086
1087    // We reached the end of the string, let see if the pattern contains only
1088    // wildcards.
1089    if (eval == eval_end) {
1090      EatWildcard(&pattern, pattern_end, next);
1091      if (pattern != pattern_end)
1092        return false;
1093      return true;
1094    }
1095  }
1096
1097  return false;
1098}
1099
1100struct NextCharUTF8 {
1101  base_icu::UChar32 operator()(const char** p, const char* end) {
1102    base_icu::UChar32 c;
1103    int offset = 0;
1104    CBU8_NEXT(*p, offset, end - *p, c);
1105    *p += offset;
1106    return c;
1107  }
1108};
1109
1110struct NextCharUTF16 {
1111  base_icu::UChar32 operator()(const char16** p, const char16* end) {
1112    base_icu::UChar32 c;
1113    int offset = 0;
1114    CBU16_NEXT(*p, offset, end - *p, c);
1115    *p += offset;
1116    return c;
1117  }
1118};
1119
1120bool MatchPattern(const base::StringPiece& eval,
1121                  const base::StringPiece& pattern) {
1122  return MatchPatternT(eval.data(), eval.data() + eval.size(),
1123                       pattern.data(), pattern.data() + pattern.size(),
1124                       0, NextCharUTF8());
1125}
1126
1127bool MatchPattern(const string16& eval, const string16& pattern) {
1128  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1129                       pattern.c_str(), pattern.c_str() + pattern.size(),
1130                       0, NextCharUTF16());
1131}
1132
1133// The following code is compatible with the OpenBSD lcpy interface.  See:
1134//   http://www.gratisoft.us/todd/papers/strlcpy.html
1135//   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1136
1137namespace {
1138
1139template <typename CHAR>
1140size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
1141  for (size_t i = 0; i < dst_size; ++i) {
1142    if ((dst[i] = src[i]) == 0)  // We hit and copied the terminating NULL.
1143      return i;
1144  }
1145
1146  // We were left off at dst_size.  We over copied 1 byte.  Null terminate.
1147  if (dst_size != 0)
1148    dst[dst_size - 1] = 0;
1149
1150  // Count the rest of the |src|, and return it's length in characters.
1151  while (src[dst_size]) ++dst_size;
1152  return dst_size;
1153}
1154
1155}  // namespace
1156
1157size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1158  return lcpyT<char>(dst, src, dst_size);
1159}
1160size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1161  return lcpyT<wchar_t>(dst, src, dst_size);
1162}
1163
1164bool ElideString(const std::wstring& input, int max_len, std::wstring* output) {
1165  DCHECK(max_len >= 0);
1166  if (static_cast<int>(input.length()) <= max_len) {
1167    output->assign(input);
1168    return false;
1169  }
1170
1171  switch (max_len) {
1172    case 0:
1173      output->clear();
1174      break;
1175    case 1:
1176      output->assign(input.substr(0, 1));
1177      break;
1178    case 2:
1179      output->assign(input.substr(0, 2));
1180      break;
1181    case 3:
1182      output->assign(input.substr(0, 1) + L"." +
1183                     input.substr(input.length() - 1));
1184      break;
1185    case 4:
1186      output->assign(input.substr(0, 1) + L".." +
1187                     input.substr(input.length() - 1));
1188      break;
1189    default: {
1190      int rstr_len = (max_len - 3) / 2;
1191      int lstr_len = rstr_len + ((max_len - 3) % 2);
1192      output->assign(input.substr(0, lstr_len) + L"..." +
1193                     input.substr(input.length() - rstr_len));
1194      break;
1195    }
1196  }
1197
1198  return true;
1199}
1200