string_util.cc revision 513209b27ff55e2841eac0e4120199c23acce758
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/string_util.h"
6
7#include "build/build_config.h"
8
9#include <ctype.h>
10#include <errno.h>
11#include <math.h>
12#include <stdarg.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <time.h>
17#include <wchar.h>
18#include <wctype.h>
19
20#include <algorithm>
21#include <vector>
22
23#include "base/basictypes.h"
24#include "base/logging.h"
25#include "base/singleton.h"
26#include "base/third_party/dmg_fp/dmg_fp.h"
27#include "base/utf_string_conversion_utils.h"
28#include "base/utf_string_conversions.h"
29#include "base/third_party/icu/icu_utf.h"
30
31namespace {
32
// Holder for the empty string instances handed out by EmptyString(),
// EmptyWString() and EmptyString16() below.
//
// Force the singleton used by Empty[W]String[16] to be a unique type. This
// prevents other code that might accidentally use Singleton<string> from
// getting our internal one.
struct EmptyStrings {
  EmptyStrings() {}
  const std::string s;    // Returned by EmptyString().
  const std::wstring ws;  // Returned by EmptyWString().
  const string16 s16;     // Returned by EmptyString16().
};
42
// Used by ReplaceStringPlaceholders to track the position in the string of
// replaced parameters.  Each instance pairs a placeholder's parameter index
// with the offset in the output string at which that placeholder was found.
struct ReplacementOffset {
  ReplacementOffset(uintptr_t parameter, size_t offset)
      : parameter(parameter),
        offset(offset) {}

  // Index of the parameter.
  uintptr_t parameter;

  // Starting position in the string.
  size_t offset;
};
56
// Strict-weak ordering on ReplacementOffset by parameter index only; the
// offset field is ignored.  Returns true when |elem1| sorts before |elem2|.
static bool CompareParameter(const ReplacementOffset& elem1,
                             const ReplacementOffset& elem2) {
  return elem1.parameter < elem2.parameter;
}
61
62}  // namespace
63
64namespace base {
65
// Scans a wprintf-style |format| string and reports whether every conversion
// specification in it is one this function considers portable.
//
// A specification is rejected when it uses 's' or 'c' without a preceding 'l'
// modifier, or when it uses any of 'S', 'C', 'F', 'D', 'O' or 'U'.  A format
// that ends in the middle of a specification is reported as portable: no
// unportable specifier was seen, and the string is equally broken everywhere.
bool IsWprintfFormatPortable(const wchar_t* format) {
  const wchar_t* cursor = format;
  while (*cursor != '\0') {
    if (*cursor != '%') {
      // Ordinary character; nothing to validate.
      ++cursor;
      continue;
    }

    // |cursor| sits on the '%' that opens a specification.  Walk forward
    // until a conversion character closes it.
    bool seen_l_modifier = false;
    for (;;) {
      ++cursor;
      const wchar_t ch = *cursor;
      if (ch == '\0') {
        // Truncated specification: treated as portable.
        return true;
      }

      if (ch == 'l') {
        // Only an 'l' modifier makes 's' and 'c' acceptable.
        seen_l_modifier = true;
      } else {
        const bool needs_l = (ch == 's' || ch == 'c');
        const bool always_unportable =
            ch == 'S' || ch == 'C' || ch == 'F' ||
            ch == 'D' || ch == 'O' || ch == 'U';
        if ((needs_l && !seen_l_modifier) || always_unportable)
          return false;
      }

      // A known conversion character terminates this specification.
      if (wcschr(L"diouxXeEfgGaAcspn%", ch))
        break;
    }

    // Step past the conversion character and resume the outer scan.
    ++cursor;
  }

  return true;
}
100
101}  // namespace base
102
103
// Returns a reference to the empty std::string held by the EmptyStrings
// singleton.
const std::string& EmptyString() {
  return Singleton<EmptyStrings>::get()->s;
}

// Returns a reference to the empty std::wstring held by the EmptyStrings
// singleton.
const std::wstring& EmptyWString() {
  return Singleton<EmptyStrings>::get()->ws;
}

// Returns a reference to the empty string16 held by the EmptyStrings
// singleton.
const string16& EmptyString16() {
  return Singleton<EmptyStrings>::get()->s16;
}
115
// Comma-separated list of the code points this file treats as whitespace,
// terminated by 0 so the list can initialize a NUL-terminated character
// array.  Expanded below into kWhitespaceWide and kWhitespaceUTF16.
// NOTE(review): U+200C (Zero Width Non-Joiner) and U+180E (Mongolian Vowel
// Separator) may not carry the Unicode White_Space property -- confirm that
// their presence here is intentional.
#define WHITESPACE_UNICODE \
  0x0009, /* <control-0009> to <control-000D> */ \
  0x000A,                                        \
  0x000B,                                        \
  0x000C,                                        \
  0x000D,                                        \
  0x0020, /* Space */                            \
  0x0085, /* <control-0085> */                   \
  0x00A0, /* No-Break Space */                   \
  0x1680, /* Ogham Space Mark */                 \
  0x180E, /* Mongolian Vowel Separator */        \
  0x2000, /* En Quad to Hair Space */            \
  0x2001,                                        \
  0x2002,                                        \
  0x2003,                                        \
  0x2004,                                        \
  0x2005,                                        \
  0x2006,                                        \
  0x2007,                                        \
  0x2008,                                        \
  0x2009,                                        \
  0x200A,                                        \
  0x200C, /* Zero Width Non-Joiner */            \
  0x2028, /* Line Separator */                   \
  0x2029, /* Paragraph Separator */              \
  0x202F, /* Narrow No-Break Space */            \
  0x205F, /* Medium Mathematical Space */        \
  0x3000, /* Ideographic Space */                \
  0
145
// NUL-terminated whitespace set as wchar_t; used by the std::wstring
// TrimWhitespace overload.
const wchar_t kWhitespaceWide[] = {
  WHITESPACE_UNICODE
};
// The same whitespace set as char16; used by the string16 TrimWhitespace
// overload.
const char16 kWhitespaceUTF16[] = {
  WHITESPACE_UNICODE
};
// NUL-terminated ASCII-only whitespace set; used by TrimWhitespaceASCII.
const char kWhitespaceASCII[] = {
  0x09,    // <control-0009> to <control-000D>
  0x0A,
  0x0B,
  0x0C,
  0x0D,
  0x20,    // Space
  0
};

// The three bytes EF BB BF: the UTF-8 encoding of the byte order mark U+FEFF.
const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
163
// Copies |input| into |*output| with every character that appears in the
// NUL-terminated set |remove_chars| deleted.  Returns true if at least one
// character was removed.  |output| may point at |input|'s own storage since
// the copy is made before any character is deleted.
template<typename STR>
bool RemoveCharsT(const STR& input,
                  const typename STR::value_type remove_chars[],
                  STR* output) {
  *output = input;

  bool any_removed = false;
  // After erasing at |pos| the next candidate has shifted into |pos|, so the
  // search resumes from the same index.
  for (typename STR::size_type pos = output->find_first_of(remove_chars);
       pos != STR::npos;
       pos = output->find_first_of(remove_chars, pos)) {
    output->erase(pos, 1);
    any_removed = true;
  }

  return any_removed;
}
182
// Removes every character found in |remove_chars| from |input|, storing the
// result in |output|.  Returns true if any character was removed.  All
// overloads forward to RemoveCharsT.
bool RemoveChars(const std::wstring& input,
                 const wchar_t remove_chars[],
                 std::wstring* output) {
  return RemoveCharsT(input, remove_chars, output);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined
// (presumably because string16 and std::wstring are otherwise the same type
// -- TODO confirm).
#if !defined(WCHAR_T_IS_UTF16)
bool RemoveChars(const string16& input,
                 const char16 remove_chars[],
                 string16* output) {
  return RemoveCharsT(input, remove_chars, output);
}
#endif

// Narrow-string overload.
bool RemoveChars(const std::string& input,
                 const char remove_chars[],
                 std::string* output) {
  return RemoveCharsT(input, remove_chars, output);
}
202
203template<typename STR>
204TrimPositions TrimStringT(const STR& input,
205                          const typename STR::value_type trim_chars[],
206                          TrimPositions positions,
207                          STR* output) {
208  // Find the edges of leading/trailing whitespace as desired.
209  const typename STR::size_type last_char = input.length() - 1;
210  const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
211      input.find_first_not_of(trim_chars) : 0;
212  const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
213      input.find_last_not_of(trim_chars) : last_char;
214
215  // When the string was all whitespace, report that we stripped off whitespace
216  // from whichever position the caller was interested in.  For empty input, we
217  // stripped no whitespace, but we still need to clear |output|.
218  if (input.empty() ||
219      (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
220    bool input_was_empty = input.empty();  // in case output == &input
221    output->clear();
222    return input_was_empty ? TRIM_NONE : positions;
223  }
224
225  // Trim the whitespace.
226  *output =
227      input.substr(first_good_char, last_good_char - first_good_char + 1);
228
229  // Return where we trimmed from.
230  return static_cast<TrimPositions>(
231      ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
232      ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
233}
234
// Trims characters in |trim_chars| from both ends of |input| (TRIM_ALL) and
// writes the result to |output|.  Returns true when TrimStringT reports
// anything other than TRIM_NONE.  All overloads forward to TrimStringT.
bool TrimString(const std::wstring& input,
                const wchar_t trim_chars[],
                std::wstring* output) {
  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool TrimString(const string16& input,
                const char16 trim_chars[],
                string16* output) {
  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
}
#endif

// Narrow-string overload.
bool TrimString(const std::string& input,
                const char trim_chars[],
                std::string* output) {
  return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
}
254
255void TruncateUTF8ToByteSize(const std::string& input,
256                            const size_t byte_size,
257                            std::string* output) {
258  DCHECK(output);
259  if (byte_size > input.length()) {
260    *output = input;
261    return;
262  }
263  DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
264  // Note: This cast is necessary because CBU8_NEXT uses int32s.
265  int32 truncation_length = static_cast<int32>(byte_size);
266  int32 char_index = truncation_length - 1;
267  const char* data = input.data();
268
269  // Using CBU8, we will move backwards from the truncation point
270  // to the beginning of the string looking for a valid UTF8
271  // character.  Once a full UTF8 character is found, we will
272  // truncate the string to the end of that character.
273  while (char_index >= 0) {
274    int32 prev = char_index;
275    uint32 code_point = 0;
276    CBU8_NEXT(data, char_index, truncation_length, code_point);
277    if (!base::IsValidCharacter(code_point) ||
278        !base::IsValidCodepoint(code_point)) {
279      char_index = prev - 1;
280    } else {
281      break;
282    }
283  }
284
285  if (char_index >= 0 )
286    *output = input.substr(0, char_index);
287  else
288    output->clear();
289}
290
// Trims the characters listed in kWhitespaceWide from the ends of |input|
// selected by |positions|, storing the result in |output|.  Returns the
// value produced by TrimStringT.
TrimPositions TrimWhitespace(const std::wstring& input,
                             TrimPositions positions,
                             std::wstring* output) {
  return TrimStringT(input, kWhitespaceWide, positions, output);
}

// string16 overload using kWhitespaceUTF16; only compiled when
// WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
TrimPositions TrimWhitespace(const string16& input,
                             TrimPositions positions,
                             string16* output) {
  return TrimStringT(input, kWhitespaceUTF16, positions, output);
}
#endif

// Narrow-string variant: trims only the characters in kWhitespaceASCII.
TrimPositions TrimWhitespaceASCII(const std::string& input,
                                  TrimPositions positions,
                                  std::string* output) {
  return TrimStringT(input, kWhitespaceASCII, positions, output);
}

// This function is only for backward-compatibility.
// To be removed when all callers are updated.
// Forwards to TrimWhitespaceASCII.
TrimPositions TrimWhitespace(const std::string& input,
                             TrimPositions positions,
                             std::string* output) {
  return TrimWhitespaceASCII(input, positions, output);
}
318
// Returns a copy of |text| in which leading and trailing whitespace is
// removed and every interior run of whitespace (as decided by IsWhitespace)
// is collapsed to a single space.  When |trim_sequences_with_line_breaks| is
// true, an interior run containing a CR or LF is removed entirely instead of
// being replaced by a space.
template<typename STR>
STR CollapseWhitespaceT(const STR& text,
                        bool trim_sequences_with_line_breaks) {
  // The result can never be longer than the input, so size it once up front
  // and shrink it at the end.
  STR result;
  result.resize(text.size());

  // Set flags to pretend we're already in a trimmed whitespace sequence, so we
  // will trim any leading whitespace.
  bool in_whitespace = true;
  bool already_trimmed = true;

  // Unsigned write cursor: an int would overflow for inputs longer than
  // INT_MAX.  Each decrement below undoes a space written earlier in the same
  // whitespace run, so the cursor never goes below zero.
  typename STR::size_type chars_written = 0;
  for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
    if (IsWhitespace(*i)) {
      if (!in_whitespace) {
        // Reduce all whitespace sequences to a single space.
        in_whitespace = true;
        result[chars_written++] = L' ';
      }
      if (trim_sequences_with_line_breaks && !already_trimmed &&
          ((*i == '\n') || (*i == '\r'))) {
        // Whitespace sequences containing CR or LF are eliminated entirely.
        already_trimmed = true;
        --chars_written;
      }
    } else {
      // Non-whitespace characters are copied straight across.
      in_whitespace = false;
      already_trimmed = false;
      result[chars_written++] = *i;
    }
  }

  if (in_whitespace && !already_trimmed) {
    // Any trailing whitespace is eliminated.
    --chars_written;
  }

  result.resize(chars_written);
  return result;
}
360
// Returns |text| with its whitespace collapsed by CollapseWhitespaceT; see
// that template for the meaning of |trim_sequences_with_line_breaks|.
std::wstring CollapseWhitespace(const std::wstring& text,
                                bool trim_sequences_with_line_breaks) {
  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
string16 CollapseWhitespace(const string16& text,
                            bool trim_sequences_with_line_breaks) {
  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
}
#endif

// Narrow-string variant; forwards to the same template.
std::string CollapseWhitespaceASCII(const std::string& text,
                                    bool trim_sequences_with_line_breaks) {
  return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
}
377
378bool ContainsOnlyWhitespaceASCII(const std::string& str) {
379  for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
380    if (!IsAsciiWhitespace(*i))
381      return false;
382  }
383  return true;
384}
385
386bool ContainsOnlyWhitespace(const string16& str) {
387  for (string16::const_iterator i(str.begin()); i != str.end(); ++i) {
388    if (!IsWhitespace(*i))
389      return false;
390  }
391  return true;
392}
393
// Returns true when every character of |input| also occurs somewhere in
// |characters|.  An empty |input| yields true.
template<typename STR>
static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
  // The first character outside the allowed set, if any, decides the answer.
  return input.find_first_not_of(characters) == STR::npos;
}
403
// Returns true if |input| consists solely of characters that appear in
// |characters|.  All overloads forward to ContainsOnlyCharsT.
bool ContainsOnlyChars(const std::wstring& input,
                       const std::wstring& characters) {
  return ContainsOnlyCharsT(input, characters);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool ContainsOnlyChars(const string16& input, const string16& characters) {
  return ContainsOnlyCharsT(input, characters);
}
#endif

// Narrow-string overload.
bool ContainsOnlyChars(const std::string& input,
                       const std::string& characters) {
  return ContainsOnlyCharsT(input, characters);
}
419
// Converts |wide| to a narrow string by copying each element into a char.
// The input is expected to be ASCII: the DCHECK reports it otherwise, and the
// copy itself does not validate.
std::string WideToASCII(const std::wstring& wide) {
  DCHECK(IsStringASCII(wide)) << wide;
  return std::string(wide.begin(), wide.end());
}

// Same as WideToASCII, for string16 input.
std::string UTF16ToASCII(const string16& utf16) {
  DCHECK(IsStringASCII(utf16)) << utf16;
  return std::string(utf16.begin(), utf16.end());
}
429
// Latin1 is just the low range of Unicode, so we can copy directly to convert.
//
// Converts |wide| to Latin-1, one byte per input character, and stores the
// result in |*latin1|.  Returns false, leaving |*latin1| empty, if any
// character lies outside the range 0..255; returns true otherwise.
bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
  std::string output;
  output.resize(wide.size());
  latin1->clear();
  for (size_t i = 0; i < wide.size(); i++) {
    // Compare as unsigned: where wchar_t is a signed type, a negative value
    // would otherwise pass a plain "> 255" test and be silently truncated.
    if (static_cast<unsigned long>(wide[i]) > 255UL)
      return false;
    output[i] = static_cast<char>(wide[i]);
  }
  latin1->swap(output);
  return true;
}
443
444template<class STR>
445static bool DoIsStringASCII(const STR& str) {
446  for (size_t i = 0; i < str.length(); i++) {
447    typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
448    if (c > 0x7F)
449      return false;
450  }
451  return true;
452}
453
// Returns true if every character of |str| is in the 7-bit ASCII range.  All
// overloads forward to DoIsStringASCII.
bool IsStringASCII(const std::wstring& str) {
  return DoIsStringASCII(str);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool IsStringASCII(const string16& str) {
  return DoIsStringASCII(str);
}
#endif

// Narrow overload taking a StringPiece.
bool IsStringASCII(const base::StringPiece& str) {
  return DoIsStringASCII(str);
}
467
468bool IsStringUTF8(const std::string& str) {
469  const char *src = str.data();
470  int32 src_len = static_cast<int32>(str.length());
471  int32 char_index = 0;
472
473  while (char_index < src_len) {
474    int32 code_point;
475    CBU8_NEXT(src, char_index, src_len, code_point);
476    if (!base::IsValidCharacter(code_point))
477       return false;
478  }
479  return true;
480}
481
482template<typename Iter>
483static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
484                                          Iter a_end,
485                                          const char* b) {
486  for (Iter it = a_begin; it != a_end; ++it, ++b) {
487    if (!*b || base::ToLowerASCII(*it) != *b)
488      return false;
489  }
490  return *b == 0;
491}
492
// Front-ends for LowerCaseEqualsASCII.
//
// Each overload returns true if its first argument (a string or an iterator /
// pointer range), lower-cased, equals the NUL-terminated string |b|.  All of
// them forward to DoLowerCaseEqualsASCII.
bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
}

bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool LowerCaseEqualsASCII(const string16& a, const char* b) {
  return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
}
#endif

// Iterator-range overloads.
bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
                          std::string::const_iterator a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}

bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
                          std::wstring::const_iterator a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}

#if !defined(WCHAR_T_IS_UTF16)
bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
                          string16::const_iterator a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
#endif

// Raw-pointer-range overloads; excluded when ANDROID is defined.
// NOTE(review): presumably the string iterators are plain pointers on that
// platform, which would make these duplicate definitions -- confirm.
#if !defined(ANDROID)
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
#endif // !ANDROID

#if !defined(ANDROID)
bool LowerCaseEqualsASCII(const wchar_t* a_begin,
                          const wchar_t* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
#endif // !ANDROID

#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID)
bool LowerCaseEqualsASCII(const char16* a_begin,
                          const char16* a_end,
                          const char* b) {
  return DoLowerCaseEqualsASCII(a_begin, a_end, b);
}
#endif
551
552bool EqualsASCII(const string16& a, const base::StringPiece& b) {
553  if (a.length() != b.length())
554    return false;
555  return std::equal(b.begin(), b.end(), a.begin());
556}
557
558bool StartsWithASCII(const std::string& str,
559                     const std::string& search,
560                     bool case_sensitive) {
561  if (case_sensitive)
562    return str.compare(0, search.length(), search) == 0;
563  else
564    return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
565}
566
567template <typename STR>
568bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
569  if (case_sensitive) {
570    return str.compare(0, search.length(), search) == 0;
571  } else {
572    if (search.size() > str.size())
573      return false;
574    return std::equal(search.begin(), search.end(), str.begin(),
575                      base::CaseInsensitiveCompare<typename STR::value_type>());
576  }
577}
578
// Returns true if |str| begins with |search|; |case_sensitive| selects exact
// or case-insensitive comparison.  Forwards to StartsWithT.
bool StartsWith(const std::wstring& str, const std::wstring& search,
                bool case_sensitive) {
  return StartsWithT(str, search, case_sensitive);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool StartsWith(const string16& str, const string16& search,
                bool case_sensitive) {
  return StartsWithT(str, search, case_sensitive);
}
#endif
589#endif
590
591template <typename STR>
592bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
593  typename STR::size_type str_length = str.length();
594  typename STR::size_type search_length = search.length();
595  if (search_length > str_length)
596    return false;
597  if (case_sensitive) {
598    return str.compare(str_length - search_length, search_length, search) == 0;
599  } else {
600    return std::equal(search.begin(), search.end(),
601                      str.begin() + (str_length - search_length),
602                      base::CaseInsensitiveCompare<typename STR::value_type>());
603  }
604}
605
// Returns true if |str| ends with |search|; |case_sensitive| selects exact or
// case-insensitive comparison.  All overloads forward to EndsWithT.
bool EndsWith(const std::string& str, const std::string& search,
              bool case_sensitive) {
  return EndsWithT(str, search, case_sensitive);
}

// Wide-string overload.
bool EndsWith(const std::wstring& str, const std::wstring& search,
              bool case_sensitive) {
  return EndsWithT(str, search, case_sensitive);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
bool EndsWith(const string16& str, const string16& search,
              bool case_sensitive) {
  return EndsWithT(str, search, case_sensitive);
}
#endif
621#endif
622
623DataUnits GetByteDisplayUnits(int64 bytes) {
624  // The byte thresholds at which we display amounts.  A byte count is displayed
625  // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1].
626  // This must match the DataUnits enum.
627  static const int64 kUnitThresholds[] = {
628    0,              // DATA_UNITS_BYTE,
629    3*1024,         // DATA_UNITS_KIBIBYTE,
630    2*1024*1024,    // DATA_UNITS_MEBIBYTE,
631    1024*1024*1024  // DATA_UNITS_GIBIBYTE,
632  };
633
634  if (bytes < 0) {
635    NOTREACHED() << "Negative bytes value";
636    return DATA_UNITS_BYTE;
637  }
638
639  int unit_index = arraysize(kUnitThresholds);
640  while (--unit_index > 0) {
641    if (bytes >= kUnitThresholds[unit_index])
642      break;
643  }
644
645  DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE);
646  return DataUnits(unit_index);
647}
648
// TODO(mpcomplete): deal with locale
// Byte suffixes.  This must match the DataUnits enum.
// Indexed by a DataUnits value in FormatBytesInternal (via FormatBytes).
static const char* const kByteStrings[] = {
  "B",
  "kB",
  "MB",
  "GB"
};

// Transfer-rate suffixes, indexed the same way as kByteStrings and used by
// FormatSpeed.
static const char* const kSpeedStrings[] = {
  "B/s",
  "kB/s",
  "MB/s",
  "GB/s"
};
664
665string16 FormatBytesInternal(int64 bytes,
666                             DataUnits units,
667                             bool show_units,
668                             const char* const* suffix) {
669  if (bytes < 0) {
670    NOTREACHED() << "Negative bytes value";
671    return string16();
672  }
673
674  DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE);
675
676  // Put the quantity in the right units.
677  double unit_amount = static_cast<double>(bytes);
678  for (int i = 0; i < units; ++i)
679    unit_amount /= 1024.0;
680
681  char buf[64];
682  if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100)
683    base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount);
684  else
685    base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount);
686
687  std::string ret(buf);
688  if (show_units) {
689    ret += " ";
690    ret += suffix[units];
691  }
692
693  return ASCIIToUTF16(ret);
694}
695
// Formats |bytes| in |units| using the kByteStrings suffixes ("B", "kB", ...).
// See FormatBytesInternal for the formatting rules.
string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) {
  return FormatBytesInternal(bytes, units, show_units, kByteStrings);
}

// Same as FormatBytes, but with the kSpeedStrings suffixes ("B/s", "kB/s", ...).
string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) {
  return FormatBytesInternal(bytes, units, show_units, kSpeedStrings);
}
703
// Replaces occurrences of |find_this| in |*str| with |replace_with|,
// searching from |start_offset| onwards.  Only the first occurrence is
// replaced unless |replace_all| is true.  Does nothing when |start_offset| is
// npos or not inside the string, or when |find_this| is empty.
template<class StringType>
void DoReplaceSubstringsAfterOffset(StringType* str,
                                    typename StringType::size_type start_offset,
                                    const StringType& find_this,
                                    const StringType& replace_with,
                                    bool replace_all) {
  if ((start_offset == StringType::npos) || (start_offset >= str->length()))
    return;

  // An empty needle matches at every position, so the loop below would never
  // terminate with |replace_all| set.  Flag it as a caller bug, and bail out
  // in case the DCHECK does not stop execution.
  DCHECK(!find_this.empty());
  if (find_this.empty())
    return;

  for (typename StringType::size_type offs(str->find(find_this, start_offset));
      offs != StringType::npos; offs = str->find(find_this, offs)) {
    str->replace(offs, find_this.length(), replace_with);
    // Resume after the inserted text so replacements are never rescanned.
    offs += replace_with.length();

    if (!replace_all)
      break;
  }
}
723
// Replaces the first occurrence of |find_this| at or after |start_offset| in
// |*str| with |replace_with|.  Forwards to DoReplaceSubstringsAfterOffset.
void ReplaceFirstSubstringAfterOffset(string16* str,
                                      string16::size_type start_offset,
                                      const string16& find_this,
                                      const string16& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 false);  // replace first instance
}

// Narrow-string overload of the above.
void ReplaceFirstSubstringAfterOffset(std::string* str,
                                      std::string::size_type start_offset,
                                      const std::string& find_this,
                                      const std::string& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 false);  // replace first instance
}

// Replaces every occurrence of |find_this| at or after |start_offset| in
// |*str| with |replace_with|.  Forwards to DoReplaceSubstringsAfterOffset.
void ReplaceSubstringsAfterOffset(string16* str,
                                  string16::size_type start_offset,
                                  const string16& find_this,
                                  const string16& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 true);  // replace all instances
}

// Narrow-string overload of the above.
void ReplaceSubstringsAfterOffset(std::string* str,
                                  std::string::size_type start_offset,
                                  const std::string& find_this,
                                  const std::string& replace_with) {
  DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
                                 true);  // replace all instances
}
755
756
// Splits |str| into the maximal runs of characters that are not in
// |delimiters| and stores them, in order, in |*tokens| (which is cleared
// first).  Consecutive delimiters never produce empty tokens.  Returns the
// number of tokens found.
template<typename STR>
static size_t TokenizeT(const STR& str,
                        const STR& delimiters,
                        std::vector<STR>* tokens) {
  typedef typename STR::size_type Index;

  tokens->clear();

  for (Index begin = str.find_first_not_of(delimiters); begin != STR::npos; ) {
    const Index stop = str.find_first_of(delimiters, begin + 1);
    if (stop == STR::npos) {
      // The last token runs to the end of the string.
      tokens->push_back(str.substr(begin));
      break;
    }
    tokens->push_back(str.substr(begin, stop - begin));
    // Skip the delimiter run that follows this token.
    begin = str.find_first_not_of(delimiters, stop + 1);
  }

  return tokens->size();
}
777
// Splits |str| on any character in |delimiters|, storing the non-empty pieces
// in |tokens| and returning how many were found.  All overloads forward to
// TokenizeT.
size_t Tokenize(const std::wstring& str,
                const std::wstring& delimiters,
                std::vector<std::wstring>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
size_t Tokenize(const string16& str,
                const string16& delimiters,
                std::vector<string16>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}
#endif

// Narrow-string overload.
size_t Tokenize(const std::string& str,
                const std::string& delimiters,
                std::vector<std::string>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}

// StringPiece overload; the tokens are StringPieces produced by
// StringPiece::substr.
size_t Tokenize(const base::StringPiece& str,
                const base::StringPiece& delimiters,
                std::vector<base::StringPiece>* tokens) {
  return TokenizeT(str, delimiters, tokens);
}
803
// Concatenates the elements of |parts|, inserting the single character |sep|
// between consecutive elements.  An empty vector yields an empty string.
template<typename STR>
static STR JoinStringT(const std::vector<STR>& parts,
                       typename STR::value_type sep) {
  STR joined;
  for (size_t index = 0; index < parts.size(); ++index) {
    // The separator goes before every element except the first.
    if (index != 0)
      joined += sep;
    joined += parts[index];
  }
  return joined;
}
820
// Joins |parts| into one string with |sep| between consecutive elements.
// Forwards to JoinStringT.
std::string JoinString(const std::vector<std::string>& parts, char sep) {
  return JoinStringT(parts, sep);
}

// string16 overload of the above.
string16 JoinString(const std::vector<string16>& parts, char16 sep) {
  return JoinStringT(parts, sep);
}
828
// Appends to |*result| each maximal run of non-whitespace characters in
// |str|, in order.  Whitespace follows the HTML 5 definition: space, tab, LF,
// line tab, FF, or CR.  |result| is not cleared first, and an empty |str|
// leaves it untouched.
template<typename STR>
void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) {
  const size_t length = str.length();
  if (length == 0)
    return;

  bool in_whitespace = false;
  size_t token_start = 0;
  for (size_t pos = 0; pos < length; ++pos) {
    const typename STR::value_type ch = str[pos];
    const bool is_space =
        ch == L' ' || ch == L'\t' || ch == L'\xA' ||
        ch == L'\xB' || ch == L'\xC' || ch == L'\xD';

    // Only transitions between whitespace and non-whitespace matter.
    if (is_space == in_whitespace)
      continue;

    if (is_space) {
      // A token just ended, unless the string opens with whitespace, in which
      // case there is nothing before |pos| to emit.
      if (pos > 0)
        result->push_back(str.substr(token_start, pos - token_start));
    } else {
      // A new token begins here.
      token_start = pos;
    }
    in_whitespace = is_space;
  }

  // Emit the final token when the string does not end in whitespace.
  if (!in_whitespace)
    result->push_back(str.substr(token_start, length - token_start));
}
868
// Splits |str| on runs of whitespace and appends the pieces to |result|.  All
// overloads forward to SplitStringAlongWhitespaceT.
void SplitStringAlongWhitespace(const std::wstring& str,
                                std::vector<std::wstring>* result) {
  SplitStringAlongWhitespaceT(str, result);
}

// string16 overload; only compiled when WCHAR_T_IS_UTF16 is not defined.
#if !defined(WCHAR_T_IS_UTF16)
void SplitStringAlongWhitespace(const string16& str,
                                std::vector<string16>* result) {
  SplitStringAlongWhitespaceT(str, result);
}
#endif

// Narrow-string overload.
void SplitStringAlongWhitespace(const std::string& str,
                                std::vector<std::string>* result) {
  SplitStringAlongWhitespaceT(str, result);
}
885
886template<class FormatStringType, class OutStringType>
887OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
888    const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
889  size_t substitutions = subst.size();
890  DCHECK(substitutions < 10);
891
892  size_t sub_length = 0;
893  for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
894       iter != subst.end(); ++iter) {
895    sub_length += iter->length();
896  }
897
898  OutStringType formatted;
899  formatted.reserve(format_string.length() + sub_length);
900
901  std::vector<ReplacementOffset> r_offsets;
902  for (typename FormatStringType::const_iterator i = format_string.begin();
903       i != format_string.end(); ++i) {
904    if ('$' == *i) {
905      if (i + 1 != format_string.end()) {
906        ++i;
907        DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
908        if ('$' == *i) {
909          while (i != format_string.end() && '$' == *i) {
910            formatted.push_back('$');
911            ++i;
912          }
913          --i;
914        } else {
915          uintptr_t index = *i - '1';
916          if (offsets) {
917            ReplacementOffset r_offset(index,
918                static_cast<int>(formatted.size()));
919            r_offsets.insert(std::lower_bound(r_offsets.begin(),
920                                              r_offsets.end(),
921                                              r_offset,
922                                              &CompareParameter),
923                             r_offset);
924          }
925          if (index < substitutions)
926            formatted.append(subst.at(index));
927        }
928      }
929    } else {
930      formatted.push_back(*i);
931    }
932  }
933  if (offsets) {
934    for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
935         i != r_offsets.end(); ++i) {
936      offsets->push_back(i->offset);
937    }
938  }
939  return formatted;
940}
941
// Expands the "$N" placeholders in |format_string| with the entries of
// |subst|.  If |offsets| is non-NULL it receives the output position of each
// placeholder.  Forwards to DoReplaceStringPlaceholders.
string16 ReplaceStringPlaceholders(const string16& format_string,
                                   const std::vector<string16>& subst,
                                   std::vector<size_t>* offsets) {
  return DoReplaceStringPlaceholders(format_string, subst, offsets);
}

// Narrow variant: the format is a StringPiece and the result a std::string.
std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
                                      const std::vector<std::string>& subst,
                                      std::vector<size_t>* offsets) {
  return DoReplaceStringPlaceholders(format_string, subst, offsets);
}
953
954string16 ReplaceStringPlaceholders(const string16& format_string,
955                                   const string16& a,
956                                   size_t* offset) {
957  std::vector<size_t> offsets;
958  std::vector<string16> subst;
959  subst.push_back(a);
960  string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
961
962  DCHECK(offsets.size() == 1);
963  if (offset) {
964    *offset = offsets[0];
965  }
966  return result;
967}
968
969static bool IsWildcard(base_icu::UChar32 character) {
970  return character == '*' || character == '?';
971}
972
973// Move the strings pointers to the point where they start to differ.
974template <typename CHAR, typename NEXT>
975static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
976                         const CHAR** string, const CHAR* string_end,
977                         NEXT next) {
978  const CHAR* escape = NULL;
979  while (*pattern != pattern_end && *string != string_end) {
980    if (!escape && IsWildcard(**pattern)) {
981      // We don't want to match wildcard here, except if it's escaped.
982      return;
983    }
984
985    // Check if the escapement char is found. If so, skip it and move to the
986    // next character.
987    if (!escape && **pattern == '\\') {
988      escape = *pattern;
989      next(pattern, pattern_end);
990      continue;
991    }
992
993    // Check if the chars match, if so, increment the ptrs.
994    const CHAR* pattern_next = *pattern;
995    const CHAR* string_next = *string;
996    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
997    if (pattern_char == next(&string_next, string_end) &&
998        pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
999      *pattern = pattern_next;
1000      *string = string_next;
1001    } else {
1002      // Uh ho, it did not match, we are done. If the last char was an
1003      // escapement, that means that it was an error to advance the ptr here,
1004      // let's put it back where it was. This also mean that the MatchPattern
1005      // function will return false because if we can't match an escape char
1006      // here, then no one will.
1007      if (escape) {
1008        *pattern = escape;
1009      }
1010      return;
1011    }
1012
1013    escape = NULL;
1014  }
1015}
1016
// Advances |*pattern| past a run of consecutive wildcard characters ('*' or
// '?'), stopping at the first non-wildcard character or at |end|. |next| is
// used to step over each wildcard.
template <typename CHAR, typename NEXT>
static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
  while (*pattern != end && IsWildcard(**pattern))
    next(pattern, end);
}
1025
// Recursive wildcard matcher shared by the UTF-8 and UTF-16 MatchPattern()
// overloads.
//
// Returns true if the string [eval, eval_end) matches the pattern
// [pattern, pattern_end). In the pattern:
//   - '*' matches any run of characters, including none,
//   - '?' matches zero or one character,
//   - a backslash escapes the following character (see EatSameChars()).
//
// |depth| is the current recursion depth; once it exceeds kMaxDepth the
// function gives up and reports no match, which bounds the work done for
// patterns with many wildcards. |next| reads one character from a pointer and
// advances it.
template <typename CHAR, typename NEXT>
static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
                          const CHAR* pattern, const CHAR* pattern_end,
                          int depth,
                          NEXT next) {
  const int kMaxDepth = 16;
  if (depth > kMaxDepth)
    return false;

  // Eat all the matching chars.
  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);

  // If the string is empty, then the pattern must be empty too, or contain
  // only wildcards.
  if (eval == eval_end) {
    EatWildcard(&pattern, pattern_end, next);
    return pattern == pattern_end;
  }

  // Pattern is empty but not string, this is not a match.
  if (pattern == pattern_end)
    return false;

  const CHAR* next_pattern = pattern;
  next(&next_pattern, pattern_end);

  // If this is a question mark, then we need to compare the rest of the
  // pattern with the current string or with the string minus one character.
  if (pattern[0] == '?') {
    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                      depth + 1, next))
      return true;
    const CHAR* next_eval = eval;
    next(&next_eval, eval_end);
    return MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
                         depth + 1, next);
  }

  // This is a *, try to match all the possible substrings with the remainder
  // of the pattern.
  if (pattern[0] == '*') {
    // Collapse duplicate wild cards (********** into *) so that the
    // method does not recurse unnecessarily. http://crbug.com/52839
    EatWildcard(&next_pattern, pattern_end, next);

    // Nothing but wildcards is left in the pattern, so whatever remains of
    // the string matches. Returning here avoids one doomed recursive call per
    // remaining character.
    if (next_pattern == pattern_end)
      return true;

    while (eval != eval_end) {
      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                        depth + 1, next))
        return true;
      // Advance by one whole character rather than one code unit, so that the
      // rest of the pattern is never retried from the middle of a multi-unit
      // character (e.g. between the halves of a UTF-16 surrogate pair).
      next(&eval, eval_end);
    }

    // The string is exhausted but the pattern still has a non-wildcard
    // character left to match.
    return false;
  }

  // The pattern is at a literal character that differs from the string.
  return false;
}
1090
1091struct NextCharUTF8 {
1092  base_icu::UChar32 operator()(const char** p, const char* end) {
1093    base_icu::UChar32 c;
1094    int offset = 0;
1095    CBU8_NEXT(*p, offset, end - *p, c);
1096    *p += offset;
1097    return c;
1098  }
1099};
1100
1101struct NextCharUTF16 {
1102  base_icu::UChar32 operator()(const char16** p, const char16* end) {
1103    base_icu::UChar32 c;
1104    int offset = 0;
1105    CBU16_NEXT(*p, offset, end - *p, c);
1106    *p += offset;
1107    return c;
1108  }
1109};
1110
1111bool MatchPattern(const base::StringPiece& eval,
1112                  const base::StringPiece& pattern) {
1113  return MatchPatternT(eval.data(), eval.data() + eval.size(),
1114                       pattern.data(), pattern.data() + pattern.size(),
1115                       0, NextCharUTF8());
1116}
1117
1118bool MatchPattern(const string16& eval, const string16& pattern) {
1119  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1120                       pattern.c_str(), pattern.c_str() + pattern.size(),
1121                       0, NextCharUTF16());
1122}
1123
1124// The following code is compatible with the OpenBSD lcpy interface.  See:
1125//   http://www.gratisoft.us/todd/papers/strlcpy.html
1126//   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1127
namespace {

// Shared implementation behind base::strlcpy() and base::wcslcpy().
//
// Copies |src| into |dst|, writing at most |dst_size| characters including the
// terminator. Whenever |dst_size| is non-zero the result is NULL terminated,
// truncating |src| if it does not fit. Returns the length of |src| in
// characters (not counting the terminator), so a return value >= |dst_size|
// tells the caller that truncation happened.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  size_t copied = 0;
  while (copied < dst_size) {
    const CHAR ch = src[copied];
    dst[copied] = ch;
    if (ch == 0)  // The terminator was copied as well, so we are done.
      return copied;
    ++copied;
  }

  // |dst| filled up before |src| ended. The last slot currently holds a
  // character of |src|; overwrite it with the terminator.
  if (dst_size > 0)
    dst[dst_size - 1] = 0;

  // Walk the remainder of |src| so that its full length can be returned.
  size_t src_length = dst_size;
  while (src[src_length] != 0)
    ++src_length;
  return src_length;
}

}  // namespace
1147
1148size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1149  return lcpyT<char>(dst, src, dst_size);
1150}
1151size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1152  return lcpyT<wchar_t>(dst, src, dst_size);
1153}
1154
1155bool ElideString(const std::wstring& input, int max_len, std::wstring* output) {
1156  DCHECK(max_len >= 0);
1157  if (static_cast<int>(input.length()) <= max_len) {
1158    output->assign(input);
1159    return false;
1160  }
1161
1162  switch (max_len) {
1163    case 0:
1164      output->clear();
1165      break;
1166    case 1:
1167      output->assign(input.substr(0, 1));
1168      break;
1169    case 2:
1170      output->assign(input.substr(0, 2));
1171      break;
1172    case 3:
1173      output->assign(input.substr(0, 1) + L"." +
1174                     input.substr(input.length() - 1));
1175      break;
1176    case 4:
1177      output->assign(input.substr(0, 1) + L".." +
1178                     input.substr(input.length() - 1));
1179      break;
1180    default: {
1181      int rstr_len = (max_len - 3) / 2;
1182      int lstr_len = rstr_len + ((max_len - 3) % 2);
1183      output->assign(input.substr(0, lstr_len) + L"..." +
1184                     input.substr(input.length() - rstr_len));
1185      break;
1186    }
1187  }
1188
1189  return true;
1190}
1191