string_util.cc revision 513209b27ff55e2841eac0e4120199c23acce758
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/string_util.h" 6 7#include "build/build_config.h" 8 9#include <ctype.h> 10#include <errno.h> 11#include <math.h> 12#include <stdarg.h> 13#include <stdio.h> 14#include <stdlib.h> 15#include <string.h> 16#include <time.h> 17#include <wchar.h> 18#include <wctype.h> 19 20#include <algorithm> 21#include <vector> 22 23#include "base/basictypes.h" 24#include "base/logging.h" 25#include "base/singleton.h" 26#include "base/third_party/dmg_fp/dmg_fp.h" 27#include "base/utf_string_conversion_utils.h" 28#include "base/utf_string_conversions.h" 29#include "base/third_party/icu/icu_utf.h" 30 31namespace { 32 33// Force the singleton used by Empty[W]String[16] to be a unique type. This 34// prevents other code that might accidentally use Singleton<string> from 35// getting our internal one. 36struct EmptyStrings { 37 EmptyStrings() {} 38 const std::string s; 39 const std::wstring ws; 40 const string16 s16; 41}; 42 43// Used by ReplaceStringPlaceholders to track the position in the string of 44// replaced parameters. 45struct ReplacementOffset { 46 ReplacementOffset(uintptr_t parameter, size_t offset) 47 : parameter(parameter), 48 offset(offset) {} 49 50 // Index of the parameter. 51 uintptr_t parameter; 52 53 // Starting position in the string. 54 size_t offset; 55}; 56 57static bool CompareParameter(const ReplacementOffset& elem1, 58 const ReplacementOffset& elem2) { 59 return elem1.parameter < elem2.parameter; 60} 61 62} // namespace 63 64namespace base { 65 66bool IsWprintfFormatPortable(const wchar_t* format) { 67 for (const wchar_t* position = format; *position != '\0'; ++position) { 68 if (*position == '%') { 69 bool in_specification = true; 70 bool modifier_l = false; 71 while (in_specification) { 72 // Eat up characters until reaching a known specifier. 73 if (*++position == '\0') { 74 // The format string ended in the middle of a specification. Call 75 // it portable because no unportable specifications were found. The 76 // string is equally broken on all platforms. 77 return true; 78 } 79 80 if (*position == 'l') { 81 // 'l' is the only thing that can save the 's' and 'c' specifiers. 82 modifier_l = true; 83 } else if (((*position == 's' || *position == 'c') && !modifier_l) || 84 *position == 'S' || *position == 'C' || *position == 'F' || 85 *position == 'D' || *position == 'O' || *position == 'U') { 86 // Not portable. 87 return false; 88 } 89 90 if (wcschr(L"diouxXeEfgGaAcspn%", *position)) { 91 // Portable, keep scanning the rest of the format string. 92 in_specification = false; 93 } 94 } 95 } 96 } 97 98 return true; 99} 100 101} // namespace base 102 103 104const std::string& EmptyString() { 105 return Singleton<EmptyStrings>::get()->s; 106} 107 108const std::wstring& EmptyWString() { 109 return Singleton<EmptyStrings>::get()->ws; 110} 111 112const string16& EmptyString16() { 113 return Singleton<EmptyStrings>::get()->s16; 114} 115 116#define WHITESPACE_UNICODE \ 117 0x0009, /* <control-0009> to <control-000D> */ \ 118 0x000A, \ 119 0x000B, \ 120 0x000C, \ 121 0x000D, \ 122 0x0020, /* Space */ \ 123 0x0085, /* <control-0085> */ \ 124 0x00A0, /* No-Break Space */ \ 125 0x1680, /* Ogham Space Mark */ \ 126 0x180E, /* Mongolian Vowel Separator */ \ 127 0x2000, /* En Quad to Hair Space */ \ 128 0x2001, \ 129 0x2002, \ 130 0x2003, \ 131 0x2004, \ 132 0x2005, \ 133 0x2006, \ 134 0x2007, \ 135 0x2008, \ 136 0x2009, \ 137 0x200A, \ 138 0x200C, /* Zero Width Non-Joiner */ \ 139 0x2028, /* Line Separator */ \ 140 0x2029, /* Paragraph Separator */ \ 141 0x202F, /* Narrow No-Break Space */ \ 142 0x205F, /* Medium Mathematical Space */ \ 143 0x3000, /* Ideographic Space */ \ 144 0 145 146const wchar_t kWhitespaceWide[] = { 147 WHITESPACE_UNICODE 148}; 149const char16 kWhitespaceUTF16[] = { 150 WHITESPACE_UNICODE 151}; 152const char kWhitespaceASCII[] = { 153 0x09, // <control-0009> to <control-000D> 154 0x0A, 155 0x0B, 156 0x0C, 157 0x0D, 158 0x20, // Space 159 0 160}; 161 162const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF"; 163 164template<typename STR> 165bool RemoveCharsT(const STR& input, 166 const typename STR::value_type remove_chars[], 167 STR* output) { 168 bool removed = false; 169 size_t found; 170 171 *output = input; 172 173 found = output->find_first_of(remove_chars); 174 while (found != STR::npos) { 175 removed = true; 176 output->replace(found, 1, STR()); 177 found = output->find_first_of(remove_chars, found); 178 } 179 180 return removed; 181} 182 183bool RemoveChars(const std::wstring& input, 184 const wchar_t remove_chars[], 185 std::wstring* output) { 186 return RemoveCharsT(input, remove_chars, output); 187} 188 189#if !defined(WCHAR_T_IS_UTF16) 190bool RemoveChars(const string16& input, 191 const char16 remove_chars[], 192 string16* output) { 193 return RemoveCharsT(input, remove_chars, output); 194} 195#endif 196 197bool RemoveChars(const std::string& input, 198 const char remove_chars[], 199 std::string* output) { 200 return RemoveCharsT(input, remove_chars, output); 201} 202 203template<typename STR> 204TrimPositions TrimStringT(const STR& input, 205 const typename STR::value_type trim_chars[], 206 TrimPositions positions, 207 STR* output) { 208 // Find the edges of leading/trailing whitespace as desired. 209 const typename STR::size_type last_char = input.length() - 1; 210 const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ? 211 input.find_first_not_of(trim_chars) : 0; 212 const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ? 213 input.find_last_not_of(trim_chars) : last_char; 214 215 // When the string was all whitespace, report that we stripped off whitespace 216 // from whichever position the caller was interested in. For empty input, we 217 // stripped no whitespace, but we still need to clear |output|. 218 if (input.empty() || 219 (first_good_char == STR::npos) || (last_good_char == STR::npos)) { 220 bool input_was_empty = input.empty(); // in case output == &input 221 output->clear(); 222 return input_was_empty ? TRIM_NONE : positions; 223 } 224 225 // Trim the whitespace. 226 *output = 227 input.substr(first_good_char, last_good_char - first_good_char + 1); 228 229 // Return where we trimmed from. 230 return static_cast<TrimPositions>( 231 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | 232 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); 233} 234 235bool TrimString(const std::wstring& input, 236 const wchar_t trim_chars[], 237 std::wstring* output) { 238 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 239} 240 241#if !defined(WCHAR_T_IS_UTF16) 242bool TrimString(const string16& input, 243 const char16 trim_chars[], 244 string16* output) { 245 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 246} 247#endif 248 249bool TrimString(const std::string& input, 250 const char trim_chars[], 251 std::string* output) { 252 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 253} 254 255void TruncateUTF8ToByteSize(const std::string& input, 256 const size_t byte_size, 257 std::string* output) { 258 DCHECK(output); 259 if (byte_size > input.length()) { 260 *output = input; 261 return; 262 } 263 DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); 264 // Note: This cast is necessary because CBU8_NEXT uses int32s. 265 int32 truncation_length = static_cast<int32>(byte_size); 266 int32 char_index = truncation_length - 1; 267 const char* data = input.data(); 268 269 // Using CBU8, we will move backwards from the truncation point 270 // to the beginning of the string looking for a valid UTF8 271 // character. Once a full UTF8 character is found, we will 272 // truncate the string to the end of that character. 273 while (char_index >= 0) { 274 int32 prev = char_index; 275 uint32 code_point = 0; 276 CBU8_NEXT(data, char_index, truncation_length, code_point); 277 if (!base::IsValidCharacter(code_point) || 278 !base::IsValidCodepoint(code_point)) { 279 char_index = prev - 1; 280 } else { 281 break; 282 } 283 } 284 285 if (char_index >= 0 ) 286 *output = input.substr(0, char_index); 287 else 288 output->clear(); 289} 290 291TrimPositions TrimWhitespace(const std::wstring& input, 292 TrimPositions positions, 293 std::wstring* output) { 294 return TrimStringT(input, kWhitespaceWide, positions, output); 295} 296 297#if !defined(WCHAR_T_IS_UTF16) 298TrimPositions TrimWhitespace(const string16& input, 299 TrimPositions positions, 300 string16* output) { 301 return TrimStringT(input, kWhitespaceUTF16, positions, output); 302} 303#endif 304 305TrimPositions TrimWhitespaceASCII(const std::string& input, 306 TrimPositions positions, 307 std::string* output) { 308 return TrimStringT(input, kWhitespaceASCII, positions, output); 309} 310 311// This function is only for backward-compatibility. 312// To be removed when all callers are updated. 313TrimPositions TrimWhitespace(const std::string& input, 314 TrimPositions positions, 315 std::string* output) { 316 return TrimWhitespaceASCII(input, positions, output); 317} 318 319template<typename STR> 320STR CollapseWhitespaceT(const STR& text, 321 bool trim_sequences_with_line_breaks) { 322 STR result; 323 result.resize(text.size()); 324 325 // Set flags to pretend we're already in a trimmed whitespace sequence, so we 326 // will trim any leading whitespace. 327 bool in_whitespace = true; 328 bool already_trimmed = true; 329 330 int chars_written = 0; 331 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { 332 if (IsWhitespace(*i)) { 333 if (!in_whitespace) { 334 // Reduce all whitespace sequences to a single space. 335 in_whitespace = true; 336 result[chars_written++] = L' '; 337 } 338 if (trim_sequences_with_line_breaks && !already_trimmed && 339 ((*i == '\n') || (*i == '\r'))) { 340 // Whitespace sequences containing CR or LF are eliminated entirely. 341 already_trimmed = true; 342 --chars_written; 343 } 344 } else { 345 // Non-whitespace chracters are copied straight across. 346 in_whitespace = false; 347 already_trimmed = false; 348 result[chars_written++] = *i; 349 } 350 } 351 352 if (in_whitespace && !already_trimmed) { 353 // Any trailing whitespace is eliminated. 354 --chars_written; 355 } 356 357 result.resize(chars_written); 358 return result; 359} 360 361std::wstring CollapseWhitespace(const std::wstring& text, 362 bool trim_sequences_with_line_breaks) { 363 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 364} 365 366#if !defined(WCHAR_T_IS_UTF16) 367string16 CollapseWhitespace(const string16& text, 368 bool trim_sequences_with_line_breaks) { 369 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 370} 371#endif 372 373std::string CollapseWhitespaceASCII(const std::string& text, 374 bool trim_sequences_with_line_breaks) { 375 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 376} 377 378bool ContainsOnlyWhitespaceASCII(const std::string& str) { 379 for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) { 380 if (!IsAsciiWhitespace(*i)) 381 return false; 382 } 383 return true; 384} 385 386bool ContainsOnlyWhitespace(const string16& str) { 387 for (string16::const_iterator i(str.begin()); i != str.end(); ++i) { 388 if (!IsWhitespace(*i)) 389 return false; 390 } 391 return true; 392} 393 394template<typename STR> 395static bool ContainsOnlyCharsT(const STR& input, const STR& characters) { 396 for (typename STR::const_iterator iter = input.begin(); 397 iter != input.end(); ++iter) { 398 if (characters.find(*iter) == STR::npos) 399 return false; 400 } 401 return true; 402} 403 404bool ContainsOnlyChars(const std::wstring& input, 405 const std::wstring& characters) { 406 return ContainsOnlyCharsT(input, characters); 407} 408 409#if !defined(WCHAR_T_IS_UTF16) 410bool ContainsOnlyChars(const string16& input, const string16& characters) { 411 return ContainsOnlyCharsT(input, characters); 412} 413#endif 414 415bool ContainsOnlyChars(const std::string& input, 416 const std::string& characters) { 417 return ContainsOnlyCharsT(input, characters); 418} 419 420std::string WideToASCII(const std::wstring& wide) { 421 DCHECK(IsStringASCII(wide)) << wide; 422 return std::string(wide.begin(), wide.end()); 423} 424 425std::string UTF16ToASCII(const string16& utf16) { 426 DCHECK(IsStringASCII(utf16)) << utf16; 427 return std::string(utf16.begin(), utf16.end()); 428} 429 430// Latin1 is just the low range of Unicode, so we can copy directly to convert. 431bool WideToLatin1(const std::wstring& wide, std::string* latin1) { 432 std::string output; 433 output.resize(wide.size()); 434 latin1->clear(); 435 for (size_t i = 0; i < wide.size(); i++) { 436 if (wide[i] > 255) 437 return false; 438 output[i] = static_cast<char>(wide[i]); 439 } 440 latin1->swap(output); 441 return true; 442} 443 444template<class STR> 445static bool DoIsStringASCII(const STR& str) { 446 for (size_t i = 0; i < str.length(); i++) { 447 typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i]; 448 if (c > 0x7F) 449 return false; 450 } 451 return true; 452} 453 454bool IsStringASCII(const std::wstring& str) { 455 return DoIsStringASCII(str); 456} 457 458#if !defined(WCHAR_T_IS_UTF16) 459bool IsStringASCII(const string16& str) { 460 return DoIsStringASCII(str); 461} 462#endif 463 464bool IsStringASCII(const base::StringPiece& str) { 465 return DoIsStringASCII(str); 466} 467 468bool IsStringUTF8(const std::string& str) { 469 const char *src = str.data(); 470 int32 src_len = static_cast<int32>(str.length()); 471 int32 char_index = 0; 472 473 while (char_index < src_len) { 474 int32 code_point; 475 CBU8_NEXT(src, char_index, src_len, code_point); 476 if (!base::IsValidCharacter(code_point)) 477 return false; 478 } 479 return true; 480} 481 482template<typename Iter> 483static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 484 Iter a_end, 485 const char* b) { 486 for (Iter it = a_begin; it != a_end; ++it, ++b) { 487 if (!*b || base::ToLowerASCII(*it) != *b) 488 return false; 489 } 490 return *b == 0; 491} 492 493// Front-ends for LowerCaseEqualsASCII. 494bool LowerCaseEqualsASCII(const std::string& a, const char* b) { 495 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 496} 497 498bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) { 499 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 500} 501 502#if !defined(WCHAR_T_IS_UTF16) 503bool LowerCaseEqualsASCII(const string16& a, const char* b) { 504 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 505} 506#endif 507 508bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 509 std::string::const_iterator a_end, 510 const char* b) { 511 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 512} 513 514bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, 515 std::wstring::const_iterator a_end, 516 const char* b) { 517 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 518} 519 520#if !defined(WCHAR_T_IS_UTF16) 521bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 522 string16::const_iterator a_end, 523 const char* b) { 524 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 525} 526#endif 527 528#if !defined(ANDROID) 529bool LowerCaseEqualsASCII(const char* a_begin, 530 const char* a_end, 531 const char* b) { 532 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 533} 534#endif // !ANDROID 535 536#if !defined(ANDROID) 537bool LowerCaseEqualsASCII(const wchar_t* a_begin, 538 const wchar_t* a_end, 539 const char* b) { 540 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 541} 542#endif // !ANDROID 543 544#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID) 545bool LowerCaseEqualsASCII(const char16* a_begin, 546 const char16* a_end, 547 const char* b) { 548 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 549} 550#endif 551 552bool EqualsASCII(const string16& a, const base::StringPiece& b) { 553 if (a.length() != b.length()) 554 return false; 555 return std::equal(b.begin(), b.end(), a.begin()); 556} 557 558bool StartsWithASCII(const std::string& str, 559 const std::string& search, 560 bool case_sensitive) { 561 if (case_sensitive) 562 return str.compare(0, search.length(), search) == 0; 563 else 564 return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0; 565} 566 567template <typename STR> 568bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) { 569 if (case_sensitive) { 570 return str.compare(0, search.length(), search) == 0; 571 } else { 572 if (search.size() > str.size()) 573 return false; 574 return std::equal(search.begin(), search.end(), str.begin(), 575 base::CaseInsensitiveCompare<typename STR::value_type>()); 576 } 577} 578 579bool StartsWith(const std::wstring& str, const std::wstring& search, 580 bool case_sensitive) { 581 return StartsWithT(str, search, case_sensitive); 582} 583 584#if !defined(WCHAR_T_IS_UTF16) 585bool StartsWith(const string16& str, const string16& search, 586 bool case_sensitive) { 587 return StartsWithT(str, search, case_sensitive); 588} 589#endif 590 591template <typename STR> 592bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) { 593 typename STR::size_type str_length = str.length(); 594 typename STR::size_type search_length = search.length(); 595 if (search_length > str_length) 596 return false; 597 if (case_sensitive) { 598 return str.compare(str_length - search_length, search_length, search) == 0; 599 } else { 600 return std::equal(search.begin(), search.end(), 601 str.begin() + (str_length - search_length), 602 base::CaseInsensitiveCompare<typename STR::value_type>()); 603 } 604} 605 606bool EndsWith(const std::string& str, const std::string& search, 607 bool case_sensitive) { 608 return EndsWithT(str, search, case_sensitive); 609} 610 611bool EndsWith(const std::wstring& str, const std::wstring& search, 612 bool case_sensitive) { 613 return EndsWithT(str, search, case_sensitive); 614} 615 616#if !defined(WCHAR_T_IS_UTF16) 617bool EndsWith(const string16& str, const string16& search, 618 bool case_sensitive) { 619 return EndsWithT(str, search, case_sensitive); 620} 621#endif 622 623DataUnits GetByteDisplayUnits(int64 bytes) { 624 // The byte thresholds at which we display amounts. A byte count is displayed 625 // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1]. 626 // This must match the DataUnits enum. 627 static const int64 kUnitThresholds[] = { 628 0, // DATA_UNITS_BYTE, 629 3*1024, // DATA_UNITS_KIBIBYTE, 630 2*1024*1024, // DATA_UNITS_MEBIBYTE, 631 1024*1024*1024 // DATA_UNITS_GIBIBYTE, 632 }; 633 634 if (bytes < 0) { 635 NOTREACHED() << "Negative bytes value"; 636 return DATA_UNITS_BYTE; 637 } 638 639 int unit_index = arraysize(kUnitThresholds); 640 while (--unit_index > 0) { 641 if (bytes >= kUnitThresholds[unit_index]) 642 break; 643 } 644 645 DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE); 646 return DataUnits(unit_index); 647} 648 649// TODO(mpcomplete): deal with locale 650// Byte suffixes. This must match the DataUnits enum. 651static const char* const kByteStrings[] = { 652 "B", 653 "kB", 654 "MB", 655 "GB" 656}; 657 658static const char* const kSpeedStrings[] = { 659 "B/s", 660 "kB/s", 661 "MB/s", 662 "GB/s" 663}; 664 665string16 FormatBytesInternal(int64 bytes, 666 DataUnits units, 667 bool show_units, 668 const char* const* suffix) { 669 if (bytes < 0) { 670 NOTREACHED() << "Negative bytes value"; 671 return string16(); 672 } 673 674 DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE); 675 676 // Put the quantity in the right units. 677 double unit_amount = static_cast<double>(bytes); 678 for (int i = 0; i < units; ++i) 679 unit_amount /= 1024.0; 680 681 char buf[64]; 682 if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100) 683 base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount); 684 else 685 base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount); 686 687 std::string ret(buf); 688 if (show_units) { 689 ret += " "; 690 ret += suffix[units]; 691 } 692 693 return ASCIIToUTF16(ret); 694} 695 696string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) { 697 return FormatBytesInternal(bytes, units, show_units, kByteStrings); 698} 699 700string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) { 701 return FormatBytesInternal(bytes, units, show_units, kSpeedStrings); 702} 703 704template<class StringType> 705void DoReplaceSubstringsAfterOffset(StringType* str, 706 typename StringType::size_type start_offset, 707 const StringType& find_this, 708 const StringType& replace_with, 709 bool replace_all) { 710 if ((start_offset == StringType::npos) || (start_offset >= str->length())) 711 return; 712 713 DCHECK(!find_this.empty()); 714 for (typename StringType::size_type offs(str->find(find_this, start_offset)); 715 offs != StringType::npos; offs = str->find(find_this, offs)) { 716 str->replace(offs, find_this.length(), replace_with); 717 offs += replace_with.length(); 718 719 if (!replace_all) 720 break; 721 } 722} 723 724void ReplaceFirstSubstringAfterOffset(string16* str, 725 string16::size_type start_offset, 726 const string16& find_this, 727 const string16& replace_with) { 728 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 729 false); // replace first instance 730} 731 732void ReplaceFirstSubstringAfterOffset(std::string* str, 733 std::string::size_type start_offset, 734 const std::string& find_this, 735 const std::string& replace_with) { 736 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 737 false); // replace first instance 738} 739 740void ReplaceSubstringsAfterOffset(string16* str, 741 string16::size_type start_offset, 742 const string16& find_this, 743 const string16& replace_with) { 744 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 745 true); // replace all instances 746} 747 748void ReplaceSubstringsAfterOffset(std::string* str, 749 std::string::size_type start_offset, 750 const std::string& find_this, 751 const std::string& replace_with) { 752 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 753 true); // replace all instances 754} 755 756 757template<typename STR> 758static size_t TokenizeT(const STR& str, 759 const STR& delimiters, 760 std::vector<STR>* tokens) { 761 tokens->clear(); 762 763 typename STR::size_type start = str.find_first_not_of(delimiters); 764 while (start != STR::npos) { 765 typename STR::size_type end = str.find_first_of(delimiters, start + 1); 766 if (end == STR::npos) { 767 tokens->push_back(str.substr(start)); 768 break; 769 } else { 770 tokens->push_back(str.substr(start, end - start)); 771 start = str.find_first_not_of(delimiters, end + 1); 772 } 773 } 774 775 return tokens->size(); 776} 777 778size_t Tokenize(const std::wstring& str, 779 const std::wstring& delimiters, 780 std::vector<std::wstring>* tokens) { 781 return TokenizeT(str, delimiters, tokens); 782} 783 784#if !defined(WCHAR_T_IS_UTF16) 785size_t Tokenize(const string16& str, 786 const string16& delimiters, 787 std::vector<string16>* tokens) { 788 return TokenizeT(str, delimiters, tokens); 789} 790#endif 791 792size_t Tokenize(const std::string& str, 793 const std::string& delimiters, 794 std::vector<std::string>* tokens) { 795 return TokenizeT(str, delimiters, tokens); 796} 797 798size_t Tokenize(const base::StringPiece& str, 799 const base::StringPiece& delimiters, 800 std::vector<base::StringPiece>* tokens) { 801 return TokenizeT(str, delimiters, tokens); 802} 803 804template<typename STR> 805static STR JoinStringT(const std::vector<STR>& parts, 806 typename STR::value_type sep) { 807 if (parts.size() == 0) return STR(); 808 809 STR result(parts[0]); 810 typename std::vector<STR>::const_iterator iter = parts.begin(); 811 ++iter; 812 813 for (; iter != parts.end(); ++iter) { 814 result += sep; 815 result += *iter; 816 } 817 818 return result; 819} 820 821std::string JoinString(const std::vector<std::string>& parts, char sep) { 822 return JoinStringT(parts, sep); 823} 824 825string16 JoinString(const std::vector<string16>& parts, char16 sep) { 826 return JoinStringT(parts, sep); 827} 828 829template<typename STR> 830void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) { 831 const size_t length = str.length(); 832 if (!length) 833 return; 834 835 bool last_was_ws = false; 836 size_t last_non_ws_start = 0; 837 for (size_t i = 0; i < length; ++i) { 838 switch (str[i]) { 839 // HTML 5 defines whitespace as: space, tab, LF, line tab, FF, or CR. 840 case L' ': 841 case L'\t': 842 case L'\xA': 843 case L'\xB': 844 case L'\xC': 845 case L'\xD': 846 if (!last_was_ws) { 847 if (i > 0) { 848 result->push_back( 849 str.substr(last_non_ws_start, i - last_non_ws_start)); 850 } 851 last_was_ws = true; 852 } 853 break; 854 855 default: // Not a space character. 856 if (last_was_ws) { 857 last_was_ws = false; 858 last_non_ws_start = i; 859 } 860 break; 861 } 862 } 863 if (!last_was_ws) { 864 result->push_back( 865 str.substr(last_non_ws_start, length - last_non_ws_start)); 866 } 867} 868 869void SplitStringAlongWhitespace(const std::wstring& str, 870 std::vector<std::wstring>* result) { 871 SplitStringAlongWhitespaceT(str, result); 872} 873 874#if !defined(WCHAR_T_IS_UTF16) 875void SplitStringAlongWhitespace(const string16& str, 876 std::vector<string16>* result) { 877 SplitStringAlongWhitespaceT(str, result); 878} 879#endif 880 881void SplitStringAlongWhitespace(const std::string& str, 882 std::vector<std::string>* result) { 883 SplitStringAlongWhitespaceT(str, result); 884} 885 886template<class FormatStringType, class OutStringType> 887OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string, 888 const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) { 889 size_t substitutions = subst.size(); 890 DCHECK(substitutions < 10); 891 892 size_t sub_length = 0; 893 for (typename std::vector<OutStringType>::const_iterator iter = subst.begin(); 894 iter != subst.end(); ++iter) { 895 sub_length += iter->length(); 896 } 897 898 OutStringType formatted; 899 formatted.reserve(format_string.length() + sub_length); 900 901 std::vector<ReplacementOffset> r_offsets; 902 for (typename FormatStringType::const_iterator i = format_string.begin(); 903 i != format_string.end(); ++i) { 904 if ('$' == *i) { 905 if (i + 1 != format_string.end()) { 906 ++i; 907 DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i; 908 if ('$' == *i) { 909 while (i != format_string.end() && '$' == *i) { 910 formatted.push_back('$'); 911 ++i; 912 } 913 --i; 914 } else { 915 uintptr_t index = *i - '1'; 916 if (offsets) { 917 ReplacementOffset r_offset(index, 918 static_cast<int>(formatted.size())); 919 r_offsets.insert(std::lower_bound(r_offsets.begin(), 920 r_offsets.end(), 921 r_offset, 922 &CompareParameter), 923 r_offset); 924 } 925 if (index < substitutions) 926 formatted.append(subst.at(index)); 927 } 928 } 929 } else { 930 formatted.push_back(*i); 931 } 932 } 933 if (offsets) { 934 for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin(); 935 i != r_offsets.end(); ++i) { 936 offsets->push_back(i->offset); 937 } 938 } 939 return formatted; 940} 941 942string16 ReplaceStringPlaceholders(const string16& format_string, 943 const std::vector<string16>& subst, 944 std::vector<size_t>* offsets) { 945 return DoReplaceStringPlaceholders(format_string, subst, offsets); 946} 947 948std::string ReplaceStringPlaceholders(const base::StringPiece& format_string, 949 const std::vector<std::string>& subst, 950 std::vector<size_t>* offsets) { 951 return DoReplaceStringPlaceholders(format_string, subst, offsets); 952} 953 954string16 ReplaceStringPlaceholders(const string16& format_string, 955 const string16& a, 956 size_t* offset) { 957 std::vector<size_t> offsets; 958 std::vector<string16> subst; 959 subst.push_back(a); 960 string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); 961 962 DCHECK(offsets.size() == 1); 963 if (offset) { 964 *offset = offsets[0]; 965 } 966 return result; 967} 968 969static bool IsWildcard(base_icu::UChar32 character) { 970 return character == '*' || character == '?'; 971} 972 973// Move the strings pointers to the point where they start to differ. 974template <typename CHAR, typename NEXT> 975static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, 976 const CHAR** string, const CHAR* string_end, 977 NEXT next) { 978 const CHAR* escape = NULL; 979 while (*pattern != pattern_end && *string != string_end) { 980 if (!escape && IsWildcard(**pattern)) { 981 // We don't want to match wildcard here, except if it's escaped. 982 return; 983 } 984 985 // Check if the escapement char is found. If so, skip it and move to the 986 // next character. 987 if (!escape && **pattern == '\\') { 988 escape = *pattern; 989 next(pattern, pattern_end); 990 continue; 991 } 992 993 // Check if the chars match, if so, increment the ptrs. 994 const CHAR* pattern_next = *pattern; 995 const CHAR* string_next = *string; 996 base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); 997 if (pattern_char == next(&string_next, string_end) && 998 pattern_char != (base_icu::UChar32) CBU_SENTINEL) { 999 *pattern = pattern_next; 1000 *string = string_next; 1001 } else { 1002 // Uh ho, it did not match, we are done. If the last char was an 1003 // escapement, that means that it was an error to advance the ptr here, 1004 // let's put it back where it was. This also mean that the MatchPattern 1005 // function will return false because if we can't match an escape char 1006 // here, then no one will. 1007 if (escape) { 1008 *pattern = escape; 1009 } 1010 return; 1011 } 1012 1013 escape = NULL; 1014 } 1015} 1016 1017template <typename CHAR, typename NEXT> 1018static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { 1019 while (*pattern != end) { 1020 if (!IsWildcard(**pattern)) 1021 return; 1022 next(pattern, end); 1023 } 1024} 1025 1026template <typename CHAR, typename NEXT> 1027static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, 1028 const CHAR* pattern, const CHAR* pattern_end, 1029 int depth, 1030 NEXT next) { 1031 const int kMaxDepth = 16; 1032 if (depth > kMaxDepth) 1033 return false; 1034 1035 // Eat all the matching chars. 1036 EatSameChars(&pattern, pattern_end, &eval, eval_end, next); 1037 1038 // If the string is empty, then the pattern must be empty too, or contains 1039 // only wildcards. 1040 if (eval == eval_end) { 1041 EatWildcard(&pattern, pattern_end, next); 1042 return pattern == pattern_end; 1043 } 1044 1045 // Pattern is empty but not string, this is not a match. 1046 if (pattern == pattern_end) 1047 return false; 1048 1049 // If this is a question mark, then we need to compare the rest with 1050 // the current string or the string with one character eaten. 1051 const CHAR* next_pattern = pattern; 1052 next(&next_pattern, pattern_end); 1053 if (pattern[0] == '?') { 1054 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1055 depth + 1, next)) 1056 return true; 1057 const CHAR* next_eval = eval; 1058 next(&next_eval, eval_end); 1059 if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, 1060 depth + 1, next)) 1061 return true; 1062 } 1063 1064 // This is a *, try to match all the possible substrings with the remainder 1065 // of the pattern. 1066 if (pattern[0] == '*') { 1067 // Collapse duplicate wild cards (********** into *) so that the 1068 // method does not recurse unnecessarily. http://crbug.com/52839 1069 EatWildcard(&next_pattern, pattern_end, next); 1070 1071 while (eval != eval_end) { 1072 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1073 depth + 1, next)) 1074 return true; 1075 eval++; 1076 } 1077 1078 // We reached the end of the string, let see if the pattern contains only 1079 // wildcards. 1080 if (eval == eval_end) { 1081 EatWildcard(&pattern, pattern_end, next); 1082 if (pattern != pattern_end) 1083 return false; 1084 return true; 1085 } 1086 } 1087 1088 return false; 1089} 1090 1091struct NextCharUTF8 { 1092 base_icu::UChar32 operator()(const char** p, const char* end) { 1093 base_icu::UChar32 c; 1094 int offset = 0; 1095 CBU8_NEXT(*p, offset, end - *p, c); 1096 *p += offset; 1097 return c; 1098 } 1099}; 1100 1101struct NextCharUTF16 { 1102 base_icu::UChar32 operator()(const char16** p, const char16* end) { 1103 base_icu::UChar32 c; 1104 int offset = 0; 1105 CBU16_NEXT(*p, offset, end - *p, c); 1106 *p += offset; 1107 return c; 1108 } 1109}; 1110 1111bool MatchPattern(const base::StringPiece& eval, 1112 const base::StringPiece& pattern) { 1113 return MatchPatternT(eval.data(), eval.data() + eval.size(), 1114 pattern.data(), pattern.data() + pattern.size(), 1115 0, NextCharUTF8()); 1116} 1117 1118bool MatchPattern(const string16& eval, const string16& pattern) { 1119 return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), 1120 pattern.c_str(), pattern.c_str() + pattern.size(), 1121 0, NextCharUTF16()); 1122} 1123 1124// The following code is compatible with the OpenBSD lcpy interface. See: 1125// http://www.gratisoft.us/todd/papers/strlcpy.html 1126// ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c 1127 1128namespace { 1129 1130template <typename CHAR> 1131size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { 1132 for (size_t i = 0; i < dst_size; ++i) { 1133 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 1134 return i; 1135 } 1136 1137 // We were left off at dst_size. We over copied 1 byte. Null terminate. 1138 if (dst_size != 0) 1139 dst[dst_size - 1] = 0; 1140 1141 // Count the rest of the |src|, and return it's length in characters. 1142 while (src[dst_size]) ++dst_size; 1143 return dst_size; 1144} 1145 1146} // namespace 1147 1148size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { 1149 return lcpyT<char>(dst, src, dst_size); 1150} 1151size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { 1152 return lcpyT<wchar_t>(dst, src, dst_size); 1153} 1154 1155bool ElideString(const std::wstring& input, int max_len, std::wstring* output) { 1156 DCHECK(max_len >= 0); 1157 if (static_cast<int>(input.length()) <= max_len) { 1158 output->assign(input); 1159 return false; 1160 } 1161 1162 switch (max_len) { 1163 case 0: 1164 output->clear(); 1165 break; 1166 case 1: 1167 output->assign(input.substr(0, 1)); 1168 break; 1169 case 2: 1170 output->assign(input.substr(0, 2)); 1171 break; 1172 case 3: 1173 output->assign(input.substr(0, 1) + L"." + 1174 input.substr(input.length() - 1)); 1175 break; 1176 case 4: 1177 output->assign(input.substr(0, 1) + L".." + 1178 input.substr(input.length() - 1)); 1179 break; 1180 default: { 1181 int rstr_len = (max_len - 3) / 2; 1182 int lstr_len = rstr_len + ((max_len - 3) % 2); 1183 output->assign(input.substr(0, lstr_len) + L"..." + 1184 input.substr(input.length() - rstr_len)); 1185 break; 1186 } 1187 } 1188 1189 return true; 1190} 1191