string_util.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/string_util.h" 6 7#include "build/build_config.h" 8 9#include <ctype.h> 10#include <errno.h> 11#include <math.h> 12#include <stdarg.h> 13#include <stdio.h> 14#include <stdlib.h> 15#include <string.h> 16#include <time.h> 17#include <wchar.h> 18#include <wctype.h> 19 20#include <algorithm> 21#include <vector> 22 23#include "base/basictypes.h" 24#include "base/logging.h" 25#include "base/singleton.h" 26#include "base/third_party/dmg_fp/dmg_fp.h" 27#include "base/utf_string_conversion_utils.h" 28#include "base/utf_string_conversions.h" 29#include "base/third_party/icu/icu_utf.h" 30 31namespace { 32 33// Force the singleton used by Empty[W]String[16] to be a unique type. This 34// prevents other code that might accidentally use Singleton<string> from 35// getting our internal one. 36struct EmptyStrings { 37 EmptyStrings() {} 38 const std::string s; 39 const std::wstring ws; 40 const string16 s16; 41}; 42 43// Used by ReplaceStringPlaceholders to track the position in the string of 44// replaced parameters. 45struct ReplacementOffset { 46 ReplacementOffset(uintptr_t parameter, size_t offset) 47 : parameter(parameter), 48 offset(offset) {} 49 50 // Index of the parameter. 51 uintptr_t parameter; 52 53 // Starting position in the string. 54 size_t offset; 55}; 56 57static bool CompareParameter(const ReplacementOffset& elem1, 58 const ReplacementOffset& elem2) { 59 return elem1.parameter < elem2.parameter; 60} 61 62} // namespace 63 64namespace base { 65 66bool IsWprintfFormatPortable(const wchar_t* format) { 67 for (const wchar_t* position = format; *position != '\0'; ++position) { 68 if (*position == '%') { 69 bool in_specification = true; 70 bool modifier_l = false; 71 while (in_specification) { 72 // Eat up characters until reaching a known specifier. 73 if (*++position == '\0') { 74 // The format string ended in the middle of a specification. Call 75 // it portable because no unportable specifications were found. The 76 // string is equally broken on all platforms. 77 return true; 78 } 79 80 if (*position == 'l') { 81 // 'l' is the only thing that can save the 's' and 'c' specifiers. 82 modifier_l = true; 83 } else if (((*position == 's' || *position == 'c') && !modifier_l) || 84 *position == 'S' || *position == 'C' || *position == 'F' || 85 *position == 'D' || *position == 'O' || *position == 'U') { 86 // Not portable. 87 return false; 88 } 89 90 if (wcschr(L"diouxXeEfgGaAcspn%", *position)) { 91 // Portable, keep scanning the rest of the format string. 92 in_specification = false; 93 } 94 } 95 } 96 } 97 98 return true; 99} 100 101} // namespace base 102 103 104const std::string& EmptyString() { 105 return Singleton<EmptyStrings>::get()->s; 106} 107 108const std::wstring& EmptyWString() { 109 return Singleton<EmptyStrings>::get()->ws; 110} 111 112const string16& EmptyString16() { 113 return Singleton<EmptyStrings>::get()->s16; 114} 115 116#define WHITESPACE_UNICODE \ 117 0x0009, /* <control-0009> to <control-000D> */ \ 118 0x000A, \ 119 0x000B, \ 120 0x000C, \ 121 0x000D, \ 122 0x0020, /* Space */ \ 123 0x0085, /* <control-0085> */ \ 124 0x00A0, /* No-Break Space */ \ 125 0x1680, /* Ogham Space Mark */ \ 126 0x180E, /* Mongolian Vowel Separator */ \ 127 0x2000, /* En Quad to Hair Space */ \ 128 0x2001, \ 129 0x2002, \ 130 0x2003, \ 131 0x2004, \ 132 0x2005, \ 133 0x2006, \ 134 0x2007, \ 135 0x2008, \ 136 0x2009, \ 137 0x200A, \ 138 0x200C, /* Zero Width Non-Joiner */ \ 139 0x2028, /* Line Separator */ \ 140 0x2029, /* Paragraph Separator */ \ 141 0x202F, /* Narrow No-Break Space */ \ 142 0x205F, /* Medium Mathematical Space */ \ 143 0x3000, /* Ideographic Space */ \ 144 0 145 146const wchar_t kWhitespaceWide[] = { 147 WHITESPACE_UNICODE 148}; 149const char16 kWhitespaceUTF16[] = { 150 WHITESPACE_UNICODE 151}; 152const char kWhitespaceASCII[] = { 153 0x09, // <control-0009> to <control-000D> 154 0x0A, 155 0x0B, 156 0x0C, 157 0x0D, 158 0x20, // Space 159 0 160}; 161 162const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF"; 163 164template<typename STR> 165bool RemoveCharsT(const STR& input, 166 const typename STR::value_type remove_chars[], 167 STR* output) { 168 bool removed = false; 169 size_t found; 170 171 *output = input; 172 173 found = output->find_first_of(remove_chars); 174 while (found != STR::npos) { 175 removed = true; 176 output->replace(found, 1, STR()); 177 found = output->find_first_of(remove_chars, found); 178 } 179 180 return removed; 181} 182 183bool RemoveChars(const std::wstring& input, 184 const wchar_t remove_chars[], 185 std::wstring* output) { 186 return RemoveCharsT(input, remove_chars, output); 187} 188 189#if !defined(WCHAR_T_IS_UTF16) 190bool RemoveChars(const string16& input, 191 const char16 remove_chars[], 192 string16* output) { 193 return RemoveCharsT(input, remove_chars, output); 194} 195#endif 196 197bool RemoveChars(const std::string& input, 198 const char remove_chars[], 199 std::string* output) { 200 return RemoveCharsT(input, remove_chars, output); 201} 202 203template<typename STR> 204TrimPositions TrimStringT(const STR& input, 205 const typename STR::value_type trim_chars[], 206 TrimPositions positions, 207 STR* output) { 208 // Find the edges of leading/trailing whitespace as desired. 209 const typename STR::size_type last_char = input.length() - 1; 210 const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ? 211 input.find_first_not_of(trim_chars) : 0; 212 const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ? 213 input.find_last_not_of(trim_chars) : last_char; 214 215 // When the string was all whitespace, report that we stripped off whitespace 216 // from whichever position the caller was interested in. For empty input, we 217 // stripped no whitespace, but we still need to clear |output|. 218 if (input.empty() || 219 (first_good_char == STR::npos) || (last_good_char == STR::npos)) { 220 bool input_was_empty = input.empty(); // in case output == &input 221 output->clear(); 222 return input_was_empty ? TRIM_NONE : positions; 223 } 224 225 // Trim the whitespace. 226 *output = 227 input.substr(first_good_char, last_good_char - first_good_char + 1); 228 229 // Return where we trimmed from. 230 return static_cast<TrimPositions>( 231 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | 232 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); 233} 234 235bool TrimString(const std::wstring& input, 236 const wchar_t trim_chars[], 237 std::wstring* output) { 238 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 239} 240 241#if !defined(WCHAR_T_IS_UTF16) 242bool TrimString(const string16& input, 243 const char16 trim_chars[], 244 string16* output) { 245 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 246} 247#endif 248 249bool TrimString(const std::string& input, 250 const char trim_chars[], 251 std::string* output) { 252 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 253} 254 255void TruncateUTF8ToByteSize(const std::string& input, 256 const size_t byte_size, 257 std::string* output) { 258 DCHECK(output); 259 if (byte_size > input.length()) { 260 *output = input; 261 return; 262 } 263 DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); 264 // Note: This cast is necessary because CBU8_NEXT uses int32s. 265 int32 truncation_length = static_cast<int32>(byte_size); 266 int32 char_index = truncation_length - 1; 267 const char* data = input.data(); 268 269 // Using CBU8, we will move backwards from the truncation point 270 // to the beginning of the string looking for a valid UTF8 271 // character. Once a full UTF8 character is found, we will 272 // truncate the string to the end of that character. 273 while (char_index >= 0) { 274 int32 prev = char_index; 275 uint32 code_point = 0; 276 CBU8_NEXT(data, char_index, truncation_length, code_point); 277 if (!base::IsValidCharacter(code_point) || 278 !base::IsValidCodepoint(code_point)) { 279 char_index = prev - 1; 280 } else { 281 break; 282 } 283 } 284 285 if (char_index >= 0 ) 286 *output = input.substr(0, char_index); 287 else 288 output->clear(); 289} 290 291TrimPositions TrimWhitespace(const std::wstring& input, 292 TrimPositions positions, 293 std::wstring* output) { 294 return TrimStringT(input, kWhitespaceWide, positions, output); 295} 296 297#if !defined(WCHAR_T_IS_UTF16) 298TrimPositions TrimWhitespace(const string16& input, 299 TrimPositions positions, 300 string16* output) { 301 return TrimStringT(input, kWhitespaceUTF16, positions, output); 302} 303#endif 304 305TrimPositions TrimWhitespaceASCII(const std::string& input, 306 TrimPositions positions, 307 std::string* output) { 308 return TrimStringT(input, kWhitespaceASCII, positions, output); 309} 310 311// This function is only for backward-compatibility. 312// To be removed when all callers are updated. 313TrimPositions TrimWhitespace(const std::string& input, 314 TrimPositions positions, 315 std::string* output) { 316 return TrimWhitespaceASCII(input, positions, output); 317} 318 319template<typename STR> 320STR CollapseWhitespaceT(const STR& text, 321 bool trim_sequences_with_line_breaks) { 322 STR result; 323 result.resize(text.size()); 324 325 // Set flags to pretend we're already in a trimmed whitespace sequence, so we 326 // will trim any leading whitespace. 327 bool in_whitespace = true; 328 bool already_trimmed = true; 329 330 int chars_written = 0; 331 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { 332 if (IsWhitespace(*i)) { 333 if (!in_whitespace) { 334 // Reduce all whitespace sequences to a single space. 335 in_whitespace = true; 336 result[chars_written++] = L' '; 337 } 338 if (trim_sequences_with_line_breaks && !already_trimmed && 339 ((*i == '\n') || (*i == '\r'))) { 340 // Whitespace sequences containing CR or LF are eliminated entirely. 341 already_trimmed = true; 342 --chars_written; 343 } 344 } else { 345 // Non-whitespace chracters are copied straight across. 346 in_whitespace = false; 347 already_trimmed = false; 348 result[chars_written++] = *i; 349 } 350 } 351 352 if (in_whitespace && !already_trimmed) { 353 // Any trailing whitespace is eliminated. 354 --chars_written; 355 } 356 357 result.resize(chars_written); 358 return result; 359} 360 361std::wstring CollapseWhitespace(const std::wstring& text, 362 bool trim_sequences_with_line_breaks) { 363 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 364} 365 366#if !defined(WCHAR_T_IS_UTF16) 367string16 CollapseWhitespace(const string16& text, 368 bool trim_sequences_with_line_breaks) { 369 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 370} 371#endif 372 373std::string CollapseWhitespaceASCII(const std::string& text, 374 bool trim_sequences_with_line_breaks) { 375 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 376} 377 378bool ContainsOnlyWhitespaceASCII(const std::string& str) { 379 for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) { 380 if (!IsAsciiWhitespace(*i)) 381 return false; 382 } 383 return true; 384} 385 386bool ContainsOnlyWhitespace(const string16& str) { 387 for (string16::const_iterator i(str.begin()); i != str.end(); ++i) { 388 if (!IsWhitespace(*i)) 389 return false; 390 } 391 return true; 392} 393 394template<typename STR> 395static bool ContainsOnlyCharsT(const STR& input, const STR& characters) { 396 for (typename STR::const_iterator iter = input.begin(); 397 iter != input.end(); ++iter) { 398 if (characters.find(*iter) == STR::npos) 399 return false; 400 } 401 return true; 402} 403 404bool ContainsOnlyChars(const std::wstring& input, 405 const std::wstring& characters) { 406 return ContainsOnlyCharsT(input, characters); 407} 408 409#if !defined(WCHAR_T_IS_UTF16) 410bool ContainsOnlyChars(const string16& input, const string16& characters) { 411 return ContainsOnlyCharsT(input, characters); 412} 413#endif 414 415bool ContainsOnlyChars(const std::string& input, 416 const std::string& characters) { 417 return ContainsOnlyCharsT(input, characters); 418} 419 420std::string WideToASCII(const std::wstring& wide) { 421 DCHECK(IsStringASCII(wide)) << wide; 422 return std::string(wide.begin(), wide.end()); 423} 424 425std::string UTF16ToASCII(const string16& utf16) { 426 DCHECK(IsStringASCII(utf16)) << utf16; 427 return std::string(utf16.begin(), utf16.end()); 428} 429 430// Latin1 is just the low range of Unicode, so we can copy directly to convert. 431bool WideToLatin1(const std::wstring& wide, std::string* latin1) { 432 std::string output; 433 output.resize(wide.size()); 434 latin1->clear(); 435 for (size_t i = 0; i < wide.size(); i++) { 436 if (wide[i] > 255) 437 return false; 438 output[i] = static_cast<char>(wide[i]); 439 } 440 latin1->swap(output); 441 return true; 442} 443 444bool IsString8Bit(const std::wstring& str) { 445 for (size_t i = 0; i < str.length(); i++) { 446 if (str[i] > 255) 447 return false; 448 } 449 return true; 450} 451 452template<class STR> 453static bool DoIsStringASCII(const STR& str) { 454 for (size_t i = 0; i < str.length(); i++) { 455 typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i]; 456 if (c > 0x7F) 457 return false; 458 } 459 return true; 460} 461 462bool IsStringASCII(const std::wstring& str) { 463 return DoIsStringASCII(str); 464} 465 466#if !defined(WCHAR_T_IS_UTF16) 467bool IsStringASCII(const string16& str) { 468 return DoIsStringASCII(str); 469} 470#endif 471 472bool IsStringASCII(const base::StringPiece& str) { 473 return DoIsStringASCII(str); 474} 475 476bool IsStringUTF8(const std::string& str) { 477 const char *src = str.data(); 478 int32 src_len = static_cast<int32>(str.length()); 479 int32 char_index = 0; 480 481 while (char_index < src_len) { 482 int32 code_point; 483 CBU8_NEXT(src, char_index, src_len, code_point); 484 if (!base::IsValidCharacter(code_point)) 485 return false; 486 } 487 return true; 488} 489 490template<typename Iter> 491static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 492 Iter a_end, 493 const char* b) { 494 for (Iter it = a_begin; it != a_end; ++it, ++b) { 495 if (!*b || ToLowerASCII(*it) != *b) 496 return false; 497 } 498 return *b == 0; 499} 500 501// Front-ends for LowerCaseEqualsASCII. 502bool LowerCaseEqualsASCII(const std::string& a, const char* b) { 503 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 504} 505 506bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) { 507 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 508} 509 510#if !defined(WCHAR_T_IS_UTF16) 511bool LowerCaseEqualsASCII(const string16& a, const char* b) { 512 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 513} 514#endif 515 516bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 517 std::string::const_iterator a_end, 518 const char* b) { 519 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 520} 521 522bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, 523 std::wstring::const_iterator a_end, 524 const char* b) { 525 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 526} 527 528#if !defined(WCHAR_T_IS_UTF16) 529bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 530 string16::const_iterator a_end, 531 const char* b) { 532 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 533} 534#endif 535 536// weird string library ... 537#if !defined(ANDROID) 538bool LowerCaseEqualsASCII(const char* a_begin, 539 const char* a_end, 540 const char* b) { 541 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 542} 543#endif // !ANDROID 544 545// weird string library ... 546#if !defined(ANDROID) 547bool LowerCaseEqualsASCII(const wchar_t* a_begin, 548 const wchar_t* a_end, 549 const char* b) { 550 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 551} 552#endif // !ANDROID 553 554#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID) 555bool LowerCaseEqualsASCII(const char16* a_begin, 556 const char16* a_end, 557 const char* b) { 558 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 559} 560#endif 561 562bool EqualsASCII(const string16& a, const base::StringPiece& b) { 563 if (a.length() != b.length()) 564 return false; 565 return std::equal(b.begin(), b.end(), a.begin()); 566} 567 568bool StartsWithASCII(const std::string& str, 569 const std::string& search, 570 bool case_sensitive) { 571 if (case_sensitive) 572 return str.compare(0, search.length(), search) == 0; 573 else 574 return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0; 575} 576 577template <typename STR> 578bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) { 579 if (case_sensitive) { 580 return str.compare(0, search.length(), search) == 0; 581 } else { 582 if (search.size() > str.size()) 583 return false; 584 return std::equal(search.begin(), search.end(), str.begin(), 585 CaseInsensitiveCompare<typename STR::value_type>()); 586 } 587} 588 589bool StartsWith(const std::wstring& str, const std::wstring& search, 590 bool case_sensitive) { 591 return StartsWithT(str, search, case_sensitive); 592} 593 594#if !defined(WCHAR_T_IS_UTF16) 595bool StartsWith(const string16& str, const string16& search, 596 bool case_sensitive) { 597 return StartsWithT(str, search, case_sensitive); 598} 599#endif 600 601template <typename STR> 602bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) { 603 typename STR::size_type str_length = str.length(); 604 typename STR::size_type search_length = search.length(); 605 if (search_length > str_length) 606 return false; 607 if (case_sensitive) { 608 return str.compare(str_length - search_length, search_length, search) == 0; 609 } else { 610 return std::equal(search.begin(), search.end(), 611 str.begin() + (str_length - search_length), 612 CaseInsensitiveCompare<typename STR::value_type>()); 613 } 614} 615 616bool EndsWith(const std::string& str, const std::string& search, 617 bool case_sensitive) { 618 return EndsWithT(str, search, case_sensitive); 619} 620 621bool EndsWith(const std::wstring& str, const std::wstring& search, 622 bool case_sensitive) { 623 return EndsWithT(str, search, case_sensitive); 624} 625 626#if !defined(WCHAR_T_IS_UTF16) 627bool EndsWith(const string16& str, const string16& search, 628 bool case_sensitive) { 629 return EndsWithT(str, search, case_sensitive); 630} 631#endif 632 633DataUnits GetByteDisplayUnits(int64 bytes) { 634 // The byte thresholds at which we display amounts. A byte count is displayed 635 // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1]. 636 // This must match the DataUnits enum. 637 static const int64 kUnitThresholds[] = { 638 0, // DATA_UNITS_BYTE, 639 3*1024, // DATA_UNITS_KIBIBYTE, 640 2*1024*1024, // DATA_UNITS_MEBIBYTE, 641 1024*1024*1024 // DATA_UNITS_GIBIBYTE, 642 }; 643 644 if (bytes < 0) { 645 NOTREACHED() << "Negative bytes value"; 646 return DATA_UNITS_BYTE; 647 } 648 649 int unit_index = arraysize(kUnitThresholds); 650 while (--unit_index > 0) { 651 if (bytes >= kUnitThresholds[unit_index]) 652 break; 653 } 654 655 DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE); 656 return DataUnits(unit_index); 657} 658 659// TODO(mpcomplete): deal with locale 660// Byte suffixes. This must match the DataUnits enum. 661static const char* const kByteStrings[] = { 662 "B", 663 "kB", 664 "MB", 665 "GB" 666}; 667 668static const char* const kSpeedStrings[] = { 669 "B/s", 670 "kB/s", 671 "MB/s", 672 "GB/s" 673}; 674 675string16 FormatBytesInternal(int64 bytes, 676 DataUnits units, 677 bool show_units, 678 const char* const* suffix) { 679 if (bytes < 0) { 680 NOTREACHED() << "Negative bytes value"; 681 return string16(); 682 } 683 684 DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE); 685 686 // Put the quantity in the right units. 687 double unit_amount = static_cast<double>(bytes); 688 for (int i = 0; i < units; ++i) 689 unit_amount /= 1024.0; 690 691 char buf[64]; 692 if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100) 693 base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount); 694 else 695 base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount); 696 697 std::string ret(buf); 698 if (show_units) { 699 ret += " "; 700 ret += suffix[units]; 701 } 702 703 return ASCIIToUTF16(ret); 704} 705 706string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) { 707 return FormatBytesInternal(bytes, units, show_units, kByteStrings); 708} 709 710string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) { 711 return FormatBytesInternal(bytes, units, show_units, kSpeedStrings); 712} 713 714template<class StringType> 715void DoReplaceSubstringsAfterOffset(StringType* str, 716 typename StringType::size_type start_offset, 717 const StringType& find_this, 718 const StringType& replace_with, 719 bool replace_all) { 720 if ((start_offset == StringType::npos) || (start_offset >= str->length())) 721 return; 722 723 DCHECK(!find_this.empty()); 724 for (typename StringType::size_type offs(str->find(find_this, start_offset)); 725 offs != StringType::npos; offs = str->find(find_this, offs)) { 726 str->replace(offs, find_this.length(), replace_with); 727 offs += replace_with.length(); 728 729 if (!replace_all) 730 break; 731 } 732} 733 734void ReplaceFirstSubstringAfterOffset(string16* str, 735 string16::size_type start_offset, 736 const string16& find_this, 737 const string16& replace_with) { 738 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 739 false); // replace first instance 740} 741 742void ReplaceFirstSubstringAfterOffset(std::string* str, 743 std::string::size_type start_offset, 744 const std::string& find_this, 745 const std::string& replace_with) { 746 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 747 false); // replace first instance 748} 749 750void ReplaceSubstringsAfterOffset(string16* str, 751 string16::size_type start_offset, 752 const string16& find_this, 753 const string16& replace_with) { 754 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 755 true); // replace all instances 756} 757 758void ReplaceSubstringsAfterOffset(std::string* str, 759 std::string::size_type start_offset, 760 const std::string& find_this, 761 const std::string& replace_with) { 762 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 763 true); // replace all instances 764} 765 766 767template<typename STR> 768static size_t TokenizeT(const STR& str, 769 const STR& delimiters, 770 std::vector<STR>* tokens) { 771 tokens->clear(); 772 773 typename STR::size_type start = str.find_first_not_of(delimiters); 774 while (start != STR::npos) { 775 typename STR::size_type end = str.find_first_of(delimiters, start + 1); 776 if (end == STR::npos) { 777 tokens->push_back(str.substr(start)); 778 break; 779 } else { 780 tokens->push_back(str.substr(start, end - start)); 781 start = str.find_first_not_of(delimiters, end + 1); 782 } 783 } 784 785 return tokens->size(); 786} 787 788size_t Tokenize(const std::wstring& str, 789 const std::wstring& delimiters, 790 std::vector<std::wstring>* tokens) { 791 return TokenizeT(str, delimiters, tokens); 792} 793 794#if !defined(WCHAR_T_IS_UTF16) 795size_t Tokenize(const string16& str, 796 const string16& delimiters, 797 std::vector<string16>* tokens) { 798 return TokenizeT(str, delimiters, tokens); 799} 800#endif 801 802size_t Tokenize(const std::string& str, 803 const std::string& delimiters, 804 std::vector<std::string>* tokens) { 805 return TokenizeT(str, delimiters, tokens); 806} 807 808size_t Tokenize(const base::StringPiece& str, 809 const base::StringPiece& delimiters, 810 std::vector<base::StringPiece>* tokens) { 811 return TokenizeT(str, delimiters, tokens); 812} 813 814template<typename STR> 815static STR JoinStringT(const std::vector<STR>& parts, 816 typename STR::value_type sep) { 817 if (parts.size() == 0) return STR(); 818 819 STR result(parts[0]); 820 typename std::vector<STR>::const_iterator iter = parts.begin(); 821 ++iter; 822 823 for (; iter != parts.end(); ++iter) { 824 result += sep; 825 result += *iter; 826 } 827 828 return result; 829} 830 831std::string JoinString(const std::vector<std::string>& parts, char sep) { 832 return JoinStringT(parts, sep); 833} 834 835#if !defined(WCHAR_T_IS_UTF16) 836string16 JoinString(const std::vector<string16>& parts, char16 sep) { 837 return JoinStringT(parts, sep); 838} 839#endif 840 841std::wstring JoinString(const std::vector<std::wstring>& parts, wchar_t sep) { 842 return JoinStringT(parts, sep); 843} 844 845template<typename STR> 846void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) { 847 const size_t length = str.length(); 848 if (!length) 849 return; 850 851 bool last_was_ws = false; 852 size_t last_non_ws_start = 0; 853 for (size_t i = 0; i < length; ++i) { 854 switch (str[i]) { 855 // HTML 5 defines whitespace as: space, tab, LF, line tab, FF, or CR. 856 case L' ': 857 case L'\t': 858 case L'\xA': 859 case L'\xB': 860 case L'\xC': 861 case L'\xD': 862 if (!last_was_ws) { 863 if (i > 0) { 864 result->push_back( 865 str.substr(last_non_ws_start, i - last_non_ws_start)); 866 } 867 last_was_ws = true; 868 } 869 break; 870 871 default: // Not a space character. 872 if (last_was_ws) { 873 last_was_ws = false; 874 last_non_ws_start = i; 875 } 876 break; 877 } 878 } 879 if (!last_was_ws) { 880 result->push_back( 881 str.substr(last_non_ws_start, length - last_non_ws_start)); 882 } 883} 884 885void SplitStringAlongWhitespace(const std::wstring& str, 886 std::vector<std::wstring>* result) { 887 SplitStringAlongWhitespaceT(str, result); 888} 889 890#if !defined(WCHAR_T_IS_UTF16) 891void SplitStringAlongWhitespace(const string16& str, 892 std::vector<string16>* result) { 893 SplitStringAlongWhitespaceT(str, result); 894} 895#endif 896 897void SplitStringAlongWhitespace(const std::string& str, 898 std::vector<std::string>* result) { 899 SplitStringAlongWhitespaceT(str, result); 900} 901 902template<class FormatStringType, class OutStringType> 903OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string, 904 const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) { 905 size_t substitutions = subst.size(); 906 DCHECK(substitutions < 10); 907 908 size_t sub_length = 0; 909 for (typename std::vector<OutStringType>::const_iterator iter = subst.begin(); 910 iter != subst.end(); ++iter) { 911 sub_length += (*iter).length(); 912 } 913 914 OutStringType formatted; 915 formatted.reserve(format_string.length() + sub_length); 916 917 std::vector<ReplacementOffset> r_offsets; 918 for (typename FormatStringType::const_iterator i = format_string.begin(); 919 i != format_string.end(); ++i) { 920 if ('$' == *i) { 921 if (i + 1 != format_string.end()) { 922 ++i; 923 DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i; 924 if ('$' == *i) { 925 while (i != format_string.end() && '$' == *i) { 926 formatted.push_back('$'); 927 ++i; 928 } 929 --i; 930 } else { 931 uintptr_t index = *i - '1'; 932 if (offsets) { 933 ReplacementOffset r_offset(index, 934 static_cast<int>(formatted.size())); 935 r_offsets.insert(std::lower_bound(r_offsets.begin(), 936 r_offsets.end(), r_offset, 937 &CompareParameter), 938 r_offset); 939 } 940 if (index < substitutions) 941 formatted.append(subst.at(index)); 942 } 943 } 944 } else { 945 formatted.push_back(*i); 946 } 947 } 948 if (offsets) { 949 for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin(); 950 i != r_offsets.end(); ++i) { 951 offsets->push_back(i->offset); 952 } 953 } 954 return formatted; 955} 956 957string16 ReplaceStringPlaceholders(const string16& format_string, 958 const std::vector<string16>& subst, 959 std::vector<size_t>* offsets) { 960 return DoReplaceStringPlaceholders(format_string, subst, offsets); 961} 962 963std::string ReplaceStringPlaceholders(const base::StringPiece& format_string, 964 const std::vector<std::string>& subst, 965 std::vector<size_t>* offsets) { 966 return DoReplaceStringPlaceholders(format_string, subst, offsets); 967} 968 969string16 ReplaceStringPlaceholders(const string16& format_string, 970 const string16& a, 971 size_t* offset) { 972 std::vector<size_t> offsets; 973 std::vector<string16> subst; 974 subst.push_back(a); 975 string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); 976 977 DCHECK(offsets.size() == 1); 978 if (offset) { 979 *offset = offsets[0]; 980 } 981 return result; 982} 983 984static bool IsWildcard(base_icu::UChar32 character) { 985 return character == '*' || character == '?'; 986} 987 988// Move the strings pointers to the point where they start to differ. 989template <typename CHAR, typename NEXT> 990static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, 991 const CHAR** string, const CHAR* string_end, 992 NEXT next) { 993 const CHAR* escape = NULL; 994 while (*pattern != pattern_end && *string != string_end) { 995 if (!escape && IsWildcard(**pattern)) { 996 // We don't want to match wildcard here, except if it's escaped. 997 return; 998 } 999 1000 // Check if the escapement char is found. If so, skip it and move to the 1001 // next character. 1002 if (!escape && **pattern == '\\') { 1003 escape = *pattern; 1004 next(pattern, pattern_end); 1005 continue; 1006 } 1007 1008 // Check if the chars match, if so, increment the ptrs. 1009 const CHAR* pattern_next = *pattern; 1010 const CHAR* string_next = *string; 1011 base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); 1012 if (pattern_char == next(&string_next, string_end) && 1013 pattern_char != (base_icu::UChar32) CBU_SENTINEL) { 1014 *pattern = pattern_next; 1015 *string = string_next; 1016 } else { 1017 // Uh ho, it did not match, we are done. If the last char was an 1018 // escapement, that means that it was an error to advance the ptr here, 1019 // let's put it back where it was. This also mean that the MatchPattern 1020 // function will return false because if we can't match an escape char 1021 // here, then no one will. 1022 if (escape) { 1023 *pattern = escape; 1024 } 1025 return; 1026 } 1027 1028 escape = NULL; 1029 } 1030} 1031 1032template <typename CHAR, typename NEXT> 1033static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { 1034 while (*pattern != end) { 1035 if (!IsWildcard(**pattern)) 1036 return; 1037 next(pattern, end); 1038 } 1039} 1040 1041template <typename CHAR, typename NEXT> 1042static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, 1043 const CHAR* pattern, const CHAR* pattern_end, 1044 int depth, 1045 NEXT next) { 1046 const int kMaxDepth = 16; 1047 if (depth > kMaxDepth) 1048 return false; 1049 1050 // Eat all the matching chars. 1051 EatSameChars(&pattern, pattern_end, &eval, eval_end, next); 1052 1053 // If the string is empty, then the pattern must be empty too, or contains 1054 // only wildcards. 1055 if (eval == eval_end) { 1056 EatWildcard(&pattern, pattern_end, next); 1057 return pattern == pattern_end; 1058 } 1059 1060 // Pattern is empty but not string, this is not a match. 1061 if (pattern == pattern_end) 1062 return false; 1063 1064 // If this is a question mark, then we need to compare the rest with 1065 // the current string or the string with one character eaten. 1066 const CHAR* next_pattern = pattern; 1067 next(&next_pattern, pattern_end); 1068 if (pattern[0] == '?') { 1069 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1070 depth + 1, next)) 1071 return true; 1072 const CHAR* next_eval = eval; 1073 next(&next_eval, eval_end); 1074 if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, 1075 depth + 1, next)) 1076 return true; 1077 } 1078 1079 // This is a *, try to match all the possible substrings with the remainder 1080 // of the pattern. 1081 if (pattern[0] == '*') { 1082 while (eval != eval_end) { 1083 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1084 depth + 1, next)) 1085 return true; 1086 eval++; 1087 } 1088 1089 // We reached the end of the string, let see if the pattern contains only 1090 // wildcards. 1091 if (eval == eval_end) { 1092 EatWildcard(&pattern, pattern_end, next); 1093 if (pattern != pattern_end) 1094 return false; 1095 return true; 1096 } 1097 } 1098 1099 return false; 1100} 1101 1102struct NextCharUTF8 { 1103 base_icu::UChar32 operator()(const char** p, const char* end) { 1104 base_icu::UChar32 c; 1105 int offset = 0; 1106 CBU8_NEXT(*p, offset, end - *p, c); 1107 *p += offset; 1108 return c; 1109 } 1110}; 1111 1112struct NextCharUTF16 { 1113 base_icu::UChar32 operator()(const char16** p, const char16* end) { 1114 base_icu::UChar32 c; 1115 int offset = 0; 1116 CBU16_NEXT(*p, offset, end - *p, c); 1117 *p += offset; 1118 return c; 1119 } 1120}; 1121 1122bool MatchPattern(const base::StringPiece& eval, 1123 const base::StringPiece& pattern) { 1124 return MatchPatternT(eval.data(), eval.data() + eval.size(), 1125 pattern.data(), pattern.data() + pattern.size(), 1126 0, NextCharUTF8()); 1127} 1128 1129bool MatchPattern(const string16& eval, const string16& pattern) { 1130 return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), 1131 pattern.c_str(), pattern.c_str() + pattern.size(), 1132 0, NextCharUTF16()); 1133} 1134 1135// The following code is compatible with the OpenBSD lcpy interface. See: 1136// http://www.gratisoft.us/todd/papers/strlcpy.html 1137// ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c 1138 1139namespace { 1140 1141template <typename CHAR> 1142size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { 1143 for (size_t i = 0; i < dst_size; ++i) { 1144 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 1145 return i; 1146 } 1147 1148 // We were left off at dst_size. We over copied 1 byte. Null terminate. 1149 if (dst_size != 0) 1150 dst[dst_size - 1] = 0; 1151 1152 // Count the rest of the |src|, and return it's length in characters. 1153 while (src[dst_size]) ++dst_size; 1154 return dst_size; 1155} 1156 1157} // namespace 1158 1159size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { 1160 return lcpyT<char>(dst, src, dst_size); 1161} 1162size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { 1163 return lcpyT<wchar_t>(dst, src, dst_size); 1164} 1165 1166bool ElideString(const std::wstring& input, int max_len, std::wstring* output) { 1167 DCHECK(max_len >= 0); 1168 if (static_cast<int>(input.length()) <= max_len) { 1169 output->assign(input); 1170 return false; 1171 } 1172 1173 switch (max_len) { 1174 case 0: 1175 output->clear(); 1176 break; 1177 case 1: 1178 output->assign(input.substr(0, 1)); 1179 break; 1180 case 2: 1181 output->assign(input.substr(0, 2)); 1182 break; 1183 case 3: 1184 output->assign(input.substr(0, 1) + L"." + 1185 input.substr(input.length() - 1)); 1186 break; 1187 case 4: 1188 output->assign(input.substr(0, 1) + L".." + 1189 input.substr(input.length() - 1)); 1190 break; 1191 default: { 1192 int rstr_len = (max_len - 3) / 2; 1193 int lstr_len = rstr_len + ((max_len - 3) % 2); 1194 output->assign(input.substr(0, lstr_len) + L"..." + 1195 input.substr(input.length() - rstr_len)); 1196 break; 1197 } 1198 } 1199 1200 return true; 1201} 1202