string_util.cc revision 1391b24619d56bae6ce14bb54ed0fb16a945e853
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/string_util.h" 6 7#include "build/build_config.h" 8 9#include <ctype.h> 10#include <errno.h> 11#include <math.h> 12#include <stdarg.h> 13#include <stdio.h> 14#include <stdlib.h> 15#include <string.h> 16#include <time.h> 17#include <wchar.h> 18#include <wctype.h> 19 20#include <algorithm> 21#include <vector> 22 23#include "base/basictypes.h" 24#include "base/logging.h" 25#include "base/singleton.h" 26#include "base/third_party/dmg_fp/dmg_fp.h" 27#include "base/utf_string_conversion_utils.h" 28#include "base/utf_string_conversions.h" 29#include "base/third_party/icu/icu_utf.h" 30 31namespace { 32 33// Force the singleton used by Empty[W]String[16] to be a unique type. This 34// prevents other code that might accidentally use Singleton<string> from 35// getting our internal one. 36struct EmptyStrings { 37 EmptyStrings() {} 38 const std::string s; 39 const std::wstring ws; 40 const string16 s16; 41}; 42 43// Used by ReplaceStringPlaceholders to track the position in the string of 44// replaced parameters. 45struct ReplacementOffset { 46 ReplacementOffset(uintptr_t parameter, size_t offset) 47 : parameter(parameter), 48 offset(offset) {} 49 50 // Index of the parameter. 51 uintptr_t parameter; 52 53 // Starting position in the string. 54 size_t offset; 55}; 56 57static bool CompareParameter(const ReplacementOffset& elem1, 58 const ReplacementOffset& elem2) { 59 return elem1.parameter < elem2.parameter; 60} 61 62} // namespace 63 64namespace base { 65 66bool IsWprintfFormatPortable(const wchar_t* format) { 67 for (const wchar_t* position = format; *position != '\0'; ++position) { 68 if (*position == '%') { 69 bool in_specification = true; 70 bool modifier_l = false; 71 while (in_specification) { 72 // Eat up characters until reaching a known specifier. 73 if (*++position == '\0') { 74 // The format string ended in the middle of a specification. Call 75 // it portable because no unportable specifications were found. The 76 // string is equally broken on all platforms. 77 return true; 78 } 79 80 if (*position == 'l') { 81 // 'l' is the only thing that can save the 's' and 'c' specifiers. 82 modifier_l = true; 83 } else if (((*position == 's' || *position == 'c') && !modifier_l) || 84 *position == 'S' || *position == 'C' || *position == 'F' || 85 *position == 'D' || *position == 'O' || *position == 'U') { 86 // Not portable. 87 return false; 88 } 89 90 if (wcschr(L"diouxXeEfgGaAcspn%", *position)) { 91 // Portable, keep scanning the rest of the format string. 92 in_specification = false; 93 } 94 } 95 } 96 } 97 98 return true; 99} 100 101} // namespace base 102 103 104const std::string& EmptyString() { 105 return Singleton<EmptyStrings>::get()->s; 106} 107 108const std::wstring& EmptyWString() { 109 return Singleton<EmptyStrings>::get()->ws; 110} 111 112const string16& EmptyString16() { 113 return Singleton<EmptyStrings>::get()->s16; 114} 115 116#define WHITESPACE_UNICODE \ 117 0x0009, /* <control-0009> to <control-000D> */ \ 118 0x000A, \ 119 0x000B, \ 120 0x000C, \ 121 0x000D, \ 122 0x0020, /* Space */ \ 123 0x0085, /* <control-0085> */ \ 124 0x00A0, /* No-Break Space */ \ 125 0x1680, /* Ogham Space Mark */ \ 126 0x180E, /* Mongolian Vowel Separator */ \ 127 0x2000, /* En Quad to Hair Space */ \ 128 0x2001, \ 129 0x2002, \ 130 0x2003, \ 131 0x2004, \ 132 0x2005, \ 133 0x2006, \ 134 0x2007, \ 135 0x2008, \ 136 0x2009, \ 137 0x200A, \ 138 0x200C, /* Zero Width Non-Joiner */ \ 139 0x2028, /* Line Separator */ \ 140 0x2029, /* Paragraph Separator */ \ 141 0x202F, /* Narrow No-Break Space */ \ 142 0x205F, /* Medium Mathematical Space */ \ 143 0x3000, /* Ideographic Space */ \ 144 0 145 146const wchar_t kWhitespaceWide[] = { 147 WHITESPACE_UNICODE 148}; 149const char16 kWhitespaceUTF16[] = { 150 WHITESPACE_UNICODE 151}; 152const char kWhitespaceASCII[] = { 153 0x09, // <control-0009> to <control-000D> 154 0x0A, 155 0x0B, 156 0x0C, 157 0x0D, 158 0x20, // Space 159 0 160}; 161 162const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF"; 163 164template<typename STR> 165bool RemoveCharsT(const STR& input, 166 const typename STR::value_type remove_chars[], 167 STR* output) { 168 bool removed = false; 169 size_t found; 170 171 *output = input; 172 173 found = output->find_first_of(remove_chars); 174 while (found != STR::npos) { 175 removed = true; 176 output->replace(found, 1, STR()); 177 found = output->find_first_of(remove_chars, found); 178 } 179 180 return removed; 181} 182 183bool RemoveChars(const std::wstring& input, 184 const wchar_t remove_chars[], 185 std::wstring* output) { 186 return RemoveCharsT(input, remove_chars, output); 187} 188 189#if !defined(WCHAR_T_IS_UTF16) 190bool RemoveChars(const string16& input, 191 const char16 remove_chars[], 192 string16* output) { 193 return RemoveCharsT(input, remove_chars, output); 194} 195#endif 196 197bool RemoveChars(const std::string& input, 198 const char remove_chars[], 199 std::string* output) { 200 return RemoveCharsT(input, remove_chars, output); 201} 202 203template<typename STR> 204TrimPositions TrimStringT(const STR& input, 205 const typename STR::value_type trim_chars[], 206 TrimPositions positions, 207 STR* output) { 208 // Find the edges of leading/trailing whitespace as desired. 209 const typename STR::size_type last_char = input.length() - 1; 210 const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ? 211 input.find_first_not_of(trim_chars) : 0; 212 const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ? 213 input.find_last_not_of(trim_chars) : last_char; 214 215 // When the string was all whitespace, report that we stripped off whitespace 216 // from whichever position the caller was interested in. For empty input, we 217 // stripped no whitespace, but we still need to clear |output|. 218 if (input.empty() || 219 (first_good_char == STR::npos) || (last_good_char == STR::npos)) { 220 bool input_was_empty = input.empty(); // in case output == &input 221 output->clear(); 222 return input_was_empty ? TRIM_NONE : positions; 223 } 224 225 // Trim the whitespace. 226 *output = 227 input.substr(first_good_char, last_good_char - first_good_char + 1); 228 229 // Return where we trimmed from. 230 return static_cast<TrimPositions>( 231 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | 232 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); 233} 234 235bool TrimString(const std::wstring& input, 236 const wchar_t trim_chars[], 237 std::wstring* output) { 238 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 239} 240 241#if !defined(WCHAR_T_IS_UTF16) 242bool TrimString(const string16& input, 243 const char16 trim_chars[], 244 string16* output) { 245 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 246} 247#endif 248 249bool TrimString(const std::string& input, 250 const char trim_chars[], 251 std::string* output) { 252 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 253} 254 255void TruncateUTF8ToByteSize(const std::string& input, 256 const size_t byte_size, 257 std::string* output) { 258 DCHECK(output); 259 if (byte_size > input.length()) { 260 *output = input; 261 return; 262 } 263 DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); 264 // Note: This cast is necessary because CBU8_NEXT uses int32s. 265 int32 truncation_length = static_cast<int32>(byte_size); 266 int32 char_index = truncation_length - 1; 267 const char* data = input.data(); 268 269 // Using CBU8, we will move backwards from the truncation point 270 // to the beginning of the string looking for a valid UTF8 271 // character. Once a full UTF8 character is found, we will 272 // truncate the string to the end of that character. 273 while (char_index >= 0) { 274 int32 prev = char_index; 275 uint32 code_point = 0; 276 CBU8_NEXT(data, char_index, truncation_length, code_point); 277 if (!base::IsValidCharacter(code_point) || 278 !base::IsValidCodepoint(code_point)) { 279 char_index = prev - 1; 280 } else { 281 break; 282 } 283 } 284 285 if (char_index >= 0 ) 286 *output = input.substr(0, char_index); 287 else 288 output->clear(); 289} 290 291TrimPositions TrimWhitespace(const std::wstring& input, 292 TrimPositions positions, 293 std::wstring* output) { 294 return TrimStringT(input, kWhitespaceWide, positions, output); 295} 296 297#if !defined(WCHAR_T_IS_UTF16) 298TrimPositions TrimWhitespace(const string16& input, 299 TrimPositions positions, 300 string16* output) { 301 return TrimStringT(input, kWhitespaceUTF16, positions, output); 302} 303#endif 304 305TrimPositions TrimWhitespaceASCII(const std::string& input, 306 TrimPositions positions, 307 std::string* output) { 308 return TrimStringT(input, kWhitespaceASCII, positions, output); 309} 310 311// This function is only for backward-compatibility. 312// To be removed when all callers are updated. 313TrimPositions TrimWhitespace(const std::string& input, 314 TrimPositions positions, 315 std::string* output) { 316 return TrimWhitespaceASCII(input, positions, output); 317} 318 319template<typename STR> 320STR CollapseWhitespaceT(const STR& text, 321 bool trim_sequences_with_line_breaks) { 322 STR result; 323 result.resize(text.size()); 324 325 // Set flags to pretend we're already in a trimmed whitespace sequence, so we 326 // will trim any leading whitespace. 327 bool in_whitespace = true; 328 bool already_trimmed = true; 329 330 int chars_written = 0; 331 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { 332 if (IsWhitespace(*i)) { 333 if (!in_whitespace) { 334 // Reduce all whitespace sequences to a single space. 335 in_whitespace = true; 336 result[chars_written++] = L' '; 337 } 338 if (trim_sequences_with_line_breaks && !already_trimmed && 339 ((*i == '\n') || (*i == '\r'))) { 340 // Whitespace sequences containing CR or LF are eliminated entirely. 341 already_trimmed = true; 342 --chars_written; 343 } 344 } else { 345 // Non-whitespace chracters are copied straight across. 346 in_whitespace = false; 347 already_trimmed = false; 348 result[chars_written++] = *i; 349 } 350 } 351 352 if (in_whitespace && !already_trimmed) { 353 // Any trailing whitespace is eliminated. 354 --chars_written; 355 } 356 357 result.resize(chars_written); 358 return result; 359} 360 361std::wstring CollapseWhitespace(const std::wstring& text, 362 bool trim_sequences_with_line_breaks) { 363 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 364} 365 366#if !defined(WCHAR_T_IS_UTF16) 367string16 CollapseWhitespace(const string16& text, 368 bool trim_sequences_with_line_breaks) { 369 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 370} 371#endif 372 373std::string CollapseWhitespaceASCII(const std::string& text, 374 bool trim_sequences_with_line_breaks) { 375 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 376} 377 378bool ContainsOnlyWhitespaceASCII(const std::string& str) { 379 for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) { 380 if (!IsAsciiWhitespace(*i)) 381 return false; 382 } 383 return true; 384} 385 386bool ContainsOnlyWhitespace(const string16& str) { 387 for (string16::const_iterator i(str.begin()); i != str.end(); ++i) { 388 if (!IsWhitespace(*i)) 389 return false; 390 } 391 return true; 392} 393 394template<typename STR> 395static bool ContainsOnlyCharsT(const STR& input, const STR& characters) { 396 for (typename STR::const_iterator iter = input.begin(); 397 iter != input.end(); ++iter) { 398 if (characters.find(*iter) == STR::npos) 399 return false; 400 } 401 return true; 402} 403 404bool ContainsOnlyChars(const std::wstring& input, 405 const std::wstring& characters) { 406 return ContainsOnlyCharsT(input, characters); 407} 408 409#if !defined(WCHAR_T_IS_UTF16) 410bool ContainsOnlyChars(const string16& input, const string16& characters) { 411 return ContainsOnlyCharsT(input, characters); 412} 413#endif 414 415bool ContainsOnlyChars(const std::string& input, 416 const std::string& characters) { 417 return ContainsOnlyCharsT(input, characters); 418} 419 420std::string WideToASCII(const std::wstring& wide) { 421 DCHECK(IsStringASCII(wide)) << wide; 422 return std::string(wide.begin(), wide.end()); 423} 424 425std::string UTF16ToASCII(const string16& utf16) { 426 DCHECK(IsStringASCII(utf16)) << utf16; 427 return std::string(utf16.begin(), utf16.end()); 428} 429 430// Latin1 is just the low range of Unicode, so we can copy directly to convert. 431bool WideToLatin1(const std::wstring& wide, std::string* latin1) { 432 std::string output; 433 output.resize(wide.size()); 434 latin1->clear(); 435 for (size_t i = 0; i < wide.size(); i++) { 436 if (wide[i] > 255) 437 return false; 438 output[i] = static_cast<char>(wide[i]); 439 } 440 latin1->swap(output); 441 return true; 442} 443 444bool IsString8Bit(const std::wstring& str) { 445 for (size_t i = 0; i < str.length(); i++) { 446 if (str[i] > 255) 447 return false; 448 } 449 return true; 450} 451 452template<class STR> 453static bool DoIsStringASCII(const STR& str) { 454 for (size_t i = 0; i < str.length(); i++) { 455 typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i]; 456 if (c > 0x7F) 457 return false; 458 } 459 return true; 460} 461 462bool IsStringASCII(const std::wstring& str) { 463 return DoIsStringASCII(str); 464} 465 466#if !defined(WCHAR_T_IS_UTF16) 467bool IsStringASCII(const string16& str) { 468 return DoIsStringASCII(str); 469} 470#endif 471 472bool IsStringASCII(const base::StringPiece& str) { 473 return DoIsStringASCII(str); 474} 475 476bool IsStringUTF8(const std::string& str) { 477 const char *src = str.data(); 478 int32 src_len = static_cast<int32>(str.length()); 479 int32 char_index = 0; 480 481 while (char_index < src_len) { 482 int32 code_point; 483 CBU8_NEXT(src, char_index, src_len, code_point); 484 if (!base::IsValidCharacter(code_point)) 485 return false; 486 } 487 return true; 488} 489 490template<typename Iter> 491static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 492 Iter a_end, 493 const char* b) { 494 for (Iter it = a_begin; it != a_end; ++it, ++b) { 495 if (!*b || ToLowerASCII(*it) != *b) 496 return false; 497 } 498 return *b == 0; 499} 500 501// Front-ends for LowerCaseEqualsASCII. 502bool LowerCaseEqualsASCII(const std::string& a, const char* b) { 503 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 504} 505 506bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) { 507 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 508} 509 510#if !defined(WCHAR_T_IS_UTF16) 511bool LowerCaseEqualsASCII(const string16& a, const char* b) { 512 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 513} 514#endif 515 516bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 517 std::string::const_iterator a_end, 518 const char* b) { 519 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 520} 521 522bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, 523 std::wstring::const_iterator a_end, 524 const char* b) { 525 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 526} 527 528#if !defined(WCHAR_T_IS_UTF16) 529bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 530 string16::const_iterator a_end, 531 const char* b) { 532 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 533} 534#endif 535 536#if !defined(ANDROID) 537bool LowerCaseEqualsASCII(const char* a_begin, 538 const char* a_end, 539 const char* b) { 540 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 541} 542#endif // !ANDROID 543 544#if !defined(ANDROID) 545bool LowerCaseEqualsASCII(const wchar_t* a_begin, 546 const wchar_t* a_end, 547 const char* b) { 548 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 549} 550#endif // !ANDROID 551 552#if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID) 553bool LowerCaseEqualsASCII(const char16* a_begin, 554 const char16* a_end, 555 const char* b) { 556 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 557} 558#endif 559 560bool EqualsASCII(const string16& a, const base::StringPiece& b) { 561 if (a.length() != b.length()) 562 return false; 563 return std::equal(b.begin(), b.end(), a.begin()); 564} 565 566bool StartsWithASCII(const std::string& str, 567 const std::string& search, 568 bool case_sensitive) { 569 if (case_sensitive) 570 return str.compare(0, search.length(), search) == 0; 571 else 572 return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0; 573} 574 575template <typename STR> 576bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) { 577 if (case_sensitive) { 578 return str.compare(0, search.length(), search) == 0; 579 } else { 580 if (search.size() > str.size()) 581 return false; 582 return std::equal(search.begin(), search.end(), str.begin(), 583 CaseInsensitiveCompare<typename STR::value_type>()); 584 } 585} 586 587bool StartsWith(const std::wstring& str, const std::wstring& search, 588 bool case_sensitive) { 589 return StartsWithT(str, search, case_sensitive); 590} 591 592#if !defined(WCHAR_T_IS_UTF16) 593bool StartsWith(const string16& str, const string16& search, 594 bool case_sensitive) { 595 return StartsWithT(str, search, case_sensitive); 596} 597#endif 598 599template <typename STR> 600bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) { 601 typename STR::size_type str_length = str.length(); 602 typename STR::size_type search_length = search.length(); 603 if (search_length > str_length) 604 return false; 605 if (case_sensitive) { 606 return str.compare(str_length - search_length, search_length, search) == 0; 607 } else { 608 return std::equal(search.begin(), search.end(), 609 str.begin() + (str_length - search_length), 610 CaseInsensitiveCompare<typename STR::value_type>()); 611 } 612} 613 614bool EndsWith(const std::string& str, const std::string& search, 615 bool case_sensitive) { 616 return EndsWithT(str, search, case_sensitive); 617} 618 619bool EndsWith(const std::wstring& str, const std::wstring& search, 620 bool case_sensitive) { 621 return EndsWithT(str, search, case_sensitive); 622} 623 624#if !defined(WCHAR_T_IS_UTF16) 625bool EndsWith(const string16& str, const string16& search, 626 bool case_sensitive) { 627 return EndsWithT(str, search, case_sensitive); 628} 629#endif 630 631DataUnits GetByteDisplayUnits(int64 bytes) { 632 // The byte thresholds at which we display amounts. A byte count is displayed 633 // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1]. 634 // This must match the DataUnits enum. 635 static const int64 kUnitThresholds[] = { 636 0, // DATA_UNITS_BYTE, 637 3*1024, // DATA_UNITS_KIBIBYTE, 638 2*1024*1024, // DATA_UNITS_MEBIBYTE, 639 1024*1024*1024 // DATA_UNITS_GIBIBYTE, 640 }; 641 642 if (bytes < 0) { 643 NOTREACHED() << "Negative bytes value"; 644 return DATA_UNITS_BYTE; 645 } 646 647 int unit_index = arraysize(kUnitThresholds); 648 while (--unit_index > 0) { 649 if (bytes >= kUnitThresholds[unit_index]) 650 break; 651 } 652 653 DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE); 654 return DataUnits(unit_index); 655} 656 657// TODO(mpcomplete): deal with locale 658// Byte suffixes. This must match the DataUnits enum. 659static const char* const kByteStrings[] = { 660 "B", 661 "kB", 662 "MB", 663 "GB" 664}; 665 666static const char* const kSpeedStrings[] = { 667 "B/s", 668 "kB/s", 669 "MB/s", 670 "GB/s" 671}; 672 673string16 FormatBytesInternal(int64 bytes, 674 DataUnits units, 675 bool show_units, 676 const char* const* suffix) { 677 if (bytes < 0) { 678 NOTREACHED() << "Negative bytes value"; 679 return string16(); 680 } 681 682 DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE); 683 684 // Put the quantity in the right units. 685 double unit_amount = static_cast<double>(bytes); 686 for (int i = 0; i < units; ++i) 687 unit_amount /= 1024.0; 688 689 char buf[64]; 690 if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100) 691 base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount); 692 else 693 base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount); 694 695 std::string ret(buf); 696 if (show_units) { 697 ret += " "; 698 ret += suffix[units]; 699 } 700 701 return ASCIIToUTF16(ret); 702} 703 704string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) { 705 return FormatBytesInternal(bytes, units, show_units, kByteStrings); 706} 707 708string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) { 709 return FormatBytesInternal(bytes, units, show_units, kSpeedStrings); 710} 711 712template<class StringType> 713void DoReplaceSubstringsAfterOffset(StringType* str, 714 typename StringType::size_type start_offset, 715 const StringType& find_this, 716 const StringType& replace_with, 717 bool replace_all) { 718 if ((start_offset == StringType::npos) || (start_offset >= str->length())) 719 return; 720 721 DCHECK(!find_this.empty()); 722 for (typename StringType::size_type offs(str->find(find_this, start_offset)); 723 offs != StringType::npos; offs = str->find(find_this, offs)) { 724 str->replace(offs, find_this.length(), replace_with); 725 offs += replace_with.length(); 726 727 if (!replace_all) 728 break; 729 } 730} 731 732void ReplaceFirstSubstringAfterOffset(string16* str, 733 string16::size_type start_offset, 734 const string16& find_this, 735 const string16& replace_with) { 736 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 737 false); // replace first instance 738} 739 740void ReplaceFirstSubstringAfterOffset(std::string* str, 741 std::string::size_type start_offset, 742 const std::string& find_this, 743 const std::string& replace_with) { 744 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 745 false); // replace first instance 746} 747 748void ReplaceSubstringsAfterOffset(string16* str, 749 string16::size_type start_offset, 750 const string16& find_this, 751 const string16& replace_with) { 752 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 753 true); // replace all instances 754} 755 756void ReplaceSubstringsAfterOffset(std::string* str, 757 std::string::size_type start_offset, 758 const std::string& find_this, 759 const std::string& replace_with) { 760 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 761 true); // replace all instances 762} 763 764 765template<typename STR> 766static size_t TokenizeT(const STR& str, 767 const STR& delimiters, 768 std::vector<STR>* tokens) { 769 tokens->clear(); 770 771 typename STR::size_type start = str.find_first_not_of(delimiters); 772 while (start != STR::npos) { 773 typename STR::size_type end = str.find_first_of(delimiters, start + 1); 774 if (end == STR::npos) { 775 tokens->push_back(str.substr(start)); 776 break; 777 } else { 778 tokens->push_back(str.substr(start, end - start)); 779 start = str.find_first_not_of(delimiters, end + 1); 780 } 781 } 782 783 return tokens->size(); 784} 785 786size_t Tokenize(const std::wstring& str, 787 const std::wstring& delimiters, 788 std::vector<std::wstring>* tokens) { 789 return TokenizeT(str, delimiters, tokens); 790} 791 792#if !defined(WCHAR_T_IS_UTF16) 793size_t Tokenize(const string16& str, 794 const string16& delimiters, 795 std::vector<string16>* tokens) { 796 return TokenizeT(str, delimiters, tokens); 797} 798#endif 799 800size_t Tokenize(const std::string& str, 801 const std::string& delimiters, 802 std::vector<std::string>* tokens) { 803 return TokenizeT(str, delimiters, tokens); 804} 805 806size_t Tokenize(const base::StringPiece& str, 807 const base::StringPiece& delimiters, 808 std::vector<base::StringPiece>* tokens) { 809 return TokenizeT(str, delimiters, tokens); 810} 811 812template<typename STR> 813static STR JoinStringT(const std::vector<STR>& parts, 814 typename STR::value_type sep) { 815 if (parts.size() == 0) return STR(); 816 817 STR result(parts[0]); 818 typename std::vector<STR>::const_iterator iter = parts.begin(); 819 ++iter; 820 821 for (; iter != parts.end(); ++iter) { 822 result += sep; 823 result += *iter; 824 } 825 826 return result; 827} 828 829std::string JoinString(const std::vector<std::string>& parts, char sep) { 830 return JoinStringT(parts, sep); 831} 832 833#if !defined(WCHAR_T_IS_UTF16) 834string16 JoinString(const std::vector<string16>& parts, char16 sep) { 835 return JoinStringT(parts, sep); 836} 837#endif 838 839std::wstring JoinString(const std::vector<std::wstring>& parts, wchar_t sep) { 840 return JoinStringT(parts, sep); 841} 842 843template<typename STR> 844void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) { 845 const size_t length = str.length(); 846 if (!length) 847 return; 848 849 bool last_was_ws = false; 850 size_t last_non_ws_start = 0; 851 for (size_t i = 0; i < length; ++i) { 852 switch (str[i]) { 853 // HTML 5 defines whitespace as: space, tab, LF, line tab, FF, or CR. 854 case L' ': 855 case L'\t': 856 case L'\xA': 857 case L'\xB': 858 case L'\xC': 859 case L'\xD': 860 if (!last_was_ws) { 861 if (i > 0) { 862 result->push_back( 863 str.substr(last_non_ws_start, i - last_non_ws_start)); 864 } 865 last_was_ws = true; 866 } 867 break; 868 869 default: // Not a space character. 870 if (last_was_ws) { 871 last_was_ws = false; 872 last_non_ws_start = i; 873 } 874 break; 875 } 876 } 877 if (!last_was_ws) { 878 result->push_back( 879 str.substr(last_non_ws_start, length - last_non_ws_start)); 880 } 881} 882 883void SplitStringAlongWhitespace(const std::wstring& str, 884 std::vector<std::wstring>* result) { 885 SplitStringAlongWhitespaceT(str, result); 886} 887 888#if !defined(WCHAR_T_IS_UTF16) 889void SplitStringAlongWhitespace(const string16& str, 890 std::vector<string16>* result) { 891 SplitStringAlongWhitespaceT(str, result); 892} 893#endif 894 895void SplitStringAlongWhitespace(const std::string& str, 896 std::vector<std::string>* result) { 897 SplitStringAlongWhitespaceT(str, result); 898} 899 900template<class FormatStringType, class OutStringType> 901OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string, 902 const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) { 903 size_t substitutions = subst.size(); 904 DCHECK(substitutions < 10); 905 906 size_t sub_length = 0; 907 for (typename std::vector<OutStringType>::const_iterator iter = subst.begin(); 908 iter != subst.end(); ++iter) { 909 sub_length += (*iter).length(); 910 } 911 912 OutStringType formatted; 913 formatted.reserve(format_string.length() + sub_length); 914 915 std::vector<ReplacementOffset> r_offsets; 916 for (typename FormatStringType::const_iterator i = format_string.begin(); 917 i != format_string.end(); ++i) { 918 if ('$' == *i) { 919 if (i + 1 != format_string.end()) { 920 ++i; 921 DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i; 922 if ('$' == *i) { 923 while (i != format_string.end() && '$' == *i) { 924 formatted.push_back('$'); 925 ++i; 926 } 927 --i; 928 } else { 929 uintptr_t index = *i - '1'; 930 if (offsets) { 931 ReplacementOffset r_offset(index, 932 static_cast<int>(formatted.size())); 933 r_offsets.insert(std::lower_bound(r_offsets.begin(), 934 r_offsets.end(), r_offset, 935 &CompareParameter), 936 r_offset); 937 } 938 if (index < substitutions) 939 formatted.append(subst.at(index)); 940 } 941 } 942 } else { 943 formatted.push_back(*i); 944 } 945 } 946 if (offsets) { 947 for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin(); 948 i != r_offsets.end(); ++i) { 949 offsets->push_back(i->offset); 950 } 951 } 952 return formatted; 953} 954 955string16 ReplaceStringPlaceholders(const string16& format_string, 956 const std::vector<string16>& subst, 957 std::vector<size_t>* offsets) { 958 return DoReplaceStringPlaceholders(format_string, subst, offsets); 959} 960 961std::string ReplaceStringPlaceholders(const base::StringPiece& format_string, 962 const std::vector<std::string>& subst, 963 std::vector<size_t>* offsets) { 964 return DoReplaceStringPlaceholders(format_string, subst, offsets); 965} 966 967string16 ReplaceStringPlaceholders(const string16& format_string, 968 const string16& a, 969 size_t* offset) { 970 std::vector<size_t> offsets; 971 std::vector<string16> subst; 972 subst.push_back(a); 973 string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); 974 975 DCHECK(offsets.size() == 1); 976 if (offset) { 977 *offset = offsets[0]; 978 } 979 return result; 980} 981 982static bool IsWildcard(base_icu::UChar32 character) { 983 return character == '*' || character == '?'; 984} 985 986// Move the strings pointers to the point where they start to differ. 987template <typename CHAR, typename NEXT> 988static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, 989 const CHAR** string, const CHAR* string_end, 990 NEXT next) { 991 const CHAR* escape = NULL; 992 while (*pattern != pattern_end && *string != string_end) { 993 if (!escape && IsWildcard(**pattern)) { 994 // We don't want to match wildcard here, except if it's escaped. 995 return; 996 } 997 998 // Check if the escapement char is found. If so, skip it and move to the 999 // next character. 1000 if (!escape && **pattern == '\\') { 1001 escape = *pattern; 1002 next(pattern, pattern_end); 1003 continue; 1004 } 1005 1006 // Check if the chars match, if so, increment the ptrs. 1007 const CHAR* pattern_next = *pattern; 1008 const CHAR* string_next = *string; 1009 base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); 1010 if (pattern_char == next(&string_next, string_end) && 1011 pattern_char != (base_icu::UChar32) CBU_SENTINEL) { 1012 *pattern = pattern_next; 1013 *string = string_next; 1014 } else { 1015 // Uh ho, it did not match, we are done. If the last char was an 1016 // escapement, that means that it was an error to advance the ptr here, 1017 // let's put it back where it was. This also mean that the MatchPattern 1018 // function will return false because if we can't match an escape char 1019 // here, then no one will. 1020 if (escape) { 1021 *pattern = escape; 1022 } 1023 return; 1024 } 1025 1026 escape = NULL; 1027 } 1028} 1029 1030template <typename CHAR, typename NEXT> 1031static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { 1032 while (*pattern != end) { 1033 if (!IsWildcard(**pattern)) 1034 return; 1035 next(pattern, end); 1036 } 1037} 1038 1039template <typename CHAR, typename NEXT> 1040static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, 1041 const CHAR* pattern, const CHAR* pattern_end, 1042 int depth, 1043 NEXT next) { 1044 const int kMaxDepth = 16; 1045 if (depth > kMaxDepth) 1046 return false; 1047 1048 // Eat all the matching chars. 1049 EatSameChars(&pattern, pattern_end, &eval, eval_end, next); 1050 1051 // If the string is empty, then the pattern must be empty too, or contains 1052 // only wildcards. 1053 if (eval == eval_end) { 1054 EatWildcard(&pattern, pattern_end, next); 1055 return pattern == pattern_end; 1056 } 1057 1058 // Pattern is empty but not string, this is not a match. 1059 if (pattern == pattern_end) 1060 return false; 1061 1062 // If this is a question mark, then we need to compare the rest with 1063 // the current string or the string with one character eaten. 1064 const CHAR* next_pattern = pattern; 1065 next(&next_pattern, pattern_end); 1066 if (pattern[0] == '?') { 1067 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1068 depth + 1, next)) 1069 return true; 1070 const CHAR* next_eval = eval; 1071 next(&next_eval, eval_end); 1072 if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, 1073 depth + 1, next)) 1074 return true; 1075 } 1076 1077 // This is a *, try to match all the possible substrings with the remainder 1078 // of the pattern. 1079 if (pattern[0] == '*') { 1080 while (eval != eval_end) { 1081 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1082 depth + 1, next)) 1083 return true; 1084 eval++; 1085 } 1086 1087 // We reached the end of the string, let see if the pattern contains only 1088 // wildcards. 1089 if (eval == eval_end) { 1090 EatWildcard(&pattern, pattern_end, next); 1091 if (pattern != pattern_end) 1092 return false; 1093 return true; 1094 } 1095 } 1096 1097 return false; 1098} 1099 1100struct NextCharUTF8 { 1101 base_icu::UChar32 operator()(const char** p, const char* end) { 1102 base_icu::UChar32 c; 1103 int offset = 0; 1104 CBU8_NEXT(*p, offset, end - *p, c); 1105 *p += offset; 1106 return c; 1107 } 1108}; 1109 1110struct NextCharUTF16 { 1111 base_icu::UChar32 operator()(const char16** p, const char16* end) { 1112 base_icu::UChar32 c; 1113 int offset = 0; 1114 CBU16_NEXT(*p, offset, end - *p, c); 1115 *p += offset; 1116 return c; 1117 } 1118}; 1119 1120bool MatchPattern(const base::StringPiece& eval, 1121 const base::StringPiece& pattern) { 1122 return MatchPatternT(eval.data(), eval.data() + eval.size(), 1123 pattern.data(), pattern.data() + pattern.size(), 1124 0, NextCharUTF8()); 1125} 1126 1127bool MatchPattern(const string16& eval, const string16& pattern) { 1128 return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), 1129 pattern.c_str(), pattern.c_str() + pattern.size(), 1130 0, NextCharUTF16()); 1131} 1132 1133// The following code is compatible with the OpenBSD lcpy interface. See: 1134// http://www.gratisoft.us/todd/papers/strlcpy.html 1135// ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c 1136 1137namespace { 1138 1139template <typename CHAR> 1140size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { 1141 for (size_t i = 0; i < dst_size; ++i) { 1142 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 1143 return i; 1144 } 1145 1146 // We were left off at dst_size. We over copied 1 byte. Null terminate. 1147 if (dst_size != 0) 1148 dst[dst_size - 1] = 0; 1149 1150 // Count the rest of the |src|, and return it's length in characters. 1151 while (src[dst_size]) ++dst_size; 1152 return dst_size; 1153} 1154 1155} // namespace 1156 1157size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { 1158 return lcpyT<char>(dst, src, dst_size); 1159} 1160size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { 1161 return lcpyT<wchar_t>(dst, src, dst_size); 1162} 1163 1164bool ElideString(const std::wstring& input, int max_len, std::wstring* output) { 1165 DCHECK(max_len >= 0); 1166 if (static_cast<int>(input.length()) <= max_len) { 1167 output->assign(input); 1168 return false; 1169 } 1170 1171 switch (max_len) { 1172 case 0: 1173 output->clear(); 1174 break; 1175 case 1: 1176 output->assign(input.substr(0, 1)); 1177 break; 1178 case 2: 1179 output->assign(input.substr(0, 2)); 1180 break; 1181 case 3: 1182 output->assign(input.substr(0, 1) + L"." + 1183 input.substr(input.length() - 1)); 1184 break; 1185 case 4: 1186 output->assign(input.substr(0, 1) + L".." + 1187 input.substr(input.length() - 1)); 1188 break; 1189 default: { 1190 int rstr_len = (max_len - 3) / 2; 1191 int lstr_len = rstr_len + ((max_len - 3) % 2); 1192 output->assign(input.substr(0, lstr_len) + L"..." + 1193 input.substr(input.length() - rstr_len)); 1194 break; 1195 } 1196 } 1197 1198 return true; 1199} 1200