string_util.h revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This file defines utility functions for working with strings.

#ifndef BASE_STRING_UTIL_H_
#define BASE_STRING_UTIL_H_
#pragma once

#include <stdarg.h>   // va_list

#include <string>
#include <vector>

#include "base/basictypes.h"
#include "base/compiler_specific.h"
#include "base/string16.h"
#include "base/string_piece.h"  // For implicit conversions.

// TODO(brettw) remove this dependency. Previously StringPrintf lived in this
// file. We need to convert the callers over to using stringprintf.h instead
// and then remove this.
#include "base/stringprintf.h"

#ifdef RLZ_WIN_LIB_RLZ_LIB_H_
// TODO(tfarina): Fix the rlz library to include this instead and remove
// this include.
#include "base/string_split.h"
#endif  // RLZ_WIN_LIB_RLZ_LIB_H_

// Safe standard library wrappers for all platforms.

namespace base {

// C standard-library functions like "strncasecmp" and "snprintf" that aren't
// cross-platform are provided as "base::strncasecmp", and their prototypes
// are listed below. These functions are then implemented as inline calls
// to the platform-specific equivalents in the platform-specific headers.
//
// NOTE(review): the comparison functions below are documented as returning
// exactly 1 or -1 for unequal strings. The platform functions they forward to
// typically promise only the sign of a non-zero result, so callers should
// presumably test for < 0 / > 0 rather than == 1 / == -1 -- confirm against
// the platform-specific headers.

// Compares the two strings s1 and s2 without regard to case using
// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
// s2 > s1 according to a lexicographic comparison.
int strcasecmp(const char* s1, const char* s2);

// Compares up to count characters of s1 and s2 without regard to case using
// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
// s2 > s1 according to a lexicographic comparison.
int strncasecmp(const char* s1, const char* s2, size_t count);

// Same as strncmp but for char16 strings.
52int strncmp16(const char16* s1, const char16* s2, size_t count); 53 54// Wrapper for vsnprintf that always null-terminates and always returns the 55// number of characters that would be in an untruncated formatted 56// string, even when truncation occurs. 57int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments) 58 PRINTF_FORMAT(3, 0); 59 60// vswprintf always null-terminates, but when truncation occurs, it will either 61// return -1 or the number of characters that would be in an untruncated 62// formatted string. The actual return value depends on the underlying 63// C library's vswprintf implementation. 64int vswprintf(wchar_t* buffer, size_t size, 65 const wchar_t* format, va_list arguments) WPRINTF_FORMAT(3, 0); 66 67// Some of these implementations need to be inlined. 68 69// We separate the declaration from the implementation of this inline 70// function just so the PRINTF_FORMAT works. 71inline int snprintf(char* buffer, size_t size, const char* format, ...) 72 PRINTF_FORMAT(3, 4); 73inline int snprintf(char* buffer, size_t size, const char* format, ...) { 74 va_list arguments; 75 va_start(arguments, format); 76 int result = vsnprintf(buffer, size, format, arguments); 77 va_end(arguments); 78 return result; 79} 80 81// We separate the declaration from the implementation of this inline 82// function just so the WPRINTF_FORMAT works. 83inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) 84 WPRINTF_FORMAT(3, 4); 85inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) { 86 va_list arguments; 87 va_start(arguments, format); 88 int result = vswprintf(buffer, size, format, arguments); 89 va_end(arguments); 90 return result; 91} 92 93// BSD-style safe and consistent string copy functions. 94// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. 95// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as 96// long as |dst_size| is not 0. 
// Returns the length of |src| in characters.
// If the return value is >= dst_size, then the output was truncated.
// NOTE: All sizes are in number of characters, NOT in bytes.
size_t strlcpy(char* dst, const char* src, size_t dst_size);
size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size);

// Scan a wprintf format string to determine whether it's portable across a
// variety of systems. This function only checks that the conversion
// specifiers used by the format string are supported and have the same meaning
// on a variety of systems. It doesn't check for other errors that might occur
// within a format string.
//
// Nonportable conversion specifiers for wprintf are:
//  - 's' and 'c' without an 'l' length modifier. %s and %c operate on char
//    data on all systems except Windows, which treats them as wchar_t data.
//    Use %ls and %lc for wchar_t data instead.
//  - 'S' and 'C', which operate on wchar_t data on all systems except Windows,
//    which treats them as char data. Use %ls and %lc for wchar_t data
//    instead.
//  - 'F', which is not identified by Windows wprintf documentation.
//  - 'D', 'O', and 'U', which are deprecated and not available on all systems.
//    Use %ld, %lo, and %lu instead.
//
// Note that there is no portable conversion specifier for char data when
// working with wprintf.
//
// This function is intended to be called from base::vswprintf.
bool IsWprintfFormatPortable(const wchar_t* format);

}  // namespace base

// Pull in the platform-specific header; building for a platform that is
// neither OS_WIN nor OS_POSIX is a hard compile error.
#if defined(OS_WIN)
#include "base/string_util_win.h"
#elif defined(OS_POSIX)
#include "base/string_util_posix.h"
#else
#error Define string operations appropriately for your platform
#endif

// These threadsafe functions return references to globally unique empty
// strings.
//
// DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS.
// There is only one case where you should use these: functions which need to
// return a string by reference (e.g. as a class member accessor), and don't
// have an empty string to use (e.g. in an error case). These should not be
// used as initializers, function arguments, or return values for functions
// which return by value or outparam.
const std::string& EmptyString();
const std::wstring& EmptyWString();
const string16& EmptyString16();

// Whitespace character tables, one per character type. They are only declared
// here; their contents are defined elsewhere.
// NOTE(review): presumably NUL-terminated -- kWhitespaceWide is searched with
// wcschr() by IsWhitespace() in this file. Confirm against the definition.
extern const wchar_t kWhitespaceWide[];
extern const char16 kWhitespaceUTF16[];
extern const char kWhitespaceASCII[];

// NOTE(review): going by the name, the UTF-8 byte order mark; the value is
// defined elsewhere.
extern const char kUtf8ByteOrderMark[];

// Removes characters in remove_chars from anywhere in input. Returns true if
// any characters were removed.
// NOTE: Safe to use the same variable for both input and output.
bool RemoveChars(const std::wstring& input,
                 const wchar_t remove_chars[],
                 std::wstring* output);
bool RemoveChars(const string16& input,
                 const char16 remove_chars[],
                 string16* output);
bool RemoveChars(const std::string& input,
                 const char remove_chars[],
                 std::string* output);

// Removes characters in trim_chars from the beginning and end of input.
// NOTE: Safe to use the same variable for both input and output.
// NOTE(review): the meaning of the bool result is not stated here; presumably
// true when something was trimmed, mirroring RemoveChars -- confirm against
// the implementation.
bool TrimString(const std::wstring& input,
                const wchar_t trim_chars[],
                std::wstring* output);
bool TrimString(const string16& input,
                const char16 trim_chars[],
                string16* output);
bool TrimString(const std::string& input,
                const char trim_chars[],
                std::string* output);

// Truncates a string to the nearest UTF-8 character that will leave
// the string less than or equal to the specified byte size.
// The result is written to |output|.
void TruncateUTF8ToByteSize(const std::string& input,
                            const size_t byte_size,
                            std::string* output);

// Trims any whitespace from either end of the input string. Returns where
// whitespace was found.
// For non-wide (std::string) input there are two functions:
// * TrimWhitespaceASCII()
//   This function is for ASCII strings and only looks for ASCII whitespace.
// * TrimWhitespace() taking std::string
//   Deprecated; kept for backward compatibility and calls
//   TrimWhitespaceASCII().
// Please choose the best one according to your usage.
// NOTE: Safe to use the same variable for both input and output.

// Bit flags naming the ends of a string. The trimming functions below take
// them as the |positions| argument and also use them as the return type
// ("where whitespace was found"). TRIM_ALL is the union of the two ends.
enum TrimPositions {
  TRIM_NONE     = 0,
  TRIM_LEADING  = 1 << 0,
  TRIM_TRAILING = 1 << 1,
  TRIM_ALL      = TRIM_LEADING | TRIM_TRAILING,
};
TrimPositions TrimWhitespace(const std::wstring& input,
                             TrimPositions positions,
                             std::wstring* output);
TrimPositions TrimWhitespace(const string16& input,
                             TrimPositions positions,
                             string16* output);
TrimPositions TrimWhitespaceASCII(const std::string& input,
                                  TrimPositions positions,
                                  std::string* output);

// Deprecated. This function is only for backward compatibility and calls
// TrimWhitespaceASCII().
TrimPositions TrimWhitespace(const std::string& input,
                             TrimPositions positions,
                             std::string* output);

// Searches for CR or LF characters. Removes all contiguous whitespace
// strings that contain them. This is useful when trying to deal with text
// copied from terminals.
// Returns |text|, with the following three transformations:
// (1) Leading and trailing whitespace is trimmed.
// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace
//     sequences containing a CR or LF are trimmed.
// (3) All other whitespace sequences are converted to single spaces.
std::wstring CollapseWhitespace(const std::wstring& text,
                                bool trim_sequences_with_line_breaks);
string16 CollapseWhitespace(const string16& text,
                            bool trim_sequences_with_line_breaks);
std::string CollapseWhitespaceASCII(const std::string& text,
                                    bool trim_sequences_with_line_breaks);

// Returns true if the passed string is empty or contains only white-space
// characters.
bool ContainsOnlyWhitespaceASCII(const std::string& str);
bool ContainsOnlyWhitespace(const string16& str);

// Returns true if |input| is empty or contains only characters found in
// |characters|.
bool ContainsOnlyChars(const std::wstring& input,
                       const std::wstring& characters);
bool ContainsOnlyChars(const string16& input, const string16& characters);
bool ContainsOnlyChars(const std::string& input, const std::string& characters);

// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
// beforehand.
std::string WideToASCII(const std::wstring& wide);
std::string UTF16ToASCII(const string16& utf16);

// Converts the given wide string to the corresponding Latin1. This will fail
// (return false) if any characters are more than 255.
bool WideToLatin1(const std::wstring& wide, std::string* latin1);

// Returns true if the specified string matches the criteria. How can a wide
// string be 8-bit or UTF8? It contains only characters that are < 256 (in the
// first case) or characters that use only 8-bits and whose 8-bit
// representation looks like a UTF-8 string (the second case).
//
// Note that IsStringUTF8 checks not only if the input is structurally
// valid but also if it doesn't contain any non-character codepoint
// (e.g. U+FFFE). It's done on purpose because all the existing callers want
// to have the maximum 'discriminating' power from other encodings. If
// there's a use case for just checking the structural validity, we have to
// add a new function for that.
bool IsString8Bit(const std::wstring& str);
bool IsStringUTF8(const std::string& str);
bool IsStringASCII(const std::wstring& str);
bool IsStringASCII(const base::StringPiece& str);
bool IsStringASCII(const string16& str);

// ASCII-specific tolower. The standard library's tolower is locale sensitive,
// so we don't want to use it here.
269template <class Char> inline Char ToLowerASCII(Char c) { 270 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; 271} 272 273// Converts the elements of the given string. This version uses a pointer to 274// clearly differentiate it from the non-pointer variant. 275template <class str> inline void StringToLowerASCII(str* s) { 276 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 277 *i = ToLowerASCII(*i); 278} 279 280template <class str> inline str StringToLowerASCII(const str& s) { 281 // for std::string and std::wstring 282 str output(s); 283 StringToLowerASCII(&output); 284 return output; 285} 286 287// ASCII-specific toupper. The standard library's toupper is locale sensitive, 288// so we don't want to use it here. 289template <class Char> inline Char ToUpperASCII(Char c) { 290 return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c; 291} 292 293// Converts the elements of the given string. This version uses a pointer to 294// clearly differentiate it from the non-pointer variant. 295template <class str> inline void StringToUpperASCII(str* s) { 296 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 297 *i = ToUpperASCII(*i); 298} 299 300template <class str> inline str StringToUpperASCII(const str& s) { 301 // for std::string and std::wstring 302 str output(s); 303 StringToUpperASCII(&output); 304 return output; 305} 306 307// Compare the lower-case form of the given string against the given ASCII 308// string. This is useful for doing checking if an input string matches some 309// token, and it is optimized to avoid intermediate string copies. This API is 310// borrowed from the equivalent APIs in Mozilla. 311bool LowerCaseEqualsASCII(const std::string& a, const char* b); 312bool LowerCaseEqualsASCII(const std::wstring& a, const char* b); 313bool LowerCaseEqualsASCII(const string16& a, const char* b); 314 315// Same thing, but with string iterators instead. 
bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
                          std::string::const_iterator a_end,
                          const char* b);
bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
                          std::wstring::const_iterator a_end,
                          const char* b);
bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
                          string16::const_iterator a_end,
                          const char* b);
// Raw-pointer forms of the same comparison, over [a_begin, a_end).
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b);
bool LowerCaseEqualsASCII(const wchar_t* a_begin,
                          const wchar_t* a_end,
                          const char* b);
bool LowerCaseEqualsASCII(const char16* a_begin,
                          const char16* a_end,
                          const char* b);

// Performs a case-sensitive string compare. The behavior is undefined if both
// strings are not ASCII.
// NOTE(review): the sentence above is ambiguous; it presumably means the
// behavior is undefined unless both strings are ASCII -- confirm against the
// implementation.
bool EqualsASCII(const string16& a, const base::StringPiece& b);

// Returns true if str starts with search, or false otherwise.
bool StartsWithASCII(const std::string& str,
                     const std::string& search,
                     bool case_sensitive);
bool StartsWith(const std::wstring& str,
                const std::wstring& search,
                bool case_sensitive);
bool StartsWith(const string16& str,
                const string16& search,
                bool case_sensitive);

// Returns true if str ends with search, or false otherwise.
bool EndsWith(const std::string& str,
              const std::string& search,
              bool case_sensitive);
bool EndsWith(const std::wstring& str,
              const std::wstring& search,
              bool case_sensitive);
bool EndsWith(const string16& str,
              const string16& search,
              bool case_sensitive);


// Determines the type of ASCII character, independent of locale (the C
// library versions will change based on locale).
364template <typename Char> 365inline bool IsAsciiWhitespace(Char c) { 366 return c == ' ' || c == '\r' || c == '\n' || c == '\t'; 367} 368template <typename Char> 369inline bool IsAsciiAlpha(Char c) { 370 return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); 371} 372template <typename Char> 373inline bool IsAsciiDigit(Char c) { 374 return c >= '0' && c <= '9'; 375} 376 377template <typename Char> 378inline bool IsHexDigit(Char c) { 379 return (c >= '0' && c <= '9') || 380 (c >= 'A' && c <= 'F') || 381 (c >= 'a' && c <= 'f'); 382} 383 384template <typename Char> 385inline Char HexDigitToInt(Char c) { 386 DCHECK(IsHexDigit(c)); 387 if (c >= '0' && c <= '9') 388 return c - '0'; 389 if (c >= 'A' && c <= 'F') 390 return c - 'A' + 10; 391 if (c >= 'a' && c <= 'f') 392 return c - 'a' + 10; 393 return 0; 394} 395 396// Returns true if it's a whitespace character. 397inline bool IsWhitespace(wchar_t c) { 398 return wcschr(kWhitespaceWide, c) != NULL; 399} 400 401enum DataUnits { 402 DATA_UNITS_BYTE = 0, 403 DATA_UNITS_KIBIBYTE, 404 DATA_UNITS_MEBIBYTE, 405 DATA_UNITS_GIBIBYTE, 406}; 407 408// Return the unit type that is appropriate for displaying the amount of bytes 409// passed in. 410DataUnits GetByteDisplayUnits(int64 bytes); 411 412// Return a byte string in human-readable format, displayed in units appropriate 413// specified by 'units', with an optional unit suffix. 414// Ex: FormatBytes(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB" 415// Ex: FormatBytes(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1" 416string16 FormatBytes(int64 bytes, DataUnits units, bool show_units); 417 418// As above, but with "/s" units. 419// Ex: FormatSpeed(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB/s" 420// Ex: FormatSpeed(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1" 421string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units); 422 423// Return a number formated with separators in the user's locale way. 
// Ex: FormatNumber(1234567) => 1,234,567
string16 FormatNumber(int64 number);

// Starting at |start_offset| (usually 0), replace the first instance of
// |find_this| with |replace_with|.
// |str| is passed by pointer and nothing is returned, so the replacement is
// made in |*str| itself.
void ReplaceFirstSubstringAfterOffset(string16* str,
                                      string16::size_type start_offset,
                                      const string16& find_this,
                                      const string16& replace_with);
void ReplaceFirstSubstringAfterOffset(std::string* str,
                                      std::string::size_type start_offset,
                                      const std::string& find_this,
                                      const std::string& replace_with);

// Starting at |start_offset| (usually 0), look through |str| and replace all
// instances of |find_this| with |replace_with|.
//
// This does entire substrings; use std::replace in <algorithm> for single
// characters, for example:
//   std::replace(str.begin(), str.end(), 'a', 'b');
void ReplaceSubstringsAfterOffset(string16* str,
                                  string16::size_type start_offset,
                                  const string16& find_this,
                                  const string16& replace_with);
void ReplaceSubstringsAfterOffset(std::string* str,
                                  std::string::size_type start_offset,
                                  const std::string& find_this,
                                  const std::string& replace_with);

// This is mpcomplete's pattern for saving a string copy when dealing with
// a function that writes results into a wchar_t[] and wanting the result to
// end up in a std::wstring. It ensures that the std::wstring's internal
// buffer has enough room to store the characters to be written into it, and
// sets its .length() attribute to the right value.
//
// The reserve() call allocates the memory required to hold the string
// plus a terminating null. This is done because resize() isn't
// guaranteed to reserve space for the null. The resize() call is
// simply the only way to change the string's 'length' member.
//
// XXX-performance: the call to wide.resize() takes linear time, since it fills
// the string's buffer with nulls.
I call it to change the length of the 466// string (needed because writing directly to the buffer doesn't do this). 467// Perhaps there's a constant-time way to change the string's length. 468template <class string_type> 469inline typename string_type::value_type* WriteInto(string_type* str, 470 size_t length_with_null) { 471 str->reserve(length_with_null); 472 str->resize(length_with_null - 1); 473 return &((*str)[0]); 474} 475 476//----------------------------------------------------------------------------- 477 478// Function objects to aid in comparing/searching strings. 479 480template<typename Char> struct CaseInsensitiveCompare { 481 public: 482 bool operator()(Char x, Char y) const { 483 // TODO(darin): Do we really want to do locale sensitive comparisons here? 484 // See http://crbug.com/24917 485 return tolower(x) == tolower(y); 486 } 487}; 488 489template<typename Char> struct CaseInsensitiveCompareASCII { 490 public: 491 bool operator()(Char x, Char y) const { 492 return ToLowerASCII(x) == ToLowerASCII(y); 493 } 494}; 495 496// Splits a string into its fields delimited by any of the characters in 497// |delimiters|. Each field is added to the |tokens| vector. Returns the 498// number of tokens found. 499size_t Tokenize(const std::wstring& str, 500 const std::wstring& delimiters, 501 std::vector<std::wstring>* tokens); 502size_t Tokenize(const string16& str, 503 const string16& delimiters, 504 std::vector<string16>* tokens); 505size_t Tokenize(const std::string& str, 506 const std::string& delimiters, 507 std::vector<std::string>* tokens); 508size_t Tokenize(const base::StringPiece& str, 509 const base::StringPiece& delimiters, 510 std::vector<base::StringPiece>* tokens); 511 512// Does the opposite of SplitString(). 
std::wstring JoinString(const std::vector<std::wstring>& parts, wchar_t s);
string16 JoinString(const std::vector<string16>& parts, char16 s);
std::string JoinString(const std::vector<std::string>& parts, char s);

// WARNING: this uses whitespace as defined by the HTML5 spec. If you need
// a function similar to this but want to trim all types of whitespace, then
// factor this out into a function that takes a string containing the characters
// that are treated as whitespace.
//
// Splits the string along whitespace (where whitespace is the five space
// characters defined by HTML 5). Each contiguous block of non-whitespace
// characters is added to result.
void SplitStringAlongWhitespace(const std::wstring& str,
                                std::vector<std::wstring>* result);
void SplitStringAlongWhitespace(const string16& str,
                                std::vector<string16>* result);
void SplitStringAlongWhitespace(const std::string& str,
                                std::vector<std::string>* result);

// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively.
// Additionally, any number of consecutive '$' characters is replaced by that
// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be
// NULL. This only allows you to use up to nine replacements.
// NOTE(review): |a|..|i| presumably stand for the elements of |subst| in
// order ($1 -> subst[0], and so on) -- confirm against the implementation.
string16 ReplaceStringPlaceholders(const string16& format_string,
                                   const std::vector<string16>& subst,
                                   std::vector<size_t>* offsets);

std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
                                      const std::vector<std::string>& subst,
                                      std::vector<size_t>* offsets);

// Single-string shortcut for ReplaceStringPlaceholders. |offset| may be NULL.
string16 ReplaceStringPlaceholders(const string16& format_string,
                                   const string16& a,
                                   size_t* offset);

// If the size of |input| is more than |max_len|, this function returns true and
// |input| is shortened into |output| by removing chars in the middle (they are
// replaced with up to 3 dots, as size permits).
// Ex: ElideString(L"Hello", 10, &str) puts Hello in str and returns false.
// ElideString(L"Hello my name is Tom", 10, &str) puts "Hell...Tom" in str and
// returns true.
bool ElideString(const std::wstring& input, int max_len, std::wstring* output);

// Returns true if the string passed in matches the pattern. The pattern
// string can contain wildcards like * and ?
// The backslash character (\) is an escape character for * and ?
// We limit the patterns to having a max of 16 * or ? characters.
bool MatchPattern(const base::StringPiece& string,
                  const base::StringPiece& pattern);
bool MatchPattern(const string16& string, const string16& pattern);

// Hack to convert any char-like type to its unsigned counterpart.
// For example, it will convert char, signed char and unsigned char to unsigned
// char.
//
// The primary template maps a type to itself, which is what covers types
// that are already unsigned (such as unsigned char); the specializations
// below handle the signed or possibly-signed character types.
template<typename T>
struct ToUnsigned {
  typedef T Unsigned;
};

template<>
struct ToUnsigned<char> {
  typedef unsigned char Unsigned;
};
template<>
struct ToUnsigned<signed char> {
  typedef unsigned char Unsigned;
};
// The unsigned counterpart of wchar_t depends on its width. If neither
// WCHAR_T_IS_UTF16 nor WCHAR_T_IS_UTF32 is defined, this specialization
// declares no |Unsigned| member at all.
template<>
struct ToUnsigned<wchar_t> {
#if defined(WCHAR_T_IS_UTF16)
  typedef unsigned short Unsigned;
#elif defined(WCHAR_T_IS_UTF32)
  typedef uint32 Unsigned;
#endif
};
template<>
struct ToUnsigned<short> {
  typedef unsigned short Unsigned;
};

#endif  // BASE_STRING_UTIL_H_