// string_util.h revision 21d179b334e59e9a3bfcaed4c4430bef1bc5759d
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// This file defines utility functions for working with strings. 6 7#ifndef BASE_STRING_UTIL_H_ 8#define BASE_STRING_UTIL_H_ 9#pragma once 10 11#include <stdarg.h> // va_list 12 13#include <string> 14#include <vector> 15 16#include "base/basictypes.h" 17#include "base/compiler_specific.h" 18#include "base/string16.h" 19#include "base/string_piece.h" // For implicit conversions. 20 21// TODO(brettw) remove this dependency. Previously StringPrintf lived in this 22// file. We need to convert the callers over to using stringprintf.h instead 23// and then remove this. 24#include "base/stringprintf.h" 25 26// Safe standard library wrappers for all platforms. 27 28namespace base { 29 30// C standard-library functions like "strncasecmp" and "snprintf" that aren't 31// cross-platform are provided as "base::strncasecmp", and their prototypes 32// are listed below. These functions are then implemented as inline calls 33// to the platform-specific equivalents in the platform-specific headers. 34 35// Compares the two strings s1 and s2 without regard to case using 36// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if 37// s2 > s1 according to a lexicographic comparison. 38int strcasecmp(const char* s1, const char* s2); 39 40// Compares up to count characters of s1 and s2 without regard to case using 41// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if 42// s2 > s1 according to a lexicographic comparison. 43int strncasecmp(const char* s1, const char* s2, size_t count); 44 45// Same as strncmp but for char16 strings. 
46int strncmp16(const char16* s1, const char16* s2, size_t count); 47 48// Wrapper for vsnprintf that always null-terminates and always returns the 49// number of characters that would be in an untruncated formatted 50// string, even when truncation occurs. 51int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments) 52 PRINTF_FORMAT(3, 0); 53 54// vswprintf always null-terminates, but when truncation occurs, it will either 55// return -1 or the number of characters that would be in an untruncated 56// formatted string. The actual return value depends on the underlying 57// C library's vswprintf implementation. 58int vswprintf(wchar_t* buffer, size_t size, 59 const wchar_t* format, va_list arguments) WPRINTF_FORMAT(3, 0); 60 61// Some of these implementations need to be inlined. 62 63// We separate the declaration from the implementation of this inline 64// function just so the PRINTF_FORMAT works. 65inline int snprintf(char* buffer, size_t size, const char* format, ...) 66 PRINTF_FORMAT(3, 4); 67inline int snprintf(char* buffer, size_t size, const char* format, ...) { 68 va_list arguments; 69 va_start(arguments, format); 70 int result = vsnprintf(buffer, size, format, arguments); 71 va_end(arguments); 72 return result; 73} 74 75// We separate the declaration from the implementation of this inline 76// function just so the WPRINTF_FORMAT works. 77inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) 78 WPRINTF_FORMAT(3, 4); 79inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) { 80 va_list arguments; 81 va_start(arguments, format); 82 int result = vswprintf(buffer, size, format, arguments); 83 va_end(arguments); 84 return result; 85} 86 87// BSD-style safe and consistent string copy functions. 88// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. 89// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as 90// long as |dst_size| is not 0. 
Returns the length of |src| in characters. 91// If the return value is >= dst_size, then the output was truncated. 92// NOTE: All sizes are in number of characters, NOT in bytes. 93size_t strlcpy(char* dst, const char* src, size_t dst_size); 94size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size); 95 96// Scan a wprintf format string to determine whether it's portable across a 97// variety of systems. This function only checks that the conversion 98// specifiers used by the format string are supported and have the same meaning 99// on a variety of systems. It doesn't check for other errors that might occur 100// within a format string. 101// 102// Nonportable conversion specifiers for wprintf are: 103// - 's' and 'c' without an 'l' length modifier. %s and %c operate on char 104// data on all systems except Windows, which treat them as wchar_t data. 105// Use %ls and %lc for wchar_t data instead. 106// - 'S' and 'C', which operate on wchar_t data on all systems except Windows, 107// which treat them as char data. Use %ls and %lc for wchar_t data 108// instead. 109// - 'F', which is not identified by Windows wprintf documentation. 110// - 'D', 'O', and 'U', which are deprecated and not available on all systems. 111// Use %ld, %lo, and %lu instead. 112// 113// Note that there is no portable conversion specifier for char data when 114// working with wprintf. 115// 116// This function is intended to be called from base::vswprintf. 117bool IsWprintfFormatPortable(const wchar_t* format); 118 119// ASCII-specific tolower. The standard library's tolower is locale sensitive, 120// so we don't want to use it here. 121template <class Char> inline Char ToLowerASCII(Char c) { 122 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; 123} 124 125// ASCII-specific toupper. The standard library's toupper is locale sensitive, 126// so we don't want to use it here. 127template <class Char> inline Char ToUpperASCII(Char c) { 128 return (c >= 'a' && c <= 'z') ? 
(c + ('A' - 'a')) : c; 129} 130 131// Function objects to aid in comparing/searching strings. 132 133template<typename Char> struct CaseInsensitiveCompare { 134 public: 135 bool operator()(Char x, Char y) const { 136 // TODO(darin): Do we really want to do locale sensitive comparisons here? 137 // See http://crbug.com/24917 138 return tolower(x) == tolower(y); 139 } 140}; 141 142template<typename Char> struct CaseInsensitiveCompareASCII { 143 public: 144 bool operator()(Char x, Char y) const { 145 return ToLowerASCII(x) == ToLowerASCII(y); 146 } 147}; 148 149} // namespace base 150 151#if defined(OS_WIN) 152#include "base/string_util_win.h" 153#elif defined(OS_POSIX) 154#include "base/string_util_posix.h" 155#else 156#error Define string operations appropriately for your platform 157#endif 158 159// These threadsafe functions return references to globally unique empty 160// strings. 161// 162// DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS. 163// There is only one case where you should use these: functions which need to 164// return a string by reference (e.g. as a class member accessor), and don't 165// have an empty string to use (e.g. in an error case). These should not be 166// used as initializers, function arguments, or return values for functions 167// which return by value or outparam. 168const std::string& EmptyString(); 169const std::wstring& EmptyWString(); 170const string16& EmptyString16(); 171 172extern const wchar_t kWhitespaceWide[]; 173extern const char16 kWhitespaceUTF16[]; 174extern const char kWhitespaceASCII[]; 175 176extern const char kUtf8ByteOrderMark[]; 177 178// Removes characters in remove_chars from anywhere in input. Returns true if 179// any characters were removed. 180// NOTE: Safe to use the same variable for both input and output. 
181bool RemoveChars(const std::wstring& input, 182 const wchar_t remove_chars[], 183 std::wstring* output); 184bool RemoveChars(const string16& input, 185 const char16 remove_chars[], 186 string16* output); 187bool RemoveChars(const std::string& input, 188 const char remove_chars[], 189 std::string* output); 190 191// Removes characters in trim_chars from the beginning and end of input. 192// NOTE: Safe to use the same variable for both input and output. 193bool TrimString(const std::wstring& input, 194 const wchar_t trim_chars[], 195 std::wstring* output); 196bool TrimString(const string16& input, 197 const char16 trim_chars[], 198 string16* output); 199bool TrimString(const std::string& input, 200 const char trim_chars[], 201 std::string* output); 202 203// Truncates a string to the nearest UTF-8 character that will leave 204// the string less than or equal to the specified byte size. 205void TruncateUTF8ToByteSize(const std::string& input, 206 const size_t byte_size, 207 std::string* output); 208 209// Trims any whitespace from either end of the input string. Returns where 210// whitespace was found. 211// The non-wide version has two functions: 212// * TrimWhitespaceASCII() 213// This function is for ASCII strings and only looks for ASCII whitespace; 214// Please choose the best one according to your usage. 215// NOTE: Safe to use the same variable for both input and output. 216enum TrimPositions { 217 TRIM_NONE = 0, 218 TRIM_LEADING = 1 << 0, 219 TRIM_TRAILING = 1 << 1, 220 TRIM_ALL = TRIM_LEADING | TRIM_TRAILING, 221}; 222TrimPositions TrimWhitespace(const std::wstring& input, 223 TrimPositions positions, 224 std::wstring* output); 225TrimPositions TrimWhitespace(const string16& input, 226 TrimPositions positions, 227 string16* output); 228TrimPositions TrimWhitespaceASCII(const std::string& input, 229 TrimPositions positions, 230 std::string* output); 231 232// Deprecated. 
This function is only for backward compatibility and calls 233// TrimWhitespaceASCII(). 234TrimPositions TrimWhitespace(const std::string& input, 235 TrimPositions positions, 236 std::string* output); 237 238// Searches for CR or LF characters. Removes all contiguous whitespace 239// strings that contain them. This is useful when trying to deal with text 240// copied from terminals. 241// Returns |text|, with the following three transformations: 242// (1) Leading and trailing whitespace is trimmed. 243// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace 244// sequences containing a CR or LF are trimmed. 245// (3) All other whitespace sequences are converted to single spaces. 246std::wstring CollapseWhitespace(const std::wstring& text, 247 bool trim_sequences_with_line_breaks); 248string16 CollapseWhitespace(const string16& text, 249 bool trim_sequences_with_line_breaks); 250std::string CollapseWhitespaceASCII(const std::string& text, 251 bool trim_sequences_with_line_breaks); 252 253// Returns true if the passed string is empty or contains only white-space 254// characters. 255bool ContainsOnlyWhitespaceASCII(const std::string& str); 256bool ContainsOnlyWhitespace(const string16& str); 257 258// Returns true if |input| is empty or contains only characters found in 259// |characters|. 260bool ContainsOnlyChars(const std::wstring& input, 261 const std::wstring& characters); 262bool ContainsOnlyChars(const string16& input, const string16& characters); 263bool ContainsOnlyChars(const std::string& input, const std::string& characters); 264 265// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII 266// beforehand. 267std::string WideToASCII(const std::wstring& wide); 268std::string UTF16ToASCII(const string16& utf16); 269 270// Converts the given wide string to the corresponding Latin1. This will fail 271// (return false) if any characters are more than 255. 
272bool WideToLatin1(const std::wstring& wide, std::string* latin1); 273 274// Returns true if the specified string matches the criteria. How can a wide 275// string be 8-bit or UTF8? It contains only characters that are < 256 (in the 276// first case) or characters that use only 8-bits and whose 8-bit 277// representation looks like a UTF-8 string (the second case). 278// 279// Note that IsStringUTF8 checks not only if the input is structrually 280// valid but also if it doesn't contain any non-character codepoint 281// (e.g. U+FFFE). It's done on purpose because all the existing callers want 282// to have the maximum 'discriminating' power from other encodings. If 283// there's a use case for just checking the structural validity, we have to 284// add a new function for that. 285bool IsStringUTF8(const std::string& str); 286bool IsStringASCII(const std::wstring& str); 287bool IsStringASCII(const base::StringPiece& str); 288bool IsStringASCII(const string16& str); 289 290// Converts the elements of the given string. This version uses a pointer to 291// clearly differentiate it from the non-pointer variant. 292template <class str> inline void StringToLowerASCII(str* s) { 293 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 294 *i = base::ToLowerASCII(*i); 295} 296 297template <class str> inline str StringToLowerASCII(const str& s) { 298 // for std::string and std::wstring 299 str output(s); 300 StringToLowerASCII(&output); 301 return output; 302} 303 304// Converts the elements of the given string. This version uses a pointer to 305// clearly differentiate it from the non-pointer variant. 
306template <class str> inline void StringToUpperASCII(str* s) { 307 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 308 *i = base::ToUpperASCII(*i); 309} 310 311template <class str> inline str StringToUpperASCII(const str& s) { 312 // for std::string and std::wstring 313 str output(s); 314 StringToUpperASCII(&output); 315 return output; 316} 317 318// Compare the lower-case form of the given string against the given ASCII 319// string. This is useful for doing checking if an input string matches some 320// token, and it is optimized to avoid intermediate string copies. This API is 321// borrowed from the equivalent APIs in Mozilla. 322bool LowerCaseEqualsASCII(const std::string& a, const char* b); 323bool LowerCaseEqualsASCII(const std::wstring& a, const char* b); 324bool LowerCaseEqualsASCII(const string16& a, const char* b); 325 326// Same thing, but with string iterators instead. 327bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 328 std::string::const_iterator a_end, 329 const char* b); 330bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, 331 std::wstring::const_iterator a_end, 332 const char* b); 333bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 334 string16::const_iterator a_end, 335 const char* b); 336bool LowerCaseEqualsASCII(const char* a_begin, 337 const char* a_end, 338 const char* b); 339bool LowerCaseEqualsASCII(const wchar_t* a_begin, 340 const wchar_t* a_end, 341 const char* b); 342bool LowerCaseEqualsASCII(const char16* a_begin, 343 const char16* a_end, 344 const char* b); 345 346// Performs a case-sensitive string compare. The behavior is undefined if both 347// strings are not ASCII. 348bool EqualsASCII(const string16& a, const base::StringPiece& b); 349 350// Returns true if str starts with search, or false otherwise. 
351bool StartsWithASCII(const std::string& str, 352 const std::string& search, 353 bool case_sensitive); 354bool StartsWith(const std::wstring& str, 355 const std::wstring& search, 356 bool case_sensitive); 357bool StartsWith(const string16& str, 358 const string16& search, 359 bool case_sensitive); 360 361// Returns true if str ends with search, or false otherwise. 362bool EndsWith(const std::string& str, 363 const std::string& search, 364 bool case_sensitive); 365bool EndsWith(const std::wstring& str, 366 const std::wstring& search, 367 bool case_sensitive); 368bool EndsWith(const string16& str, 369 const string16& search, 370 bool case_sensitive); 371 372 373// Determines the type of ASCII character, independent of locale (the C 374// library versions will change based on locale). 375template <typename Char> 376inline bool IsAsciiWhitespace(Char c) { 377 return c == ' ' || c == '\r' || c == '\n' || c == '\t'; 378} 379template <typename Char> 380inline bool IsAsciiAlpha(Char c) { 381 return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); 382} 383template <typename Char> 384inline bool IsAsciiDigit(Char c) { 385 return c >= '0' && c <= '9'; 386} 387 388template <typename Char> 389inline bool IsHexDigit(Char c) { 390 return (c >= '0' && c <= '9') || 391 (c >= 'A' && c <= 'F') || 392 (c >= 'a' && c <= 'f'); 393} 394 395template <typename Char> 396inline Char HexDigitToInt(Char c) { 397 DCHECK(IsHexDigit(c)); 398 if (c >= '0' && c <= '9') 399 return c - '0'; 400 if (c >= 'A' && c <= 'F') 401 return c - 'A' + 10; 402 if (c >= 'a' && c <= 'f') 403 return c - 'a' + 10; 404 return 0; 405} 406 407// Returns true if it's a whitespace character. 
408inline bool IsWhitespace(wchar_t c) { 409 return wcschr(kWhitespaceWide, c) != NULL; 410} 411 412enum DataUnits { 413 DATA_UNITS_BYTE = 0, 414 DATA_UNITS_KIBIBYTE, 415 DATA_UNITS_MEBIBYTE, 416 DATA_UNITS_GIBIBYTE, 417}; 418 419// Return the unit type that is appropriate for displaying the amount of bytes 420// passed in. 421DataUnits GetByteDisplayUnits(int64 bytes); 422 423// Return a byte string in human-readable format, displayed in units appropriate 424// specified by 'units', with an optional unit suffix. 425// Ex: FormatBytes(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB" 426// Ex: FormatBytes(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1" 427string16 FormatBytes(int64 bytes, DataUnits units, bool show_units); 428 429// As above, but with "/s" units. 430// Ex: FormatSpeed(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB/s" 431// Ex: FormatSpeed(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1" 432string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units); 433 434// Return a number formated with separators in the user's locale way. 435// Ex: FormatNumber(1234567) => 1,234,567 436string16 FormatNumber(int64 number); 437 438// Starting at |start_offset| (usually 0), replace the first instance of 439// |find_this| with |replace_with|. 440void ReplaceFirstSubstringAfterOffset(string16* str, 441 string16::size_type start_offset, 442 const string16& find_this, 443 const string16& replace_with); 444void ReplaceFirstSubstringAfterOffset(std::string* str, 445 std::string::size_type start_offset, 446 const std::string& find_this, 447 const std::string& replace_with); 448 449// Starting at |start_offset| (usually 0), look through |str| and replace all 450// instances of |find_this| with |replace_with|. 
451// 452// This does entire substrings; use std::replace in <algorithm> for single 453// characters, for example: 454// std::replace(str.begin(), str.end(), 'a', 'b'); 455void ReplaceSubstringsAfterOffset(string16* str, 456 string16::size_type start_offset, 457 const string16& find_this, 458 const string16& replace_with); 459void ReplaceSubstringsAfterOffset(std::string* str, 460 std::string::size_type start_offset, 461 const std::string& find_this, 462 const std::string& replace_with); 463 464// This is mpcomplete's pattern for saving a string copy when dealing with 465// a function that writes results into a wchar_t[] and wanting the result to 466// end up in a std::wstring. It ensures that the std::wstring's internal 467// buffer has enough room to store the characters to be written into it, and 468// sets its .length() attribute to the right value. 469// 470// The reserve() call allocates the memory required to hold the string 471// plus a terminating null. This is done because resize() isn't 472// guaranteed to reserve space for the null. The resize() call is 473// simply the only way to change the string's 'length' member. 474// 475// XXX-performance: the call to wide.resize() takes linear time, since it fills 476// the string's buffer with nulls. I call it to change the length of the 477// string (needed because writing directly to the buffer doesn't do this). 478// Perhaps there's a constant-time way to change the string's length. 479template <class string_type> 480inline typename string_type::value_type* WriteInto(string_type* str, 481 size_t length_with_null) { 482 str->reserve(length_with_null); 483 str->resize(length_with_null - 1); 484 return &((*str)[0]); 485} 486 487//----------------------------------------------------------------------------- 488 489// Splits a string into its fields delimited by any of the characters in 490// |delimiters|. Each field is added to the |tokens| vector. Returns the 491// number of tokens found. 
492size_t Tokenize(const std::wstring& str, 493 const std::wstring& delimiters, 494 std::vector<std::wstring>* tokens); 495size_t Tokenize(const string16& str, 496 const string16& delimiters, 497 std::vector<string16>* tokens); 498size_t Tokenize(const std::string& str, 499 const std::string& delimiters, 500 std::vector<std::string>* tokens); 501size_t Tokenize(const base::StringPiece& str, 502 const base::StringPiece& delimiters, 503 std::vector<base::StringPiece>* tokens); 504 505// Does the opposite of SplitString(). 506string16 JoinString(const std::vector<string16>& parts, char16 s); 507std::string JoinString(const std::vector<std::string>& parts, char s); 508 509// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively. 510// Additionally, any number of consecutive '$' characters is replaced by that 511// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be 512// NULL. This only allows you to use up to nine replacements. 513string16 ReplaceStringPlaceholders(const string16& format_string, 514 const std::vector<string16>& subst, 515 std::vector<size_t>* offsets); 516 517std::string ReplaceStringPlaceholders(const base::StringPiece& format_string, 518 const std::vector<std::string>& subst, 519 std::vector<size_t>* offsets); 520 521// Single-string shortcut for ReplaceStringHolders. |offset| may be NULL. 522string16 ReplaceStringPlaceholders(const string16& format_string, 523 const string16& a, 524 size_t* offset); 525 526// Returns true if the string passed in matches the pattern. The pattern 527// string can contain wildcards like * and ? 528// The backslash character (\) is an escape character for * and ? 529// We limit the patterns to having a max of 16 * or ? characters. 530// ? matches 0 or 1 character, while * matches 0 or more characters. 
531bool MatchPattern(const base::StringPiece& string, 532 const base::StringPiece& pattern); 533bool MatchPattern(const string16& string, const string16& pattern); 534 535// Hack to convert any char-like type to its unsigned counterpart. 536// For example, it will convert char, signed char and unsigned char to unsigned 537// char. 538template<typename T> 539struct ToUnsigned { 540 typedef T Unsigned; 541}; 542 543template<> 544struct ToUnsigned<char> { 545 typedef unsigned char Unsigned; 546}; 547template<> 548struct ToUnsigned<signed char> { 549 typedef unsigned char Unsigned; 550}; 551template<> 552struct ToUnsigned<wchar_t> { 553#if defined(WCHAR_T_IS_UTF16) 554 typedef unsigned short Unsigned; 555#elif defined(WCHAR_T_IS_UTF32) 556 typedef uint32 Unsigned; 557#endif 558}; 559template<> 560struct ToUnsigned<short> { 561 typedef unsigned short Unsigned; 562}; 563 564#endif // BASE_STRING_UTIL_H_ 565