1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// This file defines utility functions for working with strings. 6 7#ifndef BASE_STRINGS_STRING_UTIL_H_ 8#define BASE_STRINGS_STRING_UTIL_H_ 9 10#include <ctype.h> 11#include <stdarg.h> // va_list 12 13#include <string> 14#include <vector> 15 16#include "base/base_export.h" 17#include "base/basictypes.h" 18#include "base/compiler_specific.h" 19#include "base/strings/string16.h" 20#include "base/strings/string_piece.h" // For implicit conversions. 21 22// Safe standard library wrappers for all platforms. 23 24namespace base { 25 26// C standard-library functions like "strncasecmp" and "snprintf" that aren't 27// cross-platform are provided as "base::strncasecmp", and their prototypes 28// are listed below. These functions are then implemented as inline calls 29// to the platform-specific equivalents in the platform-specific headers. 30 31// Compares the two strings s1 and s2 without regard to case using 32// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if 33// s2 > s1 according to a lexicographic comparison. 34int strcasecmp(const char* s1, const char* s2); 35 36// Compares up to count characters of s1 and s2 without regard to case using 37// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if 38// s2 > s1 according to a lexicographic comparison. 39int strncasecmp(const char* s1, const char* s2, size_t count); 40 41// Same as strncmp but for char16 strings. 42int strncmp16(const char16* s1, const char16* s2, size_t count); 43 44// Wrapper for vsnprintf that always null-terminates and always returns the 45// number of characters that would be in an untruncated formatted 46// string, even when truncation occurs. 47int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments) 48 PRINTF_FORMAT(3, 0); 49 50// vswprintf always null-terminates, but when truncation occurs, it will either 51// return -1 or the number of characters that would be in an untruncated 52// formatted string. The actual return value depends on the underlying 53// C library's vswprintf implementation. 54int vswprintf(wchar_t* buffer, size_t size, 55 const wchar_t* format, va_list arguments) 56 WPRINTF_FORMAT(3, 0); 57 58// Some of these implementations need to be inlined. 59 60// We separate the declaration from the implementation of this inline 61// function just so the PRINTF_FORMAT works. 62inline int snprintf(char* buffer, size_t size, const char* format, ...) 63 PRINTF_FORMAT(3, 4); 64inline int snprintf(char* buffer, size_t size, const char* format, ...) { 65 va_list arguments; 66 va_start(arguments, format); 67 int result = vsnprintf(buffer, size, format, arguments); 68 va_end(arguments); 69 return result; 70} 71 72// We separate the declaration from the implementation of this inline 73// function just so the WPRINTF_FORMAT works. 74inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) 75 WPRINTF_FORMAT(3, 4); 76inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) { 77 va_list arguments; 78 va_start(arguments, format); 79 int result = vswprintf(buffer, size, format, arguments); 80 va_end(arguments); 81 return result; 82} 83 84// BSD-style safe and consistent string copy functions. 85// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. 86// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as 87// long as |dst_size| is not 0. Returns the length of |src| in characters. 88// If the return value is >= dst_size, then the output was truncated. 89// NOTE: All sizes are in number of characters, NOT in bytes. 90BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size); 91BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size); 92 93// Scan a wprintf format string to determine whether it's portable across a 94// variety of systems. This function only checks that the conversion 95// specifiers used by the format string are supported and have the same meaning 96// on a variety of systems. It doesn't check for other errors that might occur 97// within a format string. 98// 99// Nonportable conversion specifiers for wprintf are: 100// - 's' and 'c' without an 'l' length modifier. %s and %c operate on char 101// data on all systems except Windows, which treat them as wchar_t data. 102// Use %ls and %lc for wchar_t data instead. 103// - 'S' and 'C', which operate on wchar_t data on all systems except Windows, 104// which treat them as char data. Use %ls and %lc for wchar_t data 105// instead. 106// - 'F', which is not identified by Windows wprintf documentation. 107// - 'D', 'O', and 'U', which are deprecated and not available on all systems. 108// Use %ld, %lo, and %lu instead. 109// 110// Note that there is no portable conversion specifier for char data when 111// working with wprintf. 112// 113// This function is intended to be called from base::vswprintf. 114BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format); 115 116// ASCII-specific tolower. The standard library's tolower is locale sensitive, 117// so we don't want to use it here. 118template <class Char> inline Char ToLowerASCII(Char c) { 119 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; 120} 121 122// ASCII-specific toupper. The standard library's toupper is locale sensitive, 123// so we don't want to use it here. 124template <class Char> inline Char ToUpperASCII(Char c) { 125 return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c; 126} 127 128// Function objects to aid in comparing/searching strings. 129 130template<typename Char> struct CaseInsensitiveCompare { 131 public: 132 bool operator()(Char x, Char y) const { 133 // TODO(darin): Do we really want to do locale sensitive comparisons here? 134 // See http://crbug.com/24917 135 return tolower(x) == tolower(y); 136 } 137}; 138 139template<typename Char> struct CaseInsensitiveCompareASCII { 140 public: 141 bool operator()(Char x, Char y) const { 142 return ToLowerASCII(x) == ToLowerASCII(y); 143 } 144}; 145 146} // namespace base 147 148#if defined(OS_WIN) 149#include "base/strings/string_util_win.h" 150#elif defined(OS_POSIX) 151#include "base/strings/string_util_posix.h" 152#else 153#error Define string operations appropriately for your platform 154#endif 155 156// These threadsafe functions return references to globally unique empty 157// strings. 158// 159// DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS. 160// There is only one case where you should use these: functions which need to 161// return a string by reference (e.g. as a class member accessor), and don't 162// have an empty string to use (e.g. in an error case). These should not be 163// used as initializers, function arguments, or return values for functions 164// which return by value or outparam. 165BASE_EXPORT const std::string& EmptyString(); 166BASE_EXPORT const std::wstring& EmptyWString(); 167BASE_EXPORT const string16& EmptyString16(); 168 169BASE_EXPORT extern const wchar_t kWhitespaceWide[]; 170BASE_EXPORT extern const char16 kWhitespaceUTF16[]; 171BASE_EXPORT extern const char kWhitespaceASCII[]; 172 173BASE_EXPORT extern const char kUtf8ByteOrderMark[]; 174 175// Removes characters in |remove_chars| from anywhere in |input|. Returns true 176// if any characters were removed. |remove_chars| must be null-terminated. 177// NOTE: Safe to use the same variable for both |input| and |output|. 178BASE_EXPORT bool RemoveChars(const string16& input, 179 const char16 remove_chars[], 180 string16* output); 181BASE_EXPORT bool RemoveChars(const std::string& input, 182 const char remove_chars[], 183 std::string* output); 184 185// Replaces characters in |replace_chars| from anywhere in |input| with 186// |replace_with|. Each character in |replace_chars| will be replaced with 187// the |replace_with| string. Returns true if any characters were replaced. 188// |replace_chars| must be null-terminated. 189// NOTE: Safe to use the same variable for both |input| and |output|. 190BASE_EXPORT bool ReplaceChars(const string16& input, 191 const char16 replace_chars[], 192 const string16& replace_with, 193 string16* output); 194BASE_EXPORT bool ReplaceChars(const std::string& input, 195 const char replace_chars[], 196 const std::string& replace_with, 197 std::string* output); 198 199// Removes characters in |trim_chars| from the beginning and end of |input|. 200// |trim_chars| must be null-terminated. 201// NOTE: Safe to use the same variable for both |input| and |output|. 202BASE_EXPORT bool TrimString(const std::wstring& input, 203 const wchar_t trim_chars[], 204 std::wstring* output); 205BASE_EXPORT bool TrimString(const string16& input, 206 const char16 trim_chars[], 207 string16* output); 208BASE_EXPORT bool TrimString(const std::string& input, 209 const char trim_chars[], 210 std::string* output); 211 212// Truncates a string to the nearest UTF-8 character that will leave 213// the string less than or equal to the specified byte size. 214BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input, 215 const size_t byte_size, 216 std::string* output); 217 218// Trims any whitespace from either end of the input string. Returns where 219// whitespace was found. 220// The non-wide version has two functions: 221// * TrimWhitespaceASCII() 222// This function is for ASCII strings and only looks for ASCII whitespace; 223// Please choose the best one according to your usage. 224// NOTE: Safe to use the same variable for both input and output. 225enum TrimPositions { 226 TRIM_NONE = 0, 227 TRIM_LEADING = 1 << 0, 228 TRIM_TRAILING = 1 << 1, 229 TRIM_ALL = TRIM_LEADING | TRIM_TRAILING, 230}; 231BASE_EXPORT TrimPositions TrimWhitespace(const string16& input, 232 TrimPositions positions, 233 string16* output); 234BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input, 235 TrimPositions positions, 236 std::string* output); 237 238// Deprecated. This function is only for backward compatibility and calls 239// TrimWhitespaceASCII(). 240BASE_EXPORT TrimPositions TrimWhitespace(const std::string& input, 241 TrimPositions positions, 242 std::string* output); 243 244// Searches for CR or LF characters. Removes all contiguous whitespace 245// strings that contain them. This is useful when trying to deal with text 246// copied from terminals. 247// Returns |text|, with the following three transformations: 248// (1) Leading and trailing whitespace is trimmed. 249// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace 250// sequences containing a CR or LF are trimmed. 251// (3) All other whitespace sequences are converted to single spaces. 252BASE_EXPORT std::wstring CollapseWhitespace( 253 const std::wstring& text, 254 bool trim_sequences_with_line_breaks); 255BASE_EXPORT string16 CollapseWhitespace( 256 const string16& text, 257 bool trim_sequences_with_line_breaks); 258BASE_EXPORT std::string CollapseWhitespaceASCII( 259 const std::string& text, 260 bool trim_sequences_with_line_breaks); 261 262// Returns true if the passed string is empty or contains only white-space 263// characters. 264BASE_EXPORT bool ContainsOnlyWhitespaceASCII(const std::string& str); 265BASE_EXPORT bool ContainsOnlyWhitespace(const string16& str); 266 267// Returns true if |input| is empty or contains only characters found in 268// |characters|. 269BASE_EXPORT bool ContainsOnlyChars(const std::wstring& input, 270 const std::wstring& characters); 271BASE_EXPORT bool ContainsOnlyChars(const string16& input, 272 const string16& characters); 273BASE_EXPORT bool ContainsOnlyChars(const std::string& input, 274 const std::string& characters); 275 276// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII 277// beforehand. 278BASE_EXPORT std::string WideToASCII(const std::wstring& wide); 279BASE_EXPORT std::string UTF16ToASCII(const string16& utf16); 280 281// Converts the given wide string to the corresponding Latin1. This will fail 282// (return false) if any characters are more than 255. 283BASE_EXPORT bool WideToLatin1(const std::wstring& wide, std::string* latin1); 284 285// Returns true if the specified string matches the criteria. How can a wide 286// string be 8-bit or UTF8? It contains only characters that are < 256 (in the 287// first case) or characters that use only 8-bits and whose 8-bit 288// representation looks like a UTF-8 string (the second case). 289// 290// Note that IsStringUTF8 checks not only if the input is structurally 291// valid but also if it doesn't contain any non-character codepoint 292// (e.g. U+FFFE). It's done on purpose because all the existing callers want 293// to have the maximum 'discriminating' power from other encodings. If 294// there's a use case for just checking the structural validity, we have to 295// add a new function for that. 296BASE_EXPORT bool IsStringUTF8(const std::string& str); 297BASE_EXPORT bool IsStringASCII(const std::wstring& str); 298BASE_EXPORT bool IsStringASCII(const base::StringPiece& str); 299BASE_EXPORT bool IsStringASCII(const string16& str); 300 301// Converts the elements of the given string. This version uses a pointer to 302// clearly differentiate it from the non-pointer variant. 303template <class str> inline void StringToLowerASCII(str* s) { 304 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 305 *i = base::ToLowerASCII(*i); 306} 307 308template <class str> inline str StringToLowerASCII(const str& s) { 309 // for std::string and std::wstring 310 str output(s); 311 StringToLowerASCII(&output); 312 return output; 313} 314 315// Converts the elements of the given string. This version uses a pointer to 316// clearly differentiate it from the non-pointer variant. 317template <class str> inline void StringToUpperASCII(str* s) { 318 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 319 *i = base::ToUpperASCII(*i); 320} 321 322template <class str> inline str StringToUpperASCII(const str& s) { 323 // for std::string and std::wstring 324 str output(s); 325 StringToUpperASCII(&output); 326 return output; 327} 328 329// Compare the lower-case form of the given string against the given ASCII 330// string. This is useful for doing checking if an input string matches some 331// token, and it is optimized to avoid intermediate string copies. This API is 332// borrowed from the equivalent APIs in Mozilla. 333BASE_EXPORT bool LowerCaseEqualsASCII(const std::string& a, const char* b); 334BASE_EXPORT bool LowerCaseEqualsASCII(const std::wstring& a, const char* b); 335BASE_EXPORT bool LowerCaseEqualsASCII(const string16& a, const char* b); 336 337// Same thing, but with string iterators instead. 338BASE_EXPORT bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 339 std::string::const_iterator a_end, 340 const char* b); 341BASE_EXPORT bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, 342 std::wstring::const_iterator a_end, 343 const char* b); 344BASE_EXPORT bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 345 string16::const_iterator a_end, 346 const char* b); 347BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin, 348 const char* a_end, 349 const char* b); 350BASE_EXPORT bool LowerCaseEqualsASCII(const wchar_t* a_begin, 351 const wchar_t* a_end, 352 const char* b); 353BASE_EXPORT bool LowerCaseEqualsASCII(const char16* a_begin, 354 const char16* a_end, 355 const char* b); 356 357// Performs a case-sensitive string compare. The behavior is undefined if both 358// strings are not ASCII. 359BASE_EXPORT bool EqualsASCII(const string16& a, const base::StringPiece& b); 360 361// Returns true if str starts with search, or false otherwise. 362BASE_EXPORT bool StartsWithASCII(const std::string& str, 363 const std::string& search, 364 bool case_sensitive); 365BASE_EXPORT bool StartsWith(const std::wstring& str, 366 const std::wstring& search, 367 bool case_sensitive); 368BASE_EXPORT bool StartsWith(const string16& str, 369 const string16& search, 370 bool case_sensitive); 371 372// Returns true if str ends with search, or false otherwise. 373BASE_EXPORT bool EndsWith(const std::string& str, 374 const std::string& search, 375 bool case_sensitive); 376BASE_EXPORT bool EndsWith(const std::wstring& str, 377 const std::wstring& search, 378 bool case_sensitive); 379BASE_EXPORT bool EndsWith(const string16& str, 380 const string16& search, 381 bool case_sensitive); 382 383 384// Determines the type of ASCII character, independent of locale (the C 385// library versions will change based on locale). 386template <typename Char> 387inline bool IsAsciiWhitespace(Char c) { 388 return c == ' ' || c == '\r' || c == '\n' || c == '\t'; 389} 390template <typename Char> 391inline bool IsAsciiAlpha(Char c) { 392 return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); 393} 394template <typename Char> 395inline bool IsAsciiDigit(Char c) { 396 return c >= '0' && c <= '9'; 397} 398 399template <typename Char> 400inline bool IsHexDigit(Char c) { 401 return (c >= '0' && c <= '9') || 402 (c >= 'A' && c <= 'F') || 403 (c >= 'a' && c <= 'f'); 404} 405 406template <typename Char> 407inline Char HexDigitToInt(Char c) { 408 DCHECK(IsHexDigit(c)); 409 if (c >= '0' && c <= '9') 410 return c - '0'; 411 if (c >= 'A' && c <= 'F') 412 return c - 'A' + 10; 413 if (c >= 'a' && c <= 'f') 414 return c - 'a' + 10; 415 return 0; 416} 417 418// Returns true if it's a whitespace character. 419inline bool IsWhitespace(wchar_t c) { 420 return wcschr(kWhitespaceWide, c) != NULL; 421} 422 423// Return a byte string in human-readable format with a unit suffix. Not 424// appropriate for use in any UI; use of FormatBytes and friends in ui/base is 425// highly recommended instead. TODO(avi): Figure out how to get callers to use 426// FormatBytes instead; remove this. 427BASE_EXPORT string16 FormatBytesUnlocalized(int64 bytes); 428 429// Starting at |start_offset| (usually 0), replace the first instance of 430// |find_this| with |replace_with|. 431BASE_EXPORT void ReplaceFirstSubstringAfterOffset( 432 string16* str, 433 string16::size_type start_offset, 434 const string16& find_this, 435 const string16& replace_with); 436BASE_EXPORT void ReplaceFirstSubstringAfterOffset( 437 std::string* str, 438 std::string::size_type start_offset, 439 const std::string& find_this, 440 const std::string& replace_with); 441 442// Starting at |start_offset| (usually 0), look through |str| and replace all 443// instances of |find_this| with |replace_with|. 444// 445// This does entire substrings; use std::replace in <algorithm> for single 446// characters, for example: 447// std::replace(str.begin(), str.end(), 'a', 'b'); 448BASE_EXPORT void ReplaceSubstringsAfterOffset( 449 string16* str, 450 string16::size_type start_offset, 451 const string16& find_this, 452 const string16& replace_with); 453BASE_EXPORT void ReplaceSubstringsAfterOffset( 454 std::string* str, 455 std::string::size_type start_offset, 456 const std::string& find_this, 457 const std::string& replace_with); 458 459// Reserves enough memory in |str| to accommodate |length_with_null| characters, 460// sets the size of |str| to |length_with_null - 1| characters, and returns a 461// pointer to the underlying contiguous array of characters. This is typically 462// used when calling a function that writes results into a character array, but 463// the caller wants the data to be managed by a string-like object. It is 464// convenient in that is can be used inline in the call, and fast in that it 465// avoids copying the results of the call from a char* into a string. 466// 467// |length_with_null| must be at least 2, since otherwise the underlying string 468// would have size 0, and trying to access &((*str)[0]) in that case can result 469// in a number of problems. 470// 471// Internally, this takes linear time because the resize() call 0-fills the 472// underlying array for potentially all 473// (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we 474// could avoid this aspect of the resize() call, as we expect the caller to 475// immediately write over this memory, but there is no other way to set the size 476// of the string, and not doing that will mean people who access |str| rather 477// than str.c_str() will get back a string of whatever size |str| had on entry 478// to this function (probably 0). 479template <class string_type> 480inline typename string_type::value_type* WriteInto(string_type* str, 481 size_t length_with_null) { 482 DCHECK_GT(length_with_null, 1u); 483 str->reserve(length_with_null); 484 str->resize(length_with_null - 1); 485 return &((*str)[0]); 486} 487 488//----------------------------------------------------------------------------- 489 490// Splits a string into its fields delimited by any of the characters in 491// |delimiters|. Each field is added to the |tokens| vector. Returns the 492// number of tokens found. 493BASE_EXPORT size_t Tokenize(const std::wstring& str, 494 const std::wstring& delimiters, 495 std::vector<std::wstring>* tokens); 496BASE_EXPORT size_t Tokenize(const string16& str, 497 const string16& delimiters, 498 std::vector<string16>* tokens); 499BASE_EXPORT size_t Tokenize(const std::string& str, 500 const std::string& delimiters, 501 std::vector<std::string>* tokens); 502BASE_EXPORT size_t Tokenize(const base::StringPiece& str, 503 const base::StringPiece& delimiters, 504 std::vector<base::StringPiece>* tokens); 505 506// Does the opposite of SplitString(). 507BASE_EXPORT string16 JoinString(const std::vector<string16>& parts, char16 s); 508BASE_EXPORT std::string JoinString( 509 const std::vector<std::string>& parts, char s); 510 511// Join |parts| using |separator|. 512BASE_EXPORT std::string JoinString( 513 const std::vector<std::string>& parts, 514 const std::string& separator); 515BASE_EXPORT string16 JoinString( 516 const std::vector<string16>& parts, 517 const string16& separator); 518 519// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively. 520// Additionally, any number of consecutive '$' characters is replaced by that 521// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be 522// NULL. This only allows you to use up to nine replacements. 523BASE_EXPORT string16 ReplaceStringPlaceholders( 524 const string16& format_string, 525 const std::vector<string16>& subst, 526 std::vector<size_t>* offsets); 527 528BASE_EXPORT std::string ReplaceStringPlaceholders( 529 const base::StringPiece& format_string, 530 const std::vector<std::string>& subst, 531 std::vector<size_t>* offsets); 532 533// Single-string shortcut for ReplaceStringHolders. |offset| may be NULL. 534BASE_EXPORT string16 ReplaceStringPlaceholders(const string16& format_string, 535 const string16& a, 536 size_t* offset); 537 538// Returns true if the string passed in matches the pattern. The pattern 539// string can contain wildcards like * and ? 540// The backslash character (\) is an escape character for * and ? 541// We limit the patterns to having a max of 16 * or ? characters. 542// ? matches 0 or 1 character, while * matches 0 or more characters. 543BASE_EXPORT bool MatchPattern(const base::StringPiece& string, 544 const base::StringPiece& pattern); 545BASE_EXPORT bool MatchPattern(const string16& string, const string16& pattern); 546 547// Hack to convert any char-like type to its unsigned counterpart. 548// For example, it will convert char, signed char and unsigned char to unsigned 549// char. 550template<typename T> 551struct ToUnsigned { 552 typedef T Unsigned; 553}; 554 555template<> 556struct ToUnsigned<char> { 557 typedef unsigned char Unsigned; 558}; 559template<> 560struct ToUnsigned<signed char> { 561 typedef unsigned char Unsigned; 562}; 563template<> 564struct ToUnsigned<wchar_t> { 565#if defined(WCHAR_T_IS_UTF16) 566 typedef unsigned short Unsigned; 567#elif defined(WCHAR_T_IS_UTF32) 568 typedef uint32 Unsigned; 569#endif 570}; 571template<> 572struct ToUnsigned<short> { 573 typedef unsigned short Unsigned; 574}; 575 576#endif // BASE_STRINGS_STRING_UTIL_H_ 577