string_util.h revision b910a63ff3111067e79c016f40a7c1baac943405
1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4// 5// This file defines utility functions for working with strings. 6 7#ifndef BASE_STRINGS_STRING_UTIL_H_ 8#define BASE_STRINGS_STRING_UTIL_H_ 9 10#include <ctype.h> 11#include <stdarg.h> // va_list 12 13#include <string> 14#include <vector> 15 16#include "base/base_export.h" 17#include "base/basictypes.h" 18#include "base/compiler_specific.h" 19#include "base/strings/string16.h" 20#include "base/strings/string_piece.h" // For implicit conversions. 21 22// On Android, bionic's stdio.h defines an snprintf macro when being built with 23// clang. Undefine it here so it won't collide with base::snprintf(). 24#undef snprintf 25 26namespace base { 27 28// C standard-library functions like "strncasecmp" and "snprintf" that aren't 29// cross-platform are provided as "base::strncasecmp", and their prototypes 30// are listed below. These functions are then implemented as inline calls 31// to the platform-specific equivalents in the platform-specific headers. 32 33// Compares the two strings s1 and s2 without regard to case using 34// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if 35// s2 > s1 according to a lexicographic comparison. 36int strcasecmp(const char* s1, const char* s2); 37 38// Compares up to count characters of s1 and s2 without regard to case using 39// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if 40// s2 > s1 according to a lexicographic comparison. 41int strncasecmp(const char* s1, const char* s2, size_t count); 42 43// Same as strncmp but for char16 strings. 44int strncmp16(const char16* s1, const char16* s2, size_t count); 45 46// Wrapper for vsnprintf that always null-terminates and always returns the 47// number of characters that would be in an untruncated formatted 48// string, even when truncation occurs. 49int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments) 50 PRINTF_FORMAT(3, 0); 51 52// Some of these implementations need to be inlined. 53 54// We separate the declaration from the implementation of this inline 55// function just so the PRINTF_FORMAT works. 56inline int snprintf(char* buffer, size_t size, const char* format, ...) 57 PRINTF_FORMAT(3, 4); 58inline int snprintf(char* buffer, size_t size, const char* format, ...) { 59 va_list arguments; 60 va_start(arguments, format); 61 int result = vsnprintf(buffer, size, format, arguments); 62 va_end(arguments); 63 return result; 64} 65 66// BSD-style safe and consistent string copy functions. 67// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. 68// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as 69// long as |dst_size| is not 0. Returns the length of |src| in characters. 70// If the return value is >= dst_size, then the output was truncated. 71// NOTE: All sizes are in number of characters, NOT in bytes. 72BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size); 73BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size); 74 75// Scan a wprintf format string to determine whether it's portable across a 76// variety of systems. This function only checks that the conversion 77// specifiers used by the format string are supported and have the same meaning 78// on a variety of systems. It doesn't check for other errors that might occur 79// within a format string. 80// 81// Nonportable conversion specifiers for wprintf are: 82// - 's' and 'c' without an 'l' length modifier. %s and %c operate on char 83// data on all systems except Windows, which treat them as wchar_t data. 84// Use %ls and %lc for wchar_t data instead. 85// - 'S' and 'C', which operate on wchar_t data on all systems except Windows, 86// which treat them as char data. Use %ls and %lc for wchar_t data 87// instead. 88// - 'F', which is not identified by Windows wprintf documentation. 89// - 'D', 'O', and 'U', which are deprecated and not available on all systems. 90// Use %ld, %lo, and %lu instead. 91// 92// Note that there is no portable conversion specifier for char data when 93// working with wprintf. 94// 95// This function is intended to be called from base::vswprintf. 96BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format); 97 98// ASCII-specific tolower. The standard library's tolower is locale sensitive, 99// so we don't want to use it here. 100template <class Char> inline Char ToLowerASCII(Char c) { 101 return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; 102} 103 104// ASCII-specific toupper. The standard library's toupper is locale sensitive, 105// so we don't want to use it here. 106template <class Char> inline Char ToUpperASCII(Char c) { 107 return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c; 108} 109 110// Function objects to aid in comparing/searching strings. 111 112template<typename Char> struct CaseInsensitiveCompare { 113 public: 114 bool operator()(Char x, Char y) const { 115 // TODO(darin): Do we really want to do locale sensitive comparisons here? 116 // See http://crbug.com/24917 117 return tolower(x) == tolower(y); 118 } 119}; 120 121template<typename Char> struct CaseInsensitiveCompareASCII { 122 public: 123 bool operator()(Char x, Char y) const { 124 return ToLowerASCII(x) == ToLowerASCII(y); 125 } 126}; 127 128// These threadsafe functions return references to globally unique empty 129// strings. 130// 131// It is likely faster to construct a new empty string object (just a few 132// instructions to set the length to 0) than to get the empty string singleton 133// returned by these functions (which requires threadsafe singleton access). 134// 135// Therefore, DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT 136// CONSTRUCTORS. There is only one case where you should use these: functions 137// which need to return a string by reference (e.g. as a class member 138// accessor), and don't have an empty string to use (e.g. in an error case). 139// These should not be used as initializers, function arguments, or return 140// values for functions which return by value or outparam. 141BASE_EXPORT const std::string& EmptyString(); 142BASE_EXPORT const string16& EmptyString16(); 143 144// Contains the set of characters representing whitespace in the corresponding 145// encoding. Null-terminated. The ASCII versions are the whitespaces as defined 146// by HTML5, and don't include control characters. 147BASE_EXPORT extern const wchar_t kWhitespaceWide[]; // Includes Unicode. 148BASE_EXPORT extern const char16 kWhitespaceUTF16[]; // Includes Unicode. 149BASE_EXPORT extern const char kWhitespaceASCII[]; 150BASE_EXPORT extern const char16 kWhitespaceASCIIAs16[]; // No unicode. 151 152// Null-terminated string representing the UTF-8 byte order mark. 153BASE_EXPORT extern const char kUtf8ByteOrderMark[]; 154 155// Removes characters in |remove_chars| from anywhere in |input|. Returns true 156// if any characters were removed. |remove_chars| must be null-terminated. 157// NOTE: Safe to use the same variable for both |input| and |output|. 158BASE_EXPORT bool RemoveChars(const string16& input, 159 const base::StringPiece16& remove_chars, 160 string16* output); 161BASE_EXPORT bool RemoveChars(const std::string& input, 162 const base::StringPiece& remove_chars, 163 std::string* output); 164 165// Replaces characters in |replace_chars| from anywhere in |input| with 166// |replace_with|. Each character in |replace_chars| will be replaced with 167// the |replace_with| string. Returns true if any characters were replaced. 168// |replace_chars| must be null-terminated. 169// NOTE: Safe to use the same variable for both |input| and |output|. 170BASE_EXPORT bool ReplaceChars(const string16& input, 171 const base::StringPiece16& replace_chars, 172 const string16& replace_with, 173 string16* output); 174BASE_EXPORT bool ReplaceChars(const std::string& input, 175 const base::StringPiece& replace_chars, 176 const std::string& replace_with, 177 std::string* output); 178 179enum TrimPositions { 180 TRIM_NONE = 0, 181 TRIM_LEADING = 1 << 0, 182 TRIM_TRAILING = 1 << 1, 183 TRIM_ALL = TRIM_LEADING | TRIM_TRAILING, 184}; 185 186// Removes characters in |trim_chars| from the beginning and end of |input|. 187// The 8-bit version only works on 8-bit characters, not UTF-8. 188// 189// It is safe to use the same variable for both |input| and |output| (this is 190// the normal usage to trim in-place). 191BASE_EXPORT bool TrimString(const string16& input, 192 base::StringPiece16 trim_chars, 193 string16* output); 194BASE_EXPORT bool TrimString(const std::string& input, 195 base::StringPiece trim_chars, 196 std::string* output); 197 198// StringPiece versions of the above. The returned pieces refer to the original 199// buffer. 200BASE_EXPORT StringPiece16 TrimString(StringPiece16 input, 201 const base::StringPiece16& trim_chars, 202 TrimPositions positions); 203BASE_EXPORT StringPiece TrimString(StringPiece input, 204 const base::StringPiece& trim_chars, 205 TrimPositions positions); 206 207// Truncates a string to the nearest UTF-8 character that will leave 208// the string less than or equal to the specified byte size. 209BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input, 210 const size_t byte_size, 211 std::string* output); 212 213// Trims any whitespace from either end of the input string. Returns where 214// whitespace was found. 215// The non-wide version has two functions: 216// * TrimWhitespaceASCII() 217// This function is for ASCII strings and only looks for ASCII whitespace; 218// Please choose the best one according to your usage. 219// NOTE: Safe to use the same variable for both input and output. 220BASE_EXPORT TrimPositions TrimWhitespace(const string16& input, 221 TrimPositions positions, 222 base::string16* output); 223BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input, 224 TrimPositions positions, 225 std::string* output); 226 227// Deprecated. This function is only for backward compatibility and calls 228// TrimWhitespaceASCII(). 229BASE_EXPORT TrimPositions TrimWhitespace(const std::string& input, 230 TrimPositions positions, 231 std::string* output); 232 233// Searches for CR or LF characters. Removes all contiguous whitespace 234// strings that contain them. This is useful when trying to deal with text 235// copied from terminals. 236// Returns |text|, with the following three transformations: 237// (1) Leading and trailing whitespace is trimmed. 238// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace 239// sequences containing a CR or LF are trimmed. 240// (3) All other whitespace sequences are converted to single spaces. 241BASE_EXPORT string16 CollapseWhitespace( 242 const string16& text, 243 bool trim_sequences_with_line_breaks); 244BASE_EXPORT std::string CollapseWhitespaceASCII( 245 const std::string& text, 246 bool trim_sequences_with_line_breaks); 247 248// Returns true if |input| is empty or contains only characters found in 249// |characters|. 250BASE_EXPORT bool ContainsOnlyChars(const StringPiece& input, 251 const StringPiece& characters); 252BASE_EXPORT bool ContainsOnlyChars(const StringPiece16& input, 253 const StringPiece16& characters); 254 255// Returns true if the specified string matches the criteria. How can a wide 256// string be 8-bit or UTF8? It contains only characters that are < 256 (in the 257// first case) or characters that use only 8-bits and whose 8-bit 258// representation looks like a UTF-8 string (the second case). 259// 260// Note that IsStringUTF8 checks not only if the input is structurally 261// valid but also if it doesn't contain any non-character codepoint 262// (e.g. U+FFFE). It's done on purpose because all the existing callers want 263// to have the maximum 'discriminating' power from other encodings. If 264// there's a use case for just checking the structural validity, we have to 265// add a new function for that. 266// 267// IsStringASCII assumes the input is likely all ASCII, and does not leave early 268// if it is not the case. 269BASE_EXPORT bool IsStringUTF8(const StringPiece& str); 270BASE_EXPORT bool IsStringASCII(const StringPiece& str); 271BASE_EXPORT bool IsStringASCII(const StringPiece16& str); 272// A convenience adaptor for WebStrings, as they don't convert into 273// StringPieces directly. 274BASE_EXPORT bool IsStringASCII(const string16& str); 275#if defined(WCHAR_T_IS_UTF32) 276BASE_EXPORT bool IsStringASCII(const std::wstring& str); 277#endif 278 279// Converts the elements of the given string. This version uses a pointer to 280// clearly differentiate it from the non-pointer variant. 281template <class str> inline void StringToLowerASCII(str* s) { 282 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 283 *i = ToLowerASCII(*i); 284} 285 286template <class str> inline str StringToLowerASCII(const str& s) { 287 // for std::string and std::wstring 288 str output(s); 289 StringToLowerASCII(&output); 290 return output; 291} 292 293// Converts the elements of the given string. This version uses a pointer to 294// clearly differentiate it from the non-pointer variant. 295template <class str> inline void StringToUpperASCII(str* s) { 296 for (typename str::iterator i = s->begin(); i != s->end(); ++i) 297 *i = ToUpperASCII(*i); 298} 299 300template <class str> inline str StringToUpperASCII(const str& s) { 301 // for std::string and std::wstring 302 str output(s); 303 StringToUpperASCII(&output); 304 return output; 305} 306// 307// Compare the lower-case form of the given string against the given ASCII 308// string. This is useful for doing checking if an input string matches some 309// token, and it is optimized to avoid intermediate string copies. This API is 310// borrowed from the equivalent APIs in Mozilla. 311BASE_EXPORT bool LowerCaseEqualsASCII(const std::string& a, const char* b); 312BASE_EXPORT bool LowerCaseEqualsASCII(const string16& a, const char* b); 313 314// Same thing, but with string iterators instead. 315BASE_EXPORT bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 316 std::string::const_iterator a_end, 317 const char* b); 318BASE_EXPORT bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 319 string16::const_iterator a_end, 320 const char* b); 321BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin, 322 const char* a_end, 323 const char* b); 324BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin, 325 const char* a_end, 326 const char* b_begin, 327 const char* b_end); 328BASE_EXPORT bool LowerCaseEqualsASCII(const char16* a_begin, 329 const char16* a_end, 330 const char* b); 331 332// Performs a case-sensitive string compare. The behavior is undefined if both 333// strings are not ASCII. 334BASE_EXPORT bool EqualsASCII(const string16& a, const StringPiece& b); 335 336// Returns true if str starts with search, or false otherwise. 337// TODO(brettw) the case sensitive flag makes callsites difficult to read. 338// Consider splitting this out in two variants (few callers want 339// case-insensitive compares) or use an enum that makes this more explicit. 340BASE_EXPORT bool StartsWithASCII(const std::string& str, 341 const std::string& search, 342 bool case_sensitive); 343BASE_EXPORT bool StartsWith(const base::string16& str, 344 const base::string16& search, 345 bool case_sensitive); 346 347// Returns true if str ends with search, or false otherwise. 348// TODO(brettw) case sensitive flag confusion, see StartsWith above. 349BASE_EXPORT bool EndsWith(const std::string& str, 350 const std::string& search, 351 bool case_sensitive); 352BASE_EXPORT bool EndsWith(const base::string16& str, 353 const base::string16& search, 354 bool case_sensitive); 355 356} // namespace base 357 358#if defined(OS_WIN) 359#include "base/strings/string_util_win.h" 360#elif defined(OS_POSIX) 361#include "base/strings/string_util_posix.h" 362#else 363#error Define string operations appropriately for your platform 364#endif 365 366// Determines the type of ASCII character, independent of locale (the C 367// library versions will change based on locale). 368template <typename Char> 369inline bool IsAsciiWhitespace(Char c) { 370 return c == ' ' || c == '\r' || c == '\n' || c == '\t'; 371} 372template <typename Char> 373inline bool IsAsciiAlpha(Char c) { 374 return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); 375} 376template <typename Char> 377inline bool IsAsciiDigit(Char c) { 378 return c >= '0' && c <= '9'; 379} 380 381template <typename Char> 382inline bool IsHexDigit(Char c) { 383 return (c >= '0' && c <= '9') || 384 (c >= 'A' && c <= 'F') || 385 (c >= 'a' && c <= 'f'); 386} 387 388template <typename Char> 389inline char HexDigitToInt(Char c) { 390 DCHECK(IsHexDigit(c)); 391 if (c >= '0' && c <= '9') 392 return static_cast<char>(c - '0'); 393 if (c >= 'A' && c <= 'F') 394 return static_cast<char>(c - 'A' + 10); 395 if (c >= 'a' && c <= 'f') 396 return static_cast<char>(c - 'a' + 10); 397 return 0; 398} 399 400// Returns true if it's a whitespace character. 401inline bool IsWhitespace(wchar_t c) { 402 return wcschr(base::kWhitespaceWide, c) != NULL; 403} 404 405// Return a byte string in human-readable format with a unit suffix. Not 406// appropriate for use in any UI; use of FormatBytes and friends in ui/base is 407// highly recommended instead. TODO(avi): Figure out how to get callers to use 408// FormatBytes instead; remove this. 409BASE_EXPORT base::string16 FormatBytesUnlocalized(int64 bytes); 410 411// Starting at |start_offset| (usually 0), replace the first instance of 412// |find_this| with |replace_with|. 413BASE_EXPORT void ReplaceFirstSubstringAfterOffset( 414 base::string16* str, 415 size_t start_offset, 416 const base::string16& find_this, 417 const base::string16& replace_with); 418BASE_EXPORT void ReplaceFirstSubstringAfterOffset( 419 std::string* str, 420 size_t start_offset, 421 const std::string& find_this, 422 const std::string& replace_with); 423 424// Starting at |start_offset| (usually 0), look through |str| and replace all 425// instances of |find_this| with |replace_with|. 426// 427// This does entire substrings; use std::replace in <algorithm> for single 428// characters, for example: 429// std::replace(str.begin(), str.end(), 'a', 'b'); 430BASE_EXPORT void ReplaceSubstringsAfterOffset( 431 base::string16* str, 432 size_t start_offset, 433 const base::string16& find_this, 434 const base::string16& replace_with); 435BASE_EXPORT void ReplaceSubstringsAfterOffset(std::string* str, 436 size_t start_offset, 437 const std::string& find_this, 438 const std::string& replace_with); 439 440// Reserves enough memory in |str| to accommodate |length_with_null| characters, 441// sets the size of |str| to |length_with_null - 1| characters, and returns a 442// pointer to the underlying contiguous array of characters. This is typically 443// used when calling a function that writes results into a character array, but 444// the caller wants the data to be managed by a string-like object. It is 445// convenient in that is can be used inline in the call, and fast in that it 446// avoids copying the results of the call from a char* into a string. 447// 448// |length_with_null| must be at least 2, since otherwise the underlying string 449// would have size 0, and trying to access &((*str)[0]) in that case can result 450// in a number of problems. 451// 452// Internally, this takes linear time because the resize() call 0-fills the 453// underlying array for potentially all 454// (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we 455// could avoid this aspect of the resize() call, as we expect the caller to 456// immediately write over this memory, but there is no other way to set the size 457// of the string, and not doing that will mean people who access |str| rather 458// than str.c_str() will get back a string of whatever size |str| had on entry 459// to this function (probably 0). 460template <class string_type> 461inline typename string_type::value_type* WriteInto(string_type* str, 462 size_t length_with_null) { 463 DCHECK_GT(length_with_null, 1u); 464 str->reserve(length_with_null); 465 str->resize(length_with_null - 1); 466 return &((*str)[0]); 467} 468 469//----------------------------------------------------------------------------- 470 471// Splits a string into its fields delimited by any of the characters in 472// |delimiters|. Each field is added to the |tokens| vector. Returns the 473// number of tokens found. 474// 475// DEPRECATED. Use SplitStringUsingSet for new code (these just forward). 476// TODO(brettw) convert callers and delete these forwarders. 477BASE_EXPORT size_t Tokenize(const base::string16& str, 478 const base::string16& delimiters, 479 std::vector<base::string16>* tokens); 480BASE_EXPORT size_t Tokenize(const std::string& str, 481 const std::string& delimiters, 482 std::vector<std::string>* tokens); 483BASE_EXPORT size_t Tokenize(const base::StringPiece& str, 484 const base::StringPiece& delimiters, 485 std::vector<base::StringPiece>* tokens); 486 487// Does the opposite of SplitString(). 488BASE_EXPORT base::string16 JoinString(const std::vector<base::string16>& parts, 489 base::char16 s); 490BASE_EXPORT std::string JoinString( 491 const std::vector<std::string>& parts, char s); 492 493// Join |parts| using |separator|. 494BASE_EXPORT std::string JoinString( 495 const std::vector<std::string>& parts, 496 const std::string& separator); 497BASE_EXPORT base::string16 JoinString( 498 const std::vector<base::string16>& parts, 499 const base::string16& separator); 500 501// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively. 502// Additionally, any number of consecutive '$' characters is replaced by that 503// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be 504// NULL. This only allows you to use up to nine replacements. 505BASE_EXPORT base::string16 ReplaceStringPlaceholders( 506 const base::string16& format_string, 507 const std::vector<base::string16>& subst, 508 std::vector<size_t>* offsets); 509 510BASE_EXPORT std::string ReplaceStringPlaceholders( 511 const base::StringPiece& format_string, 512 const std::vector<std::string>& subst, 513 std::vector<size_t>* offsets); 514 515// Single-string shortcut for ReplaceStringHolders. |offset| may be NULL. 516BASE_EXPORT base::string16 ReplaceStringPlaceholders( 517 const base::string16& format_string, 518 const base::string16& a, 519 size_t* offset); 520 521// Returns true if the string passed in matches the pattern. The pattern 522// string can contain wildcards like * and ? 523// The backslash character (\) is an escape character for * and ? 524// We limit the patterns to having a max of 16 * or ? characters. 525// ? matches 0 or 1 character, while * matches 0 or more characters. 526BASE_EXPORT bool MatchPattern(const base::StringPiece& string, 527 const base::StringPiece& pattern); 528BASE_EXPORT bool MatchPattern(const base::string16& string, 529 const base::string16& pattern); 530 531#endif // BASE_STRINGS_STRING_UTIL_H_ 532