string_util.h revision 3f50c38dc070f4bb515c1b64450dae14f316474e
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// This file defines utility functions for working with strings.
6
7#ifndef BASE_STRING_UTIL_H_
8#define BASE_STRING_UTIL_H_
9#pragma once
10
11#include <stdarg.h>   // va_list
12
13#include <string>
14#include <vector>
15
16#include "base/basictypes.h"
17#include "base/compiler_specific.h"
18#include "base/string16.h"
19#include "base/string_piece.h"  // For implicit conversions.
20
21// TODO(brettw) remove this dependency. Previously StringPrintf lived in this
22// file. We need to convert the callers over to using stringprintf.h instead
23// and then remove this.
24#include "base/stringprintf.h"
25
26// Safe standard library wrappers for all platforms.
27
28namespace base {
29
30// C standard-library functions like "strncasecmp" and "snprintf" that aren't
31// cross-platform are provided as "base::strncasecmp", and their prototypes
32// are listed below.  These functions are then implemented as inline calls
33// to the platform-specific equivalents in the platform-specific headers.
34
35// Compares the two strings s1 and s2 without regard to case using
36// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
37// s2 > s1 according to a lexicographic comparison.
38int strcasecmp(const char* s1, const char* s2);
39
40// Compares up to count characters of s1 and s2 without regard to case using
41// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
42// s2 > s1 according to a lexicographic comparison.
43int strncasecmp(const char* s1, const char* s2, size_t count);
44
45// Same as strncmp but for char16 strings.
46int strncmp16(const char16* s1, const char16* s2, size_t count);
47
48// Wrapper for vsnprintf that always null-terminates and always returns the
49// number of characters that would be in an untruncated formatted
50// string, even when truncation occurs.
51int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments)
52    PRINTF_FORMAT(3, 0);
53
54// vswprintf always null-terminates, but when truncation occurs, it will either
55// return -1 or the number of characters that would be in an untruncated
56// formatted string.  The actual return value depends on the underlying
57// C library's vswprintf implementation.
58int vswprintf(wchar_t* buffer, size_t size,
59              const wchar_t* format, va_list arguments) WPRINTF_FORMAT(3, 0);
60
61// Some of these implementations need to be inlined.
62
63// We separate the declaration from the implementation of this inline
64// function just so the PRINTF_FORMAT works.
65inline int snprintf(char* buffer, size_t size, const char* format, ...)
66    PRINTF_FORMAT(3, 4);
67inline int snprintf(char* buffer, size_t size, const char* format, ...) {
68  va_list arguments;
69  va_start(arguments, format);
70  int result = vsnprintf(buffer, size, format, arguments);
71  va_end(arguments);
72  return result;
73}
74
75// We separate the declaration from the implementation of this inline
76// function just so the WPRINTF_FORMAT works.
77inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...)
78    WPRINTF_FORMAT(3, 4);
79inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) {
80  va_list arguments;
81  va_start(arguments, format);
82  int result = vswprintf(buffer, size, format, arguments);
83  va_end(arguments);
84  return result;
85}
86
87// BSD-style safe and consistent string copy functions.
88// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
89// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
90// long as |dst_size| is not 0.  Returns the length of |src| in characters.
91// If the return value is >= dst_size, then the output was truncated.
92// NOTE: All sizes are in number of characters, NOT in bytes.
93size_t strlcpy(char* dst, const char* src, size_t dst_size);
94size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size);
95
96// Scan a wprintf format string to determine whether it's portable across a
97// variety of systems.  This function only checks that the conversion
98// specifiers used by the format string are supported and have the same meaning
99// on a variety of systems.  It doesn't check for other errors that might occur
100// within a format string.
101//
102// Nonportable conversion specifiers for wprintf are:
103//  - 's' and 'c' without an 'l' length modifier.  %s and %c operate on char
104//     data on all systems except Windows, which treat them as wchar_t data.
105//     Use %ls and %lc for wchar_t data instead.
106//  - 'S' and 'C', which operate on wchar_t data on all systems except Windows,
107//     which treat them as char data.  Use %ls and %lc for wchar_t data
108//     instead.
109//  - 'F', which is not identified by Windows wprintf documentation.
110//  - 'D', 'O', and 'U', which are deprecated and not available on all systems.
111//     Use %ld, %lo, and %lu instead.
112//
113// Note that there is no portable conversion specifier for char data when
114// working with wprintf.
115//
116// This function is intended to be called from base::vswprintf.
117bool IsWprintfFormatPortable(const wchar_t* format);
118
// Locale-independent lower-casing: maps 'A'..'Z' onto 'a'..'z' and hands back
// every other value untouched.  The standard library's tolower is locale
// sensitive, so we don't want to use it here.
template <class Char> inline Char ToLowerASCII(Char c) {
  if (c < 'A' || c > 'Z')
    return c;
  // The arithmetic is carried out in int; convert back to the caller's type.
  return static_cast<Char>(c + ('a' - 'A'));
}
124
// Locale-independent upper-casing: maps 'a'..'z' onto 'A'..'Z' and hands back
// every other value untouched.  The standard library's toupper is locale
// sensitive, so we don't want to use it here.
template <class Char> inline Char ToUpperASCII(Char c) {
  if (c < 'a' || c > 'z')
    return c;
  // The arithmetic is carried out in int; convert back to the caller's type.
  return static_cast<Char>(c + ('A' - 'a'));
}
130
131// Function objects to aid in comparing/searching strings.
132
// Equality predicate for single characters that ignores case by folding both
// operands with the C library's tolower before comparing them.
template<typename Char> struct CaseInsensitiveCompare {
  bool operator()(Char x, Char y) const {
    // TODO(darin): Do we really want to do locale sensitive comparisons here?
    // See http://crbug.com/24917
    const int folded_x = tolower(x);
    const int folded_y = tolower(y);
    return folded_x == folded_y;
  }
};
141
// Equality predicate for single characters that ignores ASCII case only: both
// operands are folded with ToLowerASCII before being compared.
template<typename Char> struct CaseInsensitiveCompareASCII {
  bool operator()(Char x, Char y) const {
    const Char folded_x = ToLowerASCII(x);
    const Char folded_y = ToLowerASCII(y);
    return folded_x == folded_y;
  }
};
148
149}  // namespace base
150
151#if defined(OS_WIN)
152#include "base/string_util_win.h"
153#elif defined(OS_POSIX)
154#include "base/string_util_posix.h"
155#else
156#error Define string operations appropriately for your platform
157#endif
158
159// These threadsafe functions return references to globally unique empty
160// strings.
161//
162// DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS.
163// There is only one case where you should use these: functions which need to
164// return a string by reference (e.g. as a class member accessor), and don't
165// have an empty string to use (e.g. in an error case).  These should not be
166// used as initializers, function arguments, or return values for functions
167// which return by value or outparam.
168const std::string& EmptyString();
169const std::wstring& EmptyWString();
170const string16& EmptyString16();
171
172extern const wchar_t kWhitespaceWide[];
173extern const char16 kWhitespaceUTF16[];
174extern const char kWhitespaceASCII[];
175
176extern const char kUtf8ByteOrderMark[];
177
178// Removes characters in remove_chars from anywhere in input.  Returns true if
179// any characters were removed.
180// NOTE: Safe to use the same variable for both input and output.
181bool RemoveChars(const std::wstring& input,
182                 const wchar_t remove_chars[],
183                 std::wstring* output);
184bool RemoveChars(const string16& input,
185                 const char16 remove_chars[],
186                 string16* output);
187bool RemoveChars(const std::string& input,
188                 const char remove_chars[],
189                 std::string* output);
190
191// Removes characters in trim_chars from the beginning and end of input.
192// NOTE: Safe to use the same variable for both input and output.
193bool TrimString(const std::wstring& input,
194                const wchar_t trim_chars[],
195                std::wstring* output);
196bool TrimString(const string16& input,
197                const char16 trim_chars[],
198                string16* output);
199bool TrimString(const std::string& input,
200                const char trim_chars[],
201                std::string* output);
202
203// Truncates a string to the nearest UTF-8 character that will leave
204// the string less than or equal to the specified byte size.
205void TruncateUTF8ToByteSize(const std::string& input,
206                            const size_t byte_size,
207                            std::string* output);
208
// Trims any whitespace from either end of the input string.  Returns where
// whitespace was found.
// The non-wide version is TrimWhitespaceASCII(), which is for ASCII strings
// and only looks for ASCII whitespace.
// NOTE: Safe to use the same variable for both input and output.
// Bit flags identifying the ends of a string.  The TrimWhitespace* functions
// take one as their |positions| argument and also return one.
enum TrimPositions {
  TRIM_NONE     = 0x0,                           // Neither end.
  TRIM_LEADING  = 0x1,                           // The start of the string.
  TRIM_TRAILING = 0x2,                           // The end of the string.
  TRIM_ALL      = TRIM_LEADING | TRIM_TRAILING,  // Both ends.
};
222TrimPositions TrimWhitespace(const std::wstring& input,
223                             TrimPositions positions,
224                             std::wstring* output);
225TrimPositions TrimWhitespace(const string16& input,
226                             TrimPositions positions,
227                             string16* output);
228TrimPositions TrimWhitespaceASCII(const std::string& input,
229                                  TrimPositions positions,
230                                  std::string* output);
231
232// Deprecated. This function is only for backward compatibility and calls
233// TrimWhitespaceASCII().
234TrimPositions TrimWhitespace(const std::string& input,
235                             TrimPositions positions,
236                             std::string* output);
237
// Searches for CR or LF characters.  Removes all contiguous whitespace
239// strings that contain them.  This is useful when trying to deal with text
240// copied from terminals.
241// Returns |text|, with the following three transformations:
242// (1) Leading and trailing whitespace is trimmed.
243// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace
244//     sequences containing a CR or LF are trimmed.
245// (3) All other whitespace sequences are converted to single spaces.
246std::wstring CollapseWhitespace(const std::wstring& text,
247                                bool trim_sequences_with_line_breaks);
248string16 CollapseWhitespace(const string16& text,
249                            bool trim_sequences_with_line_breaks);
250std::string CollapseWhitespaceASCII(const std::string& text,
251                                    bool trim_sequences_with_line_breaks);
252
253// Returns true if the passed string is empty or contains only white-space
254// characters.
255bool ContainsOnlyWhitespaceASCII(const std::string& str);
256bool ContainsOnlyWhitespace(const string16& str);
257
258// Returns true if |input| is empty or contains only characters found in
259// |characters|.
260bool ContainsOnlyChars(const std::wstring& input,
261                       const std::wstring& characters);
262bool ContainsOnlyChars(const string16& input, const string16& characters);
263bool ContainsOnlyChars(const std::string& input, const std::string& characters);
264
265// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
266// beforehand.
267std::string WideToASCII(const std::wstring& wide);
268std::string UTF16ToASCII(const string16& utf16);
269
270// Converts the given wide string to the corresponding Latin1. This will fail
271// (return false) if any characters are more than 255.
272bool WideToLatin1(const std::wstring& wide, std::string* latin1);
273
274// Returns true if the specified string matches the criteria. How can a wide
275// string be 8-bit or UTF8? It contains only characters that are < 256 (in the
276// first case) or characters that use only 8-bits and whose 8-bit
277// representation looks like a UTF-8 string (the second case).
278//
279// Note that IsStringUTF8 checks not only if the input is structurally
280// valid but also if it doesn't contain any non-character codepoint
281// (e.g. U+FFFE). It's done on purpose because all the existing callers want
282// to have the maximum 'discriminating' power from other encodings. If
283// there's a use case for just checking the structural validity, we have to
284// add a new function for that.
285bool IsStringUTF8(const std::string& str);
286bool IsStringASCII(const std::wstring& str);
287bool IsStringASCII(const base::StringPiece& str);
288bool IsStringASCII(const string16& str);
289
290// Converts the elements of the given string.  This version uses a pointer to
291// clearly differentiate it from the non-pointer variant.
292template <class str> inline void StringToLowerASCII(str* s) {
293  for (typename str::iterator i = s->begin(); i != s->end(); ++i)
294    *i = base::ToLowerASCII(*i);
295}
296
// Returns a lower-cased copy of |s|; |s| itself is left untouched.  The copy
// is converted in place by the pointer-taking overload.
template <class str> inline str StringToLowerASCII(const str& s) {
  // for std::string and std::wstring
  str lowered(s);
  StringToLowerASCII(&lowered);
  return lowered;
}
303
304// Converts the elements of the given string.  This version uses a pointer to
305// clearly differentiate it from the non-pointer variant.
306template <class str> inline void StringToUpperASCII(str* s) {
307  for (typename str::iterator i = s->begin(); i != s->end(); ++i)
308    *i = base::ToUpperASCII(*i);
309}
310
// Returns an upper-cased copy of |s|; |s| itself is left untouched.  The copy
// is converted in place by the pointer-taking overload.
template <class str> inline str StringToUpperASCII(const str& s) {
  // for std::string and std::wstring
  str uppered(s);
  StringToUpperASCII(&uppered);
  return uppered;
}
317
318// Compare the lower-case form of the given string against the given ASCII
319// string.  This is useful for doing checking if an input string matches some
320// token, and it is optimized to avoid intermediate string copies.  This API is
321// borrowed from the equivalent APIs in Mozilla.
322bool LowerCaseEqualsASCII(const std::string& a, const char* b);
323bool LowerCaseEqualsASCII(const std::wstring& a, const char* b);
324bool LowerCaseEqualsASCII(const string16& a, const char* b);
325
326// Same thing, but with string iterators instead.
327bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
328                          std::string::const_iterator a_end,
329                          const char* b);
330bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
331                          std::wstring::const_iterator a_end,
332                          const char* b);
333bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
334                          string16::const_iterator a_end,
335                          const char* b);
336bool LowerCaseEqualsASCII(const char* a_begin,
337                          const char* a_end,
338                          const char* b);
339bool LowerCaseEqualsASCII(const wchar_t* a_begin,
340                          const wchar_t* a_end,
341                          const char* b);
342bool LowerCaseEqualsASCII(const char16* a_begin,
343                          const char16* a_end,
344                          const char* b);
345
346// Performs a case-sensitive string compare. The behavior is undefined if both
347// strings are not ASCII.
348bool EqualsASCII(const string16& a, const base::StringPiece& b);
349
350// Returns true if str starts with search, or false otherwise.
351bool StartsWithASCII(const std::string& str,
352                     const std::string& search,
353                     bool case_sensitive);
354bool StartsWith(const std::wstring& str,
355                const std::wstring& search,
356                bool case_sensitive);
357bool StartsWith(const string16& str,
358                const string16& search,
359                bool case_sensitive);
360
361// Returns true if str ends with search, or false otherwise.
362bool EndsWith(const std::string& str,
363              const std::string& search,
364              bool case_sensitive);
365bool EndsWith(const std::wstring& str,
366              const std::wstring& search,
367              bool case_sensitive);
368bool EndsWith(const string16& str,
369              const string16& search,
370              bool case_sensitive);
371
372
373// Determines the type of ASCII character, independent of locale (the C
374// library versions will change based on locale).
// True when |c| is a space, horizontal tab, carriage return or line feed.
// No other character is accepted.
template <typename Char>
inline bool IsAsciiWhitespace(Char c) {
  const bool is_blank = c == ' ' || c == '\t';
  const bool is_line_end = c == '\r' || c == '\n';
  return is_blank || is_line_end;
}
// True when |c| lies in 'A'..'Z' or in 'a'..'z'.
template <typename Char>
inline bool IsAsciiAlpha(Char c) {
  const bool is_upper = c >= 'A' && c <= 'Z';
  const bool is_lower = c >= 'a' && c <= 'z';
  return is_upper || is_lower;
}
// True when |c| lies in '0'..'9'.
template <typename Char>
inline bool IsAsciiDigit(Char c) {
  const bool outside_range = c < '0' || c > '9';
  return !outside_range;
}
387
// True when |c| is a hexadecimal digit: '0'..'9', 'A'..'F' or 'a'..'f'.
template <typename Char>
inline bool IsHexDigit(Char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'A' && c <= 'F')
    return true;
  return c >= 'a' && c <= 'f';
}
394
// Converts the hexadecimal digit |c| to its numeric value (0 through 15),
// returned as a Char.  Callers are expected to pass a hex digit, which is
// DCHECKed; if |c| matches none of the digit ranges, 0 is returned.
template <typename Char>
inline Char HexDigitToInt(Char c) {
  DCHECK(IsHexDigit(c));
  // The three ranges are disjoint, so the order of the checks is immaterial.
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= '0' && c <= '9')
    return c - '0';
  return 0;
}
406
407// Returns true if it's a whitespace character.
408inline bool IsWhitespace(wchar_t c) {
409  return wcschr(kWhitespaceWide, c) != NULL;
410}
411
// Units in which a byte quantity can be displayed.  Taken by FormatBytes()
// and FormatSpeed(), and returned by GetByteDisplayUnits().
enum DataUnits {
  DATA_UNITS_BYTE     = 0,
  DATA_UNITS_KIBIBYTE = 1,
  DATA_UNITS_MEBIBYTE = 2,
  DATA_UNITS_GIBIBYTE = 3,
};
418
// Return the unit type that is appropriate for displaying the number of bytes
// passed in.
421DataUnits GetByteDisplayUnits(int64 bytes);
422
// Return a byte string in human-readable format, displayed in the units
// specified by 'units', with an optional unit suffix.
425// Ex: FormatBytes(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB"
426// Ex: FormatBytes(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1"
427string16 FormatBytes(int64 bytes, DataUnits units, bool show_units);
428
429// As above, but with "/s" units.
430// Ex: FormatSpeed(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB/s"
431// Ex: FormatSpeed(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1"
432string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units);
433
// Return a number formatted with separators according to the user's locale.
435// Ex: FormatNumber(1234567) => 1,234,567
436string16 FormatNumber(int64 number);
437
438// Starting at |start_offset| (usually 0), replace the first instance of
439// |find_this| with |replace_with|.
440void ReplaceFirstSubstringAfterOffset(string16* str,
441                                      string16::size_type start_offset,
442                                      const string16& find_this,
443                                      const string16& replace_with);
444void ReplaceFirstSubstringAfterOffset(std::string* str,
445                                      std::string::size_type start_offset,
446                                      const std::string& find_this,
447                                      const std::string& replace_with);
448
449// Starting at |start_offset| (usually 0), look through |str| and replace all
450// instances of |find_this| with |replace_with|.
451//
452// This does entire substrings; use std::replace in <algorithm> for single
453// characters, for example:
454//   std::replace(str.begin(), str.end(), 'a', 'b');
455void ReplaceSubstringsAfterOffset(string16* str,
456                                  string16::size_type start_offset,
457                                  const string16& find_this,
458                                  const string16& replace_with);
459void ReplaceSubstringsAfterOffset(std::string* str,
460                                  std::string::size_type start_offset,
461                                  const std::string& find_this,
462                                  const std::string& replace_with);
463
// This is mpcomplete's pattern for saving a string copy when dealing with
// a function that writes results into a wchar_t[] and wanting the result to
// end up in a std::wstring.  It ensures that the string's internal buffer has
// enough room to store the characters to be written into it, and sets its
// .length() attribute to the right value.
//
// |length_with_null| is the number of characters that will be written,
// counting the terminating null.  On return |str|'s length is
// |length_with_null| - 1, and the returned pointer addresses the first
// character of |str|.
//
// The reserve() call allocates the memory required to hold the string
// plus a terminating null.  This is done because resize() isn't
// guaranteed to reserve space for the null.  The resize() call is
// simply the only way to change the string's 'length' member.
//
// A |length_with_null| of 0 is treated like 1, i.e. |str| becomes empty.
// Subtracting 1 from it unconditionally would wrap around to the largest
// size_t value and make resize() fail.
//
// XXX-performance: the call to resize() takes linear time, since it fills
// the string's buffer with nulls.  It is called to change the length of the
// string (needed because writing directly to the buffer doesn't do this).
// Perhaps there's a constant-time way to change the string's length.
template <class string_type>
inline typename string_type::value_type* WriteInto(string_type* str,
                                                   size_t length_with_null) {
  // Length without the terminating null, clamped so that 0 cannot underflow.
  const size_t length = length_with_null > 0 ? length_with_null - 1 : 0;
  str->reserve(length_with_null);
  str->resize(length);
  return &((*str)[0]);
}
486
487//-----------------------------------------------------------------------------
488
489// Splits a string into its fields delimited by any of the characters in
490// |delimiters|.  Each field is added to the |tokens| vector.  Returns the
491// number of tokens found.
492size_t Tokenize(const std::wstring& str,
493                const std::wstring& delimiters,
494                std::vector<std::wstring>* tokens);
495size_t Tokenize(const string16& str,
496                const string16& delimiters,
497                std::vector<string16>* tokens);
498size_t Tokenize(const std::string& str,
499                const std::string& delimiters,
500                std::vector<std::string>* tokens);
501size_t Tokenize(const base::StringPiece& str,
502                const base::StringPiece& delimiters,
503                std::vector<base::StringPiece>* tokens);
504
505// Does the opposite of SplitString().
506string16 JoinString(const std::vector<string16>& parts, char16 s);
507std::string JoinString(const std::vector<std::string>& parts, char s);
508
509// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively.
510// Additionally, any number of consecutive '$' characters is replaced by that
511// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be
512// NULL. This only allows you to use up to nine replacements.
513string16 ReplaceStringPlaceholders(const string16& format_string,
514                                   const std::vector<string16>& subst,
515                                   std::vector<size_t>* offsets);
516
517std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
518                                      const std::vector<std::string>& subst,
519                                      std::vector<size_t>* offsets);
520
// Single-string shortcut for ReplaceStringPlaceholders. |offset| may be NULL.
522string16 ReplaceStringPlaceholders(const string16& format_string,
523                                   const string16& a,
524                                   size_t* offset);
525
526// Returns true if the string passed in matches the pattern. The pattern
527// string can contain wildcards like * and ?
528// The backslash character (\) is an escape character for * and ?
529// We limit the patterns to having a max of 16 * or ? characters.
530// ? matches 0 or 1 character, while * matches 0 or more characters.
531bool MatchPattern(const base::StringPiece& string,
532                  const base::StringPiece& pattern);
533bool MatchPattern(const string16& string, const string16& pattern);
534
// Type trait that maps a char-like type T onto an unsigned type, exposed as
// the nested typedef |Unsigned|.  For example, char, signed char and unsigned
// char all map to unsigned char.
//
// Primary template: any type without a dedicated specialization below maps to
// itself.
template<typename T>
struct ToUnsigned {
  typedef T Unsigned;
};

// Plain char and signed char both map to unsigned char.
template<>
struct ToUnsigned<signed char> {
  typedef unsigned char Unsigned;
};
template<>
struct ToUnsigned<char> {
  typedef unsigned char Unsigned;
};

// short maps to unsigned short.
template<>
struct ToUnsigned<short> {
  typedef unsigned short Unsigned;
};

// wchar_t maps to an unsigned type chosen by the WCHAR_T_IS_UTF16 /
// WCHAR_T_IS_UTF32 macros.  When neither macro is defined this specialization
// provides no |Unsigned| typedef at all.
template<>
struct ToUnsigned<wchar_t> {
#if defined(WCHAR_T_IS_UTF16)
  typedef unsigned short Unsigned;
#elif defined(WCHAR_T_IS_UTF32)
  typedef uint32 Unsigned;
#endif
};
563
564#endif  // BASE_STRING_UTIL_H_
565