1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// This file defines utility functions for working with strings.
6
7#ifndef BASE_STRING_UTIL_H_
8#define BASE_STRING_UTIL_H_
9#pragma once
10
11#include <stdarg.h>   // va_list
12
13#include <string>
14#include <vector>
15
16#include "base/base_api.h"
17#include "base/basictypes.h"
18#include "base/compiler_specific.h"
19#include "base/string16.h"
20#include "base/string_piece.h"  // For implicit conversions.
21
22// TODO(brettw) remove this dependency. Previously StringPrintf lived in this
23// file. We need to convert the callers over to using stringprintf.h instead
24// and then remove this.
25#include "base/stringprintf.h"
26
27// Safe standard library wrappers for all platforms.
28
29namespace base {
30
31// C standard-library functions like "strncasecmp" and "snprintf" that aren't
32// cross-platform are provided as "base::strncasecmp", and their prototypes
33// are listed below.  These functions are then implemented as inline calls
34// to the platform-specific equivalents in the platform-specific headers.
35
36// Compares the two strings s1 and s2 without regard to case using
37// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
38// s2 > s1 according to a lexicographic comparison.
39BASE_API int strcasecmp(const char* s1, const char* s2);
40
41// Compares up to count characters of s1 and s2 without regard to case using
42// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
43// s2 > s1 according to a lexicographic comparison.
44BASE_API int strncasecmp(const char* s1, const char* s2, size_t count);
45
46// Same as strncmp but for char16 strings.
47BASE_API int strncmp16(const char16* s1, const char16* s2, size_t count);
48
49// Wrapper for vsnprintf that always null-terminates and always returns the
50// number of characters that would be in an untruncated formatted
51// string, even when truncation occurs.
52BASE_API int vsnprintf(char* buffer, size_t size, const char* format,
53                       va_list arguments)
54    PRINTF_FORMAT(3, 0);
55
56// vswprintf always null-terminates, but when truncation occurs, it will either
57// return -1 or the number of characters that would be in an untruncated
58// formatted string.  The actual return value depends on the underlying
59// C library's vswprintf implementation.
60BASE_API int vswprintf(wchar_t* buffer, size_t size,
61                       const wchar_t* format, va_list arguments)
62    WPRINTF_FORMAT(3, 0);
63
64// Some of these implementations need to be inlined.
65
66// We separate the declaration from the implementation of this inline
67// function just so the PRINTF_FORMAT works.
68inline int snprintf(char* buffer, size_t size, const char* format, ...)
69    PRINTF_FORMAT(3, 4);
70inline int snprintf(char* buffer, size_t size, const char* format, ...) {
71  va_list arguments;
72  va_start(arguments, format);
73  int result = vsnprintf(buffer, size, format, arguments);
74  va_end(arguments);
75  return result;
76}
77
78// We separate the declaration from the implementation of this inline
79// function just so the WPRINTF_FORMAT works.
80inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...)
81    WPRINTF_FORMAT(3, 4);
82inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) {
83  va_list arguments;
84  va_start(arguments, format);
85  int result = vswprintf(buffer, size, format, arguments);
86  va_end(arguments);
87  return result;
88}
89
90// BSD-style safe and consistent string copy functions.
91// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
92// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
93// long as |dst_size| is not 0.  Returns the length of |src| in characters.
94// If the return value is >= dst_size, then the output was truncated.
95// NOTE: All sizes are in number of characters, NOT in bytes.
96BASE_API size_t strlcpy(char* dst, const char* src, size_t dst_size);
97BASE_API size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size);
98
99// Scan a wprintf format string to determine whether it's portable across a
100// variety of systems.  This function only checks that the conversion
101// specifiers used by the format string are supported and have the same meaning
102// on a variety of systems.  It doesn't check for other errors that might occur
103// within a format string.
104//
105// Nonportable conversion specifiers for wprintf are:
106//  - 's' and 'c' without an 'l' length modifier.  %s and %c operate on char
107//     data on all systems except Windows, which treat them as wchar_t data.
108//     Use %ls and %lc for wchar_t data instead.
109//  - 'S' and 'C', which operate on wchar_t data on all systems except Windows,
110//     which treat them as char data.  Use %ls and %lc for wchar_t data
111//     instead.
112//  - 'F', which is not identified by Windows wprintf documentation.
113//  - 'D', 'O', and 'U', which are deprecated and not available on all systems.
114//     Use %ld, %lo, and %lu instead.
115//
116// Note that there is no portable conversion specifier for char data when
117// working with wprintf.
118//
119// This function is intended to be called from base::vswprintf.
120BASE_API bool IsWprintfFormatPortable(const wchar_t* format);
121
// Lower-cases |c| when it is an ASCII upper-case letter ('A'..'Z'); every
// other value is returned unchanged.  The standard library's tolower is locale
// sensitive, which is why it is not used here.
template <class Char> inline Char ToLowerASCII(Char c) {
  if (c >= 'A' && c <= 'Z')
    return c + ('a' - 'A');
  return c;
}
127
// Upper-cases |c| when it is an ASCII lower-case letter ('a'..'z'); every
// other value is returned unchanged.  The standard library's toupper is locale
// sensitive, which is why it is not used here.
template <class Char> inline Char ToUpperASCII(Char c) {
  if (c >= 'a' && c <= 'z')
    return c + ('A' - 'a');
  return c;
}
133
134// Function objects to aid in comparing/searching strings.
135
// Equality predicate: |x| and |y| compare equal when the C library's tolower
// maps them to the same value.
template<typename Char> struct CaseInsensitiveCompare {
  bool operator()(Char x, Char y) const {
    // TODO(darin): Do we really want to do locale sensitive comparisons here?
    // See http://crbug.com/24917
    const int folded_x = tolower(x);
    const int folded_y = tolower(y);
    return folded_x == folded_y;
  }
};
144
// Equality predicate: |x| and |y| compare equal when ToLowerASCII maps them to
// the same value.
template<typename Char> struct CaseInsensitiveCompareASCII {
  bool operator()(Char x, Char y) const {
    const Char folded_x = ToLowerASCII(x);
    const Char folded_y = ToLowerASCII(y);
    return folded_x == folded_y;
  }
};
151
152}  // namespace base
153
154#if defined(OS_WIN)
155#include "base/string_util_win.h"
156#elif defined(OS_POSIX)
157#include "base/string_util_posix.h"
158#else
159#error Define string operations appropriately for your platform
160#endif
161
162// These threadsafe functions return references to globally unique empty
163// strings.
164//
165// DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS.
166// There is only one case where you should use these: functions which need to
167// return a string by reference (e.g. as a class member accessor), and don't
168// have an empty string to use (e.g. in an error case).  These should not be
169// used as initializers, function arguments, or return values for functions
170// which return by value or outparam.
171BASE_API const std::string& EmptyString();
172BASE_API const std::wstring& EmptyWString();
173BASE_API const string16& EmptyString16();
174
175BASE_API extern const wchar_t kWhitespaceWide[];
176BASE_API extern const char16 kWhitespaceUTF16[];
177extern const char kWhitespaceASCII[];
178
179extern const char kUtf8ByteOrderMark[];
180
181// Removes characters in remove_chars from anywhere in input.  Returns true if
182// any characters were removed.
183// NOTE: Safe to use the same variable for both input and output.
184BASE_API bool RemoveChars(const std::wstring& input,
185                          const wchar_t remove_chars[],
186                          std::wstring* output);
187BASE_API bool RemoveChars(const string16& input,
188                          const char16 remove_chars[],
189                          string16* output);
190BASE_API bool RemoveChars(const std::string& input,
191                          const char remove_chars[],
192                          std::string* output);
193
194// Removes characters in trim_chars from the beginning and end of input.
195// NOTE: Safe to use the same variable for both input and output.
196BASE_API bool TrimString(const std::wstring& input,
197                         const wchar_t trim_chars[],
198                         std::wstring* output);
199BASE_API bool TrimString(const string16& input,
200                         const char16 trim_chars[],
201                         string16* output);
202BASE_API bool TrimString(const std::string& input,
203                         const char trim_chars[],
204                         std::string* output);
205
206// Truncates a string to the nearest UTF-8 character that will leave
207// the string less than or equal to the specified byte size.
208BASE_API void TruncateUTF8ToByteSize(const std::string& input,
209                                     const size_t byte_size,
210                                     std::string* output);
211
212// Trims any whitespace from either end of the input string.  Returns where
213// whitespace was found.
// The non-wide (std::string) version is:
// * TrimWhitespaceASCII()
//   This function is for ASCII strings and only looks for ASCII whitespace.
// The std::string overload of TrimWhitespace() declared further down is
// deprecated and calls TrimWhitespaceASCII().
218// NOTE: Safe to use the same variable for both input and output.
// Bit flags identifying the ends of a string: TRIM_LEADING and TRIM_TRAILING
// occupy distinct bits and TRIM_ALL is their union.  The TrimWhitespace*()
// functions declared below take a value of this type as |positions| and also
// return one.
enum TrimPositions {
  TRIM_NONE     = 0,
  TRIM_LEADING  = 1 << 0,
  TRIM_TRAILING = 1 << 1,
  TRIM_ALL      = TRIM_LEADING | TRIM_TRAILING,
};
225BASE_API TrimPositions TrimWhitespace(const std::wstring& input,
226                                      TrimPositions positions,
227                                      std::wstring* output);
228BASE_API TrimPositions TrimWhitespace(const string16& input,
229                                      TrimPositions positions,
230                                      string16* output);
231BASE_API TrimPositions TrimWhitespaceASCII(const std::string& input,
232                                           TrimPositions positions,
233                                           std::string* output);
234
235// Deprecated. This function is only for backward compatibility and calls
236// TrimWhitespaceASCII().
237BASE_API TrimPositions TrimWhitespace(const std::string& input,
238                                      TrimPositions positions,
239                                      std::string* output);
240
// Searches for CR or LF characters.  Removes all contiguous whitespace
// strings that contain them.  This is useful when trying to deal with text
// copied from terminals.
244// Returns |text|, with the following three transformations:
245// (1) Leading and trailing whitespace is trimmed.
246// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace
247//     sequences containing a CR or LF are trimmed.
248// (3) All other whitespace sequences are converted to single spaces.
249BASE_API std::wstring CollapseWhitespace(const std::wstring& text,
250                                         bool trim_sequences_with_line_breaks);
251BASE_API string16 CollapseWhitespace(const string16& text,
252                                     bool trim_sequences_with_line_breaks);
253BASE_API std::string CollapseWhitespaceASCII(
254    const std::string& text, bool trim_sequences_with_line_breaks);
255
256// Returns true if the passed string is empty or contains only white-space
257// characters.
258BASE_API bool ContainsOnlyWhitespaceASCII(const std::string& str);
259BASE_API bool ContainsOnlyWhitespace(const string16& str);
260
261// Returns true if |input| is empty or contains only characters found in
262// |characters|.
263BASE_API bool ContainsOnlyChars(const std::wstring& input,
264                                const std::wstring& characters);
265BASE_API bool ContainsOnlyChars(const string16& input,
266                                const string16& characters);
267BASE_API bool ContainsOnlyChars(const std::string& input,
268                                const std::string& characters);
269
270// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
271// beforehand.
272BASE_API std::string WideToASCII(const std::wstring& wide);
273BASE_API std::string UTF16ToASCII(const string16& utf16);
274
275// Converts the given wide string to the corresponding Latin1. This will fail
276// (return false) if any characters are more than 255.
277BASE_API bool WideToLatin1(const std::wstring& wide, std::string* latin1);
278
279// Returns true if the specified string matches the criteria. How can a wide
280// string be 8-bit or UTF8? It contains only characters that are < 256 (in the
281// first case) or characters that use only 8-bits and whose 8-bit
282// representation looks like a UTF-8 string (the second case).
283//
284// Note that IsStringUTF8 checks not only if the input is structurally
285// valid but also if it doesn't contain any non-character codepoint
286// (e.g. U+FFFE). It's done on purpose because all the existing callers want
287// to have the maximum 'discriminating' power from other encodings. If
288// there's a use case for just checking the structural validity, we have to
289// add a new function for that.
290BASE_API bool IsStringUTF8(const std::string& str);
291BASE_API bool IsStringASCII(const std::wstring& str);
292BASE_API bool IsStringASCII(const base::StringPiece& str);
293BASE_API bool IsStringASCII(const string16& str);
294
295// Converts the elements of the given string.  This version uses a pointer to
296// clearly differentiate it from the non-pointer variant.
297template <class str> inline void StringToLowerASCII(str* s) {
298  for (typename str::iterator i = s->begin(); i != s->end(); ++i)
299    *i = base::ToLowerASCII(*i);
300}
301
// Copying variant: |s| is left untouched; a copy is made, lower-cased through
// the pointer overload, and returned.
template <class str> inline str StringToLowerASCII(const str& s) {
  // for std::string and std::wstring
  str lowered(s);
  StringToLowerASCII(&lowered);
  return lowered;
}
308
309// Converts the elements of the given string.  This version uses a pointer to
310// clearly differentiate it from the non-pointer variant.
311template <class str> inline void StringToUpperASCII(str* s) {
312  for (typename str::iterator i = s->begin(); i != s->end(); ++i)
313    *i = base::ToUpperASCII(*i);
314}
315
// Copying variant: |s| is left untouched; a copy is made, upper-cased through
// the pointer overload, and returned.
template <class str> inline str StringToUpperASCII(const str& s) {
  // for std::string and std::wstring
  str uppered(s);
  StringToUpperASCII(&uppered);
  return uppered;
}
322
323// Compare the lower-case form of the given string against the given ASCII
324// string.  This is useful for doing checking if an input string matches some
325// token, and it is optimized to avoid intermediate string copies.  This API is
326// borrowed from the equivalent APIs in Mozilla.
327BASE_API bool LowerCaseEqualsASCII(const std::string& a, const char* b);
328BASE_API bool LowerCaseEqualsASCII(const std::wstring& a, const char* b);
329BASE_API bool LowerCaseEqualsASCII(const string16& a, const char* b);
330
331// Same thing, but with string iterators instead.
332BASE_API bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
333                                   std::string::const_iterator a_end,
334                                   const char* b);
335BASE_API bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
336                                   std::wstring::const_iterator a_end,
337                                   const char* b);
338BASE_API bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
339                                   string16::const_iterator a_end,
340                                   const char* b);
341BASE_API bool LowerCaseEqualsASCII(const char* a_begin,
342                                   const char* a_end,
343                                   const char* b);
344BASE_API bool LowerCaseEqualsASCII(const wchar_t* a_begin,
345                                   const wchar_t* a_end,
346                                   const char* b);
347BASE_API bool LowerCaseEqualsASCII(const char16* a_begin,
348                                   const char16* a_end,
349                                   const char* b);
350
351// Performs a case-sensitive string compare. The behavior is undefined if both
352// strings are not ASCII.
353BASE_API bool EqualsASCII(const string16& a, const base::StringPiece& b);
354
355// Returns true if str starts with search, or false otherwise.
356BASE_API bool StartsWithASCII(const std::string& str,
357                              const std::string& search,
358                              bool case_sensitive);
359BASE_API bool StartsWith(const std::wstring& str,
360                         const std::wstring& search,
361                         bool case_sensitive);
362BASE_API bool StartsWith(const string16& str,
363                         const string16& search,
364                         bool case_sensitive);
365
366// Returns true if str ends with search, or false otherwise.
367BASE_API bool EndsWith(const std::string& str,
368                       const std::string& search,
369                       bool case_sensitive);
370BASE_API bool EndsWith(const std::wstring& str,
371                       const std::wstring& search,
372                       bool case_sensitive);
373BASE_API bool EndsWith(const string16& str,
374                       const string16& search,
375                       bool case_sensitive);
376
377
378// Determines the type of ASCII character, independent of locale (the C
379// library versions will change based on locale).
// True only for space, carriage return, line feed and horizontal tab.
template <typename Char>
inline bool IsAsciiWhitespace(Char c) {
  if (c == ' ')
    return true;
  if (c == '\r')
    return true;
  if (c == '\n')
    return true;
  return c == '\t';
}
// True for the ASCII letters 'A'..'Z' and 'a'..'z'.
template <typename Char>
inline bool IsAsciiAlpha(Char c) {
  if ((c >= 'A') && (c <= 'Z'))
    return true;  // Upper-case range.
  return (c >= 'a') && (c <= 'z');
}
// True for the ASCII decimal digits '0'..'9'.
template <typename Char>
inline bool IsAsciiDigit(Char c) {
  if (c < '0')
    return false;
  return c <= '9';
}
392
// True for '0'..'9', 'A'..'F' and 'a'..'f'.
template <typename Char>
inline bool IsHexDigit(Char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'A' && c <= 'F')
    return true;
  return c >= 'a' && c <= 'f';
}
399
// Converts a hexadecimal digit character to its numeric value (0 through 15).
// |c| is expected to satisfy IsHexDigit, which is checked with DCHECK; for any
// other input the result is 0.
template <typename Char>
inline Char HexDigitToInt(Char c) {
  DCHECK(IsHexDigit(c));
  Char value = 0;
  if (c >= '0' && c <= '9')
    value = c - '0';
  else if (c >= 'A' && c <= 'F')
    value = c - 'A' + 10;
  else if (c >= 'a' && c <= 'f')
    value = c - 'a' + 10;
  return value;
}
411
412// Returns true if it's a whitespace character.
413inline bool IsWhitespace(wchar_t c) {
414  return wcschr(kWhitespaceWide, c) != NULL;
415}
416
// Units in which a byte count can be expressed.  GetByteDisplayUnits() returns
// one of these, and FormatBytes()/FormatSpeed() take one as their |units|
// argument.
enum DataUnits {
  DATA_UNITS_BYTE = 0,
  DATA_UNITS_KIBIBYTE,
  DATA_UNITS_MEBIBYTE,
  DATA_UNITS_GIBIBYTE,
};
423
424// Return the unit type that is appropriate for displaying the amount of bytes
425// passed in.
426BASE_API DataUnits GetByteDisplayUnits(int64 bytes);
427
// Return the byte count as a human-readable string, expressed in the units
// specified by |units|, with an optional unit suffix.
430// Ex: FormatBytes(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB"
431// Ex: FormatBytes(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1"
432BASE_API string16 FormatBytes(int64 bytes, DataUnits units, bool show_units);
433
434// As above, but with "/s" units.
435// Ex: FormatSpeed(512, DATA_UNITS_KIBIBYTE, true) => "0.5 KB/s"
436// Ex: FormatSpeed(10*1024, DATA_UNITS_MEBIBYTE, false) => "0.1"
437BASE_API string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units);
438
// Return a number formatted with separators according to the user's locale.
440// Ex: FormatNumber(1234567) => 1,234,567
441BASE_API string16 FormatNumber(int64 number);
442
443// Starting at |start_offset| (usually 0), replace the first instance of
444// |find_this| with |replace_with|.
445BASE_API void ReplaceFirstSubstringAfterOffset(string16* str,
446                                               string16::size_type start_offset,
447                                               const string16& find_this,
448                                               const string16& replace_with);
449BASE_API void ReplaceFirstSubstringAfterOffset(
450    std::string* str,
451    std::string::size_type start_offset,
452    const std::string& find_this,
453    const std::string& replace_with);
454
455// Starting at |start_offset| (usually 0), look through |str| and replace all
456// instances of |find_this| with |replace_with|.
457//
458// This does entire substrings; use std::replace in <algorithm> for single
459// characters, for example:
460//   std::replace(str.begin(), str.end(), 'a', 'b');
461BASE_API void ReplaceSubstringsAfterOffset(string16* str,
462                                           string16::size_type start_offset,
463                                           const string16& find_this,
464                                           const string16& replace_with);
465BASE_API void ReplaceSubstringsAfterOffset(std::string* str,
466                                           std::string::size_type start_offset,
467                                           const std::string& find_this,
468                                           const std::string& replace_with);
469
470// This is mpcomplete's pattern for saving a string copy when dealing with
471// a function that writes results into a wchar_t[] and wanting the result to
472// end up in a std::wstring.  It ensures that the std::wstring's internal
473// buffer has enough room to store the characters to be written into it, and
474// sets its .length() attribute to the right value.
475//
476// The reserve() call allocates the memory required to hold the string
477// plus a terminating null.  This is done because resize() isn't
478// guaranteed to reserve space for the null.  The resize() call is
479// simply the only way to change the string's 'length' member.
480//
481// XXX-performance: the call to wide.resize() takes linear time, since it fills
482// the string's buffer with nulls.  I call it to change the length of the
483// string (needed because writing directly to the buffer doesn't do this).
484// Perhaps there's a constant-time way to change the string's length.
// Sizes |*str| to hold |length_with_null| - 1 characters (reserving room for
// |length_with_null|) and returns a pointer to its first character so that a
// C-style API can write directly into the string's buffer.
// A |length_with_null| of 0 is treated as a request for an empty string.
template <class string_type>
inline typename string_type::value_type* WriteInto(string_type* str,
                                                   size_t length_with_null) {
  // |length_with_null| counts the terminating null, so the string itself holds
  // one character fewer.  Guard the subtraction: size_t is unsigned, so a
  // |length_with_null| of 0 would otherwise wrap around to a huge value and
  // make resize() fail.
  const size_t length = length_with_null > 0 ? length_with_null - 1 : 0;
  str->reserve(length_with_null);
  str->resize(length);
  // When the result is empty there are no characters to write through the
  // returned pointer.
  return &((*str)[0]);
}
492
493//-----------------------------------------------------------------------------
494
495// Splits a string into its fields delimited by any of the characters in
496// |delimiters|.  Each field is added to the |tokens| vector.  Returns the
497// number of tokens found.
498BASE_API size_t Tokenize(const std::wstring& str,
499                         const std::wstring& delimiters,
500                         std::vector<std::wstring>* tokens);
501BASE_API size_t Tokenize(const string16& str,
502                         const string16& delimiters,
503                         std::vector<string16>* tokens);
504BASE_API size_t Tokenize(const std::string& str,
505                         const std::string& delimiters,
506                         std::vector<std::string>* tokens);
507BASE_API size_t Tokenize(const base::StringPiece& str,
508                         const base::StringPiece& delimiters,
509                         std::vector<base::StringPiece>* tokens);
510
511// Does the opposite of SplitString().
512BASE_API string16 JoinString(const std::vector<string16>& parts, char16 s);
513BASE_API std::string JoinString(const std::vector<std::string>& parts, char s);
514
515// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively.
516// Additionally, any number of consecutive '$' characters is replaced by that
517// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be
518// NULL. This only allows you to use up to nine replacements.
519BASE_API string16 ReplaceStringPlaceholders(const string16& format_string,
520                                            const std::vector<string16>& subst,
521                                            std::vector<size_t>* offsets);
522
523BASE_API std::string ReplaceStringPlaceholders(
524    const base::StringPiece& format_string,
525    const std::vector<std::string>& subst,
526    std::vector<size_t>* offsets);
527
// Single-string shortcut for ReplaceStringPlaceholders. |offset| may be NULL.
529BASE_API string16 ReplaceStringPlaceholders(const string16& format_string,
530                                            const string16& a,
531                                            size_t* offset);
532
533// Returns true if the string passed in matches the pattern. The pattern
534// string can contain wildcards like * and ?
535// The backslash character (\) is an escape character for * and ?
536// We limit the patterns to having a max of 16 * or ? characters.
537// ? matches 0 or 1 character, while * matches 0 or more characters.
538BASE_API bool MatchPattern(const base::StringPiece& string,
539                           const base::StringPiece& pattern);
540BASE_API bool MatchPattern(const string16& string, const string16& pattern);
541
// Hack that maps a char-like type T to its unsigned counterpart, exposed as
// ToUnsigned<T>::Unsigned.  char, signed char and unsigned char all map to
// unsigned char.  Types without a specialization below are left unchanged by
// the primary template.
template <typename T>
struct ToUnsigned {
  typedef T Unsigned;
};

// Narrow character types.
template <>
struct ToUnsigned<signed char> {
  typedef unsigned char Unsigned;
};
template <>
struct ToUnsigned<char> {
  typedef unsigned char Unsigned;
};

// 16-bit code units stored in a short.
template <>
struct ToUnsigned<short> {
  typedef unsigned short Unsigned;
};

// wchar_t: the unsigned type depends on which WCHAR_T_IS_* macro is defined.
// If neither is defined, no Unsigned member is provided.
template <>
struct ToUnsigned<wchar_t> {
#if defined(WCHAR_T_IS_UTF16)
  typedef unsigned short Unsigned;
#elif defined(WCHAR_T_IS_UTF32)
  typedef uint32 Unsigned;
#endif
};
570
571#endif  // BASE_STRING_UTIL_H_
572