1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
6#define BASE_STRINGS_STRING_TOKENIZER_H_
7
8#include <algorithm>
9#include <string>
10
11#include "base/strings/string_piece.h"
12
13namespace base {
14
// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that, with each step (see the GetNext method), updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
19//
20// Warning: be careful not to pass a C string into the 2-arg constructor:
21// StringTokenizer t("this is a test", " ");  // WRONG
22// This will create a temporary std::string, save the begin() and end()
23// iterators, and then the string will be freed before we actually start
24// tokenizing it.
25// Instead, use a std::string or use the 3 arg constructor of CStringTokenizer.
26//
27//
28// EXAMPLE 1:
29//
30//   char input[] = "this is a test";
31//   CStringTokenizer t(input, input + strlen(input), " ");
32//   while (t.GetNext()) {
33//     printf("%s\n", t.token().c_str());
34//   }
35//
36// Output:
37//
38//   this
39//   is
40//   a
41//   test
42//
43//
44// EXAMPLE 2:
45//
46//   std::string input = "no-cache=\"foo, bar\", private";
47//   StringTokenizer t(input, ", ");
48//   t.set_quote_chars("\"");
49//   while (t.GetNext()) {
50//     printf("%s\n", t.token().c_str());
51//   }
52//
53// Output:
54//
55//   no-cache="foo, bar"
56//   private
57//
58//
59// EXAMPLE 3:
60//
61//   bool next_is_option = false, next_is_value = false;
62//   std::string input = "text/html; charset=UTF-8; foo=bar";
63//   StringTokenizer t(input, "; =");
64//   t.set_options(StringTokenizer::RETURN_DELIMS);
65//   while (t.GetNext()) {
66//     if (t.token_is_delim()) {
67//       switch (*t.token_begin()) {
68//         case ';':
69//           next_is_option = true;
70//           break;
71//         case '=':
72//           next_is_value = true;
73//           break;
74//       }
75//     } else {
76//       const char* label;
77//       if (next_is_option) {
78//         label = "option-name";
79//         next_is_option = false;
80//       } else if (next_is_value) {
81//         label = "option-value";
82//         next_is_value = false;
83//       } else {
84//         label = "mime-type";
85//       }
86//       printf("%s: %s\n", label, t.token().c_str());
87//     }
88//   }
89//
90//
91template <class str, class const_iterator>
92class StringTokenizerT {
93 public:
94  typedef typename str::value_type char_type;
95
96  // Options that may be pass to set_options()
97  enum {
98    // Specifies the delimiters should be returned as tokens
99    RETURN_DELIMS = 1 << 0,
100  };
101
102  // The string object must live longer than the tokenizer.  (In particular this
103  // should not be constructed with a temporary.)
104  StringTokenizerT(const str& string,
105                   const str& delims) {
106    Init(string.begin(), string.end(), delims);
107  }
108
109  StringTokenizerT(const_iterator string_begin,
110                   const_iterator string_end,
111                   const str& delims) {
112    Init(string_begin, string_end, delims);
113  }
114
115  // Set the options for this tokenizer.  By default, this is 0.
116  void set_options(int options) { options_ = options; }
117
118  // Set the characters to regard as quotes.  By default, this is empty.  When
119  // a quote char is encountered, the tokenizer will switch into a mode where
120  // it ignores delimiters that it finds.  It switches out of this mode once it
121  // finds another instance of the quote char.  If a backslash is encountered
122  // within a quoted string, then the next character is skipped.
123  void set_quote_chars(const str& quotes) { quotes_ = quotes; }
124
125  // Call this method to advance the tokenizer to the next delimiter.  This
126  // returns false if the tokenizer is complete.  This method must be called
127  // before calling any of the token* methods.
128  bool GetNext() {
129    if (quotes_.empty() && options_ == 0)
130      return QuickGetNext();
131    else
132      return FullGetNext();
133  }
134
135  // Start iterating through tokens from the beginning of the string.
136  void Reset() {
137    token_end_ = start_pos_;
138  }
139
140  // Returns true if token is a delimiter.  When the tokenizer is constructed
141  // with the RETURN_DELIMS option, this method can be used to check if the
142  // returned token is actually a delimiter.
143  bool token_is_delim() const { return token_is_delim_; }
144
145  // If GetNext() returned true, then these methods may be used to read the
146  // value of the token.
147  const_iterator token_begin() const { return token_begin_; }
148  const_iterator token_end() const { return token_end_; }
149  str token() const { return str(token_begin_, token_end_); }
150  base::StringPiece token_piece() const {
151    return base::StringPiece(&*token_begin_,
152                             std::distance(token_begin_, token_end_));
153  }
154
155 private:
156  void Init(const_iterator string_begin,
157            const_iterator string_end,
158            const str& delims) {
159    start_pos_ = string_begin;
160    token_begin_ = string_begin;
161    token_end_ = string_begin;
162    end_ = string_end;
163    delims_ = delims;
164    options_ = 0;
165    token_is_delim_ = false;
166  }
167
168  // Implementation of GetNext() for when we have no quote characters. We have
169  // two separate implementations because AdvanceOne() is a hot spot in large
170  // text files with large tokens.
171  bool QuickGetNext() {
172    token_is_delim_ = false;
173    for (;;) {
174      token_begin_ = token_end_;
175      if (token_end_ == end_)
176        return false;
177      ++token_end_;
178      if (delims_.find(*token_begin_) == str::npos)
179        break;
180      // else skip over delimiter.
181    }
182    while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
183      ++token_end_;
184    return true;
185  }
186
187  // Implementation of GetNext() for when we have to take quotes into account.
188  bool FullGetNext() {
189    AdvanceState state;
190    token_is_delim_ = false;
191    for (;;) {
192      token_begin_ = token_end_;
193      if (token_end_ == end_)
194        return false;
195      ++token_end_;
196      if (AdvanceOne(&state, *token_begin_))
197        break;
198      if (options_ & RETURN_DELIMS) {
199        token_is_delim_ = true;
200        return true;
201      }
202      // else skip over delimiter.
203    }
204    while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
205      ++token_end_;
206    return true;
207  }
208
209  bool IsDelim(char_type c) const {
210    return delims_.find(c) != str::npos;
211  }
212
213  bool IsQuote(char_type c) const {
214    return quotes_.find(c) != str::npos;
215  }
216
217  struct AdvanceState {
218    bool in_quote;
219    bool in_escape;
220    char_type quote_char;
221    AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
222  };
223
224  // Returns true if a delimiter was not hit.
225  bool AdvanceOne(AdvanceState* state, char_type c) {
226    if (state->in_quote) {
227      if (state->in_escape) {
228        state->in_escape = false;
229      } else if (c == '\\') {
230        state->in_escape = true;
231      } else if (c == state->quote_char) {
232        state->in_quote = false;
233      }
234    } else {
235      if (IsDelim(c))
236        return false;
237      state->in_quote = IsQuote(state->quote_char = c);
238    }
239    return true;
240  }
241
242  const_iterator start_pos_;
243  const_iterator token_begin_;
244  const_iterator token_end_;
245  const_iterator end_;
246  str delims_;
247  str quotes_;
248  int options_;
249  bool token_is_delim_;
250};
251
252typedef StringTokenizerT<std::string, std::string::const_iterator>
253    StringTokenizer;
254typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
255    WStringTokenizer;
256typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
257
258}  // namespace base
259
260#endif  // BASE_STRINGS_STRING_TOKENIZER_H_
261