1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef BASE_STRING_TOKENIZER_H_
6#define BASE_STRING_TOKENIZER_H_
7#pragma once
8
#include <algorithm>
#include <iterator>
#include <string>

#include "base/string_piece.h"
13
// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
18//
19// Warning: be careful not to pass a C string into the 2-arg constructor:
20// StringTokenizer t("this is a test", " ");  // WRONG
21// This will create a temporary std::string, save the begin() and end()
22// iterators, and then the string will be freed before we actually start
23// tokenizing it.
24// Instead, use a std::string or use the 3 arg constructor of CStringTokenizer.
25//
26//
27// EXAMPLE 1:
28//
29//   char input[] = "this is a test";
30//   CStringTokenizer t(input, input + strlen(input), " ");
31//   while (t.GetNext()) {
32//     printf("%s\n", t.token().c_str());
33//   }
34//
35// Output:
36//
37//   this
38//   is
39//   a
40//   test
41//
42//
43// EXAMPLE 2:
44//
45//   std::string input = "no-cache=\"foo, bar\", private";
46//   StringTokenizer t(input, ", ");
47//   t.set_quote_chars("\"");
48//   while (t.GetNext()) {
49//     printf("%s\n", t.token().c_str());
50//   }
51//
52// Output:
53//
54//   no-cache="foo, bar"
55//   private
56//
57//
58// EXAMPLE 3:
59//
60//   bool next_is_option = false, next_is_value = false;
61//   std::string input = "text/html; charset=UTF-8; foo=bar";
62//   StringTokenizer t(input, "; =");
63//   t.set_options(StringTokenizer::RETURN_DELIMS);
64//   while (t.GetNext()) {
65//     if (t.token_is_delim()) {
66//       switch (*t.token_begin()) {
67//         case ';':
68//           next_is_option = true;
69//           break;
70//         case '=':
71//           next_is_value = true;
72//           break;
73//       }
74//     } else {
75//       const char* label;
76//       if (next_is_option) {
77//         label = "option-name";
78//         next_is_option = false;
79//       } else if (next_is_value) {
80//         label = "option-value";
81//         next_is_value = false;
82//       } else {
83//         label = "mime-type";
84//       }
85//       printf("%s: %s\n", label, t.token().c_str());
86//     }
87//   }
88//
89//
90template <class str, class const_iterator>
91class StringTokenizerT {
92 public:
93  typedef typename str::value_type char_type;
94
95  // Options that may be pass to set_options()
96  enum {
97    // Specifies the delimiters should be returned as tokens
98    RETURN_DELIMS = 1 << 0,
99  };
100
101  // The string object must live longer than the tokenizer.  (In particular this
102  // should not be constructed with a temporary.)
103  StringTokenizerT(const str& string,
104                   const str& delims) {
105    Init(string.begin(), string.end(), delims);
106  }
107
108  StringTokenizerT(const_iterator string_begin,
109                   const_iterator string_end,
110                   const str& delims) {
111    Init(string_begin, string_end, delims);
112  }
113
114  // Set the options for this tokenizer.  By default, this is 0.
115  void set_options(int options) { options_ = options; }
116
117  // Set the characters to regard as quotes.  By default, this is empty.  When
118  // a quote char is encountered, the tokenizer will switch into a mode where
119  // it ignores delimiters that it finds.  It switches out of this mode once it
120  // finds another instance of the quote char.  If a backslash is encountered
121  // within a quoted string, then the next character is skipped.
122  void set_quote_chars(const str& quotes) { quotes_ = quotes; }
123
124  // Call this method to advance the tokenizer to the next delimiter.  This
125  // returns false if the tokenizer is complete.  This method must be called
126  // before calling any of the token* methods.
127  bool GetNext() {
128    if (quotes_.empty() && options_ == 0)
129      return QuickGetNext();
130    else
131      return FullGetNext();
132  }
133
134  // Start iterating through tokens from the beginning of the string.
135  void Reset() {
136    token_end_ = start_pos_;
137  }
138
139  // Returns true if token is a delimiter.  When the tokenizer is constructed
140  // with the RETURN_DELIMS option, this method can be used to check if the
141  // returned token is actually a delimiter.
142  bool token_is_delim() const { return token_is_delim_; }
143
144  // If GetNext() returned true, then these methods may be used to read the
145  // value of the token.
146  const_iterator token_begin() const { return token_begin_; }
147  const_iterator token_end() const { return token_end_; }
148  str token() const { return str(token_begin_, token_end_); }
149  base::StringPiece token_piece() const {
150    return base::StringPiece(&*token_begin_,
151                             std::distance(token_begin_, token_end_));
152  }
153
154 private:
155  void Init(const_iterator string_begin,
156            const_iterator string_end,
157            const str& delims) {
158    start_pos_ = string_begin;
159    token_begin_ = string_begin;
160    token_end_ = string_begin;
161    end_ = string_end;
162    delims_ = delims;
163    options_ = 0;
164    token_is_delim_ = false;
165  }
166
167  // Implementation of GetNext() for when we have no quote characters. We have
168  // two separate implementations because AdvanceOne() is a hot spot in large
169  // text files with large tokens.
170  bool QuickGetNext() {
171    token_is_delim_ = false;
172    for (;;) {
173      token_begin_ = token_end_;
174      if (token_end_ == end_)
175        return false;
176      ++token_end_;
177      if (delims_.find(*token_begin_) == str::npos)
178        break;
179      // else skip over delimiter.
180    }
181    while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
182      ++token_end_;
183    return true;
184  }
185
186  // Implementation of GetNext() for when we have to take quotes into account.
187  bool FullGetNext() {
188    AdvanceState state;
189    token_is_delim_ = false;
190    for (;;) {
191      token_begin_ = token_end_;
192      if (token_end_ == end_)
193        return false;
194      ++token_end_;
195      if (AdvanceOne(&state, *token_begin_))
196        break;
197      if (options_ & RETURN_DELIMS) {
198        token_is_delim_ = true;
199        return true;
200      }
201      // else skip over delimiter.
202    }
203    while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
204      ++token_end_;
205    return true;
206  }
207
208  bool IsDelim(char_type c) const {
209    return delims_.find(c) != str::npos;
210  }
211
212  bool IsQuote(char_type c) const {
213    return quotes_.find(c) != str::npos;
214  }
215
216  struct AdvanceState {
217    bool in_quote;
218    bool in_escape;
219    char_type quote_char;
220    AdvanceState() : in_quote(false), in_escape(false) {}
221  };
222
223  // Returns true if a delimiter was not hit.
224  bool AdvanceOne(AdvanceState* state, char_type c) {
225    if (state->in_quote) {
226      if (state->in_escape) {
227        state->in_escape = false;
228      } else if (c == '\\') {
229        state->in_escape = true;
230      } else if (c == state->quote_char) {
231        state->in_quote = false;
232      }
233    } else {
234      if (IsDelim(c))
235        return false;
236      state->in_quote = IsQuote(state->quote_char = c);
237    }
238    return true;
239  }
240
241  const_iterator start_pos_;
242  const_iterator token_begin_;
243  const_iterator token_end_;
244  const_iterator end_;
245  str delims_;
246  str quotes_;
247  int options_;
248  bool token_is_delim_;
249};
250
251typedef StringTokenizerT<std::string, std::string::const_iterator>
252    StringTokenizer;
253typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
254    WStringTokenizer;
255typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
256
257#endif  // BASE_STRING_TOKENIZER_H_
258