1/*
2 * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
3 * $Revision: 602520 $
4 * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $
5 *
6 * ====================================================================
7 * Licensed to the Apache Software Foundation (ASF) under one
8 * or more contributor license agreements.  See the NOTICE file
9 * distributed with this work for additional information
10 * regarding copyright ownership.  The ASF licenses this file
11 * to you under the Apache License, Version 2.0 (the
12 * "License"); you may not use this file except in compliance
13 * with the License.  You may obtain a copy of the License at
14 *
15 *   http://www.apache.org/licenses/LICENSE-2.0
16 *
17 * Unless required by applicable law or agreed to in writing,
18 * software distributed under the License is distributed on an
19 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
20 * KIND, either express or implied.  See the License for the
21 * specific language governing permissions and limitations
22 * under the License.
23 * ====================================================================
24 *
25 * This software consists of voluntary contributions made by many
26 * individuals on behalf of the Apache Software Foundation.  For more
27 * information on the Apache Software Foundation, please see
28 * <http://www.apache.org/>.
29 *
30 */
31
32package org.apache.http.message;
33
34import java.util.NoSuchElementException;
35
36import org.apache.http.HeaderIterator;
37import org.apache.http.ParseException;
38import org.apache.http.TokenIterator;
39
40/**
41 * Basic implementation of a {@link TokenIterator}.
42 * This implementation parses <tt>#token<tt> sequences as
43 * defined by RFC 2616, section 2.
44 * It extends that definition somewhat beyond US-ASCII.
45 *
46 * @version $Revision: 602520 $
47 */
48public class BasicTokenIterator implements TokenIterator {
49
50    /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
51    // the order of the characters here is adjusted to put the
52    // most likely candidates at the beginning of the collection
53    public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
54
55
56    /** The iterator from which to obtain the next header. */
57    protected final HeaderIterator headerIt;
58
59    /**
60     * The value of the current header.
61     * This is the header value that includes {@link #currentToken}.
62     * Undefined if the iteration is over.
63     */
64    protected String currentHeader;
65
66    /**
67     * The token to be returned by the next call to {@link #currentToken}.
68     * <code>null</code> if the iteration is over.
69     */
70    protected String currentToken;
71
72    /**
73     * The position after {@link #currentToken} in {@link #currentHeader}.
74     * Undefined if the iteration is over.
75     */
76    protected int searchPos;
77
78
79    /**
80     * Creates a new instance of {@link BasicTokenIterator}.
81     *
82     * @param headerIterator    the iterator for the headers to tokenize
83     */
84    public BasicTokenIterator(final HeaderIterator headerIterator) {
85        if (headerIterator == null) {
86            throw new IllegalArgumentException
87                ("Header iterator must not be null.");
88        }
89
90        this.headerIt = headerIterator;
91        this.searchPos = findNext(-1);
92    }
93
94
95    // non-javadoc, see interface TokenIterator
96    public boolean hasNext() {
97        return (this.currentToken != null);
98    }
99
100
101    /**
102     * Obtains the next token from this iteration.
103     *
104     * @return  the next token in this iteration
105     *
106     * @throws NoSuchElementException   if the iteration is already over
107     * @throws ParseException   if an invalid header value is encountered
108     */
109    public String nextToken()
110        throws NoSuchElementException, ParseException {
111
112        if (this.currentToken == null) {
113            throw new NoSuchElementException("Iteration already finished.");
114        }
115
116        final String result = this.currentToken;
117        // updates currentToken, may trigger ParseException:
118        this.searchPos = findNext(this.searchPos);
119
120        return result;
121    }
122
123
124    /**
125     * Returns the next token.
126     * Same as {@link #nextToken}, but with generic return type.
127     *
128     * @return  the next token in this iteration
129     *
130     * @throws NoSuchElementException   if there are no more tokens
131     * @throws ParseException   if an invalid header value is encountered
132     */
133    public final Object next()
134        throws NoSuchElementException, ParseException {
135        return nextToken();
136    }
137
138
139    /**
140     * Removing tokens is not supported.
141     *
142     * @throws UnsupportedOperationException    always
143     */
144    public final void remove()
145        throws UnsupportedOperationException {
146
147        throw new UnsupportedOperationException
148            ("Removing tokens is not supported.");
149    }
150
151
152    /**
153     * Determines the next token.
154     * If found, the token is stored in {@link #currentToken}.
155     * The return value indicates the position after the token
156     * in {@link #currentHeader}. If necessary, the next header
157     * will be obtained from {@link #headerIt}.
158     * If not found, {@link #currentToken} is set to <code>null</code>.
159     *
160     * @param from      the position in the current header at which to
161     *                  start the search, -1 to search in the first header
162     *
163     * @return  the position after the found token in the current header, or
164     *          negative if there was no next token
165     *
166     * @throws ParseException   if an invalid header value is encountered
167     */
168    protected int findNext(int from)
169        throws ParseException {
170
171        if (from < 0) {
172            // called from the constructor, initialize the first header
173            if (!this.headerIt.hasNext()) {
174                return -1;
175            }
176            this.currentHeader = this.headerIt.nextHeader().getValue();
177            from = 0;
178        } else {
179            // called after a token, make sure there is a separator
180            from = findTokenSeparator(from);
181        }
182
183        int start = findTokenStart(from);
184        if (start < 0) {
185            this.currentToken = null;
186            return -1; // nothing found
187        }
188
189        int end = findTokenEnd(start);
190        this.currentToken = createToken(this.currentHeader, start, end);
191        return end;
192    }
193
194
195    /**
196     * Creates a new token to be returned.
197     * Called from {@link #findNext findNext} after the token is identified.
198     * The default implementation simply calls
199     * {@link java.lang.String#substring String.substring}.
200     * <br/>
201     * If header values are significantly longer than tokens, and some
202     * tokens are permanently referenced by the application, there can
203     * be problems with garbage collection. A substring will hold a
204     * reference to the full characters of the original string and
205     * therefore occupies more memory than might be expected.
206     * To avoid this, override this method and create a new string
207     * instead of a substring.
208     *
209     * @param value     the full header value from which to create a token
210     * @param start     the index of the first token character
211     * @param end       the index after the last token character
212     *
213     * @return  a string representing the token identified by the arguments
214     */
215    protected String createToken(String value, int start, int end) {
216        return value.substring(start, end);
217    }
218
219
220    /**
221     * Determines the starting position of the next token.
222     * This method will iterate over headers if necessary.
223     *
224     * @param from      the position in the current header at which to
225     *                  start the search
226     *
227     * @return  the position of the token start in the current header,
228     *          negative if no token start could be found
229     */
230    protected int findTokenStart(int from) {
231        if (from < 0) {
232            throw new IllegalArgumentException
233                ("Search position must not be negative: " + from);
234        }
235
236        boolean found = false;
237        while (!found && (this.currentHeader != null)) {
238
239            final int to = this.currentHeader.length();
240            while (!found && (from < to)) {
241
242                final char ch = this.currentHeader.charAt(from);
243                if (isTokenSeparator(ch) || isWhitespace(ch)) {
244                    // whitspace and token separators are skipped
245                    from++;
246                } else if (isTokenChar(this.currentHeader.charAt(from))) {
247                    // found the start of a token
248                    found = true;
249                } else {
250                    throw new ParseException
251                        ("Invalid character before token (pos " + from +
252                         "): " + this.currentHeader);
253                }
254            }
255            if (!found) {
256                if (this.headerIt.hasNext()) {
257                    this.currentHeader = this.headerIt.nextHeader().getValue();
258                    from = 0;
259                } else {
260                    this.currentHeader = null;
261                }
262            }
263        } // while headers
264
265        return found ? from : -1;
266    }
267
268
269    /**
270     * Determines the position of the next token separator.
271     * Because of multi-header joining rules, the end of a
272     * header value is a token separator. This method does
273     * therefore not need to iterate over headers.
274     *
275     * @param from      the position in the current header at which to
276     *                  start the search
277     *
278     * @return  the position of a token separator in the current header,
279     *          or at the end
280     *
281     * @throws ParseException
282     *         if a new token is found before a token separator.
283     *         RFC 2616, section 2.1 explicitly requires a comma between
284     *         tokens for <tt>#</tt>.
285     */
286    protected int findTokenSeparator(int from) {
287        if (from < 0) {
288            throw new IllegalArgumentException
289                ("Search position must not be negative: " + from);
290        }
291
292        boolean found = false;
293        final int to = this.currentHeader.length();
294        while (!found && (from < to)) {
295            final char ch = this.currentHeader.charAt(from);
296            if (isTokenSeparator(ch)) {
297                found = true;
298            } else if (isWhitespace(ch)) {
299                from++;
300            } else if (isTokenChar(ch)) {
301                throw new ParseException
302                    ("Tokens without separator (pos " + from +
303                     "): " + this.currentHeader);
304            } else {
305                throw new ParseException
306                    ("Invalid character after token (pos " + from +
307                     "): " + this.currentHeader);
308            }
309        }
310
311        return from;
312    }
313
314
315    /**
316     * Determines the ending position of the current token.
317     * This method will not leave the current header value,
318     * since the end of the header value is a token boundary.
319     *
320     * @param from      the position of the first character of the token
321     *
322     * @return  the position after the last character of the token.
323     *          The behavior is undefined if <code>from</code> does not
324     *          point to a token character in the current header value.
325     */
326    protected int findTokenEnd(int from) {
327        if (from < 0) {
328            throw new IllegalArgumentException
329                ("Token start position must not be negative: " + from);
330        }
331
332        final int to = this.currentHeader.length();
333        int end = from+1;
334        while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
335            end++;
336        }
337
338        return end;
339    }
340
341
342    /**
343     * Checks whether a character is a token separator.
344     * RFC 2616, section 2.1 defines comma as the separator for
345     * <tt>#token</tt> sequences. The end of a header value will
346     * also separate tokens, but that is not a character check.
347     *
348     * @param ch        the character to check
349     *
350     * @return  <code>true</code> if the character is a token separator,
351     *          <code>false</code> otherwise
352     */
353    protected boolean isTokenSeparator(char ch) {
354        return (ch == ',');
355    }
356
357
358    /**
359     * Checks whether a character is a whitespace character.
360     * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
361     * The optional preceeding line break is irrelevant, since header
362     * continuation is handled transparently when parsing messages.
363     *
364     * @param ch        the character to check
365     *
366     * @return  <code>true</code> if the character is whitespace,
367     *          <code>false</code> otherwise
368     */
369    protected boolean isWhitespace(char ch) {
370
371        // we do not use Character.isWhitspace(ch) here, since that allows
372        // many control characters which are not whitespace as per RFC 2616
373        return ((ch == '\t') || Character.isSpaceChar(ch));
374    }
375
376
377    /**
378     * Checks whether a character is a valid token character.
379     * Whitespace, control characters, and HTTP separators are not
380     * valid token characters. The HTTP specification (RFC 2616, section 2.2)
381     * defines tokens only for the US-ASCII character set, this
382     * method extends the definition to other character sets.
383     *
384     * @param ch        the character to check
385     *
386     * @return  <code>true</code> if the character is a valid token start,
387     *          <code>false</code> otherwise
388     */
389    protected boolean isTokenChar(char ch) {
390
391        // common sense extension of ALPHA + DIGIT
392        if (Character.isLetterOrDigit(ch))
393            return true;
394
395        // common sense extension of CTL
396        if (Character.isISOControl(ch))
397            return false;
398
399        // no common sense extension for this
400        if (isHttpSeparator(ch))
401            return false;
402
403        // RFC 2616, section 2.2 defines a token character as
404        // "any CHAR except CTLs or separators". The controls
405        // and separators are included in the checks above.
406        // This will yield unexpected results for Unicode format characters.
407        // If that is a problem, overwrite isHttpSeparator(char) to filter
408        // out the false positives.
409        return true;
410    }
411
412
413    /**
414     * Checks whether a character is an HTTP separator.
415     * The implementation in this class checks only for the HTTP separators
416     * defined in RFC 2616, section 2.2. If you need to detect other
417     * separators beyond the US-ASCII character set, override this method.
418     *
419     * @param ch        the character to check
420     *
421     * @return  <code>true</code> if the character is an HTTP separator
422     */
423    protected boolean isHttpSeparator(char ch) {
424        return (HTTP_SEPARATORS.indexOf(ch) >= 0);
425    }
426
427
428} // class BasicTokenIterator
429
430