CssTokens.java revision b600c3cd7edfb02d79c264fd83b1306e94053b7e
1// Copyright (c) 2013, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import java.util.ArrayList;
32import java.util.Arrays;
33import java.util.Iterator;
34import java.util.List;
35import java.util.NoSuchElementException;
36
37import javax.annotation.Nullable;
38
39import com.google.common.collect.ImmutableMap;
40
41/**
42 * Given a string of CSS, produces a string of normalized CSS with certain
43 * useful properties detailed below.
44 * <ul>
45 *   <li>All runs of white-space and comment tokens (including CDO and CDC)
46 *     have been replaced with a single space character.</li>
47 *   <li>All strings are quoted and escapes are escaped according to the
48 *     following scheme:
49 *     <table>
50 *       <tr><td>NUL</td>            <td><code>\0</code></tr>
51 *       <tr><td>line feed</td>      <td><code>\a</code></tr>
52 *       <tr><td>vertical feed</td>  <td><code>\c</code></tr>
53 *       <tr><td>carriage return</td><td><code>\d</code></tr>
54 *       <tr><td>double quote</td>   <td><code>\22</code></tr>
55 *       <tr><td>ampersand &amp;</td><td><code>\26</code></tr>
56 *       <tr><td>single quote</td>   <td><code>\27</code></tr>
57 *       <tr><td>left-angle &lt;</td><td><code>\3c</code></tr>
58 *       <tr><td>rt-angle &gt;</td>  <td><code>\3e</code></tr>
59 *       <tr><td>back slash</td>     <td><code>\\</code></tr>
60 *       <tr><td>dash</td>           <td><code>-</code> or <code>\-</code></tr>
61 *       <tr><td>all others</td>     <td>raw</td></tr>
62 *     </table>
63 *     The dash may be encoded as necessary to prevent the valid CSS,
64 *     <pre>.foo-\2d&gt;span { color:red }</pre>
65 *     which specifies the selector for the {@code <span>} children of elements
66 *     with class ".foo--" from being rendered to include an ignorable
67 *     {@code -->} token.
68 *   </li>
69 *   <li>All <code>url(&hellip;)</code> tokens are quoted.
70 *   <li>All keywords, identifiers, and hex literals are lower-case and have
71 *       embedded escape sequences decoded.</li>
72 *   <li>All brackets nest properly.</li>
73 *   <li>Does not contain the sequence {@code <!--} or {@code -->}.</li>
74 *   <li>All delimiters that can start longer tokens are followed by a space.
75 * </ul>
76 */
77final class CssTokens implements Iterable<String> {
78
79static final boolean DEBUG = false;
80
81  public final String normalizedCss;
82  public final Brackets brackets;
83  private final int[] tokenBreaks;
84  private final TokenType[] tokenTypes;
85
86  public TokenIterator start() {
87    return new TokenIterator(tokenTypes.length);
88  }
89
90  public TokenIterator iterator() { return start(); }
91
92  public static CssTokens lex(String css) {
93    Lexer lexer = new Lexer(css);
94    lexer.lex();
95    return lexer.build();
96  }
97
98  /** A cursor into a list of tokens. */
99  public final class TokenIterator implements Iterator<String> {
100    private int tokenIndex = 0;
101    private final int limit;
102
103    TokenIterator(int limit) {
104      this.limit = limit;
105    }
106
107    public boolean hasNext() {
108      return hasToken();
109    }
110
111    public String next() {
112      String token = token();
113      advance();
114      return token;
115    }
116
117    public @Nullable TokenIterator spliceToEnd() {
118      if (!hasNext()) { throw new NoSuchElementException(); }
119      int end = brackets.partner(tokenIndex);
120      if (end < 0) {
121        return null;
122      }
123      TokenIterator between = new TokenIterator(end);
124      between.tokenIndex = tokenIndex + 1;
125      tokenIndex = end + 1;
126      return between;
127    }
128
129    public int tokenIndex() {
130      return tokenIndex;
131    }
132
133    public int startOffset() {
134      return tokenBreaks[tokenIndex];
135    }
136
137    public int endOffset() {
138      return tokenBreaks[tokenIndex+1];
139    }
140
141    public String token() {
142      return normalizedCss.substring(startOffset(), endOffset());
143    }
144
145    public boolean hasToken() {
146      return tokenIndex < limit;
147    }
148
149    public boolean hasTokenAfterSpace() {
150      while (hasToken()) {
151        if (type() != TokenType.WHITESPACE) { return true; }
152        advance();
153      }
154      return false;
155    }
156
157    /** The type of the current token. */
158    public TokenType type() {
159      return tokenTypes[tokenIndex];
160    }
161
162    public void seek(int tokenIndex) {
163      this.tokenIndex = tokenIndex;
164    }
165
166    public void advance() {
167      if (!hasToken()) { throw new NoSuchElementException(); }
168      ++tokenIndex;
169    }
170
171    public void remove() throws UnsupportedOperationException {
172      throw new UnsupportedOperationException();
173    }
174  }
175
176  private CssTokens(
177      String normalizedCss, Brackets brackets, int[] tokenBreaks,
178      TokenType[] tokenTypes) {
179    this.normalizedCss = normalizedCss;
180    this.brackets = brackets;
181    this.tokenBreaks = tokenBreaks;
182    this.tokenTypes = tokenTypes;
183  }
184
185  public enum TokenType {
186    /** An identifier. */
187    IDENT,
188    /** An identifier prefixed with a period. */
189    DOT_IDENT,
190    /** A function name and opening bracket. */
191    FUNCTION,
192    /** An {@code @<identifier>} directive token. */
193    AT,
194    /** A hash token that contains non-hex characters. */
195    HASH_ID,
196    /** A hash token that could be a color literal. */
197    HASH_UNRESTRICTED,
198    /** A quoted string. */
199    STRING,
200    /** A URL of the form <code>url("...")</code>. */
201    URL,
202    /** A single character. */
203    DELIM,
204    /** A scalar numeric value. */
205    NUMBER,
206    /** A percentage. */
207    PERCENTAGE,
208    /** A numeric value with a unit suffix. */
209    DIMENSION,
210    /** {@code U+<hex-or-qmark>} */
211    UNICODE_RANGE,
212    /**
213     * include-match, dash-match, prefix-match, suffix-match, substring-match
214     */
215    MATCH,
216    /** {@code ||} */
217    COLUMN,
218    /** A run of white-space, comment, CDO, and CDC tokens. */
219    WHITESPACE,
220    /** {@code :} */
221    COLON,
222    /** {@code ;} */
223    SEMICOLON,
224    /** {@code ,} */
225    COMMA,
226    /** {@code [} */
227    LEFT_SQUARE,
228    /** {@code ]} */
229    RIGHT_SQUARE,
230    /** {@code (} */
231    LEFT_PAREN,
232    /** {@code )} */
233    RIGHT_PAREN,
234    /** <code>{</code> */
235    LEFT_CURLY,
236    /** <code>}</code> */
237    RIGHT_CURLY,
238    ;
239  }
240
241  /**
242   * Maps tokens to their partners.  A close bracket token like {@code (} may
243   * have a partner token like {@code )} if properly nested, and vice-versa.
244   */
245  static final class Brackets {
246    /**
247     * For each token index, the index of the indexed token's partner or -1 if
248     * it has none.
249     */
250    private final int[] brackets;
251
252    private Brackets(int[] brackets) {
253      this.brackets = brackets;
254    }
255
256    /** The index of the partner token or -1 if none. */
257    int partner(int tokenIndex) {
258if (DEBUG) System.err.println("index=" + tokenIndex + ", brackets=" + Arrays.toString(brackets));
259      int bracketIndex = bracketIndexForToken(tokenIndex);
260if (DEBUG) System.err.println("\tbracketIndex=" + bracketIndex);
261      if (bracketIndex < 0) { return -1; }
262      return brackets[(bracketIndex << 1) + 1];
263    }
264
265    int bracketIndexForToken(int target) {
266      // Binary search by leftmost element of pair.
267      int left = 0;
268      int right = brackets.length >> 1;
269      while (left < right) {
270        int mid = (left + (right - left) >> 1);
271        int value = brackets[mid << 1];
272        if (value == target) { return mid; }
273        if (value < target) {
274          left = mid + 1;
275        } else {
276          right = mid;
277        }
278      }
279      return -1;
280    }
281  }
282
283  private static final int[] ZERO_INTS = new int[0];
284
285  private static final TokenType[] ZERO_TYPES = new TokenType[0];
286
287  private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS);
288
289  private static final CssTokens EMPTY = new CssTokens(
290      "", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES);
291
292  /**
293   * Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/
294   */
295  private static final class Lexer {
296    private final String css;
297    private final StringBuilder sb;
298    private int pos = 0;
299    private final int cssLimit;
300
301    private List<TokenType> tokenTypes = null;
302    private int[] tokenBreaks = new int[128];
303    private int tokenBreaksLimit = 0;
304
305    /**
306     * For each bracket, 2 ints: the token index of the bracket, and the token
307     * index of its partner.
308     * The array is sorted by the first int.
309     * The second int is -1 when the bracket has not yet been closed.
310     */
311    private int[] brackets = ZERO_INTS;
312    /**
313     * The number of elements in {@link #brackets} that are valid.
314     * {@code brackets[bracketsLimit:]} is zeroed space that the list can grow
315     * into.
316     */
317    private int bracketsLimit = 0;
318    /**
319     * For each bracket that has not been closed, 2 ints:
320     * its index in {@link #brackets} and the character of its close bracket
321     * as an int.
322     * This is a bracket stack so the array is sorted by the first int.
323     */
324    private int[] open = ZERO_INTS;
325    /**
326     * The number of elements in {@link #open} that are valid.
327     * {@code open[openLimit:]} is garbage space that the stack can grow into.
328     */
329    private int openLimit = 0;
330
331    Lexer(String css) {
332      this.css = css;
333      this.sb = new StringBuilder();
334      this.cssLimit = css.length();
335    }
336
337    TokenType openBracket(char bracketChar) {
338      char close;
339      TokenType type;
340      switch (bracketChar) {
341        case '(': close = ')'; type = TokenType.LEFT_PAREN;  break;
342        case '[': close = ']'; type = TokenType.LEFT_SQUARE; break;
343        case '{': close = '}'; type = TokenType.LEFT_CURLY;  break;
344        default:
345          throw new AssertionError("Invalid open bracket " + bracketChar);
346      }
347      brackets = expandIfNecessary(brackets, bracketsLimit, 2);
348      open = expandIfNecessary(open, openLimit, 2);
349      open[openLimit++] = bracketsLimit;
350      open[openLimit++] = close;
351      brackets[bracketsLimit++] = tokenBreaksLimit;
352      brackets[bracketsLimit++] = -1;
353      sb.append(bracketChar);
354      return type;
355    }
356
357    void closeBracket(char bracketChar) {
358      int openLimitAfterClose = openLimit;
359      do {
360        if (openLimitAfterClose == 0) {
361          // Drop an orphaned close bracket.
362          breakOutput();
363          return;
364        }
365        openLimitAfterClose -= 2;
366      } while (bracketChar != open[openLimitAfterClose + 1]);
367
368      // Make sure we've got space on brackets.
369      int spaceNeeded = openLimit - openLimitAfterClose;
370      brackets = expandIfNecessary(brackets, bracketsLimit, spaceNeeded);
371
372      closeBrackets(openLimitAfterClose);
373    }
374
375    private void closeBrackets(int openLimitAfterClose) {
376      int closeTokenIndex = tokenBreaksLimit;
377      while (openLimit > openLimitAfterClose) {
378        // Pop the stack.
379        int closeBracket = open[--openLimit];
380        int openBracketIndex = open[--openLimit];
381        // Update open bracket to point to its partner.
382        brackets[openBracketIndex + 1] = closeTokenIndex;
383        // Emit the close bracket.
384        brackets[bracketsLimit++] = closeTokenIndex;
385        brackets[bracketsLimit++] = openBracketIndex;
386        sb.appendCodePoint(closeBracket);
387        closeTokenIndex++;
388      }
389    }
390
391    CssTokens build() {
392      // Close any still open brackets.
393      {
394        int startOfCloseBrackets = sb.length();
395        closeBrackets(0);
396        emitMergedTokens(startOfCloseBrackets, sb.length());
397      }
398
399      if (tokenTypes == null) { return EMPTY; }
400      int[] bracketsTrunc = truncateOrShare(brackets, bracketsLimit);
401
402      // Strip any trailing space off, since it may have been inserted by a
403      // breakAfter call anyway.
404      int cssEnd = sb.length();
405      if (cssEnd > 0 && sb.charAt(cssEnd - 1) == ' ') {
406if (DEBUG) System.err.println(
407    "cssEnd=" + cssEnd + ", tokenBreaksLimit=" + tokenBreaksLimit + ", tokenTypes.size=" + tokenTypes.size());
408        --cssEnd;
409        tokenTypes.remove(--tokenBreaksLimit);
410if (DEBUG) System.err.println(
411    "cssEnd'=" + cssEnd + ", tokenBreaksLimit=" + tokenBreaksLimit + ", tokenTypes.size=" + tokenTypes.size());
412      }
413      String normalizedCss = sb.substring(0, cssEnd);
414
415      // Store the last character on the tokenBreaksList to simplify finding the
416      // end of a token.
417      tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
418      tokenBreaks[tokenBreaksLimit++] = normalizedCss.length();
419
420      int[] tokenBreaksTrunc = truncateOrShare(tokenBreaks, tokenBreaksLimit);
421      TokenType[] tokenTypesArr = tokenTypes.toArray(ZERO_TYPES);
422
423if (DEBUG) System.err.println("normalizedCss=" + normalizedCss);
424if (DEBUG) System.err.println("tokenBreaks=" + Arrays.toString(tokenBreaksTrunc));
425if (DEBUG) System.err.println("tokenTypes =" + Arrays.toString(tokenTypesArr));
426
427      return new CssTokens(
428          normalizedCss, new Brackets(bracketsTrunc),
429          tokenBreaksTrunc, tokenTypesArr);
430    }
431
432    void lex() {
433      // Fast-track no content.
434      consumeIgnorable();
435      sb.setLength(0);
436      if (pos == cssLimit) { return; }
437
438      String css = this.css;
439      int cssLimit = this.cssLimit;
440      int lastPos = pos - 1;
441      while (pos < cssLimit) {
442        assert pos > lastPos
443            : "pos=" + pos + ", lastPos=" + lastPos + ", ch=" + css.charAt(pos);
444        lastPos = pos;
445if (DEBUG) System.err.println("Starting at " + pos);
446        // SPEC: 4. Tokenization
447        // The output of the tokenization step is a stream of zero
448        // or more of the following tokens: <ident>, <function>,
449        // <at-keyword>, <hash>, <string>, <bad-string>, <url>,
450        // <bad-url>, <delim>, <number>, <percentage>,
451        // <dimension>, <unicode-range>, <include-match>,
452        // <dash-match>, <prefix-match>, <suffix-match>,
453        // <substring-match>, <column>, <whitespace>, <CDO>,
454        // <CDC>, <colon>, <semicolon>, <comma>, <[>, <]>,
455        // <(>, <)>, <{>, and <}>.
456
457        // IMPLEMENTS: 4.3 Consume a token
458        char ch = css.charAt(pos);
459        int startOfToken = pos;
460        int startOfOutputToken = sb.length();
461        final TokenType type;
462        switch (ch) {
463          case '\t': case '\n': case '\f': case '\r': case ' ':
464            consumeIgnorable();
465            type = TokenType.WHITESPACE;
466            break;
467          case '/': {
468            char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
469            if (lookahead == '/' || lookahead == '*') {
470              consumeIgnorable();
471              type = TokenType.WHITESPACE;
472            } else {
473              consumeDelim(ch);
474              type = TokenType.DELIM;
475            }
476            break;
477          }
478          case '<':
479            if (consumeIgnorable()) {  // <!--
480              type = TokenType.WHITESPACE;
481            } else {
482              consumeDelim('<');
483              type = TokenType.DELIM;
484            }
485            break;
486          case '@':
487            if (consumeAtKeyword()) {
488              type = TokenType.AT;
489            } else {
490              consumeDelim(ch);
491              type = TokenType.DELIM;
492            }
493            break;
494          case '#': {
495            sb.append('#');
496            TokenType hashType = consumeHash();
497            if (hashType != null) {
498              type = hashType;
499            } else {
500              ++pos;
501              type = TokenType.DELIM;
502            }
503            break;
504          }
505          case '"':
506          case '\'':
507            consumeString();
508            type = TokenType.STRING;
509            break;
510          case 'U':
511            // SPEC handle URL under "ident like token".
512            if (consumeUnicodeRange()) {
513              type = TokenType.UNICODE_RANGE;
514            } else {
515              type = consumeIdentOrUrlOrFunction();
516            }
517            break;
518          case '0': case '1': case '2': case '3': case '4':
519          case '5': case '6': case '7': case '8': case '9':
520            type = consumeNumberOrPercentageOrDimension();
521            break;
522          case '+': case '-': case '.': {
523            char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
524if (DEBUG) System.err.println("ch=" + ch + ", lookahead=" + lookahead);
525            if (isDecimal(lookahead)
526                || (lookahead == '.' && pos + 2 < cssLimit
527                    && isDecimal(css.charAt(pos + 2)))) {
528              type = consumeNumberOrPercentageOrDimension();
529            } else if (ch == '+') {
530              consumeDelim(ch);
531              type = TokenType.DELIM;
532            } else if (ch == '-') {
533              if (consumeIgnorable()) {  // -->
534                type = TokenType.WHITESPACE;
535              } else {
536                type = consumeIdentOrUrlOrFunction();
537              }
538            } else if (isIdentPart(lookahead)) {
539              // treat ".<IDENT>" as one token.
540              sb.append('.');
541              ++pos;
542              consumeIdent();
543              type = TokenType.DOT_IDENT;
544            } else {
545              consumeDelim('.');
546              type = TokenType.DELIM;
547            }
548            break;
549          }
550          case ':': consumeDelim(ch); type = TokenType.COLON; break;
551          case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break;
552          case ',': consumeDelim(ch); type = TokenType.COMMA; break;
553          case '[': case '(': case '{':
554            type = openBracket(ch);
555            ++pos;
556            break;
557          case '}': case ')': case ']':
558            closeBracket(ch);
559            ++pos;
560            // Use DELIM so that a later loop will split output into multiple
561            // tokens since we may have inserted missing close brackets for
562            // unclosed open brackets already on the stack.
563            type = TokenType.DELIM;
564            break;
565          case '~': case '|': case '^': case '$': case '*': {
566            char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
567            if (lookahead == '=') {
568              consumeMatch(ch);
569              type = TokenType.MATCH;
570            } else if (ch == '|' && lookahead == '|') {
571              consumeColumn();
572              type = TokenType.COLUMN;
573            } else {
574              consumeDelim(ch);
575              type = TokenType.DELIM;
576            }
577            break;
578          }
579          case '_':
580            type = consumeIdentOrUrlOrFunction();
581            break;
582          case '\\': {
583            // Optimistically parse as an ident.
584            TokenType identType = consumeIdentOrUrlOrFunction();
585            if (identType == null) {
586              ++pos;  // drop
587              breakOutput();
588              type = TokenType.WHITESPACE;
589            } else {
590              type = identType;
591            }
592            // TODO: handle case where "url" is encoded.
593            break;
594          }
595          default:
596            int chlower = ch | 32;
597            if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) {
598              type = consumeIdentOrUrlOrFunction();
599            } else if (ch > 0x20) {
600              consumeDelim(ch);
601              type = TokenType.DELIM;
602            } else {  // Ignore.
603              breakOutput();
604              type = TokenType.WHITESPACE;
605              ++pos;
606            }
607        }
608        assert pos > startOfToken
609            : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken);
610if (DEBUG) System.err.println("found `" + css.substring(startOfToken, pos) + "` -> `" + sb.substring(startOfOutputToken) + "`");
611        int endOfOutputToken = sb.length();
612        if (endOfOutputToken > startOfOutputToken) {
613          if (type == TokenType.DELIM) {
614            emitMergedTokens(startOfOutputToken, endOfOutputToken);
615          } else {
616            emitToken(type, startOfOutputToken);
617          }
618        }
619      }
620    }
621
622    private void emitMergedTokens(int start, int end) {
623      // Handle breakOutput and merging of output tokens.
624      for (int e = start; e < end; ++e) {
625        TokenType delimType;
626        switch (sb.charAt(e)) {
627          case ' ': delimType = TokenType.WHITESPACE;   break;
628          case '}': delimType = TokenType.RIGHT_CURLY;  break;
629          case ')': delimType = TokenType.RIGHT_PAREN;  break;
630          case ']': delimType = TokenType.RIGHT_SQUARE; break;
631          default : delimType = TokenType.DELIM;        break;
632        }
633        emitToken(delimType, e);
634      }
635    }
636
637    private void emitToken(TokenType type, int startOfOutputToken) {
638      if (tokenBreaksLimit == 0
639          || tokenBreaks[tokenBreaksLimit - 1] != startOfOutputToken) {
640        if (tokenTypes == null) { tokenTypes = new ArrayList<TokenType>(); }
641        tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
642        tokenBreaks[tokenBreaksLimit++] = startOfOutputToken;
643        tokenTypes.add(type);
644      }
645    }
646
647    private void consumeDelim(char ch) {
648      sb.append(ch);
649      switch (ch) {
650        case '~': case '|': case '^': case '$': case '\\': case '#':
651        case '.': case '+': case '-': case '@': case '/':
652          sb.append(' ');
653          break;
654        default:
655          break;
656      }
657      ++pos;
658    }
659
660    private boolean consumeIgnorable() {
661      String css = this.css;
662      int cssLimit = this.cssLimit;
663      int posBefore = pos;
664      while (pos < cssLimit) {
665        char ch = css.charAt(pos);
666if (DEBUG) System.err.println("possibly ignorable " + ch + " at " + pos);
667        if (ch <= 0x20) {
668          ++pos;
669        } else if (pos + 1 == cssLimit) {
670          break;
671        } else if (ch == '/') {
672          char next = css.charAt(pos + 1);
673if (DEBUG) System.err.println("Found /, next=" + next);
674          if (next == '*') {
675            pos += 2;
676if (DEBUG) System.err.println("Looking for end of /* at " + pos);
677            while (pos < cssLimit) {
678              int ast = css.indexOf('*', pos);
679if (DEBUG) System.err.println("ast=" + ast);
680              if (ast < 0) {
681                pos = cssLimit;  // Unclosed /* comment */
682                break;
683              } else {
684                // Advance over a run of '*'s.
685                pos = ast + 1;
686                while (pos < cssLimit && css.charAt(pos) == '*') {
687                  ++pos;
688                }
689if (DEBUG) System.err.println("Looking for / at " + pos);
690                if (pos < cssLimit && css.charAt(pos) == '/') {
691                  ++pos;
692                  break;
693                }
694              }
695            }
696if (DEBUG) System.err.println("Finished */ at " + pos);
697          } else if (next == '/') {  // Non-standard but widely supported
698            while (++pos < cssLimit) {
699              if (isLineTerminator(css.charAt(pos))) { break; }
700            }
701          } else {
702            break;
703          }
704        } else if (ch == '<') {
705          if (pos + 3 < cssLimit
706              && '!' == css.charAt(pos + 1)
707              && '-' == css.charAt(pos + 2)
708              && '-' == css.charAt(pos + 3)) {
709            pos += 4;
710          } else {
711            break;
712          }
713        } else if (ch == '-') {
714          if (pos + 2 < cssLimit
715              && '-' == css.charAt(pos + 1)
716              && '>' == css.charAt(pos + 2)) {
717            pos += 3;
718          } else {
719            break;
720          }
721        } else {
722          break;
723        }
724      }
725      if (pos == posBefore) {
726        return false;
727      } else {
728        breakOutput();
729        return true;
730      }
731    }
732
733    private void breakOutput() {
734      int last = sb.length() - 1;
735      if (last < 0 || sb.charAt(last) != ' ') { sb.append(' '); }
736    }
737
738    private void consumeColumn() {
739      pos += 2;
740      sb.append("||");
741    }
742
743    private void consumeMatch(char ch) {
744      pos += 2;
745      sb.append(ch).append('=');
746    }
747
748    private void consumeIdent() {
749      String css = this.css;
750      int cssLimit = this.cssLimit;
751      int last = -1;
752      while (pos < cssLimit) {
753        int posBefore = pos;
754
755        int decoded = css.charAt(pos);
756        if (decoded == '\\') {
757          decoded = consumeAndDecodeEscapeSequence();
758        } else {
759          ++pos;
760        }
761
762        if (isIdentPart(decoded)) {
763          if (decoded >= 0x80) {
764            encodeCharOntoOutput(decoded, last);
765          } else {
766            sb.append((char) decoded);
767          }
768          last = decoded;
769        } else {
770          pos = posBefore;
771          break;
772        }
773      }
774    }
775
776    private boolean consumeAtKeyword() {
777      assert css.charAt(pos) == '@';
778      int bufferLengthBeforeWrite = sb.length();
779      sb.append('@');
780      int posBeforeKeyword = ++pos;
781      consumeIdent();
782      if (pos == posBeforeKeyword) {
783        --pos;  // back up over '@'
784        sb.setLength(bufferLengthBeforeWrite);  // Unwrite the '@'
785        return false;
786      } else {
787        return true;
788      }
789    }
790
791
792    private int consumeAndDecodeEscapeSequence() {
793      String css = this.css;
794      int cssLimit = this.cssLimit;
795      assert css.charAt(pos) == '\\';
796      if (pos + 1 >= cssLimit) { return -1; }
797      char esc = css.charAt(pos + 1);
798      if (isLineTerminator(esc)) { return -1; }
799      int escLower = esc | 32;
800      if (('0' <= esc && esc <= '9')
801          || ('a' <= escLower && escLower <= 'f')) {
802        int hexValue = 0;
803        int hexStart = pos + 1;
804        int hexLimit = Math.min(pos + 7, cssLimit);
805        int hexEnd = hexStart;
806        do {
807          hexValue = (hexValue << 4)
808              | (esc <= '9' ? esc - '0' : escLower - ('a' - 10));
809          ++hexEnd;
810          if (hexEnd == hexLimit) { break; }
811          esc = css.charAt(hexEnd);
812          escLower = esc | 32;
813        } while (('0' <= esc && esc <= '9')
814                 || ('a' <= escLower && escLower <= 'f'));
815        if (hexValue > Character.MAX_CODE_POINT) {
816          return -1;
817        }
818        pos = hexEnd;
819        if (pos + 1 < cssLimit) {
820          // A sequence of hex digits can be followed by a space that allows
821          // so that code-point U+A followed by the letter 'b' can be rendered
822          // as "\a b" since "\ab" specifies the single code-point U+AB.
823          char next = css.charAt(pos + 1);
824          if (next == ' ' || isLineTerminator(next)) {
825            ++pos;
826          }
827        }
828        return hexValue;
829      }
830      pos += 2;
831      return esc;
832    }
833
834    private static final long HEX_ENCODED_BITMASK =
835        (1L << 0) | LINE_TERMINATOR_BITMASK
836        | (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>');
837    private static boolean isHexEncoded(int codepoint) {
838      return 0 <= codepoint && codepoint < 63
839          && 0 != ((1L << codepoint) & HEX_ENCODED_BITMASK);
840    }
841
842    private void encodeCharOntoOutput(int codepoint, int last) {
843if (DEBUG) System.err.println("codepoint=" + Integer.toString(codepoint, 16) + ", last=" + Integer.toString(last, 16));
844      switch (codepoint) {
845        case '\\': sb.append("\\\\"); break;
846        case '\0': sb.append("\\0");  break;
847        case '\n': sb.append("\\a");  break;
848        case '\f': sb.append("\\c");  break;
849        case '\r': sb.append("\\d");  break;
850        case '\"': sb.append("\\22"); break;
851        case '&':  sb.append("\\26"); break;
852        case '\'': sb.append("\\27"); break;
853        case '<':  sb.append("\\3c"); break;
854        case '>':  sb.append("\\3e"); break;
855        // The set of escapes above that end with a hex digit must appear in
856        // HEX_ENCODED_BITMASK.
857        case '-':
858          if (last == '-') {
859            sb.append('\\');
860          }
861          sb.append('-');
862          break;
863        // TODO: maybe encode a decimal digit after a '-'.
864        default:
865          if (isHexEncoded(last)
866              // We need to put a space after a trailing hex digit if the
867              // next encoded character on the output would be another hex
868              // digit or a space character.  The other space characters
869              // are handled above.
870              && (codepoint == ' '
871                  || ('0' <= codepoint && codepoint <= '9')
872                  || ('a' <= (codepoint | 32) && (codepoint | 32) <= 'f'))) {
873if (DEBUG) System.err.println("last '" + new StringBuilder().appendCodePoint(last) + "' is hex-encoded");
874            sb.append(' ');
875          }
876          sb.appendCodePoint(codepoint);
877      }
878    }
879
880    private TokenType consumeNumberOrPercentageOrDimension() {
881if (DEBUG) System.err.println("Consuming number");
882      String css = this.css;
883      int cssLimit = this.cssLimit;
884      boolean isZero = true;
885      int intStart = pos;
886      if (intStart < cssLimit) {
887        char ch = css.charAt(intStart);
888        if (ch == '-' || ch == '+') {
889          ++intStart;
890        }
891      }
892if (DEBUG) System.err.println("\tintStart=" + intStart);
893      // Find the integer part after any sign.
894      int intEnd = intStart;
895      for (; intEnd < cssLimit; ++intEnd) {
896        char ch = css.charAt(intEnd);
897        if (!('0' <= ch && ch <= '9')) { break; }
898        if (ch != '0') { isZero = false; }
899      }
900if (DEBUG) System.err.println("\tintEnd=" + intEnd);
901      // Find a fraction like ".5" or ".".
902      int fractionStart = intEnd;
903      int fractionEnd = fractionStart;
904      if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) {
905        ++fractionEnd;
906        for (; fractionEnd < cssLimit; ++fractionEnd) {
907          char ch = css.charAt(fractionEnd);
908          if (!('0' <= ch && ch <= '9')) { break; }
909          if (ch != '0') { isZero = false; }
910        }
911      }
912if (DEBUG) System.err.println("\tfractionStart=" + fractionStart);
913if (DEBUG) System.err.println("\tfractionEnd=" + fractionEnd);
914      int exponentStart = fractionEnd;
915      int exponentIntStart = exponentStart;
916      int exponentEnd = exponentStart;
917      boolean isExponentZero = true;
918      if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) {
919        // 'e' and 'e' in "5e-f" for a
920        exponentEnd = exponentStart + 1;
921        if (exponentEnd < cssLimit) {
922          char ch = css.charAt(exponentEnd);
923          if (ch == '+' || ch == '-') { ++exponentEnd; }
924        }
925        exponentIntStart = exponentEnd;
926        for (; exponentEnd < cssLimit; ++exponentEnd) {
927          char ch = css.charAt(exponentEnd);
928          if (!('0' <= ch && ch <= '9')) { break; }
929          if (ch != '0') { isExponentZero = false; }
930        }
931        // Since
932        //    dimension := <number> <ident>
933        // the below are technically valid dimensions even though they appear
934        // to have incomplete exponents:
935        //    5e
936        //    5ex
937        //    5e-
938        if (exponentEnd == exponentIntStart) {  // Incomplete exponent.
939          exponentIntStart = exponentEnd = exponentStart;
940          isExponentZero = true;
941        }
942      }
943if (DEBUG) System.err.println("\texponentStart=" + exponentStart);
944if (DEBUG) System.err.println("\texponentIntStart=" + exponentIntStart);
945if (DEBUG) System.err.println("\texponentEnd=" + exponentEnd);
946
947      int unitStart = exponentEnd;
948      // Skip over space between number and unit.
949      // Many user-agents allow "5 ex" instead of "5ex".
950      while (unitStart < cssLimit) {
951        char ch = css.charAt(unitStart);
952        if (ch == ' ' || isLineTerminator(ch)) {
953          ++unitStart;
954        } else {
955          break;
956        }
957      }
958
959int sbStart = sb.length();
960if (DEBUG) System.err.println("\tCP0 : " + sb.substring(sbStart));
961
962      // Normalize the number onto the buffer.
963      // We will normalize and unit later.
964      // Skip the sign if it is positive.
965      if (intStart != pos && '-' == css.charAt(pos) && !isZero) {
966        sb.append('-');
967      }
968      if (isZero) {
969        sb.append('0');
970      } else {
971        // Strip leading zeroes from the integer and exponent and trailing
972        // zeroes from the fraction.
973        while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; }
974if (DEBUG) System.err.println("\tintStart <- " + intStart);
975        while (fractionEnd > fractionStart
976               && css.charAt(fractionEnd - 1) == '0') {
977          --fractionEnd;
978        }
979if (DEBUG) System.err.println("\tfractionEnd <- " + fractionEnd);
980        if (intStart == intEnd) {
981          sb.append('0');  // .5 -> 0.5
982        } else {
983          sb.append(css, intStart, intEnd);
984        }
985        if (fractionEnd > fractionStart + 1) {  // 5. -> 5; 5.0 -> 5
986          sb.append(css, fractionStart, fractionEnd);
987        }
988if (DEBUG) System.err.println("\tCP1 : " + sb.substring(sbStart) + " ; sbStart=" + sbStart);
989        if (!isExponentZero) {
990          sb.append('e');
991          // 1e+1 -> 1e1
992          if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); }
993          while (exponentIntStart < exponentEnd
994                 && css.charAt(exponentIntStart) == '0') {
995            ++exponentIntStart;
996          }
997if (DEBUG) System.err.println("\texponentIntStart <- " + exponentIntStart);
998          sb.append(css, exponentIntStart, exponentEnd);
999if (DEBUG) System.err.println("\tCP2 : " + sb.substring(sbStart));
1000        }
1001      }
1002if (DEBUG) System.err.println("\tCP3 : " + sb.substring(sbStart));
1003
1004      int unitEnd;
1005      TokenType type;
1006      if (unitStart < cssLimit && '%' == css.charAt(unitStart)) {
1007        unitEnd = unitStart + 1;
1008        type = TokenType.PERCENTAGE;
1009        sb.append('%');
1010      } else {
1011        // The grammar says that any identifier following a number is a unit.
1012        int bufferBeforeUnit = sb.length();
1013        pos = unitStart;
1014        consumeIdent();
1015        int bufferAfterUnit = sb.length();
1016        if (unitStart == exponentEnd  // No intervening space
1017            || isWellKnownUnit(sb, bufferBeforeUnit, bufferAfterUnit)) {
1018          unitEnd = pos;
1019          // 3IN -> 3in
1020          for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) {
1021            char ch = sb.charAt(i);
1022            if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); }
1023          }
1024        } else {
1025          unitEnd = unitStart = exponentEnd;
1026          sb.setLength(bufferBeforeUnit);
1027        }
1028        type = unitStart == unitEnd ? TokenType.NUMBER : TokenType.DIMENSION;
1029      }
1030      pos = unitEnd;
1031if (DEBUG) System.err.println("\tunitStart=" + unitStart);
1032if (DEBUG) System.err.println("\tunitEnd=" + unitEnd);
1033if (DEBUG) System.err.println("\tCP4 : " + sb.substring(sbStart));
1034      return type;
1035    }
1036
1037    private void consumeString() {
1038      String css = this.css;
1039      int cssLimit = this.cssLimit;
1040
1041      char delim = css.charAt(pos);
1042      ++pos;
1043      sb.append('\'');
1044      int last = -1;
1045      while (pos < cssLimit) {
1046        char ch = css.charAt(pos);
1047        if (ch == delim) {
1048          ++pos;
1049          break;
1050        }
1051        if (isLineTerminator(ch)) { break; } // bad-string
1052        int decoded = ch;
1053        if (ch == '\\') {
1054          decoded = consumeAndDecodeEscapeSequence();
1055        } else {
1056          ++pos;
1057        }
1058        encodeCharOntoOutput(decoded, last);
1059        last = decoded;
1060      }
1061      sb.append('\'');
1062    }
1063
1064    private @Nullable TokenType consumeHash() {
1065      assert css.charAt(pos) == '#';
1066      ++pos;
1067      int beforeIdent = pos;
1068      consumeIdent();
1069      if (pos == beforeIdent) {
1070        pos = beforeIdent - 1;
1071        return null;
1072      }
1073      for (int i = beforeIdent; i < pos; ++i) {
1074        char ch = css.charAt(i);
1075        if (!(('0' <= ch && ch <= '9')
1076              || ('a' <= (ch | 32) && (ch | 32) <= 'f'))) {
1077          return TokenType.HASH_ID;
1078        }
1079      }
1080      return TokenType.HASH_UNRESTRICTED;
1081    }
1082
1083    private boolean consumeUnicodeRange() {
1084      String css = this.css;
1085      int cssLimit = this.cssLimit;
1086
1087      int start = pos;
1088      assert (css.charAt(pos) | 32) == 'u';
1089      ++pos;
1090      if (pos == cssLimit && css.charAt(pos) != '+') {
1091        pos = start;
1092        return false;
1093      }
1094      int numStartDigits = 0;
1095      while (pos < cssLimit && numStartDigits < 6) {
1096        char chLower = css.charAt(pos);
1097        if (('0' <= chLower && chLower <= '9')
1098            || ('a' <= chLower && chLower <= 'f')) {
1099          ++numStartDigits;
1100          ++pos;
1101        } else {
1102          break;
1103        }
1104      }
1105      boolean hasQmark = false;
1106      while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') {
1107        hasQmark = true;
1108        ++numStartDigits;
1109        ++pos;
1110      }
1111      if (numStartDigits == 0) {
1112        pos = start;
1113        return false;
1114      }
1115      if (!hasQmark && pos < cssLimit && css.charAt(pos) == '-') {
1116        ++pos;
1117        int numEndDigits = 0;
1118        while (pos < cssLimit && numEndDigits < 6) {
1119          char chLower = css.charAt(pos);
1120          if (('0' <= chLower && chLower <= '9')
1121              || ('a' <= chLower && chLower <= 'f')) {
1122            ++numEndDigits;
1123            ++pos;
1124          } else {
1125            break;
1126          }
1127        }
1128        if (numEndDigits == 0) { --pos; }  // Back up over '-'
1129      }
1130      return true;
1131    }
1132
1133    private @Nullable TokenType consumeIdentOrUrlOrFunction() {
1134      int bufferStart = sb.length();
1135      consumeIdent();
1136      boolean parenAfter = pos < cssLimit && css.charAt(pos) == '(';
1137      if (sb.length() - bufferStart == 3
1138          && 'u' == (sb.charAt(bufferStart) | 32)
1139          && 'r' == (sb.charAt(bufferStart + 1) | 32)
1140          && 'l' == (sb.charAt(bufferStart + 2) | 32)) {
1141        if (parenAfter && consumeUrlValue()) {
1142          sb.setCharAt(bufferStart, 'u');
1143          sb.setCharAt(bufferStart + 1, 'r');
1144          sb.setCharAt(bufferStart + 2, 'l');
1145          return TokenType.URL;
1146        } else {
1147          sb.setLength(bufferStart);
1148          breakOutput();
1149          return TokenType.WHITESPACE;
1150        }
1151      } else if (parenAfter) {
1152        openBracket('(');
1153        ++pos;
1154        return TokenType.FUNCTION;
1155      } else {
1156        return TokenType.IDENT;
1157      }
1158    }
1159
1160    private boolean consumeUrlValue() {
1161      String css = this.css;
1162      int cssLimit = this.cssLimit;
1163      if (pos == cssLimit || css.charAt(pos) != '(') { return false; }
1164      ++pos;
1165      // skip space.
1166      for (; pos < cssLimit; ++pos) {
1167        char ch = css.charAt(pos);
1168        if (ch != ' ' && !isLineTerminator(ch)) { break; }
1169      }
1170      // Find the value.
1171      int delim;
1172      if (pos < cssLimit) {
1173        char ch = pos < cssLimit ? css.charAt(pos) : '\0';
1174        if (ch == '"' || ch == '\'') {
1175          delim = ch;
1176          ++pos;
1177        } else {
1178          delim = '\0';
1179        }
1180      } else {
1181        return false;
1182      }
1183      sb.append("('");
1184      while (pos < cssLimit) {
1185        int decoded = css.charAt(pos);
1186        if (delim != 0) {
1187          if (decoded == delim) { break; }
1188        } else if (decoded <= ' ' || decoded == ')') {
1189          break;
1190        }
1191        if (decoded == '\\') {
1192          decoded = consumeAndDecodeEscapeSequence();
1193        } else {
1194          ++pos;
1195        }
1196        // Any character not in the RFC 3986 safe set is %-encoded.
1197        if (decoded > 0x80 || URL_SAFE[decoded]) {
1198          sb.appendCodePoint(decoded);
1199        } else {
1200          sb.append('%')
1201            .append(HEX_DIGITS[(decoded >>> 4) & 0xf])
1202            .append(HEX_DIGITS[(decoded >>> 0) & 0xf]);
1203        }
1204      }
1205
1206      // skip space.
1207      for (; pos < cssLimit; ++pos) {
1208        char ch = css.charAt(pos);
1209        if (ch != ' ' && !isLineTerminator(ch)) { break; }
1210      }
1211      if (pos < cssLimit && css.charAt(pos) == ')') {
1212        ++pos;
1213      } else {
1214        // broken-url
1215      }
1216      sb.append("')");
1217      return true;
1218    }
1219  }
1220
1221  private static final boolean isIdentPart(int cp) {
1222    return cp >= 0x80 || IDENT_PART_ASCII[cp];
1223  }
1224
1225  private static final boolean isDecimal(char ch) {
1226    return '0' <= ch && ch <= '9';
1227  }
1228
1229  private static final boolean[] IDENT_PART_ASCII = new boolean[128];
1230  static {
1231    for (int i = '0'; i <= '9'; ++i) { IDENT_PART_ASCII[i] = true; }
1232    for (int i = 'A'; i <= 'Z'; ++i) { IDENT_PART_ASCII[i] = true; }
1233    for (int i = 'a'; i <= 'z'; ++i) { IDENT_PART_ASCII[i] = true; }
1234    IDENT_PART_ASCII['_'] = true;
1235    IDENT_PART_ASCII['-'] = true;
1236  }
1237
1238  private static final int LINE_TERMINATOR_BITMASK =
1239      (1 << '\n') | (1 << '\r') | (1 << '\f');
1240
1241  private static boolean isLineTerminator(char ch) {
1242    return ch < 0x20 && 0 != (LINE_TERMINATOR_BITMASK & (1 << ch));
1243  }
1244
1245  private static int[] expandIfNecessary(int[] arr, int limit, int needed) {
1246    int neededLength = limit + needed;
1247    int length = arr.length;
1248if (DEBUG) System.err.println("limit=" + limit + ", needed=" + needed + ", length=" + length);
1249    if (length >= neededLength) { return arr; }
1250    int[] newArr = new int[Math.max(16, Math.max(neededLength, length * 2))];
1251    System.arraycopy(arr, 0, newArr, 0, limit);
1252if (DEBUG) System.err.println("\texpanded to " + newArr.length);
1253    return newArr;
1254  }
1255
1256  private static int[] truncateOrShare(int[] arr, int limit) {
1257    if (limit == 0) { return ZERO_INTS; }
1258    if (limit == arr.length) {
1259      return arr;
1260    }
1261    int[] trunc = new int[limit];
1262    System.arraycopy(arr, 0, trunc, 0, limit);
1263    return trunc;
1264  }
1265
1266  private static final int LENGTH_UNIT_TYPE = 0;
1267  private static final int ANGLE_UNIT_TYPE = 1;
1268  private static final int TIME_UNIT_TYPE = 2;
1269  private static final int FREQUENCY_UNIT_TYPE = 3;
1270  private static final int RESOLUTION_UNIT_TYPE = 4;
1271
1272  /**
1273   * See http://dev.w3.org/csswg/css-values/#lengths and
1274   *     http://dev.w3.org/csswg/css-values/#other-units
1275   */
1276  private static final Trie UNIT_TRIE = new Trie(
1277      ImmutableMap.<String, Integer>builder()
1278        .put("em", LENGTH_UNIT_TYPE)
1279        .put("ex", LENGTH_UNIT_TYPE)
1280        .put("ch", LENGTH_UNIT_TYPE)  // Width of zero character
1281        .put("rem", LENGTH_UNIT_TYPE)  // Root element font-size
1282        .put("vh", LENGTH_UNIT_TYPE)
1283        .put("vw", LENGTH_UNIT_TYPE)
1284        .put("vmin", LENGTH_UNIT_TYPE)
1285        .put("vmax", LENGTH_UNIT_TYPE)
1286        .put("px", LENGTH_UNIT_TYPE)
1287        .put("mm", LENGTH_UNIT_TYPE)
1288        .put("cm", LENGTH_UNIT_TYPE)
1289        .put("in", LENGTH_UNIT_TYPE)
1290        .put("pt", LENGTH_UNIT_TYPE)
1291        .put("pc", LENGTH_UNIT_TYPE)
1292        .put("deg", ANGLE_UNIT_TYPE)
1293        .put("rad", ANGLE_UNIT_TYPE)
1294        .put("grad", ANGLE_UNIT_TYPE)
1295        .put("turn", ANGLE_UNIT_TYPE)
1296        .put("s", TIME_UNIT_TYPE)
1297        .put("ms", TIME_UNIT_TYPE)
1298        .put("hz", FREQUENCY_UNIT_TYPE)
1299        .put("khz", FREQUENCY_UNIT_TYPE)
1300        .put("dpi", RESOLUTION_UNIT_TYPE)
1301        .put("dpcm", RESOLUTION_UNIT_TYPE)
1302        .put("dppx", RESOLUTION_UNIT_TYPE)
1303        .build());
1304
1305  private static boolean isWellKnownUnit(StringBuilder sb, int start, int end) {
1306    if (start == end) { return false; }
1307    Trie t = UNIT_TRIE;
1308    for (int i = start; i < end; ++i) {
1309      char ch = sb.charAt(i);
1310      t = t.lookup('A' <= ch && ch <= 'Z' ? (char) (ch | 32) : ch);
1311      if (t == null) { return false; }
1312    }
1313    return t.isTerminal();
1314  }
1315
1316
1317  private static final boolean[] URL_SAFE = new boolean[128];
1318  static {
1319    // From RFC 3986
1320    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
1321    for (int i = 'A'; i <= 'Z'; ++i) { URL_SAFE[i] = true; }
1322    for (int i = 'a'; i <= 'z'; ++i) { URL_SAFE[i] = true; }
1323    for (int i = '0'; i <= '9'; ++i) { URL_SAFE[i] = true; }
1324    URL_SAFE['-'] = true;
1325    URL_SAFE['.'] = true;
1326    URL_SAFE['_'] = true;
1327    URL_SAFE['~'] = true;
1328    // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1329    URL_SAFE[':'] = true;
1330    URL_SAFE['/'] = true;
1331    URL_SAFE['?'] = true;
1332    URL_SAFE['#'] = true;
1333    URL_SAFE['['] = true;
1334    URL_SAFE[']'] = true;
1335    URL_SAFE['@'] = true;
1336    // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
1337    //             / "*" / "+" / "," / ";" / "="
1338    URL_SAFE['!'] = true;
1339    URL_SAFE['$'] = true;
1340    URL_SAFE['&'] = true;
1341    // Only used in obsolete mark rule and special in unquoted URLs or comment
1342    // delimiters.
1343    // URL_SAFE['\''] = true;
1344    // URL_SAFE['('] = true;
1345    // URL_SAFE[')'] = true;
1346    // URL_SAFE['*'] = true;
1347    URL_SAFE['+'] = true;
1348    URL_SAFE[','] = true;
1349    URL_SAFE[';'] = true;
1350    URL_SAFE['='] = true;
1351  }
1352
1353  private static final char[] HEX_DIGITS = {
1354    '0', '1', '2', '3',
1355    '4', '5', '6', '7',
1356    '8', '9', 'a', 'b',
1357    'c', 'd', 'e', 'f'
1358  };
1359}
1360