1// Copyright (c) 2013, Mike Samuel
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions
6// are met:
7//
8// Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// Redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution.
13// Neither the name of the OWASP nor the names of its contributors may
14// be used to endorse or promote products derived from this software
15// without specific prior written permission.
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28
29package org.owasp.html;
30
31import java.util.ArrayList;
32import java.util.Iterator;
33import java.util.List;
34import java.util.NoSuchElementException;
35
36import javax.annotation.Nullable;
37
38import com.google.common.collect.ImmutableMap;
39
40/**
41 * Given a string of CSS, produces a string of normalized CSS with certain
42 * useful properties detailed below.
43 * <ul>
44 *   <li>All runs of white-space and comment tokens (including CDO and CDC)
45 *     have been replaced with a single space character.</li>
46 *   <li>All strings are quoted and escapes are escaped according to the
47 *     following scheme:
48 *     <table>
49 *       <tr><td>NUL</td>            <td><code>\0</code></tr>
50 *       <tr><td>line feed</td>      <td><code>\a</code></tr>
51 *       <tr><td>vertical feed</td>  <td><code>\c</code></tr>
52 *       <tr><td>carriage return</td><td><code>\d</code></tr>
53 *       <tr><td>double quote</td>   <td><code>\22</code></tr>
54 *       <tr><td>ampersand &amp;</td><td><code>\26</code></tr>
55 *       <tr><td>single quote</td>   <td><code>\27</code></tr>
56 *       <tr><td>left-angle &lt;</td><td><code>\3c</code></tr>
57 *       <tr><td>rt-angle &gt;</td>  <td><code>\3e</code></tr>
58 *       <tr><td>back slash</td>     <td><code>\\</code></tr>
59 *       <tr><td>all others</td>     <td>raw</td></tr>
60 *     </table>
61 *   </li>
62 *   <li>All <code>url(&hellip;)</code> tokens are quoted.
63 *   <li>All keywords, identifiers, and hex literals are lower-case and have
64 *       embedded escape sequences decoded, except that .</li>
65 *   <li>All brackets nest properly.</li>
66 *   <li>Does not contain any case-insensitive variant of the sequences
67 *       {@code <!--}, {@code -->}, {@code <![CDATA[}, {@code ]]>}, or
68 *       {@code </style}.</li>
69 *   <li>All delimiters that can start longer tokens are followed by a space.
70 * </ul>
71 */
72final class CssTokens implements Iterable<String> {
73
74  public final String normalizedCss;
75  public final Brackets brackets;
76  private final int[] tokenBreaks;
77  private final TokenType[] tokenTypes;
78
79  public TokenIterator start() {
80    return new TokenIterator(tokenTypes.length);
81  }
82
83  public TokenIterator iterator() { return start(); }
84
85  public static CssTokens lex(String css) {
86    Lexer lexer = new Lexer(css);
87    lexer.lex();
88    return lexer.build();
89  }
90
91  /** A cursor into a list of tokens. */
92  public final class TokenIterator implements Iterator<String> {
93    private int tokenIndex = 0;
94    private final int limit;
95
96    TokenIterator(int limit) {
97      this.limit = limit;
98    }
99
100    public boolean hasNext() {
101      return hasToken();
102    }
103
104    public String next() {
105      String token = token();
106      advance();
107      return token;
108    }
109
110    public @Nullable TokenIterator spliceToEnd() {
111      if (!hasNext()) { throw new NoSuchElementException(); }
112      int end = brackets.partner(tokenIndex);
113      if (end < 0) {
114        return null;
115      }
116      TokenIterator between = new TokenIterator(end);
117      between.tokenIndex = tokenIndex + 1;
118      tokenIndex = end + 1;
119      return between;
120    }
121
122    public int tokenIndex() {
123      return tokenIndex;
124    }
125
126    public int startOffset() {
127      return tokenBreaks[tokenIndex];
128    }
129
130    public int endOffset() {
131      return tokenBreaks[tokenIndex+1];
132    }
133
134    public String token() {
135      return normalizedCss.substring(startOffset(), endOffset());
136    }
137
138    public boolean hasToken() {
139      return tokenIndex < limit;
140    }
141
142    public boolean hasTokenAfterSpace() {
143      while (hasToken()) {
144        if (type() != TokenType.WHITESPACE) { return true; }
145        advance();
146      }
147      return false;
148    }
149
150    /** The type of the current token. */
151    public TokenType type() {
152      return tokenTypes[tokenIndex];
153    }
154
155    public void seek(int tokenIndex) {
156      this.tokenIndex = tokenIndex;
157    }
158
159    public void advance() {
160      if (!hasToken()) { throw new NoSuchElementException(); }
161      ++tokenIndex;
162    }
163
164    public void backup() {
165      if (tokenIndex == 0) { throw new NoSuchElementException(); }
166      --tokenIndex;
167    }
168
169    public void remove() throws UnsupportedOperationException {
170      throw new UnsupportedOperationException();
171    }
172  }
173
174  private CssTokens(
175      String normalizedCss, Brackets brackets, int[] tokenBreaks,
176      TokenType[] tokenTypes) {
177    this.normalizedCss = normalizedCss;
178    this.brackets = brackets;
179    this.tokenBreaks = tokenBreaks;
180    this.tokenTypes = tokenTypes;
181  }
182
183  public enum TokenType {
184    /** An identifier. */
185    IDENT,
186    /** An identifier prefixed with a period. */
187    DOT_IDENT,
188    /** A function name and opening bracket. */
189    FUNCTION,
190    /** An {@code @<identifier>} directive token. */
191    AT,
192    /** A hash token that contains non-hex characters. */
193    HASH_ID,
194    /** A hash token that could be a color literal. */
195    HASH_UNRESTRICTED,
196    /** A quoted string. */
197    STRING,
198    /** A URL of the form <code>url("...")</code>. */
199    URL,
200    /** A single character. */
201    DELIM,
202    /** A scalar numeric value. */
203    NUMBER,
204    /** A percentage. */
205    PERCENTAGE,
206    /** A numeric value with a unit suffix. */
207    DIMENSION,
208    /** A numeric value with an unknown unit suffix. */
209    BAD_DIMENSION,
210    /** {@code U+<hex-or-qmark>} */
211    UNICODE_RANGE,
212    /**
213     * include-match, dash-match, prefix-match, suffix-match, substring-match
214     */
215    MATCH,
216    /** {@code ||} */
217    COLUMN,
218    /** A run of white-space, comment, CDO, and CDC tokens. */
219    WHITESPACE,
220    /** {@code :} */
221    COLON,
222    /** {@code ;} */
223    SEMICOLON,
224    /** {@code ,} */
225    COMMA,
226    /** {@code [} */
227    LEFT_SQUARE,
228    /** {@code ]} */
229    RIGHT_SQUARE,
230    /** {@code (} */
231    LEFT_PAREN,
232    /** {@code )} */
233    RIGHT_PAREN,
234    /** <code>{</code> */
235    LEFT_CURLY,
236    /** <code>}</code> */
237    RIGHT_CURLY,
238    ;
239  }
240
241  /**
242   * Maps tokens to their partners.  A close bracket token like {@code (} may
243   * have a partner token like {@code )} if properly nested, and vice-versa.
244   */
245  static final class Brackets {
246    /**
247     * For each token index, the index of the indexed token's partner or -1 if
248     * it has none.
249     */
250    private final int[] brackets;
251
252    private Brackets(int[] brackets) {
253      this.brackets = brackets;
254    }
255
256    /** The index of the partner token or -1 if none. */
257    int partner(int tokenIndex) {
258      int bracketIndex = bracketIndexForToken(tokenIndex);
259      if (bracketIndex < 0) { return -1; }
260      return brackets[(bracketIndex << 1) + 1];
261    }
262
263    int bracketIndexForToken(int target) {
264      // Binary search by leftmost element of pair.
265      int left = 0;
266      int right = brackets.length >> 1;
267      while (left < right) {
268        int mid = left + ((right - left) >> 1);
269        int value = brackets[mid << 1];
270        if (value == target) { return mid; }
271        if (value < target) {
272          left = mid + 1;
273        } else {
274          right = mid;
275        }
276      }
277      return -1;
278    }
279  }
280
281  private static final int[] ZERO_INTS = new int[0];
282
283  private static final TokenType[] ZERO_TYPES = new TokenType[0];
284
285  private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS);
286
287  private static final CssTokens EMPTY = new CssTokens(
288      "", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES);
289
290  /**
291   * Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/
292   */
293  private static final class Lexer {
294    private final String css;
295    private final StringBuilder sb;
296    private int pos = 0;
297    private final int cssLimit;
298
299    private List<TokenType> tokenTypes = null;
300    private int[] tokenBreaks = new int[128];
301    private int tokenBreaksLimit = 0;
302
303    /**
304     * For each bracket, 2 ints: the token index of the bracket, and the token
305     * index of its partner.
306     * The array is sorted by the first int.
307     * The second int is -1 when the bracket has not yet been closed.
308     */
309    private int[] brackets = ZERO_INTS;
310    /**
311     * The number of elements in {@link #brackets} that are valid.
312     * {@code brackets[bracketsLimit:]} is zeroed space that the list can grow
313     * into.
314     */
315    private int bracketsLimit = 0;
316    /**
317     * For each bracket that has not been closed, 2 ints:
318     * its index in {@link #brackets} and the character of its close bracket
319     * as an int.
320     * This is a bracket stack so the array is sorted by the first int.
321     */
322    private int[] open = ZERO_INTS;
323    /**
324     * The number of elements in {@link #open} that are valid.
325     * {@code open[openLimit:]} is garbage space that the stack can grow into.
326     */
327    private int openLimit = 0;
328
329    Lexer(String css) {
330      this.css = css;
331      this.sb = new StringBuilder();
332      this.cssLimit = css.length();
333    }
334
335    TokenType openBracket(char bracketChar) {
336      char close;
337      TokenType type;
338      switch (bracketChar) {
339        case '(': close = ')'; type = TokenType.LEFT_PAREN;  break;
340        case '[': close = ']'; type = TokenType.LEFT_SQUARE; break;
341        case '{': close = '}'; type = TokenType.LEFT_CURLY;  break;
342        default:
343          throw new AssertionError("Invalid open bracket " + bracketChar);
344      }
345      brackets = expandIfNecessary(brackets, bracketsLimit, 2);
346      open = expandIfNecessary(open, openLimit, 2);
347      open[openLimit++] = bracketsLimit;
348      open[openLimit++] = close;
349      brackets[bracketsLimit++] = tokenBreaksLimit;
350      brackets[bracketsLimit++] = -1;
351      sb.append(bracketChar);
352      return type;
353    }
354
355    void closeBracket(char bracketChar) {
356      int openLimitAfterClose = openLimit;
357      do {
358        if (openLimitAfterClose == 0) {
359          // Drop an orphaned close bracket.
360          breakOutput();
361          return;
362        }
363        openLimitAfterClose -= 2;
364      } while (bracketChar != open[openLimitAfterClose + 1]);
365      closeBrackets(openLimitAfterClose);
366    }
367
368    private void closeBrackets(int openLimitAfterClose) {
369      // Make sure we've got space on brackets.
370      int spaceNeeded = openLimit - openLimitAfterClose;
371      brackets = expandIfNecessary(brackets, bracketsLimit, spaceNeeded);
372
373      int closeTokenIndex = tokenBreaksLimit;
374      while (openLimit > openLimitAfterClose) {
375        // Pop the stack.
376        int closeBracket = open[--openLimit];
377        int openBracketIndex = open[--openLimit];
378        int openTokenIndex = brackets[openBracketIndex];
379        // Update open bracket to point to its partner.
380        brackets[openBracketIndex + 1] = closeTokenIndex;
381        // Emit the close bracket.
382        brackets[bracketsLimit++] = closeTokenIndex;
383        brackets[bracketsLimit++] = openTokenIndex;
384        sb.appendCodePoint(closeBracket);
385        closeTokenIndex++;
386      }
387    }
388
389    CssTokens build() {
390      // Close any still open brackets.
391      {
392        int startOfCloseBrackets = sb.length();
393        closeBrackets(0);
394        emitMergedTokens(startOfCloseBrackets, sb.length());
395      }
396
397      if (tokenTypes == null) { return EMPTY; }
398      int[] bracketsTrunc = truncateOrShare(brackets, bracketsLimit);
399
400      // Strip any trailing space off, since it may have been inserted by a
401      // breakAfter call anyway.
402      int cssEnd = sb.length();
403      if (cssEnd > 0 && sb.charAt(cssEnd - 1) == ' ') {
404        --cssEnd;
405        tokenTypes.remove(--tokenBreaksLimit);
406      }
407      String normalizedCss = sb.substring(0, cssEnd);
408
409      // Store the last character on the tokenBreaksList to simplify finding the
410      // end of a token.
411      tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
412      tokenBreaks[tokenBreaksLimit++] = normalizedCss.length();
413
414      int[] tokenBreaksTrunc = truncateOrShare(tokenBreaks, tokenBreaksLimit);
415      TokenType[] tokenTypesArr = tokenTypes.toArray(ZERO_TYPES);
416
417      return new CssTokens(
418          normalizedCss, new Brackets(bracketsTrunc),
419          tokenBreaksTrunc, tokenTypesArr);
420    }
421
422    void lex() {
423      // Fast-track no content.
424      consumeIgnorable();
425      sb.setLength(0);
426      if (pos == cssLimit) { return; }
427
428      tokenTypes = new ArrayList<TokenType>();
429
430      String css = this.css;
431      int cssLimit = this.cssLimit;
432      while (pos < cssLimit) {
433        assert this.tokenBreaksLimit == this.tokenTypes.size()
434            : "token and types out of sync at " + tokenBreaksLimit
435            + " in `" + css + "`";
436        // SPEC: 4. Tokenization
437        // The output of the tokenization step is a stream of zero
438        // or more of the following tokens: <ident>, <function>,
439        // <at-keyword>, <hash>, <string>, <bad-string>, <url>,
440        // <bad-url>, <delim>, <number>, <percentage>,
441        // <dimension>, <unicode-range>, <include-match>,
442        // <dash-match>, <prefix-match>, <suffix-match>,
443        // <substring-match>, <column>, <whitespace>, <CDO>,
444        // <CDC>, <colon>, <semicolon>, <comma>, <[>, <]>,
445        // <(>, <)>, <{>, and <}>.
446
447        // IMPLEMENTS: 4.3 Consume a token
448        char ch = css.charAt(pos);
449        int startOfToken = pos;
450        int startOfOutputToken = sb.length();
451        final TokenType type;
452        switch (ch) {
453          case '\t': case '\n': case '\f': case '\r': case ' ': case '\ufeff':
454            consumeIgnorable();
455            type = TokenType.WHITESPACE;
456            break;
457          case '/': {
458            char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
459            if (lookahead == '/' || lookahead == '*') {
460              consumeIgnorable();
461              type = TokenType.WHITESPACE;
462            } else {
463              consumeDelim(ch);
464              type = TokenType.DELIM;
465            }
466            break;
467          }
468          case '<':
469            if (consumeIgnorable()) {  // <!--
470              type = TokenType.WHITESPACE;
471            } else {
472              consumeDelim('<');
473              type = TokenType.DELIM;
474            }
475            break;
476          case '>':
477            breakOutput();
478            sb.append('>');
479            type = TokenType.DELIM;
480            ++pos;
481            break;
482          case '@':
483            if (consumeAtKeyword()) {
484              type = TokenType.AT;
485            } else {
486              consumeDelim(ch);
487              type = TokenType.DELIM;
488            }
489            break;
490          case '#': {
491            sb.append('#');
492            TokenType hashType = consumeHash();
493            if (hashType != null) {
494              type = hashType;
495            } else {
496              ++pos;
497              sb.append(' ');
498              type = TokenType.DELIM;
499            }
500            break;
501          }
502          case '"':
503          case '\'':
504            type = consumeString();
505            break;
506          case 'U': case 'u':
507            // SPEC handle URL under "ident like token".
508            if (consumeUnicodeRange()) {
509              type = TokenType.UNICODE_RANGE;
510            } else {
511              type = consumeIdentOrUrlOrFunction();
512            }
513            break;
514          case '0': case '1': case '2': case '3': case '4':
515          case '5': case '6': case '7': case '8': case '9':
516            type = consumeNumberOrPercentageOrDimension();
517            break;
518          case '+': case '-': case '.': {
519            char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
520            if (isDecimal(lookahead)
521                || (lookahead == '.' && pos + 2 < cssLimit
522                    && isDecimal(css.charAt(pos + 2)))) {
523              type = consumeNumberOrPercentageOrDimension();
524            } else if (ch == '+') {
525              consumeDelim(ch);
526              type = TokenType.DELIM;
527            } else if (ch == '-') {
528              if (consumeIgnorable()) {  // -->
529                type = TokenType.WHITESPACE;
530              } else {
531                type = consumeIdentOrUrlOrFunction();
532              }
533            } else if (isIdentPart(lookahead)) {
534              // treat ".<IDENT>" as one token.
535              sb.append('.');
536              ++pos;
537              consumeIdent(false);
538              if (pos != startOfToken + 1) {
539                type = TokenType.DOT_IDENT;
540                if (pos < cssLimit) {
541                  char next = css.charAt(pos);
542                  if ('(' == next) {
543                    // A dotted identifier followed by a parenthesis is
544                    // ambiguously a function.
545                    sb.append(' ');
546                  }
547                }
548              } else {
549                type = TokenType.DELIM;
550                sb.append(' ');
551              }
552            } else {
553              consumeDelim('.');
554              type = TokenType.DELIM;
555            }
556            break;
557          }
558          case ':': consumeDelim(ch); type = TokenType.COLON; break;
559          case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break;
560          case ',': consumeDelim(ch); type = TokenType.COMMA; break;
561          case '[': case '(': case '{':
562            type = openBracket(ch);
563            ++pos;
564            break;
565          case '}': case ')': case ']':
566            closeBracket(ch);
567            ++pos;
568            // Use DELIM so that a later loop will split output into multiple
569            // tokens since we may have inserted missing close brackets for
570            // unclosed open brackets already on the stack.
571            type = TokenType.DELIM;
572            break;
573          case '~': case '|': case '^': case '$': case '*': {
574            char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
575            if (lookahead == '=') {
576              consumeMatch(ch);
577              type = TokenType.MATCH;
578            } else if (ch == '|' && lookahead == '|') {
579              consumeColumn();
580              type = TokenType.COLUMN;
581            } else {
582              consumeDelim(ch);
583              type = TokenType.DELIM;
584            }
585            break;
586          }
587          case '_':
588            type = consumeIdentOrUrlOrFunction();
589            break;
590          case '\\': {
591            // Optimistically parse as an ident.
592            TokenType identType = consumeIdentOrUrlOrFunction();
593            if (identType == null) {
594              ++pos;  // drop
595              breakOutput();
596              type = TokenType.WHITESPACE;
597            } else {
598              type = identType;
599            }
600            // TODO: handle case where "url" is encoded.
601            break;
602          }
603          default:
604            int chlower = ch | 32;
605            if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) {
606              TokenType identType = consumeIdentOrUrlOrFunction();
607              if (identType != null) {
608                type = identType;
609              } else {  // Occurs on undefined-codepoints.
610                ++pos;
611                breakOutput();
612                type = TokenType.WHITESPACE;
613              }
614            } else if (ch > 0x20) {
615              consumeDelim(ch);
616              type = TokenType.DELIM;
617            } else {  // Ignore.
618              consumeIgnorable();
619              type = TokenType.WHITESPACE;
620            }
621        }
622        assert pos > startOfToken
623            : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken)
624            + ":U+" + Integer.toHexString(css.charAt(startOfToken));
625        int endOfOutputToken = sb.length();
626        if (endOfOutputToken > startOfOutputToken) {
627          if (type == TokenType.DELIM) {
628            emitMergedTokens(startOfOutputToken, endOfOutputToken);
629          } else {
630            if (type != TokenType.WHITESPACE
631                && sb.charAt(startOfOutputToken) == ' ') {
632              emitToken(TokenType.WHITESPACE, startOfOutputToken);
633              ++startOfOutputToken;
634              assert startOfOutputToken != endOfOutputToken;
635            }
636            emitToken(type, startOfOutputToken);
637            // Token emitters can emit a space after a token to avoid possible
638            // merges with following tokens
639            if (type != TokenType.WHITESPACE) {
640              int sbLen = sb.length();
641              if (startOfOutputToken + 1 < sbLen
642                  && sb.charAt(sbLen - 1) == ' ') {
643                emitToken(TokenType.WHITESPACE, sbLen - 1);
644              }
645            }
646          }
647        }
648      }
649    }
650
651    private void emitMergedTokens(int start, int end) {
652      // Handle breakOutput and merging of output tokens.
653      for (int e = start; e < end; ++e) {
654        TokenType delimType;
655        switch (sb.charAt(e)) {
656          case ' ': delimType = TokenType.WHITESPACE;   break;
657          case '}': delimType = TokenType.RIGHT_CURLY;  break;
658          case ')': delimType = TokenType.RIGHT_PAREN;  break;
659          case ']': delimType = TokenType.RIGHT_SQUARE; break;
660          default : delimType = TokenType.DELIM;        break;
661        }
662        emitToken(delimType, e);
663      }
664    }
665
666    private void emitToken(TokenType type, int startOfOutputToken) {
667      if (tokenBreaksLimit == 0
668          || tokenBreaks[tokenBreaksLimit - 1] != startOfOutputToken) {
669        tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
670        tokenBreaks[tokenBreaksLimit++] = startOfOutputToken;
671        tokenTypes.add(type);
672      }
673    }
674
675    private void consumeDelim(char ch) {
676      sb.append(ch);
677      switch (ch) {
678        // Prevent token merging.
679        case '~': case '|': case '^': case '$': case '\\':
680        case '.': case '+': case '-': case '@': case '/':  case '<':
681          sb.append(' ');
682          break;
683        default:
684          break;
685      }
686      ++pos;
687    }
688
689    private boolean consumeIgnorable() {
690      String css = this.css;
691      int cssLimit = this.cssLimit;
692      int posBefore = pos;
693      while (pos < cssLimit) {
694        char ch = css.charAt(pos);
695        if (ch <= 0x20
696            // Treat a BOM as white-space so that it is ignored at the beginning
697            // of a file.
698            || ch == '\ufeff') {
699          ++pos;
700        } else if (pos + 1 == cssLimit) {
701          break;
702        } else if (ch == '/') {
703          char next = css.charAt(pos + 1);
704          if (next == '*') {
705            pos += 2;
706            while (pos < cssLimit) {
707              int ast = css.indexOf('*', pos);
708              if (ast < 0) {
709                pos = cssLimit;  // Unclosed /* comment */
710                break;
711              } else {
712                // Advance over a run of '*'s.
713                pos = ast + 1;
714                while (pos < cssLimit && css.charAt(pos) == '*') {
715                  ++pos;
716                }
717                if (pos < cssLimit && css.charAt(pos) == '/') {
718                  ++pos;
719                  break;
720                }
721              }
722            }
723          } else if (next == '/') {  // Non-standard but widely supported
724            while (++pos < cssLimit) {
725              if (isLineTerminator(css.charAt(pos))) { break; }
726            }
727          } else {
728            break;
729          }
730        } else if (ch == '<') {
731          if (pos + 3 < cssLimit
732              && '!' == css.charAt(pos + 1)
733              && '-' == css.charAt(pos + 2)
734              && '-' == css.charAt(pos + 3)) {
735            pos += 4;
736          } else {
737            break;
738          }
739        } else if (ch == '-') {
740          if (pos + 2 < cssLimit
741              && '-' == css.charAt(pos + 1)
742              && '>' == css.charAt(pos + 2)) {
743            pos += 3;
744          } else {
745            break;
746          }
747        } else {
748          break;
749        }
750      }
751      if (pos == posBefore) {
752        return false;
753      } else {
754        breakOutput();
755        return true;
756      }
757    }
758
759    private void breakOutput() {
760      int last = sb.length() - 1;
761      if (last >= 0 && sb.charAt(last) != ' ') { sb.append(' '); }
762    }
763
764    private void consumeColumn() {
765      pos += 2;
766      sb.append("||");
767    }
768
769    private void consumeMatch(char ch) {
770      pos += 2;
771      sb.append(ch).append('=');
772    }
773
774    private void consumeIdent(boolean allowFirstDigit) {
775      int cssLimit = this.cssLimit;
776      int last = -1, nCodepoints = 0;
777      int sbAtStart = sb.length();
778      int posAtStart = pos;
779      while (pos < cssLimit) {
780        int posBefore = pos;
781
782        int decoded = readCodepoint();
783        if (decoded == '\\') {
784          decoded = consumeAndDecodeEscapeSequence();
785        } else {
786          ++pos;
787        }
788
789        if (decoded >= 0 && isIdentPart(decoded)) {
790          if (!allowFirstDigit && nCodepoints < 2
791              && '0' <= decoded && decoded <= '9') {
792            // Don't allow encoded identifiers that look like numeric tokens
793            // like \-1 or ones that start with an encoded decimal digit.
794            if (last == '-' || last == -1) {
795              pos = posAtStart;
796              sb.setLength(sbAtStart);
797              return;
798            }
799          }
800          sb.appendCodePoint(decoded);
801          last = decoded;
802          ++nCodepoints;
803        } else {
804          pos = posBefore;
805          return;
806        }
807      }
808    }
809
810    private boolean consumeAtKeyword() {
811      assert css.charAt(pos) == '@';
812      int bufferLengthBeforeWrite = sb.length();
813      sb.append('@');
814      int posBeforeKeyword = ++pos;
815      consumeIdent(false);
816      if (pos == posBeforeKeyword) {
817        --pos;  // back up over '@'
818        sb.setLength(bufferLengthBeforeWrite);  // Unwrite the '@'
819        return false;
820      } else {
821        return true;
822      }
823    }
824
825
826    private int consumeAndDecodeEscapeSequence() {
827      String css = this.css;
828      int cssLimit = this.cssLimit;
829      assert css.charAt(pos) == '\\';
830      if (pos + 1 >= cssLimit) { return -1; }
831      char esc = css.charAt(pos + 1);
832      if (isLineTerminator(esc)) { return -1; }
833      int escLower = esc | 32;
834      if (('0' <= esc && esc <= '9')
835          || ('a' <= escLower && escLower <= 'f')) {
836        int hexValue = 0;
837        int hexStart = pos + 1;
838        int hexLimit = Math.min(pos + 7, cssLimit);
839        int hexEnd = hexStart;
840        do {
841          hexValue = (hexValue << 4)
842              | (esc <= '9' ? esc - '0' : escLower - ('a' - 10));
843          ++hexEnd;
844          if (hexEnd == hexLimit) { break; }
845          esc = css.charAt(hexEnd);
846          escLower = esc | 32;
847        } while (('0' <= esc && esc <= '9')
848                 || ('a' <= escLower && escLower <= 'f'));
849        if (!Character.isDefined(hexValue)) {
850          hexValue = 0xfffd;
851        }
852        pos = hexEnd;
853        if (pos < cssLimit) {
854          // A sequence of hex digits can be followed by a space that allows
855          // so that code-point U+A followed by the letter 'b' can be rendered
856          // as "\a b" since "\ab" specifies the single code-point U+AB.
857          char next = css.charAt(pos);
858          if (next == ' ' || next == '\t' || isLineTerminator(next)) {
859            ++pos;
860          }
861        }
862        return hexValue;
863      }
864      pos += 2;
865      return esc;
866    }
867
868    private static final long HEX_ENCODED_BITMASK =
869        (1L << 0) | LINE_TERMINATOR_BITMASK
870        | (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>');
871    private static boolean isHexEncoded(int codepoint) {
872      return (0 <= codepoint && codepoint < 63
873              && 0 != ((1L << codepoint) & HEX_ENCODED_BITMASK));
874    }
875
876    private void encodeCharOntoOutput(int codepoint, int last) {
877      switch (codepoint) {
878        case '\\': sb.append("\\\\"); break;
879        case '\0': sb.append("\\0");  break;
880        case '\n': sb.append("\\a");  break;
881        case '\f': sb.append("\\c");  break;
882        case '\r': sb.append("\\d");  break;
883        case '\"': sb.append("\\22"); break;
884        case '&':  sb.append("\\26"); break;
885        case '\'': sb.append("\\27"); break;
886        case '<':  sb.append("\\3c"); break;
887        case '>':  sb.append("\\3e"); break;
888        // The set of escapes above that end with a hex digit must appear in
889        // HEX_ENCODED_BITMASK.
890        case '-':
891          sb.append('-');
892          break;
893        default:
894          if (isHexEncoded(last)
895              // We need to put a space after a trailing hex digit if the
896              // next encoded character on the output would be another hex
897              // digit or a space character.  The other space characters
898              // are handled above.
899              && (codepoint == ' ' || codepoint == '\t'
900                  || ('0' <= codepoint && codepoint <= '9')
901                  || ('a' <= (codepoint | 32) && (codepoint | 32) <= 'f'))) {
902            sb.append(' ');
903          }
904          sb.appendCodePoint(codepoint);
905          break;
906      }
907    }
908
909    private TokenType consumeNumberOrPercentageOrDimension() {
910      String css = this.css;
911      int cssLimit = this.cssLimit;
912      boolean isZero = true;
913      int intStart = pos;
914      if (intStart < cssLimit) {
915        char ch = css.charAt(intStart);
916        if (ch == '-' || ch == '+') {
917          ++intStart;
918        }
919      }
920      // Find the integer part after any sign.
921      int intEnd = intStart;
922      for (; intEnd < cssLimit; ++intEnd) {
923        char ch = css.charAt(intEnd);
924        if (!('0' <= ch && ch <= '9')) { break; }
925        if (ch != '0') { isZero = false; }
926      }
927      // Find a fraction like ".5" or ".".
928      int fractionStart = intEnd;
929      int fractionEnd = fractionStart;
930      if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) {
931        ++fractionEnd;
932        for (; fractionEnd < cssLimit; ++fractionEnd) {
933          char ch = css.charAt(fractionEnd);
934          if (!('0' <= ch && ch <= '9')) { break; }
935          if (ch != '0') { isZero = false; }
936        }
937      }
938      int exponentStart = fractionEnd;
939      int exponentIntStart = exponentStart;
940      int exponentEnd = exponentStart;
941      boolean isExponentZero = true;
942      if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) {
943        // 'e' and 'e' in "5e-f" for a
944        exponentEnd = exponentStart + 1;
945        if (exponentEnd < cssLimit) {
946          char ch = css.charAt(exponentEnd);
947          if (ch == '+' || ch == '-') { ++exponentEnd; }
948        }
949        exponentIntStart = exponentEnd;
950        for (; exponentEnd < cssLimit; ++exponentEnd) {
951          char ch = css.charAt(exponentEnd);
952          if (!('0' <= ch && ch <= '9')) { break; }
953          if (ch != '0') { isExponentZero = false; }
954        }
955        // Since
956        //    dimension := <number> <ident>
957        // the below are technically valid dimensions even though they appear
958        // to have incomplete exponents:
959        //    5e
960        //    5ex
961        //    5e-
962        if (exponentEnd == exponentIntStart) {  // Incomplete exponent.
963          exponentIntStart = exponentEnd = exponentStart;
964          isExponentZero = true;
965        }
966      }
967
968      int unitStart = exponentEnd;
969      // Skip over space between number and unit.
970      // Many user-agents allow "5 ex" instead of "5ex".
971      while (unitStart < cssLimit) {
972        char ch = css.charAt(unitStart);
973        if (ch == ' ' || isLineTerminator(ch)) {
974          ++unitStart;
975        } else {
976          break;
977        }
978      }
979
980      if (sb.length() != 0 && isIdentPart(sb.charAt(sb.length() - 1))) {
981        sb.append(' ');
982      }
983      // Normalize the number onto the buffer.
984      // We will normalize and unit later.
985      // Skip the sign if it is positive.
986      if (intStart != pos && '-' == css.charAt(pos) && !isZero) {
987        sb.append('-');
988      }
989      if (isZero) {
990        sb.append('0');
991      } else {
992        // Strip leading zeroes from the integer and exponent and trailing
993        // zeroes from the fraction.
994        while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; }
995        while (fractionEnd > fractionStart
996               && css.charAt(fractionEnd - 1) == '0') {
997          --fractionEnd;
998        }
999        if (intStart == intEnd) {
1000          sb.append('0');  // .5 -> 0.5
1001        } else {
1002          sb.append(css, intStart, intEnd);
1003        }
1004        if (fractionEnd > fractionStart + 1) {  // 5. -> 5; 5.0 -> 5
1005          sb.append(css, fractionStart, fractionEnd);
1006        }
1007        if (!isExponentZero) {
1008          sb.append('e');
1009          // 1e+1 -> 1e1
1010          if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); }
1011          while (exponentIntStart < exponentEnd
1012                 && css.charAt(exponentIntStart) == '0') {
1013            ++exponentIntStart;
1014          }
1015          sb.append(css, exponentIntStart, exponentEnd);
1016        }
1017      }
1018
1019      int unitEnd;
1020      TokenType type;
1021      if (unitStart < cssLimit && '%' == css.charAt(unitStart)) {
1022        unitEnd = unitStart + 1;
1023        type = TokenType.PERCENTAGE;
1024        sb.append('%');
1025      } else {
1026        // The grammar says that any identifier following a number is a unit.
1027        int bufferBeforeUnit = sb.length();
1028        pos = unitStart;
1029        consumeIdent(false);
1030        int bufferAfterUnit = sb.length();
1031        boolean knownUnit = isWellKnownUnit(
1032            sb, bufferBeforeUnit, bufferAfterUnit);
1033        if (unitStart == exponentEnd  // No intervening space
1034            || knownUnit) {
1035          unitEnd = pos;
1036          // 3IN -> 3in
1037          for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) {
1038            char ch = sb.charAt(i);
1039            if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); }
1040          }
1041        } else {
1042          unitEnd = unitStart = exponentEnd;
1043          sb.setLength(bufferBeforeUnit);
1044        }
1045        type = unitStart == unitEnd
1046            ? TokenType.NUMBER
1047            : knownUnit
1048            ? TokenType.DIMENSION
1049            : TokenType.BAD_DIMENSION;
1050      }
1051      pos = unitEnd;
1052      if (type != TokenType.PERCENTAGE
1053          && pos < cssLimit && css.charAt(pos) == '.') {
1054        sb.append(' ');
1055      }
1056      return type;
1057    }
1058
1059    private TokenType consumeString() {
1060      String css = this.css;
1061      int cssLimit = this.cssLimit;
1062
1063      char delim = css.charAt(pos);
1064      assert delim == '"' || delim == '\'';
1065      ++pos;
1066      int startOfStringOnOutput = sb.length();
1067      sb.append('\'');
1068      int last = -1;
1069      boolean closed = false;
1070      while (pos < cssLimit) {
1071        char ch = css.charAt(pos);
1072        if (ch == delim) {
1073          ++pos;
1074          closed = true;
1075          break;
1076        }
1077        if (isLineTerminator(ch)) { break; }
1078        int decoded = ch;
1079        if (ch == '\\') {
1080          if (pos + 1 < cssLimit && isLineTerminator(css.charAt(pos+1))) {
1081            // consume it but generate no tokens.
1082            // Lookahead to treat a \r\n sequence as one line-terminator.
1083            if (pos + 2 < cssLimit
1084                && css.charAt(pos+1) == '\r' && css.charAt(pos+2) == '\n') {
1085              pos += 3;
1086            } else {
1087              pos += 2;
1088            }
1089            continue;
1090          } else {
1091            decoded = consumeAndDecodeEscapeSequence();
1092            if (decoded < 0) {
1093              break;
1094            }
1095          }
1096        } else {
1097          ++pos;
1098        }
1099        encodeCharOntoOutput(decoded, last);
1100        last = decoded;
1101      }
1102      if (closed) {
1103        sb.append('\'');
1104        return TokenType.STRING;
1105      } else {  // Drop <bad-string>s
1106        sb.setLength(startOfStringOnOutput);
1107        breakOutput();
1108        return TokenType.WHITESPACE;
1109      }
1110    }
1111
1112    private @Nullable TokenType consumeHash() {
1113      assert css.charAt(pos) == '#';
1114      ++pos;
1115      int beforeIdent = pos;
1116      consumeIdent(true);
1117      if (pos == beforeIdent) {
1118        pos = beforeIdent - 1;
1119        return null;
1120      }
1121      for (int i = beforeIdent; i < pos; ++i) {
1122        char chLower = (char) (css.charAt(i) | 32);
1123        if (!(('0' <= chLower && chLower <= '9')
1124              || ('a' <= chLower && chLower <= 'f'))) {
1125          return TokenType.HASH_ID;
1126        }
1127      }
1128      return TokenType.HASH_UNRESTRICTED;
1129    }
1130
1131    private boolean consumeUnicodeRange() {
1132      final String css = this.css;
1133      final int cssLimit = this.cssLimit;
1134
1135      assert pos < cssLimit && (css.charAt(pos) | 32) == 'u';
1136
1137      final int start = pos;
1138      final int startOfOutput = sb.length();
1139      ++pos;
1140      boolean ok = false;
1141      parse:
1142      try {
1143        if (pos == cssLimit || css.charAt(pos) != '+') {
1144          break parse;
1145        }
1146        ++pos;
1147        sb.append("U+");
1148        int numStartDigits = 0;
1149        while (pos < cssLimit && numStartDigits < 6) {
1150          char chLower = (char) (css.charAt(pos) | 32);
1151          if (('0' <= chLower && chLower <= '9')
1152              || ('a' <= chLower && chLower <= 'f')) {
1153            sb.append(chLower);
1154            ++numStartDigits;
1155            ++pos;
1156          } else {
1157            break;
1158          }
1159        }
1160        if (numStartDigits == 0) {
1161          break parse;
1162        }
1163        boolean hasQmark = false;
1164        while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') {
1165          hasQmark = true;
1166          sb.append('?');
1167          ++numStartDigits;
1168          ++pos;
1169        }
1170        if (numStartDigits == 0) {
1171          break parse;
1172        }
1173        if (pos < cssLimit && css.charAt(pos) == '-') {
1174          if (!hasQmark) {
1175            // Look for end of range.
1176            ++pos;
1177            sb.append('-');
1178            int numEndDigits = 0;
1179            while (pos < cssLimit && numEndDigits < 6) {
1180              char chLower = (char) (css.charAt(pos) | 32);
1181              if (('0' <= chLower && chLower <= '9')
1182                  || ('a' <= chLower && chLower <= 'f')) {
1183                ++numEndDigits;
1184                ++pos;
1185                sb.append(chLower);
1186              } else {
1187                break;
1188              }
1189            }
1190            if (numEndDigits == 0) {
1191              // Back up over '-'
1192              --pos;
1193              sb.append(' ');
1194            }
1195          } else {
1196            sb.append(' ');
1197          }
1198        }
1199        ok = true;
1200      } finally {
1201        if (!ok) {
1202          pos = start;
1203          sb.setLength(startOfOutput);
1204        }
1205      }
1206      return ok;
1207    }
1208
1209    private @Nullable TokenType consumeIdentOrUrlOrFunction() {
1210      int bufferStart = sb.length();
1211      int posBefore = pos;
1212      consumeIdent(false);
1213      if (pos == posBefore) { return null; }
1214      boolean parenAfter = pos < cssLimit && css.charAt(pos) == '(';
1215      if (sb.length() - bufferStart == 3
1216          && 'u' == (sb.charAt(bufferStart) | 32)
1217          && 'r' == (sb.charAt(bufferStart + 1) | 32)
1218          && 'l' == (sb.charAt(bufferStart + 2) | 32)) {
1219        if (parenAfter && consumeUrlValue()) {
1220          sb.setCharAt(bufferStart, 'u');
1221          sb.setCharAt(bufferStart + 1, 'r');
1222          sb.setCharAt(bufferStart + 2, 'l');
1223          return TokenType.URL;
1224        } else {
1225          sb.setLength(bufferStart);
1226          breakOutput();
1227          return TokenType.WHITESPACE;
1228        }
1229      } else if (parenAfter) {
1230        openBracket('(');
1231        ++pos;
1232        return TokenType.FUNCTION;
1233      } else {
1234        if (pos + 1 < cssLimit && '.' == css.charAt(pos)) {
1235          // Prevent merging of ident and number as in
1236          //     border:solid.1cm black
1237          // when .1 is rewritten to 0.1 becoming
1238          //     border:solid0.1cm black
1239          char next = css.charAt(pos + 1);
1240          if ('0' <= next && next <= '9') {
1241            sb.append(' ');
1242          }
1243        }
1244        return TokenType.IDENT;
1245      }
1246    }
1247
1248    private boolean consumeUrlValue() {
1249      String css = this.css;
1250      int cssLimit = this.cssLimit;
1251      if (pos == cssLimit || css.charAt(pos) != '(') { return false; }
1252      ++pos;
1253      // skip space.
1254      for (; pos < cssLimit; ++pos) {
1255        char ch = css.charAt(pos);
1256        if (ch != ' ' && !isLineTerminator(ch)) { break; }
1257      }
1258      // Find the value.
1259      int delim;
1260      if (pos < cssLimit) {
1261        char ch = pos < cssLimit ? css.charAt(pos) : '\0';
1262        if (ch == '"' || ch == '\'') {
1263          delim = ch;
1264          ++pos;
1265        } else {
1266          delim = '\0';
1267        }
1268      } else {
1269        return false;
1270      }
1271      sb.append("('");
1272      while (pos < cssLimit) {
1273        int decoded = readCodepoint();
1274        if (delim != 0) {
1275          if (decoded == delim) {
1276            ++pos;
1277            break;
1278          }
1279        } else if (decoded <= ' ' || decoded == ')') {
1280          break;
1281        }
1282        if (decoded == '\\') {
1283          decoded = consumeAndDecodeEscapeSequence();
1284          if (decoded < 0) {
1285            return false;
1286          }
1287        } else {
1288          ++pos;
1289        }
1290        // Any character not in the RFC 3986 safe set is %-encoded.
1291        if (decoded < URL_SAFE.length && URL_SAFE[decoded]) {
1292          sb.appendCodePoint(decoded);
1293        } else if (decoded < 0x80) {
1294          sb.append('%')
1295            .append(HEX_DIGITS[(decoded >>> 4) & 0xf])
1296            .append(HEX_DIGITS[(decoded >>> 0) & 0xf]);
1297        } else if (decoded < 0x800) {
1298          int octet0 = 0xc0 | ((decoded >>> 6) & 0x1f),
1299              octet1 = 0x80 | (decoded         & 0x3f);
1300          sb.append('%')
1301            .append(HEX_DIGITS[(octet0 >>> 4) & 0xf])
1302            .append(HEX_DIGITS[(octet0 >>> 0) & 0xf])
1303            .append('%')
1304            .append(HEX_DIGITS[(octet1 >>> 4) & 0xf])
1305            .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]);
1306        } else if (decoded < 0x10000) {
1307          int octet0 = 0xe0 | ((decoded >>> 12) & 0xf),
1308              octet1 = 0x80 | ((decoded >>> 6)  & 0x3f),
1309              octet2 = 0x80 | (decoded          & 0x3f);
1310          sb.append('%')
1311            .append(HEX_DIGITS[(octet0 >>> 4) & 0xf])
1312            .append(HEX_DIGITS[(octet0 >>> 0) & 0xf])
1313            .append('%')
1314            .append(HEX_DIGITS[(octet1 >>> 4) & 0xf])
1315            .append(HEX_DIGITS[(octet1 >>> 0) & 0xf])
1316            .append('%')
1317            .append(HEX_DIGITS[(octet2 >>> 4) & 0xf])
1318            .append(HEX_DIGITS[(octet2 >>> 0) & 0xf]);
1319        } else {
1320          int octet0 = 0xf0 | ((decoded >>> 18) & 0x7),
1321              octet1 = 0x80 | ((decoded >>> 12) & 0x3f),
1322              octet2 = 0x80 | ((decoded >>> 6)  & 0x3f),
1323              octet3 = 0x80 | (decoded          & 0x3f);
1324          sb.append('%')
1325            .append(HEX_DIGITS[(octet0 >>> 4) & 0xf])
1326            .append(HEX_DIGITS[(octet0 >>> 0) & 0xf])
1327            .append('%')
1328            .append(HEX_DIGITS[(octet1 >>> 4) & 0xf])
1329            .append(HEX_DIGITS[(octet1 >>> 0) & 0xf])
1330            .append('%')
1331            .append(HEX_DIGITS[(octet2 >>> 4) & 0xf])
1332            .append(HEX_DIGITS[(octet2 >>> 0) & 0xf])
1333            .append('%')
1334            .append(HEX_DIGITS[(octet3 >>> 4) & 0xf])
1335            .append(HEX_DIGITS[(octet3 >>> 0) & 0xf]);
1336        }
1337      }
1338
1339      // skip space.
1340      for (; pos < cssLimit; ++pos) {
1341        char ch = css.charAt(pos);
1342        if (ch != ' ' && !isLineTerminator(ch)) { break; }
1343      }
1344      if (pos < cssLimit && css.charAt(pos) == ')') {
1345        ++pos;
1346      } else {
1347        // broken-url
1348      }
1349      sb.append("')");
1350      return true;
1351    }
1352
1353    /**
1354     * Reads the codepoint at pos, leaving pos at the index of the last code
1355     * unit.
1356     */
1357    private int readCodepoint() {
1358      String css = this.css;
1359      char ch = css.charAt(pos);
1360      if (Character.isHighSurrogate(ch) && pos + 1 < cssLimit) {
1361        char next = css.charAt(pos + 1);
1362        if (Character.isLowSurrogate(next)) {
1363          ++pos;
1364          return 0x10000 + (((ch - 0xd800) << 10) | (next - 0xdc00));
1365        }
1366      }
1367      return ch;
1368    }
1369  }
1370
1371  private static final boolean isIdentPart(int cp) {
1372    return cp >= 0x80
1373        ? Character.isDefined(cp) && cp != '\ufeff'
1374        : IDENT_PART_ASCII[cp];
1375  }
1376
1377  private static final boolean isDecimal(char ch) {
1378    return '0' <= ch && ch <= '9';
1379  }
1380
1381  private static final boolean[] IDENT_PART_ASCII = new boolean[128];
1382  static {
1383    for (int i = '0'; i <= '9'; ++i) { IDENT_PART_ASCII[i] = true; }
1384    for (int i = 'A'; i <= 'Z'; ++i) { IDENT_PART_ASCII[i] = true; }
1385    for (int i = 'a'; i <= 'z'; ++i) { IDENT_PART_ASCII[i] = true; }
1386    IDENT_PART_ASCII['_'] = true;
1387    IDENT_PART_ASCII['-'] = true;
1388  }
1389
1390  private static final int LINE_TERMINATOR_BITMASK =
1391      (1 << '\n') | (1 << '\r') | (1 << '\f');
1392
1393  private static boolean isLineTerminator(char ch) {
1394    return ch < 0x20 && 0 != (LINE_TERMINATOR_BITMASK & (1 << ch));
1395  }
1396
1397  private static int[] expandIfNecessary(int[] arr, int limit, int needed) {
1398    int neededLength = limit + needed;
1399    int length = arr.length;
1400    if (length >= neededLength) { return arr; }
1401    int[] newArr = new int[Math.max(16, Math.max(neededLength, length * 2))];
1402    System.arraycopy(arr, 0, newArr, 0, limit);
1403    return newArr;
1404  }
1405
1406  private static int[] truncateOrShare(int[] arr, int limit) {
1407    if (limit == 0) { return ZERO_INTS; }
1408    if (limit == arr.length) {
1409      return arr;
1410    }
1411    int[] trunc = new int[limit];
1412    System.arraycopy(arr, 0, trunc, 0, limit);
1413    return trunc;
1414  }
1415
1416  private static final int LENGTH_UNIT_TYPE = 0;
1417  private static final int ANGLE_UNIT_TYPE = 1;
1418  private static final int TIME_UNIT_TYPE = 2;
1419  private static final int FREQUENCY_UNIT_TYPE = 3;
1420  private static final int RESOLUTION_UNIT_TYPE = 4;
1421
1422  /**
1423   * See http://dev.w3.org/csswg/css-values/#lengths and
1424   *     http://dev.w3.org/csswg/css-values/#other-units
1425   */
1426  private static final Trie UNIT_TRIE = new Trie(
1427      ImmutableMap.<String, Integer>builder()
1428        .put("em", LENGTH_UNIT_TYPE)
1429        .put("ex", LENGTH_UNIT_TYPE)
1430        .put("ch", LENGTH_UNIT_TYPE)  // Width of zero character
1431        .put("rem", LENGTH_UNIT_TYPE)  // Root element font-size
1432        .put("vh", LENGTH_UNIT_TYPE)
1433        .put("vw", LENGTH_UNIT_TYPE)
1434        .put("vmin", LENGTH_UNIT_TYPE)
1435        .put("vmax", LENGTH_UNIT_TYPE)
1436        .put("px", LENGTH_UNIT_TYPE)
1437        .put("mm", LENGTH_UNIT_TYPE)
1438        .put("cm", LENGTH_UNIT_TYPE)
1439        .put("in", LENGTH_UNIT_TYPE)
1440        .put("pt", LENGTH_UNIT_TYPE)
1441        .put("pc", LENGTH_UNIT_TYPE)
1442        .put("deg", ANGLE_UNIT_TYPE)
1443        .put("rad", ANGLE_UNIT_TYPE)
1444        .put("grad", ANGLE_UNIT_TYPE)
1445        .put("turn", ANGLE_UNIT_TYPE)
1446        .put("s", TIME_UNIT_TYPE)
1447        .put("ms", TIME_UNIT_TYPE)
1448        .put("hz", FREQUENCY_UNIT_TYPE)
1449        .put("khz", FREQUENCY_UNIT_TYPE)
1450        .put("dpi", RESOLUTION_UNIT_TYPE)
1451        .put("dpcm", RESOLUTION_UNIT_TYPE)
1452        .put("dppx", RESOLUTION_UNIT_TYPE)
1453        .build());
1454
1455  static boolean isWellKnownUnit(CharSequence s, int start, int end) {
1456    if (start == end) { return false; }
1457    Trie t = UNIT_TRIE;
1458    for (int i = start; i < end; ++i) {
1459      char ch = s.charAt(i);
1460      t = t.lookup('A' <= ch && ch <= 'Z' ? (char) (ch | 32) : ch);
1461      if (t == null) { return false; }
1462    }
1463    return t.isTerminal();
1464  }
1465
1466  static boolean isWellKnownUnit(CharSequence s) {
1467    return isWellKnownUnit(s, 0, s.length());
1468  }
1469
1470  private static final boolean[] URL_SAFE = new boolean[128];
1471  static {
1472    // From RFC 3986
1473    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
1474    for (int i = 'A'; i <= 'Z'; ++i) { URL_SAFE[i] = true; }
1475    for (int i = 'a'; i <= 'z'; ++i) { URL_SAFE[i] = true; }
1476    for (int i = '0'; i <= '9'; ++i) { URL_SAFE[i] = true; }
1477    URL_SAFE['-'] = true;
1478    URL_SAFE['.'] = true;
1479    URL_SAFE['_'] = true;
1480    URL_SAFE['~'] = true;
1481    // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1482    URL_SAFE[':'] = true;
1483    URL_SAFE['/'] = true;
1484    URL_SAFE['?'] = true;
1485    URL_SAFE['#'] = true;
1486    URL_SAFE['['] = true;
1487    URL_SAFE[']'] = true;
1488    URL_SAFE['@'] = true;
1489    // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
1490    //             / "*" / "+" / "," / ";" / "="
1491    URL_SAFE['!'] = true;
1492    URL_SAFE['$'] = true;
1493    URL_SAFE['&'] = true;
1494    // Only used in obsolete mark rule and special in unquoted URLs or comment
1495    // delimiters.
1496    // URL_SAFE['\''] = true;
1497    // URL_SAFE['('] = true;
1498    // URL_SAFE[')'] = true;
1499    // URL_SAFE['*'] = true;
1500    URL_SAFE['+'] = true;
1501    URL_SAFE[','] = true;
1502    URL_SAFE[';'] = true;
1503    URL_SAFE['='] = true;
1504    // % is used to encode unsafe octets.
1505    URL_SAFE['%'] = true;
1506  }
1507
1508  private static final char[] HEX_DIGITS = {
1509    '0', '1', '2', '3',
1510    '4', '5', '6', '7',
1511    '8', '9', 'a', 'b',
1512    'c', 'd', 'e', 'f'
1513  };
1514}
1515