CssTokens.java revision b600c3cd7edfb02d79c264fd83b1306e94053b7e
1// Copyright (c) 2013, Mike Samuel 2// All rights reserved. 3// 4// Redistribution and use in source and binary forms, with or without 5// modification, are permitted provided that the following conditions 6// are met: 7// 8// Redistributions of source code must retain the above copyright 9// notice, this list of conditions and the following disclaimer. 10// Redistributions in binary form must reproduce the above copyright 11// notice, this list of conditions and the following disclaimer in the 12// documentation and/or other materials provided with the distribution. 13// Neither the name of the OWASP nor the names of its contributors may 14// be used to endorse or promote products derived from this software 15// without specific prior written permission. 16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 26// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27// POSSIBILITY OF SUCH DAMAGE. 28 29package org.owasp.html; 30 31import java.util.ArrayList; 32import java.util.Arrays; 33import java.util.Iterator; 34import java.util.List; 35import java.util.NoSuchElementException; 36 37import javax.annotation.Nullable; 38 39import com.google.common.collect.ImmutableMap; 40 41/** 42 * Given a string of CSS, produces a string of normalized CSS with certain 43 * useful properties detailed below. 44 * <ul> 45 * <li>All runs of white-space and comment tokens (including CDO and CDC) 46 * have been replaced with a single space character.</li> 47 * <li>All strings are quoted and escapes are escaped according to the 48 * following scheme: 49 * <table> 50 * <tr><td>NUL</td> <td><code>\0</code></tr> 51 * <tr><td>line feed</td> <td><code>\a</code></tr> 52 * <tr><td>vertical feed</td> <td><code>\c</code></tr> 53 * <tr><td>carriage return</td><td><code>\d</code></tr> 54 * <tr><td>double quote</td> <td><code>\22</code></tr> 55 * <tr><td>ampersand &</td><td><code>\26</code></tr> 56 * <tr><td>single quote</td> <td><code>\27</code></tr> 57 * <tr><td>left-angle <</td><td><code>\3c</code></tr> 58 * <tr><td>rt-angle ></td> <td><code>\3e</code></tr> 59 * <tr><td>back slash</td> <td><code>\\</code></tr> 60 * <tr><td>dash</td> <td><code>-</code> or <code>\-</code></tr> 61 * <tr><td>all others</td> <td>raw</td></tr> 62 * </table> 63 * The dash may be encoded as necessary to prevent the valid CSS, 64 * <pre>.foo-\2d>span { color:red }</pre> 65 * which specifies the selector for the {@code <span>} children of elements 66 * with class ".foo--" from being rendered to include an ignorable 67 * {@code -->} token. 68 * </li> 69 * <li>All <code>url(…)</code> tokens are quoted. 70 * <li>All keywords, identifiers, and hex literals are lower-case and have 71 * embedded escape sequences decoded.</li> 72 * <li>All brackets nest properly.</li> 73 * <li>Does not contain the sequence {@code <!--} or {@code -->}.</li> 74 * <li>All delimiters that can start longer tokens are followed by a space. 75 * </ul> 76 */ 77final class CssTokens implements Iterable<String> { 78 79static final boolean DEBUG = false; 80 81 public final String normalizedCss; 82 public final Brackets brackets; 83 private final int[] tokenBreaks; 84 private final TokenType[] tokenTypes; 85 86 public TokenIterator start() { 87 return new TokenIterator(tokenTypes.length); 88 } 89 90 public TokenIterator iterator() { return start(); } 91 92 public static CssTokens lex(String css) { 93 Lexer lexer = new Lexer(css); 94 lexer.lex(); 95 return lexer.build(); 96 } 97 98 /** A cursor into a list of tokens. */ 99 public final class TokenIterator implements Iterator<String> { 100 private int tokenIndex = 0; 101 private final int limit; 102 103 TokenIterator(int limit) { 104 this.limit = limit; 105 } 106 107 public boolean hasNext() { 108 return hasToken(); 109 } 110 111 public String next() { 112 String token = token(); 113 advance(); 114 return token; 115 } 116 117 public @Nullable TokenIterator spliceToEnd() { 118 if (!hasNext()) { throw new NoSuchElementException(); } 119 int end = brackets.partner(tokenIndex); 120 if (end < 0) { 121 return null; 122 } 123 TokenIterator between = new TokenIterator(end); 124 between.tokenIndex = tokenIndex + 1; 125 tokenIndex = end + 1; 126 return between; 127 } 128 129 public int tokenIndex() { 130 return tokenIndex; 131 } 132 133 public int startOffset() { 134 return tokenBreaks[tokenIndex]; 135 } 136 137 public int endOffset() { 138 return tokenBreaks[tokenIndex+1]; 139 } 140 141 public String token() { 142 return normalizedCss.substring(startOffset(), endOffset()); 143 } 144 145 public boolean hasToken() { 146 return tokenIndex < limit; 147 } 148 149 public boolean hasTokenAfterSpace() { 150 while (hasToken()) { 151 if (type() != TokenType.WHITESPACE) { return true; } 152 advance(); 153 } 154 return false; 155 } 156 157 /** The type of the current token. */ 158 public TokenType type() { 159 return tokenTypes[tokenIndex]; 160 } 161 162 public void seek(int tokenIndex) { 163 this.tokenIndex = tokenIndex; 164 } 165 166 public void advance() { 167 if (!hasToken()) { throw new NoSuchElementException(); } 168 ++tokenIndex; 169 } 170 171 public void remove() throws UnsupportedOperationException { 172 throw new UnsupportedOperationException(); 173 } 174 } 175 176 private CssTokens( 177 String normalizedCss, Brackets brackets, int[] tokenBreaks, 178 TokenType[] tokenTypes) { 179 this.normalizedCss = normalizedCss; 180 this.brackets = brackets; 181 this.tokenBreaks = tokenBreaks; 182 this.tokenTypes = tokenTypes; 183 } 184 185 public enum TokenType { 186 /** An identifier. */ 187 IDENT, 188 /** An identifier prefixed with a period. */ 189 DOT_IDENT, 190 /** A function name and opening bracket. */ 191 FUNCTION, 192 /** An {@code @<identifier>} directive token. */ 193 AT, 194 /** A hash token that contains non-hex characters. */ 195 HASH_ID, 196 /** A hash token that could be a color literal. */ 197 HASH_UNRESTRICTED, 198 /** A quoted string. */ 199 STRING, 200 /** A URL of the form <code>url("...")</code>. */ 201 URL, 202 /** A single character. */ 203 DELIM, 204 /** A scalar numeric value. */ 205 NUMBER, 206 /** A percentage. */ 207 PERCENTAGE, 208 /** A numeric value with a unit suffix. */ 209 DIMENSION, 210 /** {@code U+<hex-or-qmark>} */ 211 UNICODE_RANGE, 212 /** 213 * include-match, dash-match, prefix-match, suffix-match, substring-match 214 */ 215 MATCH, 216 /** {@code ||} */ 217 COLUMN, 218 /** A run of white-space, comment, CDO, and CDC tokens. */ 219 WHITESPACE, 220 /** {@code :} */ 221 COLON, 222 /** {@code ;} */ 223 SEMICOLON, 224 /** {@code ,} */ 225 COMMA, 226 /** {@code [} */ 227 LEFT_SQUARE, 228 /** {@code ]} */ 229 RIGHT_SQUARE, 230 /** {@code (} */ 231 LEFT_PAREN, 232 /** {@code )} */ 233 RIGHT_PAREN, 234 /** <code>{</code> */ 235 LEFT_CURLY, 236 /** <code>}</code> */ 237 RIGHT_CURLY, 238 ; 239 } 240 241 /** 242 * Maps tokens to their partners. A close bracket token like {@code (} may 243 * have a partner token like {@code )} if properly nested, and vice-versa. 244 */ 245 static final class Brackets { 246 /** 247 * For each token index, the index of the indexed token's partner or -1 if 248 * it has none. 249 */ 250 private final int[] brackets; 251 252 private Brackets(int[] brackets) { 253 this.brackets = brackets; 254 } 255 256 /** The index of the partner token or -1 if none. */ 257 int partner(int tokenIndex) { 258if (DEBUG) System.err.println("index=" + tokenIndex + ", brackets=" + Arrays.toString(brackets)); 259 int bracketIndex = bracketIndexForToken(tokenIndex); 260if (DEBUG) System.err.println("\tbracketIndex=" + bracketIndex); 261 if (bracketIndex < 0) { return -1; } 262 return brackets[(bracketIndex << 1) + 1]; 263 } 264 265 int bracketIndexForToken(int target) { 266 // Binary search by leftmost element of pair. 267 int left = 0; 268 int right = brackets.length >> 1; 269 while (left < right) { 270 int mid = (left + (right - left) >> 1); 271 int value = brackets[mid << 1]; 272 if (value == target) { return mid; } 273 if (value < target) { 274 left = mid + 1; 275 } else { 276 right = mid; 277 } 278 } 279 return -1; 280 } 281 } 282 283 private static final int[] ZERO_INTS = new int[0]; 284 285 private static final TokenType[] ZERO_TYPES = new TokenType[0]; 286 287 private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS); 288 289 private static final CssTokens EMPTY = new CssTokens( 290 "", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES); 291 292 /** 293 * Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/ 294 */ 295 private static final class Lexer { 296 private final String css; 297 private final StringBuilder sb; 298 private int pos = 0; 299 private final int cssLimit; 300 301 private List<TokenType> tokenTypes = null; 302 private int[] tokenBreaks = new int[128]; 303 private int tokenBreaksLimit = 0; 304 305 /** 306 * For each bracket, 2 ints: the token index of the bracket, and the token 307 * index of its partner. 308 * The array is sorted by the first int. 309 * The second int is -1 when the bracket has not yet been closed. 310 */ 311 private int[] brackets = ZERO_INTS; 312 /** 313 * The number of elements in {@link #brackets} that are valid. 314 * {@code brackets[bracketsLimit:]} is zeroed space that the list can grow 315 * into. 316 */ 317 private int bracketsLimit = 0; 318 /** 319 * For each bracket that has not been closed, 2 ints: 320 * its index in {@link #brackets} and the character of its close bracket 321 * as an int. 322 * This is a bracket stack so the array is sorted by the first int. 323 */ 324 private int[] open = ZERO_INTS; 325 /** 326 * The number of elements in {@link #open} that are valid. 327 * {@code open[openLimit:]} is garbage space that the stack can grow into. 328 */ 329 private int openLimit = 0; 330 331 Lexer(String css) { 332 this.css = css; 333 this.sb = new StringBuilder(); 334 this.cssLimit = css.length(); 335 } 336 337 TokenType openBracket(char bracketChar) { 338 char close; 339 TokenType type; 340 switch (bracketChar) { 341 case '(': close = ')'; type = TokenType.LEFT_PAREN; break; 342 case '[': close = ']'; type = TokenType.LEFT_SQUARE; break; 343 case '{': close = '}'; type = TokenType.LEFT_CURLY; break; 344 default: 345 throw new AssertionError("Invalid open bracket " + bracketChar); 346 } 347 brackets = expandIfNecessary(brackets, bracketsLimit, 2); 348 open = expandIfNecessary(open, openLimit, 2); 349 open[openLimit++] = bracketsLimit; 350 open[openLimit++] = close; 351 brackets[bracketsLimit++] = tokenBreaksLimit; 352 brackets[bracketsLimit++] = -1; 353 sb.append(bracketChar); 354 return type; 355 } 356 357 void closeBracket(char bracketChar) { 358 int openLimitAfterClose = openLimit; 359 do { 360 if (openLimitAfterClose == 0) { 361 // Drop an orphaned close bracket. 362 breakOutput(); 363 return; 364 } 365 openLimitAfterClose -= 2; 366 } while (bracketChar != open[openLimitAfterClose + 1]); 367 368 // Make sure we've got space on brackets. 369 int spaceNeeded = openLimit - openLimitAfterClose; 370 brackets = expandIfNecessary(brackets, bracketsLimit, spaceNeeded); 371 372 closeBrackets(openLimitAfterClose); 373 } 374 375 private void closeBrackets(int openLimitAfterClose) { 376 int closeTokenIndex = tokenBreaksLimit; 377 while (openLimit > openLimitAfterClose) { 378 // Pop the stack. 379 int closeBracket = open[--openLimit]; 380 int openBracketIndex = open[--openLimit]; 381 // Update open bracket to point to its partner. 382 brackets[openBracketIndex + 1] = closeTokenIndex; 383 // Emit the close bracket. 384 brackets[bracketsLimit++] = closeTokenIndex; 385 brackets[bracketsLimit++] = openBracketIndex; 386 sb.appendCodePoint(closeBracket); 387 closeTokenIndex++; 388 } 389 } 390 391 CssTokens build() { 392 // Close any still open brackets. 393 { 394 int startOfCloseBrackets = sb.length(); 395 closeBrackets(0); 396 emitMergedTokens(startOfCloseBrackets, sb.length()); 397 } 398 399 if (tokenTypes == null) { return EMPTY; } 400 int[] bracketsTrunc = truncateOrShare(brackets, bracketsLimit); 401 402 // Strip any trailing space off, since it may have been inserted by a 403 // breakAfter call anyway. 404 int cssEnd = sb.length(); 405 if (cssEnd > 0 && sb.charAt(cssEnd - 1) == ' ') { 406if (DEBUG) System.err.println( 407 "cssEnd=" + cssEnd + ", tokenBreaksLimit=" + tokenBreaksLimit + ", tokenTypes.size=" + tokenTypes.size()); 408 --cssEnd; 409 tokenTypes.remove(--tokenBreaksLimit); 410if (DEBUG) System.err.println( 411 "cssEnd'=" + cssEnd + ", tokenBreaksLimit=" + tokenBreaksLimit + ", tokenTypes.size=" + tokenTypes.size()); 412 } 413 String normalizedCss = sb.substring(0, cssEnd); 414 415 // Store the last character on the tokenBreaksList to simplify finding the 416 // end of a token. 417 tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1); 418 tokenBreaks[tokenBreaksLimit++] = normalizedCss.length(); 419 420 int[] tokenBreaksTrunc = truncateOrShare(tokenBreaks, tokenBreaksLimit); 421 TokenType[] tokenTypesArr = tokenTypes.toArray(ZERO_TYPES); 422 423if (DEBUG) System.err.println("normalizedCss=" + normalizedCss); 424if (DEBUG) System.err.println("tokenBreaks=" + Arrays.toString(tokenBreaksTrunc)); 425if (DEBUG) System.err.println("tokenTypes =" + Arrays.toString(tokenTypesArr)); 426 427 return new CssTokens( 428 normalizedCss, new Brackets(bracketsTrunc), 429 tokenBreaksTrunc, tokenTypesArr); 430 } 431 432 void lex() { 433 // Fast-track no content. 434 consumeIgnorable(); 435 sb.setLength(0); 436 if (pos == cssLimit) { return; } 437 438 String css = this.css; 439 int cssLimit = this.cssLimit; 440 int lastPos = pos - 1; 441 while (pos < cssLimit) { 442 assert pos > lastPos 443 : "pos=" + pos + ", lastPos=" + lastPos + ", ch=" + css.charAt(pos); 444 lastPos = pos; 445if (DEBUG) System.err.println("Starting at " + pos); 446 // SPEC: 4. Tokenization 447 // The output of the tokenization step is a stream of zero 448 // or more of the following tokens: <ident>, <function>, 449 // <at-keyword>, <hash>, <string>, <bad-string>, <url>, 450 // <bad-url>, <delim>, <number>, <percentage>, 451 // <dimension>, <unicode-range>, <include-match>, 452 // <dash-match>, <prefix-match>, <suffix-match>, 453 // <substring-match>, <column>, <whitespace>, <CDO>, 454 // <CDC>, <colon>, <semicolon>, <comma>, <[>, <]>, 455 // <(>, <)>, <{>, and <}>. 456 457 // IMPLEMENTS: 4.3 Consume a token 458 char ch = css.charAt(pos); 459 int startOfToken = pos; 460 int startOfOutputToken = sb.length(); 461 final TokenType type; 462 switch (ch) { 463 case '\t': case '\n': case '\f': case '\r': case ' ': 464 consumeIgnorable(); 465 type = TokenType.WHITESPACE; 466 break; 467 case '/': { 468 char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; 469 if (lookahead == '/' || lookahead == '*') { 470 consumeIgnorable(); 471 type = TokenType.WHITESPACE; 472 } else { 473 consumeDelim(ch); 474 type = TokenType.DELIM; 475 } 476 break; 477 } 478 case '<': 479 if (consumeIgnorable()) { // <!-- 480 type = TokenType.WHITESPACE; 481 } else { 482 consumeDelim('<'); 483 type = TokenType.DELIM; 484 } 485 break; 486 case '@': 487 if (consumeAtKeyword()) { 488 type = TokenType.AT; 489 } else { 490 consumeDelim(ch); 491 type = TokenType.DELIM; 492 } 493 break; 494 case '#': { 495 sb.append('#'); 496 TokenType hashType = consumeHash(); 497 if (hashType != null) { 498 type = hashType; 499 } else { 500 ++pos; 501 type = TokenType.DELIM; 502 } 503 break; 504 } 505 case '"': 506 case '\'': 507 consumeString(); 508 type = TokenType.STRING; 509 break; 510 case 'U': 511 // SPEC handle URL under "ident like token". 512 if (consumeUnicodeRange()) { 513 type = TokenType.UNICODE_RANGE; 514 } else { 515 type = consumeIdentOrUrlOrFunction(); 516 } 517 break; 518 case '0': case '1': case '2': case '3': case '4': 519 case '5': case '6': case '7': case '8': case '9': 520 type = consumeNumberOrPercentageOrDimension(); 521 break; 522 case '+': case '-': case '.': { 523 char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; 524if (DEBUG) System.err.println("ch=" + ch + ", lookahead=" + lookahead); 525 if (isDecimal(lookahead) 526 || (lookahead == '.' && pos + 2 < cssLimit 527 && isDecimal(css.charAt(pos + 2)))) { 528 type = consumeNumberOrPercentageOrDimension(); 529 } else if (ch == '+') { 530 consumeDelim(ch); 531 type = TokenType.DELIM; 532 } else if (ch == '-') { 533 if (consumeIgnorable()) { // --> 534 type = TokenType.WHITESPACE; 535 } else { 536 type = consumeIdentOrUrlOrFunction(); 537 } 538 } else if (isIdentPart(lookahead)) { 539 // treat ".<IDENT>" as one token. 540 sb.append('.'); 541 ++pos; 542 consumeIdent(); 543 type = TokenType.DOT_IDENT; 544 } else { 545 consumeDelim('.'); 546 type = TokenType.DELIM; 547 } 548 break; 549 } 550 case ':': consumeDelim(ch); type = TokenType.COLON; break; 551 case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break; 552 case ',': consumeDelim(ch); type = TokenType.COMMA; break; 553 case '[': case '(': case '{': 554 type = openBracket(ch); 555 ++pos; 556 break; 557 case '}': case ')': case ']': 558 closeBracket(ch); 559 ++pos; 560 // Use DELIM so that a later loop will split output into multiple 561 // tokens since we may have inserted missing close brackets for 562 // unclosed open brackets already on the stack. 563 type = TokenType.DELIM; 564 break; 565 case '~': case '|': case '^': case '$': case '*': { 566 char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0; 567 if (lookahead == '=') { 568 consumeMatch(ch); 569 type = TokenType.MATCH; 570 } else if (ch == '|' && lookahead == '|') { 571 consumeColumn(); 572 type = TokenType.COLUMN; 573 } else { 574 consumeDelim(ch); 575 type = TokenType.DELIM; 576 } 577 break; 578 } 579 case '_': 580 type = consumeIdentOrUrlOrFunction(); 581 break; 582 case '\\': { 583 // Optimistically parse as an ident. 584 TokenType identType = consumeIdentOrUrlOrFunction(); 585 if (identType == null) { 586 ++pos; // drop 587 breakOutput(); 588 type = TokenType.WHITESPACE; 589 } else { 590 type = identType; 591 } 592 // TODO: handle case where "url" is encoded. 593 break; 594 } 595 default: 596 int chlower = ch | 32; 597 if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) { 598 type = consumeIdentOrUrlOrFunction(); 599 } else if (ch > 0x20) { 600 consumeDelim(ch); 601 type = TokenType.DELIM; 602 } else { // Ignore. 603 breakOutput(); 604 type = TokenType.WHITESPACE; 605 ++pos; 606 } 607 } 608 assert pos > startOfToken 609 : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken); 610if (DEBUG) System.err.println("found `" + css.substring(startOfToken, pos) + "` -> `" + sb.substring(startOfOutputToken) + "`"); 611 int endOfOutputToken = sb.length(); 612 if (endOfOutputToken > startOfOutputToken) { 613 if (type == TokenType.DELIM) { 614 emitMergedTokens(startOfOutputToken, endOfOutputToken); 615 } else { 616 emitToken(type, startOfOutputToken); 617 } 618 } 619 } 620 } 621 622 private void emitMergedTokens(int start, int end) { 623 // Handle breakOutput and merging of output tokens. 624 for (int e = start; e < end; ++e) { 625 TokenType delimType; 626 switch (sb.charAt(e)) { 627 case ' ': delimType = TokenType.WHITESPACE; break; 628 case '}': delimType = TokenType.RIGHT_CURLY; break; 629 case ')': delimType = TokenType.RIGHT_PAREN; break; 630 case ']': delimType = TokenType.RIGHT_SQUARE; break; 631 default : delimType = TokenType.DELIM; break; 632 } 633 emitToken(delimType, e); 634 } 635 } 636 637 private void emitToken(TokenType type, int startOfOutputToken) { 638 if (tokenBreaksLimit == 0 639 || tokenBreaks[tokenBreaksLimit - 1] != startOfOutputToken) { 640 if (tokenTypes == null) { tokenTypes = new ArrayList<TokenType>(); } 641 tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1); 642 tokenBreaks[tokenBreaksLimit++] = startOfOutputToken; 643 tokenTypes.add(type); 644 } 645 } 646 647 private void consumeDelim(char ch) { 648 sb.append(ch); 649 switch (ch) { 650 case '~': case '|': case '^': case '$': case '\\': case '#': 651 case '.': case '+': case '-': case '@': case '/': 652 sb.append(' '); 653 break; 654 default: 655 break; 656 } 657 ++pos; 658 } 659 660 private boolean consumeIgnorable() { 661 String css = this.css; 662 int cssLimit = this.cssLimit; 663 int posBefore = pos; 664 while (pos < cssLimit) { 665 char ch = css.charAt(pos); 666if (DEBUG) System.err.println("possibly ignorable " + ch + " at " + pos); 667 if (ch <= 0x20) { 668 ++pos; 669 } else if (pos + 1 == cssLimit) { 670 break; 671 } else if (ch == '/') { 672 char next = css.charAt(pos + 1); 673if (DEBUG) System.err.println("Found /, next=" + next); 674 if (next == '*') { 675 pos += 2; 676if (DEBUG) System.err.println("Looking for end of /* at " + pos); 677 while (pos < cssLimit) { 678 int ast = css.indexOf('*', pos); 679if (DEBUG) System.err.println("ast=" + ast); 680 if (ast < 0) { 681 pos = cssLimit; // Unclosed /* comment */ 682 break; 683 } else { 684 // Advance over a run of '*'s. 685 pos = ast + 1; 686 while (pos < cssLimit && css.charAt(pos) == '*') { 687 ++pos; 688 } 689if (DEBUG) System.err.println("Looking for / at " + pos); 690 if (pos < cssLimit && css.charAt(pos) == '/') { 691 ++pos; 692 break; 693 } 694 } 695 } 696if (DEBUG) System.err.println("Finished */ at " + pos); 697 } else if (next == '/') { // Non-standard but widely supported 698 while (++pos < cssLimit) { 699 if (isLineTerminator(css.charAt(pos))) { break; } 700 } 701 } else { 702 break; 703 } 704 } else if (ch == '<') { 705 if (pos + 3 < cssLimit 706 && '!' == css.charAt(pos + 1) 707 && '-' == css.charAt(pos + 2) 708 && '-' == css.charAt(pos + 3)) { 709 pos += 4; 710 } else { 711 break; 712 } 713 } else if (ch == '-') { 714 if (pos + 2 < cssLimit 715 && '-' == css.charAt(pos + 1) 716 && '>' == css.charAt(pos + 2)) { 717 pos += 3; 718 } else { 719 break; 720 } 721 } else { 722 break; 723 } 724 } 725 if (pos == posBefore) { 726 return false; 727 } else { 728 breakOutput(); 729 return true; 730 } 731 } 732 733 private void breakOutput() { 734 int last = sb.length() - 1; 735 if (last < 0 || sb.charAt(last) != ' ') { sb.append(' '); } 736 } 737 738 private void consumeColumn() { 739 pos += 2; 740 sb.append("||"); 741 } 742 743 private void consumeMatch(char ch) { 744 pos += 2; 745 sb.append(ch).append('='); 746 } 747 748 private void consumeIdent() { 749 String css = this.css; 750 int cssLimit = this.cssLimit; 751 int last = -1; 752 while (pos < cssLimit) { 753 int posBefore = pos; 754 755 int decoded = css.charAt(pos); 756 if (decoded == '\\') { 757 decoded = consumeAndDecodeEscapeSequence(); 758 } else { 759 ++pos; 760 } 761 762 if (isIdentPart(decoded)) { 763 if (decoded >= 0x80) { 764 encodeCharOntoOutput(decoded, last); 765 } else { 766 sb.append((char) decoded); 767 } 768 last = decoded; 769 } else { 770 pos = posBefore; 771 break; 772 } 773 } 774 } 775 776 private boolean consumeAtKeyword() { 777 assert css.charAt(pos) == '@'; 778 int bufferLengthBeforeWrite = sb.length(); 779 sb.append('@'); 780 int posBeforeKeyword = ++pos; 781 consumeIdent(); 782 if (pos == posBeforeKeyword) { 783 --pos; // back up over '@' 784 sb.setLength(bufferLengthBeforeWrite); // Unwrite the '@' 785 return false; 786 } else { 787 return true; 788 } 789 } 790 791 792 private int consumeAndDecodeEscapeSequence() { 793 String css = this.css; 794 int cssLimit = this.cssLimit; 795 assert css.charAt(pos) == '\\'; 796 if (pos + 1 >= cssLimit) { return -1; } 797 char esc = css.charAt(pos + 1); 798 if (isLineTerminator(esc)) { return -1; } 799 int escLower = esc | 32; 800 if (('0' <= esc && esc <= '9') 801 || ('a' <= escLower && escLower <= 'f')) { 802 int hexValue = 0; 803 int hexStart = pos + 1; 804 int hexLimit = Math.min(pos + 7, cssLimit); 805 int hexEnd = hexStart; 806 do { 807 hexValue = (hexValue << 4) 808 | (esc <= '9' ? esc - '0' : escLower - ('a' - 10)); 809 ++hexEnd; 810 if (hexEnd == hexLimit) { break; } 811 esc = css.charAt(hexEnd); 812 escLower = esc | 32; 813 } while (('0' <= esc && esc <= '9') 814 || ('a' <= escLower && escLower <= 'f')); 815 if (hexValue > Character.MAX_CODE_POINT) { 816 return -1; 817 } 818 pos = hexEnd; 819 if (pos + 1 < cssLimit) { 820 // A sequence of hex digits can be followed by a space that allows 821 // so that code-point U+A followed by the letter 'b' can be rendered 822 // as "\a b" since "\ab" specifies the single code-point U+AB. 823 char next = css.charAt(pos + 1); 824 if (next == ' ' || isLineTerminator(next)) { 825 ++pos; 826 } 827 } 828 return hexValue; 829 } 830 pos += 2; 831 return esc; 832 } 833 834 private static final long HEX_ENCODED_BITMASK = 835 (1L << 0) | LINE_TERMINATOR_BITMASK 836 | (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>'); 837 private static boolean isHexEncoded(int codepoint) { 838 return 0 <= codepoint && codepoint < 63 839 && 0 != ((1L << codepoint) & HEX_ENCODED_BITMASK); 840 } 841 842 private void encodeCharOntoOutput(int codepoint, int last) { 843if (DEBUG) System.err.println("codepoint=" + Integer.toString(codepoint, 16) + ", last=" + Integer.toString(last, 16)); 844 switch (codepoint) { 845 case '\\': sb.append("\\\\"); break; 846 case '\0': sb.append("\\0"); break; 847 case '\n': sb.append("\\a"); break; 848 case '\f': sb.append("\\c"); break; 849 case '\r': sb.append("\\d"); break; 850 case '\"': sb.append("\\22"); break; 851 case '&': sb.append("\\26"); break; 852 case '\'': sb.append("\\27"); break; 853 case '<': sb.append("\\3c"); break; 854 case '>': sb.append("\\3e"); break; 855 // The set of escapes above that end with a hex digit must appear in 856 // HEX_ENCODED_BITMASK. 857 case '-': 858 if (last == '-') { 859 sb.append('\\'); 860 } 861 sb.append('-'); 862 break; 863 // TODO: maybe encode a decimal digit after a '-'. 864 default: 865 if (isHexEncoded(last) 866 // We need to put a space after a trailing hex digit if the 867 // next encoded character on the output would be another hex 868 // digit or a space character. The other space characters 869 // are handled above. 870 && (codepoint == ' ' 871 || ('0' <= codepoint && codepoint <= '9') 872 || ('a' <= (codepoint | 32) && (codepoint | 32) <= 'f'))) { 873if (DEBUG) System.err.println("last '" + new StringBuilder().appendCodePoint(last) + "' is hex-encoded"); 874 sb.append(' '); 875 } 876 sb.appendCodePoint(codepoint); 877 } 878 } 879 880 private TokenType consumeNumberOrPercentageOrDimension() { 881if (DEBUG) System.err.println("Consuming number"); 882 String css = this.css; 883 int cssLimit = this.cssLimit; 884 boolean isZero = true; 885 int intStart = pos; 886 if (intStart < cssLimit) { 887 char ch = css.charAt(intStart); 888 if (ch == '-' || ch == '+') { 889 ++intStart; 890 } 891 } 892if (DEBUG) System.err.println("\tintStart=" + intStart); 893 // Find the integer part after any sign. 894 int intEnd = intStart; 895 for (; intEnd < cssLimit; ++intEnd) { 896 char ch = css.charAt(intEnd); 897 if (!('0' <= ch && ch <= '9')) { break; } 898 if (ch != '0') { isZero = false; } 899 } 900if (DEBUG) System.err.println("\tintEnd=" + intEnd); 901 // Find a fraction like ".5" or ".". 902 int fractionStart = intEnd; 903 int fractionEnd = fractionStart; 904 if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) { 905 ++fractionEnd; 906 for (; fractionEnd < cssLimit; ++fractionEnd) { 907 char ch = css.charAt(fractionEnd); 908 if (!('0' <= ch && ch <= '9')) { break; } 909 if (ch != '0') { isZero = false; } 910 } 911 } 912if (DEBUG) System.err.println("\tfractionStart=" + fractionStart); 913if (DEBUG) System.err.println("\tfractionEnd=" + fractionEnd); 914 int exponentStart = fractionEnd; 915 int exponentIntStart = exponentStart; 916 int exponentEnd = exponentStart; 917 boolean isExponentZero = true; 918 if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) { 919 // 'e' and 'e' in "5e-f" for a 920 exponentEnd = exponentStart + 1; 921 if (exponentEnd < cssLimit) { 922 char ch = css.charAt(exponentEnd); 923 if (ch == '+' || ch == '-') { ++exponentEnd; } 924 } 925 exponentIntStart = exponentEnd; 926 for (; exponentEnd < cssLimit; ++exponentEnd) { 927 char ch = css.charAt(exponentEnd); 928 if (!('0' <= ch && ch <= '9')) { break; } 929 if (ch != '0') { isExponentZero = false; } 930 } 931 // Since 932 // dimension := <number> <ident> 933 // the below are technically valid dimensions even though they appear 934 // to have incomplete exponents: 935 // 5e 936 // 5ex 937 // 5e- 938 if (exponentEnd == exponentIntStart) { // Incomplete exponent. 939 exponentIntStart = exponentEnd = exponentStart; 940 isExponentZero = true; 941 } 942 } 943if (DEBUG) System.err.println("\texponentStart=" + exponentStart); 944if (DEBUG) System.err.println("\texponentIntStart=" + exponentIntStart); 945if (DEBUG) System.err.println("\texponentEnd=" + exponentEnd); 946 947 int unitStart = exponentEnd; 948 // Skip over space between number and unit. 949 // Many user-agents allow "5 ex" instead of "5ex". 950 while (unitStart < cssLimit) { 951 char ch = css.charAt(unitStart); 952 if (ch == ' ' || isLineTerminator(ch)) { 953 ++unitStart; 954 } else { 955 break; 956 } 957 } 958 959int sbStart = sb.length(); 960if (DEBUG) System.err.println("\tCP0 : " + sb.substring(sbStart)); 961 962 // Normalize the number onto the buffer. 963 // We will normalize and unit later. 964 // Skip the sign if it is positive. 965 if (intStart != pos && '-' == css.charAt(pos) && !isZero) { 966 sb.append('-'); 967 } 968 if (isZero) { 969 sb.append('0'); 970 } else { 971 // Strip leading zeroes from the integer and exponent and trailing 972 // zeroes from the fraction. 973 while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; } 974if (DEBUG) System.err.println("\tintStart <- " + intStart); 975 while (fractionEnd > fractionStart 976 && css.charAt(fractionEnd - 1) == '0') { 977 --fractionEnd; 978 } 979if (DEBUG) System.err.println("\tfractionEnd <- " + fractionEnd); 980 if (intStart == intEnd) { 981 sb.append('0'); // .5 -> 0.5 982 } else { 983 sb.append(css, intStart, intEnd); 984 } 985 if (fractionEnd > fractionStart + 1) { // 5. -> 5; 5.0 -> 5 986 sb.append(css, fractionStart, fractionEnd); 987 } 988if (DEBUG) System.err.println("\tCP1 : " + sb.substring(sbStart) + " ; sbStart=" + sbStart); 989 if (!isExponentZero) { 990 sb.append('e'); 991 // 1e+1 -> 1e1 992 if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); } 993 while (exponentIntStart < exponentEnd 994 && css.charAt(exponentIntStart) == '0') { 995 ++exponentIntStart; 996 } 997if (DEBUG) System.err.println("\texponentIntStart <- " + exponentIntStart); 998 sb.append(css, exponentIntStart, exponentEnd); 999if (DEBUG) System.err.println("\tCP2 : " + sb.substring(sbStart)); 1000 } 1001 } 1002if (DEBUG) System.err.println("\tCP3 : " + sb.substring(sbStart)); 1003 1004 int unitEnd; 1005 TokenType type; 1006 if (unitStart < cssLimit && '%' == css.charAt(unitStart)) { 1007 unitEnd = unitStart + 1; 1008 type = TokenType.PERCENTAGE; 1009 sb.append('%'); 1010 } else { 1011 // The grammar says that any identifier following a number is a unit. 1012 int bufferBeforeUnit = sb.length(); 1013 pos = unitStart; 1014 consumeIdent(); 1015 int bufferAfterUnit = sb.length(); 1016 if (unitStart == exponentEnd // No intervening space 1017 || isWellKnownUnit(sb, bufferBeforeUnit, bufferAfterUnit)) { 1018 unitEnd = pos; 1019 // 3IN -> 3in 1020 for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) { 1021 char ch = sb.charAt(i); 1022 if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); } 1023 } 1024 } else { 1025 unitEnd = unitStart = exponentEnd; 1026 sb.setLength(bufferBeforeUnit); 1027 } 1028 type = unitStart == unitEnd ? TokenType.NUMBER : TokenType.DIMENSION; 1029 } 1030 pos = unitEnd; 1031if (DEBUG) System.err.println("\tunitStart=" + unitStart); 1032if (DEBUG) System.err.println("\tunitEnd=" + unitEnd); 1033if (DEBUG) System.err.println("\tCP4 : " + sb.substring(sbStart)); 1034 return type; 1035 } 1036 1037 private void consumeString() { 1038 String css = this.css; 1039 int cssLimit = this.cssLimit; 1040 1041 char delim = css.charAt(pos); 1042 ++pos; 1043 sb.append('\''); 1044 int last = -1; 1045 while (pos < cssLimit) { 1046 char ch = css.charAt(pos); 1047 if (ch == delim) { 1048 ++pos; 1049 break; 1050 } 1051 if (isLineTerminator(ch)) { break; } // bad-string 1052 int decoded = ch; 1053 if (ch == '\\') { 1054 decoded = consumeAndDecodeEscapeSequence(); 1055 } else { 1056 ++pos; 1057 } 1058 encodeCharOntoOutput(decoded, last); 1059 last = decoded; 1060 } 1061 sb.append('\''); 1062 } 1063 1064 private @Nullable TokenType consumeHash() { 1065 assert css.charAt(pos) == '#'; 1066 ++pos; 1067 int beforeIdent = pos; 1068 consumeIdent(); 1069 if (pos == beforeIdent) { 1070 pos = beforeIdent - 1; 1071 return null; 1072 } 1073 for (int i = beforeIdent; i < pos; ++i) { 1074 char ch = css.charAt(i); 1075 if (!(('0' <= ch && ch <= '9') 1076 || ('a' <= (ch | 32) && (ch | 32) <= 'f'))) { 1077 return TokenType.HASH_ID; 1078 } 1079 } 1080 return TokenType.HASH_UNRESTRICTED; 1081 } 1082 1083 private boolean consumeUnicodeRange() { 1084 String css = this.css; 1085 int cssLimit = this.cssLimit; 1086 1087 int start = pos; 1088 assert (css.charAt(pos) | 32) == 'u'; 1089 ++pos; 1090 if (pos == cssLimit && css.charAt(pos) != '+') { 1091 pos = start; 1092 return false; 1093 } 1094 int numStartDigits = 0; 1095 while (pos < cssLimit && numStartDigits < 6) { 1096 char chLower = css.charAt(pos); 1097 if (('0' <= chLower && chLower <= '9') 1098 || ('a' <= chLower && chLower <= 'f')) { 1099 ++numStartDigits; 1100 ++pos; 1101 } else { 1102 break; 1103 } 1104 } 1105 boolean hasQmark = false; 1106 while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') { 1107 hasQmark = true; 1108 ++numStartDigits; 1109 ++pos; 1110 } 1111 if (numStartDigits == 0) { 1112 pos = start; 1113 return false; 1114 } 1115 if (!hasQmark && pos < cssLimit && css.charAt(pos) == '-') { 1116 ++pos; 1117 int numEndDigits = 0; 1118 while (pos < cssLimit && numEndDigits < 6) { 1119 char chLower = css.charAt(pos); 1120 if (('0' <= chLower && chLower <= '9') 1121 || ('a' <= chLower && chLower <= 'f')) { 1122 ++numEndDigits; 1123 ++pos; 1124 } else { 1125 break; 1126 } 1127 } 1128 if (numEndDigits == 0) { --pos; } // Back up over '-' 1129 } 1130 return true; 1131 } 1132 1133 private @Nullable TokenType consumeIdentOrUrlOrFunction() { 1134 int bufferStart = sb.length(); 1135 consumeIdent(); 1136 boolean parenAfter = pos < cssLimit && css.charAt(pos) == '('; 1137 if (sb.length() - bufferStart == 3 1138 && 'u' == (sb.charAt(bufferStart) | 32) 1139 && 'r' == (sb.charAt(bufferStart + 1) | 32) 1140 && 'l' == (sb.charAt(bufferStart + 2) | 32)) { 1141 if (parenAfter && consumeUrlValue()) { 1142 sb.setCharAt(bufferStart, 'u'); 1143 sb.setCharAt(bufferStart + 1, 'r'); 1144 sb.setCharAt(bufferStart + 2, 'l'); 1145 return TokenType.URL; 1146 } else { 1147 sb.setLength(bufferStart); 1148 breakOutput(); 1149 return TokenType.WHITESPACE; 1150 } 1151 } else if (parenAfter) { 1152 openBracket('('); 1153 ++pos; 1154 return TokenType.FUNCTION; 1155 } else { 1156 return TokenType.IDENT; 1157 } 1158 } 1159 1160 private boolean consumeUrlValue() { 1161 String css = this.css; 1162 int cssLimit = this.cssLimit; 1163 if (pos == cssLimit || css.charAt(pos) != '(') { return false; } 1164 ++pos; 1165 // skip space. 1166 for (; pos < cssLimit; ++pos) { 1167 char ch = css.charAt(pos); 1168 if (ch != ' ' && !isLineTerminator(ch)) { break; } 1169 } 1170 // Find the value. 1171 int delim; 1172 if (pos < cssLimit) { 1173 char ch = pos < cssLimit ? css.charAt(pos) : '\0'; 1174 if (ch == '"' || ch == '\'') { 1175 delim = ch; 1176 ++pos; 1177 } else { 1178 delim = '\0'; 1179 } 1180 } else { 1181 return false; 1182 } 1183 sb.append("('"); 1184 while (pos < cssLimit) { 1185 int decoded = css.charAt(pos); 1186 if (delim != 0) { 1187 if (decoded == delim) { break; } 1188 } else if (decoded <= ' ' || decoded == ')') { 1189 break; 1190 } 1191 if (decoded == '\\') { 1192 decoded = consumeAndDecodeEscapeSequence(); 1193 } else { 1194 ++pos; 1195 } 1196 // Any character not in the RFC 3986 safe set is %-encoded. 1197 if (decoded > 0x80 || URL_SAFE[decoded]) { 1198 sb.appendCodePoint(decoded); 1199 } else { 1200 sb.append('%') 1201 .append(HEX_DIGITS[(decoded >>> 4) & 0xf]) 1202 .append(HEX_DIGITS[(decoded >>> 0) & 0xf]); 1203 } 1204 } 1205 1206 // skip space. 1207 for (; pos < cssLimit; ++pos) { 1208 char ch = css.charAt(pos); 1209 if (ch != ' ' && !isLineTerminator(ch)) { break; } 1210 } 1211 if (pos < cssLimit && css.charAt(pos) == ')') { 1212 ++pos; 1213 } else { 1214 // broken-url 1215 } 1216 sb.append("')"); 1217 return true; 1218 } 1219 } 1220 1221 private static final boolean isIdentPart(int cp) { 1222 return cp >= 0x80 || IDENT_PART_ASCII[cp]; 1223 } 1224 1225 private static final boolean isDecimal(char ch) { 1226 return '0' <= ch && ch <= '9'; 1227 } 1228 1229 private static final boolean[] IDENT_PART_ASCII = new boolean[128]; 1230 static { 1231 for (int i = '0'; i <= '9'; ++i) { IDENT_PART_ASCII[i] = true; } 1232 for (int i = 'A'; i <= 'Z'; ++i) { IDENT_PART_ASCII[i] = true; } 1233 for (int i = 'a'; i <= 'z'; ++i) { IDENT_PART_ASCII[i] = true; } 1234 IDENT_PART_ASCII['_'] = true; 1235 IDENT_PART_ASCII['-'] = true; 1236 } 1237 1238 private static final int LINE_TERMINATOR_BITMASK = 1239 (1 << '\n') | (1 << '\r') | (1 << '\f'); 1240 1241 private static boolean isLineTerminator(char ch) { 1242 return ch < 0x20 && 0 != (LINE_TERMINATOR_BITMASK & (1 << ch)); 1243 } 1244 1245 private static int[] expandIfNecessary(int[] arr, int limit, int needed) { 1246 int neededLength = limit + needed; 1247 int length = arr.length; 1248if (DEBUG) System.err.println("limit=" + limit + ", needed=" + needed + ", length=" + length); 1249 if (length >= neededLength) { return arr; } 1250 int[] newArr = new int[Math.max(16, Math.max(neededLength, length * 2))]; 1251 System.arraycopy(arr, 0, newArr, 0, limit); 1252if (DEBUG) System.err.println("\texpanded to " + newArr.length); 1253 return newArr; 1254 } 1255 1256 private static int[] truncateOrShare(int[] arr, int limit) { 1257 if (limit == 0) { return ZERO_INTS; } 1258 if (limit == arr.length) { 1259 return arr; 1260 } 1261 int[] trunc = new int[limit]; 1262 System.arraycopy(arr, 0, trunc, 0, limit); 1263 return trunc; 1264 } 1265 1266 private static final int LENGTH_UNIT_TYPE = 0; 1267 private static final int ANGLE_UNIT_TYPE = 1; 1268 private static final int TIME_UNIT_TYPE = 2; 1269 private static final int FREQUENCY_UNIT_TYPE = 3; 1270 private static final int RESOLUTION_UNIT_TYPE = 4; 1271 1272 /** 1273 * See http://dev.w3.org/csswg/css-values/#lengths and 1274 * http://dev.w3.org/csswg/css-values/#other-units 1275 */ 1276 private static final Trie UNIT_TRIE = new Trie( 1277 ImmutableMap.<String, Integer>builder() 1278 .put("em", LENGTH_UNIT_TYPE) 1279 .put("ex", LENGTH_UNIT_TYPE) 1280 .put("ch", LENGTH_UNIT_TYPE) // Width of zero character 1281 .put("rem", LENGTH_UNIT_TYPE) // Root element font-size 1282 .put("vh", LENGTH_UNIT_TYPE) 1283 .put("vw", LENGTH_UNIT_TYPE) 1284 .put("vmin", LENGTH_UNIT_TYPE) 1285 .put("vmax", LENGTH_UNIT_TYPE) 1286 .put("px", LENGTH_UNIT_TYPE) 1287 .put("mm", LENGTH_UNIT_TYPE) 1288 .put("cm", LENGTH_UNIT_TYPE) 1289 .put("in", LENGTH_UNIT_TYPE) 1290 .put("pt", LENGTH_UNIT_TYPE) 1291 .put("pc", LENGTH_UNIT_TYPE) 1292 .put("deg", ANGLE_UNIT_TYPE) 1293 .put("rad", ANGLE_UNIT_TYPE) 1294 .put("grad", ANGLE_UNIT_TYPE) 1295 .put("turn", ANGLE_UNIT_TYPE) 1296 .put("s", TIME_UNIT_TYPE) 1297 .put("ms", TIME_UNIT_TYPE) 1298 .put("hz", FREQUENCY_UNIT_TYPE) 1299 .put("khz", FREQUENCY_UNIT_TYPE) 1300 .put("dpi", RESOLUTION_UNIT_TYPE) 1301 .put("dpcm", RESOLUTION_UNIT_TYPE) 1302 .put("dppx", RESOLUTION_UNIT_TYPE) 1303 .build()); 1304 1305 private static boolean isWellKnownUnit(StringBuilder sb, int start, int end) { 1306 if (start == end) { return false; } 1307 Trie t = UNIT_TRIE; 1308 for (int i = start; i < end; ++i) { 1309 char ch = sb.charAt(i); 1310 t = t.lookup('A' <= ch && ch <= 'Z' ? (char) (ch | 32) : ch); 1311 if (t == null) { return false; } 1312 } 1313 return t.isTerminal(); 1314 } 1315 1316 1317 private static final boolean[] URL_SAFE = new boolean[128]; 1318 static { 1319 // From RFC 3986 1320 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 1321 for (int i = 'A'; i <= 'Z'; ++i) { URL_SAFE[i] = true; } 1322 for (int i = 'a'; i <= 'z'; ++i) { URL_SAFE[i] = true; } 1323 for (int i = '0'; i <= '9'; ++i) { URL_SAFE[i] = true; } 1324 URL_SAFE['-'] = true; 1325 URL_SAFE['.'] = true; 1326 URL_SAFE['_'] = true; 1327 URL_SAFE['~'] = true; 1328 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 1329 URL_SAFE[':'] = true; 1330 URL_SAFE['/'] = true; 1331 URL_SAFE['?'] = true; 1332 URL_SAFE['#'] = true; 1333 URL_SAFE['['] = true; 1334 URL_SAFE[']'] = true; 1335 URL_SAFE['@'] = true; 1336 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" 1337 // / "*" / "+" / "," / ";" / "=" 1338 URL_SAFE['!'] = true; 1339 URL_SAFE['$'] = true; 1340 URL_SAFE['&'] = true; 1341 // Only used in obsolete mark rule and special in unquoted URLs or comment 1342 // delimiters. 1343 // URL_SAFE['\''] = true; 1344 // URL_SAFE['('] = true; 1345 // URL_SAFE[')'] = true; 1346 // URL_SAFE['*'] = true; 1347 URL_SAFE['+'] = true; 1348 URL_SAFE[','] = true; 1349 URL_SAFE[';'] = true; 1350 URL_SAFE['='] = true; 1351 } 1352 1353 private static final char[] HEX_DIGITS = { 1354 '0', '1', '2', '3', 1355 '4', '5', '6', '7', 1356 '8', '9', 'a', 'b', 1357 'c', 'd', 'e', 'f' 1358 }; 1359} 1360